--- /dev/null
+/* Copyright (C) 1988-2019 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3. If not see
+<http://www.gnu.org/licenses/>. */
+
+#define IN_TARGET_CODE 1
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "tree.h"
+#include "memmodel.h"
+#include "gimple.h"
+#include "cfghooks.h"
+#include "cfgloop.h"
+#include "df.h"
+#include "tm_p.h"
+#include "stringpool.h"
+#include "expmed.h"
+#include "optabs.h"
+#include "regs.h"
+#include "emit-rtl.h"
+#include "recog.h"
+#include "cgraph.h"
+#include "diagnostic.h"
+#include "cfgbuild.h"
+#include "alias.h"
+#include "fold-const.h"
+#include "attribs.h"
+#include "calls.h"
+#include "stor-layout.h"
+#include "varasm.h"
+#include "output.h"
+#include "insn-attr.h"
+#include "flags.h"
+#include "except.h"
+#include "explow.h"
+#include "expr.h"
+#include "cfgrtl.h"
+#include "common/common-target.h"
+#include "langhooks.h"
+#include "reload.h"
+#include "gimplify.h"
+#include "dwarf2.h"
+#include "tm-constrs.h"
+#include "params.h"
+#include "cselib.h"
+#include "sched-int.h"
+#include "opts.h"
+#include "tree-pass.h"
+#include "context.h"
+#include "pass_manager.h"
+#include "target-globals.h"
+#include "gimple-iterator.h"
+#include "tree-vectorizer.h"
+#include "shrink-wrap.h"
+#include "builtins.h"
+#include "rtl-iter.h"
+#include "tree-iterator.h"
+#include "dbgcnt.h"
+#include "case-cfn-macros.h"
+#include "dojump.h"
+#include "fold-const-call.h"
+#include "tree-vrp.h"
+#include "tree-ssanames.h"
+#include "selftest.h"
+#include "selftest-rtl.h"
+#include "print-rtl.h"
+#include "intl.h"
+#include "ifcvt.h"
+#include "symbol-summary.h"
+#include "ipa-prop.h"
+#include "ipa-fnsummary.h"
+#include "wide-int-bitmask.h"
+#include "tree-vector-builder.h"
+#include "debug.h"
+#include "dwarf2out.h"
+#include "i386-options.h"
+#include "i386-builtins.h"
+#include "i386-expand.h"
+
+/* Split one or more double-mode RTL references into pairs of half-mode
+ references. The RTL can be REG, offsettable MEM, integer constant, or
+ CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
+ split and "num" is its length. lo_half and hi_half are output arrays
+ that parallel "operands". */
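+/* For example, splitting a single TImode pseudo R this way yields roughly
+ lo_half[0] = (subreg:DI R 0) and hi_half[0] = (subreg:DI R 8), while a
+ TImode MEM is instead split into two DImode MEMs at offsets 0 and 8 via
+ adjust_address. */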
+
+void
+split_double_mode (machine_mode mode, rtx operands[],
+ int num, rtx lo_half[], rtx hi_half[])
+{
+ machine_mode half_mode;
+ unsigned int byte;
+
+ switch (mode)
+ {
+ case E_TImode:
+ half_mode = DImode;
+ break;
+ case E_DImode:
+ half_mode = SImode;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ byte = GET_MODE_SIZE (half_mode);
+
+ while (num--)
+ {
+ rtx op = operands[num];
+
+ /* simplify_subreg refuses to split volatile memory addresses,
+ but we still have to handle them. */
+ if (MEM_P (op))
+ {
+ lo_half[num] = adjust_address (op, half_mode, 0);
+ hi_half[num] = adjust_address (op, half_mode, byte);
+ }
+ else
+ {
+ lo_half[num] = simplify_gen_subreg (half_mode, op,
+ GET_MODE (op) == VOIDmode
+ ? mode : GET_MODE (op), 0);
+ hi_half[num] = simplify_gen_subreg (half_mode, op,
+ GET_MODE (op) == VOIDmode
+ ? mode : GET_MODE (op), byte);
+ }
+ }
+}
+
+/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
+ for the target. */
+
+void
+ix86_expand_clear (rtx dest)
+{
+ rtx tmp;
+
+ /* We play register width games, which are only valid after reload. */
+ gcc_assert (reload_completed);
+
+ /* Avoid HImode and its attendant prefix byte. */
+ if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
+ dest = gen_rtx_REG (SImode, REGNO (dest));
+ tmp = gen_rtx_SET (dest, const0_rtx);
+
+ if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
+ {
+ rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
+ tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
+ }
+
+ emit_insn (tmp);
+}
+
+void
+ix86_expand_move (machine_mode mode, rtx operands[])
+{
+ rtx op0, op1;
+ rtx tmp, addend = NULL_RTX;
+ enum tls_model model;
+
+ op0 = operands[0];
+ op1 = operands[1];
+
+ switch (GET_CODE (op1))
+ {
+ case CONST:
+ tmp = XEXP (op1, 0);
+
+ if (GET_CODE (tmp) != PLUS
+ || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
+ break;
+
+ op1 = XEXP (tmp, 0);
+ addend = XEXP (tmp, 1);
+ /* FALLTHRU */
+
+ case SYMBOL_REF:
+ model = SYMBOL_REF_TLS_MODEL (op1);
+
+ if (model)
+ op1 = legitimize_tls_address (op1, model, true);
+ else if (ix86_force_load_from_GOT_p (op1))
+ {
+ /* Load the external function address via GOT slot to avoid PLT. */
+ op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
+ (TARGET_64BIT
+ ? UNSPEC_GOTPCREL
+ : UNSPEC_GOT));
+ op1 = gen_rtx_CONST (Pmode, op1);
+ op1 = gen_const_mem (Pmode, op1);
+ set_mem_alias_set (op1, ix86_GOT_alias_set ());
+ }
+ else
+ {
+ tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
+ if (tmp)
+ {
+ op1 = tmp;
+ if (!addend)
+ break;
+ }
+ else
+ {
+ op1 = operands[1];
+ break;
+ }
+ }
+
+ if (addend)
+ {
+ op1 = force_operand (op1, NULL_RTX);
+ op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
+ op0, 1, OPTAB_DIRECT);
+ }
+ else
+ op1 = force_operand (op1, op0);
+
+ if (op1 == op0)
+ return;
+
+ op1 = convert_to_mode (mode, op1, 1);
+
+ default:
+ break;
+ }
+
+ if ((flag_pic || MACHOPIC_INDIRECT)
+ && symbolic_operand (op1, mode))
+ {
+ if (TARGET_MACHO && !TARGET_64BIT)
+ {
+#if TARGET_MACHO
+ /* dynamic-no-pic */
+ if (MACHOPIC_INDIRECT)
+ {
+ rtx temp = (op0 && REG_P (op0) && mode == Pmode)
+ ? op0 : gen_reg_rtx (Pmode);
+ op1 = machopic_indirect_data_reference (op1, temp);
+ if (MACHOPIC_PURE)
+ op1 = machopic_legitimize_pic_address (op1, mode,
+ temp == op1 ? 0 : temp);
+ }
+ if (op0 != op1 && GET_CODE (op0) != MEM)
+ {
+ rtx insn = gen_rtx_SET (op0, op1);
+ emit_insn (insn);
+ return;
+ }
+ if (GET_CODE (op0) == MEM)
+ op1 = force_reg (Pmode, op1);
+ else
+ {
+ rtx temp = op0;
+ if (GET_CODE (temp) != REG)
+ temp = gen_reg_rtx (Pmode);
+ temp = legitimize_pic_address (op1, temp);
+ if (temp == op0)
+ return;
+ op1 = temp;
+ }
+ /* dynamic-no-pic */
+#endif
+ }
+ else
+ {
+ if (MEM_P (op0))
+ op1 = force_reg (mode, op1);
+ else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
+ {
+ rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
+ op1 = legitimize_pic_address (op1, reg);
+ if (op0 == op1)
+ return;
+ op1 = convert_to_mode (mode, op1, 1);
+ }
+ }
+ }
+ else
+ {
+ if (MEM_P (op0)
+ && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
+ || !push_operand (op0, mode))
+ && MEM_P (op1))
+ op1 = force_reg (mode, op1);
+
+ if (push_operand (op0, mode)
+ && ! general_no_elim_operand (op1, mode))
+ op1 = copy_to_mode_reg (mode, op1);
+
+ /* Force large constants in 64-bit compilation into a register
+ to get them CSEed. */
+ if (can_create_pseudo_p ()
+ && (mode == DImode) && TARGET_64BIT
+ && immediate_operand (op1, mode)
+ && !x86_64_zext_immediate_operand (op1, VOIDmode)
+ && !register_operand (op0, mode)
+ && optimize)
+ op1 = copy_to_mode_reg (mode, op1);
+
+ if (can_create_pseudo_p ()
+ && CONST_DOUBLE_P (op1))
+ {
+ /* If we are loading a floating point constant to a register,
+ force the value to memory now, since we'll get better code
+ out of the back end. */
+
+ op1 = validize_mem (force_const_mem (mode, op1));
+ if (!register_operand (op0, mode))
+ {
+ rtx temp = gen_reg_rtx (mode);
+ emit_insn (gen_rtx_SET (temp, op1));
+ emit_move_insn (op0, temp);
+ return;
+ }
+ }
+ }
+
+ emit_insn (gen_rtx_SET (op0, op1));
+}
+
+void
+ix86_expand_vector_move (machine_mode mode, rtx operands[])
+{
+ rtx op0 = operands[0], op1 = operands[1];
+ /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for the IA MCU
+ psABI, since the biggest alignment under that psABI is 4 bytes. */
+ unsigned int align = (TARGET_IAMCU
+ ? GET_MODE_BITSIZE (mode)
+ : GET_MODE_ALIGNMENT (mode));
+
+ if (push_operand (op0, VOIDmode))
+ op0 = emit_move_resolve_push (mode, op0);
+
+ /* Force constants other than zero into memory. We do not know how
+ the instructions used to build constants modify the upper 64 bits
+ of the register; once we have that information we may be able
+ to handle some of them more efficiently. */
+ if (can_create_pseudo_p ()
+ && (CONSTANT_P (op1)
+ || (SUBREG_P (op1)
+ && CONSTANT_P (SUBREG_REG (op1))))
+ && ((register_operand (op0, mode)
+ && !standard_sse_constant_p (op1, mode))
+ /* ix86_expand_vector_move_misalign() does not like constants. */
+ || (SSE_REG_MODE_P (mode)
+ && MEM_P (op0)
+ && MEM_ALIGN (op0) < align)))
+ {
+ if (SUBREG_P (op1))
+ {
+ machine_mode imode = GET_MODE (SUBREG_REG (op1));
+ rtx r = force_const_mem (imode, SUBREG_REG (op1));
+ if (r)
+ r = validize_mem (r);
+ else
+ r = force_reg (imode, SUBREG_REG (op1));
+ op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
+ }
+ else
+ op1 = validize_mem (force_const_mem (mode, op1));
+ }
+
+ /* We need to check memory alignment for SSE modes since attributes
+ can make operands unaligned. */
+ if (can_create_pseudo_p ()
+ && SSE_REG_MODE_P (mode)
+ && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
+ || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
+ {
+ rtx tmp[2];
+
+ /* ix86_expand_vector_move_misalign() does not like both
+ arguments in memory. */
+ if (!register_operand (op0, mode)
+ && !register_operand (op1, mode))
+ op1 = force_reg (mode, op1);
+
+ tmp[0] = op0; tmp[1] = op1;
+ ix86_expand_vector_move_misalign (mode, tmp);
+ return;
+ }
+
+ /* Make operand1 a register if it isn't already. */
+ if (can_create_pseudo_p ()
+ && !register_operand (op0, mode)
+ && !register_operand (op1, mode))
+ {
+ emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
+ return;
+ }
+
+ emit_insn (gen_rtx_SET (op0, op1));
+}
+
+/* Split 32-byte AVX unaligned load and store if needed. */
+
+static void
+ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
+{
+ rtx m;
+ rtx (*extract) (rtx, rtx, rtx);
+ machine_mode mode;
+
+ if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
+ || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
+ {
+ emit_insn (gen_rtx_SET (op0, op1));
+ return;
+ }
+
+ rtx orig_op0 = NULL_RTX;
+ mode = GET_MODE (op0);
+ switch (GET_MODE_CLASS (mode))
+ {
+ case MODE_VECTOR_INT:
+ case MODE_INT:
+ if (mode != V32QImode)
+ {
+ if (!MEM_P (op0))
+ {
+ orig_op0 = op0;
+ op0 = gen_reg_rtx (V32QImode);
+ }
+ else
+ op0 = gen_lowpart (V32QImode, op0);
+ op1 = gen_lowpart (V32QImode, op1);
+ mode = V32QImode;
+ }
+ break;
+ case MODE_VECTOR_FLOAT:
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ switch (mode)
+ {
+ default:
+ gcc_unreachable ();
+ case E_V32QImode:
+ extract = gen_avx_vextractf128v32qi;
+ mode = V16QImode;
+ break;
+ case E_V8SFmode:
+ extract = gen_avx_vextractf128v8sf;
+ mode = V4SFmode;
+ break;
+ case E_V4DFmode:
+ extract = gen_avx_vextractf128v4df;
+ mode = V2DFmode;
+ break;
+ }
+
+ if (MEM_P (op1))
+ {
+ rtx r = gen_reg_rtx (mode);
+ m = adjust_address (op1, mode, 0);
+ emit_move_insn (r, m);
+ m = adjust_address (op1, mode, 16);
+ r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
+ emit_move_insn (op0, r);
+ }
+ else if (MEM_P (op0))
+ {
+ m = adjust_address (op0, mode, 0);
+ emit_insn (extract (m, op1, const0_rtx));
+ m = adjust_address (op0, mode, 16);
+ emit_insn (extract (m, copy_rtx (op1), const1_rtx));
+ }
+ else
+ gcc_unreachable ();
+
+ if (orig_op0)
+ emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
+}
+
+/* Implement the movmisalign patterns for SSE. Non-SSE modes go
+ straight to ix86_expand_vector_move. */
+/* Code generation for scalar reg-reg moves of single and double precision data:
+ if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
+ movaps reg, reg
+ else
+ movss reg, reg
+ if (x86_sse_partial_reg_dependency == true)
+ movapd reg, reg
+ else
+ movsd reg, reg
+
+ Code generation for scalar loads of double precision data:
+ if (x86_sse_split_regs == true)
+ movlpd mem, reg (gas syntax)
+ else
+ movsd mem, reg
+
+ Code generation for unaligned packed loads of single precision data
+ (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
+ if (x86_sse_unaligned_move_optimal)
+ movups mem, reg
+
+ if (x86_sse_partial_reg_dependency == true)
+ {
+ xorps reg, reg
+ movlps mem, reg
+ movhps mem+8, reg
+ }
+ else
+ {
+ movlps mem, reg
+ movhps mem+8, reg
+ }
+
+ Code generation for unaligned packed loads of double precision data
+ (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
+ if (x86_sse_unaligned_move_optimal)
+ movupd mem, reg
+
+ if (x86_sse_split_regs == true)
+ {
+ movlpd mem, reg
+ movhpd mem+8, reg
+ }
+ else
+ {
+ movsd mem, reg
+ movhpd mem+8, reg
+ }
+ */
+
+void
+ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
+{
+ rtx op0, op1, m;
+
+ op0 = operands[0];
+ op1 = operands[1];
+
+ /* Use unaligned load/store for AVX512 or when optimizing for size. */
+ if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
+ {
+ emit_insn (gen_rtx_SET (op0, op1));
+ return;
+ }
+
+ if (TARGET_AVX)
+ {
+ if (GET_MODE_SIZE (mode) == 32)
+ ix86_avx256_split_vector_move_misalign (op0, op1);
+ else
+ /* Always use 128-bit mov<mode>_internal pattern for AVX. */
+ emit_insn (gen_rtx_SET (op0, op1));
+ return;
+ }
+
+ if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
+ || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
+ {
+ emit_insn (gen_rtx_SET (op0, op1));
+ return;
+ }
+
+ /* ??? If we have typed data, then it would appear that using
+ movdqu is the only way to get unaligned data loaded with
+ integer type. */
+ if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+ {
+ emit_insn (gen_rtx_SET (op0, op1));
+ return;
+ }
+
+ if (MEM_P (op1))
+ {
+ if (TARGET_SSE2 && mode == V2DFmode)
+ {
+ rtx zero;
+
+ /* When SSE registers are split into halves, we can avoid
+ writing to the top half twice. */
+ if (TARGET_SSE_SPLIT_REGS)
+ {
+ emit_clobber (op0);
+ zero = op0;
+ }
+ else
+ {
+ /* ??? Not sure about the best option for the Intel chips.
+ The following would seem to satisfy; the register is
+ entirely cleared, breaking the dependency chain. We
+ then store to the upper half, with a dependency depth
+ of one. A rumor has it that Intel recommends two movsd
+ followed by an unpacklpd, but this is unconfirmed. And
+ given that the dependency depth of the unpacklpd would
+ still be one, I'm not sure why this would be better. */
+ zero = CONST0_RTX (V2DFmode);
+ }
+
+ m = adjust_address (op1, DFmode, 0);
+ emit_insn (gen_sse2_loadlpd (op0, zero, m));
+ m = adjust_address (op1, DFmode, 8);
+ emit_insn (gen_sse2_loadhpd (op0, op0, m));
+ }
+ else
+ {
+ rtx t;
+
+ if (mode != V4SFmode)
+ t = gen_reg_rtx (V4SFmode);
+ else
+ t = op0;
+
+ if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
+ emit_move_insn (t, CONST0_RTX (V4SFmode));
+ else
+ emit_clobber (t);
+
+ m = adjust_address (op1, V2SFmode, 0);
+ emit_insn (gen_sse_loadlps (t, t, m));
+ m = adjust_address (op1, V2SFmode, 8);
+ emit_insn (gen_sse_loadhps (t, t, m));
+ if (mode != V4SFmode)
+ emit_move_insn (op0, gen_lowpart (mode, t));
+ }
+ }
+ else if (MEM_P (op0))
+ {
+ if (TARGET_SSE2 && mode == V2DFmode)
+ {
+ m = adjust_address (op0, DFmode, 0);
+ emit_insn (gen_sse2_storelpd (m, op1));
+ m = adjust_address (op0, DFmode, 8);
+ emit_insn (gen_sse2_storehpd (m, op1));
+ }
+ else
+ {
+ if (mode != V4SFmode)
+ op1 = gen_lowpart (V4SFmode, op1);
+
+ m = adjust_address (op0, V2SFmode, 0);
+ emit_insn (gen_sse_storelps (m, op1));
+ m = adjust_address (op0, V2SFmode, 8);
+ emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
+ }
+ }
+ else
+ gcc_unreachable ();
+}
+
+/* Helper function of ix86_fixup_binary_operands to canonicalize
+ operand order. Returns true if the operands should be swapped. */
+
+static bool
+ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
+ rtx operands[])
+{
+ rtx dst = operands[0];
+ rtx src1 = operands[1];
+ rtx src2 = operands[2];
+
+ /* If the operation is not commutative, we can't do anything. */
+ if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
+ && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
+ return false;
+
+ /* Highest priority is that src1 should match dst. */
+ if (rtx_equal_p (dst, src1))
+ return false;
+ if (rtx_equal_p (dst, src2))
+ return true;
+
+ /* Next highest priority is that immediate constants come second. */
+ if (immediate_operand (src2, mode))
+ return false;
+ if (immediate_operand (src1, mode))
+ return true;
+
+ /* Lowest priority is that memory references should come second. */
+ if (MEM_P (src2))
+ return false;
+ if (MEM_P (src1))
+ return true;
+
+ return false;
+}
+
+
+/* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
+ destination to use for the operation. If different from the true
+ destination in operands[0], a copy operation will be required. */
+
+rtx
+ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
+ rtx operands[])
+{
+ rtx dst = operands[0];
+ rtx src1 = operands[1];
+ rtx src2 = operands[2];
+
+ /* Canonicalize operand order. */
+ if (ix86_swap_binary_operands_p (code, mode, operands))
+ {
+ /* It is invalid to swap operands of different modes. */
+ gcc_assert (GET_MODE (src1) == GET_MODE (src2));
+
+ std::swap (src1, src2);
+ }
+
+ /* Both source operands cannot be in memory. */
+ if (MEM_P (src1) && MEM_P (src2))
+ {
+ /* Optimization: Only read from memory once. */
+ if (rtx_equal_p (src1, src2))
+ {
+ src2 = force_reg (mode, src2);
+ src1 = src2;
+ }
+ else if (rtx_equal_p (dst, src1))
+ src2 = force_reg (mode, src2);
+ else
+ src1 = force_reg (mode, src1);
+ }
+
+ /* If the destination is memory, and we do not have matching source
+ operands, do things in registers. */
+ if (MEM_P (dst) && !rtx_equal_p (dst, src1))
+ dst = gen_reg_rtx (mode);
+
+ /* Source 1 cannot be a constant. */
+ if (CONSTANT_P (src1))
+ src1 = force_reg (mode, src1);
+
+ /* Source 1 cannot be a non-matching memory. */
+ if (MEM_P (src1) && !rtx_equal_p (dst, src1))
+ src1 = force_reg (mode, src1);
+
+ /* Improve address combine. */
+ if (code == PLUS
+ && GET_MODE_CLASS (mode) == MODE_INT
+ && MEM_P (src2))
+ src2 = force_reg (mode, src2);
+
+ operands[1] = src1;
+ operands[2] = src2;
+ return dst;
+}
+
+/* Similarly, but assume that the destination has already been
+ set up properly. */
+
+void
+ix86_fixup_binary_operands_no_copy (enum rtx_code code,
+ machine_mode mode, rtx operands[])
+{
+ rtx dst = ix86_fixup_binary_operands (code, mode, operands);
+ gcc_assert (dst == operands[0]);
+}
+
+/* Attempt to expand a binary operator. Make the expansion closer to the
+ actual machine than just general_operand, which would allow three separate
+ memory references (one output, two input) in a single insn. */
+
+void
+ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
+ rtx operands[])
+{
+ rtx src1, src2, dst, op, clob;
+
+ dst = ix86_fixup_binary_operands (code, mode, operands);
+ src1 = operands[1];
+ src2 = operands[2];
+
+ /* Emit the instruction. */
+
+ op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
+
+ if (reload_completed
+ && code == PLUS
+ && !rtx_equal_p (dst, src1))
+ {
+ /* This is going to be an LEA; avoid splitting it later. */
+ emit_insn (op);
+ }
+ else
+ {
+ clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
+ emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
+ }
+
+ /* Fix up the destination if needed. */
+ if (dst != operands[0])
+ emit_move_insn (operands[0], dst);
+}
+
+/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
+ the given OPERANDS. */
+
+void
+ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
+ rtx operands[])
+{
+ rtx op1 = NULL_RTX, op2 = NULL_RTX;
+ if (SUBREG_P (operands[1]))
+ {
+ op1 = operands[1];
+ op2 = operands[2];
+ }
+ else if (SUBREG_P (operands[2]))
+ {
+ op1 = operands[2];
+ op2 = operands[1];
+ }
+ /* Optimize (__m128i) d | (__m128i) e and similar code
+ when d and e are float vectors into float vector logical
+ insn. In C/C++ without using intrinsics there is no other way
+ to express vector logical operation on float vectors than
+ to cast them temporarily to integer vectors. */
+ if (op1
+ && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
+ && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
+ && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
+ && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
+ && SUBREG_BYTE (op1) == 0
+ && (GET_CODE (op2) == CONST_VECTOR
+ || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
+ && SUBREG_BYTE (op2) == 0))
+ && can_create_pseudo_p ())
+ {
+ rtx dst;
+ switch (GET_MODE (SUBREG_REG (op1)))
+ {
+ case E_V4SFmode:
+ case E_V8SFmode:
+ case E_V16SFmode:
+ case E_V2DFmode:
+ case E_V4DFmode:
+ case E_V8DFmode:
+ dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
+ if (GET_CODE (op2) == CONST_VECTOR)
+ {
+ op2 = gen_lowpart (GET_MODE (dst), op2);
+ op2 = force_reg (GET_MODE (dst), op2);
+ }
+ else
+ {
+ op1 = operands[1];
+ op2 = SUBREG_REG (operands[2]);
+ if (!vector_operand (op2, GET_MODE (dst)))
+ op2 = force_reg (GET_MODE (dst), op2);
+ }
+ op1 = SUBREG_REG (op1);
+ if (!vector_operand (op1, GET_MODE (dst)))
+ op1 = force_reg (GET_MODE (dst), op1);
+ emit_insn (gen_rtx_SET (dst,
+ gen_rtx_fmt_ee (code, GET_MODE (dst),
+ op1, op2)));
+ emit_move_insn (operands[0], gen_lowpart (mode, dst));
+ return;
+ default:
+ break;
+ }
+ }
+ if (!vector_operand (operands[1], mode))
+ operands[1] = force_reg (mode, operands[1]);
+ if (!vector_operand (operands[2], mode))
+ operands[2] = force_reg (mode, operands[2]);
+ ix86_fixup_binary_operands_no_copy (code, mode, operands);
+ emit_insn (gen_rtx_SET (operands[0],
+ gen_rtx_fmt_ee (code, mode, operands[1],
+ operands[2])));
+}
+
+/* Return TRUE or FALSE depending on whether the binary operator meets the
+ appropriate constraints. */
+
+bool
+ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
+ rtx operands[3])
+{
+ rtx dst = operands[0];
+ rtx src1 = operands[1];
+ rtx src2 = operands[2];
+
+ /* Both source operands cannot be in memory. */
+ if (MEM_P (src1) && MEM_P (src2))
+ return false;
+
+ /* Canonicalize operand order for commutative operators. */
+ if (ix86_swap_binary_operands_p (code, mode, operands))
+ std::swap (src1, src2);
+
+ /* If the destination is memory, we must have a matching source operand. */
+ if (MEM_P (dst) && !rtx_equal_p (dst, src1))
+ return false;
+
+ /* Source 1 cannot be a constant. */
+ if (CONSTANT_P (src1))
+ return false;
+
+ /* Source 1 cannot be a non-matching memory. */
+ if (MEM_P (src1) && !rtx_equal_p (dst, src1))
+ /* Support "andhi/andsi/anddi" as a zero-extending move. */
+ return (code == AND
+ && (mode == HImode
+ || mode == SImode
+ || (TARGET_64BIT && mode == DImode))
+ && satisfies_constraint_L (src2));
+
+ return true;
+}
+
+/* Attempt to expand a unary operator. Make the expansion closer to the
+ actual machine than just general_operand, which would allow two separate
+ memory references (one output, one input) in a single insn. */
+
+void
+ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
+ rtx operands[])
+{
+ bool matching_memory = false;
+ rtx src, dst, op, clob;
+
+ dst = operands[0];
+ src = operands[1];
+
+ /* If the destination is memory, and we do not have matching source
+ operands, do things in registers. */
+ if (MEM_P (dst))
+ {
+ if (rtx_equal_p (dst, src))
+ matching_memory = true;
+ else
+ dst = gen_reg_rtx (mode);
+ }
+
+ /* When source operand is memory, destination must match. */
+ if (MEM_P (src) && !matching_memory)
+ src = force_reg (mode, src);
+
+ /* Emit the instruction. */
+
+ op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
+
+ if (code == NOT)
+ emit_insn (op);
+ else
+ {
+ clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
+ emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
+ }
+
+ /* Fix up the destination if needed. */
+ if (dst != operands[0])
+ emit_move_insn (operands[0], dst);
+}
+
+/* Predict the just-emitted jump instruction to be taken with probability PROB. */
+
+static void
+predict_jump (int prob)
+{
+ rtx_insn *insn = get_last_insn ();
+ gcc_assert (JUMP_P (insn));
+ add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
+}
+
+/* Split a 32-bit/64-bit divmod so that an 8-bit unsigned divmod is used
+ at run time when both the dividend and the divisor are within the
+ range [0-255]. */
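+/* A rough sketch of the run-time sequence emitted below, for SImode
+ operands (the label names are illustrative only):
+
+ mov dividend, scratch
+ or divisor, scratch
+ test $-0x100, scratch
+ je .Lqimode ; both values fit in 8 bits
+ <full-width divmod>
+ jmp .Lend
+ .Lqimode:
+ <8-bit unsigned divmod> ; AL = quotient, AH = remainder
+ .Lend: */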
+
+void
+ix86_split_idivmod (machine_mode mode, rtx operands[],
+ bool signed_p)
+{
+ rtx_code_label *end_label, *qimode_label;
+ rtx div, mod;
+ rtx_insn *insn;
+ rtx scratch, tmp0, tmp1, tmp2;
+ rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
+ rtx (*gen_zero_extend) (rtx, rtx);
+ rtx (*gen_test_ccno_1) (rtx, rtx);
+
+ switch (mode)
+ {
+ case E_SImode:
+ if (GET_MODE (operands[0]) == SImode)
+ {
+ if (GET_MODE (operands[1]) == SImode)
+ gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
+ else
+ gen_divmod4_1
+ = signed_p ? gen_divmodsi4_zext_2 : gen_udivmodsi4_zext_2;
+ gen_zero_extend = gen_zero_extendqisi2;
+ }
+ else
+ {
+ gen_divmod4_1
+ = signed_p ? gen_divmodsi4_zext_1 : gen_udivmodsi4_zext_1;
+ gen_zero_extend = gen_zero_extendqidi2;
+ }
+ gen_test_ccno_1 = gen_testsi_ccno_1;
+ break;
+ case E_DImode:
+ gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
+ gen_test_ccno_1 = gen_testdi_ccno_1;
+ gen_zero_extend = gen_zero_extendqidi2;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ end_label = gen_label_rtx ();
+ qimode_label = gen_label_rtx ();
+
+ scratch = gen_reg_rtx (mode);
+
+ /* Use 8-bit unsigned divmod if the dividend and divisor are within
+ the range [0-255]. */
+ emit_move_insn (scratch, operands[2]);
+ scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
+ scratch, 1, OPTAB_DIRECT);
+ emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
+ tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
+ tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
+ tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
+ gen_rtx_LABEL_REF (VOIDmode, qimode_label),
+ pc_rtx);
+ insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
+ predict_jump (REG_BR_PROB_BASE * 50 / 100);
+ JUMP_LABEL (insn) = qimode_label;
+
+ /* Generate the original signed/unsigned divmod. */
+ div = gen_divmod4_1 (operands[0], operands[1],
+ operands[2], operands[3]);
+ emit_insn (div);
+
+ /* Branch to the end. */
+ emit_jump_insn (gen_jump (end_label));
+ emit_barrier ();
+
+ /* Generate 8bit unsigned divide. */
+ emit_label (qimode_label);
+ /* Don't use operands[0] for result of 8bit divide since not all
+ registers support QImode ZERO_EXTRACT. */
+ tmp0 = lowpart_subreg (HImode, scratch, mode);
+ tmp1 = lowpart_subreg (HImode, operands[2], mode);
+ tmp2 = lowpart_subreg (QImode, operands[3], mode);
+ emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
+
+ if (signed_p)
+ {
+ div = gen_rtx_DIV (mode, operands[2], operands[3]);
+ mod = gen_rtx_MOD (mode, operands[2], operands[3]);
+ }
+ else
+ {
+ div = gen_rtx_UDIV (mode, operands[2], operands[3]);
+ mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
+ }
+ if (mode == SImode)
+ {
+ if (GET_MODE (operands[0]) != SImode)
+ div = gen_rtx_ZERO_EXTEND (DImode, div);
+ if (GET_MODE (operands[1]) != SImode)
+ mod = gen_rtx_ZERO_EXTEND (DImode, mod);
+ }
+
+ /* Extract remainder from AH. */
+ tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]),
+ tmp0, GEN_INT (8), GEN_INT (8));
+ if (REG_P (operands[1]))
+ insn = emit_move_insn (operands[1], tmp1);
+ else
+ {
+ /* Need a new scratch register since the old one holds the result
+ of the 8-bit divide. */
+ scratch = gen_reg_rtx (GET_MODE (operands[1]));
+ emit_move_insn (scratch, tmp1);
+ insn = emit_move_insn (operands[1], scratch);
+ }
+ set_unique_reg_note (insn, REG_EQUAL, mod);
+
+ /* Zero extend quotient from AL. */
+ tmp1 = gen_lowpart (QImode, tmp0);
+ insn = emit_insn (gen_zero_extend (operands[0], tmp1));
+ set_unique_reg_note (insn, REG_EQUAL, div);
+
+ emit_label (end_label);
+}
+
+/* Emit the x86 binary operator CODE in mode MODE, where the first operand
+ matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
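+/* For instance, ix86_emit_binop (PLUS, SImode, dst, src) emits
+ (parallel [(set dst (plus:SI dst src))
+ (clobber (reg:CC FLAGS_REG))]). */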
+
+void
+ix86_emit_binop (enum rtx_code code, machine_mode mode,
+ rtx dst, rtx src)
+{
+ rtx op, clob;
+
+ op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
+ clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
+
+ emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
+}
+
+/* Return true if the definition of REGNO1 is nearer to INSN than the
+ definition of REGNO2, searching backward within INSN's basic block. */
+
+static bool
+find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
+{
+ rtx_insn *prev = insn;
+ rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
+
+ if (insn == start)
+ return false;
+ while (prev && prev != start)
+ {
+ if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
+ {
+ prev = PREV_INSN (prev);
+ continue;
+ }
+ if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
+ return true;
+ else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
+ return false;
+ prev = PREV_INSN (prev);
+ }
+
+ /* None of the regs is defined in the bb. */
+ return false;
+}
+
+/* Split an lea instruction into a sequence of instructions that are
+ executed on the ALU, to avoid AGU stalls. It is assumed that the
+ flags register may be clobbered at the lea position. */
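+/* For example, an address like base + index*4 + 4 may be rewritten as
+ (illustrative assembly)
+ mov index, dest
+ shl $2, dest
+ add base, dest
+ add $4, dest
+ instead of being computed by a single lea. */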
+
+void
+ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
+{
+ unsigned int regno0, regno1, regno2;
+ struct ix86_address parts;
+ rtx target, tmp;
+ int ok, adds;
+
+ ok = ix86_decompose_address (operands[1], &parts);
+ gcc_assert (ok);
+
+ target = gen_lowpart (mode, operands[0]);
+
+ regno0 = true_regnum (target);
+ regno1 = INVALID_REGNUM;
+ regno2 = INVALID_REGNUM;
+
+ if (parts.base)
+ {
+ parts.base = gen_lowpart (mode, parts.base);
+ regno1 = true_regnum (parts.base);
+ }
+
+ if (parts.index)
+ {
+ parts.index = gen_lowpart (mode, parts.index);
+ regno2 = true_regnum (parts.index);
+ }
+
+ if (parts.disp)
+ parts.disp = gen_lowpart (mode, parts.disp);
+
+ if (parts.scale > 1)
+ {
+ /* Case r1 = r1 + ... */
+ if (regno1 == regno0)
+ {
+ /* A case like r1 = r1 + C * r2 would require a
+ multiplication, which is very expensive.
+ Assume the cost model is wrong if we have
+ such a case here. */
+ gcc_assert (regno2 != regno0);
+
+ for (adds = parts.scale; adds > 0; adds--)
+ ix86_emit_binop (PLUS, mode, target, parts.index);
+ }
+ else
+ {
+ /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
+ if (regno0 != regno2)
+ emit_insn (gen_rtx_SET (target, parts.index));
+
+ /* Use shift for scaling. */
+ ix86_emit_binop (ASHIFT, mode, target,
+ GEN_INT (exact_log2 (parts.scale)));
+
+ if (parts.base)
+ ix86_emit_binop (PLUS, mode, target, parts.base);
+
+ if (parts.disp && parts.disp != const0_rtx)
+ ix86_emit_binop (PLUS, mode, target, parts.disp);
+ }
+ }
+ else if (!parts.base && !parts.index)
+ {
+ gcc_assert(parts.disp);
+ emit_insn (gen_rtx_SET (target, parts.disp));
+ }
+ else
+ {
+ if (!parts.base)
+ {
+ if (regno0 != regno2)
+ emit_insn (gen_rtx_SET (target, parts.index));
+ }
+ else if (!parts.index)
+ {
+ if (regno0 != regno1)
+ emit_insn (gen_rtx_SET (target, parts.base));
+ }
+ else
+ {
+ if (regno0 == regno1)
+ tmp = parts.index;
+ else if (regno0 == regno2)
+ tmp = parts.base;
+ else
+ {
+ rtx tmp1;
+
+ /* Find better operand for SET instruction, depending
+ on which definition is farther from the insn. */
+ if (find_nearest_reg_def (insn, regno1, regno2))
+ tmp = parts.index, tmp1 = parts.base;
+ else
+ tmp = parts.base, tmp1 = parts.index;
+
+ emit_insn (gen_rtx_SET (target, tmp));
+
+ if (parts.disp && parts.disp != const0_rtx)
+ ix86_emit_binop (PLUS, mode, target, parts.disp);
+
+ ix86_emit_binop (PLUS, mode, target, tmp1);
+ return;
+ }
+
+ ix86_emit_binop (PLUS, mode, target, tmp);
+ }
+
+ if (parts.disp && parts.disp != const0_rtx)
+ ix86_emit_binop (PLUS, mode, target, parts.disp);
+ }
+}
+
+/* Post-reload splitter for converting an SF or DFmode value in an
+ SSE register into an unsigned SImode. */
+
+void
+ix86_split_convert_uns_si_sse (rtx operands[])
+{
+ machine_mode vecmode;
+ rtx value, large, zero_or_two31, input, two31, x;
+
+ large = operands[1];
+ zero_or_two31 = operands[2];
+ input = operands[3];
+ two31 = operands[4];
+ vecmode = GET_MODE (large);
+ value = gen_rtx_REG (vecmode, REGNO (operands[0]));
+
+ /* Load up the value into the low element. We must ensure that the other
+ elements are valid floats -- zero is the easiest such value. */
+ if (MEM_P (input))
+ {
+ if (vecmode == V4SFmode)
+ emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
+ else
+ emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
+ }
+ else
+ {
+ input = gen_rtx_REG (vecmode, REGNO (input));
+ emit_move_insn (value, CONST0_RTX (vecmode));
+ if (vecmode == V4SFmode)
+ emit_insn (gen_sse_movss (value, value, input));
+ else
+ emit_insn (gen_sse2_movsd (value, value, input));
+ }
+
+ emit_move_insn (large, two31);
+ emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
+
+ x = gen_rtx_fmt_ee (LE, vecmode, large, value);
+ emit_insn (gen_rtx_SET (large, x));
+
+ x = gen_rtx_AND (vecmode, zero_or_two31, large);
+ emit_insn (gen_rtx_SET (zero_or_two31, x));
+
+ x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
+ emit_insn (gen_rtx_SET (value, x));
+
+ large = gen_rtx_REG (V4SImode, REGNO (large));
+ emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
+
+ x = gen_rtx_REG (V4SImode, REGNO (value));
+ if (vecmode == V4SFmode)
+ emit_insn (gen_fix_truncv4sfv4si2 (x, value));
+ else
+ emit_insn (gen_sse2_cvttpd2dq (x, value));
+ value = x;
+
+ emit_insn (gen_xorv4si3 (value, value, large));
+}
+
+static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
+ machine_mode mode, rtx target,
+ rtx var, int one_var);
+
+/* Convert an unsigned DImode value into a DFmode, using only SSE.
+ Expects the 64-bit DImode to be supplied in a pair of integral
+ registers. Requires SSE2; will use SSE3 if available. For x86_32,
+ -mfpmath=sse, !optimize_size only. */
+
+void
+ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
+{
+ REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
+ rtx int_xmm, fp_xmm;
+ rtx biases, exponents;
+ rtx x;
+
+ int_xmm = gen_reg_rtx (V4SImode);
+ if (TARGET_INTER_UNIT_MOVES_TO_VEC)
+ emit_insn (gen_movdi_to_sse (int_xmm, input));
+ else if (TARGET_SSE_SPLIT_REGS)
+ {
+ emit_clobber (int_xmm);
+ emit_move_insn (gen_lowpart (DImode, int_xmm), input);
+ }
+ else
+ {
+ x = gen_reg_rtx (V2DImode);
+ ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
+ emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
+ }
+
+ x = gen_rtx_CONST_VECTOR (V4SImode,
+ gen_rtvec (4, GEN_INT (0x43300000UL),
+ GEN_INT (0x45300000UL),
+ const0_rtx, const0_rtx));
+ exponents = validize_mem (force_const_mem (V4SImode, x));
+
+ /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
+ emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
+
+ /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_lo_xmm)
+ yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
+ Similarly (0x45300000UL ## fp_value_hi_xmm) yields
+ (0x1.0p84 + double(fp_value_hi_xmm)).
+ Note these exponents differ by 32. */
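+ /* For example, for input hi:lo = 5:7, subtracting the biases below
+ leaves (0x1.0p52 + 7.0) - 0x1.0p52 = 7.0 in one lane and
+ (0x1.0p84 + 5.0 * 0x1.0p32) - 0x1.0p84 = 5.0 * 0x1.0p32 in the
+ other, and the final add produces 5 * 2**32 + 7 exactly. */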
+
+ fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
+
+ /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
+ in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
+ real_ldexp (&bias_lo_rvt, &dconst1, 52);
+ real_ldexp (&bias_hi_rvt, &dconst1, 84);
+ biases = const_double_from_real_value (bias_lo_rvt, DFmode);
+ x = const_double_from_real_value (bias_hi_rvt, DFmode);
+ biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
+ biases = validize_mem (force_const_mem (V2DFmode, biases));
+ emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
+
+ /* Add the upper and lower DFmode values together. */
+ if (TARGET_SSE3)
+ emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
+ else
+ {
+ x = copy_to_mode_reg (V2DFmode, fp_xmm);
+ emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
+ emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
+ }
+
+ ix86_expand_vector_extract (false, target, fp_xmm, 0);
+}
+
+/* Not used, but eases macroization of patterns. */
+void
+ix86_expand_convert_uns_sixf_sse (rtx, rtx)
+{
+ gcc_unreachable ();
+}
+
+/* Convert an unsigned SImode value into a DFmode. Only currently used
+ for SSE, but applicable anywhere. */
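+/* A worked example of the trick used below: for input 0xffffffff the
+ PLUS of -0x80000000 wraps to 0x7fffffff, which converts as a signed
+ value to 2147483647.0; adding 0x1.0p31 back gives 4294967295.0. For
+ input 0 the intermediate value is -2147483648.0 and the result is 0.0. */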
+
+void
+ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
+{
+ REAL_VALUE_TYPE TWO31r;
+ rtx x, fp;
+
+ x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
+ NULL, 1, OPTAB_DIRECT);
+
+ fp = gen_reg_rtx (DFmode);
+ emit_insn (gen_floatsidf2 (fp, x));
+
+ real_ldexp (&TWO31r, &dconst1, 31);
+ x = const_double_from_real_value (TWO31r, DFmode);
+
+ x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
+ if (x != target)
+ emit_move_insn (target, x);
+}
+
+/* Convert a signed DImode value into a DFmode. Only used for SSE in
+ 32-bit mode; otherwise we have a direct convert instruction. */
+
+void
+ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
+{
+ REAL_VALUE_TYPE TWO32r;
+ rtx fp_lo, fp_hi, x;
+
+ fp_lo = gen_reg_rtx (DFmode);
+ fp_hi = gen_reg_rtx (DFmode);
+
+ emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
+
+ real_ldexp (&TWO32r, &dconst1, 32);
+ x = const_double_from_real_value (TWO32r, DFmode);
+ fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
+
+ ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
+
+ x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
+ 0, OPTAB_DIRECT);
+ if (x != target)
+ emit_move_insn (target, x);
+}
+
+/* Convert an unsigned SImode value into a SFmode, using only SSE.
+ For x86_32, -mfpmath=sse, !optimize_size only. */
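+/* The input is split into 16-bit halves, each of which is exactly
+ representable in SFmode; e.g. 0x89abcdef is computed as
+ float (0x89ab) * 0x1.0p16 + float (0xcdef), so the only rounding
+ happens in the final addition. */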
+void
+ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
+{
+ REAL_VALUE_TYPE ONE16r;
+ rtx fp_hi, fp_lo, int_hi, int_lo, x;
+
+ real_ldexp (&ONE16r, &dconst1, 16);
+ x = const_double_from_real_value (ONE16r, SFmode);
+ int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
+ NULL, 0, OPTAB_DIRECT);
+ int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
+ NULL, 0, OPTAB_DIRECT);
+ fp_hi = gen_reg_rtx (SFmode);
+ fp_lo = gen_reg_rtx (SFmode);
+ emit_insn (gen_floatsisf2 (fp_hi, int_hi));
+ emit_insn (gen_floatsisf2 (fp_lo, int_lo));
+ fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
+ 0, OPTAB_DIRECT);
+ fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
+ 0, OPTAB_DIRECT);
+ if (!rtx_equal_p (target, fp_hi))
+ emit_move_insn (target, fp_hi);
+}
+
+/* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
+ a vector of unsigned ints VAL to vector of floats TARGET. */
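+/* Each lane is handled just like the scalar case in
+ ix86_expand_convert_uns_sisf_sse above:
+ float (val >> 16) * 0x1.0p16 + float (val & 0xffff). */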
+
+void
+ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
+{
+ rtx tmp[8];
+ REAL_VALUE_TYPE TWO16r;
+ machine_mode intmode = GET_MODE (val);
+ machine_mode fltmode = GET_MODE (target);
+ rtx (*cvt) (rtx, rtx);
+
+ if (intmode == V4SImode)
+ cvt = gen_floatv4siv4sf2;
+ else
+ cvt = gen_floatv8siv8sf2;
+ tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
+ tmp[0] = force_reg (intmode, tmp[0]);
+ tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
+ OPTAB_DIRECT);
+ tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
+ NULL_RTX, 1, OPTAB_DIRECT);
+ tmp[3] = gen_reg_rtx (fltmode);
+ emit_insn (cvt (tmp[3], tmp[1]));
+ tmp[4] = gen_reg_rtx (fltmode);
+ emit_insn (cvt (tmp[4], tmp[2]));
+ real_ldexp (&TWO16r, &dconst1, 16);
+ tmp[5] = const_double_from_real_value (TWO16r, SFmode);
+ tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
+ tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
+ OPTAB_DIRECT);
+ tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
+ OPTAB_DIRECT);
+ if (tmp[7] != target)
+ emit_move_insn (target, tmp[7]);
+}
+
+/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
+ pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
+ This is done by doing just signed conversion if < 0x1p31, and otherwise by
+ subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
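+/* For example, a lane holding 3000000000.0 (>= 0x1.0p31) becomes
+ 3000000000.0 - 0x1.0p31 = 852516352.0, which the signed conversion
+ turns into 852516352, and xoring in 0x80000000 afterwards restores
+ 3000000000; lanes below 0x1.0p31 get a zero xor mask and are left
+ unchanged. */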
+
+rtx
+ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
+{
+ REAL_VALUE_TYPE TWO31r;
+ rtx two31r, tmp[4];
+ machine_mode mode = GET_MODE (val);
+ machine_mode scalarmode = GET_MODE_INNER (mode);
+ machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
+ rtx (*cmp) (rtx, rtx, rtx, rtx);
+ int i;
+
+ for (i = 0; i < 3; i++)
+ tmp[i] = gen_reg_rtx (mode);
+ real_ldexp (&TWO31r, &dconst1, 31);
+ two31r = const_double_from_real_value (TWO31r, scalarmode);
+ two31r = ix86_build_const_vector (mode, 1, two31r);
+ two31r = force_reg (mode, two31r);
+ switch (mode)
+ {
+ case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
+ case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
+ case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
+ case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
+ default: gcc_unreachable ();
+ }
+ tmp[3] = gen_rtx_LE (mode, two31r, val);
+ emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
+ tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
+ 0, OPTAB_DIRECT);
+ if (intmode == V4SImode || TARGET_AVX2)
+ *xorp = expand_simple_binop (intmode, ASHIFT,
+ gen_lowpart (intmode, tmp[0]),
+ GEN_INT (31), NULL_RTX, 0,
+ OPTAB_DIRECT);
+ else
+ {
+ rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
+ two31 = ix86_build_const_vector (intmode, 1, two31);
+ *xorp = expand_simple_binop (intmode, AND,
+ gen_lowpart (intmode, tmp[0]),
+ two31, NULL_RTX, 0,
+ OPTAB_DIRECT);
+ }
+ return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
+ 0, OPTAB_DIRECT);
+}
+
+/* Generate code for floating point ABS or NEG. */
+
+void
+ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
+ rtx operands[])
+{
+ rtx mask, set, dst, src;
+ bool use_sse = false;
+ bool vector_mode = VECTOR_MODE_P (mode);
+ machine_mode vmode = mode;
+
+ if (vector_mode)
+ use_sse = true;
+ else if (mode == TFmode)
+ use_sse = true;
+ else if (TARGET_SSE_MATH)
+ {
+ use_sse = SSE_FLOAT_MODE_P (mode);
+ if (mode == SFmode)
+ vmode = V4SFmode;
+ else if (mode == DFmode)
+ vmode = V2DFmode;
+ }
+
+ /* NEG and ABS performed with SSE use bitwise mask operations.
+ Create the appropriate mask now. */
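+ /* For DFmode, for instance, NEG is performed roughly as an xorpd with
+ a { -0.0, -0.0 } mask (only the sign bits set), while ABS uses an
+ andpd with the complemented mask. */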
+ if (use_sse)
+ mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
+ else
+ mask = NULL_RTX;
+
+ dst = operands[0];
+ src = operands[1];
+
+ set = gen_rtx_fmt_e (code, mode, src);
+ set = gen_rtx_SET (dst, set);
+
+ if (mask)
+ {
+ rtx use, clob;
+ rtvec par;
+
+ use = gen_rtx_USE (VOIDmode, mask);
+ if (vector_mode)
+ par = gen_rtvec (2, set, use);
+ else
+ {
+ clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
+ par = gen_rtvec (3, set, use, clob);
+ }
+ emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
+ }
+ else
+ emit_insn (set);
+}
+
+/* Expand a copysign operation. Special case operand 0 being a constant. */
+
+void
+ix86_expand_copysign (rtx operands[])
+{
+ machine_mode mode, vmode;
+ rtx dest, op0, op1, mask, nmask;
+
+ dest = operands[0];
+ op0 = operands[1];
+ op1 = operands[2];
+
+ mode = GET_MODE (dest);
+
+ if (mode == SFmode)
+ vmode = V4SFmode;
+ else if (mode == DFmode)
+ vmode = V2DFmode;
+ else
+ vmode = mode;
+
+ if (CONST_DOUBLE_P (op0))
+ {
+ rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
+
+ if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
+ op0 = simplify_unary_operation (ABS, mode, op0, mode);
+
+ if (mode == SFmode || mode == DFmode)
+ {
+ if (op0 == CONST0_RTX (mode))
+ op0 = CONST0_RTX (vmode);
+ else
+ {
+ rtx v = ix86_build_const_vector (vmode, false, op0);
+
+ op0 = force_reg (vmode, v);
+ }
+ }
+ else if (op0 != CONST0_RTX (mode))
+ op0 = force_reg (mode, op0);
+
+ mask = ix86_build_signbit_mask (vmode, 0, 0);
+
+ if (mode == SFmode)
+ copysign_insn = gen_copysignsf3_const;
+ else if (mode == DFmode)
+ copysign_insn = gen_copysigndf3_const;
+ else
+ copysign_insn = gen_copysigntf3_const;
+
+ emit_insn (copysign_insn (dest, op0, op1, mask));
+ }
+ else
+ {
+ rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
+
+ nmask = ix86_build_signbit_mask (vmode, 0, 1);
+ mask = ix86_build_signbit_mask (vmode, 0, 0);
+
+ if (mode == SFmode)
+ copysign_insn = gen_copysignsf3_var;
+ else if (mode == DFmode)
+ copysign_insn = gen_copysigndf3_var;
+ else
+ copysign_insn = gen_copysigntf3_var;
+
+ emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
+ }
+}
+
+/* Deconstruct a copysign operation into bit masks. Operand 0 is known to
+ be a constant, and so has already been expanded into a vector constant. */
+
+void
+ix86_split_copysign_const (rtx operands[])
+{
+ machine_mode mode, vmode;
+ rtx dest, op0, mask, x;
+
+ dest = operands[0];
+ op0 = operands[1];
+ mask = operands[3];
+
+ mode = GET_MODE (dest);
+ vmode = GET_MODE (mask);
+
+ dest = lowpart_subreg (vmode, dest, mode);
+ x = gen_rtx_AND (vmode, dest, mask);
+ emit_insn (gen_rtx_SET (dest, x));
+
+ if (op0 != CONST0_RTX (vmode))
+ {
+ x = gen_rtx_IOR (vmode, dest, op0);
+ emit_insn (gen_rtx_SET (dest, x));
+ }
+}
+
+/* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
+ so we have to do two masks. */
+
+void
+ix86_split_copysign_var (rtx operands[])
+{
+ machine_mode mode, vmode;
+ rtx dest, scratch, op0, op1, mask, nmask, x;
+
+ dest = operands[0];
+ scratch = operands[1];
+ op0 = operands[2];
+ op1 = operands[3];
+ nmask = operands[4];
+ mask = operands[5];
+
+ mode = GET_MODE (dest);
+ vmode = GET_MODE (mask);
+
+ if (rtx_equal_p (op0, op1))
+ {
+ /* Shouldn't happen often (it's useless, obviously), but when it does
+ we'd generate incorrect code if we continue below. */
+ emit_move_insn (dest, op0);
+ return;
+ }
+
+ if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
+ {
+ gcc_assert (REGNO (op1) == REGNO (scratch));
+
+ x = gen_rtx_AND (vmode, scratch, mask);
+ emit_insn (gen_rtx_SET (scratch, x));
+
+ dest = mask;
+ op0 = lowpart_subreg (vmode, op0, mode);
+ x = gen_rtx_NOT (vmode, dest);
+ x = gen_rtx_AND (vmode, x, op0);
+ emit_insn (gen_rtx_SET (dest, x));
+ }
+ else
+ {
+ if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
+ {
+ x = gen_rtx_AND (vmode, scratch, mask);
+ }
+ else /* alternative 2,4 */
+ {
+ gcc_assert (REGNO (mask) == REGNO (scratch));
+ op1 = lowpart_subreg (vmode, op1, mode);
+ x = gen_rtx_AND (vmode, scratch, op1);
+ }
+ emit_insn (gen_rtx_SET (scratch, x));
+
+ if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
+ {
+ dest = lowpart_subreg (vmode, op0, mode);
+ x = gen_rtx_AND (vmode, dest, nmask);
+ }
+ else /* alternative 3,4 */
+ {
+ gcc_assert (REGNO (nmask) == REGNO (dest));
+ dest = nmask;
+ op0 = lowpart_subreg (vmode, op0, mode);
+ x = gen_rtx_AND (vmode, dest, op0);
+ }
+ emit_insn (gen_rtx_SET (dest, x));
+ }
+
+ x = gen_rtx_IOR (vmode, dest, scratch);
+ emit_insn (gen_rtx_SET (dest, x));
+}
+
+/* Expand an xorsign operation. */
+
+void
+ix86_expand_xorsign (rtx operands[])
+{
+ rtx (*xorsign_insn)(rtx, rtx, rtx, rtx);
+ machine_mode mode, vmode;
+ rtx dest, op0, op1, mask;
+
+ dest = operands[0];
+ op0 = operands[1];
+ op1 = operands[2];
+
+ mode = GET_MODE (dest);
+
+ if (mode == SFmode)
+ {
+ xorsign_insn = gen_xorsignsf3_1;
+ vmode = V4SFmode;
+ }
+ else if (mode == DFmode)
+ {
+ xorsign_insn = gen_xorsigndf3_1;
+ vmode = V2DFmode;
+ }
+ else
+ gcc_unreachable ();
+
+ mask = ix86_build_signbit_mask (vmode, 0, 0);
+
+ emit_insn (xorsign_insn (dest, op0, op1, mask));
+}
+
+/* Deconstruct an xorsign operation into bit masks. */
+
+void
+ix86_split_xorsign (rtx operands[])
+{
+ machine_mode mode, vmode;
+ rtx dest, op0, mask, x;
+
+ dest = operands[0];
+ op0 = operands[1];
+ mask = operands[3];
+
+ mode = GET_MODE (dest);
+ vmode = GET_MODE (mask);
+
+ dest = lowpart_subreg (vmode, dest, mode);
+ x = gen_rtx_AND (vmode, dest, mask);
+ emit_insn (gen_rtx_SET (dest, x));
+
+ op0 = lowpart_subreg (vmode, op0, mode);
+ x = gen_rtx_XOR (vmode, dest, op0);
+ emit_insn (gen_rtx_SET (dest, x));
+}
+
+static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
+
+void
+ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
+{
+ machine_mode mode = GET_MODE (op0);
+ rtx tmp;
+
+ /* Handle the special case of a vector comparison with a boolean result;
+ transform it using the ptest instruction. */
+ if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+ {
+ rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
+ machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
+
+ gcc_assert (code == EQ || code == NE);
+ /* Generate an XOR, since we can't check that one operand is the zero vector. */
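+ /* ptest sets ZF iff the AND of its operands is all zeros; with both
+ operands equal to op0 ^ op1, ZF=1 exactly when op0 == op1, which is
+ what the EQ/NE jump below tests. */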
+ tmp = gen_reg_rtx (mode);
+ emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
+ tmp = gen_lowpart (p_mode, tmp);
+ emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
+ gen_rtx_UNSPEC (CCmode,
+ gen_rtvec (2, tmp, tmp),
+ UNSPEC_PTEST)));
+ tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
+ tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
+ gen_rtx_LABEL_REF (VOIDmode, label),
+ pc_rtx);
+ emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
+ return;
+ }
+
+ switch (mode)
+ {
+ case E_SFmode:
+ case E_DFmode:
+ case E_XFmode:
+ case E_QImode:
+ case E_HImode:
+ case E_SImode:
+ simple:
+ tmp = ix86_expand_compare (code, op0, op1);
+ tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
+ gen_rtx_LABEL_REF (VOIDmode, label),
+ pc_rtx);
+ emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
+ return;
+
+ case E_DImode:
+ if (TARGET_64BIT)
+ goto simple;
+ /* For 32-bit targets a DImode comparison may be performed in
+ SSE registers. To allow this we should avoid splitting it
+ into SImode, which is achieved by doing the xor in DImode
+ and then comparing with zero (a form recognized by the
+ STV pass). We don't compare using xor when optimizing
+ for size. */
+ if (!optimize_insn_for_size_p ()
+ && TARGET_STV
+ && (code == EQ || code == NE))
+ {
+ op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
+ op1 = const0_rtx;
+ }
+ /* FALLTHRU */
+ case E_TImode:
+ /* Expand DImode branch into multiple compare+branch. */
+ {
+ rtx lo[2], hi[2];
+ rtx_code_label *label2;
+ enum rtx_code code1, code2, code3;
+ machine_mode submode;
+
+ if (CONSTANT_P (op0) && !CONSTANT_P (op1))
+ {
+ std::swap (op0, op1);
+ code = swap_condition (code);
+ }
+
+ split_double_mode (mode, &op0, 1, lo+0, hi+0);
+ split_double_mode (mode, &op1, 1, lo+1, hi+1);
+
+ submode = mode == DImode ? SImode : DImode;
+
+ /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
+ avoid two branches. This costs one extra insn, so disable when
+ optimizing for size. */
+
+ if ((code == EQ || code == NE)
+ && (!optimize_insn_for_size_p ()
+ || hi[1] == const0_rtx || lo[1] == const0_rtx))
+ {
+ rtx xor0, xor1;
+
+ xor1 = hi[0];
+ if (hi[1] != const0_rtx)
+ xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
+ NULL_RTX, 0, OPTAB_WIDEN);
+
+ xor0 = lo[0];
+ if (lo[1] != const0_rtx)
+ xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
+ NULL_RTX, 0, OPTAB_WIDEN);
+
+ tmp = expand_binop (submode, ior_optab, xor1, xor0,
+ NULL_RTX, 0, OPTAB_WIDEN);
+
+ ix86_expand_branch (code, tmp, const0_rtx, label);
+ return;
+ }
+
+ /* Otherwise, if we are doing less-than or greater-or-equal-than,
+ op1 is a constant and the low word is zero, then we can just
+ examine the high word. Similarly for low word -1 and
+ less-or-equal-than or greater-than. */
+
+ if (CONST_INT_P (hi[1]))
+ switch (code)
+ {
+ case LT: case LTU: case GE: case GEU:
+ if (lo[1] == const0_rtx)
+ {
+ ix86_expand_branch (code, hi[0], hi[1], label);
+ return;
+ }
+ break;
+ case LE: case LEU: case GT: case GTU:
+ if (lo[1] == constm1_rtx)
+ {
+ ix86_expand_branch (code, hi[0], hi[1], label);
+ return;
+ }
+ break;
+ default:
+ break;
+ }
+
+ /* Emulate comparisons that do not depend on Zero flag with
+ double-word subtraction. Note that only Overflow, Sign
+ and Carry flags are valid, so swap arguments and condition
+ of comparisons that would otherwise test Zero flag. */
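+ /* For example, an unsigned a < b test is emitted as a cmp of the low
+ words followed by an sbb of the high words (with the result
+ discarded), after which only the carry flag (CCCmode) is examined. */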
+
+ switch (code)
+ {
+ case LE: case LEU: case GT: case GTU:
+ std::swap (lo[0], lo[1]);
+ std::swap (hi[0], hi[1]);
+ code = swap_condition (code);
+ /* FALLTHRU */
+
+ case LT: case LTU: case GE: case GEU:
+ {
+ rtx (*cmp_insn) (rtx, rtx);
+ rtx (*sbb_insn) (rtx, rtx, rtx);
+ bool uns = (code == LTU || code == GEU);
+
+ if (TARGET_64BIT)
+ {
+ cmp_insn = gen_cmpdi_1;
+ sbb_insn
+ = uns ? gen_subdi3_carry_ccc : gen_subdi3_carry_ccgz;
+ }
+ else
+ {
+ cmp_insn = gen_cmpsi_1;
+ sbb_insn
+ = uns ? gen_subsi3_carry_ccc : gen_subsi3_carry_ccgz;
+ }
+
+ if (!nonimmediate_operand (lo[0], submode))
+ lo[0] = force_reg (submode, lo[0]);
+ if (!x86_64_general_operand (lo[1], submode))
+ lo[1] = force_reg (submode, lo[1]);
+
+ if (!register_operand (hi[0], submode))
+ hi[0] = force_reg (submode, hi[0]);
+ if ((uns && !nonimmediate_operand (hi[1], submode))
+ || (!uns && !x86_64_general_operand (hi[1], submode)))
+ hi[1] = force_reg (submode, hi[1]);
+
+ emit_insn (cmp_insn (lo[0], lo[1]));
+ emit_insn (sbb_insn (gen_rtx_SCRATCH (submode), hi[0], hi[1]));
+
+ tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
+
+ ix86_expand_branch (code, tmp, const0_rtx, label);
+ return;
+ }
+
+ default:
+ break;
+ }
+
+ /* Otherwise, we need two or three jumps. */
+
+ label2 = gen_label_rtx ();
+
+ code1 = code;
+ code2 = swap_condition (code);
+ code3 = unsigned_condition (code);
+
+ switch (code)
+ {
+ case LT: case GT: case LTU: case GTU:
+ break;
+
+ case LE: code1 = LT; code2 = GT; break;
+ case GE: code1 = GT; code2 = LT; break;
+ case LEU: code1 = LTU; code2 = GTU; break;
+ case GEU: code1 = GTU; code2 = LTU; break;
+
+ case EQ: code1 = UNKNOWN; code2 = NE; break;
+ case NE: code2 = UNKNOWN; break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ /*
+ * a < b =>
+ * if (hi(a) < hi(b)) goto true;
+ * if (hi(a) > hi(b)) goto false;
+ * if (lo(a) < lo(b)) goto true;
+ * false:
+ */
+
+ if (code1 != UNKNOWN)
+ ix86_expand_branch (code1, hi[0], hi[1], label);
+ if (code2 != UNKNOWN)
+ ix86_expand_branch (code2, hi[0], hi[1], label2);
+
+ ix86_expand_branch (code3, lo[0], lo[1], label);
+
+ if (code2 != UNKNOWN)
+ emit_label (label2);
+ return;
+ }
+
+ default:
+ gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
+ goto simple;
+ }
+}
+
+/* Figure out whether to use unordered fp comparisons. */
+
+static bool
+ix86_unordered_fp_compare (enum rtx_code code)
+{
+ if (!TARGET_IEEE_FP)
+ return false;
+
+ switch (code)
+ {
+ case GT:
+ case GE:
+ case LT:
+ case LE:
+ return false;
+
+ case EQ:
+ case NE:
+
+ case LTGT:
+ case UNORDERED:
+ case ORDERED:
+ case UNLT:
+ case UNLE:
+ case UNGT:
+ case UNGE:
+ case UNEQ:
+ return true;
+
+ default:
+ gcc_unreachable ();
+ }
+}
+
+/* Return a comparison we can do that is equivalent to
+ swap_condition (code), apart possibly from orderedness.
+ But never change orderedness if TARGET_IEEE_FP, returning
+ UNKNOWN in that case if necessary. */
+
+static enum rtx_code
+ix86_fp_swap_condition (enum rtx_code code)
+{
+ switch (code)
+ {
+ case GT: /* GTU - CF=0 & ZF=0 */
+ return TARGET_IEEE_FP ? UNKNOWN : UNLT;
+ case GE: /* GEU - CF=0 */
+ return TARGET_IEEE_FP ? UNKNOWN : UNLE;
+ case UNLT: /* LTU - CF=1 */
+ return TARGET_IEEE_FP ? UNKNOWN : GT;
+ case UNLE: /* LEU - CF=1 | ZF=1 */
+ return TARGET_IEEE_FP ? UNKNOWN : GE;
+ default:
+ return swap_condition (code);
+ }
+}
+
+/* Return the cost of comparison CODE using the best strategy for
+ performance. All of the following functions use the number of
+ instructions as the cost metric. In the future this should be tweaked
+ to compute bytes for optimize_size and to take into account the
+ performance of various instructions on various CPUs. */
+
+static int
+ix86_fp_comparison_cost (enum rtx_code code)
+{
+ int arith_cost;
+
+ /* The cost of code using bit-twiddling on %ah. */
+ switch (code)
+ {
+ case UNLE:
+ case UNLT:
+ case LTGT:
+ case GT:
+ case GE:
+ case UNORDERED:
+ case ORDERED:
+ case UNEQ:
+ arith_cost = 4;
+ break;
+ case LT:
+ case NE:
+ case EQ:
+ case UNGE:
+ arith_cost = TARGET_IEEE_FP ? 5 : 4;
+ break;
+ case LE:
+ case UNGT:
+ arith_cost = TARGET_IEEE_FP ? 6 : 4;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ switch (ix86_fp_comparison_strategy (code))
+ {
+ case IX86_FPCMP_COMI:
+ return arith_cost > 4 ? 3 : 2;
+ case IX86_FPCMP_SAHF:
+ return arith_cost > 4 ? 4 : 3;
+ default:
+ return arith_cost;
+ }
+}
+
+/* Swap, force into registers, or otherwise massage the two operands
+ of an fp comparison. The operands are updated in place; the new
+ comparison code is returned. */
+
+static enum rtx_code
+ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
+{
+ bool unordered_compare = ix86_unordered_fp_compare (code);
+ rtx op0 = *pop0, op1 = *pop1;
+ machine_mode op_mode = GET_MODE (op0);
+ bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
+
+ /* All of the unordered compare instructions only work on registers.
+ The same is true of the fcomi compare instructions. The XFmode
+ compare instructions require registers except when comparing
+ against zero or when converting operand 1 from fixed point to
+ floating point. */
+
+ if (!is_sse
+ && (unordered_compare
+ || (op_mode == XFmode
+ && ! (standard_80387_constant_p (op0) == 1
+ || standard_80387_constant_p (op1) == 1)
+ && GET_CODE (op1) != FLOAT)
+ || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
+ {
+ op0 = force_reg (op_mode, op0);
+ op1 = force_reg (op_mode, op1);
+ }
+ else
+ {
+ /* %%% We only allow op1 in memory; op0 must be st(0). So swap
+ things around if they appear profitable, otherwise force op0
+ into a register. */
+
+ if (standard_80387_constant_p (op0) == 0
+ || (MEM_P (op0)
+ && ! (standard_80387_constant_p (op1) == 0
+ || MEM_P (op1))))
+ {
+ enum rtx_code new_code = ix86_fp_swap_condition (code);
+ if (new_code != UNKNOWN)
+ {
+ std::swap (op0, op1);
+ code = new_code;
+ }
+ }
+
+ if (!REG_P (op0))
+ op0 = force_reg (op_mode, op0);
+
+ if (CONSTANT_P (op1))
+ {
+ int tmp = standard_80387_constant_p (op1);
+ if (tmp == 0)
+ op1 = validize_mem (force_const_mem (op_mode, op1));
+ else if (tmp == 1)
+ {
+ if (TARGET_CMOVE)
+ op1 = force_reg (op_mode, op1);
+ }
+ else
+ op1 = force_reg (op_mode, op1);
+ }
+ }
+
+ /* Try to rearrange the comparison to make it cheaper. */
+ if (ix86_fp_comparison_cost (code)
+ > ix86_fp_comparison_cost (swap_condition (code))
+ && (REG_P (op1) || can_create_pseudo_p ()))
+ {
+ std::swap (op0, op1);
+ code = swap_condition (code);
+ if (!REG_P (op0))
+ op0 = force_reg (op_mode, op0);
+ }
+
+ *pop0 = op0;
+ *pop1 = op1;
+ return code;
+}
+
+/* Generate insn patterns to do a floating point compare of OPERANDS. */
+
+static rtx
+ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
+{
+ bool unordered_compare = ix86_unordered_fp_compare (code);
+ machine_mode cmp_mode;
+ rtx tmp, scratch;
+
+ code = ix86_prepare_fp_compare_args (code, &op0, &op1);
+
+ tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
+ if (unordered_compare)
+ tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
+
+ /* Do fcomi/sahf based test when profitable. */
+ switch (ix86_fp_comparison_strategy (code))
+ {
+ case IX86_FPCMP_COMI:
+ cmp_mode = CCFPmode;
+ emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
+ break;
+
+ case IX86_FPCMP_SAHF:
+ cmp_mode = CCFPmode;
+ tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
+ scratch = gen_reg_rtx (HImode);
+ emit_insn (gen_rtx_SET (scratch, tmp));
+ emit_insn (gen_x86_sahf_1 (scratch));
+ break;
+
+ case IX86_FPCMP_ARITH:
+ cmp_mode = CCNOmode;
+ tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
+ scratch = gen_reg_rtx (HImode);
+ emit_insn (gen_rtx_SET (scratch, tmp));
+
+ /* In the unordered case, we have to check C2 for NaNs, which
+ doesn't happen to work out to anything nice combination-wise.
+ So do some bit twiddling on the value we've got in AH to come
+ up with an appropriate set of condition codes. */
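+ /* As background for the magic constants below (a summary of the x87
+ status word as far as we rely on it): after fnstsw, %ah holds C0 in
+ bit 0 (0x01), C2 in bit 2 (0x04) and C3 in bit 6 (0x40), so 0x45
+ tests C3|C2|C0. The compare leaves all three clear for ">", sets C0
+ for "<", C3 for "==" and all three for an unordered result. */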
+
+ switch (code)
+ {
+ case GT:
+ case UNGT:
+ if (code == GT || !TARGET_IEEE_FP)
+ {
+ emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
+ code = EQ;
+ }
+ else
+ {
+ emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
+ emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
+ emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
+ cmp_mode = CCmode;
+ code = GEU;
+ }
+ break;
+ case LT:
+ case UNLT:
+ if (code == LT && TARGET_IEEE_FP)
+ {
+ emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
+ emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
+ cmp_mode = CCmode;
+ code = EQ;
+ }
+ else
+ {
+ emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
+ code = NE;
+ }
+ break;
+ case GE:
+ case UNGE:
+ if (code == GE || !TARGET_IEEE_FP)
+ {
+ emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
+ code = EQ;
+ }
+ else
+ {
+ emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
+ emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
+ code = NE;
+ }
+ break;
+ case LE:
+ case UNLE:
+ if (code == LE && TARGET_IEEE_FP)
+ {
+ emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
+ emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
+ emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
+ cmp_mode = CCmode;
+ code = LTU;
+ }
+ else
+ {
+ emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
+ code = NE;
+ }
+ break;
+ case EQ:
+ case UNEQ:
+ if (code == EQ && TARGET_IEEE_FP)
+ {
+ emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
+ emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
+ cmp_mode = CCmode;
+ code = EQ;
+ }
+ else
+ {
+ emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
+ code = NE;
+ }
+ break;
+ case NE:
+ case LTGT:
+ if (code == NE && TARGET_IEEE_FP)
+ {
+ emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
+ emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
+ GEN_INT (0x40)));
+ code = NE;
+ }
+ else
+ {
+ emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
+ code = EQ;
+ }
+ break;
+
+ case UNORDERED:
+ emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
+ code = NE;
+ break;
+ case ORDERED:
+ emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
+ code = EQ;
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ /* Return the test that should be put into the flags user, i.e.
+ the bcc, scc, or cmov instruction. */
+ return gen_rtx_fmt_ee (code, VOIDmode,
+ gen_rtx_REG (cmp_mode, FLAGS_REG),
+ const0_rtx);
+}
+
+/* Generate insn patterns to do an integer compare of OPERANDS. */
+
+static rtx
+ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
+{
+ machine_mode cmpmode;
+ rtx tmp, flags;
+
+ cmpmode = SELECT_CC_MODE (code, op0, op1);
+ flags = gen_rtx_REG (cmpmode, FLAGS_REG);
+
+ /* This is very simple, but making the interface the same as in the
+ FP case makes the rest of the code easier. */
+ tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
+ emit_insn (gen_rtx_SET (flags, tmp));
+
+ /* Return the test that should be put into the flags user, i.e.
+ the bcc, scc, or cmov instruction. */
+ return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
+}
+
+static rtx
+ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
+{
+ rtx ret;
+
+ if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
+ ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
+
+ else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
+ {
+ gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
+ ret = ix86_expand_fp_compare (code, op0, op1);
+ }
+ else
+ ret = ix86_expand_int_compare (code, op0, op1);
+
+ return ret;
+}
+
+void
+ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
+{
+ rtx ret;
+
+ gcc_assert (GET_MODE (dest) == QImode);
+
+ ret = ix86_expand_compare (code, op0, op1);
+ PUT_MODE (ret, QImode);
+ emit_insn (gen_rtx_SET (dest, ret));
+}
+
+/* Expand comparison setting or clearing carry flag. Return true when
+ successful and set pop for the operation. */
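+ /* For illustration: an unsigned "a < b" can be expanded as "cmp a, b",
+ which leaves the answer in the carry flag, where the adc/sbb based
+ sequences built by the callers can consume it without a separate
+ setcc. */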
+static bool
+ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
+{
+ machine_mode mode
+ = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
+
+ /* Do not handle double-mode compares that go through the special path. */
+ if (mode == (TARGET_64BIT ? TImode : DImode))
+ return false;
+
+ if (SCALAR_FLOAT_MODE_P (mode))
+ {
+ rtx compare_op;
+ rtx_insn *compare_seq;
+
+ gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
+
+ /* Shortcut: the following common codes never translate
+ into carry flag compares. */
+ if (code == EQ || code == NE || code == UNEQ || code == LTGT
+ || code == ORDERED || code == UNORDERED)
+ return false;
+
+ /* These comparisons require the zero flag; swap the operands so they won't. */
+ if ((code == GT || code == UNLE || code == LE || code == UNGT)
+ && !TARGET_IEEE_FP)
+ {
+ std::swap (op0, op1);
+ code = swap_condition (code);
+ }
+
+ /* Try to expand the comparison and verify that we end up with
+ a carry flag based comparison. This fails to be true only when
+ we decide to expand the comparison using arithmetic, which is
+ not a common scenario. */
+ start_sequence ();
+ compare_op = ix86_expand_fp_compare (code, op0, op1);
+ compare_seq = get_insns ();
+ end_sequence ();
+
+ if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
+ code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
+ else
+ code = GET_CODE (compare_op);
+
+ if (code != LTU && code != GEU)
+ return false;
+
+ emit_insn (compare_seq);
+ *pop = compare_op;
+ return true;
+ }
+
+ if (!INTEGRAL_MODE_P (mode))
+ return false;
+
+ switch (code)
+ {
+ case LTU:
+ case GEU:
+ break;
+
+ /* Convert a==0 into (unsigned)a<1. */
+ case EQ:
+ case NE:
+ if (op1 != const0_rtx)
+ return false;
+ op1 = const1_rtx;
+ code = (code == EQ ? LTU : GEU);
+ break;
+
+ /* Convert a>b into b<a or a>=b+1. */
+ case GTU:
+ case LEU:
+ if (CONST_INT_P (op1))
+ {
+ op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
+ /* Bail out on overflow. We could still swap the operands, but
+ that would force loading the constant into a register. */
+ if (op1 == const0_rtx
+ || !x86_64_immediate_operand (op1, GET_MODE (op1)))
+ return false;
+ code = (code == GTU ? GEU : LTU);
+ }
+ else
+ {
+ std::swap (op0, op1);
+ code = (code == GTU ? LTU : GEU);
+ }
+ break;
+
+ /* Convert a>=0 into (unsigned)a<0x80000000. */
+ case LT:
+ case GE:
+ if (mode == DImode || op1 != const0_rtx)
+ return false;
+ op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
+ code = (code == LT ? GEU : LTU);
+ break;
+ case LE:
+ case GT:
+ if (mode == DImode || op1 != constm1_rtx)
+ return false;
+ op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
+ code = (code == LE ? GEU : LTU);
+ break;
+
+ default:
+ return false;
+ }
+ /* Swapping operands may cause a constant to appear as the first operand. */
+ if (!nonimmediate_operand (op0, VOIDmode))
+ {
+ if (!can_create_pseudo_p ())
+ return false;
+ op0 = force_reg (mode, op0);
+ }
+ *pop = ix86_expand_compare (code, op0, op1);
+ gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
+ return true;
+}
+
+ /* Expand conditional increment or decrement using adc/sbb instructions.
+ The default case using setcc followed by the conditional move can be
+ done by generic code. */
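+ /* An illustrative target sequence: "x = y + (a < b)" with unsigned
+ operands can become "cmp a, b" followed by an adc of zero into the
+ destination, i.e. the carry produced by the compare feeds directly
+ into the addition. */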
+bool
+ix86_expand_int_addcc (rtx operands[])
+{
+ enum rtx_code code = GET_CODE (operands[1]);
+ rtx flags;
+ rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
+ rtx compare_op;
+ rtx val = const0_rtx;
+ bool fpcmp = false;
+ machine_mode mode;
+ rtx op0 = XEXP (operands[1], 0);
+ rtx op1 = XEXP (operands[1], 1);
+
+ if (operands[3] != const1_rtx
+ && operands[3] != constm1_rtx)
+ return false;
+ if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
+ return false;
+ code = GET_CODE (compare_op);
+
+ flags = XEXP (compare_op, 0);
+
+ if (GET_MODE (flags) == CCFPmode)
+ {
+ fpcmp = true;
+ code = ix86_fp_compare_code_to_integer (code);
+ }
+
+ if (code != LTU)
+ {
+ val = constm1_rtx;
+ if (fpcmp)
+ PUT_CODE (compare_op,
+ reverse_condition_maybe_unordered
+ (GET_CODE (compare_op)));
+ else
+ PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
+ }
+
+ mode = GET_MODE (operands[0]);
+
+ /* Construct either adc or sbb insn. */
+ if ((code == LTU) == (operands[3] == constm1_rtx))
+ {
+ switch (mode)
+ {
+ case E_QImode:
+ insn = gen_subqi3_carry;
+ break;
+ case E_HImode:
+ insn = gen_subhi3_carry;
+ break;
+ case E_SImode:
+ insn = gen_subsi3_carry;
+ break;
+ case E_DImode:
+ insn = gen_subdi3_carry;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ }
+ else
+ {
+ switch (mode)
+ {
+ case E_QImode:
+ insn = gen_addqi3_carry;
+ break;
+ case E_HImode:
+ insn = gen_addhi3_carry;
+ break;
+ case E_SImode:
+ insn = gen_addsi3_carry;
+ break;
+ case E_DImode:
+ insn = gen_adddi3_carry;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ }
+ emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
+
+ return true;
+}
+
+bool
+ix86_expand_int_movcc (rtx operands[])
+{
+ enum rtx_code code = GET_CODE (operands[1]), compare_code;
+ rtx_insn *compare_seq;
+ rtx compare_op;
+ machine_mode mode = GET_MODE (operands[0]);
+ bool sign_bit_compare_p = false;
+ rtx op0 = XEXP (operands[1], 0);
+ rtx op1 = XEXP (operands[1], 1);
+
+ if (GET_MODE (op0) == TImode
+ || (GET_MODE (op0) == DImode
+ && !TARGET_64BIT))
+ return false;
+
+ start_sequence ();
+ compare_op = ix86_expand_compare (code, op0, op1);
+ compare_seq = get_insns ();
+ end_sequence ();
+
+ compare_code = GET_CODE (compare_op);
+
+ if ((op1 == const0_rtx && (code == GE || code == LT))
+ || (op1 == constm1_rtx && (code == GT || code == LE)))
+ sign_bit_compare_p = true;
+
+ /* Don't attempt mode expansion here -- if we had to expand 5 or 6
+ HImode insns, we'd be swallowed in word prefix ops. */
+
+ if ((mode != HImode || TARGET_FAST_PREFIX)
+ && (mode != (TARGET_64BIT ? TImode : DImode))
+ && CONST_INT_P (operands[2])
+ && CONST_INT_P (operands[3]))
+ {
+ rtx out = operands[0];
+ HOST_WIDE_INT ct = INTVAL (operands[2]);
+ HOST_WIDE_INT cf = INTVAL (operands[3]);
+ HOST_WIDE_INT diff;
+
+ diff = ct - cf;
+ /* Sign bit compares are better done using shifts than by using
+ sbb. */
+ if (sign_bit_compare_p
+ || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
+ {
+ /* Detect overlap between destination and compare sources. */
+ rtx tmp = out;
+
+ if (!sign_bit_compare_p)
+ {
+ rtx flags;
+ bool fpcmp = false;
+
+ compare_code = GET_CODE (compare_op);
+
+ flags = XEXP (compare_op, 0);
+
+ if (GET_MODE (flags) == CCFPmode)
+ {
+ fpcmp = true;
+ compare_code
+ = ix86_fp_compare_code_to_integer (compare_code);
+ }
+
+ /* To simplify rest of code, restrict to the GEU case. */
+ if (compare_code == LTU)
+ {
+ std::swap (ct, cf);
+ compare_code = reverse_condition (compare_code);
+ code = reverse_condition (code);
+ }
+ else
+ {
+ if (fpcmp)
+ PUT_CODE (compare_op,
+ reverse_condition_maybe_unordered
+ (GET_CODE (compare_op)));
+ else
+ PUT_CODE (compare_op,
+ reverse_condition (GET_CODE (compare_op)));
+ }
+ diff = ct - cf;
+
+ if (reg_overlap_mentioned_p (out, op0)
+ || reg_overlap_mentioned_p (out, op1))
+ tmp = gen_reg_rtx (mode);
+
+ if (mode == DImode)
+ emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
+ else
+ emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
+ flags, compare_op));
+ }
+ else
+ {
+ if (code == GT || code == GE)
+ code = reverse_condition (code);
+ else
+ {
+ std::swap (ct, cf);
+ diff = ct - cf;
+ }
+ tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
+ }
+
+ if (diff == 1)
+ {
+ /*
+ * cmpl op0,op1
+ * sbbl dest,dest
+ * [addl dest, ct]
+ *
+ * Size 5 - 8.
+ */
+ if (ct)
+ tmp = expand_simple_binop (mode, PLUS,
+ tmp, GEN_INT (ct),
+ copy_rtx (tmp), 1, OPTAB_DIRECT);
+ }
+ else if (cf == -1)
+ {
+ /*
+ * cmpl op0,op1
+ * sbbl dest,dest
+ * orl $ct, dest
+ *
+ * Size 8.
+ */
+ tmp = expand_simple_binop (mode, IOR,
+ tmp, GEN_INT (ct),
+ copy_rtx (tmp), 1, OPTAB_DIRECT);
+ }
+ else if (diff == -1 && ct)
+ {
+ /*
+ * cmpl op0,op1
+ * sbbl dest,dest
+ * notl dest
+ * [addl dest, cf]
+ *
+ * Size 8 - 11.
+ */
+ tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
+ if (cf)
+ tmp = expand_simple_binop (mode, PLUS,
+ copy_rtx (tmp), GEN_INT (cf),
+ copy_rtx (tmp), 1, OPTAB_DIRECT);
+ }
+ else
+ {
+ /*
+ * cmpl op0,op1
+ * sbbl dest,dest
+ * [notl dest]
+ * andl cf - ct, dest
+ * [addl dest, ct]
+ *
+ * Size 8 - 11.
+ */
+
+ if (cf == 0)
+ {
+ cf = ct;
+ ct = 0;
+ tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
+ }
+
+ tmp = expand_simple_binop (mode, AND,
+ copy_rtx (tmp),
+ gen_int_mode (cf - ct, mode),
+ copy_rtx (tmp), 1, OPTAB_DIRECT);
+ if (ct)
+ tmp = expand_simple_binop (mode, PLUS,
+ copy_rtx (tmp), GEN_INT (ct),
+ copy_rtx (tmp), 1, OPTAB_DIRECT);
+ }
+
+ if (!rtx_equal_p (tmp, out))
+ emit_move_insn (copy_rtx (out), copy_rtx (tmp));
+
+ return true;
+ }
+
+ if (diff < 0)
+ {
+ machine_mode cmp_mode = GET_MODE (op0);
+ enum rtx_code new_code;
+
+ if (SCALAR_FLOAT_MODE_P (cmp_mode))
+ {
+ gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
+
+ /* We may be reversing an unordered compare to a normal compare, which
+ is not valid in general (we may convert a non-trapping condition
+ to a trapping one); however, on i386 we currently emit all
+ comparisons unordered. */
+ new_code = reverse_condition_maybe_unordered (code);
+ }
+ else
+ new_code = ix86_reverse_condition (code, cmp_mode);
+ if (new_code != UNKNOWN)
+ {
+ std::swap (ct, cf);
+ diff = -diff;
+ code = new_code;
+ }
+ }
+
+ compare_code = UNKNOWN;
+ if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
+ && CONST_INT_P (op1))
+ {
+ if (op1 == const0_rtx
+ && (code == LT || code == GE))
+ compare_code = code;
+ else if (op1 == constm1_rtx)
+ {
+ if (code == LE)
+ compare_code = LT;
+ else if (code == GT)
+ compare_code = GE;
+ }
+ }
+
+ /* Optimize dest = (op0 < 0) ? -1 : cf. */
+ if (compare_code != UNKNOWN
+ && GET_MODE (op0) == GET_MODE (out)
+ && (cf == -1 || ct == -1))
+ {
+ /* If lea code below could be used, only optimize
+ if it results in a 2 insn sequence. */
+
+ if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
+ || diff == 3 || diff == 5 || diff == 9)
+ || (compare_code == LT && ct == -1)
+ || (compare_code == GE && cf == -1))
+ {
+ /*
+ * notl op1 (if necessary)
+ * sarl $31, op1
+ * orl cf, op1
+ */
+ if (ct != -1)
+ {
+ cf = ct;
+ ct = -1;
+ code = reverse_condition (code);
+ }
+
+ out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
+
+ out = expand_simple_binop (mode, IOR,
+ out, GEN_INT (cf),
+ out, 1, OPTAB_DIRECT);
+ if (out != operands[0])
+ emit_move_insn (operands[0], out);
+
+ return true;
+ }
+ }
+
+
+ if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
+ || diff == 3 || diff == 5 || diff == 9)
+ && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
+ && (mode != DImode
+ || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
+ {
+ /*
+ * xorl dest,dest
+ * cmpl op1,op2
+ * setcc dest
+ * lea cf(dest*(ct-cf)),dest
+ *
+ * Size 14.
+ *
+ * This also catches the degenerate setcc-only case.
+ */
+
+ rtx tmp;
+ int nops;
+
+ out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
+
+ nops = 0;
+ /* On x86_64 the lea instruction operates on Pmode, so we need
+ to get the arithmetic done in the proper mode to match. */
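+ /* For example (illustrative): with diff == 5 and cf == 7 the RTL built
+ below amounts to dest = dest*4 + dest + 7, which the lea patterns can
+ typically emit as a single "lea 7(%reg,%reg,4), %dst". */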
+ if (diff == 1)
+ tmp = copy_rtx (out);
+ else
+ {
+ rtx out1;
+ out1 = copy_rtx (out);
+ tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
+ nops++;
+ if (diff & 1)
+ {
+ tmp = gen_rtx_PLUS (mode, tmp, out1);
+ nops++;
+ }
+ }
+ if (cf != 0)
+ {
+ tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
+ nops++;
+ }
+ if (!rtx_equal_p (tmp, out))
+ {
+ if (nops == 1)
+ out = force_operand (tmp, copy_rtx (out));
+ else
+ emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
+ }
+ if (!rtx_equal_p (out, operands[0]))
+ emit_move_insn (operands[0], copy_rtx (out));
+
+ return true;
+ }
+
+ /*
+ * General case: Jumpful:
+ * xorl dest,dest cmpl op1, op2
+ * cmpl op1, op2 movl ct, dest
+ * setcc dest jcc 1f
+ * decl dest movl cf, dest
+ * andl (cf-ct),dest 1:
+ * addl ct,dest
+ *
+ * Size 20. Size 14.
+ *
+ * This is reasonably steep, but branch mispredict costs are
+ * high on modern CPUs, so consider failing only if optimizing
+ * for space.
+ */
+
+ if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
+ && BRANCH_COST (optimize_insn_for_speed_p (),
+ false) >= 2)
+ {
+ if (cf == 0)
+ {
+ machine_mode cmp_mode = GET_MODE (op0);
+ enum rtx_code new_code;
+
+ if (SCALAR_FLOAT_MODE_P (cmp_mode))
+ {
+ gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
+
+ /* We may be reversing an unordered compare to a normal compare,
+ which is not valid in general (we may convert a non-trapping
+ condition to a trapping one); however, on i386 we currently
+ emit all comparisons unordered. */
+ new_code = reverse_condition_maybe_unordered (code);
+ }
+ else
+ {
+ new_code = ix86_reverse_condition (code, cmp_mode);
+ if (compare_code != UNKNOWN && new_code != UNKNOWN)
+ compare_code = reverse_condition (compare_code);
+ }
+
+ if (new_code != UNKNOWN)
+ {
+ cf = ct;
+ ct = 0;
+ code = new_code;
+ }
+ }
+
+ if (compare_code != UNKNOWN)
+ {
+ /* notl op1 (if needed)
+ sarl $31, op1
+ andl (cf-ct), op1
+ addl ct, op1
+
+ For x < 0 (resp. x <= -1) there will be no notl,
+ so if possible swap the constants to get rid of the
+ complement.
+ True/false will be -1/0 while code below (store flag
+ followed by decrement) is 0/-1, so the constants need
+ to be exchanged once more. */
+
+ if (compare_code == GE || !cf)
+ {
+ code = reverse_condition (code);
+ compare_code = LT;
+ }
+ else
+ std::swap (ct, cf);
+
+ out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
+ }
+ else
+ {
+ out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
+
+ out = expand_simple_binop (mode, PLUS, copy_rtx (out),
+ constm1_rtx,
+ copy_rtx (out), 1, OPTAB_DIRECT);
+ }
+
+ out = expand_simple_binop (mode, AND, copy_rtx (out),
+ gen_int_mode (cf - ct, mode),
+ copy_rtx (out), 1, OPTAB_DIRECT);
+ if (ct)
+ out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
+ copy_rtx (out), 1, OPTAB_DIRECT);
+ if (!rtx_equal_p (out, operands[0]))
+ emit_move_insn (operands[0], copy_rtx (out));
+
+ return true;
+ }
+ }
+
+ if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
+ {
+ /* Try a few things more with specific constants and a variable. */
+
+ optab op;
+ rtx var, orig_out, out, tmp;
+
+ if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
+ return false;
+
+ /* If one of the two operands is an interesting constant, load a
+ constant with the above and mask it in with a logical operation. */
+
+ if (CONST_INT_P (operands[2]))
+ {
+ var = operands[3];
+ if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
+ operands[3] = constm1_rtx, op = and_optab;
+ else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
+ operands[3] = const0_rtx, op = ior_optab;
+ else
+ return false;
+ }
+ else if (CONST_INT_P (operands[3]))
+ {
+ var = operands[2];
+ if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
+ operands[2] = constm1_rtx, op = and_optab;
+ else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
+ operands[2] = const0_rtx, op = ior_optab;
+ else
+ return false;
+ }
+ else
+ return false;
+
+ orig_out = operands[0];
+ tmp = gen_reg_rtx (mode);
+ operands[0] = tmp;
+
+ /* Recurse to get the constant loaded. */
+ if (!ix86_expand_int_movcc (operands))
+ return false;
+
+ /* Mask in the interesting variable. */
+ out = expand_binop (mode, op, var, tmp, orig_out, 0,
+ OPTAB_WIDEN);
+ if (!rtx_equal_p (out, orig_out))
+ emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
+
+ return true;
+ }
+
+ /*
+ * For comparison with above,
+ *
+ * movl cf,dest
+ * movl ct,tmp
+ * cmpl op1,op2
+ * cmovcc tmp,dest
+ *
+ * Size 15.
+ */
+
+ if (! nonimmediate_operand (operands[2], mode))
+ operands[2] = force_reg (mode, operands[2]);
+ if (! nonimmediate_operand (operands[3], mode))
+ operands[3] = force_reg (mode, operands[3]);
+
+ if (! register_operand (operands[2], VOIDmode)
+ && (mode == QImode
+ || ! register_operand (operands[3], VOIDmode)))
+ operands[2] = force_reg (mode, operands[2]);
+
+ if (mode == QImode
+ && ! register_operand (operands[3], VOIDmode))
+ operands[3] = force_reg (mode, operands[3]);
+
+ emit_insn (compare_seq);
+ emit_insn (gen_rtx_SET (operands[0],
+ gen_rtx_IF_THEN_ELSE (mode,
+ compare_op, operands[2],
+ operands[3])));
+ return true;
+}
+
+/* Detect conditional moves that exactly match min/max operational
+ semantics. Note that this is IEEE safe, as long as we don't
+ interchange the operands.
+
+ Returns FALSE if this conditional move doesn't match a MIN/MAX,
+ and TRUE if the operation is successful and instructions are emitted. */
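+ /* Background note (an ISA-level observation, not something enforced
+ here): the SSE min/max instructions are not commutative when NaNs or
+ zeros of differing sign are involved (roughly, the second source
+ operand is returned in the unordered and both-zero cases), which is
+ why the operand order established below must not be interchanged. */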
+
+static bool
+ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
+ rtx cmp_op1, rtx if_true, rtx if_false)
+{
+ machine_mode mode;
+ bool is_min;
+ rtx tmp;
+
+ if (code == LT)
+ ;
+ else if (code == UNGE)
+ std::swap (if_true, if_false);
+ else
+ return false;
+
+ if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
+ is_min = true;
+ else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
+ is_min = false;
+ else
+ return false;
+
+ mode = GET_MODE (dest);
+
+ /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
+ but MODE may be a vector mode and thus not appropriate. */
+ if (!flag_finite_math_only || flag_signed_zeros)
+ {
+ int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
+ rtvec v;
+
+ if_true = force_reg (mode, if_true);
+ v = gen_rtvec (2, if_true, if_false);
+ tmp = gen_rtx_UNSPEC (mode, v, u);
+ }
+ else
+ {
+ code = is_min ? SMIN : SMAX;
+ if (MEM_P (if_true) && MEM_P (if_false))
+ if_true = force_reg (mode, if_true);
+ tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
+ }
+
+ emit_insn (gen_rtx_SET (dest, tmp));
+ return true;
+}
+
+/* Expand an SSE comparison. Return the register with the result. */
+
+static rtx
+ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
+ rtx op_true, rtx op_false)
+{
+ machine_mode mode = GET_MODE (dest);
+ machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
+
+ /* In the general case the result of a comparison can differ from the operands' type. */
+ machine_mode cmp_mode;
+
+ /* In AVX512F the result of comparison is an integer mask. */
+ bool maskcmp = false;
+ rtx x;
+
+ if (GET_MODE_SIZE (cmp_ops_mode) == 64)
+ {
+ unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
+ cmp_mode = int_mode_for_size (nbits, 0).require ();
+ maskcmp = true;
+ }
+ else
+ cmp_mode = cmp_ops_mode;
+
+ cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
+
+ int (*op1_predicate)(rtx, machine_mode)
+ = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
+
+ if (!op1_predicate (cmp_op1, cmp_ops_mode))
+ cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
+
+ if (optimize
+ || (maskcmp && cmp_mode != mode)
+ || (op_true && reg_overlap_mentioned_p (dest, op_true))
+ || (op_false && reg_overlap_mentioned_p (dest, op_false)))
+ dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
+
+ /* Compare patterns for int modes are unspec in AVX512F only. */
+ if (maskcmp && (code == GT || code == EQ))
+ {
+ rtx (*gen)(rtx, rtx, rtx);
+
+ switch (cmp_ops_mode)
+ {
+ case E_V64QImode:
+ gcc_assert (TARGET_AVX512BW);
+ gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
+ break;
+ case E_V32HImode:
+ gcc_assert (TARGET_AVX512BW);
+ gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
+ break;
+ case E_V16SImode:
+ gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
+ break;
+ case E_V8DImode:
+ gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
+ break;
+ default:
+ gen = NULL;
+ }
+
+ if (gen)
+ {
+ emit_insn (gen (dest, cmp_op0, cmp_op1));
+ return dest;
+ }
+ }
+ x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
+
+ if (cmp_mode != mode && !maskcmp)
+ {
+ x = force_reg (cmp_ops_mode, x);
+ convert_move (dest, x, false);
+ }
+ else
+ emit_insn (gen_rtx_SET (dest, x));
+
+ return dest;
+}
+
+/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
+ operations. This is used for both scalar and vector conditional moves. */
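+ /* When no blend-style instruction applies, the fallback at the end of
+ this function emits the classic select identity:
+ dest = (cmp & op_true) | (~cmp & op_false). */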
+
+void
+ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
+{
+ machine_mode mode = GET_MODE (dest);
+ machine_mode cmpmode = GET_MODE (cmp);
+
+ /* In AVX512F the result of comparison is an integer mask. */
+ bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
+
+ rtx t2, t3, x;
+
+ /* If we have an integer mask and FP value then we need
+ to cast mask to FP mode. */
+ if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
+ {
+ cmp = force_reg (cmpmode, cmp);
+ cmp = gen_rtx_SUBREG (mode, cmp, 0);
+ }
+
+ if (maskcmp)
+ {
+ rtx (*gen) (rtx, rtx) = NULL;
+ if ((op_true == CONST0_RTX (mode)
+ && vector_all_ones_operand (op_false, mode))
+ || (op_false == CONST0_RTX (mode)
+ && vector_all_ones_operand (op_true, mode)))
+ switch (mode)
+ {
+ case E_V64QImode:
+ if (TARGET_AVX512BW)
+ gen = gen_avx512bw_cvtmask2bv64qi;
+ break;
+ case E_V32QImode:
+ if (TARGET_AVX512VL && TARGET_AVX512BW)
+ gen = gen_avx512vl_cvtmask2bv32qi;
+ break;
+ case E_V16QImode:
+ if (TARGET_AVX512VL && TARGET_AVX512BW)
+ gen = gen_avx512vl_cvtmask2bv16qi;
+ break;
+ case E_V32HImode:
+ if (TARGET_AVX512BW)
+ gen = gen_avx512bw_cvtmask2wv32hi;
+ break;
+ case E_V16HImode:
+ if (TARGET_AVX512VL && TARGET_AVX512BW)
+ gen = gen_avx512vl_cvtmask2wv16hi;
+ break;
+ case E_V8HImode:
+ if (TARGET_AVX512VL && TARGET_AVX512BW)
+ gen = gen_avx512vl_cvtmask2wv8hi;
+ break;
+ case E_V16SImode:
+ if (TARGET_AVX512DQ)
+ gen = gen_avx512f_cvtmask2dv16si;
+ break;
+ case E_V8SImode:
+ if (TARGET_AVX512VL && TARGET_AVX512DQ)
+ gen = gen_avx512vl_cvtmask2dv8si;
+ break;
+ case E_V4SImode:
+ if (TARGET_AVX512VL && TARGET_AVX512DQ)
+ gen = gen_avx512vl_cvtmask2dv4si;
+ break;
+ case E_V8DImode:
+ if (TARGET_AVX512DQ)
+ gen = gen_avx512f_cvtmask2qv8di;
+ break;
+ case E_V4DImode:
+ if (TARGET_AVX512VL && TARGET_AVX512DQ)
+ gen = gen_avx512vl_cvtmask2qv4di;
+ break;
+ case E_V2DImode:
+ if (TARGET_AVX512VL && TARGET_AVX512DQ)
+ gen = gen_avx512vl_cvtmask2qv2di;
+ break;
+ default:
+ break;
+ }
+ if (gen && SCALAR_INT_MODE_P (cmpmode))
+ {
+ cmp = force_reg (cmpmode, cmp);
+ if (op_true == CONST0_RTX (mode))
+ {
+ rtx (*gen_not) (rtx, rtx);
+ switch (cmpmode)
+ {
+ case E_QImode: gen_not = gen_knotqi; break;
+ case E_HImode: gen_not = gen_knothi; break;
+ case E_SImode: gen_not = gen_knotsi; break;
+ case E_DImode: gen_not = gen_knotdi; break;
+ default: gcc_unreachable ();
+ }
+ rtx n = gen_reg_rtx (cmpmode);
+ emit_insn (gen_not (n, cmp));
+ cmp = n;
+ }
+ emit_insn (gen (dest, cmp));
+ return;
+ }
+ }
+ else if (vector_all_ones_operand (op_true, mode)
+ && op_false == CONST0_RTX (mode))
+ {
+ emit_insn (gen_rtx_SET (dest, cmp));
+ return;
+ }
+ else if (op_false == CONST0_RTX (mode))
+ {
+ op_true = force_reg (mode, op_true);
+ x = gen_rtx_AND (mode, cmp, op_true);
+ emit_insn (gen_rtx_SET (dest, x));
+ return;
+ }
+ else if (op_true == CONST0_RTX (mode))
+ {
+ op_false = force_reg (mode, op_false);
+ x = gen_rtx_NOT (mode, cmp);
+ x = gen_rtx_AND (mode, x, op_false);
+ emit_insn (gen_rtx_SET (dest, x));
+ return;
+ }
+ else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
+ {
+ op_false = force_reg (mode, op_false);
+ x = gen_rtx_IOR (mode, cmp, op_false);
+ emit_insn (gen_rtx_SET (dest, x));
+ return;
+ }
+ else if (TARGET_XOP)
+ {
+ op_true = force_reg (mode, op_true);
+
+ if (!nonimmediate_operand (op_false, mode))
+ op_false = force_reg (mode, op_false);
+
+ emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
+ op_true,
+ op_false)));
+ return;
+ }
+
+ rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
+ rtx d = dest;
+
+ if (!vector_operand (op_true, mode))
+ op_true = force_reg (mode, op_true);
+
+ op_false = force_reg (mode, op_false);
+
+ switch (mode)
+ {
+ case E_V4SFmode:
+ if (TARGET_SSE4_1)
+ gen = gen_sse4_1_blendvps;
+ break;
+ case E_V2DFmode:
+ if (TARGET_SSE4_1)
+ gen = gen_sse4_1_blendvpd;
+ break;
+ case E_SFmode:
+ if (TARGET_SSE4_1)
+ {
+ gen = gen_sse4_1_blendvss;
+ op_true = force_reg (mode, op_true);
+ }
+ break;
+ case E_DFmode:
+ if (TARGET_SSE4_1)
+ {
+ gen = gen_sse4_1_blendvsd;
+ op_true = force_reg (mode, op_true);
+ }
+ break;
+ case E_V16QImode:
+ case E_V8HImode:
+ case E_V4SImode:
+ case E_V2DImode:
+ if (TARGET_SSE4_1)
+ {
+ gen = gen_sse4_1_pblendvb;
+ if (mode != V16QImode)
+ d = gen_reg_rtx (V16QImode);
+ op_false = gen_lowpart (V16QImode, op_false);
+ op_true = gen_lowpart (V16QImode, op_true);
+ cmp = gen_lowpart (V16QImode, cmp);
+ }
+ break;
+ case E_V8SFmode:
+ if (TARGET_AVX)
+ gen = gen_avx_blendvps256;
+ break;
+ case E_V4DFmode:
+ if (TARGET_AVX)
+ gen = gen_avx_blendvpd256;
+ break;
+ case E_V32QImode:
+ case E_V16HImode:
+ case E_V8SImode:
+ case E_V4DImode:
+ if (TARGET_AVX2)
+ {
+ gen = gen_avx2_pblendvb;
+ if (mode != V32QImode)
+ d = gen_reg_rtx (V32QImode);
+ op_false = gen_lowpart (V32QImode, op_false);
+ op_true = gen_lowpart (V32QImode, op_true);
+ cmp = gen_lowpart (V32QImode, cmp);
+ }
+ break;
+
+ case E_V64QImode:
+ gen = gen_avx512bw_blendmv64qi;
+ break;
+ case E_V32HImode:
+ gen = gen_avx512bw_blendmv32hi;
+ break;
+ case E_V16SImode:
+ gen = gen_avx512f_blendmv16si;
+ break;
+ case E_V8DImode:
+ gen = gen_avx512f_blendmv8di;
+ break;
+ case E_V8DFmode:
+ gen = gen_avx512f_blendmv8df;
+ break;
+ case E_V16SFmode:
+ gen = gen_avx512f_blendmv16sf;
+ break;
+
+ default:
+ break;
+ }
+
+ if (gen != NULL)
+ {
+ emit_insn (gen (d, op_false, op_true, cmp));
+ if (d != dest)
+ emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
+ }
+ else
+ {
+ op_true = force_reg (mode, op_true);
+
+ t2 = gen_reg_rtx (mode);
+ if (optimize)
+ t3 = gen_reg_rtx (mode);
+ else
+ t3 = dest;
+
+ x = gen_rtx_AND (mode, op_true, cmp);
+ emit_insn (gen_rtx_SET (t2, x));
+
+ x = gen_rtx_NOT (mode, cmp);
+ x = gen_rtx_AND (mode, x, op_false);
+ emit_insn (gen_rtx_SET (t3, x));
+
+ x = gen_rtx_IOR (mode, t3, t2);
+ emit_insn (gen_rtx_SET (dest, x));
+ }
+}
+
+/* Swap, force into registers, or otherwise massage the two operands
+ to an sse comparison with a mask result. Thus we differ a bit from
+ ix86_prepare_fp_compare_args which expects to produce a flags result.
+
+ The DEST operand exists to help determine whether to commute commutative
+ operators. The POP0/POP1 operands are updated in place. The new
+ comparison code is returned, or UNKNOWN if not implementable. */
+
+static enum rtx_code
+ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
+ rtx *pop0, rtx *pop1)
+{
+ switch (code)
+ {
+ case LTGT:
+ case UNEQ:
+ /* AVX supports all the needed comparisons. */
+ if (TARGET_AVX)
+ break;
+ /* We have no LTGT as an operator. We could implement it with
+ NE & ORDERED, but this requires an extra temporary. It's
+ not clear that it's worth it. */
+ return UNKNOWN;
+
+ case LT:
+ case LE:
+ case UNGT:
+ case UNGE:
+ /* These are supported directly. */
+ break;
+
+ case EQ:
+ case NE:
+ case UNORDERED:
+ case ORDERED:
+ /* AVX has 3 operand comparisons, no need to swap anything. */
+ if (TARGET_AVX)
+ break;
+ /* For commutative operators, try to canonicalize the destination
+ operand to be first in the comparison - this helps reload to
+ avoid extra moves. */
+ if (!dest || !rtx_equal_p (dest, *pop1))
+ break;
+ /* FALLTHRU */
+
+ case GE:
+ case GT:
+ case UNLE:
+ case UNLT:
+ /* These are not supported directly before AVX, and furthermore
+ ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
+ comparison operands to transform into something that is
+ supported. */
+ std::swap (*pop0, *pop1);
+ code = swap_condition (code);
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ return code;
+}
+
+/* Expand a floating-point conditional move. Return true if successful. */
+
+bool
+ix86_expand_fp_movcc (rtx operands[])
+{
+ machine_mode mode = GET_MODE (operands[0]);
+ enum rtx_code code = GET_CODE (operands[1]);
+ rtx tmp, compare_op;
+ rtx op0 = XEXP (operands[1], 0);
+ rtx op1 = XEXP (operands[1], 1);
+
+ if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
+ {
+ machine_mode cmode;
+
+ /* Since we've no cmove for sse registers, don't force bad register
+ allocation just to gain access to it. Deny movcc when the
+ comparison mode doesn't match the move mode. */
+ cmode = GET_MODE (op0);
+ if (cmode == VOIDmode)
+ cmode = GET_MODE (op1);
+ if (cmode != mode)
+ return false;
+
+ code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
+ if (code == UNKNOWN)
+ return false;
+
+ if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
+ operands[2], operands[3]))
+ return true;
+
+ tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
+ operands[2], operands[3]);
+ ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
+ return true;
+ }
+
+ if (GET_MODE (op0) == TImode
+ || (GET_MODE (op0) == DImode
+ && !TARGET_64BIT))
+ return false;
+
+ /* The floating point conditional move instructions don't directly
+ support conditions resulting from a signed integer comparison. */
+
+ compare_op = ix86_expand_compare (code, op0, op1);
+ if (!fcmov_comparison_operator (compare_op, VOIDmode))
+ {
+ tmp = gen_reg_rtx (QImode);
+ ix86_expand_setcc (tmp, code, op0, op1);
+
+ compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
+ }
+
+ emit_insn (gen_rtx_SET (operands[0],
+ gen_rtx_IF_THEN_ELSE (mode, compare_op,
+ operands[2], operands[3])));
+
+ return true;
+}
+
+/* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
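+ /* These values are presumably the integer compare predicate encodings
+ used by the EVEX vpcmp instructions (0 = EQ, 1 = LT, 2 = LE, 4 = NE,
+ 5 = NLT, 6 = NLE); the ISA reference is authoritative for the exact
+ mapping. */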
+
+static int
+ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
+{
+ switch (code)
+ {
+ case EQ:
+ return 0;
+ case LT:
+ case LTU:
+ return 1;
+ case LE:
+ case LEU:
+ return 2;
+ case NE:
+ return 4;
+ case GE:
+ case GEU:
+ return 5;
+ case GT:
+ case GTU:
+ return 6;
+ default:
+ gcc_unreachable ();
+ }
+}
+
+/* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
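+ /* These values appear to match the VCMP predicate immediate encodings
+ (e.g. 0x00 is EQ_OQ, 0x01 is LT_OS, 0x03 is UNORD_Q and 0x0e is
+ GT_OS); again, the ISA reference is authoritative for the exact
+ mapping. */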
+
+static int
+ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
+{
+ switch (code)
+ {
+ case EQ:
+ return 0x00;
+ case NE:
+ return 0x04;
+ case GT:
+ return 0x0e;
+ case LE:
+ return 0x02;
+ case GE:
+ return 0x0d;
+ case LT:
+ return 0x01;
+ case UNLE:
+ return 0x0a;
+ case UNLT:
+ return 0x09;
+ case UNGE:
+ return 0x05;
+ case UNGT:
+ return 0x06;
+ case UNEQ:
+ return 0x18;
+ case LTGT:
+ return 0x0c;
+ case ORDERED:
+ return 0x07;
+ case UNORDERED:
+ return 0x03;
+ default:
+ gcc_unreachable ();
+ }
+}
+
+/* Return immediate value to be used in UNSPEC_PCMP
+ for comparison CODE in MODE. */
+
+static int
+ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
+{
+ if (FLOAT_MODE_P (mode))
+ return ix86_fp_cmp_code_to_pcmp_immediate (code);
+ return ix86_int_cmp_code_to_pcmp_immediate (code);
+}
+
+/* Expand AVX-512 vector comparison. */
+
+bool
+ix86_expand_mask_vec_cmp (rtx operands[])
+{
+ machine_mode mask_mode = GET_MODE (operands[0]);
+ machine_mode cmp_mode = GET_MODE (operands[2]);
+ enum rtx_code code = GET_CODE (operands[1]);
+ rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
+ int unspec_code;
+ rtx unspec;
+
+ switch (code)
+ {
+ case LEU:
+ case GTU:
+ case GEU:
+ case LTU:
+ unspec_code = UNSPEC_UNSIGNED_PCMP;
+ break;
+
+ default:
+ unspec_code = UNSPEC_PCMP;
+ }
+
+ unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
+ operands[3], imm),
+ unspec_code);
+ emit_insn (gen_rtx_SET (operands[0], unspec));
+
+ return true;
+}
+
+/* Expand fp vector comparison. */
+
+bool
+ix86_expand_fp_vec_cmp (rtx operands[])
+{
+ enum rtx_code code = GET_CODE (operands[1]);
+ rtx cmp;
+
+ code = ix86_prepare_sse_fp_compare_args (operands[0], code,
+ &operands[2], &operands[3]);
+ if (code == UNKNOWN)
+ {
+ rtx temp;
+ switch (GET_CODE (operands[1]))
+ {
+ case LTGT:
+ temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
+ operands[3], NULL, NULL);
+ cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
+ operands[3], NULL, NULL);
+ code = AND;
+ break;
+ case UNEQ:
+ temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
+ operands[3], NULL, NULL);
+ cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
+ operands[3], NULL, NULL);
+ code = IOR;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
+ OPTAB_DIRECT);
+ }
+ else
+ cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
+ operands[1], operands[2]);
+
+ if (operands[0] != cmp)
+ emit_move_insn (operands[0], cmp);
+
+ return true;
+}
+
+static rtx
+ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
+ rtx op_true, rtx op_false, bool *negate)
+{
+ machine_mode data_mode = GET_MODE (dest);
+ machine_mode mode = GET_MODE (cop0);
+ rtx x;
+
+ *negate = false;
+
+ /* XOP supports all of the comparisons on all 128-bit vector int types. */
+ if (TARGET_XOP
+ && (mode == V16QImode || mode == V8HImode
+ || mode == V4SImode || mode == V2DImode))
+ ;
+ else
+ {
+ /* Canonicalize the comparison to EQ, GT, GTU. */
+ switch (code)
+ {
+ case EQ:
+ case GT:
+ case GTU:
+ break;
+
+ case NE:
+ case LE:
+ case LEU:
+ code = reverse_condition (code);
+ *negate = true;
+ break;
+
+ case GE:
+ case GEU:
+ code = reverse_condition (code);
+ *negate = true;
+ /* FALLTHRU */
+
+ case LT:
+ case LTU:
+ std::swap (cop0, cop1);
+ code = swap_condition (code);
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ /* Only SSE4.1/SSE4.2 supports V2DImode. */
+ if (mode == V2DImode)
+ {
+ switch (code)
+ {
+ case EQ:
+ /* SSE4.1 supports EQ. */
+ if (!TARGET_SSE4_1)
+ return NULL;
+ break;
+
+ case GT:
+ case GTU:
+ /* SSE4.2 supports GT/GTU. */
+ if (!TARGET_SSE4_2)
+ return NULL;
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+ }
+
+ rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
+ rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
+ if (*negate)
+ std::swap (optrue, opfalse);
+
+ /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0, or simply x <= y) when
+ not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
+ min (x, y) == x). While we add one instruction (the minimum),
+ we remove the need for two instructions in the negation, as the
+ result already has the desired form.
+ When using masks, do it for SI/DImode element types, as it is shorter
+ than the two subtractions. */
+ if ((code != EQ
+ && GET_MODE_SIZE (mode) != 64
+ && vector_all_ones_operand (opfalse, data_mode)
+ && optrue == CONST0_RTX (data_mode))
+ || (code == GTU
+ && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
+ /* Don't do it, though, if we are not using integer masks and
+ would end up with the right values in the registers anyway. */
+ && (GET_MODE_SIZE (mode) == 64
+ || !vector_all_ones_operand (optrue, data_mode)
+ || opfalse != CONST0_RTX (data_mode))))
+ {
+ rtx (*gen) (rtx, rtx, rtx) = NULL;
+
+ switch (mode)
+ {
+ case E_V16SImode:
+ gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
+ break;
+ case E_V8DImode:
+ gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
+ cop0 = force_reg (mode, cop0);
+ cop1 = force_reg (mode, cop1);
+ break;
+ case E_V32QImode:
+ if (TARGET_AVX2)
+ gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
+ break;
+ case E_V16HImode:
+ if (TARGET_AVX2)
+ gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
+ break;
+ case E_V8SImode:
+ if (TARGET_AVX2)
+ gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
+ break;
+ case E_V4DImode:
+ if (TARGET_AVX512VL)
+ {
+ gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
+ cop0 = force_reg (mode, cop0);
+ cop1 = force_reg (mode, cop1);
+ }
+ break;
+ case E_V16QImode:
+ if (code == GTU && TARGET_SSE2)
+ gen = gen_uminv16qi3;
+ else if (code == GT && TARGET_SSE4_1)
+ gen = gen_sminv16qi3;
+ break;
+ case E_V8HImode:
+ if (code == GTU && TARGET_SSE4_1)
+ gen = gen_uminv8hi3;
+ else if (code == GT && TARGET_SSE2)
+ gen = gen_sminv8hi3;
+ break;
+ case E_V4SImode:
+ if (TARGET_SSE4_1)
+ gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
+ break;
+ case E_V2DImode:
+ if (TARGET_AVX512VL)
+ {
+ gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
+ cop0 = force_reg (mode, cop0);
+ cop1 = force_reg (mode, cop1);
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (gen)
+ {
+ rtx tem = gen_reg_rtx (mode);
+ if (!vector_operand (cop0, mode))
+ cop0 = force_reg (mode, cop0);
+ if (!vector_operand (cop1, mode))
+ cop1 = force_reg (mode, cop1);
+ *negate = !*negate;
+ emit_insn (gen (tem, cop0, cop1));
+ cop1 = tem;
+ code = EQ;
+ }
+ }
+
+ /* Unsigned parallel compare is not supported by the hardware.
+ Play some tricks to turn this into a signed comparison
+ against 0. */
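+ /* For illustration: with 32-bit elements, "a >u b" is equivalent to
+ "(a - 0x80000000) >s (b - 0x80000000)"; biasing both operands by the
+ sign-bit mask, as done below for the SI/DI element cases, flips the
+ sign bit and turns the unsigned comparison into a signed one. */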
+ if (code == GTU)
+ {
+ cop0 = force_reg (mode, cop0);
+
+ switch (mode)
+ {
+ case E_V16SImode:
+ case E_V8DImode:
+ case E_V8SImode:
+ case E_V4DImode:
+ case E_V4SImode:
+ case E_V2DImode:
+ {
+ rtx t1, t2, mask;
+ rtx (*gen_sub3) (rtx, rtx, rtx);
+
+ switch (mode)
+ {
+ case E_V16SImode: gen_sub3 = gen_subv16si3; break;
+ case E_V8DImode: gen_sub3 = gen_subv8di3; break;
+ case E_V8SImode: gen_sub3 = gen_subv8si3; break;
+ case E_V4DImode: gen_sub3 = gen_subv4di3; break;
+ case E_V4SImode: gen_sub3 = gen_subv4si3; break;
+ case E_V2DImode: gen_sub3 = gen_subv2di3; break;
+ default:
+ gcc_unreachable ();
+ }
+ /* Subtract (-(INT MAX) - 1) from both operands to make
+ them signed. */
+ mask = ix86_build_signbit_mask (mode, true, false);
+ t1 = gen_reg_rtx (mode);
+ emit_insn (gen_sub3 (t1, cop0, mask));
+
+ t2 = gen_reg_rtx (mode);
+ emit_insn (gen_sub3 (t2, cop1, mask));
+
+ cop0 = t1;
+ cop1 = t2;
+ code = GT;
+ }
+ break;
+
+ case E_V64QImode:
+ case E_V32HImode:
+ case E_V32QImode:
+ case E_V16HImode:
+ case E_V16QImode:
+ case E_V8HImode:
+ /* Perform a parallel unsigned saturating subtraction. */
+ x = gen_reg_rtx (mode);
+ emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
+ cop1)));
+
+ cop0 = x;
+ cop1 = CONST0_RTX (mode);
+ code = EQ;
+ *negate = !*negate;
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+ }
+ }
+
+ if (*negate)
+ std::swap (op_true, op_false);
+
+ /* Allow the comparison to be done in one mode, but the movcc to
+ happen in another mode. */
+ if (data_mode == mode)
+ {
+ x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
+ op_true, op_false);
+ }
+ else
+ {
+ gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
+ x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
+ op_true, op_false);
+ if (GET_MODE (x) == mode)
+ x = gen_lowpart (data_mode, x);
+ }
+
+ return x;
+}
+
+/* Expand integer vector comparison. */
+
+bool
+ix86_expand_int_vec_cmp (rtx operands[])
+{
+ rtx_code code = GET_CODE (operands[1]);
+ bool negate = false;
+ rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
+ operands[3], NULL, NULL, &negate);
+
+ if (!cmp)
+ return false;
+
+ if (negate)
+ cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
+ CONST0_RTX (GET_MODE (cmp)),
+ NULL, NULL, &negate);
+
+ gcc_assert (!negate);
+
+ if (operands[0] != cmp)
+ emit_move_insn (operands[0], cmp);
+
+ return true;
+}
+
+/* Expand a floating-point vector conditional move; a vcond operation
+ rather than a movcc operation. */
+
+bool
+ix86_expand_fp_vcond (rtx operands[])
+{
+ enum rtx_code code = GET_CODE (operands[3]);
+ rtx cmp;
+
+ code = ix86_prepare_sse_fp_compare_args (operands[0], code,
+ &operands[4], &operands[5]);
+ if (code == UNKNOWN)
+ {
+ rtx temp;
+ switch (GET_CODE (operands[3]))
+ {
+ case LTGT:
+ temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
+ operands[5], operands[0], operands[0]);
+ cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
+ operands[5], operands[1], operands[2]);
+ code = AND;
+ break;
+ case UNEQ:
+ temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
+ operands[5], operands[0], operands[0]);
+ cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
+ operands[5], operands[1], operands[2]);
+ code = IOR;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
+ OPTAB_DIRECT);
+ ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
+ return true;
+ }
+
+ if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
+ operands[5], operands[1], operands[2]))
+ return true;
+
+ cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
+ operands[1], operands[2]);
+ ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
+ return true;
+}
+
+/* Expand a signed/unsigned integral vector conditional move. */
+
+bool
+ix86_expand_int_vcond (rtx operands[])
+{
+ machine_mode data_mode = GET_MODE (operands[0]);
+ machine_mode mode = GET_MODE (operands[4]);
+ enum rtx_code code = GET_CODE (operands[3]);
+ bool negate = false;
+ rtx x, cop0, cop1;
+
+ cop0 = operands[4];
+ cop1 = operands[5];
+
+ /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
+ and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
+ if ((code == LT || code == GE)
+ && data_mode == mode
+ && cop1 == CONST0_RTX (mode)
+ && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
+ && GET_MODE_UNIT_SIZE (data_mode) > 1
+ && GET_MODE_UNIT_SIZE (data_mode) <= 8
+ && (GET_MODE_SIZE (data_mode) == 16
+ || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
+ {
+ rtx negop = operands[2 - (code == LT)];
+ int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
+ if (negop == CONST1_RTX (data_mode))
+ {
+ rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
+ operands[0], 1, OPTAB_DIRECT);
+ if (res != operands[0])
+ emit_move_insn (operands[0], res);
+ return true;
+ }
+ else if (GET_MODE_INNER (data_mode) != DImode
+ && vector_all_ones_operand (negop, data_mode))
+ {
+ rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
+ operands[0], 0, OPTAB_DIRECT);
+ if (res != operands[0])
+ emit_move_insn (operands[0], res);
+ return true;
+ }
+ }
+
+ if (!nonimmediate_operand (cop1, mode))
+ cop1 = force_reg (mode, cop1);
+ if (!general_operand (operands[1], data_mode))
+ operands[1] = force_reg (data_mode, operands[1]);
+ if (!general_operand (operands[2], data_mode))
+ operands[2] = force_reg (data_mode, operands[2]);
+
+ x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
+ operands[1], operands[2], &negate);
+
+ if (!x)
+ return false;
+
+ ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
+ operands[2-negate]);
+ return true;
+}
+
+static bool
+ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
+ struct expand_vec_perm_d *d)
+{
+ /* ix86_expand_vec_perm_vpermt2 is called from both the const and non-const
+ expanders, so the arguments are either in d, or in op0, op1 etc. */
+ machine_mode mode = GET_MODE (d ? d->op0 : op0);
+ machine_mode maskmode = mode;
+ rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
+
+ switch (mode)
+ {
+ case E_V8HImode:
+ if (TARGET_AVX512VL && TARGET_AVX512BW)
+ gen = gen_avx512vl_vpermt2varv8hi3;
+ break;
+ case E_V16HImode:
+ if (TARGET_AVX512VL && TARGET_AVX512BW)
+ gen = gen_avx512vl_vpermt2varv16hi3;
+ break;
+ case E_V64QImode:
+ if (TARGET_AVX512VBMI)
+ gen = gen_avx512bw_vpermt2varv64qi3;
+ break;
+ case E_V32HImode:
+ if (TARGET_AVX512BW)
+ gen = gen_avx512bw_vpermt2varv32hi3;
+ break;
+ case E_V4SImode:
+ if (TARGET_AVX512VL)
+ gen = gen_avx512vl_vpermt2varv4si3;
+ break;
+ case E_V8SImode:
+ if (TARGET_AVX512VL)
+ gen = gen_avx512vl_vpermt2varv8si3;
+ break;
+ case E_V16SImode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vpermt2varv16si3;
+ break;
+ case E_V4SFmode:
+ if (TARGET_AVX512VL)
+ {
+ gen = gen_avx512vl_vpermt2varv4sf3;
+ maskmode = V4SImode;
+ }
+ break;
+ case E_V8SFmode:
+ if (TARGET_AVX512VL)
+ {
+ gen = gen_avx512vl_vpermt2varv8sf3;
+ maskmode = V8SImode;
+ }
+ break;
+ case E_V16SFmode:
+ if (TARGET_AVX512F)
+ {
+ gen = gen_avx512f_vpermt2varv16sf3;
+ maskmode = V16SImode;
+ }
+ break;
+ case E_V2DImode:
+ if (TARGET_AVX512VL)
+ gen = gen_avx512vl_vpermt2varv2di3;
+ break;
+ case E_V4DImode:
+ if (TARGET_AVX512VL)
+ gen = gen_avx512vl_vpermt2varv4di3;
+ break;
+ case E_V8DImode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vpermt2varv8di3;
+ break;
+ case E_V2DFmode:
+ if (TARGET_AVX512VL)
+ {
+ gen = gen_avx512vl_vpermt2varv2df3;
+ maskmode = V2DImode;
+ }
+ break;
+ case E_V4DFmode:
+ if (TARGET_AVX512VL)
+ {
+ gen = gen_avx512vl_vpermt2varv4df3;
+ maskmode = V4DImode;
+ }
+ break;
+ case E_V8DFmode:
+ if (TARGET_AVX512F)
+ {
+ gen = gen_avx512f_vpermt2varv8df3;
+ maskmode = V8DImode;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (gen == NULL)
+ return false;
+
+ /* ix86_expand_vec_perm_vpermt2 is called from both the const and non-const
+ expanders, so the arguments are either in d, or in op0, op1 etc. */
+ if (d)
+ {
+ rtx vec[64];
+ target = d->target;
+ op0 = d->op0;
+ op1 = d->op1;
+ for (int i = 0; i < d->nelt; ++i)
+ vec[i] = GEN_INT (d->perm[i]);
+ mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
+ }
+
+ emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
+ return true;
+}
+
+/* Expand a variable vector permutation. */
+
+void
+ix86_expand_vec_perm (rtx operands[])
+{
+ rtx target = operands[0];
+ rtx op0 = operands[1];
+ rtx op1 = operands[2];
+ rtx mask = operands[3];
+ rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
+ machine_mode mode = GET_MODE (op0);
+ machine_mode maskmode = GET_MODE (mask);
+ int w, e, i;
+ bool one_operand_shuffle = rtx_equal_p (op0, op1);
+
+ /* Number of elements in the vector. */
+ w = GET_MODE_NUNITS (mode);
+ e = GET_MODE_UNIT_SIZE (mode);
+ gcc_assert (w <= 64);
+
+ if (TARGET_AVX512F && one_operand_shuffle)
+ {
+ rtx (*gen) (rtx, rtx, rtx) = NULL;
+ switch (mode)
+ {
+ case E_V16SImode:
+ gen = gen_avx512f_permvarv16si;
+ break;
+ case E_V16SFmode:
+ gen = gen_avx512f_permvarv16sf;
+ break;
+ case E_V8DImode:
+ gen = gen_avx512f_permvarv8di;
+ break;
+ case E_V8DFmode:
+ gen = gen_avx512f_permvarv8df;
+ break;
+ default:
+ break;
+ }
+ if (gen != NULL)
+ {
+ emit_insn (gen (target, op0, mask));
+ return;
+ }
+ }
+
+ if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
+ return;
+
+ if (TARGET_AVX2)
+ {
+ if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
+ {
+ /* Unfortunately, the VPERMQ and VPERMPD instructions only support
+ a constant shuffle operand. With a tiny bit of effort we can
+ use VPERMD instead. A re-interpretation stall for V4DFmode is
+ unfortunate but there's no avoiding it.
+ Similarly, for V16HImode we don't have an instruction for variable
+ shuffling, while for V32QImode we can, after preparing suitable
+ masks, use vpshufb; vpshufb; vpermq; vpor. */
+
+ if (mode == V16HImode)
+ {
+ maskmode = mode = V32QImode;
+ w = 32;
+ e = 1;
+ }
+ else
+ {
+ maskmode = mode = V8SImode;
+ w = 8;
+ e = 4;
+ }
+ t1 = gen_reg_rtx (maskmode);
+
+ /* Replicate the low bits of the V4DImode mask into V8SImode:
+ mask = { A B C D }
+ t1 = { A A B B C C D D }. */
+ for (i = 0; i < w / 2; ++i)
+ vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
+ vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
+ vt = force_reg (maskmode, vt);
+ mask = gen_lowpart (maskmode, mask);
+ if (maskmode == V8SImode)
+ emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
+ else
+ emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
+
+ /* Multiply the shuffle indices by two. */
+ t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
+ OPTAB_DIRECT);
+
+ /* Add one to the odd shuffle indices:
+ t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
+ for (i = 0; i < w / 2; ++i)
+ {
+ vec[i * 2] = const0_rtx;
+ vec[i * 2 + 1] = const1_rtx;
+ }
+ vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
+ vt = validize_mem (force_const_mem (maskmode, vt));
+ t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
+ OPTAB_DIRECT);
+
+ /* Continue as if V8SImode (resp. V32QImode) was used initially. */
+ operands[3] = mask = t1;
+ target = gen_reg_rtx (mode);
+ op0 = gen_lowpart (mode, op0);
+ op1 = gen_lowpart (mode, op1);
+ }
+
+ switch (mode)
+ {
+ case E_V8SImode:
+ /* The VPERMD and VPERMPS instructions already properly ignore
+ the high bits of the shuffle elements. No need for us to
+ perform an AND ourselves. */
+ if (one_operand_shuffle)
+ {
+ emit_insn (gen_avx2_permvarv8si (target, op0, mask));
+ if (target != operands[0])
+ emit_move_insn (operands[0],
+ gen_lowpart (GET_MODE (operands[0]), target));
+ }
+ else
+ {
+ t1 = gen_reg_rtx (V8SImode);
+ t2 = gen_reg_rtx (V8SImode);
+ emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
+ emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
+ goto merge_two;
+ }
+ return;
+
+ case E_V8SFmode:
+ mask = gen_lowpart (V8SImode, mask);
+ if (one_operand_shuffle)
+ emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
+ else
+ {
+ t1 = gen_reg_rtx (V8SFmode);
+ t2 = gen_reg_rtx (V8SFmode);
+ emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
+ emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
+ goto merge_two;
+ }
+ return;
+
+ case E_V4SImode:
+ /* By combining the two 128-bit input vectors into one 256-bit
+ input vector, we can use VPERMD and VPERMPS for the full
+ two-operand shuffle. */
+ t1 = gen_reg_rtx (V8SImode);
+ t2 = gen_reg_rtx (V8SImode);
+ emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
+ emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
+ emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
+ emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
+ return;
+
+ case E_V4SFmode:
+ t1 = gen_reg_rtx (V8SFmode);
+ t2 = gen_reg_rtx (V8SImode);
+ mask = gen_lowpart (V4SImode, mask);
+ emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
+ emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
+ emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
+ emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
+ return;
+
+ case E_V32QImode:
+ t1 = gen_reg_rtx (V32QImode);
+ t2 = gen_reg_rtx (V32QImode);
+ t3 = gen_reg_rtx (V32QImode);
+ vt2 = GEN_INT (-128);
+ vt = gen_const_vec_duplicate (V32QImode, vt2);
+ vt = force_reg (V32QImode, vt);
+ for (i = 0; i < 32; i++)
+ vec[i] = i < 16 ? vt2 : const0_rtx;
+ vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
+ vt2 = force_reg (V32QImode, vt2);
+ /* From mask create two adjusted masks, which contain the same
+ bits as mask in the low 7 bits of each vector element.
+ The first mask will have the most significant bit clear
+ if it requests element from the same 128-bit lane
+ and MSB set if it requests element from the other 128-bit lane.
+ The second mask will have the opposite values of the MSB,
+ and additionally will have its 128-bit lanes swapped.
+ E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
+ t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
+ t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
+ stands for other 12 bytes. */
+ /* The bit whether element is from the same lane or the other
+ lane is bit 4, so shift it up by 3 to the MSB position. */
+ t5 = gen_reg_rtx (V4DImode);
+ emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
+ GEN_INT (3)));
+ /* Clear MSB bits from the mask just in case it had them set. */
+ emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
+ /* After this t1 will have the MSB set for elements from the same lane. */
+ emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
+ /* Clear bits other than MSB. */
+ emit_insn (gen_andv32qi3 (t1, t1, vt));
+ /* Or in the lower bits from mask into t3. */
+ emit_insn (gen_iorv32qi3 (t3, t1, t2));
+ /* And invert the MSB bits in t1, so the MSB is set for elements from the
+ other lane. */
+ emit_insn (gen_xorv32qi3 (t1, t1, vt));
+ /* Swap 128-bit lanes in t3. */
+ t6 = gen_reg_rtx (V4DImode);
+ emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
+ const2_rtx, GEN_INT (3),
+ const0_rtx, const1_rtx));
+ /* And or in the lower bits from mask into t1. */
+ emit_insn (gen_iorv32qi3 (t1, t1, t2));
+ if (one_operand_shuffle)
+ {
+ /* Each of these shuffles zeroes the bytes whose mask element has
+ its MSB set (the requests this shuffle cannot satisfy), and
+ otherwise shuffles in the requested value. */
+ emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
+ gen_lowpart (V32QImode, t6)));
+ emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
+ /* For t3 the 128-bit lanes are swapped again. */
+ t7 = gen_reg_rtx (V4DImode);
+ emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
+ const2_rtx, GEN_INT (3),
+ const0_rtx, const1_rtx));
+ /* And ORing both together yields the result. */
+ emit_insn (gen_iorv32qi3 (target, t1,
+ gen_lowpart (V32QImode, t7)));
+ if (target != operands[0])
+ emit_move_insn (operands[0],
+ gen_lowpart (GET_MODE (operands[0]), target));
+ return;
+ }
+
+ t4 = gen_reg_rtx (V32QImode);
+ /* Similar to the above one_operand_shuffle code, just
+ repeated twice, once for each operand. The merge_two:
+ code below will merge the two results together. */
+ emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
+ gen_lowpart (V32QImode, t6)));
+ emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
+ gen_lowpart (V32QImode, t6)));
+ emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
+ emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
+ t7 = gen_reg_rtx (V4DImode);
+ emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
+ const2_rtx, GEN_INT (3),
+ const0_rtx, const1_rtx));
+ t8 = gen_reg_rtx (V4DImode);
+ emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
+ const2_rtx, GEN_INT (3),
+ const0_rtx, const1_rtx));
+ emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
+ emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
+ t1 = t4;
+ t2 = t3;
+ goto merge_two;
+
+ default:
+ gcc_assert (GET_MODE_SIZE (mode) <= 16);
+ break;
+ }
+ }
+
+ if (TARGET_XOP)
+ {
+ /* The XOP VPPERM insn supports three inputs. By ignoring the
+ one_operand_shuffle special case, we avoid creating another
+ set of constant vectors in memory. */
+ one_operand_shuffle = false;
+
+ /* mask = mask & {2*w-1, ...} */
+ vt = GEN_INT (2*w - 1);
+ }
+ else
+ {
+ /* mask = mask & {w-1, ...} */
+ vt = GEN_INT (w - 1);
+ }
+
+ vt = gen_const_vec_duplicate (maskmode, vt);
+ mask = expand_simple_binop (maskmode, AND, mask, vt,
+ NULL_RTX, 0, OPTAB_DIRECT);
+
+ /* For non-QImode operations, convert the word permutation control
+ into a byte permutation control. */
+ if (mode != V16QImode)
+ {
+ mask = expand_simple_binop (maskmode, ASHIFT, mask,
+ GEN_INT (exact_log2 (e)),
+ NULL_RTX, 0, OPTAB_DIRECT);
+
+ /* Convert mask to vector of chars. */
+ mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
+
+ /* Replicate each of the input bytes into byte positions:
+ (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
+ (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
+ (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
+ for (i = 0; i < 16; ++i)
+ vec[i] = GEN_INT (i/e * e);
+ vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
+ vt = validize_mem (force_const_mem (V16QImode, vt));
+ if (TARGET_XOP)
+ emit_insn (gen_xop_pperm (mask, mask, mask, vt));
+ else
+ emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
+
+ /* Convert it into the byte positions by doing
+ mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
+ for (i = 0; i < 16; ++i)
+ vec[i] = GEN_INT (i % e);
+ vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
+ vt = validize_mem (force_const_mem (V16QImode, vt));
+ emit_insn (gen_addv16qi3 (mask, mask, vt));
+ }
+
+ /* The actual shuffle operations all operate on V16QImode. */
+ op0 = gen_lowpart (V16QImode, op0);
+ op1 = gen_lowpart (V16QImode, op1);
+
+ if (TARGET_XOP)
+ {
+ if (GET_MODE (target) != V16QImode)
+ target = gen_reg_rtx (V16QImode);
+ emit_insn (gen_xop_pperm (target, op0, op1, mask));
+ if (target != operands[0])
+ emit_move_insn (operands[0],
+ gen_lowpart (GET_MODE (operands[0]), target));
+ }
+ else if (one_operand_shuffle)
+ {
+ if (GET_MODE (target) != V16QImode)
+ target = gen_reg_rtx (V16QImode);
+ emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
+ if (target != operands[0])
+ emit_move_insn (operands[0],
+ gen_lowpart (GET_MODE (operands[0]), target));
+ }
+ else
+ {
+ rtx xops[6];
+ bool ok;
+
+ /* Shuffle the two input vectors independently. */
+ t1 = gen_reg_rtx (V16QImode);
+ t2 = gen_reg_rtx (V16QImode);
+ emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
+ emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
+
+ merge_two:
+ /* Then merge them together. The key is whether any given control
+ element contained a bit set that indicates the second word. */
+ mask = operands[3];
+ vt = GEN_INT (w);
+ if (maskmode == V2DImode && !TARGET_SSE4_1)
+ {
+ /* Without SSE4.1, we don't have V2DImode EQ. Perform one
+ more shuffle to convert the V2DI input mask into a V4SI
+ input mask. At that point the masking that expand_int_vcond
+ performs will work as desired. */
+ rtx t3 = gen_reg_rtx (V4SImode);
+ emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
+ const0_rtx, const0_rtx,
+ const2_rtx, const2_rtx));
+ mask = t3;
+ maskmode = V4SImode;
+ e = w = 4;
+ }
+
+ vt = gen_const_vec_duplicate (maskmode, vt);
+ vt = force_reg (maskmode, vt);
+ mask = expand_simple_binop (maskmode, AND, mask, vt,
+ NULL_RTX, 0, OPTAB_DIRECT);
+
+ if (GET_MODE (target) != mode)
+ target = gen_reg_rtx (mode);
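+ /* Blend the two partial results with a vector conditional move:
+ where (MASK & W) == W the control element selected the second
+ input, so take the element from T2, otherwise from T1. */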
+ xops[0] = target;
+ xops[1] = gen_lowpart (mode, t2);
+ xops[2] = gen_lowpart (mode, t1);
+ xops[3] = gen_rtx_EQ (maskmode, mask, vt);
+ xops[4] = mask;
+ xops[5] = vt;
+ ok = ix86_expand_int_vcond (xops);
+ gcc_assert (ok);
+ if (target != operands[0])
+ emit_move_insn (operands[0],
+ gen_lowpart (GET_MODE (operands[0]), target));
+ }
+}
+
+/* Unpack SRC into the next wider integer vector type, storing the result
+ in DEST. UNSIGNED_P is true if we should do zero extension, else sign
+ extension. HIGH_P is true if we want the N/2 high elements, else the
+ low elements. */
+
+void
+ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
+{
+ machine_mode imode = GET_MODE (src);
+ rtx tmp;
+
+ if (TARGET_SSE4_1)
+ {
+ rtx (*unpack)(rtx, rtx);
+ rtx (*extract)(rtx, rtx) = NULL;
+ machine_mode halfmode = BLKmode;
+
+ switch (imode)
+ {
+ case E_V64QImode:
+ if (unsigned_p)
+ unpack = gen_avx512bw_zero_extendv32qiv32hi2;
+ else
+ unpack = gen_avx512bw_sign_extendv32qiv32hi2;
+ halfmode = V32QImode;
+ extract
+ = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
+ break;
+ case E_V32QImode:
+ if (unsigned_p)
+ unpack = gen_avx2_zero_extendv16qiv16hi2;
+ else
+ unpack = gen_avx2_sign_extendv16qiv16hi2;
+ halfmode = V16QImode;
+ extract
+ = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
+ break;
+ case E_V32HImode:
+ if (unsigned_p)
+ unpack = gen_avx512f_zero_extendv16hiv16si2;
+ else
+ unpack = gen_avx512f_sign_extendv16hiv16si2;
+ halfmode = V16HImode;
+ extract
+ = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
+ break;
+ case E_V16HImode:
+ if (unsigned_p)
+ unpack = gen_avx2_zero_extendv8hiv8si2;
+ else
+ unpack = gen_avx2_sign_extendv8hiv8si2;
+ halfmode = V8HImode;
+ extract
+ = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
+ break;
+ case E_V16SImode:
+ if (unsigned_p)
+ unpack = gen_avx512f_zero_extendv8siv8di2;
+ else
+ unpack = gen_avx512f_sign_extendv8siv8di2;
+ halfmode = V8SImode;
+ extract
+ = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
+ break;
+ case E_V8SImode:
+ if (unsigned_p)
+ unpack = gen_avx2_zero_extendv4siv4di2;
+ else
+ unpack = gen_avx2_sign_extendv4siv4di2;
+ halfmode = V4SImode;
+ extract
+ = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
+ break;
+ case E_V16QImode:
+ if (unsigned_p)
+ unpack = gen_sse4_1_zero_extendv8qiv8hi2;
+ else
+ unpack = gen_sse4_1_sign_extendv8qiv8hi2;
+ break;
+ case E_V8HImode:
+ if (unsigned_p)
+ unpack = gen_sse4_1_zero_extendv4hiv4si2;
+ else
+ unpack = gen_sse4_1_sign_extendv4hiv4si2;
+ break;
+ case E_V4SImode:
+ if (unsigned_p)
+ unpack = gen_sse4_1_zero_extendv2siv2di2;
+ else
+ unpack = gen_sse4_1_sign_extendv2siv2di2;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ if (GET_MODE_SIZE (imode) >= 32)
+ {
+ tmp = gen_reg_rtx (halfmode);
+ emit_insn (extract (tmp, src));
+ }
+ else if (high_p)
+ {
+ /* Shift higher 8 bytes to lower 8 bytes. */
+ tmp = gen_reg_rtx (V1TImode);
+ emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
+ GEN_INT (64)));
+ tmp = gen_lowpart (imode, tmp);
+ }
+ else
+ tmp = src;
+
+ emit_insn (unpack (dest, tmp));
+ }
+ else
+ {
+ rtx (*unpack)(rtx, rtx, rtx);
+
+ switch (imode)
+ {
+ case E_V16QImode:
+ if (high_p)
+ unpack = gen_vec_interleave_highv16qi;
+ else
+ unpack = gen_vec_interleave_lowv16qi;
+ break;
+ case E_V8HImode:
+ if (high_p)
+ unpack = gen_vec_interleave_highv8hi;
+ else
+ unpack = gen_vec_interleave_lowv8hi;
+ break;
+ case E_V4SImode:
+ if (high_p)
+ unpack = gen_vec_interleave_highv4si;
+ else
+ unpack = gen_vec_interleave_lowv4si;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
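+ /* Without the SSE4.1 extension instructions, emulate the extension by
+ interleaving SRC with either a zero vector (zero extension) or with a
+ vector of copies of its sign bits, computed as the mask 0 > SRC
+ (sign extension). */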
+ if (unsigned_p)
+ tmp = force_reg (imode, CONST0_RTX (imode));
+ else
+ tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
+ src, pc_rtx, pc_rtx);
+
+ rtx tmp2 = gen_reg_rtx (imode);
+ emit_insn (unpack (tmp2, src, tmp));
+ emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
+ }
+}
+
+/* Split OPERAND into parts stored in PARTS and return the number of parts
+ (2 to 4). Similar to split_double_mode, but works for floating point
+ values and non-offsettable memories. For pushes, it returns just stack
+ offsets; the values will be saved in the right order. */
+
+static int
+ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
+{
+ int size;
+
+ if (!TARGET_64BIT)
+ size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
+ else
+ size = (GET_MODE_SIZE (mode) + 4) / 8;
+
+ gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
+ gcc_assert (size >= 2 && size <= 4);
+
+ /* Optimize constant pool reference to immediates. This is used by fp
+ moves, that force all constants to memory to allow combining. */
+ if (MEM_P (operand) && MEM_READONLY_P (operand))
+ operand = avoid_constant_pool_reference (operand);
+
+ if (MEM_P (operand) && !offsettable_memref_p (operand))
+ {
+ /* The only non-offsetable memories we handle are pushes. */
+ int ok = push_operand (operand, VOIDmode);
+
+ gcc_assert (ok);
+
+ operand = copy_rtx (operand);
+ PUT_MODE (operand, word_mode);
+ parts[0] = parts[1] = parts[2] = parts[3] = operand;
+ return size;
+ }
+
+ if (GET_CODE (operand) == CONST_VECTOR)
+ {
+ scalar_int_mode imode = int_mode_for_mode (mode).require ();
+ /* Caution: if we looked through a constant pool memory above,
+ the operand may actually have a different mode now. That's
+ ok, since we want to pun this all the way back to an integer. */
+ operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
+ gcc_assert (operand != NULL);
+ mode = imode;
+ }
+
+ if (!TARGET_64BIT)
+ {
+ if (mode == DImode)
+ split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
+ else
+ {
+ int i;
+
+ if (REG_P (operand))
+ {
+ gcc_assert (reload_completed);
+ for (i = 0; i < size; i++)
+ parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
+ }
+ else if (offsettable_memref_p (operand))
+ {
+ operand = adjust_address (operand, SImode, 0);
+ parts[0] = operand;
+ for (i = 1; i < size; i++)
+ parts[i] = adjust_address (operand, SImode, 4 * i);
+ }
+ else if (CONST_DOUBLE_P (operand))
+ {
+ const REAL_VALUE_TYPE *r;
+ long l[4];
+
+ r = CONST_DOUBLE_REAL_VALUE (operand);
+ switch (mode)
+ {
+ case E_TFmode:
+ real_to_target (l, r, mode);
+ parts[3] = gen_int_mode (l[3], SImode);
+ parts[2] = gen_int_mode (l[2], SImode);
+ break;
+ case E_XFmode:
+ /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
+ long double may not be 80-bit. */
+ real_to_target (l, r, mode);
+ parts[2] = gen_int_mode (l[2], SImode);
+ break;
+ case E_DFmode:
+ REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ parts[1] = gen_int_mode (l[1], SImode);
+ parts[0] = gen_int_mode (l[0], SImode);
+ }
+ else
+ gcc_unreachable ();
+ }
+ }
+ else
+ {
+ if (mode == TImode)
+ split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
+ if (mode == XFmode || mode == TFmode)
+ {
+ machine_mode upper_mode = mode==XFmode ? SImode : DImode;
+ if (REG_P (operand))
+ {
+ gcc_assert (reload_completed);
+ parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
+ parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
+ }
+ else if (offsettable_memref_p (operand))
+ {
+ operand = adjust_address (operand, DImode, 0);
+ parts[0] = operand;
+ parts[1] = adjust_address (operand, upper_mode, 8);
+ }
+ else if (CONST_DOUBLE_P (operand))
+ {
+ long l[4];
+
+ real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
+
+ /* real_to_target puts 32-bit pieces in each long. */
+ parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
+ | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
+ << 32), DImode);
+
+ if (upper_mode == SImode)
+ parts[1] = gen_int_mode (l[2], SImode);
+ else
+ parts[1]
+ = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
+ | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
+ << 32), DImode);
+ }
+ else
+ gcc_unreachable ();
+ }
+ }
+
+ return size;
+}
+
+/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
+ The value is split into parts (see ix86_split_to_parts) and the parts
+ are then moved or pushed in an order that avoids clobbering a source
+ part before it has been copied. */
+
+void
+ix86_split_long_move (rtx operands[])
+{
+ rtx part[2][4];
+ int nparts, i, j;
+ int push = 0;
+ int collisions = 0;
+ machine_mode mode = GET_MODE (operands[0]);
+ bool collisionparts[4];
+
+ /* The DFmode expanders may ask us to move a double.
+ For a 64-bit target this is a single move. By hiding the fact
+ here we simplify the i386.md splitters. */
+ if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
+ {
+ /* Optimize constant pool reference to immediates. This is used by
+ fp moves, that force all constants to memory to allow combining. */
+
+ if (MEM_P (operands[1])
+ && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
+ && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
+ operands[1] = get_pool_constant (XEXP (operands[1], 0));
+ if (push_operand (operands[0], VOIDmode))
+ {
+ operands[0] = copy_rtx (operands[0]);
+ PUT_MODE (operands[0], word_mode);
+ }
+ else
+ operands[0] = gen_lowpart (DImode, operands[0]);
+ operands[1] = gen_lowpart (DImode, operands[1]);
+ emit_move_insn (operands[0], operands[1]);
+ return;
+ }
+
+ /* The only non-offsettable memory we handle is a push. */
+ if (push_operand (operands[0], VOIDmode))
+ push = 1;
+ else
+ gcc_assert (!MEM_P (operands[0])
+ || offsettable_memref_p (operands[0]));
+
+ nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
+ ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
+
+ /* When emitting a push, take care of source operands on the stack. */
+ if (push && MEM_P (operands[1])
+ && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
+ {
+ rtx src_base = XEXP (part[1][nparts - 1], 0);
+
+ /* Compensate for the stack decrement by 4. */
+ if (!TARGET_64BIT && nparts == 3
+ && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
+ src_base = plus_constant (Pmode, src_base, 4);
+
+ /* src_base refers to the stack pointer and is
+ automatically decreased by each emitted push. */
+ for (i = 0; i < nparts; i++)
+ part[1][i] = change_address (part[1][i],
+ GET_MODE (part[1][i]), src_base);
+ }
+
+ /* We need to do copy in the right order in case an address register
+ of the source overlaps the destination. */
+ if (REG_P (part[0][0]) && MEM_P (part[1][0]))
+ {
+ rtx tmp;
+
+ for (i = 0; i < nparts; i++)
+ {
+ collisionparts[i]
+ = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
+ if (collisionparts[i])
+ collisions++;
+ }
+
+ /* Collision in the middle part can be handled by reordering. */
+ if (collisions == 1 && nparts == 3 && collisionparts [1])
+ {
+ std::swap (part[0][1], part[0][2]);
+ std::swap (part[1][1], part[1][2]);
+ }
+ else if (collisions == 1
+ && nparts == 4
+ && (collisionparts [1] || collisionparts [2]))
+ {
+ if (collisionparts [1])
+ {
+ std::swap (part[0][1], part[0][2]);
+ std::swap (part[1][1], part[1][2]);
+ }
+ else
+ {
+ std::swap (part[0][2], part[0][3]);
+ std::swap (part[1][2], part[1][3]);
+ }
+ }
+
+ /* If there are more collisions, we can't handle them by reordering.
+ Do an lea to the last part and use only one colliding move. */
+ else if (collisions > 1)
+ {
+ rtx base, addr;
+
+ collisions = 1;
+
+ base = part[0][nparts - 1];
+
+ /* Handle the case when the last part isn't valid for lea.
+ Happens in 64-bit mode storing the 12-byte XFmode. */
+ if (GET_MODE (base) != Pmode)
+ base = gen_rtx_REG (Pmode, REGNO (base));
+
+ addr = XEXP (part[1][0], 0);
+ if (TARGET_TLS_DIRECT_SEG_REFS)
+ {
+ struct ix86_address parts;
+ int ok = ix86_decompose_address (addr, &parts);
+ gcc_assert (ok);
+ /* It is not valid to use %gs: or %fs: in lea. */
+ gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
+ }
+ emit_insn (gen_rtx_SET (base, addr));
+ part[1][0] = replace_equiv_address (part[1][0], base);
+ for (i = 1; i < nparts; i++)
+ {
+ tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
+ part[1][i] = replace_equiv_address (part[1][i], tmp);
+ }
+ }
+ }
+
+ if (push)
+ {
+ if (!TARGET_64BIT)
+ {
+ if (nparts == 3)
+ {
+ if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
+ emit_insn (ix86_gen_add3 (stack_pointer_rtx,
+ stack_pointer_rtx, GEN_INT (-4)));
+ emit_move_insn (part[0][2], part[1][2]);
+ }
+ else if (nparts == 4)
+ {
+ emit_move_insn (part[0][3], part[1][3]);
+ emit_move_insn (part[0][2], part[1][2]);
+ }
+ }
+ else
+ {
+ /* In 64-bit mode we don't have a 32-bit push available. In case this is a
+ register, it is OK - we will just use the larger counterpart. We also
+ retype memory - these come from an attempt to avoid the REX prefix on
+ moving the second half of a TFmode value. */
+ if (GET_MODE (part[1][1]) == SImode)
+ {
+ switch (GET_CODE (part[1][1]))
+ {
+ case MEM:
+ part[1][1] = adjust_address (part[1][1], DImode, 0);
+ break;
+
+ case REG:
+ part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ if (GET_MODE (part[1][0]) == SImode)
+ part[1][0] = part[1][1];
+ }
+ }
+ emit_move_insn (part[0][1], part[1][1]);
+ emit_move_insn (part[0][0], part[1][0]);
+ return;
+ }
+
+ /* Choose the correct order so as not to overwrite the source before it is copied. */
+ if ((REG_P (part[0][0])
+ && REG_P (part[1][1])
+ && (REGNO (part[0][0]) == REGNO (part[1][1])
+ || (nparts == 3
+ && REGNO (part[0][0]) == REGNO (part[1][2]))
+ || (nparts == 4
+ && REGNO (part[0][0]) == REGNO (part[1][3]))))
+ || (collisions > 0
+ && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
+ {
+ for (i = 0, j = nparts - 1; i < nparts; i++, j--)
+ {
+ operands[2 + i] = part[0][j];
+ operands[6 + i] = part[1][j];
+ }
+ }
+ else
+ {
+ for (i = 0; i < nparts; i++)
+ {
+ operands[2 + i] = part[0][i];
+ operands[6 + i] = part[1][i];
+ }
+ }
+
+ /* If optimizing for size, attempt to locally unCSE nonzero constants. */
+ if (optimize_insn_for_size_p ())
+ {
+ for (j = 0; j < nparts - 1; j++)
+ if (CONST_INT_P (operands[6 + j])
+ && operands[6 + j] != const0_rtx
+ && REG_P (operands[2 + j]))
+ for (i = j; i < nparts - 1; i++)
+ if (CONST_INT_P (operands[7 + i])
+ && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
+ operands[7 + i] = operands[2 + j];
+ }
+
+ for (i = 0; i < nparts; i++)
+ emit_move_insn (operands[2 + i], operands[6 + i]);
+
+ return;
+}
+
+/* Helper function of ix86_split_ashl used to generate an SImode/DImode
+ left shift by a constant, either using a single shift or
+ a sequence of add instructions. */
+
+static void
+ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
+{
+ rtx (*insn)(rtx, rtx, rtx);
+
+ if (count == 1
+ || (count * ix86_cost->add <= ix86_cost->shift_const
+ && !optimize_insn_for_size_p ()))
+ {
+ insn = mode == DImode ? gen_addsi3 : gen_adddi3;
+ while (count-- > 0)
+ emit_insn (insn (operand, operand, operand));
+ }
+ else
+ {
+ insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
+ emit_insn (insn (operand, operand, GEN_INT (count)));
+ }
+}
+
+void
+ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
+{
+ rtx (*gen_ashl3)(rtx, rtx, rtx);
+ rtx (*gen_shld)(rtx, rtx, rtx);
+ int half_width = GET_MODE_BITSIZE (mode) >> 1;
+
+ rtx low[2], high[2];
+ int count;
+
+ if (CONST_INT_P (operands[2]))
+ {
+ split_double_mode (mode, operands, 2, low, high);
+ count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
+
+ if (count >= half_width)
+ {
+ emit_move_insn (high[0], low[1]);
+ emit_move_insn (low[0], const0_rtx);
+
+ if (count > half_width)
+ ix86_expand_ashl_const (high[0], count - half_width, mode);
+ }
+ else
+ {
+ gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
+
+ if (!rtx_equal_p (operands[0], operands[1]))
+ emit_move_insn (operands[0], operands[1]);
+
+ emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
+ ix86_expand_ashl_const (low[0], count, mode);
+ }
+ return;
+ }
+
+ split_double_mode (mode, operands, 1, low, high);
+
+ gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
+
+ if (operands[1] == const1_rtx)
+ {
+ /* Assuming we've chosen QImode-capable registers, then 1 << N
+ can be done with two 32/64-bit shifts, no branches, no cmoves. */
+ if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
+ {
+ rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
+
+ ix86_expand_clear (low[0]);
+ ix86_expand_clear (high[0]);
+ emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
+
+ d = gen_lowpart (QImode, low[0]);
+ d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
+ s = gen_rtx_EQ (QImode, flags, const0_rtx);
+ emit_insn (gen_rtx_SET (d, s));
+
+ d = gen_lowpart (QImode, high[0]);
+ d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
+ s = gen_rtx_NE (QImode, flags, const0_rtx);
+ emit_insn (gen_rtx_SET (d, s));
+ }
+
+ /* Otherwise, we can get the same results by manually performing
+ a bit extract operation on bit 5/6, and then performing the two
+ shifts. The two methods of getting 0/1 into low/high are exactly
+ the same size. Avoiding the shift in the bit extract case helps
+ pentium4 a bit; no one else seems to care much either way. */
+ else
+ {
+ machine_mode half_mode;
+ rtx (*gen_lshr3)(rtx, rtx, rtx);
+ rtx (*gen_and3)(rtx, rtx, rtx);
+ rtx (*gen_xor3)(rtx, rtx, rtx);
+ HOST_WIDE_INT bits;
+ rtx x;
+
+ if (mode == DImode)
+ {
+ half_mode = SImode;
+ gen_lshr3 = gen_lshrsi3;
+ gen_and3 = gen_andsi3;
+ gen_xor3 = gen_xorsi3;
+ bits = 5;
+ }
+ else
+ {
+ half_mode = DImode;
+ gen_lshr3 = gen_lshrdi3;
+ gen_and3 = gen_anddi3;
+ gen_xor3 = gen_xordi3;
+ bits = 6;
+ }
+
+ if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
+ x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
+ else
+ x = gen_lowpart (half_mode, operands[2]);
+ emit_insn (gen_rtx_SET (high[0], x));
+
+ emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
+ emit_insn (gen_and3 (high[0], high[0], const1_rtx));
+ emit_move_insn (low[0], high[0]);
+ emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
+ }
+
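+ /* Shift both halves by the full count. The hardware masks the shift
+ count to the half-word width, so the half holding the 1 is effectively
+ shifted by COUNT % HALF_WIDTH, which is exactly what we need. */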
+ emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
+ emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
+ return;
+ }
+
+ if (operands[1] == constm1_rtx)
+ {
+ /* For -1 << N, we can avoid the shld instruction, because we
+ know that we're shifting 0...31/63 ones into a -1. */
+ emit_move_insn (low[0], constm1_rtx);
+ if (optimize_insn_for_size_p ())
+ emit_move_insn (high[0], low[0]);
+ else
+ emit_move_insn (high[0], constm1_rtx);
+ }
+ else
+ {
+ gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
+
+ if (!rtx_equal_p (operands[0], operands[1]))
+ emit_move_insn (operands[0], operands[1]);
+
+ split_double_mode (mode, operands, 1, low, high);
+ emit_insn (gen_shld (high[0], low[0], operands[2]));
+ }
+
+ emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
+
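+ /* The code above is correct only for counts below HALF_WIDTH. Fix up
+ larger counts: when the HALF_WIDTH bit of the count is set, move the
+ shifted low half into the high half and clear the low half, either
+ with conditional moves using the zeroed SCRATCH or with a conditional
+ branch. */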
+ if (TARGET_CMOVE && scratch)
+ {
+ rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
+ = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
+
+ ix86_expand_clear (scratch);
+ emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
+ }
+ else
+ {
+ rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
+ = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
+
+ emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
+ }
+}
+
+void
+ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
+{
+ rtx (*gen_ashr3)(rtx, rtx, rtx)
+ = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
+ rtx (*gen_shrd)(rtx, rtx, rtx);
+ int half_width = GET_MODE_BITSIZE (mode) >> 1;
+
+ rtx low[2], high[2];
+ int count;
+
+ if (CONST_INT_P (operands[2]))
+ {
+ split_double_mode (mode, operands, 2, low, high);
+ count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
+
+ if (count == GET_MODE_BITSIZE (mode) - 1)
+ {
+ emit_move_insn (high[0], high[1]);
+ emit_insn (gen_ashr3 (high[0], high[0],
+ GEN_INT (half_width - 1)));
+ emit_move_insn (low[0], high[0]);
+
+ }
+ else if (count >= half_width)
+ {
+ emit_move_insn (low[0], high[1]);
+ emit_move_insn (high[0], low[0]);
+ emit_insn (gen_ashr3 (high[0], high[0],
+ GEN_INT (half_width - 1)));
+
+ if (count > half_width)
+ emit_insn (gen_ashr3 (low[0], low[0],
+ GEN_INT (count - half_width)));
+ }
+ else
+ {
+ gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
+
+ if (!rtx_equal_p (operands[0], operands[1]))
+ emit_move_insn (operands[0], operands[1]);
+
+ emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
+ emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
+ }
+ }
+ else
+ {
+ gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
+
+ if (!rtx_equal_p (operands[0], operands[1]))
+ emit_move_insn (operands[0], operands[1]);
+
+ split_double_mode (mode, operands, 1, low, high);
+
+ emit_insn (gen_shrd (low[0], high[0], operands[2]));
+ emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
+
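+ /* Fix up counts of HALF_WIDTH and above: move the arithmetically
+ shifted high half into the low half and fill the high half with sign
+ copies, either with conditional moves using SCRATCH (preloaded with
+ the sign word) or with a conditional branch. */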
+ if (TARGET_CMOVE && scratch)
+ {
+ rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
+ = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
+
+ emit_move_insn (scratch, high[0]);
+ emit_insn (gen_ashr3 (scratch, scratch,
+ GEN_INT (half_width - 1)));
+ emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
+ scratch));
+ }
+ else
+ {
+ rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
+ = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
+
+ emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
+ }
+ }
+}
+
+void
+ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
+{
+ rtx (*gen_lshr3)(rtx, rtx, rtx)
+ = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
+ rtx (*gen_shrd)(rtx, rtx, rtx);
+ int half_width = GET_MODE_BITSIZE (mode) >> 1;
+
+ rtx low[2], high[2];
+ int count;
+
+ if (CONST_INT_P (operands[2]))
+ {
+ split_double_mode (mode, operands, 2, low, high);
+ count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
+
+ if (count >= half_width)
+ {
+ emit_move_insn (low[0], high[1]);
+ ix86_expand_clear (high[0]);
+
+ if (count > half_width)
+ emit_insn (gen_lshr3 (low[0], low[0],
+ GEN_INT (count - half_width)));
+ }
+ else
+ {
+ gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
+
+ if (!rtx_equal_p (operands[0], operands[1]))
+ emit_move_insn (operands[0], operands[1]);
+
+ emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
+ emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
+ }
+ }
+ else
+ {
+ gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
+
+ if (!rtx_equal_p (operands[0], operands[1]))
+ emit_move_insn (operands[0], operands[1]);
+
+ split_double_mode (mode, operands, 1, low, high);
+
+ emit_insn (gen_shrd (low[0], high[0], operands[2]));
+ emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
+
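+ /* Fix up counts of HALF_WIDTH and above: move the logically shifted
+ high half into the low half and clear the high half, either with
+ conditional moves using the zeroed SCRATCH or with a conditional
+ branch. */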
+ if (TARGET_CMOVE && scratch)
+ {
+ rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
+ = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
+
+ ix86_expand_clear (scratch);
+ emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
+ scratch));
+ }
+ else
+ {
+ rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
+ = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
+
+ emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
+ }
+ }
+}
+
+/* Return mode for the memcpy/memset loop counter. Prefer SImode over
+ DImode for constant loop counts. */
+
+static machine_mode
+counter_mode (rtx count_exp)
+{
+ if (GET_MODE (count_exp) != VOIDmode)
+ return GET_MODE (count_exp);
+ if (!CONST_INT_P (count_exp))
+ return Pmode;
+ if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
+ return DImode;
+ return SImode;
+}
+
+/* When ISSETMEM is FALSE, output a simple loop to move the memory pointed to
+ by SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times, with overall
+ size COUNT specified in bytes. When ISSETMEM is TRUE, output the equivalent
+ loop to fill the memory with VALUE (expected to be in MODE).
+
+ The size is rounded down to a whole number of chunks moved at once.
+ SRCMEM and DESTMEM provide the MEM rtx to feed proper aliasing info. */
+
+
+static void
+expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
+ rtx destptr, rtx srcptr, rtx value,
+ rtx count, machine_mode mode, int unroll,
+ int expected_size, bool issetmem)
+{
+ rtx_code_label *out_label, *top_label;
+ rtx iter, tmp;
+ machine_mode iter_mode = counter_mode (count);
+ int piece_size_n = GET_MODE_SIZE (mode) * unroll;
+ rtx piece_size = GEN_INT (piece_size_n);
+ rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
+ rtx size;
+ int i;
+
+ top_label = gen_label_rtx ();
+ out_label = gen_label_rtx ();
+ iter = gen_reg_rtx (iter_mode);
+
+ size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
+ NULL, 1, OPTAB_DIRECT);
+ /* Those two should combine. */
+ if (piece_size == const1_rtx)
+ {
+ emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
+ true, out_label);
+ predict_jump (REG_BR_PROB_BASE * 10 / 100);
+ }
+ emit_move_insn (iter, const0_rtx);
+
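+ /* The generated loop has the form
+ iter = 0;
+ top: copy or set PIECE_SIZE bytes at DEST + ITER (and SRC + ITER);
+ iter += PIECE_SIZE;
+ if (iter < size) goto top;
+ and DESTPTR (and SRCPTR) are advanced by ITER afterwards. */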
+ emit_label (top_label);
+
+ tmp = convert_modes (Pmode, iter_mode, iter, true);
+
+ /* This assert could be relaxed - in that case we'd need to compute
+ the largest power of two that divides PIECE_SIZE_N and pass it to
+ offset_address. */
+ gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
+ destmem = offset_address (destmem, tmp, piece_size_n);
+ destmem = adjust_address (destmem, mode, 0);
+
+ if (!issetmem)
+ {
+ srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
+ srcmem = adjust_address (srcmem, mode, 0);
+
+ /* When unrolling for chips that reorder memory reads and writes,
+ we can save registers by using a single temporary.
+ Also, using 4 temporaries is overkill in 32-bit mode. */
+ if (!TARGET_64BIT && 0)
+ {
+ for (i = 0; i < unroll; i++)
+ {
+ if (i)
+ {
+ destmem = adjust_address (copy_rtx (destmem), mode,
+ GET_MODE_SIZE (mode));
+ srcmem = adjust_address (copy_rtx (srcmem), mode,
+ GET_MODE_SIZE (mode));
+ }
+ emit_move_insn (destmem, srcmem);
+ }
+ }
+ else
+ {
+ rtx tmpreg[4];
+ gcc_assert (unroll <= 4);
+ for (i = 0; i < unroll; i++)
+ {
+ tmpreg[i] = gen_reg_rtx (mode);
+ if (i)
+ srcmem = adjust_address (copy_rtx (srcmem), mode,
+ GET_MODE_SIZE (mode));
+ emit_move_insn (tmpreg[i], srcmem);
+ }
+ for (i = 0; i < unroll; i++)
+ {
+ if (i)
+ destmem = adjust_address (copy_rtx (destmem), mode,
+ GET_MODE_SIZE (mode));
+ emit_move_insn (destmem, tmpreg[i]);
+ }
+ }
+ }
+ else
+ for (i = 0; i < unroll; i++)
+ {
+ if (i)
+ destmem = adjust_address (copy_rtx (destmem), mode,
+ GET_MODE_SIZE (mode));
+ emit_move_insn (destmem, value);
+ }
+
+ tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
+ true, OPTAB_LIB_WIDEN);
+ if (tmp != iter)
+ emit_move_insn (iter, tmp);
+
+ emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+ true, top_label);
+ if (expected_size != -1)
+ {
+ expected_size /= GET_MODE_SIZE (mode) * unroll;
+ if (expected_size == 0)
+ predict_jump (0);
+ else if (expected_size > REG_BR_PROB_BASE)
+ predict_jump (REG_BR_PROB_BASE - 1);
+ else
+ predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
+ / expected_size);
+ }
+ else
+ predict_jump (REG_BR_PROB_BASE * 80 / 100);
+ iter = ix86_zero_extend_to_Pmode (iter);
+ tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
+ true, OPTAB_LIB_WIDEN);
+ if (tmp != destptr)
+ emit_move_insn (destptr, tmp);
+ if (!issetmem)
+ {
+ tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
+ true, OPTAB_LIB_WIDEN);
+ if (tmp != srcptr)
+ emit_move_insn (srcptr, tmp);
+ }
+ emit_label (out_label);
+}
+
+/* Divide COUNTREG by SCALE. */
+static rtx
+scale_counter (rtx countreg, int scale)
+{
+ rtx sc;
+
+ if (scale == 1)
+ return countreg;
+ if (CONST_INT_P (countreg))
+ return GEN_INT (INTVAL (countreg) / scale);
+ gcc_assert (REG_P (countreg));
+
+ sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
+ GEN_INT (exact_log2 (scale)),
+ NULL, 1, OPTAB_DIRECT);
+ return sc;
+}
+
+/* Output a "rep; mov" or "rep; stos" instruction depending on the ISSETMEM
+ argument. When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
+ When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
+ For the setmem case, VALUE is ORIG_VALUE promoted to a wider size;
+ ORIG_VALUE is the original value passed to memset to fill the memory with.
+ The other arguments have the same meaning as for the previous function. */
+
+static void
+expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
+ rtx destptr, rtx srcptr, rtx value, rtx orig_value,
+ rtx count,
+ machine_mode mode, bool issetmem)
+{
+ rtx destexp;
+ rtx srcexp;
+ rtx countreg;
+ HOST_WIDE_INT rounded_count;
+
+ /* If possible, it is shorter to use rep movs.
+ TODO: Maybe it is better to move this logic to decide_alg. */
+ if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
+ && (!issetmem || orig_value == const0_rtx))
+ mode = SImode;
+
+ if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
+ destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
+
+ countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
+ GET_MODE_SIZE (mode)));
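+ /* The rep patterns also set DESTPTR (and SRCPTR) to their final values,
+ so build DESTEXP (and SRCEXP below), the expressions for the pointer
+ plus the number of bytes moved. */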
+ if (mode != QImode)
+ {
+ destexp = gen_rtx_ASHIFT (Pmode, countreg,
+ GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
+ destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
+ }
+ else
+ destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
+ if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
+ {
+ rounded_count
+ = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
+ destmem = shallow_copy_rtx (destmem);
+ set_mem_size (destmem, rounded_count);
+ }
+ else if (MEM_SIZE_KNOWN_P (destmem))
+ clear_mem_size (destmem);
+
+ if (issetmem)
+ {
+ value = force_reg (mode, gen_lowpart (mode, value));
+ emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
+ }
+ else
+ {
+ if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
+ srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
+ if (mode != QImode)
+ {
+ srcexp = gen_rtx_ASHIFT (Pmode, countreg,
+ GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
+ srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
+ }
+ else
+ srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
+ if (CONST_INT_P (count))
+ {
+ rounded_count
+ = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
+ srcmem = shallow_copy_rtx (srcmem);
+ set_mem_size (srcmem, rounded_count);
+ }
+ else
+ {
+ if (MEM_SIZE_KNOWN_P (srcmem))
+ clear_mem_size (srcmem);
+ }
+ emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
+ destexp, srcexp));
+ }
+}
+
+/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
+ DESTMEM.
+ SRCMEM is passed by pointer so it can be updated on return.
+ The return value is the updated DESTMEM. */
+static rtx
+emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
+ HOST_WIDE_INT size_to_move)
+{
+ rtx dst = destmem, src = *srcmem, adjust, tempreg;
+ enum insn_code code;
+ machine_mode move_mode;
+ int piece_size, i;
+
+ /* Find the widest mode in which we could perform moves.
+ Start with the biggest power of 2 not greater than SIZE_TO_MOVE and
+ halve it until a move of such size is supported. */
+ piece_size = 1 << floor_log2 (size_to_move);
+ while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
+ || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
+ {
+ gcc_assert (piece_size > 1);
+ piece_size >>= 1;
+ }
+
+ /* Find the corresponding vector mode with the same size as MOVE_MODE.
+ MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
+ if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
+ {
+ int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
+ if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
+ || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
+ {
+ move_mode = word_mode;
+ piece_size = GET_MODE_SIZE (move_mode);
+ code = optab_handler (mov_optab, move_mode);
+ }
+ }
+ gcc_assert (code != CODE_FOR_nothing);
+
+ dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
+ src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
+
+ /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
+ gcc_assert (size_to_move % piece_size == 0);
+ adjust = GEN_INT (piece_size);
+ for (i = 0; i < size_to_move; i += piece_size)
+ {
+ /* We move from memory to memory, so we'll need to do it via
+ a temporary register. */
+ tempreg = gen_reg_rtx (move_mode);
+ emit_insn (GEN_FCN (code) (tempreg, src));
+ emit_insn (GEN_FCN (code) (dst, tempreg));
+
+ emit_move_insn (destptr,
+ gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
+ emit_move_insn (srcptr,
+ gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
+
+ dst = adjust_automodify_address_nv (dst, move_mode, destptr,
+ piece_size);
+ src = adjust_automodify_address_nv (src, move_mode, srcptr,
+ piece_size);
+ }
+
+ /* Update DST and SRC rtx. */
+ *srcmem = src;
+ return dst;
+}
+
+/* Helper function for the string operations below. Test VARIABLE for the
+ bits given by VALUE; if none are set (i.e. VARIABLE is aligned with respect
+ to VALUE), jump to the returned label, which the caller emits after the
+ conditionally executed code. */
+
+static rtx_code_label *
+ix86_expand_aligntest (rtx variable, int value, bool epilogue)
+{
+ rtx_code_label *label = gen_label_rtx ();
+ rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
+ if (GET_MODE (variable) == DImode)
+ emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
+ else
+ emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
+ emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
+ 1, label);
+ if (epilogue)
+ predict_jump (REG_BR_PROB_BASE * 50 / 100);
+ else
+ predict_jump (REG_BR_PROB_BASE * 90 / 100);
+ return label;
+}
+
+
+/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
+
+static void
+expand_movmem_epilogue (rtx destmem, rtx srcmem,
+ rtx destptr, rtx srcptr, rtx count, int max_size)
+{
+ rtx src, dest;
+ if (CONST_INT_P (count))
+ {
+ HOST_WIDE_INT countval = INTVAL (count);
+ HOST_WIDE_INT epilogue_size = countval % max_size;
+ int i;
+
+ /* For now MAX_SIZE should be a power of 2. This assert could be
+ relaxed, but it would require a somewhat more complicated epilogue
+ expansion. */
+ gcc_assert ((max_size & (max_size - 1)) == 0);
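+ /* Emit one block move for every bit set in the residual count, from
+ the largest piece down; emit_memmov advances both pointers as it
+ goes. */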
+ for (i = max_size; i >= 1; i >>= 1)
+ {
+ if (epilogue_size & i)
+ destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
+ }
+ return;
+ }
+ if (max_size > 8)
+ {
+ count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
+ count, 1, OPTAB_DIRECT);
+ expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
+ count, QImode, 1, 4, false);
+ return;
+ }
+
+ /* When there are stringops, we can cheaply increase dest and src pointers.
+ Otherwise we save code size by maintaining offset (zero is readily
+ available from preceding rep operation) and using x86 addressing modes.
+ */
+ if (TARGET_SINGLE_STRINGOP)
+ {
+ if (max_size > 4)
+ {
+ rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
+ src = change_address (srcmem, SImode, srcptr);
+ dest = change_address (destmem, SImode, destptr);
+ emit_insn (gen_strmov (destptr, dest, srcptr, src));
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (max_size > 2)
+ {
+ rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
+ src = change_address (srcmem, HImode, srcptr);
+ dest = change_address (destmem, HImode, destptr);
+ emit_insn (gen_strmov (destptr, dest, srcptr, src));
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (max_size > 1)
+ {
+ rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
+ src = change_address (srcmem, QImode, srcptr);
+ dest = change_address (destmem, QImode, destptr);
+ emit_insn (gen_strmov (destptr, dest, srcptr, src));
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ }
+ else
+ {
+ rtx offset = force_reg (Pmode, const0_rtx);
+ rtx tmp;
+
+ if (max_size > 4)
+ {
+ rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
+ src = change_address (srcmem, SImode, srcptr);
+ dest = change_address (destmem, SImode, destptr);
+ emit_move_insn (dest, src);
+ tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
+ true, OPTAB_LIB_WIDEN);
+ if (tmp != offset)
+ emit_move_insn (offset, tmp);
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (max_size > 2)
+ {
+ rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
+ tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
+ src = change_address (srcmem, HImode, tmp);
+ tmp = gen_rtx_PLUS (Pmode, destptr, offset);
+ dest = change_address (destmem, HImode, tmp);
+ emit_move_insn (dest, src);
+ tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
+ true, OPTAB_LIB_WIDEN);
+ if (tmp != offset)
+ emit_move_insn (offset, tmp);
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (max_size > 1)
+ {
+ rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
+ tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
+ src = change_address (srcmem, QImode, tmp);
+ tmp = gen_rtx_PLUS (Pmode, destptr, offset);
+ dest = change_address (destmem, QImode, tmp);
+ emit_move_insn (dest, src);
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ }
+}
+
+/* This function emits stores to fill SIZE_TO_MOVE bytes starting at DESTMEM
+ with the value PROMOTED_VAL.
+ The return value is the updated DESTMEM. */
+static rtx
+emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
+ HOST_WIDE_INT size_to_move)
+{
+ rtx dst = destmem, adjust;
+ enum insn_code code;
+ machine_mode move_mode;
+ int piece_size, i;
+
+ /* Find the mode in which to perform the stores: normally the mode of
+ PROMOTED_VAL, narrowed when SIZE_TO_MOVE is smaller than that mode's
+ size. */
+ move_mode = GET_MODE (promoted_val);
+ if (move_mode == VOIDmode)
+ move_mode = QImode;
+ if (size_to_move < GET_MODE_SIZE (move_mode))
+ {
+ unsigned int move_bits = size_to_move * BITS_PER_UNIT;
+ move_mode = int_mode_for_size (move_bits, 0).require ();
+ promoted_val = gen_lowpart (move_mode, promoted_val);
+ }
+ piece_size = GET_MODE_SIZE (move_mode);
+ code = optab_handler (mov_optab, move_mode);
+ gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
+
+ dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
+
+ /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
+ gcc_assert (size_to_move % piece_size == 0);
+ adjust = GEN_INT (piece_size);
+ for (i = 0; i < size_to_move; i += piece_size)
+ {
+ if (piece_size <= GET_MODE_SIZE (word_mode))
+ {
+ emit_insn (gen_strset (destptr, dst, promoted_val));
+ dst = adjust_automodify_address_nv (dst, move_mode, destptr,
+ piece_size);
+ continue;
+ }
+
+ emit_insn (GEN_FCN (code) (dst, promoted_val));
+
+ emit_move_insn (destptr,
+ gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
+
+ dst = adjust_automodify_address_nv (dst, move_mode, destptr,
+ piece_size);
+ }
+
+ /* Update DST rtx. */
+ return dst;
+}
+
+/* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
+static void
+expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
+ rtx count, int max_size)
+{
+ count = expand_simple_binop (counter_mode (count), AND, count,
+ GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
+ expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
+ gen_lowpart (QImode, value), count, QImode,
+ 1, max_size / 2, true);
+}
+
+/* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
+static void
+expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
+ rtx count, int max_size)
+{
+ rtx dest;
+
+ if (CONST_INT_P (count))
+ {
+ HOST_WIDE_INT countval = INTVAL (count);
+ HOST_WIDE_INT epilogue_size = countval % max_size;
+ int i;
+
+ /* For now MAX_SIZE should be a power of 2. This assert could be
+ relaxed, but it would require a somewhat more complicated epilogue
+ expansion. */
+ gcc_assert ((max_size & (max_size - 1)) == 0);
+ for (i = max_size; i >= 1; i >>= 1)
+ {
+ if (epilogue_size & i)
+ {
+ if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
+ destmem = emit_memset (destmem, destptr, vec_value, i);
+ else
+ destmem = emit_memset (destmem, destptr, value, i);
+ }
+ }
+ return;
+ }
+ if (max_size > 32)
+ {
+ expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
+ return;
+ }
+ if (max_size > 16)
+ {
+ rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
+ if (TARGET_64BIT)
+ {
+ dest = change_address (destmem, DImode, destptr);
+ emit_insn (gen_strset (destptr, dest, value));
+ dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
+ emit_insn (gen_strset (destptr, dest, value));
+ }
+ else
+ {
+ dest = change_address (destmem, SImode, destptr);
+ emit_insn (gen_strset (destptr, dest, value));
+ dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
+ emit_insn (gen_strset (destptr, dest, value));
+ dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
+ emit_insn (gen_strset (destptr, dest, value));
+ dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
+ emit_insn (gen_strset (destptr, dest, value));
+ }
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (max_size > 8)
+ {
+ rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
+ if (TARGET_64BIT)
+ {
+ dest = change_address (destmem, DImode, destptr);
+ emit_insn (gen_strset (destptr, dest, value));
+ }
+ else
+ {
+ dest = change_address (destmem, SImode, destptr);
+ emit_insn (gen_strset (destptr, dest, value));
+ dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
+ emit_insn (gen_strset (destptr, dest, value));
+ }
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (max_size > 4)
+ {
+ rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
+ dest = change_address (destmem, SImode, destptr);
+ emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (max_size > 2)
+ {
+ rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
+ dest = change_address (destmem, HImode, destptr);
+ emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+ if (max_size > 1)
+ {
+ rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
+ dest = change_address (destmem, QImode, destptr);
+ emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+}
+
+/* Adjust COUNTER by the VALUE. */
+static void
+ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
+{
+ rtx (*gen_add)(rtx, rtx, rtx)
+ = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
+
+ emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
+}
+
+/* Depending on ISSETMEM, copy enough bytes from SRCMEM to DESTMEM, or store
+ enough bytes into DESTMEM, to align it to DESIRED_ALIGNMENT. The original
+ alignment is ALIGN.
+ Depending on ISSETMEM, either the arguments SRCMEM/SRCPTR or
+ VALUE/VEC_VALUE are ignored.
+ The return value is the updated DESTMEM. */
+
+static rtx
+expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
+ rtx destptr, rtx srcptr, rtx value,
+ rtx vec_value, rtx count, int align,
+ int desired_alignment, bool issetmem)
+{
+ int i;
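+ /* For each alignment bit below DESIRED_ALIGNMENT that ALIGN does not
+ already guarantee, test the corresponding bit of DESTPTR at run time
+ and, if it is set, copy (or set) I bytes so the pointer becomes
+ 2*I-aligned; COUNT is decreased accordingly. */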
+ for (i = 1; i < desired_alignment; i <<= 1)
+ {
+ if (align <= i)
+ {
+ rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
+ if (issetmem)
+ {
+ if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
+ destmem = emit_memset (destmem, destptr, vec_value, i);
+ else
+ destmem = emit_memset (destmem, destptr, value, i);
+ }
+ else
+ destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
+ ix86_adjust_counter (count, i);
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
+ }
+ }
+ return destmem;
+}
+
+/* Test if COUNT&SIZE is nonzero and if so, expand a movmem
+ or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
+ and jump to DONE_LABEL. */
+static void
+expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
+ rtx destptr, rtx srcptr,
+ rtx value, rtx vec_value,
+ rtx count, int size,
+ rtx done_label, bool issetmem)
+{
+ rtx_code_label *label = ix86_expand_aligntest (count, size, false);
+ machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
+ rtx modesize;
+ int n;
+
+ /* If we do not have a vector value to copy, we must reduce the mode size. */
+ if (issetmem)
+ {
+ if (!vec_value)
+ {
+ if (GET_MODE (value) == VOIDmode && size > 8)
+ mode = Pmode;
+ else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
+ mode = GET_MODE (value);
+ }
+ else
+ mode = GET_MODE (vec_value), value = vec_value;
+ }
+ else
+ {
+ /* Choose appropriate vector mode. */
+ if (size >= 32)
+ mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
+ else if (size >= 16)
+ mode = TARGET_SSE ? V16QImode : DImode;
+ srcmem = change_address (srcmem, mode, srcptr);
+ }
+ destmem = change_address (destmem, mode, destptr);
+ modesize = GEN_INT (GET_MODE_SIZE (mode));
+ gcc_assert (GET_MODE_SIZE (mode) <= size);
+ for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
+ {
+ if (issetmem)
+ emit_move_insn (destmem, gen_lowpart (mode, value));
+ else
+ {
+ emit_move_insn (destmem, srcmem);
+ srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
+ }
+ destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
+ }
+
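+ /* The loop above handled the first SIZE bytes. Now point DESTMEM (and
+ SRCMEM) at DESTPTR + COUNT - SIZE and copy or set the last SIZE bytes;
+ for block sizes between SIZE and 2*SIZE-1 the two regions overlap and
+ together cover the whole block. */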
+ destmem = offset_address (destmem, count, 1);
+ destmem = offset_address (destmem, GEN_INT (-2 * size),
+ GET_MODE_SIZE (mode));
+ if (!issetmem)
+ {
+ srcmem = offset_address (srcmem, count, 1);
+ srcmem = offset_address (srcmem, GEN_INT (-2 * size),
+ GET_MODE_SIZE (mode));
+ }
+ for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
+ {
+ if (issetmem)
+ emit_move_insn (destmem, gen_lowpart (mode, value));
+ else
+ {
+ emit_move_insn (destmem, srcmem);
+ srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
+ }
+ destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
+ }
+ emit_jump_insn (gen_jump (done_label));
+ emit_barrier ();
+
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+}
+
+/* Handle a small memcpy (up to SIZE, which is supposed to be a small power
+ of 2) and get ready for the main memcpy loop by copying the initial
+ DESIRED_ALIGN-ALIGN bytes and the last SIZE bytes, adjusting
+ DESTPTR/SRCPTR/COUNT in a way that lets us proceed with a loop copying
+ SIZE bytes at once. Do the moves in MODE.
+ DONE_LABEL is a label after the whole copying sequence. The label is created
+ on demand if *DONE_LABEL is NULL.
+ MIN_SIZE is the minimal size of the copied block. This value gets adjusted
+ to the new bounds after the initial copies.
+
+ DESTMEM/SRCMEM are memory expressions pointing to the copied block,
+ DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
+ we will dispatch to a library call for large blocks.
+
+ In pseudocode we do:
+
+ if (COUNT < SIZE)
+ {
+ Assume that SIZE is 4. Bigger sizes are handled analogously
+ if (COUNT & 4)
+ {
+ copy 4 bytes from SRCPTR to DESTPTR
+ copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
+ goto done_label
+ }
+ if (!COUNT)
+ goto done_label;
+ copy 1 byte from SRCPTR to DESTPTR
+ if (COUNT & 2)
+ {
+ copy 2 bytes from SRCPTR to DESTPTR
+ copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
+ }
+ }
+ else
+ {
+ copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
+ copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
+
+ OLD_DESTPTR = DESTPTR;
+ Align DESTPTR up to DESIRED_ALIGN
+ SRCPTR += DESTPTR - OLD_DESTPTR
+ COUNT -= DESTPTR - OLD_DESTPTR
+ if (DYNAMIC_CHECK)
+ Round COUNT down to multiple of SIZE
+ << optional caller supplied zero size guard is here >>
+ << optional caller supplied dynamic check is here >>
+ << caller supplied main copy loop is here >>
+ }
+ done_label:
+ */
+static void
+expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
+ rtx *destptr, rtx *srcptr,
+ machine_mode mode,
+ rtx value, rtx vec_value,
+ rtx *count,
+ rtx_code_label **done_label,
+ int size,
+ int desired_align,
+ int align,
+ unsigned HOST_WIDE_INT *min_size,
+ bool dynamic_check,
+ bool issetmem)
+{
+ rtx_code_label *loop_label = NULL, *label;
+ int n;
+ rtx modesize;
+ int prolog_size = 0;
+ rtx mode_value;
+
+ /* Choose the proper value to copy. */
+ if (issetmem && VECTOR_MODE_P (mode))
+ mode_value = vec_value;
+ else
+ mode_value = value;
+ gcc_assert (GET_MODE_SIZE (mode) <= size);
+
+ /* See if block is big or small, handle small blocks. */
+ if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
+ {
+ int size2 = size;
+ loop_label = gen_label_rtx ();
+
+ if (!*done_label)
+ *done_label = gen_label_rtx ();
+
+ emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
+ 1, loop_label);
+ size2 >>= 1;
+
+ /* Handle sizes > 3. */
+ for (;size2 > 2; size2 >>= 1)
+ expand_small_movmem_or_setmem (destmem, srcmem,
+ *destptr, *srcptr,
+ value, vec_value,
+ *count,
+ size2, *done_label, issetmem);
+ /* Nothing to copy? Jump to DONE_LABEL if so. */
+ emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
+ 1, *done_label);
+
+ /* Do a byte copy. */
+ destmem = change_address (destmem, QImode, *destptr);
+ if (issetmem)
+ emit_move_insn (destmem, gen_lowpart (QImode, value));
+ else
+ {
+ srcmem = change_address (srcmem, QImode, *srcptr);
+ emit_move_insn (destmem, srcmem);
+ }
+
+ /* Handle sizes 2 and 3. */
+ label = ix86_expand_aligntest (*count, 2, false);
+ destmem = change_address (destmem, HImode, *destptr);
+ destmem = offset_address (destmem, *count, 1);
+ destmem = offset_address (destmem, GEN_INT (-2), 2);
+ if (issetmem)
+ emit_move_insn (destmem, gen_lowpart (HImode, value));
+ else
+ {
+ srcmem = change_address (srcmem, HImode, *srcptr);
+ srcmem = offset_address (srcmem, *count, 1);
+ srcmem = offset_address (srcmem, GEN_INT (-2), 2);
+ emit_move_insn (destmem, srcmem);
+ }
+
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ emit_jump_insn (gen_jump (*done_label));
+ emit_barrier ();
+ }
+ else
+ gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
+ || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
+
+ /* Start the main copy/set for COUNT >= SIZE. */
+ if (loop_label)
+ {
+ emit_label (loop_label);
+ LABEL_NUSES (loop_label) = 1;
+ }
+
+ /* Copy at least DESIRED_ALIGN - ALIGN bytes with possibly misaligned moves. */
+ if (!issetmem)
+ srcmem = change_address (srcmem, mode, *srcptr);
+ destmem = change_address (destmem, mode, *destptr);
+ modesize = GEN_INT (GET_MODE_SIZE (mode));
+ for (n = 0; prolog_size < desired_align - align; n++)
+ {
+ if (issetmem)
+ emit_move_insn (destmem, mode_value);
+ else
+ {
+ emit_move_insn (destmem, srcmem);
+ srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
+ }
+ destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
+ prolog_size += GET_MODE_SIZE (mode);
+ }
+
+
+ /* Copy last SIZE bytes. */
+ destmem = offset_address (destmem, *count, 1);
+ destmem = offset_address (destmem,
+ GEN_INT (-size - prolog_size),
+ 1);
+ if (issetmem)
+ emit_move_insn (destmem, mode_value);
+ else
+ {
+ srcmem = offset_address (srcmem, *count, 1);
+ srcmem = offset_address (srcmem,
+ GEN_INT (-size - prolog_size),
+ 1);
+ emit_move_insn (destmem, srcmem);
+ }
+ for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
+ {
+ destmem = offset_address (destmem, modesize, 1);
+ if (issetmem)
+ emit_move_insn (destmem, mode_value);
+ else
+ {
+ srcmem = offset_address (srcmem, modesize, 1);
+ emit_move_insn (destmem, srcmem);
+ }
+ }
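+ /* The tail copy above may overlap bytes already written by the prologue
+ copies when COUNT is only slightly larger than SIZE; this is harmless,
+ since the overlapping destination bytes receive the same values again. */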
+
+ /* Align destination. */
+ if (desired_align > 1 && desired_align > align)
+ {
+ rtx saveddest = *destptr;
+
+ gcc_assert (desired_align <= size);
+ /* Align destptr up, placing it in a new register. */
+ *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
+ GEN_INT (prolog_size),
+ NULL_RTX, 1, OPTAB_DIRECT);
+ if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
+ REG_POINTER (*destptr) = 1;
+ *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
+ GEN_INT (-desired_align),
+ *destptr, 1, OPTAB_DIRECT);
+ /* See how many bytes we skipped. */
+ saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
+ *destptr,
+ saveddest, 1, OPTAB_DIRECT);
+ /* Adjust srcptr and count. */
+ if (!issetmem)
+ *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
+ saveddest, *srcptr, 1, OPTAB_DIRECT);
+ *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
+ saveddest, *count, 1, OPTAB_DIRECT);
+ /* We copied at most size + prolog_size. */
+ if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
+ *min_size
+ = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
+ else
+ *min_size = 0;
+
+ /* Our loops always round down the block size, but for the dispatch to a
+ library call we need the precise value. */
+ if (dynamic_check)
+ *count = expand_simple_binop (GET_MODE (*count), AND, *count,
+ GEN_INT (-size), *count, 1, OPTAB_DIRECT);
+ }
+ else
+ {
+ gcc_assert (prolog_size == 0);
+ /* Decrease count, so we won't end up copying the last word twice. */
+ if (!CONST_INT_P (*count))
+ *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
+ constm1_rtx, *count, 1, OPTAB_DIRECT);
+ else
+ *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
+ (unsigned HOST_WIDE_INT)size));
+ if (*min_size)
+ *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
+ }
+}
+
+
+/* This function is like the previous one, except here we know how many bytes
+ need to be copied. That allows us to update alignment not only of DST, which
+ is returned, but also of SRC, which is passed as a pointer for that
+ reason. */
+static rtx
+expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
+ rtx srcreg, rtx value, rtx vec_value,
+ int desired_align, int align_bytes,
+ bool issetmem)
+{
+ rtx src = NULL;
+ rtx orig_dst = dst;
+ rtx orig_src = NULL;
+ int piece_size = 1;
+ int copied_bytes = 0;
+
+ if (!issetmem)
+ {
+ gcc_assert (srcp != NULL);
+ src = *srcp;
+ orig_src = src;
+ }
+
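+ /* The loop below emits one copy for every set bit of ALIGN_BYTES, so
+ exactly ALIGN_BYTES bytes are stored; e.g. ALIGN_BYTES == 5 results in
+ a 1-byte copy followed by a 4-byte copy. */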
+ for (piece_size = 1;
+ piece_size <= desired_align && copied_bytes < align_bytes;
+ piece_size <<= 1)
+ {
+ if (align_bytes & piece_size)
+ {
+ if (issetmem)
+ {
+ if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
+ dst = emit_memset (dst, destreg, vec_value, piece_size);
+ else
+ dst = emit_memset (dst, destreg, value, piece_size);
+ }
+ else
+ dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
+ copied_bytes += piece_size;
+ }
+ }
+ if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
+ set_mem_align (dst, desired_align * BITS_PER_UNIT);
+ if (MEM_SIZE_KNOWN_P (orig_dst))
+ set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
+
+ if (!issetmem)
+ {
+ int src_align_bytes = get_mem_align_offset (src, desired_align
+ * BITS_PER_UNIT);
+ if (src_align_bytes >= 0)
+ src_align_bytes = desired_align - src_align_bytes;
+ if (src_align_bytes >= 0)
+ {
+ unsigned int src_align;
+ for (src_align = desired_align; src_align >= 2; src_align >>= 1)
+ {
+ if ((src_align_bytes & (src_align - 1))
+ == (align_bytes & (src_align - 1)))
+ break;
+ }
+ if (src_align > (unsigned int) desired_align)
+ src_align = desired_align;
+ if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
+ set_mem_align (src, src_align * BITS_PER_UNIT);
+ }
+ if (MEM_SIZE_KNOWN_P (orig_src))
+ set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
+ *srcp = src;
+ }
+
+ return dst;
+}
+
+/* Return true if ALG can be used in current context.
+ Assume we expand memset if MEMSET is true. */
+static bool
+alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
+{
+ if (alg == no_stringop)
+ return false;
+ if (alg == vector_loop)
+ return TARGET_SSE || TARGET_AVX;
+ /* Algorithms using the rep prefix want at least edi and ecx;
+ additionally, memset wants eax and memcpy wants esi. Don't
+ consider such algorithms if the user has appropriated those
+ registers for their own purposes, or if we have a non-default
+ address space, since some string insns cannot override the segment. */
+ if (alg == rep_prefix_1_byte
+ || alg == rep_prefix_4_byte
+ || alg == rep_prefix_8_byte)
+ {
+ if (have_as)
+ return false;
+ if (fixed_regs[CX_REG]
+ || fixed_regs[DI_REG]
+ || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
+ return false;
+ }
+ return true;
+}
+
+/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
+static enum stringop_alg
+decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
+ unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
+ bool memset, bool zero_memset, bool have_as,
+ int *dynamic_check, bool *noalign, bool recur)
+{
+ const struct stringop_algs *algs;
+ bool optimize_for_speed;
+ int max = 0;
+ const struct processor_costs *cost;
+ int i;
+ bool any_alg_usable_p = false;
+
+ *noalign = false;
+ *dynamic_check = -1;
+
+ /* Even if the string operation call is cold, we still might spend a lot
+ of time processing large blocks. */
+ if (optimize_function_for_size_p (cfun)
+ || (optimize_insn_for_size_p ()
+ && (max_size < 256
+ || (expected_size != -1 && expected_size < 256))))
+ optimize_for_speed = false;
+ else
+ optimize_for_speed = true;
+
+ cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
+ if (memset)
+ algs = &cost->memset[TARGET_64BIT != 0];
+ else
+ algs = &cost->memcpy[TARGET_64BIT != 0];
+
+ /* Find the maximal size for which some inline (non-libcall) algorithm is
+ usable, and note whether any algorithm is usable at all. */
+ for (i = 0; i < MAX_STRINGOP_ALGS; i++)
+ {
+ enum stringop_alg candidate = algs->size[i].alg;
+ bool usable = alg_usable_p (candidate, memset, have_as);
+ any_alg_usable_p |= usable;
+
+ if (candidate != libcall && candidate && usable)
+ max = algs->size[i].max;
+ }
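+ /* Each cost table entry pairs a maximal block size with the algorithm to
+ use up to that size. As a purely illustrative example (not an actual
+ table from ix86_costs), {{256, loop}, {8192, rep_prefix_8_byte},
+ {-1, libcall}} would mean: use the simple loop up to 256 bytes, rep
+ movsq/stosq up to 8192 bytes, and a library call for anything larger. */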
+
+ /* If the expected size is not known but the max size is small enough
+ that the inline version is a win, set the expected size into
+ the range. */
+ if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
+ && expected_size == -1)
+ expected_size = min_size / 2 + max_size / 2;
+
+ /* If user specified the algorithm, honor it if possible. */
+ if (ix86_stringop_alg != no_stringop
+ && alg_usable_p (ix86_stringop_alg, memset, have_as))
+ return ix86_stringop_alg;
+ /* rep; movq or rep; movl is the smallest variant. */
+ else if (!optimize_for_speed)
+ {
+ *noalign = true;
+ if (!count || (count & 3) || (memset && !zero_memset))
+ return alg_usable_p (rep_prefix_1_byte, memset, have_as)
+ ? rep_prefix_1_byte : loop_1_byte;
+ else
+ return alg_usable_p (rep_prefix_4_byte, memset, have_as)
+ ? rep_prefix_4_byte : loop;
+ }
+ /* Very tiny blocks are best handled via the loop; REP is expensive to
+ set up. */
+ else if (expected_size != -1 && expected_size < 4)
+ return loop_1_byte;
+ else if (expected_size != -1)
+ {
+ enum stringop_alg alg = libcall;
+ bool alg_noalign = false;
+ for (i = 0; i < MAX_STRINGOP_ALGS; i++)
+ {
+ /* We get here if the algorithms that were not libcall-based
+ were rep-prefix based and we are unable to use rep prefixes
+ based on global register usage. Break out of the loop and
+ use the heuristic below. */
+ if (algs->size[i].max == 0)
+ break;
+ if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
+ {
+ enum stringop_alg candidate = algs->size[i].alg;
+
+ if (candidate != libcall
+ && alg_usable_p (candidate, memset, have_as))
+ {
+ alg = candidate;
+ alg_noalign = algs->size[i].noalign;
+ }
+ /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
+ last non-libcall inline algorithm. */
+ if (TARGET_INLINE_ALL_STRINGOPS)
+ {
+ /* When the current size is best to be copied by a libcall,
+ but we are still forced to inline, run the heuristic below
+ that will pick code for medium sized blocks. */
+ if (alg != libcall)
+ {
+ *noalign = alg_noalign;
+ return alg;
+ }
+ else if (!any_alg_usable_p)
+ break;
+ }
+ else if (alg_usable_p (candidate, memset, have_as))
+ {
+ *noalign = algs->size[i].noalign;
+ return candidate;
+ }
+ }
+ }
+ }
+ /* When asked to inline the call anyway, try to pick a meaningful choice.
+ We look for the maximal size of block that is faster to copy by hand and
+ take blocks of at most that size, guessing that the average size will
+ be roughly half of the block.
+
+ If this turns out to be bad, we might simply specify the preferred
+ choice in ix86_costs. */
+ if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
+ && (algs->unknown_size == libcall
+ || !alg_usable_p (algs->unknown_size, memset, have_as)))
+ {
+ enum stringop_alg alg;
+ HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
+
+ /* If there aren't any usable algorithms or if recursing already,
+ then recursing on smaller sizes or same size isn't going to
+ find anything. Just return the simple byte-at-a-time copy loop. */
+ if (!any_alg_usable_p || recur)
+ {
+ /* Pick something reasonable. */
+ if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
+ *dynamic_check = 128;
+ return loop_1_byte;
+ }
+ alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
+ zero_memset, have_as, dynamic_check, noalign, true);
+ gcc_assert (*dynamic_check == -1);
+ if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
+ *dynamic_check = max;
+ else
+ gcc_assert (alg != libcall);
+ return alg;
+ }
+ return (alg_usable_p (algs->unknown_size, memset, have_as)
+ ? algs->unknown_size : libcall);
+}
+
+/* Decide on alignment. We know that the operand is already aligned to ALIGN
+ (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
+static int
+decide_alignment (int align,
+ enum stringop_alg alg,
+ int expected_size,
+ machine_mode move_mode)
+{
+ int desired_align = 0;
+
+ gcc_assert (alg != no_stringop);
+
+ if (alg == libcall)
+ return 0;
+ if (move_mode == VOIDmode)
+ return 0;
+
+ desired_align = GET_MODE_SIZE (move_mode);
+ /* PentiumPro has special logic triggering for 8 byte aligned blocks,
+ copying the whole cache line at once. */
+ if (TARGET_PENTIUMPRO
+ && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
+ desired_align = 8;
+
+ if (optimize_size)
+ desired_align = 1;
+ if (desired_align < align)
+ desired_align = align;
+ if (expected_size != -1 && expected_size < 4)
+ desired_align = align;
+
+ return desired_align;
+}
+
+
+/* Helper function for memset. For a QImode value 0xXY produce
+ 0xXYXYXYXY of the width specified by MODE. This is essentially
+ a multiplication by 0x01010101, but we can do slightly better than
+ synth_mult by unwinding the sequence by hand on CPUs with
+ slow multiply. */
+static rtx
+promote_duplicated_reg (machine_mode mode, rtx val)
+{
+ machine_mode valmode = GET_MODE (val);
+ rtx tmp;
+ int nops = mode == DImode ? 3 : 2;
+
+ gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
+ if (val == const0_rtx)
+ return copy_to_mode_reg (mode, CONST0_RTX (mode));
+ if (CONST_INT_P (val))
+ {
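+ /* For a constant byte the replicated value is known at compile time,
+ e.g. 0x5A becomes 0x5A5A5A5A for SImode and 0x5A5A5A5A5A5A5A5A for
+ DImode. */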
+ HOST_WIDE_INT v = INTVAL (val) & 255;
+
+ v |= v << 8;
+ v |= v << 16;
+ if (mode == DImode)
+ v |= (v << 16) << 16;
+ return copy_to_mode_reg (mode, gen_int_mode (v, mode));
+ }
+
+ if (valmode == VOIDmode)
+ valmode = QImode;
+ if (valmode != QImode)
+ val = gen_lowpart (QImode, val);
+ if (mode == QImode)
+ return val;
+ if (!TARGET_PARTIAL_REG_STALL)
+ nops--;
+ if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
+ + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
+ <= (ix86_cost->shift_const + ix86_cost->add) * nops
+ + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
+ {
+ rtx reg = convert_modes (mode, QImode, val, true);
+ tmp = promote_duplicated_reg (mode, const1_rtx);
+ return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
+ OPTAB_DIRECT);
+ }
+ else
+ {
+ rtx reg = convert_modes (mode, QImode, val, true);
+
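+ /* Replicate the low byte by a shift-and-IOR ladder: first duplicate it
+ into bits 8-15 (using insv when partial register stalls are not a
+ problem, otherwise an 8-bit shift plus IOR), then double the replicated
+ width with a 16-bit and, for DImode, a 32-bit shift-and-IOR step. */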
+ if (!TARGET_PARTIAL_REG_STALL)
+ {
+ if (mode == SImode)
+ emit_insn (gen_insvsi_1 (reg, reg));
+ else
+ emit_insn (gen_insvdi_1 (reg, reg));
+ }
+ else
+ {
+ tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
+ NULL, 1, OPTAB_DIRECT);
+ reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
+ OPTAB_DIRECT);
+ }
+ tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
+ NULL, 1, OPTAB_DIRECT);
+ reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
+ if (mode == SImode)
+ return reg;
+ tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
+ NULL, 1, OPTAB_DIRECT);
+ reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
+ return reg;
+ }
+}
+
+/* Duplicate value VAL using promote_duplicated_reg into the maximal size that
+ will be needed by the main loop copying SIZE_NEEDED chunks and by the
+ prologue getting alignment from ALIGN to DESIRED_ALIGN. */
+static rtx
+promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
+ int align)
+{
+ rtx promoted_val;
+
+ if (TARGET_64BIT
+ && (size_needed > 4 || (desired_align > align && desired_align > 4)))
+ promoted_val = promote_duplicated_reg (DImode, val);
+ else if (size_needed > 2 || (desired_align > align && desired_align > 2))
+ promoted_val = promote_duplicated_reg (SImode, val);
+ else if (size_needed > 1 || (desired_align > align && desired_align > 1))
+ promoted_val = promote_duplicated_reg (HImode, val);
+ else
+ promoted_val = val;
+
+ return promoted_val;
+}
+
+/* Copy the address to a Pmode register. This is used for x32 to
+ truncate DImode TLS address to a SImode register. */
+
+static rtx
+ix86_copy_addr_to_reg (rtx addr)
+{
+ rtx reg;
+ if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
+ {
+ reg = copy_addr_to_reg (addr);
+ REG_POINTER (reg) = 1;
+ return reg;
+ }
+ else
+ {
+ gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
+ reg = copy_to_mode_reg (DImode, addr);
+ REG_POINTER (reg) = 1;
+ return gen_rtx_SUBREG (SImode, reg, 0);
+ }
+}
+
+/* Expand string move (memcpy) or store (memset) operation. Use i386 string
+ operations when profitable. The code depends upon architecture, block size
+ and alignment, but always has one of the following overall structures:
+
+ Aligned move sequence:
+
+ 1) Prologue guard: Conditional that jumps up to epilogues for small
+ blocks that can be handled by the epilogue alone. This is faster,
+ but also needed for correctness, since the prologue assumes the block
+ is larger than the desired alignment.
+
+ Optional dynamic check for size and libcall for large
+ blocks is emitted here too, with -minline-stringops-dynamically.
+
+ 2) Prologue: copy first few bytes in order to get destination
+ aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
+ than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
+ copied. We emit either a jump tree on power of two sized
+ blocks, or a byte loop.
+
+ 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
+ with specified algorithm.
+
+ 4) Epilogue: code copying tail of the block that is too small to be
+ handled by main body (or up to size guarded by prologue guard).
+
+ Misaligned move sequence
+
+ 1) Misaligned move prologue/epilogue containing:
+ a) Prologue handling small memory blocks and jumping to done_label
+ (skipped if blocks are known to be large enough)
+ b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment
+ is needed, done by one possibly misaligned move
+ (skipped if alignment is not needed)
+ c) Copy of the last SIZE_NEEDED bytes by possibly misaligned moves
+
+ 2) Zero size guard dispatching to done_label, if needed
+
+ 3) Dispatch to a library call, if needed
+
+ 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
+ with the specified algorithm. */
+bool
+ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
+ rtx align_exp, rtx expected_align_exp,
+ rtx expected_size_exp, rtx min_size_exp,
+ rtx max_size_exp, rtx probable_max_size_exp,
+ bool issetmem)
+{
+ rtx destreg;
+ rtx srcreg = NULL;
+ rtx_code_label *label = NULL;
+ rtx tmp;
+ rtx_code_label *jump_around_label = NULL;
+ HOST_WIDE_INT align = 1;
+ unsigned HOST_WIDE_INT count = 0;
+ HOST_WIDE_INT expected_size = -1;
+ int size_needed = 0, epilogue_size_needed;
+ int desired_align = 0, align_bytes = 0;
+ enum stringop_alg alg;
+ rtx promoted_val = NULL;
+ rtx vec_promoted_val = NULL;
+ bool force_loopy_epilogue = false;
+ int dynamic_check;
+ bool need_zero_guard = false;
+ bool noalign;
+ machine_mode move_mode = VOIDmode;
+ machine_mode wider_mode;
+ int unroll_factor = 1;
+ /* TODO: Once value ranges are available, fill in proper data. */
+ unsigned HOST_WIDE_INT min_size = 0;
+ unsigned HOST_WIDE_INT max_size = -1;
+ unsigned HOST_WIDE_INT probable_max_size = -1;
+ bool misaligned_prologue_used = false;
+ bool have_as;
+
+ if (CONST_INT_P (align_exp))
+ align = INTVAL (align_exp);
+ /* i386 can do misaligned access at a reasonably increased cost. */
+ if (CONST_INT_P (expected_align_exp)
+ && INTVAL (expected_align_exp) > align)
+ align = INTVAL (expected_align_exp);
+ /* ALIGN is the minimum of destination and source alignment, but we care here
+ just about destination alignment. */
+ else if (!issetmem
+ && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
+ align = MEM_ALIGN (dst) / BITS_PER_UNIT;
+
+ if (CONST_INT_P (count_exp))
+ {
+ min_size = max_size = probable_max_size = count = expected_size
+ = INTVAL (count_exp);
+ /* When COUNT is 0, there is nothing to do. */
+ if (!count)
+ return true;
+ }
+ else
+ {
+ if (min_size_exp)
+ min_size = INTVAL (min_size_exp);
+ if (max_size_exp)
+ max_size = INTVAL (max_size_exp);
+ if (probable_max_size_exp)
+ probable_max_size = INTVAL (probable_max_size_exp);
+ if (CONST_INT_P (expected_size_exp))
+ expected_size = INTVAL (expected_size_exp);
+ }
+
+ /* Make sure we don't need to care about overflow later on. */
+ if (count > (HOST_WIDE_INT_1U << 30))
+ return false;
+
+ have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
+ if (!issetmem)
+ have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
+
+ /* Step 0: Decide on preferred algorithm, desired alignment and
+ size of chunks to be copied by main loop. */
+ alg = decide_alg (count, expected_size, min_size, probable_max_size,
+ issetmem,
+ issetmem && val_exp == const0_rtx, have_as,
+ &dynamic_check, &noalign, false);
+
+ if (dump_file)
+ fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
+ stringop_alg_names[alg]);
+
+ if (alg == libcall)
+ return false;
+ gcc_assert (alg != no_stringop);
+
+ /* For now the vector version of memset is generated only for memory zeroing,
+ as creating the promoted vector value is very cheap in this case. */
+ if (issetmem && alg == vector_loop && val_exp != const0_rtx)
+ alg = unrolled_loop;
+
+ if (!count)
+ count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
+ destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
+ if (!issetmem)
+ srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
+
+ unroll_factor = 1;
+ move_mode = word_mode;
+ switch (alg)
+ {
+ case libcall:
+ case no_stringop:
+ case last_alg:
+ gcc_unreachable ();
+ case loop_1_byte:
+ need_zero_guard = true;
+ move_mode = QImode;
+ break;
+ case loop:
+ need_zero_guard = true;
+ break;
+ case unrolled_loop:
+ need_zero_guard = true;
+ unroll_factor = (TARGET_64BIT ? 4 : 2);
+ break;
+ case vector_loop:
+ need_zero_guard = true;
+ unroll_factor = 4;
+ /* Find the widest supported mode. */
+ move_mode = word_mode;
+ while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
+ && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
+ move_mode = wider_mode;
+
+ if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128)
+ move_mode = TImode;
+
+ /* Find the corresponding vector mode with the same size as MOVE_MODE.
+ MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
+ if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
+ {
+ int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
+ if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
+ || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
+ move_mode = word_mode;
+ }
+ gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
+ break;
+ case rep_prefix_8_byte:
+ move_mode = DImode;
+ break;
+ case rep_prefix_4_byte:
+ move_mode = SImode;
+ break;
+ case rep_prefix_1_byte:
+ move_mode = QImode;
+ break;
+ }
+ size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
+ epilogue_size_needed = size_needed;
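+ /* SIZE_NEEDED is the number of bytes handled per main loop iteration;
+ for instance, on 64-bit targets unrolled_loop moves 4 * 8 = 32 bytes
+ per iteration and vector_loop with 16-byte SSE registers moves
+ 4 * 16 = 64 bytes. */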
+
+ /* If we are going to emit any library calls conditionally, make sure any
+ pending stack adjustment happens before the first conditional branch,
+ otherwise it will be emitted before the library call only and won't
+ happen on the other paths. */
+ if (dynamic_check != -1)
+ do_pending_stack_adjust ();
+
+ desired_align = decide_alignment (align, alg, expected_size, move_mode);
+ if (!TARGET_ALIGN_STRINGOPS || noalign)
+ align = desired_align;
+
+ /* Step 1: Prologue guard. */
+
+ /* Alignment code needs count to be in register. */
+ if (CONST_INT_P (count_exp) && desired_align > align)
+ {
+ if (INTVAL (count_exp) > desired_align
+ && INTVAL (count_exp) > size_needed)
+ {
+ align_bytes
+ = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
+ if (align_bytes <= 0)
+ align_bytes = 0;
+ else
+ align_bytes = desired_align - align_bytes;
+ }
+ if (align_bytes == 0)
+ count_exp = force_reg (counter_mode (count_exp), count_exp);
+ }
+ gcc_assert (desired_align >= 1 && align >= 1);
+
+ /* Misaligned move sequences handle both prologue and epilogue at once.
+ Default code generation results in smaller code for large alignments
+ and also avoids redundant work when sizes are known precisely. */
+ misaligned_prologue_used
+ = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
+ && MAX (desired_align, epilogue_size_needed) <= 32
+ && desired_align <= epilogue_size_needed
+ && ((desired_align > align && !align_bytes)
+ || (!count && epilogue_size_needed > 1)));
+
+ /* Do the cheap promotion to allow better CSE across the
+ main loop and epilogue (i.e. one load of the big constant in
+ front of all the code).
+ For now the misaligned move sequences do not have a fast path
+ without broadcasting. */
+ if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
+ {
+ if (alg == vector_loop)
+ {
+ gcc_assert (val_exp == const0_rtx);
+ vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
+ promoted_val = promote_duplicated_reg_to_size (val_exp,
+ GET_MODE_SIZE (word_mode),
+ desired_align, align);
+ }
+ else
+ {
+ promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
+ desired_align, align);
+ }
+ }
+ /* Misaligned move sequences handle both prologues and epilogues at once.
+ Default code generation results in smaller code for large alignments and
+ also avoids redundant work when sizes are known precisely. */
+ if (misaligned_prologue_used)
+ {
+ /* The misaligned move prologue handles small blocks by itself. */
+ expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
+ (dst, src, &destreg, &srcreg,
+ move_mode, promoted_val, vec_promoted_val,
+ &count_exp,
+ &jump_around_label,
+ desired_align < align
+ ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
+ desired_align, align, &min_size, dynamic_check, issetmem);
+ if (!issetmem)
+ src = change_address (src, BLKmode, srcreg);
+ dst = change_address (dst, BLKmode, destreg);
+ set_mem_align (dst, desired_align * BITS_PER_UNIT);
+ epilogue_size_needed = 0;
+ if (need_zero_guard
+ && min_size < (unsigned HOST_WIDE_INT) size_needed)
+ {
+ /* It is possible that we copied enough so the main loop will not
+ execute. */
+ gcc_assert (size_needed > 1);
+ if (jump_around_label == NULL_RTX)
+ jump_around_label = gen_label_rtx ();
+ emit_cmp_and_jump_insns (count_exp,
+ GEN_INT (size_needed),
+ LTU, 0, counter_mode (count_exp), 1, jump_around_label);
+ if (expected_size == -1
+ || expected_size < (desired_align - align) / 2 + size_needed)
+ predict_jump (REG_BR_PROB_BASE * 20 / 100);
+ else
+ predict_jump (REG_BR_PROB_BASE * 60 / 100);
+ }
+ }
+ /* Ensure that the alignment prologue won't copy past the end of the block. */
+ else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
+ {
+ epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
+ /* The epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
+ Make sure EPILOGUE_SIZE_NEEDED is a power of 2. */
+ epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
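+ /* The statement above rounds the value up to the next power of two that
+ is strictly greater than it (e.g. 31 becomes 32); together with the
+ "size_needed - 1" term, a SIZE_NEEDED that is already a power of two
+ is mapped to itself. */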
+
+ /* To improve performance of small blocks, we jump around the VAL
+ promotion code. This means that if the promoted VAL is not constant,
+ we might not use it in the epilogue and have to use the byte
+ loop variant. */
+ if (issetmem && epilogue_size_needed > 2 && !promoted_val)
+ force_loopy_epilogue = true;
+ if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
+ || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
+ {
+ /* If main algorithm works on QImode, no epilogue is needed.
+ For small sizes just don't align anything. */
+ if (size_needed == 1)
+ desired_align = align;
+ else
+ goto epilogue;
+ }
+ else if (!count
+ && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
+ {
+ label = gen_label_rtx ();
+ emit_cmp_and_jump_insns (count_exp,
+ GEN_INT (epilogue_size_needed),
+ LTU, 0, counter_mode (count_exp), 1, label);
+ if (expected_size == -1 || expected_size < epilogue_size_needed)
+ predict_jump (REG_BR_PROB_BASE * 60 / 100);
+ else
+ predict_jump (REG_BR_PROB_BASE * 20 / 100);
+ }
+ }
+
+ /* Emit code to decide at runtime whether a library call or inline code
+ should be used. */
+ if (dynamic_check != -1)
+ {
+ if (!issetmem && CONST_INT_P (count_exp))
+ {
+ if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
+ {
+ emit_block_copy_via_libcall (dst, src, count_exp);
+ count_exp = const0_rtx;
+ goto epilogue;
+ }
+ }
+ else
+ {
+ rtx_code_label *hot_label = gen_label_rtx ();
+ if (jump_around_label == NULL_RTX)
+ jump_around_label = gen_label_rtx ();
+ emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
+ LEU, 0, counter_mode (count_exp),
+ 1, hot_label);
+ predict_jump (REG_BR_PROB_BASE * 90 / 100);
+ if (issetmem)
+ set_storage_via_libcall (dst, count_exp, val_exp);
+ else
+ emit_block_copy_via_libcall (dst, src, count_exp);
+ emit_jump (jump_around_label);
+ emit_label (hot_label);
+ }
+ }
+
+ /* Step 2: Alignment prologue. */
+ /* Do the expensive promotion once we branched off the small blocks. */
+ if (issetmem && !promoted_val)
+ promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
+ desired_align, align);
+
+ if (desired_align > align && !misaligned_prologue_used)
+ {
+ if (align_bytes == 0)
+ {
+ /* Except for the first move in the prologue, we no longer know
+ the constant offset in aliasing info. It doesn't seem worth
+ the pain to maintain it for the first move, so throw away
+ the info early. */
+ dst = change_address (dst, BLKmode, destreg);
+ if (!issetmem)
+ src = change_address (src, BLKmode, srcreg);
+ dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
+ promoted_val, vec_promoted_val,
+ count_exp, align, desired_align,
+ issetmem);
+ /* At most desired_align - align bytes are copied. */
+ if (min_size < (unsigned)(desired_align - align))
+ min_size = 0;
+ else
+ min_size -= desired_align - align;
+ }
+ else
+ {
+ /* If we know how many bytes need to be stored before dst is
+ sufficiently aligned, maintain aliasing info accurately. */
+ dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
+ srcreg,
+ promoted_val,
+ vec_promoted_val,
+ desired_align,
+ align_bytes,
+ issetmem);
+
+ count_exp = plus_constant (counter_mode (count_exp),
+ count_exp, -align_bytes);
+ count -= align_bytes;
+ min_size -= align_bytes;
+ max_size -= align_bytes;
+ }
+ if (need_zero_guard
+ && min_size < (unsigned HOST_WIDE_INT) size_needed
+ && (count < (unsigned HOST_WIDE_INT) size_needed
+ || (align_bytes == 0
+ && count < ((unsigned HOST_WIDE_INT) size_needed
+ + desired_align - align))))
+ {
+ /* It is possible that we copied enough so the main loop will not
+ execute. */
+ gcc_assert (size_needed > 1);
+ if (label == NULL_RTX)
+ label = gen_label_rtx ();
+ emit_cmp_and_jump_insns (count_exp,
+ GEN_INT (size_needed),
+ LTU, 0, counter_mode (count_exp), 1, label);
+ if (expected_size == -1
+ || expected_size < (desired_align - align) / 2 + size_needed)
+ predict_jump (REG_BR_PROB_BASE * 20 / 100);
+ else
+ predict_jump (REG_BR_PROB_BASE * 60 / 100);
+ }
+ }
+ if (label && size_needed == 1)
+ {
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ label = NULL;
+ epilogue_size_needed = 1;
+ if (issetmem)
+ promoted_val = val_exp;
+ }
+ else if (label == NULL_RTX && !misaligned_prologue_used)
+ epilogue_size_needed = size_needed;
+
+ /* Step 3: Main loop. */
+
+ switch (alg)
+ {
+ case libcall:
+ case no_stringop:
+ case last_alg:
+ gcc_unreachable ();
+ case loop_1_byte:
+ case loop:
+ case unrolled_loop:
+ expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
+ count_exp, move_mode, unroll_factor,
+ expected_size, issetmem);
+ break;
+ case vector_loop:
+ expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
+ vec_promoted_val, count_exp, move_mode,
+ unroll_factor, expected_size, issetmem);
+ break;
+ case rep_prefix_8_byte:
+ case rep_prefix_4_byte:
+ case rep_prefix_1_byte:
+ expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
+ val_exp, count_exp, move_mode, issetmem);
+ break;
+ }
+ /* Properly adjust the offset of src and dest memory for aliasing. */
+ if (CONST_INT_P (count_exp))
+ {
+ if (!issetmem)
+ src = adjust_automodify_address_nv (src, BLKmode, srcreg,
+ (count / size_needed) * size_needed);
+ dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
+ (count / size_needed) * size_needed);
+ }
+ else
+ {
+ if (!issetmem)
+ src = change_address (src, BLKmode, srcreg);
+ dst = change_address (dst, BLKmode, destreg);
+ }
+
+ /* Step 4: Epilogue to copy the remaining bytes. */
+ epilogue:
+ if (label)
+ {
+ /* When the main loop is done, COUNT_EXP might hold the original count,
+ while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
+ Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
+ bytes. Compensate if needed. */
+
+ if (size_needed < epilogue_size_needed)
+ {
+ tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
+ GEN_INT (size_needed - 1), count_exp, 1,
+ OPTAB_DIRECT);
+ if (tmp != count_exp)
+ emit_move_insn (count_exp, tmp);
+ }
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+ }
+
+ if (count_exp != const0_rtx && epilogue_size_needed > 1)
+ {
+ if (force_loopy_epilogue)
+ expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
+ epilogue_size_needed);
+ else
+ {
+ if (issetmem)
+ expand_setmem_epilogue (dst, destreg, promoted_val,
+ vec_promoted_val, count_exp,
+ epilogue_size_needed);
+ else
+ expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
+ epilogue_size_needed);
+ }
+ }
+ if (jump_around_label)
+ emit_label (jump_around_label);
+ return true;
+}
+
+
+/* Expand the appropriate insns for doing strlen if not just doing
+ repnz; scasb
+
+ out = result, initialized with the start address
+ align_rtx = alignment of the address.
+ scratch = scratch register, initialized with the start address when
+ not aligned, otherwise undefined
+
+ This is just the body. It needs the initializations mentioned above and
+ some address computing at the end. These things are done in i386.md. */
+
+static void
+ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
+{
+ int align;
+ rtx tmp;
+ rtx_code_label *align_2_label = NULL;
+ rtx_code_label *align_3_label = NULL;
+ rtx_code_label *align_4_label = gen_label_rtx ();
+ rtx_code_label *end_0_label = gen_label_rtx ();
+ rtx mem;
+ rtx tmpreg = gen_reg_rtx (SImode);
+ rtx scratch = gen_reg_rtx (SImode);
+ rtx cmp;
+
+ align = 0;
+ if (CONST_INT_P (align_rtx))
+ align = INTVAL (align_rtx);
+
+ /* Loop to check 1..3 bytes for null to get an aligned pointer. */
+
+ /* Is there a known alignment and is it less than 4? */
+ if (align < 4)
+ {
+ rtx scratch1 = gen_reg_rtx (Pmode);
+ emit_move_insn (scratch1, out);
+ /* Is there a known alignment and is it not 2? */
+ if (align != 2)
+ {
+ align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
+ align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
+
+ /* Leave just the 3 lower bits. */
+ align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
+ NULL_RTX, 0, OPTAB_WIDEN);
+
+ emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
+ Pmode, 1, align_4_label);
+ emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
+ Pmode, 1, align_2_label);
+ emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
+ Pmode, 1, align_3_label);
+ }
+ else
+ {
+ /* Since the alignment is 2, we have to check 2 or 0 bytes;
+ check whether it is aligned to a 4-byte boundary. */
+
+ align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
+ NULL_RTX, 0, OPTAB_WIDEN);
+
+ emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
+ Pmode, 1, align_4_label);
+ }
+
+ mem = change_address (src, QImode, out);
+
+ /* Now compare the bytes. */
+
+ /* Compare the first n unaligned bytes on a byte-by-byte basis. */
+ emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
+ QImode, 1, end_0_label);
+
+ /* Increment the address. */
+ emit_insn (ix86_gen_add3 (out, out, const1_rtx));
+
+ /* Not needed with an alignment of 2. */
+ if (align != 2)
+ {
+ emit_label (align_2_label);
+
+ emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
+ end_0_label);
+
+ emit_insn (ix86_gen_add3 (out, out, const1_rtx));
+
+ emit_label (align_3_label);
+ }
+
+ emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
+ end_0_label);
+
+ emit_insn (ix86_gen_add3 (out, out, const1_rtx));
+ }
+
+ /* Generate a loop to check 4 bytes at a time. It is not a good idea to
+ align this loop: it only makes the program bigger and does not help to
+ speed it up. */
+ emit_label (align_4_label);
+
+ mem = change_address (src, SImode, out);
+ emit_move_insn (scratch, mem);
+ emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
+
+ /* This formula yields a nonzero result iff one of the bytes is zero.
+ This saves three branches inside the loop and many cycles. */
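+ /* The value computed below is (SCRATCH - 0x01010101) & ~SCRATCH
+ & 0x80808080. Ignoring borrows, which can only propagate out of a byte
+ that is itself zero, a result byte has its top bit set exactly when the
+ corresponding byte of SCRATCH is zero: subtracting 1 from 0x00 sets the
+ top bit and the complement keeps it, bytes 0x01..0x7f keep the top bit
+ clear after the subtraction, and bytes >= 0x80 are masked out by the
+ complement. */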
+
+ emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
+ emit_insn (gen_one_cmplsi2 (scratch, scratch));
+ emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
+ emit_insn (gen_andsi3 (tmpreg, tmpreg,
+ gen_int_mode (0x80808080, SImode)));
+ emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
+ align_4_label);
+
+ if (TARGET_CMOVE)
+ {
+ rtx reg = gen_reg_rtx (SImode);
+ rtx reg2 = gen_reg_rtx (Pmode);
+ emit_move_insn (reg, tmpreg);
+ emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
+
+ /* If zero is not in the first two bytes, move two bytes forward. */
+ emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
+ tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
+ tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
+ emit_insn (gen_rtx_SET (tmpreg,
+ gen_rtx_IF_THEN_ELSE (SImode, tmp,
+ reg,
+ tmpreg)));
+ /* Emit lea manually to avoid clobbering of flags. */
+ emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
+
+ tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
+ tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
+ emit_insn (gen_rtx_SET (out,
+ gen_rtx_IF_THEN_ELSE (Pmode, tmp,
+ reg2,
+ out)));
+ }
+ else
+ {
+ rtx_code_label *end_2_label = gen_label_rtx ();
+ /* Is zero in the first two bytes? */
+
+ emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
+ tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
+ tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
+ tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
+ gen_rtx_LABEL_REF (VOIDmode, end_2_label),
+ pc_rtx);
+ tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
+ JUMP_LABEL (tmp) = end_2_label;
+
+ /* Not in the first two. Move two bytes forward. */
+ emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
+ emit_insn (ix86_gen_add3 (out, out, const2_rtx));
+
+ emit_label (end_2_label);
+
+ }
+
+ /* Avoid branch in fixing the byte. */
+ tmpreg = gen_lowpart (QImode, tmpreg);
+ emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
+ tmp = gen_rtx_REG (CCmode, FLAGS_REG);
+ cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
+ emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
+
+ emit_label (end_0_label);
+}
+
+/* Expand strlen. */
+
+bool
+ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
+{
+ if (TARGET_UNROLL_STRLEN
+ && TARGET_INLINE_ALL_STRINGOPS
+ && eoschar == const0_rtx
+ && optimize > 1)
+ {
+ /* The generic case of the strlen expander is long. Avoid its
+ expansion unless TARGET_INLINE_ALL_STRINGOPS. */
+ rtx addr = force_reg (Pmode, XEXP (src, 0));
+ /* Well it seems that some optimizer does not combine a call like
+ foo(strlen(bar), strlen(bar));
+ when the move and the subtraction are done here. It does calculate
+ the length just once when these instructions are done inside of
+ output_strlen_unroll(). But I think since &bar[strlen(bar)] is
+ often used and I use one fewer register for the lifetime of
+ output_strlen_unroll() this is better. */
+
+ emit_move_insn (out, addr);
+
+ ix86_expand_strlensi_unroll_1 (out, src, align);
+
+ /* strlensi_unroll_1 returns the address of the zero at the end of
+ the string, like memchr(), so compute the length by subtracting
+ the start address. */
+ emit_insn (ix86_gen_sub3 (out, out, addr));
+ return true;
+ }
+ else
+ return false;
+}
+
+/* For a given symbol (function), construct code to compute the address of its
+ PLT entry in the large x86-64 PIC model. */
+
+static rtx
+construct_plt_address (rtx symbol)
+{
+ rtx tmp, unspec;
+
+ gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
+ gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
+ gcc_assert (Pmode == DImode);
+
+ tmp = gen_reg_rtx (Pmode);
+ unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
+
+ emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
+ emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
+ return tmp;
+}
+
+/* Additional registers that are clobbered by SYSV calls. */
+
+static int const x86_64_ms_sysv_extra_clobbered_registers
+ [NUM_X86_64_MS_CLOBBERED_REGS] =
+{
+ SI_REG, DI_REG,
+ XMM6_REG, XMM7_REG,
+ XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
+ XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
+};
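+
+/* The registers listed above are call-clobbered in the SysV ABI but
+ call-saved in the Microsoft ABI, so a function using the MS ABI must
+ treat them as clobbered when it calls a SysV-ABI function. */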
+
+rtx_insn *
+ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
+ rtx callarg2,
+ rtx pop, bool sibcall)
+{
+ rtx vec[3];
+ rtx use = NULL, call;
+ unsigned int vec_len = 0;
+ tree fndecl;
+
+ if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
+ {
+ fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
+ if (fndecl
+ && (lookup_attribute ("interrupt",
+ TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
+ error ("interrupt service routine can%'t be called directly");
+ }
+ else
+ fndecl = NULL_TREE;
+
+ if (pop == const0_rtx)
+ pop = NULL;
+ gcc_assert (!TARGET_64BIT || !pop);
+
+ if (TARGET_MACHO && !TARGET_64BIT)
+ {
+#if TARGET_MACHO
+ if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
+ fnaddr = machopic_indirect_call_target (fnaddr);
+#endif
+ }
+ else
+ {
+ /* Static functions and indirect calls don't need the PIC register. Also,
+ check if the PLT was explicitly avoided via -fno-plt or the "noplt"
+ attribute, making it an indirect call. */
+ rtx addr = XEXP (fnaddr, 0);
+ if (flag_pic
+ && GET_CODE (addr) == SYMBOL_REF
+ && !SYMBOL_REF_LOCAL_P (addr))
+ {
+ if (flag_plt
+ && (SYMBOL_REF_DECL (addr) == NULL_TREE
+ || !lookup_attribute ("noplt",
+ DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
+ {
+ if (!TARGET_64BIT
+ || (ix86_cmodel == CM_LARGE_PIC
+ && DEFAULT_ABI != MS_ABI))
+ {
+ use_reg (&use, gen_rtx_REG (Pmode,
+ REAL_PIC_OFFSET_TABLE_REGNUM));
+ if (ix86_use_pseudo_pic_reg ())
+ emit_move_insn (gen_rtx_REG (Pmode,
+ REAL_PIC_OFFSET_TABLE_REGNUM),
+ pic_offset_table_rtx);
+ }
+ }
+ else if (!TARGET_PECOFF && !TARGET_MACHO)
+ {
+ if (TARGET_64BIT)
+ {
+ fnaddr = gen_rtx_UNSPEC (Pmode,
+ gen_rtvec (1, addr),
+ UNSPEC_GOTPCREL);
+ fnaddr = gen_rtx_CONST (Pmode, fnaddr);
+ }
+ else
+ {
+ fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
+ UNSPEC_GOT);
+ fnaddr = gen_rtx_CONST (Pmode, fnaddr);
+ fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
+ fnaddr);
+ }
+ fnaddr = gen_const_mem (Pmode, fnaddr);
+ /* Pmode may not be the same as word_mode for x32, which
+ doesn't support indirect branch via 32-bit memory slot.
+ Since x32 GOT slot is 64 bit with zero upper 32 bits,
+ indirect branch via x32 GOT slot is OK. */
+ if (GET_MODE (fnaddr) != word_mode)
+ fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
+ fnaddr = gen_rtx_MEM (QImode, fnaddr);
+ }
+ }
+ }
+
+ /* Skip setting up RAX register for -mskip-rax-setup when there are no
+ parameters passed in vector registers. */
+ if (TARGET_64BIT
+ && (INTVAL (callarg2) > 0
+ || (INTVAL (callarg2) == 0
+ && (TARGET_SSE || !flag_skip_rax_setup))))
+ {
+ rtx al = gen_rtx_REG (QImode, AX_REG);
+ emit_move_insn (al, callarg2);
+ use_reg (&use, al);
+ }
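+ /* The 64-bit SysV ABI expects %al to hold an upper bound on the number of
+ vector registers used to pass arguments to a variadic function, which is
+ the value CALLARG2 provides here. */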
+
+ if (ix86_cmodel == CM_LARGE_PIC
+ && !TARGET_PECOFF
+ && MEM_P (fnaddr)
+ && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
+ && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
+ fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
+ /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
+ branch via x32 GOT slot is OK. */
+ else if (!(TARGET_X32
+ && MEM_P (fnaddr)
+ && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
+ && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
+ && (sibcall
+ ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
+ : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
+ {
+ fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
+ fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
+ }
+
+ call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
+
+ if (retval)
+ call = gen_rtx_SET (retval, call);
+ vec[vec_len++] = call;
+
+ if (pop)
+ {
+ pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
+ pop = gen_rtx_SET (stack_pointer_rtx, pop);
+ vec[vec_len++] = pop;
+ }
+
+ if (cfun->machine->no_caller_saved_registers
+ && (!fndecl
+ || (!TREE_THIS_VOLATILE (fndecl)
+ && !lookup_attribute ("no_caller_saved_registers",
+ TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
+ {
+ static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
+ bool is_64bit_ms_abi = (TARGET_64BIT
+ && ix86_function_abi (fndecl) == MS_ABI);
+ char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
+
+ /* If there are no caller-saved registers, add all registers
+ that are clobbered by the call which returns. */
+ for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
+ if (!fixed_regs[i]
+ && (ix86_call_used_regs[i] == 1
+ || (ix86_call_used_regs[i] & c_mask))
+ && !STACK_REGNO_P (i)
+ && !MMX_REGNO_P (i))
+ clobber_reg (&use,
+ gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
+ }
+ else if (TARGET_64BIT_MS_ABI
+ && (!callarg2 || INTVAL (callarg2) != -2))
+ {
+ unsigned i;
+
+ for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
+ {
+ int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
+ machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
+
+ clobber_reg (&use, gen_rtx_REG (mode, regno));
+ }
+
+ /* Set here, but it may get cleared later. */
+ if (TARGET_CALL_MS2SYSV_XLOGUES)
+ {
+ if (!TARGET_SSE)
+ ;
+
+ /* Don't break hot-patched functions. */
+ else if (ix86_function_ms_hook_prologue (current_function_decl))
+ ;
+
+ /* TODO: Cases not yet examined. */
+ else if (flag_split_stack)
+ warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
+
+ else
+ {
+ gcc_assert (!reload_completed);
+ cfun->machine->call_ms2sysv = true;
+ }
+ }
+ }
+
+ if (vec_len > 1)
+ call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
+ rtx_insn *call_insn = emit_call_insn (call);
+ if (use)
+ CALL_INSN_FUNCTION_USAGE (call_insn) = use;
+
+ return call_insn;
+}
+
+/* Split a simple return popping POPC bytes from the stack into an indirect
+ branch with a stack adjustment. */
+
+void
+ix86_split_simple_return_pop_internal (rtx popc)
+{
+ struct machine_function *m = cfun->machine;
+ rtx ecx = gen_rtx_REG (SImode, CX_REG);
+ rtx_insn *insn;
+
+ /* There is no "pascal" calling convention in any 64bit ABI. */
+ gcc_assert (!TARGET_64BIT);
+
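+ /* The sequence emitted below is equivalent to rewriting "ret $POPC" as
+ pop %ecx (fetch the return address)
+ add $POPC, %esp (discard the POPC bytes of arguments)
+ jmp *%ecx (return via an indirect branch). */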
+ insn = emit_insn (gen_pop (ecx));
+ m->fs.cfa_offset -= UNITS_PER_WORD;
+ m->fs.sp_offset -= UNITS_PER_WORD;
+
+ rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
+ x = gen_rtx_SET (stack_pointer_rtx, x);
+ add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
+ add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
+ RTX_FRAME_RELATED_P (insn) = 1;
+
+ x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
+ x = gen_rtx_SET (stack_pointer_rtx, x);
+ insn = emit_insn (x);
+ add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
+ RTX_FRAME_RELATED_P (insn) = 1;
+
+ /* Now return address is in ECX. */
+ emit_jump_insn (gen_simple_return_indirect_internal (ecx));
+}
+
+/* Errors in the source file can cause expand_expr to return const0_rtx
+ where we expect a vector. To avoid crashing, use one of the vector
+ clear instructions. */
+
+static rtx
+safe_vector_operand (rtx x, machine_mode mode)
+{
+ if (x == const0_rtx)
+ x = CONST0_RTX (mode);
+ return x;
+}
+
+/* Subroutine of ix86_expand_builtin to take care of binop insns. */
+
+static rtx
+ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
+{
+ rtx pat;
+ tree arg0 = CALL_EXPR_ARG (exp, 0);
+ tree arg1 = CALL_EXPR_ARG (exp, 1);
+ rtx op0 = expand_normal (arg0);
+ rtx op1 = expand_normal (arg1);
+ machine_mode tmode = insn_data[icode].operand[0].mode;
+ machine_mode mode0 = insn_data[icode].operand[1].mode;
+ machine_mode mode1 = insn_data[icode].operand[2].mode;
+
+ if (VECTOR_MODE_P (mode0))
+ op0 = safe_vector_operand (op0, mode0);
+ if (VECTOR_MODE_P (mode1))
+ op1 = safe_vector_operand (op1, mode1);
+
+ if (optimize || !target
+ || GET_MODE (target) != tmode
+ || !insn_data[icode].operand[0].predicate (target, tmode))
+ target = gen_reg_rtx (tmode);
+
+ if (GET_MODE (op1) == SImode && mode1 == TImode)
+ {
+ rtx x = gen_reg_rtx (V4SImode);
+ emit_insn (gen_sse2_loadd (x, op1));
+ op1 = gen_lowpart (TImode, x);
+ }
+
+ if (!insn_data[icode].operand[1].predicate (op0, mode0))
+ op0 = copy_to_mode_reg (mode0, op0);
+ if (!insn_data[icode].operand[2].predicate (op1, mode1))
+ op1 = copy_to_mode_reg (mode1, op1);
+
+ pat = GEN_FCN (icode) (target, op0, op1);
+ if (! pat)
+ return 0;
+
+ emit_insn (pat);
+
+ return target;
+}
+
+/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
+
+static rtx
+ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
+ enum ix86_builtin_func_type m_type,
+ enum rtx_code sub_code)
+{
+ rtx pat;
+ int i;
+ int nargs;
+ bool comparison_p = false;
+ bool tf_p = false;
+ bool last_arg_constant = false;
+ int num_memory = 0;
+ struct {
+ rtx op;
+ machine_mode mode;
+ } args[4];
+
+ machine_mode tmode = insn_data[icode].operand[0].mode;
+
+ switch (m_type)
+ {
+ case MULTI_ARG_4_DF2_DI_I:
+ case MULTI_ARG_4_DF2_DI_I1:
+ case MULTI_ARG_4_SF2_SI_I:
+ case MULTI_ARG_4_SF2_SI_I1:
+ nargs = 4;
+ last_arg_constant = true;
+ break;
+
+ case MULTI_ARG_3_SF:
+ case MULTI_ARG_3_DF:
+ case MULTI_ARG_3_SF2:
+ case MULTI_ARG_3_DF2:
+ case MULTI_ARG_3_DI:
+ case MULTI_ARG_3_SI:
+ case MULTI_ARG_3_SI_DI:
+ case MULTI_ARG_3_HI:
+ case MULTI_ARG_3_HI_SI:
+ case MULTI_ARG_3_QI:
+ case MULTI_ARG_3_DI2:
+ case MULTI_ARG_3_SI2:
+ case MULTI_ARG_3_HI2:
+ case MULTI_ARG_3_QI2:
+ nargs = 3;
+ break;
+
+ case MULTI_ARG_2_SF:
+ case MULTI_ARG_2_DF:
+ case MULTI_ARG_2_DI:
+ case MULTI_ARG_2_SI:
+ case MULTI_ARG_2_HI:
+ case MULTI_ARG_2_QI:
+ nargs = 2;
+ break;
+
+ case MULTI_ARG_2_DI_IMM:
+ case MULTI_ARG_2_SI_IMM:
+ case MULTI_ARG_2_HI_IMM:
+ case MULTI_ARG_2_QI_IMM:
+ nargs = 2;
+ last_arg_constant = true;
+ break;
+
+ case MULTI_ARG_1_SF:
+ case MULTI_ARG_1_DF:
+ case MULTI_ARG_1_SF2:
+ case MULTI_ARG_1_DF2:
+ case MULTI_ARG_1_DI:
+ case MULTI_ARG_1_SI:
+ case MULTI_ARG_1_HI:
+ case MULTI_ARG_1_QI:
+ case MULTI_ARG_1_SI_DI:
+ case MULTI_ARG_1_HI_DI:
+ case MULTI_ARG_1_HI_SI:
+ case MULTI_ARG_1_QI_DI:
+ case MULTI_ARG_1_QI_SI:
+ case MULTI_ARG_1_QI_HI:
+ nargs = 1;
+ break;
+
+ case MULTI_ARG_2_DI_CMP:
+ case MULTI_ARG_2_SI_CMP:
+ case MULTI_ARG_2_HI_CMP:
+ case MULTI_ARG_2_QI_CMP:
+ nargs = 2;
+ comparison_p = true;
+ break;
+
+ case MULTI_ARG_2_SF_TF:
+ case MULTI_ARG_2_DF_TF:
+ case MULTI_ARG_2_DI_TF:
+ case MULTI_ARG_2_SI_TF:
+ case MULTI_ARG_2_HI_TF:
+ case MULTI_ARG_2_QI_TF:
+ nargs = 2;
+ tf_p = true;
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ if (optimize || !target
+ || GET_MODE (target) != tmode
+ || !insn_data[icode].operand[0].predicate (target, tmode))
+ target = gen_reg_rtx (tmode);
+ else if (memory_operand (target, tmode))
+ num_memory++;
+
+ gcc_assert (nargs <= 4);
+
+ for (i = 0; i < nargs; i++)
+ {
+ tree arg = CALL_EXPR_ARG (exp, i);
+ rtx op = expand_normal (arg);
+ int adjust = (comparison_p) ? 1 : 0;
+ machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
+
+ if (last_arg_constant && i == nargs - 1)
+ {
+ if (!insn_data[icode].operand[i + 1].predicate (op, mode))
+ {
+ enum insn_code new_icode = icode;
+ switch (icode)
+ {
+ case CODE_FOR_xop_vpermil2v2df3:
+ case CODE_FOR_xop_vpermil2v4sf3:
+ case CODE_FOR_xop_vpermil2v4df3:
+ case CODE_FOR_xop_vpermil2v8sf3:
+ error ("the last argument must be a 2-bit immediate");
+ return gen_reg_rtx (tmode);
+ case CODE_FOR_xop_rotlv2di3:
+ new_icode = CODE_FOR_rotlv2di3;
+ goto xop_rotl;
+ case CODE_FOR_xop_rotlv4si3:
+ new_icode = CODE_FOR_rotlv4si3;
+ goto xop_rotl;
+ case CODE_FOR_xop_rotlv8hi3:
+ new_icode = CODE_FOR_rotlv8hi3;
+ goto xop_rotl;
+ case CODE_FOR_xop_rotlv16qi3:
+ new_icode = CODE_FOR_rotlv16qi3;
+ xop_rotl:
+ if (CONST_INT_P (op))
+ {
+ int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
+ op = GEN_INT (INTVAL (op) & mask);
+ gcc_checking_assert
+ (insn_data[icode].operand[i + 1].predicate (op, mode));
+ }
+ else
+ {
+ gcc_checking_assert
+ (nargs == 2
+ && insn_data[new_icode].operand[0].mode == tmode
+ && insn_data[new_icode].operand[1].mode == tmode
+ && insn_data[new_icode].operand[2].mode == mode
+ && insn_data[new_icode].operand[0].predicate
+ == insn_data[icode].operand[0].predicate
+ && insn_data[new_icode].operand[1].predicate
+ == insn_data[icode].operand[1].predicate);
+ icode = new_icode;
+ goto non_constant;
+ }
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ }
+ }
+ else
+ {
+ non_constant:
+ if (VECTOR_MODE_P (mode))
+ op = safe_vector_operand (op, mode);
+
+ /* If we aren't optimizing, only allow one memory operand to be
+ generated. */
+ if (memory_operand (op, mode))
+ num_memory++;
+
+ gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
+
+ if (optimize
+ || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
+ || num_memory > 1)
+ op = force_reg (mode, op);
+ }
+
+ args[i].op = op;
+ args[i].mode = mode;
+ }
+
+ switch (nargs)
+ {
+ case 1:
+ pat = GEN_FCN (icode) (target, args[0].op);
+ break;
+
+ case 2:
+ if (tf_p)
+ pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
+ GEN_INT ((int)sub_code));
+ else if (! comparison_p)
+ pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
+ else
+ {
+ rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
+ args[0].op,
+ args[1].op);
+
+ pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
+ }
+ break;
+
+ case 3:
+ pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
+ break;
+
+ case 4:
+ pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ if (! pat)
+ return 0;
+
+ emit_insn (pat);
+ return target;
+}
+
+/* Subroutine of ix86_expand_args_builtin to take care of scalar unop
+ insns with vec_merge. */
+
+static rtx
+ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
+ rtx target)
+{
+ rtx pat;
+ tree arg0 = CALL_EXPR_ARG (exp, 0);
+ rtx op1, op0 = expand_normal (arg0);
+ machine_mode tmode = insn_data[icode].operand[0].mode;
+ machine_mode mode0 = insn_data[icode].operand[1].mode;
+
+ if (optimize || !target
+ || GET_MODE (target) != tmode
+ || !insn_data[icode].operand[0].predicate (target, tmode))
+ target = gen_reg_rtx (tmode);
+
+ if (VECTOR_MODE_P (mode0))
+ op0 = safe_vector_operand (op0, mode0);
+
+ if ((optimize && !register_operand (op0, mode0))
+ || !insn_data[icode].operand[1].predicate (op0, mode0))
+ op0 = copy_to_mode_reg (mode0, op0);
+
+ op1 = op0;
+ if (!insn_data[icode].operand[2].predicate (op1, mode0))
+ op1 = copy_to_mode_reg (mode0, op1);
+
+ pat = GEN_FCN (icode) (target, op0, op1);
+ if (! pat)
+ return 0;
+ emit_insn (pat);
+ return target;
+}
+
+/* Subroutine of ix86_expand_builtin to take care of comparison insns. */
+
+static rtx
+ix86_expand_sse_compare (const struct builtin_description *d,
+ tree exp, rtx target, bool swap)
+{
+ rtx pat;
+ tree arg0 = CALL_EXPR_ARG (exp, 0);
+ tree arg1 = CALL_EXPR_ARG (exp, 1);
+ rtx op0 = expand_normal (arg0);
+ rtx op1 = expand_normal (arg1);
+ rtx op2;
+ machine_mode tmode = insn_data[d->icode].operand[0].mode;
+ machine_mode mode0 = insn_data[d->icode].operand[1].mode;
+ machine_mode mode1 = insn_data[d->icode].operand[2].mode;
+ enum rtx_code comparison = d->comparison;
+
+ if (VECTOR_MODE_P (mode0))
+ op0 = safe_vector_operand (op0, mode0);
+ if (VECTOR_MODE_P (mode1))
+ op1 = safe_vector_operand (op1, mode1);
+
+ /* Swap operands if we have a comparison that isn't available in
+ hardware. */
+ if (swap)
+ std::swap (op0, op1);
+
+ if (optimize || !target
+ || GET_MODE (target) != tmode
+ || !insn_data[d->icode].operand[0].predicate (target, tmode))
+ target = gen_reg_rtx (tmode);
+
+ if ((optimize && !register_operand (op0, mode0))
+ || !insn_data[d->icode].operand[1].predicate (op0, mode0))
+ op0 = copy_to_mode_reg (mode0, op0);
+ if ((optimize && !register_operand (op1, mode1))
+ || !insn_data[d->icode].operand[2].predicate (op1, mode1))
+ op1 = copy_to_mode_reg (mode1, op1);
+
+ op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
+ pat = GEN_FCN (d->icode) (target, op0, op1, op2);
+ if (! pat)
+ return 0;
+ emit_insn (pat);
+ return target;
+}
+
+/* Subroutine of ix86_expand_builtin to take care of comi insns. */
+
+static rtx
+ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
+ rtx target)
+{
+ rtx pat;
+ tree arg0 = CALL_EXPR_ARG (exp, 0);
+ tree arg1 = CALL_EXPR_ARG (exp, 1);
+ rtx op0 = expand_normal (arg0);
+ rtx op1 = expand_normal (arg1);
+ machine_mode mode0 = insn_data[d->icode].operand[0].mode;
+ machine_mode mode1 = insn_data[d->icode].operand[1].mode;
+ enum rtx_code comparison = d->comparison;
+
+ if (VECTOR_MODE_P (mode0))
+ op0 = safe_vector_operand (op0, mode0);
+ if (VECTOR_MODE_P (mode1))
+ op1 = safe_vector_operand (op1, mode1);
+
+ /* Swap operands if we have a comparison that isn't available in
+ hardware. */
+ if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
+ std::swap (op0, op1);
+
+ target = gen_reg_rtx (SImode);
+ emit_move_insn (target, const0_rtx);
+ target = gen_rtx_SUBREG (QImode, target, 0);
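+ /* TARGET is now the QImode low part of a zeroed SImode pseudo; the
+ comparison result is written into it via STRICT_LOW_PART below, and the
+ containing SImode register is returned, so its upper bits are known to
+ be zero. */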
+
+ if ((optimize && !register_operand (op0, mode0))
+ || !insn_data[d->icode].operand[0].predicate (op0, mode0))
+ op0 = copy_to_mode_reg (mode0, op0);
+ if ((optimize && !register_operand (op1, mode1))
+ || !insn_data[d->icode].operand[1].predicate (op1, mode1))
+ op1 = copy_to_mode_reg (mode1, op1);
+
+ pat = GEN_FCN (d->icode) (op0, op1);
+ if (! pat)
+ return 0;
+ emit_insn (pat);
+ emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
+ gen_rtx_fmt_ee (comparison, QImode,
+ SET_DEST (pat),
+ const0_rtx)));
+
+ return SUBREG_REG (target);
+}
+
+/* Subroutines of ix86_expand_args_builtin to take care of round insns. */
+
+static rtx
+ix86_expand_sse_round (const struct builtin_description *d, tree exp,
+ rtx target)
+{
+ rtx pat;
+ tree arg0 = CALL_EXPR_ARG (exp, 0);
+ rtx op1, op0 = expand_normal (arg0);
+ machine_mode tmode = insn_data[d->icode].operand[0].mode;
+ machine_mode mode0 = insn_data[d->icode].operand[1].mode;
+
+ if (optimize || target == 0
+ || GET_MODE (target) != tmode
+ || !insn_data[d->icode].operand[0].predicate (target, tmode))
+ target = gen_reg_rtx (tmode);
+
+ if (VECTOR_MODE_P (mode0))
+ op0 = safe_vector_operand (op0, mode0);
+
+ if ((optimize && !register_operand (op0, mode0))
+ || !insn_data[d->icode].operand[0].predicate (op0, mode0))
+ op0 = copy_to_mode_reg (mode0, op0);
+
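+ /* For the ROUND builtins the comparison field of the descriptor
+ does not hold an rtx_code; it is reused to carry the rounding
+ mode immediate that becomes the last operand of the pattern. */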
+ op1 = GEN_INT (d->comparison);
+
+ pat = GEN_FCN (d->icode) (target, op0, op1);
+ if (! pat)
+ return 0;
+ emit_insn (pat);
+ return target;
+}
+
+static rtx
+ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
+ tree exp, rtx target)
+{
+ rtx pat;
+ tree arg0 = CALL_EXPR_ARG (exp, 0);
+ tree arg1 = CALL_EXPR_ARG (exp, 1);
+ rtx op0 = expand_normal (arg0);
+ rtx op1 = expand_normal (arg1);
+ rtx op2;
+ machine_mode tmode = insn_data[d->icode].operand[0].mode;
+ machine_mode mode0 = insn_data[d->icode].operand[1].mode;
+ machine_mode mode1 = insn_data[d->icode].operand[2].mode;
+
+ if (optimize || target == 0
+ || GET_MODE (target) != tmode
+ || !insn_data[d->icode].operand[0].predicate (target, tmode))
+ target = gen_reg_rtx (tmode);
+
+ op0 = safe_vector_operand (op0, mode0);
+ op1 = safe_vector_operand (op1, mode1);
+
+ if ((optimize && !register_operand (op0, mode0))
+ || !insn_data[d->icode].operand[0].predicate (op0, mode0))
+ op0 = copy_to_mode_reg (mode0, op0);
+ if ((optimize && !register_operand (op1, mode1))
+ || !insn_data[d->icode].operand[1].predicate (op1, mode1))
+ op1 = copy_to_mode_reg (mode1, op1);
+
+ op2 = GEN_INT (d->comparison);
+
+ pat = GEN_FCN (d->icode) (target, op0, op1, op2);
+ if (! pat)
+ return 0;
+ emit_insn (pat);
+ return target;
+}
+
+/* Subroutine of ix86_expand_builtin to take care of ptest insns. */
+
+static rtx
+ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
+ rtx target)
+{
+ rtx pat;
+ tree arg0 = CALL_EXPR_ARG (exp, 0);
+ tree arg1 = CALL_EXPR_ARG (exp, 1);
+ rtx op0 = expand_normal (arg0);
+ rtx op1 = expand_normal (arg1);
+ machine_mode mode0 = insn_data[d->icode].operand[0].mode;
+ machine_mode mode1 = insn_data[d->icode].operand[1].mode;
+ enum rtx_code comparison = d->comparison;
+
+ if (VECTOR_MODE_P (mode0))
+ op0 = safe_vector_operand (op0, mode0);
+ if (VECTOR_MODE_P (mode1))
+ op1 = safe_vector_operand (op1, mode1);
+
+ target = gen_reg_rtx (SImode);
+ emit_move_insn (target, const0_rtx);
+ target = gen_rtx_SUBREG (QImode, target, 0);
+
+ if ((optimize && !register_operand (op0, mode0))
+ || !insn_data[d->icode].operand[0].predicate (op0, mode0))
+ op0 = copy_to_mode_reg (mode0, op0);
+ if ((optimize && !register_operand (op1, mode1))
+ || !insn_data[d->icode].operand[1].predicate (op1, mode1))
+ op1 = copy_to_mode_reg (mode1, op1);
+
+ pat = GEN_FCN (d->icode) (op0, op1);
+ if (! pat)
+ return 0;
+ emit_insn (pat);
+ emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
+ gen_rtx_fmt_ee (comparison, QImode,
+ SET_DEST (pat),
+ const0_rtx)));
+
+ return SUBREG_REG (target);
+}
+
+/* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
+
+static rtx
+ix86_expand_sse_pcmpestr (const struct builtin_description *d,
+ tree exp, rtx target)
+{
+ rtx pat;
+ tree arg0 = CALL_EXPR_ARG (exp, 0);
+ tree arg1 = CALL_EXPR_ARG (exp, 1);
+ tree arg2 = CALL_EXPR_ARG (exp, 2);
+ tree arg3 = CALL_EXPR_ARG (exp, 3);
+ tree arg4 = CALL_EXPR_ARG (exp, 4);
+ rtx scratch0, scratch1;
+ rtx op0 = expand_normal (arg0);
+ rtx op1 = expand_normal (arg1);
+ rtx op2 = expand_normal (arg2);
+ rtx op3 = expand_normal (arg3);
+ rtx op4 = expand_normal (arg4);
+ machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
+
+ tmode0 = insn_data[d->icode].operand[0].mode;
+ tmode1 = insn_data[d->icode].operand[1].mode;
+ modev2 = insn_data[d->icode].operand[2].mode;
+ modei3 = insn_data[d->icode].operand[3].mode;
+ modev4 = insn_data[d->icode].operand[4].mode;
+ modei5 = insn_data[d->icode].operand[5].mode;
+ modeimm = insn_data[d->icode].operand[6].mode;
+
+ if (VECTOR_MODE_P (modev2))
+ op0 = safe_vector_operand (op0, modev2);
+ if (VECTOR_MODE_P (modev4))
+ op2 = safe_vector_operand (op2, modev4);
+
+ if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
+ op0 = copy_to_mode_reg (modev2, op0);
+ if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
+ op1 = copy_to_mode_reg (modei3, op1);
+ if ((optimize && !register_operand (op2, modev4))
+ || !insn_data[d->icode].operand[4].predicate (op2, modev4))
+ op2 = copy_to_mode_reg (modev4, op2);
+ if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
+ op3 = copy_to_mode_reg (modei5, op3);
+
+ if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
+ {
+ error ("the fifth argument must be an 8-bit immediate");
+ return const0_rtx;
+ }
+
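+ /* Three flavors are handled below: pcmpestri-style builtins
+ return the index result (operand 0, tmode0), pcmpestrm-style
+ builtins return the mask result (operand 1, tmode1), and the
+ remaining variants return a single condition bit: for those
+ d->flag holds the machine_mode of the flags register to test,
+ and the EQ comparison emitted at the end turns it into an int. */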
+ if (d->code == IX86_BUILTIN_PCMPESTRI128)
+ {
+ if (optimize || !target
+ || GET_MODE (target) != tmode0
+ || !insn_data[d->icode].operand[0].predicate (target, tmode0))
+ target = gen_reg_rtx (tmode0);
+
+ scratch1 = gen_reg_rtx (tmode1);
+
+ pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
+ }
+ else if (d->code == IX86_BUILTIN_PCMPESTRM128)
+ {
+ if (optimize || !target
+ || GET_MODE (target) != tmode1
+ || !insn_data[d->icode].operand[1].predicate (target, tmode1))
+ target = gen_reg_rtx (tmode1);
+
+ scratch0 = gen_reg_rtx (tmode0);
+
+ pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
+ }
+ else
+ {
+ gcc_assert (d->flag);
+
+ scratch0 = gen_reg_rtx (tmode0);
+ scratch1 = gen_reg_rtx (tmode1);
+
+ pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
+ }
+
+ if (! pat)
+ return 0;
+
+ emit_insn (pat);
+
+ if (d->flag)
+ {
+ target = gen_reg_rtx (SImode);
+ emit_move_insn (target, const0_rtx);
+ target = gen_rtx_SUBREG (QImode, target, 0);
+
+ emit_insn
+ (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
+ gen_rtx_fmt_ee (EQ, QImode,
+ gen_rtx_REG ((machine_mode) d->flag,
+ FLAGS_REG),
+ const0_rtx)));
+ return SUBREG_REG (target);
+ }
+ else
+ return target;
+}
+
+
+/* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
+
+static rtx
+ix86_expand_sse_pcmpistr (const struct builtin_description *d,
+ tree exp, rtx target)
+{
+ rtx pat;
+ tree arg0 = CALL_EXPR_ARG (exp, 0);
+ tree arg1 = CALL_EXPR_ARG (exp, 1);
+ tree arg2 = CALL_EXPR_ARG (exp, 2);
+ rtx scratch0, scratch1;
+ rtx op0 = expand_normal (arg0);
+ rtx op1 = expand_normal (arg1);
+ rtx op2 = expand_normal (arg2);
+ machine_mode tmode0, tmode1, modev2, modev3, modeimm;
+
+ tmode0 = insn_data[d->icode].operand[0].mode;
+ tmode1 = insn_data[d->icode].operand[1].mode;
+ modev2 = insn_data[d->icode].operand[2].mode;
+ modev3 = insn_data[d->icode].operand[3].mode;
+ modeimm = insn_data[d->icode].operand[4].mode;
+
+ if (VECTOR_MODE_P (modev2))
+ op0 = safe_vector_operand (op0, modev2);
+ if (VECTOR_MODE_P (modev3))
+ op1 = safe_vector_operand (op1, modev3);
+
+ if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
+ op0 = copy_to_mode_reg (modev2, op0);
+ if ((optimize && !register_operand (op1, modev3))
+ || !insn_data[d->icode].operand[3].predicate (op1, modev3))
+ op1 = copy_to_mode_reg (modev3, op1);
+
+ if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
+ {
+ error ("the third argument must be an 8-bit immediate");
+ return const0_rtx;
+ }
+
+ if (d->code == IX86_BUILTIN_PCMPISTRI128)
+ {
+ if (optimize || !target
+ || GET_MODE (target) != tmode0
+ || !insn_data[d->icode].operand[0].predicate (target, tmode0))
+ target = gen_reg_rtx (tmode0);
+
+ scratch1 = gen_reg_rtx (tmode1);
+
+ pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
+ }
+ else if (d->code == IX86_BUILTIN_PCMPISTRM128)
+ {
+ if (optimize || !target
+ || GET_MODE (target) != tmode1
+ || !insn_data[d->icode].operand[1].predicate (target, tmode1))
+ target = gen_reg_rtx (tmode1);
+
+ scratch0 = gen_reg_rtx (tmode0);
+
+ pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
+ }
+ else
+ {
+ gcc_assert (d->flag);
+
+ scratch0 = gen_reg_rtx (tmode0);
+ scratch1 = gen_reg_rtx (tmode1);
+
+ pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
+ }
+
+ if (! pat)
+ return 0;
+
+ emit_insn (pat);
+
+ if (d->flag)
+ {
+ target = gen_reg_rtx (SImode);
+ emit_move_insn (target, const0_rtx);
+ target = gen_rtx_SUBREG (QImode, target, 0);
+
+ emit_insn
+ (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
+ gen_rtx_fmt_ee (EQ, QImode,
+ gen_rtx_REG ((machine_mode) d->flag,
+ FLAGS_REG),
+ const0_rtx)));
+ return SUBREG_REG (target);
+ }
+ else
+ return target;
+}
+
+/* Fix up modeless constants to fit the required mode. */
+
+static rtx
+fixup_modeless_constant (rtx x, machine_mode mode)
+{
+ if (GET_MODE (x) == VOIDmode)
+ x = convert_to_mode (mode, x, 1);
+ return x;
+}
+
+/* Subroutine of ix86_expand_builtin to take care of insns with
+ a variable number of operands. */
+
+static rtx
+ix86_expand_args_builtin (const struct builtin_description *d,
+ tree exp, rtx target)
+{
+ rtx pat, real_target;
+ unsigned int i, nargs;
+ unsigned int nargs_constant = 0;
+ unsigned int mask_pos = 0;
+ int num_memory = 0;
+ struct
+ {
+ rtx op;
+ machine_mode mode;
+ } args[6];
+ bool second_arg_count = false;
+ enum insn_code icode = d->icode;
+ const struct insn_data_d *insn_p = &insn_data[icode];
+ machine_mode tmode = insn_p->operand[0].mode;
+ machine_mode rmode = VOIDmode;
+ bool swap = false;
+ enum rtx_code comparison = d->comparison;
+
+ switch ((enum ix86_builtin_func_type) d->flag)
+ {
+ case V2DF_FTYPE_V2DF_ROUND:
+ case V4DF_FTYPE_V4DF_ROUND:
+ case V8DF_FTYPE_V8DF_ROUND:
+ case V4SF_FTYPE_V4SF_ROUND:
+ case V8SF_FTYPE_V8SF_ROUND:
+ case V16SF_FTYPE_V16SF_ROUND:
+ case V4SI_FTYPE_V4SF_ROUND:
+ case V8SI_FTYPE_V8SF_ROUND:
+ case V16SI_FTYPE_V16SF_ROUND:
+ return ix86_expand_sse_round (d, exp, target);
+ case V4SI_FTYPE_V2DF_V2DF_ROUND:
+ case V8SI_FTYPE_V4DF_V4DF_ROUND:
+ case V16SI_FTYPE_V8DF_V8DF_ROUND:
+ return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
+ case INT_FTYPE_V8SF_V8SF_PTEST:
+ case INT_FTYPE_V4DI_V4DI_PTEST:
+ case INT_FTYPE_V4DF_V4DF_PTEST:
+ case INT_FTYPE_V4SF_V4SF_PTEST:
+ case INT_FTYPE_V2DI_V2DI_PTEST:
+ case INT_FTYPE_V2DF_V2DF_PTEST:
+ return ix86_expand_sse_ptest (d, exp, target);
+ case FLOAT128_FTYPE_FLOAT128:
+ case FLOAT_FTYPE_FLOAT:
+ case INT_FTYPE_INT:
+ case UINT_FTYPE_UINT:
+ case UINT16_FTYPE_UINT16:
+ case UINT64_FTYPE_INT:
+ case UINT64_FTYPE_UINT64:
+ case INT64_FTYPE_INT64:
+ case INT64_FTYPE_V4SF:
+ case INT64_FTYPE_V2DF:
+ case INT_FTYPE_V16QI:
+ case INT_FTYPE_V8QI:
+ case INT_FTYPE_V8SF:
+ case INT_FTYPE_V4DF:
+ case INT_FTYPE_V4SF:
+ case INT_FTYPE_V2DF:
+ case INT_FTYPE_V32QI:
+ case V16QI_FTYPE_V16QI:
+ case V8SI_FTYPE_V8SF:
+ case V8SI_FTYPE_V4SI:
+ case V8HI_FTYPE_V8HI:
+ case V8HI_FTYPE_V16QI:
+ case V8QI_FTYPE_V8QI:
+ case V8SF_FTYPE_V8SF:
+ case V8SF_FTYPE_V8SI:
+ case V8SF_FTYPE_V4SF:
+ case V8SF_FTYPE_V8HI:
+ case V4SI_FTYPE_V4SI:
+ case V4SI_FTYPE_V16QI:
+ case V4SI_FTYPE_V4SF:
+ case V4SI_FTYPE_V8SI:
+ case V4SI_FTYPE_V8HI:
+ case V4SI_FTYPE_V4DF:
+ case V4SI_FTYPE_V2DF:
+ case V4HI_FTYPE_V4HI:
+ case V4DF_FTYPE_V4DF:
+ case V4DF_FTYPE_V4SI:
+ case V4DF_FTYPE_V4SF:
+ case V4DF_FTYPE_V2DF:
+ case V4SF_FTYPE_V4SF:
+ case V4SF_FTYPE_V4SI:
+ case V4SF_FTYPE_V8SF:
+ case V4SF_FTYPE_V4DF:
+ case V4SF_FTYPE_V8HI:
+ case V4SF_FTYPE_V2DF:
+ case V2DI_FTYPE_V2DI:
+ case V2DI_FTYPE_V16QI:
+ case V2DI_FTYPE_V8HI:
+ case V2DI_FTYPE_V4SI:
+ case V2DF_FTYPE_V2DF:
+ case V2DF_FTYPE_V4SI:
+ case V2DF_FTYPE_V4DF:
+ case V2DF_FTYPE_V4SF:
+ case V2DF_FTYPE_V2SI:
+ case V2SI_FTYPE_V2SI:
+ case V2SI_FTYPE_V4SF:
+ case V2SI_FTYPE_V2SF:
+ case V2SI_FTYPE_V2DF:
+ case V2SF_FTYPE_V2SF:
+ case V2SF_FTYPE_V2SI:
+ case V32QI_FTYPE_V32QI:
+ case V32QI_FTYPE_V16QI:
+ case V16HI_FTYPE_V16HI:
+ case V16HI_FTYPE_V8HI:
+ case V8SI_FTYPE_V8SI:
+ case V16HI_FTYPE_V16QI:
+ case V8SI_FTYPE_V16QI:
+ case V4DI_FTYPE_V16QI:
+ case V8SI_FTYPE_V8HI:
+ case V4DI_FTYPE_V8HI:
+ case V4DI_FTYPE_V4SI:
+ case V4DI_FTYPE_V2DI:
+ case UQI_FTYPE_UQI:
+ case UHI_FTYPE_UHI:
+ case USI_FTYPE_USI:
+ case USI_FTYPE_UQI:
+ case USI_FTYPE_UHI:
+ case UDI_FTYPE_UDI:
+ case UHI_FTYPE_V16QI:
+ case USI_FTYPE_V32QI:
+ case UDI_FTYPE_V64QI:
+ case V16QI_FTYPE_UHI:
+ case V32QI_FTYPE_USI:
+ case V64QI_FTYPE_UDI:
+ case V8HI_FTYPE_UQI:
+ case V16HI_FTYPE_UHI:
+ case V32HI_FTYPE_USI:
+ case V4SI_FTYPE_UQI:
+ case V8SI_FTYPE_UQI:
+ case V4SI_FTYPE_UHI:
+ case V8SI_FTYPE_UHI:
+ case UQI_FTYPE_V8HI:
+ case UHI_FTYPE_V16HI:
+ case USI_FTYPE_V32HI:
+ case UQI_FTYPE_V4SI:
+ case UQI_FTYPE_V8SI:
+ case UHI_FTYPE_V16SI:
+ case UQI_FTYPE_V2DI:
+ case UQI_FTYPE_V4DI:
+ case UQI_FTYPE_V8DI:
+ case V16SI_FTYPE_UHI:
+ case V2DI_FTYPE_UQI:
+ case V4DI_FTYPE_UQI:
+ case V16SI_FTYPE_INT:
+ case V16SF_FTYPE_V8SF:
+ case V16SI_FTYPE_V8SI:
+ case V16SF_FTYPE_V4SF:
+ case V16SI_FTYPE_V4SI:
+ case V16SI_FTYPE_V16SF:
+ case V16SI_FTYPE_V16SI:
+ case V64QI_FTYPE_V64QI:
+ case V32HI_FTYPE_V32HI:
+ case V16SF_FTYPE_V16SF:
+ case V8DI_FTYPE_UQI:
+ case V8DI_FTYPE_V8DI:
+ case V8DF_FTYPE_V4DF:
+ case V8DF_FTYPE_V2DF:
+ case V8DF_FTYPE_V8DF:
+ case V4DI_FTYPE_V4DI:
+ nargs = 1;
+ break;
+ case V4SF_FTYPE_V4SF_VEC_MERGE:
+ case V2DF_FTYPE_V2DF_VEC_MERGE:
+ return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
+ case FLOAT128_FTYPE_FLOAT128_FLOAT128:
+ case V16QI_FTYPE_V16QI_V16QI:
+ case V16QI_FTYPE_V8HI_V8HI:
+ case V16SF_FTYPE_V16SF_V16SF:
+ case V8QI_FTYPE_V8QI_V8QI:
+ case V8QI_FTYPE_V4HI_V4HI:
+ case V8HI_FTYPE_V8HI_V8HI:
+ case V8HI_FTYPE_V16QI_V16QI:
+ case V8HI_FTYPE_V4SI_V4SI:
+ case V8SF_FTYPE_V8SF_V8SF:
+ case V8SF_FTYPE_V8SF_V8SI:
+ case V8DF_FTYPE_V8DF_V8DF:
+ case V4SI_FTYPE_V4SI_V4SI:
+ case V4SI_FTYPE_V8HI_V8HI:
+ case V4SI_FTYPE_V2DF_V2DF:
+ case V4HI_FTYPE_V4HI_V4HI:
+ case V4HI_FTYPE_V8QI_V8QI:
+ case V4HI_FTYPE_V2SI_V2SI:
+ case V4DF_FTYPE_V4DF_V4DF:
+ case V4DF_FTYPE_V4DF_V4DI:
+ case V4SF_FTYPE_V4SF_V4SF:
+ case V4SF_FTYPE_V4SF_V4SI:
+ case V4SF_FTYPE_V4SF_V2SI:
+ case V4SF_FTYPE_V4SF_V2DF:
+ case V4SF_FTYPE_V4SF_UINT:
+ case V4SF_FTYPE_V4SF_DI:
+ case V4SF_FTYPE_V4SF_SI:
+ case V2DI_FTYPE_V2DI_V2DI:
+ case V2DI_FTYPE_V16QI_V16QI:
+ case V2DI_FTYPE_V4SI_V4SI:
+ case V2DI_FTYPE_V2DI_V16QI:
+ case V2SI_FTYPE_V2SI_V2SI:
+ case V2SI_FTYPE_V4HI_V4HI:
+ case V2SI_FTYPE_V2SF_V2SF:
+ case V2DF_FTYPE_V2DF_V2DF:
+ case V2DF_FTYPE_V2DF_V4SF:
+ case V2DF_FTYPE_V2DF_V2DI:
+ case V2DF_FTYPE_V2DF_DI:
+ case V2DF_FTYPE_V2DF_SI:
+ case V2DF_FTYPE_V2DF_UINT:
+ case V2SF_FTYPE_V2SF_V2SF:
+ case V1DI_FTYPE_V1DI_V1DI:
+ case V1DI_FTYPE_V8QI_V8QI:
+ case V1DI_FTYPE_V2SI_V2SI:
+ case V32QI_FTYPE_V16HI_V16HI:
+ case V16HI_FTYPE_V8SI_V8SI:
+ case V64QI_FTYPE_V64QI_V64QI:
+ case V32QI_FTYPE_V32QI_V32QI:
+ case V16HI_FTYPE_V32QI_V32QI:
+ case V16HI_FTYPE_V16HI_V16HI:
+ case V8SI_FTYPE_V4DF_V4DF:
+ case V8SI_FTYPE_V8SI_V8SI:
+ case V8SI_FTYPE_V16HI_V16HI:
+ case V4DI_FTYPE_V4DI_V4DI:
+ case V4DI_FTYPE_V8SI_V8SI:
+ case V8DI_FTYPE_V64QI_V64QI:
+ if (comparison == UNKNOWN)
+ return ix86_expand_binop_builtin (icode, exp, target);
+ nargs = 2;
+ break;
+ case V4SF_FTYPE_V4SF_V4SF_SWAP:
+ case V2DF_FTYPE_V2DF_V2DF_SWAP:
+ gcc_assert (comparison != UNKNOWN);
+ nargs = 2;
+ swap = true;
+ break;
+ case V16HI_FTYPE_V16HI_V8HI_COUNT:
+ case V16HI_FTYPE_V16HI_SI_COUNT:
+ case V8SI_FTYPE_V8SI_V4SI_COUNT:
+ case V8SI_FTYPE_V8SI_SI_COUNT:
+ case V4DI_FTYPE_V4DI_V2DI_COUNT:
+ case V4DI_FTYPE_V4DI_INT_COUNT:
+ case V8HI_FTYPE_V8HI_V8HI_COUNT:
+ case V8HI_FTYPE_V8HI_SI_COUNT:
+ case V4SI_FTYPE_V4SI_V4SI_COUNT:
+ case V4SI_FTYPE_V4SI_SI_COUNT:
+ case V4HI_FTYPE_V4HI_V4HI_COUNT:
+ case V4HI_FTYPE_V4HI_SI_COUNT:
+ case V2DI_FTYPE_V2DI_V2DI_COUNT:
+ case V2DI_FTYPE_V2DI_SI_COUNT:
+ case V2SI_FTYPE_V2SI_V2SI_COUNT:
+ case V2SI_FTYPE_V2SI_SI_COUNT:
+ case V1DI_FTYPE_V1DI_V1DI_COUNT:
+ case V1DI_FTYPE_V1DI_SI_COUNT:
+ nargs = 2;
+ second_arg_count = true;
+ break;
+ case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
+ case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
+ case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
+ case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
+ case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
+ case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
+ case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
+ case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
+ case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
+ case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
+ case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
+ case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
+ case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
+ case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
+ case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
+ case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
+ case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
+ case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
+ nargs = 4;
+ second_arg_count = true;
+ break;
+ case UINT64_FTYPE_UINT64_UINT64:
+ case UINT_FTYPE_UINT_UINT:
+ case UINT_FTYPE_UINT_USHORT:
+ case UINT_FTYPE_UINT_UCHAR:
+ case UINT16_FTYPE_UINT16_INT:
+ case UINT8_FTYPE_UINT8_INT:
+ case UQI_FTYPE_UQI_UQI:
+ case UHI_FTYPE_UHI_UHI:
+ case USI_FTYPE_USI_USI:
+ case UDI_FTYPE_UDI_UDI:
+ case V16SI_FTYPE_V8DF_V8DF:
+ nargs = 2;
+ break;
+ case V2DI_FTYPE_V2DI_INT_CONVERT:
+ nargs = 2;
+ rmode = V1TImode;
+ nargs_constant = 1;
+ break;
+ case V4DI_FTYPE_V4DI_INT_CONVERT:
+ nargs = 2;
+ rmode = V2TImode;
+ nargs_constant = 1;
+ break;
+ case V8DI_FTYPE_V8DI_INT_CONVERT:
+ nargs = 2;
+ rmode = V4TImode;
+ nargs_constant = 1;
+ break;
+ case V8HI_FTYPE_V8HI_INT:
+ case V8HI_FTYPE_V8SF_INT:
+ case V16HI_FTYPE_V16SF_INT:
+ case V8HI_FTYPE_V4SF_INT:
+ case V8SF_FTYPE_V8SF_INT:
+ case V4SF_FTYPE_V16SF_INT:
+ case V16SF_FTYPE_V16SF_INT:
+ case V4SI_FTYPE_V4SI_INT:
+ case V4SI_FTYPE_V8SI_INT:
+ case V4HI_FTYPE_V4HI_INT:
+ case V4DF_FTYPE_V4DF_INT:
+ case V4DF_FTYPE_V8DF_INT:
+ case V4SF_FTYPE_V4SF_INT:
+ case V4SF_FTYPE_V8SF_INT:
+ case V2DI_FTYPE_V2DI_INT:
+ case V2DF_FTYPE_V2DF_INT:
+ case V2DF_FTYPE_V4DF_INT:
+ case V16HI_FTYPE_V16HI_INT:
+ case V8SI_FTYPE_V8SI_INT:
+ case V16SI_FTYPE_V16SI_INT:
+ case V4SI_FTYPE_V16SI_INT:
+ case V4DI_FTYPE_V4DI_INT:
+ case V2DI_FTYPE_V4DI_INT:
+ case V4DI_FTYPE_V8DI_INT:
+ case QI_FTYPE_V4SF_INT:
+ case QI_FTYPE_V2DF_INT:
+ case UQI_FTYPE_UQI_UQI_CONST:
+ case UHI_FTYPE_UHI_UQI:
+ case USI_FTYPE_USI_UQI:
+ case UDI_FTYPE_UDI_UQI:
+ nargs = 2;
+ nargs_constant = 1;
+ break;
+ case V16QI_FTYPE_V16QI_V16QI_V16QI:
+ case V8SF_FTYPE_V8SF_V8SF_V8SF:
+ case V4DF_FTYPE_V4DF_V4DF_V4DF:
+ case V4SF_FTYPE_V4SF_V4SF_V4SF:
+ case V2DF_FTYPE_V2DF_V2DF_V2DF:
+ case V32QI_FTYPE_V32QI_V32QI_V32QI:
+ case UHI_FTYPE_V16SI_V16SI_UHI:
+ case UQI_FTYPE_V8DI_V8DI_UQI:
+ case V16HI_FTYPE_V16SI_V16HI_UHI:
+ case V16QI_FTYPE_V16SI_V16QI_UHI:
+ case V16QI_FTYPE_V8DI_V16QI_UQI:
+ case V16SF_FTYPE_V16SF_V16SF_UHI:
+ case V16SF_FTYPE_V4SF_V16SF_UHI:
+ case V16SI_FTYPE_SI_V16SI_UHI:
+ case V16SI_FTYPE_V16HI_V16SI_UHI:
+ case V16SI_FTYPE_V16QI_V16SI_UHI:
+ case V8SF_FTYPE_V4SF_V8SF_UQI:
+ case V4DF_FTYPE_V2DF_V4DF_UQI:
+ case V8SI_FTYPE_V4SI_V8SI_UQI:
+ case V8SI_FTYPE_SI_V8SI_UQI:
+ case V4SI_FTYPE_V4SI_V4SI_UQI:
+ case V4SI_FTYPE_SI_V4SI_UQI:
+ case V4DI_FTYPE_V2DI_V4DI_UQI:
+ case V4DI_FTYPE_DI_V4DI_UQI:
+ case V2DI_FTYPE_V2DI_V2DI_UQI:
+ case V2DI_FTYPE_DI_V2DI_UQI:
+ case V64QI_FTYPE_V64QI_V64QI_UDI:
+ case V64QI_FTYPE_V16QI_V64QI_UDI:
+ case V64QI_FTYPE_QI_V64QI_UDI:
+ case V32QI_FTYPE_V32QI_V32QI_USI:
+ case V32QI_FTYPE_V16QI_V32QI_USI:
+ case V32QI_FTYPE_QI_V32QI_USI:
+ case V16QI_FTYPE_V16QI_V16QI_UHI:
+ case V16QI_FTYPE_QI_V16QI_UHI:
+ case V32HI_FTYPE_V8HI_V32HI_USI:
+ case V32HI_FTYPE_HI_V32HI_USI:
+ case V16HI_FTYPE_V8HI_V16HI_UHI:
+ case V16HI_FTYPE_HI_V16HI_UHI:
+ case V8HI_FTYPE_V8HI_V8HI_UQI:
+ case V8HI_FTYPE_HI_V8HI_UQI:
+ case V8SF_FTYPE_V8HI_V8SF_UQI:
+ case V4SF_FTYPE_V8HI_V4SF_UQI:
+ case V8SI_FTYPE_V8SF_V8SI_UQI:
+ case V4SI_FTYPE_V4SF_V4SI_UQI:
+ case V4DI_FTYPE_V4SF_V4DI_UQI:
+ case V2DI_FTYPE_V4SF_V2DI_UQI:
+ case V4SF_FTYPE_V4DI_V4SF_UQI:
+ case V4SF_FTYPE_V2DI_V4SF_UQI:
+ case V4DF_FTYPE_V4DI_V4DF_UQI:
+ case V2DF_FTYPE_V2DI_V2DF_UQI:
+ case V16QI_FTYPE_V8HI_V16QI_UQI:
+ case V16QI_FTYPE_V16HI_V16QI_UHI:
+ case V16QI_FTYPE_V4SI_V16QI_UQI:
+ case V16QI_FTYPE_V8SI_V16QI_UQI:
+ case V8HI_FTYPE_V4SI_V8HI_UQI:
+ case V8HI_FTYPE_V8SI_V8HI_UQI:
+ case V16QI_FTYPE_V2DI_V16QI_UQI:
+ case V16QI_FTYPE_V4DI_V16QI_UQI:
+ case V8HI_FTYPE_V2DI_V8HI_UQI:
+ case V8HI_FTYPE_V4DI_V8HI_UQI:
+ case V4SI_FTYPE_V2DI_V4SI_UQI:
+ case V4SI_FTYPE_V4DI_V4SI_UQI:
+ case V32QI_FTYPE_V32HI_V32QI_USI:
+ case UHI_FTYPE_V16QI_V16QI_UHI:
+ case USI_FTYPE_V32QI_V32QI_USI:
+ case UDI_FTYPE_V64QI_V64QI_UDI:
+ case UQI_FTYPE_V8HI_V8HI_UQI:
+ case UHI_FTYPE_V16HI_V16HI_UHI:
+ case USI_FTYPE_V32HI_V32HI_USI:
+ case UQI_FTYPE_V4SI_V4SI_UQI:
+ case UQI_FTYPE_V8SI_V8SI_UQI:
+ case UQI_FTYPE_V2DI_V2DI_UQI:
+ case UQI_FTYPE_V4DI_V4DI_UQI:
+ case V4SF_FTYPE_V2DF_V4SF_UQI:
+ case V4SF_FTYPE_V4DF_V4SF_UQI:
+ case V16SI_FTYPE_V16SI_V16SI_UHI:
+ case V16SI_FTYPE_V4SI_V16SI_UHI:
+ case V2DI_FTYPE_V4SI_V2DI_UQI:
+ case V2DI_FTYPE_V8HI_V2DI_UQI:
+ case V2DI_FTYPE_V16QI_V2DI_UQI:
+ case V4DI_FTYPE_V4DI_V4DI_UQI:
+ case V4DI_FTYPE_V4SI_V4DI_UQI:
+ case V4DI_FTYPE_V8HI_V4DI_UQI:
+ case V4DI_FTYPE_V16QI_V4DI_UQI:
+ case V4DI_FTYPE_V4DF_V4DI_UQI:
+ case V2DI_FTYPE_V2DF_V2DI_UQI:
+ case V4SI_FTYPE_V4DF_V4SI_UQI:
+ case V4SI_FTYPE_V2DF_V4SI_UQI:
+ case V4SI_FTYPE_V8HI_V4SI_UQI:
+ case V4SI_FTYPE_V16QI_V4SI_UQI:
+ case V4DI_FTYPE_V4DI_V4DI_V4DI:
+ case V8DF_FTYPE_V2DF_V8DF_UQI:
+ case V8DF_FTYPE_V4DF_V8DF_UQI:
+ case V8DF_FTYPE_V8DF_V8DF_UQI:
+ case V8SF_FTYPE_V8SF_V8SF_UQI:
+ case V8SF_FTYPE_V8SI_V8SF_UQI:
+ case V4DF_FTYPE_V4DF_V4DF_UQI:
+ case V4SF_FTYPE_V4SF_V4SF_UQI:
+ case V2DF_FTYPE_V2DF_V2DF_UQI:
+ case V2DF_FTYPE_V4SF_V2DF_UQI:
+ case V2DF_FTYPE_V4SI_V2DF_UQI:
+ case V4SF_FTYPE_V4SI_V4SF_UQI:
+ case V4DF_FTYPE_V4SF_V4DF_UQI:
+ case V4DF_FTYPE_V4SI_V4DF_UQI:
+ case V8SI_FTYPE_V8SI_V8SI_UQI:
+ case V8SI_FTYPE_V8HI_V8SI_UQI:
+ case V8SI_FTYPE_V16QI_V8SI_UQI:
+ case V8DF_FTYPE_V8SI_V8DF_UQI:
+ case V8DI_FTYPE_DI_V8DI_UQI:
+ case V16SF_FTYPE_V8SF_V16SF_UHI:
+ case V16SI_FTYPE_V8SI_V16SI_UHI:
+ case V16HI_FTYPE_V16HI_V16HI_UHI:
+ case V8HI_FTYPE_V16QI_V8HI_UQI:
+ case V16HI_FTYPE_V16QI_V16HI_UHI:
+ case V32HI_FTYPE_V32HI_V32HI_USI:
+ case V32HI_FTYPE_V32QI_V32HI_USI:
+ case V8DI_FTYPE_V16QI_V8DI_UQI:
+ case V8DI_FTYPE_V2DI_V8DI_UQI:
+ case V8DI_FTYPE_V4DI_V8DI_UQI:
+ case V8DI_FTYPE_V8DI_V8DI_UQI:
+ case V8DI_FTYPE_V8HI_V8DI_UQI:
+ case V8DI_FTYPE_V8SI_V8DI_UQI:
+ case V8HI_FTYPE_V8DI_V8HI_UQI:
+ case V8SI_FTYPE_V8DI_V8SI_UQI:
+ case V4SI_FTYPE_V4SI_V4SI_V4SI:
+ case V16SI_FTYPE_V16SI_V16SI_V16SI:
+ case V8DI_FTYPE_V8DI_V8DI_V8DI:
+ case V32HI_FTYPE_V32HI_V32HI_V32HI:
+ case V2DI_FTYPE_V2DI_V2DI_V2DI:
+ case V16HI_FTYPE_V16HI_V16HI_V16HI:
+ case V8SI_FTYPE_V8SI_V8SI_V8SI:
+ case V8HI_FTYPE_V8HI_V8HI_V8HI:
+ nargs = 3;
+ break;
+ case V32QI_FTYPE_V32QI_V32QI_INT:
+ case V16HI_FTYPE_V16HI_V16HI_INT:
+ case V16QI_FTYPE_V16QI_V16QI_INT:
+ case V4DI_FTYPE_V4DI_V4DI_INT:
+ case V8HI_FTYPE_V8HI_V8HI_INT:
+ case V8SI_FTYPE_V8SI_V8SI_INT:
+ case V8SI_FTYPE_V8SI_V4SI_INT:
+ case V8SF_FTYPE_V8SF_V8SF_INT:
+ case V8SF_FTYPE_V8SF_V4SF_INT:
+ case V4SI_FTYPE_V4SI_V4SI_INT:
+ case V4DF_FTYPE_V4DF_V4DF_INT:
+ case V16SF_FTYPE_V16SF_V16SF_INT:
+ case V16SF_FTYPE_V16SF_V4SF_INT:
+ case V16SI_FTYPE_V16SI_V4SI_INT:
+ case V4DF_FTYPE_V4DF_V2DF_INT:
+ case V4SF_FTYPE_V4SF_V4SF_INT:
+ case V2DI_FTYPE_V2DI_V2DI_INT:
+ case V4DI_FTYPE_V4DI_V2DI_INT:
+ case V2DF_FTYPE_V2DF_V2DF_INT:
+ case UQI_FTYPE_V8DI_V8UDI_INT:
+ case UQI_FTYPE_V8DF_V8DF_INT:
+ case UQI_FTYPE_V2DF_V2DF_INT:
+ case UQI_FTYPE_V4SF_V4SF_INT:
+ case UHI_FTYPE_V16SI_V16SI_INT:
+ case UHI_FTYPE_V16SF_V16SF_INT:
+ case V64QI_FTYPE_V64QI_V64QI_INT:
+ case V32HI_FTYPE_V32HI_V32HI_INT:
+ case V16SI_FTYPE_V16SI_V16SI_INT:
+ case V8DI_FTYPE_V8DI_V8DI_INT:
+ nargs = 3;
+ nargs_constant = 1;
+ break;
+ case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
+ nargs = 3;
+ rmode = V4DImode;
+ nargs_constant = 1;
+ break;
+ case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
+ nargs = 3;
+ rmode = V2DImode;
+ nargs_constant = 1;
+ break;
+ case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
+ nargs = 3;
+ rmode = DImode;
+ nargs_constant = 1;
+ break;
+ case V2DI_FTYPE_V2DI_UINT_UINT:
+ nargs = 3;
+ nargs_constant = 2;
+ break;
+ case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
+ nargs = 3;
+ rmode = V8DImode;
+ nargs_constant = 1;
+ break;
+ case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
+ nargs = 5;
+ rmode = V8DImode;
+ mask_pos = 2;
+ nargs_constant = 1;
+ break;
+ case QI_FTYPE_V8DF_INT_UQI:
+ case QI_FTYPE_V4DF_INT_UQI:
+ case QI_FTYPE_V2DF_INT_UQI:
+ case HI_FTYPE_V16SF_INT_UHI:
+ case QI_FTYPE_V8SF_INT_UQI:
+ case QI_FTYPE_V4SF_INT_UQI:
+ case V4SI_FTYPE_V4SI_V4SI_UHI:
+ case V8SI_FTYPE_V8SI_V8SI_UHI:
+ nargs = 3;
+ mask_pos = 1;
+ nargs_constant = 1;
+ break;
+ case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
+ nargs = 5;
+ rmode = V4DImode;
+ mask_pos = 2;
+ nargs_constant = 1;
+ break;
+ case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
+ nargs = 5;
+ rmode = V2DImode;
+ mask_pos = 2;
+ nargs_constant = 1;
+ break;
+ case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
+ case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
+ case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
+ case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
+ case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
+ case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
+ case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
+ case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
+ case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
+ case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
+ case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
+ case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
+ case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
+ case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
+ case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
+ case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
+ case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
+ case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
+ case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
+ case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
+ case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
+ case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
+ case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
+ case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
+ case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
+ case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
+ case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
+ case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
+ case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
+ case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
+ case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
+ case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
+ case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
+ case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
+ case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
+ case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
+ case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
+ case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
+ case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
+ case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
+ case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
+ case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
+ case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
+ case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
+ case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
+ case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
+ case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
+ case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
+ case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
+ case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
+ case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
+ nargs = 4;
+ break;
+ case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
+ case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
+ case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
+ case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
+ case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
+ nargs = 4;
+ nargs_constant = 1;
+ break;
+ case UQI_FTYPE_V4DI_V4DI_INT_UQI:
+ case UQI_FTYPE_V8SI_V8SI_INT_UQI:
+ case QI_FTYPE_V4DF_V4DF_INT_UQI:
+ case QI_FTYPE_V8SF_V8SF_INT_UQI:
+ case UQI_FTYPE_V2DI_V2DI_INT_UQI:
+ case UQI_FTYPE_V4SI_V4SI_INT_UQI:
+ case UQI_FTYPE_V2DF_V2DF_INT_UQI:
+ case UQI_FTYPE_V4SF_V4SF_INT_UQI:
+ case UDI_FTYPE_V64QI_V64QI_INT_UDI:
+ case USI_FTYPE_V32QI_V32QI_INT_USI:
+ case UHI_FTYPE_V16QI_V16QI_INT_UHI:
+ case USI_FTYPE_V32HI_V32HI_INT_USI:
+ case UHI_FTYPE_V16HI_V16HI_INT_UHI:
+ case UQI_FTYPE_V8HI_V8HI_INT_UQI:
+ case V32HI_FTYPE_V32HI_V32HI_V32HI_INT:
+ case V16HI_FTYPE_V16HI_V16HI_V16HI_INT:
+ case V8HI_FTYPE_V8HI_V8HI_V8HI_INT:
+ case V8SI_FTYPE_V8SI_V8SI_V8SI_INT:
+ case V4DI_FTYPE_V4DI_V4DI_V4DI_INT:
+ case V8DI_FTYPE_V8DI_V8DI_V8DI_INT:
+ case V16SI_FTYPE_V16SI_V16SI_V16SI_INT:
+ case V2DI_FTYPE_V2DI_V2DI_V2DI_INT:
+ case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
+ nargs = 4;
+ mask_pos = 1;
+ nargs_constant = 1;
+ break;
+ case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
+ nargs = 4;
+ nargs_constant = 2;
+ break;
+ case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
+ case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
+ nargs = 4;
+ break;
+ case UQI_FTYPE_V8DI_V8DI_INT_UQI:
+ case UHI_FTYPE_V16SI_V16SI_INT_UHI:
+ mask_pos = 1;
+ nargs = 4;
+ nargs_constant = 1;
+ break;
+ case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
+ case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
+ case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
+ case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
+ case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
+ case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
+ case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
+ case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
+ case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
+ case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
+ case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
+ case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
+ case V32HI_FTYPE_V32HI_INT_V32HI_USI:
+ case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
+ case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
+ case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
+ case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
+ case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
+ case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
+ case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
+ case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
+ case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
+ case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
+ case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
+ case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
+ case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
+ case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
+ case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
+ case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
+ case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
+ nargs = 4;
+ mask_pos = 2;
+ nargs_constant = 1;
+ break;
+ case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
+ case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
+ case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
+ case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
+ case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
+ case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
+ case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
+ case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
+ case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
+ case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
+ case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
+ case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
+ case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
+ case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
+ case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
+ case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
+ case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
+ case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
+ case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
+ case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
+ case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
+ case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
+ case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
+ case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
+ case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
+ case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
+ case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
+ nargs = 5;
+ mask_pos = 2;
+ nargs_constant = 1;
+ break;
+ case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
+ case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
+ case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
+ case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
+ case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
+ case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
+ case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
+ case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
+ case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
+ case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
+ nargs = 5;
+ mask_pos = 1;
+ nargs_constant = 1;
+ break;
+ case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
+ case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
+ case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
+ case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
+ case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
+ case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
+ case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
+ case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
+ case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
+ case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
+ case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
+ case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
+ nargs = 5;
+ mask_pos = 1;
+ nargs_constant = 2;
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ gcc_assert (nargs <= ARRAY_SIZE (args));
+
+ if (comparison != UNKNOWN)
+ {
+ gcc_assert (nargs == 2);
+ return ix86_expand_sse_compare (d, exp, target, swap);
+ }
+
+ if (rmode == VOIDmode || rmode == tmode)
+ {
+ if (optimize
+ || target == 0
+ || GET_MODE (target) != tmode
+ || !insn_p->operand[0].predicate (target, tmode))
+ target = gen_reg_rtx (tmode);
+ else if (memory_operand (target, tmode))
+ num_memory++;
+ real_target = target;
+ }
+ else
+ {
+ real_target = gen_reg_rtx (tmode);
+ target = lowpart_subreg (rmode, real_target, tmode);
+ }
+
+ for (i = 0; i < nargs; i++)
+ {
+ tree arg = CALL_EXPR_ARG (exp, i);
+ rtx op = expand_normal (arg);
+ machine_mode mode = insn_p->operand[i + 1].mode;
+ bool match = insn_p->operand[i + 1].predicate (op, mode);
+
+ if (second_arg_count && i == 1)
+ {
+ /* SIMD shift insns take either an 8-bit immediate or a
+ register as the count, but the builtin functions take an
+ int. If the count doesn't match the predicate, put it in a
+ register. The instructions use a 64-bit count; if op is
+ only 32 bits wide, zero-extend it, since negative shift
+ counts are undefined behavior and zero extension is more
+ efficient. */
+ if (!match)
+ {
+ if (SCALAR_INT_MODE_P (GET_MODE (op)))
+ op = convert_modes (mode, GET_MODE (op), op, 1);
+ else
+ op = lowpart_subreg (mode, op, GET_MODE (op));
+ if (!insn_p->operand[i + 1].predicate (op, mode))
+ op = copy_to_reg (op);
+ }
+ }
+ else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
+ || (!mask_pos && (nargs - i) <= nargs_constant))
+ {
+ if (!match)
+ switch (icode)
+ {
+ case CODE_FOR_avx_vinsertf128v4di:
+ case CODE_FOR_avx_vextractf128v4di:
+ error ("the last argument must be a 1-bit immediate");
+ return const0_rtx;
+
+ case CODE_FOR_avx512f_cmpv8di3_mask:
+ case CODE_FOR_avx512f_cmpv16si3_mask:
+ case CODE_FOR_avx512f_ucmpv8di3_mask:
+ case CODE_FOR_avx512f_ucmpv16si3_mask:
+ case CODE_FOR_avx512vl_cmpv4di3_mask:
+ case CODE_FOR_avx512vl_cmpv8si3_mask:
+ case CODE_FOR_avx512vl_ucmpv4di3_mask:
+ case CODE_FOR_avx512vl_ucmpv8si3_mask:
+ case CODE_FOR_avx512vl_cmpv2di3_mask:
+ case CODE_FOR_avx512vl_cmpv4si3_mask:
+ case CODE_FOR_avx512vl_ucmpv2di3_mask:
+ case CODE_FOR_avx512vl_ucmpv4si3_mask:
+ error ("the last argument must be a 3-bit immediate");
+ return const0_rtx;
+
+ case CODE_FOR_sse4_1_roundsd:
+ case CODE_FOR_sse4_1_roundss:
+
+ case CODE_FOR_sse4_1_roundpd:
+ case CODE_FOR_sse4_1_roundps:
+ case CODE_FOR_avx_roundpd256:
+ case CODE_FOR_avx_roundps256:
+
+ case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
+ case CODE_FOR_sse4_1_roundps_sfix:
+ case CODE_FOR_avx_roundpd_vec_pack_sfix256:
+ case CODE_FOR_avx_roundps_sfix256:
+
+ case CODE_FOR_sse4_1_blendps:
+ case CODE_FOR_avx_blendpd256:
+ case CODE_FOR_avx_vpermilv4df:
+ case CODE_FOR_avx_vpermilv4df_mask:
+ case CODE_FOR_avx512f_getmantv8df_mask:
+ case CODE_FOR_avx512f_getmantv16sf_mask:
+ case CODE_FOR_avx512vl_getmantv8sf_mask:
+ case CODE_FOR_avx512vl_getmantv4df_mask:
+ case CODE_FOR_avx512vl_getmantv4sf_mask:
+ case CODE_FOR_avx512vl_getmantv2df_mask:
+ case CODE_FOR_avx512dq_rangepv8df_mask_round:
+ case CODE_FOR_avx512dq_rangepv16sf_mask_round:
+ case CODE_FOR_avx512dq_rangepv4df_mask:
+ case CODE_FOR_avx512dq_rangepv8sf_mask:
+ case CODE_FOR_avx512dq_rangepv2df_mask:
+ case CODE_FOR_avx512dq_rangepv4sf_mask:
+ case CODE_FOR_avx_shufpd256_mask:
+ error ("the last argument must be a 4-bit immediate");
+ return const0_rtx;
+
+ case CODE_FOR_sha1rnds4:
+ case CODE_FOR_sse4_1_blendpd:
+ case CODE_FOR_avx_vpermilv2df:
+ case CODE_FOR_avx_vpermilv2df_mask:
+ case CODE_FOR_xop_vpermil2v2df3:
+ case CODE_FOR_xop_vpermil2v4sf3:
+ case CODE_FOR_xop_vpermil2v4df3:
+ case CODE_FOR_xop_vpermil2v8sf3:
+ case CODE_FOR_avx512f_vinsertf32x4_mask:
+ case CODE_FOR_avx512f_vinserti32x4_mask:
+ case CODE_FOR_avx512f_vextractf32x4_mask:
+ case CODE_FOR_avx512f_vextracti32x4_mask:
+ case CODE_FOR_sse2_shufpd:
+ case CODE_FOR_sse2_shufpd_mask:
+ case CODE_FOR_avx512dq_shuf_f64x2_mask:
+ case CODE_FOR_avx512dq_shuf_i64x2_mask:
+ case CODE_FOR_avx512vl_shuf_i32x4_mask:
+ case CODE_FOR_avx512vl_shuf_f32x4_mask:
+ error ("the last argument must be a 2-bit immediate");
+ return const0_rtx;
+
+ case CODE_FOR_avx_vextractf128v4df:
+ case CODE_FOR_avx_vextractf128v8sf:
+ case CODE_FOR_avx_vextractf128v8si:
+ case CODE_FOR_avx_vinsertf128v4df:
+ case CODE_FOR_avx_vinsertf128v8sf:
+ case CODE_FOR_avx_vinsertf128v8si:
+ case CODE_FOR_avx512f_vinsertf64x4_mask:
+ case CODE_FOR_avx512f_vinserti64x4_mask:
+ case CODE_FOR_avx512f_vextractf64x4_mask:
+ case CODE_FOR_avx512f_vextracti64x4_mask:
+ case CODE_FOR_avx512dq_vinsertf32x8_mask:
+ case CODE_FOR_avx512dq_vinserti32x8_mask:
+ case CODE_FOR_avx512vl_vinsertv4df:
+ case CODE_FOR_avx512vl_vinsertv4di:
+ case CODE_FOR_avx512vl_vinsertv8sf:
+ case CODE_FOR_avx512vl_vinsertv8si:
+ error ("the last argument must be a 1-bit immediate");
+ return const0_rtx;
+
+ case CODE_FOR_avx_vmcmpv2df3:
+ case CODE_FOR_avx_vmcmpv4sf3:
+ case CODE_FOR_avx_cmpv2df3:
+ case CODE_FOR_avx_cmpv4sf3:
+ case CODE_FOR_avx_cmpv4df3:
+ case CODE_FOR_avx_cmpv8sf3:
+ case CODE_FOR_avx512f_cmpv8df3_mask:
+ case CODE_FOR_avx512f_cmpv16sf3_mask:
+ case CODE_FOR_avx512f_vmcmpv2df3_mask:
+ case CODE_FOR_avx512f_vmcmpv4sf3_mask:
+ error ("the last argument must be a 5-bit immediate");
+ return const0_rtx;
+
+ default:
+ switch (nargs_constant)
+ {
+ case 2:
+ if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
+ || (!mask_pos && (nargs - i) == nargs_constant))
+ {
+ error ("the next to last argument must be an 8-bit immediate");
+ break;
+ }
+ /* FALLTHRU */
+ case 1:
+ error ("the last argument must be an 8-bit immediate");
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ return const0_rtx;
+ }
+ }
+ else
+ {
+ if (VECTOR_MODE_P (mode))
+ op = safe_vector_operand (op, mode);
+
+ /* If we aren't optimizing, only allow one memory operand to
+ be generated. */
+ if (memory_operand (op, mode))
+ num_memory++;
+
+ op = fixup_modeless_constant (op, mode);
+
+ if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
+ {
+ if (optimize || !match || num_memory > 1)
+ op = copy_to_mode_reg (mode, op);
+ }
+ else
+ {
+ op = copy_to_reg (op);
+ op = lowpart_subreg (mode, op, GET_MODE (op));
+ }
+ }
+
+ args[i].op = op;
+ args[i].mode = mode;
+ }
+
+ switch (nargs)
+ {
+ case 1:
+ pat = GEN_FCN (icode) (real_target, args[0].op);
+ break;
+ case 2:
+ pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
+ break;
+ case 3:
+ pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
+ args[2].op);
+ break;
+ case 4:
+ pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
+ args[2].op, args[3].op);
+ break;
+ case 5:
+ pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
+ args[2].op, args[3].op, args[4].op);
+ break;
+ case 6:
+ pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
+ args[2].op, args[3].op, args[4].op,
+ args[5].op);
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ if (! pat)
+ return 0;
+
+ emit_insn (pat);
+ return target;
+}
+
+/* Transform a pattern of the following layout:
+ (set A
+ (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
+ into:
+ (set A B)
+ i.e. strip the embedded rounding unspec from the source. */
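+
+/* This is used by the callers below when the rounding-control
+ argument turns out to be NO_ROUND, so that the plain source
+ operand B is used and the normal form of the pattern results. */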
+
+static rtx
+ix86_erase_embedded_rounding (rtx pat)
+{
+ if (GET_CODE (pat) == INSN)
+ pat = PATTERN (pat);
+
+ gcc_assert (GET_CODE (pat) == SET);
+ rtx src = SET_SRC (pat);
+ gcc_assert (XVECLEN (src, 0) == 2);
+ rtx p0 = XVECEXP (src, 0, 0);
+ gcc_assert (GET_CODE (src) == UNSPEC
+ && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
+ rtx res = gen_rtx_SET (SET_DEST (pat), p0);
+ return res;
+}
+
+/* Subroutine of ix86_expand_round_builtin to take care of comi insns
+ with rounding. */
+static rtx
+ix86_expand_sse_comi_round (const struct builtin_description *d,
+ tree exp, rtx target)
+{
+ rtx pat, set_dst;
+ tree arg0 = CALL_EXPR_ARG (exp, 0);
+ tree arg1 = CALL_EXPR_ARG (exp, 1);
+ tree arg2 = CALL_EXPR_ARG (exp, 2);
+ tree arg3 = CALL_EXPR_ARG (exp, 3);
+ rtx op0 = expand_normal (arg0);
+ rtx op1 = expand_normal (arg1);
+ rtx op2 = expand_normal (arg2);
+ rtx op3 = expand_normal (arg3);
+ enum insn_code icode = d->icode;
+ const struct insn_data_d *insn_p = &insn_data[icode];
+ machine_mode mode0 = insn_p->operand[0].mode;
+ machine_mode mode1 = insn_p->operand[1].mode;
+ enum rtx_code comparison = UNEQ;
+ bool need_ucomi = false;
+
+ /* See avxintrin.h for values. */
+ enum rtx_code comi_comparisons[32] =
+ {
+ UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
+ UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
+ UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
+ };
+ bool need_ucomi_values[32] =
+ {
+ true, false, false, true, true, false, false, true,
+ true, false, false, true, true, false, false, true,
+ false, true, true, false, false, true, true, false,
+ false, true, true, false, false, true, true, false
+ };
+
+ if (!CONST_INT_P (op2))
+ {
+ error ("the third argument must be a comparison constant");
+ return const0_rtx;
+ }
+ if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
+ {
+ error ("incorrect comparison mode");
+ return const0_rtx;
+ }
+
+ if (!insn_p->operand[2].predicate (op3, SImode))
+ {
+ error ("incorrect rounding operand");
+ return const0_rtx;
+ }
+
+ comparison = comi_comparisons[INTVAL (op2)];
+ need_ucomi = need_ucomi_values[INTVAL (op2)];
+
+ if (VECTOR_MODE_P (mode0))
+ op0 = safe_vector_operand (op0, mode0);
+ if (VECTOR_MODE_P (mode1))
+ op1 = safe_vector_operand (op1, mode1);
+
+ target = gen_reg_rtx (SImode);
+ emit_move_insn (target, const0_rtx);
+ target = gen_rtx_SUBREG (QImode, target, 0);
+
+ if ((optimize && !register_operand (op0, mode0))
+ || !insn_p->operand[0].predicate (op0, mode0))
+ op0 = copy_to_mode_reg (mode0, op0);
+ if ((optimize && !register_operand (op1, mode1))
+ || !insn_p->operand[1].predicate (op1, mode1))
+ op1 = copy_to_mode_reg (mode1, op1);
+
+ if (need_ucomi)
+ icode = icode == CODE_FOR_sse_comi_round
+ ? CODE_FOR_sse_ucomi_round
+ : CODE_FOR_sse2_ucomi_round;
+
+ pat = GEN_FCN (icode) (op0, op1, op3);
+ if (! pat)
+ return 0;
+
+ /* The rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
+ if (INTVAL (op3) == NO_ROUND)
+ {
+ pat = ix86_erase_embedded_rounding (pat);
+ if (! pat)
+ return 0;
+
+ set_dst = SET_DEST (pat);
+ }
+ else
+ {
+ gcc_assert (GET_CODE (pat) == SET);
+ set_dst = SET_DEST (pat);
+ }
+
+ emit_insn (pat);
+ emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
+ gen_rtx_fmt_ee (comparison, QImode,
+ set_dst,
+ const0_rtx)));
+
+ return SUBREG_REG (target);
+}
+
+static rtx
+ix86_expand_round_builtin (const struct builtin_description *d,
+ tree exp, rtx target)
+{
+ rtx pat;
+ unsigned int i, nargs;
+ struct
+ {
+ rtx op;
+ machine_mode mode;
+ } args[6];
+ enum insn_code icode = d->icode;
+ const struct insn_data_d *insn_p = &insn_data[icode];
+ machine_mode tmode = insn_p->operand[0].mode;
+ unsigned int nargs_constant = 0;
+ unsigned int redundant_embed_rnd = 0;
+
+ switch ((enum ix86_builtin_func_type) d->flag)
+ {
+ case UINT64_FTYPE_V2DF_INT:
+ case UINT64_FTYPE_V4SF_INT:
+ case UINT_FTYPE_V2DF_INT:
+ case UINT_FTYPE_V4SF_INT:
+ case INT64_FTYPE_V2DF_INT:
+ case INT64_FTYPE_V4SF_INT:
+ case INT_FTYPE_V2DF_INT:
+ case INT_FTYPE_V4SF_INT:
+ nargs = 2;
+ break;
+ case V4SF_FTYPE_V4SF_UINT_INT:
+ case V4SF_FTYPE_V4SF_UINT64_INT:
+ case V2DF_FTYPE_V2DF_UINT64_INT:
+ case V4SF_FTYPE_V4SF_INT_INT:
+ case V4SF_FTYPE_V4SF_INT64_INT:
+ case V2DF_FTYPE_V2DF_INT64_INT:
+ case V4SF_FTYPE_V4SF_V4SF_INT:
+ case V2DF_FTYPE_V2DF_V2DF_INT:
+ case V4SF_FTYPE_V4SF_V2DF_INT:
+ case V2DF_FTYPE_V2DF_V4SF_INT:
+ nargs = 3;
+ break;
+ case V8SF_FTYPE_V8DF_V8SF_QI_INT:
+ case V8DF_FTYPE_V8DF_V8DF_QI_INT:
+ case V8SI_FTYPE_V8DF_V8SI_QI_INT:
+ case V8DI_FTYPE_V8DF_V8DI_QI_INT:
+ case V8SF_FTYPE_V8DI_V8SF_QI_INT:
+ case V8DF_FTYPE_V8DI_V8DF_QI_INT:
+ case V16SF_FTYPE_V16SF_V16SF_HI_INT:
+ case V8DI_FTYPE_V8SF_V8DI_QI_INT:
+ case V16SF_FTYPE_V16SI_V16SF_HI_INT:
+ case V16SI_FTYPE_V16SF_V16SI_HI_INT:
+ case V8DF_FTYPE_V8SF_V8DF_QI_INT:
+ case V16SF_FTYPE_V16HI_V16SF_HI_INT:
+ case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
+ case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
+ nargs = 4;
+ break;
+ case V4SF_FTYPE_V4SF_V4SF_INT_INT:
+ case V2DF_FTYPE_V2DF_V2DF_INT_INT:
+ nargs_constant = 2;
+ nargs = 4;
+ break;
+ case INT_FTYPE_V4SF_V4SF_INT_INT:
+ case INT_FTYPE_V2DF_V2DF_INT_INT:
+ return ix86_expand_sse_comi_round (d, exp, target);
+ case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
+ case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
+ case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
+ case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
+ case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
+ case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
+ case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
+ case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
+ nargs = 5;
+ break;
+ case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
+ case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
+ nargs_constant = 4;
+ nargs = 5;
+ break;
+ case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
+ case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
+ case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
+ case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
+ nargs_constant = 3;
+ nargs = 5;
+ break;
+ case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
+ case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
+ case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
+ case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
+ case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
+ case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
+ nargs = 6;
+ nargs_constant = 4;
+ break;
+ case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
+ case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
+ case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
+ case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
+ nargs = 6;
+ nargs_constant = 3;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ gcc_assert (nargs <= ARRAY_SIZE (args));
+
+ if (optimize
+ || target == 0
+ || GET_MODE (target) != tmode
+ || !insn_p->operand[0].predicate (target, tmode))
+ target = gen_reg_rtx (tmode);
+
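+ /* The last call argument of every rounding builtin is the
+ embedded rounding / SAE control. When nargs_constant is nonzero,
+ the argument at index nargs - nargs_constant must additionally be
+ an immediate (e.g. a comparison predicate or an imm8), which the
+ checks in the loop below enforce. */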
+ for (i = 0; i < nargs; i++)
+ {
+ tree arg = CALL_EXPR_ARG (exp, i);
+ rtx op = expand_normal (arg);
+ machine_mode mode = insn_p->operand[i + 1].mode;
+ bool match = insn_p->operand[i + 1].predicate (op, mode);
+
+ if (i == nargs - nargs_constant)
+ {
+ if (!match)
+ {
+ switch (icode)
+ {
+ case CODE_FOR_avx512f_getmantv8df_mask_round:
+ case CODE_FOR_avx512f_getmantv16sf_mask_round:
+ case CODE_FOR_avx512f_vgetmantv2df_round:
+ case CODE_FOR_avx512f_vgetmantv2df_mask_round:
+ case CODE_FOR_avx512f_vgetmantv4sf_round:
+ case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
+ error ("the immediate argument must be a 4-bit immediate");
+ return const0_rtx;
+ case CODE_FOR_avx512f_cmpv8df3_mask_round:
+ case CODE_FOR_avx512f_cmpv16sf3_mask_round:
+ case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
+ case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
+ error ("the immediate argument must be a 5-bit immediate");
+ return const0_rtx;
+ default:
+ error ("the immediate argument must be an 8-bit immediate");
+ return const0_rtx;
+ }
+ }
+ }
+ else if (i == nargs-1)
+ {
+ if (!insn_p->operand[nargs].predicate (op, SImode))
+ {
+ error ("incorrect rounding operand");
+ return const0_rtx;
+ }
+
+ /* If there is no rounding, use the normal version of the pattern. */
+ if (INTVAL (op) == NO_ROUND)
+ redundant_embed_rnd = 1;
+ }
+ else
+ {
+ if (VECTOR_MODE_P (mode))
+ op = safe_vector_operand (op, mode);
+
+ op = fixup_modeless_constant (op, mode);
+
+ if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
+ {
+ if (optimize || !match)
+ op = copy_to_mode_reg (mode, op);
+ }
+ else
+ {
+ op = copy_to_reg (op);
+ op = lowpart_subreg (mode, op, GET_MODE (op));
+ }
+ }
+
+ args[i].op = op;
+ args[i].mode = mode;
+ }
+
+ switch (nargs)
+ {
+ case 1:
+ pat = GEN_FCN (icode) (target, args[0].op);
+ break;
+ case 2:
+ pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
+ break;
+ case 3:
+ pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
+ args[2].op);
+ break;
+ case 4:
+ pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
+ args[2].op, args[3].op);
+ break;
+ case 5:
+ pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
+ args[2].op, args[3].op, args[4].op);
+ break;
+ case 6:
+ pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
+ args[2].op, args[3].op, args[4].op,
+ args[5].op);
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ if (!pat)
+ return 0;
+
+ if (redundant_embed_rnd)
+ pat = ix86_erase_embedded_rounding (pat);
+
+ emit_insn (pat);
+ return target;
+}
+
+/* Subroutine of ix86_expand_builtin to take care of special insns
+ with a variable number of operands. */
+
+static rtx
+ix86_expand_special_args_builtin (const struct builtin_description *d,
+ tree exp, rtx target)
+{
+ tree arg;
+ rtx pat, op;
+ unsigned int i, nargs, arg_adjust, memory;
+ bool aligned_mem = false;
+ struct
+ {
+ rtx op;
+ machine_mode mode;
+ } args[3];
+ enum insn_code icode = d->icode;
+ bool last_arg_constant = false;
+ const struct insn_data_d *insn_p = &insn_data[icode];
+ machine_mode tmode = insn_p->operand[0].mode;
+ enum { load, store } klass;
+
+ switch ((enum ix86_builtin_func_type) d->flag)
+ {
+ case VOID_FTYPE_VOID:
+ emit_insn (GEN_FCN (icode) (target));
+ return 0;
+ case VOID_FTYPE_UINT64:
+ case VOID_FTYPE_UNSIGNED:
+ nargs = 0;
+ klass = store;
+ memory = 0;
+ break;
+
+ case INT_FTYPE_VOID:
+ case USHORT_FTYPE_VOID:
+ case UINT64_FTYPE_VOID:
+ case UINT_FTYPE_VOID:
+ case UNSIGNED_FTYPE_VOID:
+ nargs = 0;
+ klass = load;
+ memory = 0;
+ break;
+ case UINT64_FTYPE_PUNSIGNED:
+ case V2DI_FTYPE_PV2DI:
+ case V4DI_FTYPE_PV4DI:
+ case V32QI_FTYPE_PCCHAR:
+ case V16QI_FTYPE_PCCHAR:
+ case V8SF_FTYPE_PCV4SF:
+ case V8SF_FTYPE_PCFLOAT:
+ case V4SF_FTYPE_PCFLOAT:
+ case V4DF_FTYPE_PCV2DF:
+ case V4DF_FTYPE_PCDOUBLE:
+ case V2DF_FTYPE_PCDOUBLE:
+ case VOID_FTYPE_PVOID:
+ case V8DI_FTYPE_PV8DI:
+ nargs = 1;
+ klass = load;
+ memory = 0;
+ switch (icode)
+ {
+ case CODE_FOR_sse4_1_movntdqa:
+ case CODE_FOR_avx2_movntdqa:
+ case CODE_FOR_avx512f_movntdqa:
+ aligned_mem = true;
+ break;
+ default:
+ break;
+ }
+ break;
+ case VOID_FTYPE_PV2SF_V4SF:
+ case VOID_FTYPE_PV8DI_V8DI:
+ case VOID_FTYPE_PV4DI_V4DI:
+ case VOID_FTYPE_PV2DI_V2DI:
+ case VOID_FTYPE_PCHAR_V32QI:
+ case VOID_FTYPE_PCHAR_V16QI:
+ case VOID_FTYPE_PFLOAT_V16SF:
+ case VOID_FTYPE_PFLOAT_V8SF:
+ case VOID_FTYPE_PFLOAT_V4SF:
+ case VOID_FTYPE_PDOUBLE_V8DF:
+ case VOID_FTYPE_PDOUBLE_V4DF:
+ case VOID_FTYPE_PDOUBLE_V2DF:
+ case VOID_FTYPE_PLONGLONG_LONGLONG:
+ case VOID_FTYPE_PULONGLONG_ULONGLONG:
+ case VOID_FTYPE_PUNSIGNED_UNSIGNED:
+ case VOID_FTYPE_PINT_INT:
+ nargs = 1;
+ klass = store;
+ /* Reserve memory operand for target. */
+ memory = ARRAY_SIZE (args);
+ switch (icode)
+ {
+ /* These builtins and instructions require the memory
+ to be properly aligned. */
+ case CODE_FOR_avx_movntv4di:
+ case CODE_FOR_sse2_movntv2di:
+ case CODE_FOR_avx_movntv8sf:
+ case CODE_FOR_sse_movntv4sf:
+ case CODE_FOR_sse4a_vmmovntv4sf:
+ case CODE_FOR_avx_movntv4df:
+ case CODE_FOR_sse2_movntv2df:
+ case CODE_FOR_sse4a_vmmovntv2df:
+ case CODE_FOR_sse2_movntidi:
+ case CODE_FOR_sse_movntq:
+ case CODE_FOR_sse2_movntisi:
+ case CODE_FOR_avx512f_movntv16sf:
+ case CODE_FOR_avx512f_movntv8df:
+ case CODE_FOR_avx512f_movntv8di:
+ aligned_mem = true;
+ break;
+ default:
+ break;
+ }
+ break;
+ case VOID_FTYPE_PVOID_PCVOID:
+ nargs = 1;
+ klass = store;
+ memory = 0;
+ break;
+ case V4SF_FTYPE_V4SF_PCV2SF:
+ case V2DF_FTYPE_V2DF_PCDOUBLE:
+ nargs = 2;
+ klass = load;
+ memory = 1;
+ break;
+ case V8SF_FTYPE_PCV8SF_V8SI:
+ case V4DF_FTYPE_PCV4DF_V4DI:
+ case V4SF_FTYPE_PCV4SF_V4SI:
+ case V2DF_FTYPE_PCV2DF_V2DI:
+ case V8SI_FTYPE_PCV8SI_V8SI:
+ case V4DI_FTYPE_PCV4DI_V4DI:
+ case V4SI_FTYPE_PCV4SI_V4SI:
+ case V2DI_FTYPE_PCV2DI_V2DI:
+ case VOID_FTYPE_INT_INT64:
+ nargs = 2;
+ klass = load;
+ memory = 0;
+ break;
+ case VOID_FTYPE_PV8DF_V8DF_UQI:
+ case VOID_FTYPE_PV4DF_V4DF_UQI:
+ case VOID_FTYPE_PV2DF_V2DF_UQI:
+ case VOID_FTYPE_PV16SF_V16SF_UHI:
+ case VOID_FTYPE_PV8SF_V8SF_UQI:
+ case VOID_FTYPE_PV4SF_V4SF_UQI:
+ case VOID_FTYPE_PV8DI_V8DI_UQI:
+ case VOID_FTYPE_PV4DI_V4DI_UQI:
+ case VOID_FTYPE_PV2DI_V2DI_UQI:
+ case VOID_FTYPE_PV16SI_V16SI_UHI:
+ case VOID_FTYPE_PV8SI_V8SI_UQI:
+ case VOID_FTYPE_PV4SI_V4SI_UQI:
+ case VOID_FTYPE_PV64QI_V64QI_UDI:
+ case VOID_FTYPE_PV32HI_V32HI_USI:
+ case VOID_FTYPE_PV32QI_V32QI_USI:
+ case VOID_FTYPE_PV16QI_V16QI_UHI:
+ case VOID_FTYPE_PV16HI_V16HI_UHI:
+ case VOID_FTYPE_PV8HI_V8HI_UQI:
+ switch (icode)
+ {
+ /* These builtins and instructions require the memory
+ to be properly aligned. */
+ case CODE_FOR_avx512f_storev16sf_mask:
+ case CODE_FOR_avx512f_storev16si_mask:
+ case CODE_FOR_avx512f_storev8df_mask:
+ case CODE_FOR_avx512f_storev8di_mask:
+ case CODE_FOR_avx512vl_storev8sf_mask:
+ case CODE_FOR_avx512vl_storev8si_mask:
+ case CODE_FOR_avx512vl_storev4df_mask:
+ case CODE_FOR_avx512vl_storev4di_mask:
+ case CODE_FOR_avx512vl_storev4sf_mask:
+ case CODE_FOR_avx512vl_storev4si_mask:
+ case CODE_FOR_avx512vl_storev2df_mask:
+ case CODE_FOR_avx512vl_storev2di_mask:
+ aligned_mem = true;
+ break;
+ default:
+ break;
+ }
+ /* FALLTHRU */
+ case VOID_FTYPE_PV8SF_V8SI_V8SF:
+ case VOID_FTYPE_PV4DF_V4DI_V4DF:
+ case VOID_FTYPE_PV4SF_V4SI_V4SF:
+ case VOID_FTYPE_PV2DF_V2DI_V2DF:
+ case VOID_FTYPE_PV8SI_V8SI_V8SI:
+ case VOID_FTYPE_PV4DI_V4DI_V4DI:
+ case VOID_FTYPE_PV4SI_V4SI_V4SI:
+ case VOID_FTYPE_PV2DI_V2DI_V2DI:
+ case VOID_FTYPE_PV8SI_V8DI_UQI:
+ case VOID_FTYPE_PV8HI_V8DI_UQI:
+ case VOID_FTYPE_PV16HI_V16SI_UHI:
+ case VOID_FTYPE_PV16QI_V8DI_UQI:
+ case VOID_FTYPE_PV16QI_V16SI_UHI:
+ case VOID_FTYPE_PV4SI_V4DI_UQI:
+ case VOID_FTYPE_PV4SI_V2DI_UQI:
+ case VOID_FTYPE_PV8HI_V4DI_UQI:
+ case VOID_FTYPE_PV8HI_V2DI_UQI:
+ case VOID_FTYPE_PV8HI_V8SI_UQI:
+ case VOID_FTYPE_PV8HI_V4SI_UQI:
+ case VOID_FTYPE_PV16QI_V4DI_UQI:
+ case VOID_FTYPE_PV16QI_V2DI_UQI:
+ case VOID_FTYPE_PV16QI_V8SI_UQI:
+ case VOID_FTYPE_PV16QI_V4SI_UQI:
+ case VOID_FTYPE_PCHAR_V64QI_UDI:
+ case VOID_FTYPE_PCHAR_V32QI_USI:
+ case VOID_FTYPE_PCHAR_V16QI_UHI:
+ case VOID_FTYPE_PSHORT_V32HI_USI:
+ case VOID_FTYPE_PSHORT_V16HI_UHI:
+ case VOID_FTYPE_PSHORT_V8HI_UQI:
+ case VOID_FTYPE_PINT_V16SI_UHI:
+ case VOID_FTYPE_PINT_V8SI_UQI:
+ case VOID_FTYPE_PINT_V4SI_UQI:
+ case VOID_FTYPE_PINT64_V8DI_UQI:
+ case VOID_FTYPE_PINT64_V4DI_UQI:
+ case VOID_FTYPE_PINT64_V2DI_UQI:
+ case VOID_FTYPE_PDOUBLE_V8DF_UQI:
+ case VOID_FTYPE_PDOUBLE_V4DF_UQI:
+ case VOID_FTYPE_PDOUBLE_V2DF_UQI:
+ case VOID_FTYPE_PFLOAT_V16SF_UHI:
+ case VOID_FTYPE_PFLOAT_V8SF_UQI:
+ case VOID_FTYPE_PFLOAT_V4SF_UQI:
+ case VOID_FTYPE_PV32QI_V32HI_USI:
+ case VOID_FTYPE_PV16QI_V16HI_UHI:
+ case VOID_FTYPE_PV8QI_V8HI_UQI:
+ nargs = 2;
+ klass = store;
+ /* Reserve memory operand for target. */
+ memory = ARRAY_SIZE (args);
+ break;
+ case V4SF_FTYPE_PCV4SF_V4SF_UQI:
+ case V8SF_FTYPE_PCV8SF_V8SF_UQI:
+ case V16SF_FTYPE_PCV16SF_V16SF_UHI:
+ case V4SI_FTYPE_PCV4SI_V4SI_UQI:
+ case V8SI_FTYPE_PCV8SI_V8SI_UQI:
+ case V16SI_FTYPE_PCV16SI_V16SI_UHI:
+ case V2DF_FTYPE_PCV2DF_V2DF_UQI:
+ case V4DF_FTYPE_PCV4DF_V4DF_UQI:
+ case V8DF_FTYPE_PCV8DF_V8DF_UQI:
+ case V2DI_FTYPE_PCV2DI_V2DI_UQI:
+ case V4DI_FTYPE_PCV4DI_V4DI_UQI:
+ case V8DI_FTYPE_PCV8DI_V8DI_UQI:
+ case V64QI_FTYPE_PCV64QI_V64QI_UDI:
+ case V32HI_FTYPE_PCV32HI_V32HI_USI:
+ case V32QI_FTYPE_PCV32QI_V32QI_USI:
+ case V16QI_FTYPE_PCV16QI_V16QI_UHI:
+ case V16HI_FTYPE_PCV16HI_V16HI_UHI:
+ case V8HI_FTYPE_PCV8HI_V8HI_UQI:
+ switch (icode)
+ {
+ /* These builtins and instructions require the memory
+ to be properly aligned. */
+ case CODE_FOR_avx512f_loadv16sf_mask:
+ case CODE_FOR_avx512f_loadv16si_mask:
+ case CODE_FOR_avx512f_loadv8df_mask:
+ case CODE_FOR_avx512f_loadv8di_mask:
+ case CODE_FOR_avx512vl_loadv8sf_mask:
+ case CODE_FOR_avx512vl_loadv8si_mask:
+ case CODE_FOR_avx512vl_loadv4df_mask:
+ case CODE_FOR_avx512vl_loadv4di_mask:
+ case CODE_FOR_avx512vl_loadv4sf_mask:
+ case CODE_FOR_avx512vl_loadv4si_mask:
+ case CODE_FOR_avx512vl_loadv2df_mask:
+ case CODE_FOR_avx512vl_loadv2di_mask:
+ case CODE_FOR_avx512bw_loadv64qi_mask:
+ case CODE_FOR_avx512vl_loadv32qi_mask:
+ case CODE_FOR_avx512vl_loadv16qi_mask:
+ case CODE_FOR_avx512bw_loadv32hi_mask:
+ case CODE_FOR_avx512vl_loadv16hi_mask:
+ case CODE_FOR_avx512vl_loadv8hi_mask:
+ aligned_mem = true;
+ break;
+ default:
+ break;
+ }
+ /* FALLTHRU */
+ case V64QI_FTYPE_PCCHAR_V64QI_UDI:
+ case V32QI_FTYPE_PCCHAR_V32QI_USI:
+ case V16QI_FTYPE_PCCHAR_V16QI_UHI:
+ case V32HI_FTYPE_PCSHORT_V32HI_USI:
+ case V16HI_FTYPE_PCSHORT_V16HI_UHI:
+ case V8HI_FTYPE_PCSHORT_V8HI_UQI:
+ case V16SI_FTYPE_PCINT_V16SI_UHI:
+ case V8SI_FTYPE_PCINT_V8SI_UQI:
+ case V4SI_FTYPE_PCINT_V4SI_UQI:
+ case V8DI_FTYPE_PCINT64_V8DI_UQI:
+ case V4DI_FTYPE_PCINT64_V4DI_UQI:
+ case V2DI_FTYPE_PCINT64_V2DI_UQI:
+ case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
+ case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
+ case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
+ case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
+ case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
+ case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
+ nargs = 3;
+ klass = load;
+ memory = 0;
+ break;
+ case VOID_FTYPE_UINT_UINT_UINT:
+ case VOID_FTYPE_UINT64_UINT_UINT:
+ case UCHAR_FTYPE_UINT_UINT_UINT:
+ case UCHAR_FTYPE_UINT64_UINT_UINT:
+ nargs = 3;
+ klass = load;
+ memory = ARRAY_SIZE (args);
+ last_arg_constant = true;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ gcc_assert (nargs <= ARRAY_SIZE (args));
+
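+ /* For store builtins the first call argument is the destination, so
+ the remaining arguments are shifted by one (see arg_adjust) when
+ they are matched against the insn operands below. */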
+ if (klass == store)
+ {
+ arg = CALL_EXPR_ARG (exp, 0);
+ op = expand_normal (arg);
+ gcc_assert (target == 0);
+ if (memory)
+ {
+ op = ix86_zero_extend_to_Pmode (op);
+ target = gen_rtx_MEM (tmode, op);
+ /* target at this point has just BITS_PER_UNIT MEM_ALIGN
+ on it. Try to improve it using get_pointer_alignment,
+ and if the special builtin is one that requires strict
+ mode alignment, also from its GET_MODE_ALIGNMENT.
+ Failure to do so could lead to ix86_legitimate_combined_insn
+ rejecting all changes to such insns. */
+ unsigned int align = get_pointer_alignment (arg);
+ if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
+ align = GET_MODE_ALIGNMENT (tmode);
+ if (MEM_ALIGN (target) < align)
+ set_mem_align (target, align);
+ }
+ else
+ target = force_reg (tmode, op);
+ arg_adjust = 1;
+ }
+ else
+ {
+ arg_adjust = 0;
+ if (optimize
+ || target == 0
+ || !register_operand (target, tmode)
+ || GET_MODE (target) != tmode)
+ target = gen_reg_rtx (tmode);
+ }
+
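+ /* Expand the remaining call arguments and massage each one until it
+ satisfies the predicate of the corresponding insn operand: the
+ memory operand gets its alignment improved, the trailing immediate
+ (if any) is validated, and everything else is forced into a
+ register of the expected mode. */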
+ for (i = 0; i < nargs; i++)
+ {
+ machine_mode mode = insn_p->operand[i + 1].mode;
+ bool match;
+
+ arg = CALL_EXPR_ARG (exp, i + arg_adjust);
+ op = expand_normal (arg);
+ match = insn_p->operand[i + 1].predicate (op, mode);
+
+ if (last_arg_constant && (i + 1) == nargs)
+ {
+ if (!match)
+ {
+ if (icode == CODE_FOR_lwp_lwpvalsi3
+ || icode == CODE_FOR_lwp_lwpinssi3
+ || icode == CODE_FOR_lwp_lwpvaldi3
+ || icode == CODE_FOR_lwp_lwpinsdi3)
+ error ("the last argument must be a 32-bit immediate");
+ else
+ error ("the last argument must be an 8-bit immediate");
+ return const0_rtx;
+ }
+ }
+ else
+ {
+ if (i == memory)
+ {
+ /* This must be the memory operand. */
+ op = ix86_zero_extend_to_Pmode (op);
+ op = gen_rtx_MEM (mode, op);
+ /* op at this point has just BITS_PER_UNIT MEM_ALIGN
+ on it. Try to improve it using get_pointer_alignment,
+ and if the special builtin is one that requires strict
+ mode alignment, also from its GET_MODE_ALIGNMENT.
+ Failure to do so could lead to ix86_legitimate_combined_insn
+ rejecting all changes to such insns. */
+ unsigned int align = get_pointer_alignment (arg);
+ if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
+ align = GET_MODE_ALIGNMENT (mode);
+ if (MEM_ALIGN (op) < align)
+ set_mem_align (op, align);
+ }
+ else
+ {
+ /* This must be a register operand. */
+ if (VECTOR_MODE_P (mode))
+ op = safe_vector_operand (op, mode);
+
+ op = fixup_modeless_constant (op, mode);
+
+ if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
+ op = copy_to_mode_reg (mode, op);
+ else
+ {
+ op = copy_to_reg (op);
+ op = lowpart_subreg (mode, op, GET_MODE (op));
+ }
+ }
+ }
+
+ args[i].op = op;
+ args[i].mode = mode;
+ }
+
+ switch (nargs)
+ {
+ case 0:
+ pat = GEN_FCN (icode) (target);
+ break;
+ case 1:
+ pat = GEN_FCN (icode) (target, args[0].op);
+ break;
+ case 2:
+ pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
+ break;
+ case 3:
+ pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ if (! pat)
+ return 0;
+ emit_insn (pat);
+ return klass == store ? 0 : target;
+}
+
+/* Return the integer constant in ARG. Constrain it to be in the range
+ of the subparts of VEC_TYPE; issue an error if not. */
+
+static int
+get_element_number (tree vec_type, tree arg)
+{
+ unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
+
+ if (!tree_fits_uhwi_p (arg)
+ || (elt = tree_to_uhwi (arg), elt > max))
+ {
+ error ("selector must be an integer constant in the range 0..%wi", max);
+ return 0;
+ }
+
+ return elt;
+}
+
+/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
+ ix86_expand_vector_init. We DO have language-level syntax for this, in
+ the form of (type){ init-list }. Except that since we can't place emms
+ instructions from inside the compiler, we can't allow the use of MMX
+ registers unless the user explicitly asks for it. So we do *not* define
+ vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
+ we have builtins invoked by mmintrin.h that give us license to emit
+ these sorts of instructions. */
+
+static rtx
+ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
+{
+ machine_mode tmode = TYPE_MODE (type);
+ machine_mode inner_mode = GET_MODE_INNER (tmode);
+ int i, n_elt = GET_MODE_NUNITS (tmode);
+ rtvec v = rtvec_alloc (n_elt);
+
+ gcc_assert (VECTOR_MODE_P (tmode));
+ gcc_assert (call_expr_nargs (exp) == n_elt);
+
+ for (i = 0; i < n_elt; ++i)
+ {
+ rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
+ RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
+ }
+
+ if (!target || !register_operand (target, tmode))
+ target = gen_reg_rtx (tmode);
+
+ ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
+ return target;
+}
+
+/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
+ ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
+ had a language-level syntax for referencing vector elements. */
+
+static rtx
+ix86_expand_vec_ext_builtin (tree exp, rtx target)
+{
+ machine_mode tmode, mode0;
+ tree arg0, arg1;
+ int elt;
+ rtx op0;
+
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ arg1 = CALL_EXPR_ARG (exp, 1);
+
+ op0 = expand_normal (arg0);
+ elt = get_element_number (TREE_TYPE (arg0), arg1);
+
+ tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
+ mode0 = TYPE_MODE (TREE_TYPE (arg0));
+ gcc_assert (VECTOR_MODE_P (mode0));
+
+ op0 = force_reg (mode0, op0);
+
+ if (optimize || !target || !register_operand (target, tmode))
+ target = gen_reg_rtx (tmode);
+
+ ix86_expand_vector_extract (true, target, op0, elt);
+
+ return target;
+}
+
+/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
+ ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
+ a language-level syntax for referencing vector elements. */
+
+static rtx
+ix86_expand_vec_set_builtin (tree exp)
+{
+ machine_mode tmode, mode1;
+ tree arg0, arg1, arg2;
+ int elt;
+ rtx op0, op1, target;
+
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ arg1 = CALL_EXPR_ARG (exp, 1);
+ arg2 = CALL_EXPR_ARG (exp, 2);
+
+ tmode = TYPE_MODE (TREE_TYPE (arg0));
+ mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
+ gcc_assert (VECTOR_MODE_P (tmode));
+
+ op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
+ op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
+ elt = get_element_number (TREE_TYPE (arg0), arg2);
+
+ if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
+ op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
+
+ op0 = force_reg (tmode, op0);
+ op1 = force_reg (mode1, op1);
+
+ /* OP0 is the source of these builtin functions and shouldn't be
+ modified. Create a copy, use it and return it as target. */
+ target = gen_reg_rtx (tmode);
+ emit_move_insn (target, op0);
+ ix86_expand_vector_set (true, target, op1, elt);
+
+ return target;
+}
+
+/* Expand an expression EXP that calls a built-in function,
+ with result going to TARGET if that's convenient
+ (and in mode MODE if that's convenient).
+ SUBTARGET may be used as the target for computing one of EXP's operands.
+ IGNORE is nonzero if the value is to be ignored. */
+
+rtx
+ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
+ machine_mode mode, int ignore)
+{
+ size_t i;
+ enum insn_code icode, icode2;
+ tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
+ tree arg0, arg1, arg2, arg3, arg4;
+ rtx op0, op1, op2, op3, op4, pat, pat2, insn;
+ machine_mode mode0, mode1, mode2, mode3, mode4;
+ unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
+
+ /* For CPU builtins that can be folded, fold first and expand the fold. */
+ switch (fcode)
+ {
+ case IX86_BUILTIN_CPU_INIT:
+ {
+ /* Make it call __cpu_indicator_init in libgcc. */
+ tree call_expr, fndecl, type;
+ type = build_function_type_list (integer_type_node, NULL_TREE);
+ fndecl = build_fn_decl ("__cpu_indicator_init", type);
+ call_expr = build_call_expr (fndecl, 0);
+ return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
+ }
+ case IX86_BUILTIN_CPU_IS:
+ case IX86_BUILTIN_CPU_SUPPORTS:
+ {
+ tree arg0 = CALL_EXPR_ARG (exp, 0);
+ tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
+ gcc_assert (fold_expr != NULL_TREE);
+ return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
+ }
+ }
+
+ HOST_WIDE_INT isa = ix86_isa_flags;
+ HOST_WIDE_INT isa2 = ix86_isa_flags2;
+ HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
+ HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
+ /* The general case is we require all the ISAs specified in bisa{,2}
+ to be enabled.
+ The exceptions are:
+ OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
+ OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
+ OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
+ where for each such pair it is sufficient if either of the ISAs is
+ enabled; if the pair is ORed with other options, those other ISAs
+ must be enabled as well. */
+ if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
+ == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
+ && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
+ isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
+ if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
+ == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
+ && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
+ isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
+ if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
+ == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
+ && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
+ isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
+ if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
+ {
+ bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
+ if (TARGET_ABI_X32)
+ bisa |= OPTION_MASK_ABI_X32;
+ else
+ bisa |= OPTION_MASK_ABI_64;
+ char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
+ (enum fpmath_unit) 0, false, add_abi_p);
+ if (!opts)
+ error ("%qE needs unknown isa option", fndecl);
+ else
+ {
+ gcc_assert (opts != NULL);
+ error ("%qE needs isa option %s", fndecl, opts);
+ free (opts);
+ }
+ return expand_call (exp, target, ignore);
+ }
+
+ switch (fcode)
+ {
+ case IX86_BUILTIN_MASKMOVQ:
+ case IX86_BUILTIN_MASKMOVDQU:
+ icode = (fcode == IX86_BUILTIN_MASKMOVQ
+ ? CODE_FOR_mmx_maskmovq
+ : CODE_FOR_sse2_maskmovdqu);
+ /* Note the arg order is different from the operand order. */
+ arg1 = CALL_EXPR_ARG (exp, 0);
+ arg2 = CALL_EXPR_ARG (exp, 1);
+ arg0 = CALL_EXPR_ARG (exp, 2);
+ op0 = expand_normal (arg0);
+ op1 = expand_normal (arg1);
+ op2 = expand_normal (arg2);
+ mode0 = insn_data[icode].operand[0].mode;
+ mode1 = insn_data[icode].operand[1].mode;
+ mode2 = insn_data[icode].operand[2].mode;
+
+ op0 = ix86_zero_extend_to_Pmode (op0);
+ op0 = gen_rtx_MEM (mode1, op0);
+
+ if (!insn_data[icode].operand[0].predicate (op0, mode0))
+ op0 = copy_to_mode_reg (mode0, op0);
+ if (!insn_data[icode].operand[1].predicate (op1, mode1))
+ op1 = copy_to_mode_reg (mode1, op1);
+ if (!insn_data[icode].operand[2].predicate (op2, mode2))
+ op2 = copy_to_mode_reg (mode2, op2);
+ pat = GEN_FCN (icode) (op0, op1, op2);
+ if (! pat)
+ return 0;
+ emit_insn (pat);
+ return 0;
+
+ case IX86_BUILTIN_LDMXCSR:
+ op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
+ target = assign_386_stack_local (SImode, SLOT_TEMP);
+ emit_move_insn (target, op0);
+ emit_insn (gen_sse_ldmxcsr (target));
+ return 0;
+
+ case IX86_BUILTIN_STMXCSR:
+ target = assign_386_stack_local (SImode, SLOT_TEMP);
+ emit_insn (gen_sse_stmxcsr (target));
+ return copy_to_mode_reg (SImode, target);
+
+ case IX86_BUILTIN_CLFLUSH:
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ op0 = expand_normal (arg0);
+ icode = CODE_FOR_sse2_clflush;
+ if (!insn_data[icode].operand[0].predicate (op0, Pmode))
+ op0 = ix86_zero_extend_to_Pmode (op0);
+
+ emit_insn (gen_sse2_clflush (op0));
+ return 0;
+
+ case IX86_BUILTIN_CLWB:
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ op0 = expand_normal (arg0);
+ icode = CODE_FOR_clwb;
+ if (!insn_data[icode].operand[0].predicate (op0, Pmode))
+ op0 = ix86_zero_extend_to_Pmode (op0);
+
+ emit_insn (gen_clwb (op0));
+ return 0;
+
+ case IX86_BUILTIN_CLFLUSHOPT:
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ op0 = expand_normal (arg0);
+ icode = CODE_FOR_clflushopt;
+ if (!insn_data[icode].operand[0].predicate (op0, Pmode))
+ op0 = ix86_zero_extend_to_Pmode (op0);
+
+ emit_insn (gen_clflushopt (op0));
+ return 0;
+
+ case IX86_BUILTIN_MONITOR:
+ case IX86_BUILTIN_MONITORX:
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ arg1 = CALL_EXPR_ARG (exp, 1);
+ arg2 = CALL_EXPR_ARG (exp, 2);
+ op0 = expand_normal (arg0);
+ op1 = expand_normal (arg1);
+ op2 = expand_normal (arg2);
+ if (!REG_P (op0))
+ op0 = ix86_zero_extend_to_Pmode (op0);
+ if (!REG_P (op1))
+ op1 = copy_to_mode_reg (SImode, op1);
+ if (!REG_P (op2))
+ op2 = copy_to_mode_reg (SImode, op2);
+
+ emit_insn (fcode == IX86_BUILTIN_MONITOR
+ ? ix86_gen_monitor (op0, op1, op2)
+ : ix86_gen_monitorx (op0, op1, op2));
+ return 0;
+
+ case IX86_BUILTIN_MWAIT:
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ arg1 = CALL_EXPR_ARG (exp, 1);
+ op0 = expand_normal (arg0);
+ op1 = expand_normal (arg1);
+ if (!REG_P (op0))
+ op0 = copy_to_mode_reg (SImode, op0);
+ if (!REG_P (op1))
+ op1 = copy_to_mode_reg (SImode, op1);
+ emit_insn (gen_sse3_mwait (op0, op1));
+ return 0;
+
+ case IX86_BUILTIN_MWAITX:
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ arg1 = CALL_EXPR_ARG (exp, 1);
+ arg2 = CALL_EXPR_ARG (exp, 2);
+ op0 = expand_normal (arg0);
+ op1 = expand_normal (arg1);
+ op2 = expand_normal (arg2);
+ if (!REG_P (op0))
+ op0 = copy_to_mode_reg (SImode, op0);
+ if (!REG_P (op1))
+ op1 = copy_to_mode_reg (SImode, op1);
+ if (!REG_P (op2))
+ op2 = copy_to_mode_reg (SImode, op2);
+ emit_insn (gen_mwaitx (op0, op1, op2));
+ return 0;
+
+ case IX86_BUILTIN_UMONITOR:
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ op0 = expand_normal (arg0);
+
+ op0 = ix86_zero_extend_to_Pmode (op0);
+
+ insn = (TARGET_64BIT
+ ? gen_umonitor_di (op0)
+ : gen_umonitor_si (op0));
+
+ emit_insn (insn);
+ return 0;
+
+ case IX86_BUILTIN_UMWAIT:
+ case IX86_BUILTIN_TPAUSE:
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ arg1 = CALL_EXPR_ARG (exp, 1);
+ op0 = expand_normal (arg0);
+ op1 = expand_normal (arg1);
+
+ if (!REG_P (op0))
+ op0 = copy_to_mode_reg (SImode, op0);
+
+ op1 = force_reg (DImode, op1);
+
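+ /* The second argument is a 64-bit TSC value; the *_rex64 patterns
+ want it split into two SImode halves, while the 32-bit patterns
+ take it as a single DImode operand. */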
+ if (TARGET_64BIT)
+ {
+ op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
+ NULL, 1, OPTAB_DIRECT);
+ switch (fcode)
+ {
+ case IX86_BUILTIN_UMWAIT:
+ icode = CODE_FOR_umwait_rex64;
+ break;
+ case IX86_BUILTIN_TPAUSE:
+ icode = CODE_FOR_tpause_rex64;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ op2 = gen_lowpart (SImode, op2);
+ op1 = gen_lowpart (SImode, op1);
+ pat = GEN_FCN (icode) (op0, op1, op2);
+ }
+ else
+ {
+ switch (fcode)
+ {
+ case IX86_BUILTIN_UMWAIT:
+ icode = CODE_FOR_umwait;
+ break;
+ case IX86_BUILTIN_TPAUSE:
+ icode = CODE_FOR_tpause;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ pat = GEN_FCN (icode) (op0, op1);
+ }
+
+ if (!pat)
+ return 0;
+
+ emit_insn (pat);
+
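+ /* The instruction reports its status in the carry flag; read
+ FLAGS_REG back in CCCmode to form the builtin's QImode result. */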
+ if (target == 0
+ || !register_operand (target, QImode))
+ target = gen_reg_rtx (QImode);
+
+ pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
+ const0_rtx);
+ emit_insn (gen_rtx_SET (target, pat));
+
+ return target;
+
+ case IX86_BUILTIN_CLZERO:
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ op0 = expand_normal (arg0);
+ if (!REG_P (op0))
+ op0 = ix86_zero_extend_to_Pmode (op0);
+ emit_insn (ix86_gen_clzero (op0));
+ return 0;
+
+ case IX86_BUILTIN_CLDEMOTE:
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ op0 = expand_normal (arg0);
+ icode = CODE_FOR_cldemote;
+ if (!insn_data[icode].operand[0].predicate (op0, Pmode))
+ op0 = ix86_zero_extend_to_Pmode (op0);
+
+ emit_insn (gen_cldemote (op0));
+ return 0;
+
+ case IX86_BUILTIN_VEC_INIT_V2SI:
+ case IX86_BUILTIN_VEC_INIT_V4HI:
+ case IX86_BUILTIN_VEC_INIT_V8QI:
+ return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
+
+ case IX86_BUILTIN_VEC_EXT_V2DF:
+ case IX86_BUILTIN_VEC_EXT_V2DI:
+ case IX86_BUILTIN_VEC_EXT_V4SF:
+ case IX86_BUILTIN_VEC_EXT_V4SI:
+ case IX86_BUILTIN_VEC_EXT_V8HI:
+ case IX86_BUILTIN_VEC_EXT_V2SI:
+ case IX86_BUILTIN_VEC_EXT_V4HI:
+ case IX86_BUILTIN_VEC_EXT_V16QI:
+ return ix86_expand_vec_ext_builtin (exp, target);
+
+ case IX86_BUILTIN_VEC_SET_V2DI:
+ case IX86_BUILTIN_VEC_SET_V4SF:
+ case IX86_BUILTIN_VEC_SET_V4SI:
+ case IX86_BUILTIN_VEC_SET_V8HI:
+ case IX86_BUILTIN_VEC_SET_V4HI:
+ case IX86_BUILTIN_VEC_SET_V16QI:
+ return ix86_expand_vec_set_builtin (exp);
+
+ case IX86_BUILTIN_NANQ:
+ case IX86_BUILTIN_NANSQ:
+ return expand_call (exp, target, ignore);
+
+ case IX86_BUILTIN_RDPID:
+
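+ /* RDPID reads the processor ID into a word-sized register; the
+ builtin returns its low 32 bits. */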
+ op0 = gen_reg_rtx (word_mode);
+
+ if (TARGET_64BIT)
+ {
+ insn = gen_rdpid_rex64 (op0);
+ op0 = convert_to_mode (SImode, op0, 1);
+ }
+ else
+ insn = gen_rdpid (op0);
+
+ emit_insn (insn);
+
+ if (target == 0
+ || !register_operand (target, SImode))
+ target = gen_reg_rtx (SImode);
+
+ emit_move_insn (target, op0);
+ return target;
+
+ case IX86_BUILTIN_RDPMC:
+ case IX86_BUILTIN_RDTSC:
+ case IX86_BUILTIN_RDTSCP:
+ case IX86_BUILTIN_XGETBV:
+
+ op0 = gen_reg_rtx (DImode);
+ op1 = gen_reg_rtx (DImode);
+
+ if (fcode == IX86_BUILTIN_RDPMC)
+ {
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ op2 = expand_normal (arg0);
+ if (!register_operand (op2, SImode))
+ op2 = copy_to_mode_reg (SImode, op2);
+
+ insn = (TARGET_64BIT
+ ? gen_rdpmc_rex64 (op0, op1, op2)
+ : gen_rdpmc (op0, op2));
+ emit_insn (insn);
+ }
+ else if (fcode == IX86_BUILTIN_XGETBV)
+ {
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ op2 = expand_normal (arg0);
+ if (!register_operand (op2, SImode))
+ op2 = copy_to_mode_reg (SImode, op2);
+
+ insn = (TARGET_64BIT
+ ? gen_xgetbv_rex64 (op0, op1, op2)
+ : gen_xgetbv (op0, op2));
+ emit_insn (insn);
+ }
+ else if (fcode == IX86_BUILTIN_RDTSC)
+ {
+ insn = (TARGET_64BIT
+ ? gen_rdtsc_rex64 (op0, op1)
+ : gen_rdtsc (op0));
+ emit_insn (insn);
+ }
+ else
+ {
+ op2 = gen_reg_rtx (SImode);
+
+ insn = (TARGET_64BIT
+ ? gen_rdtscp_rex64 (op0, op1, op2)
+ : gen_rdtscp (op0, op2));
+ emit_insn (insn);
+
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ op4 = expand_normal (arg0);
+ if (!address_operand (op4, VOIDmode))
+ {
+ op4 = convert_memory_address (Pmode, op4);
+ op4 = copy_addr_to_reg (op4);
+ }
+ emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
+ }
+
+ if (target == 0
+ || !register_operand (target, DImode))
+ target = gen_reg_rtx (DImode);
+
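+ /* On 64-bit targets the low and high 32-bit halves of the result
+ are returned separately in op0 and op1; combine them into a
+ single DImode value. */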
+ if (TARGET_64BIT)
+ {
+ op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
+ op1, 1, OPTAB_DIRECT);
+ op0 = expand_simple_binop (DImode, IOR, op0, op1,
+ op0, 1, OPTAB_DIRECT);
+ }
+
+ emit_move_insn (target, op0);
+ return target;
+
+ case IX86_BUILTIN_MOVDIR64B:
+
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ arg1 = CALL_EXPR_ARG (exp, 1);
+ op0 = expand_normal (arg0);
+ op1 = expand_normal (arg1);
+
+ op0 = ix86_zero_extend_to_Pmode (op0);
+ if (!address_operand (op1, VOIDmode))
+ {
+ op1 = convert_memory_address (Pmode, op1);
+ op1 = copy_addr_to_reg (op1);
+ }
+ op1 = gen_rtx_MEM (XImode, op1);
+
+ insn = (TARGET_64BIT
+ ? gen_movdir64b_di (op0, op1)
+ : gen_movdir64b_si (op0, op1));
+ emit_insn (insn);
+ return 0;
+
+ case IX86_BUILTIN_FXSAVE:
+ case IX86_BUILTIN_FXRSTOR:
+ case IX86_BUILTIN_FXSAVE64:
+ case IX86_BUILTIN_FXRSTOR64:
+ case IX86_BUILTIN_FNSTENV:
+ case IX86_BUILTIN_FLDENV:
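+ /* Each of these takes a single pointer to a save/restore area whose
+ layout has no machine mode of its own, so reference it as a
+ BLKmode MEM. */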
+ mode0 = BLKmode;
+ switch (fcode)
+ {
+ case IX86_BUILTIN_FXSAVE:
+ icode = CODE_FOR_fxsave;
+ break;
+ case IX86_BUILTIN_FXRSTOR:
+ icode = CODE_FOR_fxrstor;
+ break;
+ case IX86_BUILTIN_FXSAVE64:
+ icode = CODE_FOR_fxsave64;
+ break;
+ case IX86_BUILTIN_FXRSTOR64:
+ icode = CODE_FOR_fxrstor64;
+ break;
+ case IX86_BUILTIN_FNSTENV:
+ icode = CODE_FOR_fnstenv;
+ break;
+ case IX86_BUILTIN_FLDENV:
+ icode = CODE_FOR_fldenv;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ op0 = expand_normal (arg0);
+
+ if (!address_operand (op0, VOIDmode))
+ {
+ op0 = convert_memory_address (Pmode, op0);
+ op0 = copy_addr_to_reg (op0);
+ }
+ op0 = gen_rtx_MEM (mode0, op0);
+
+ pat = GEN_FCN (icode) (op0);
+ if (pat)
+ emit_insn (pat);
+ return 0;
+
+ case IX86_BUILTIN_XSETBV:
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ arg1 = CALL_EXPR_ARG (exp, 1);
+ op0 = expand_normal (arg0);
+ op1 = expand_normal (arg1);
+
+ if (!REG_P (op0))
+ op0 = copy_to_mode_reg (SImode, op0);
+
+ op1 = force_reg (DImode, op1);
+
+ if (TARGET_64BIT)
+ {
+ op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
+ NULL, 1, OPTAB_DIRECT);
+
+ icode = CODE_FOR_xsetbv_rex64;
+
+ op2 = gen_lowpart (SImode, op2);
+ op1 = gen_lowpart (SImode, op1);
+ pat = GEN_FCN (icode) (op0, op1, op2);
+ }
+ else
+ {
+ icode = CODE_FOR_xsetbv;
+
+ pat = GEN_FCN (icode) (op0, op1);
+ }
+ if (pat)
+ emit_insn (pat);
+ return 0;
+
+ case IX86_BUILTIN_XSAVE:
+ case IX86_BUILTIN_XRSTOR:
+ case IX86_BUILTIN_XSAVE64:
+ case IX86_BUILTIN_XRSTOR64:
+ case IX86_BUILTIN_XSAVEOPT:
+ case IX86_BUILTIN_XSAVEOPT64:
+ case IX86_BUILTIN_XSAVES:
+ case IX86_BUILTIN_XRSTORS:
+ case IX86_BUILTIN_XSAVES64:
+ case IX86_BUILTIN_XRSTORS64:
+ case IX86_BUILTIN_XSAVEC:
+ case IX86_BUILTIN_XSAVEC64:
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ arg1 = CALL_EXPR_ARG (exp, 1);
+ op0 = expand_normal (arg0);
+ op1 = expand_normal (arg1);
+
+ if (!address_operand (op0, VOIDmode))
+ {
+ op0 = convert_memory_address (Pmode, op0);
+ op0 = copy_addr_to_reg (op0);
+ }
+ op0 = gen_rtx_MEM (BLKmode, op0);
+
+ op1 = force_reg (DImode, op1);
+
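+ /* The 64-bit requested-feature mask is passed to the instruction in
+ EDX:EAX, so on 64-bit targets split it into two SImode halves for
+ the *_rex64 patterns. */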
+ if (TARGET_64BIT)
+ {
+ op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
+ NULL, 1, OPTAB_DIRECT);
+ switch (fcode)
+ {
+ case IX86_BUILTIN_XSAVE:
+ icode = CODE_FOR_xsave_rex64;
+ break;
+ case IX86_BUILTIN_XRSTOR:
+ icode = CODE_FOR_xrstor_rex64;
+ break;
+ case IX86_BUILTIN_XSAVE64:
+ icode = CODE_FOR_xsave64;
+ break;
+ case IX86_BUILTIN_XRSTOR64:
+ icode = CODE_FOR_xrstor64;
+ break;
+ case IX86_BUILTIN_XSAVEOPT:
+ icode = CODE_FOR_xsaveopt_rex64;
+ break;
+ case IX86_BUILTIN_XSAVEOPT64:
+ icode = CODE_FOR_xsaveopt64;
+ break;
+ case IX86_BUILTIN_XSAVES:
+ icode = CODE_FOR_xsaves_rex64;
+ break;
+ case IX86_BUILTIN_XRSTORS:
+ icode = CODE_FOR_xrstors_rex64;
+ break;
+ case IX86_BUILTIN_XSAVES64:
+ icode = CODE_FOR_xsaves64;
+ break;
+ case IX86_BUILTIN_XRSTORS64:
+ icode = CODE_FOR_xrstors64;
+ break;
+ case IX86_BUILTIN_XSAVEC:
+ icode = CODE_FOR_xsavec_rex64;
+ break;
+ case IX86_BUILTIN_XSAVEC64:
+ icode = CODE_FOR_xsavec64;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ op2 = gen_lowpart (SImode, op2);
+ op1 = gen_lowpart (SImode, op1);
+ pat = GEN_FCN (icode) (op0, op1, op2);
+ }
+ else
+ {
+ switch (fcode)
+ {
+ case IX86_BUILTIN_XSAVE:
+ icode = CODE_FOR_xsave;
+ break;
+ case IX86_BUILTIN_XRSTOR:
+ icode = CODE_FOR_xrstor;
+ break;
+ case IX86_BUILTIN_XSAVEOPT:
+ icode = CODE_FOR_xsaveopt;
+ break;
+ case IX86_BUILTIN_XSAVES:
+ icode = CODE_FOR_xsaves;
+ break;
+ case IX86_BUILTIN_XRSTORS:
+ icode = CODE_FOR_xrstors;
+ break;
+ case IX86_BUILTIN_XSAVEC:
+ icode = CODE_FOR_xsavec;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ pat = GEN_FCN (icode) (op0, op1);
+ }
+
+ if (pat)
+ emit_insn (pat);
+ return 0;
+
+ case IX86_BUILTIN_LLWPCB:
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ op0 = expand_normal (arg0);
+ icode = CODE_FOR_lwp_llwpcb;
+ if (!insn_data[icode].operand[0].predicate (op0, Pmode))
+ op0 = ix86_zero_extend_to_Pmode (op0);
+ emit_insn (gen_lwp_llwpcb (op0));
+ return 0;
+
+ case IX86_BUILTIN_SLWPCB:
+ icode = CODE_FOR_lwp_slwpcb;
+ if (!target
+ || !insn_data[icode].operand[0].predicate (target, Pmode))
+ target = gen_reg_rtx (Pmode);
+ emit_insn (gen_lwp_slwpcb (target));
+ return target;
+
+ case IX86_BUILTIN_BEXTRI32:
+ case IX86_BUILTIN_BEXTRI64:
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ arg1 = CALL_EXPR_ARG (exp, 1);
+ op0 = expand_normal (arg0);
+ op1 = expand_normal (arg1);
+ icode = (fcode == IX86_BUILTIN_BEXTRI32
+ ? CODE_FOR_tbm_bextri_si
+ : CODE_FOR_tbm_bextri_di);
+ if (!CONST_INT_P (op1))
+ {
+ error ("last argument must be an immediate");
+ return const0_rtx;
+ }
+ else
+ {
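+ /* The immediate encodes the starting bit position in its low byte
+ and the field length in the next byte; split it into the two
+ separate operands the tbm_bextri_{si,di} patterns expect. */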
+ unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
+ unsigned char lsb_index = INTVAL (op1) & 0xFF;
+ op1 = GEN_INT (length);
+ op2 = GEN_INT (lsb_index);
+
+ mode1 = insn_data[icode].operand[1].mode;
+ if (!insn_data[icode].operand[1].predicate (op0, mode1))
+ op0 = copy_to_mode_reg (mode1, op0);
+
+ mode0 = insn_data[icode].operand[0].mode;
+ if (target == 0
+ || !register_operand (target, mode0))
+ target = gen_reg_rtx (mode0);
+
+ pat = GEN_FCN (icode) (target, op0, op1, op2);
+ if (pat)
+ emit_insn (pat);
+ return target;
+ }
+
+ case IX86_BUILTIN_RDRAND16_STEP:
+ icode = CODE_FOR_rdrandhi_1;
+ mode0 = HImode;
+ goto rdrand_step;
+
+ case IX86_BUILTIN_RDRAND32_STEP:
+ icode = CODE_FOR_rdrandsi_1;
+ mode0 = SImode;
+ goto rdrand_step;
+
+ case IX86_BUILTIN_RDRAND64_STEP:
+ icode = CODE_FOR_rdranddi_1;
+ mode0 = DImode;
+
+rdrand_step:
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ op1 = expand_normal (arg0);
+ if (!address_operand (op1, VOIDmode))
+ {
+ op1 = convert_memory_address (Pmode, op1);
+ op1 = copy_addr_to_reg (op1);
+ }
+
+ op0 = gen_reg_rtx (mode0);
+ emit_insn (GEN_FCN (icode) (op0));
+
+ emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
+
+ op1 = gen_reg_rtx (SImode);
+ emit_move_insn (op1, CONST1_RTX (SImode));
+
+ /* Emit SImode conditional move. */
+ if (mode0 == HImode)
+ {
+ if (TARGET_ZERO_EXTEND_WITH_AND
+ && optimize_function_for_speed_p (cfun))
+ {
+ op2 = force_reg (SImode, const0_rtx);
+
+ emit_insn (gen_movstricthi
+ (gen_lowpart (HImode, op2), op0));
+ }
+ else
+ {
+ op2 = gen_reg_rtx (SImode);
+
+ emit_insn (gen_zero_extendhisi2 (op2, op0));
+ }
+ }
+ else if (mode0 == SImode)
+ op2 = op0;
+ else
+ op2 = gen_rtx_SUBREG (SImode, op0, 0);
+
+ if (target == 0
+ || !register_operand (target, SImode))
+ target = gen_reg_rtx (SImode);
+
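+ /* RDRAND sets CF on success and zeroes its destination on failure,
+ so the conditional move below yields 1 when CF is set and the
+ (zero) stored value otherwise, i.e. the 0/1 success indication
+ the builtin returns. */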
+ pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
+ const0_rtx);
+ emit_insn (gen_rtx_SET (target,
+ gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
+ return target;
+
+ case IX86_BUILTIN_RDSEED16_STEP:
+ icode = CODE_FOR_rdseedhi_1;
+ mode0 = HImode;
+ goto rdseed_step;
+
+ case IX86_BUILTIN_RDSEED32_STEP:
+ icode = CODE_FOR_rdseedsi_1;
+ mode0 = SImode;
+ goto rdseed_step;
+
+ case IX86_BUILTIN_RDSEED64_STEP:
+ icode = CODE_FOR_rdseeddi_1;
+ mode0 = DImode;
+
+rdseed_step:
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ op1 = expand_normal (arg0);
+ if (!address_operand (op1, VOIDmode))
+ {
+ op1 = convert_memory_address (Pmode, op1);
+ op1 = copy_addr_to_reg (op1);
+ }
+
+ op0 = gen_reg_rtx (mode0);
+ emit_insn (GEN_FCN (icode) (op0));
+
+ emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
+
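+ /* RDSEED sets CF when a random value was delivered; copy the flag
+ into a QImode register and zero-extend it to form the builtin's
+ 0/1 result. */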
+ op2 = gen_reg_rtx (QImode);
+
+ pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
+ const0_rtx);
+ emit_insn (gen_rtx_SET (op2, pat));
+
+ if (target == 0
+ || !register_operand (target, SImode))
+ target = gen_reg_rtx (SImode);
+
+ emit_insn (gen_zero_extendqisi2 (target, op2));
+ return target;
+
+ case IX86_BUILTIN_SBB32:
+ icode = CODE_FOR_subborrowsi;
+ icode2 = CODE_FOR_subborrowsi_0;
+ mode0 = SImode;
+ mode1 = DImode;
+ mode2 = CCmode;
+ goto handlecarry;
+
+ case IX86_BUILTIN_SBB64:
+ icode = CODE_FOR_subborrowdi;
+ icode2 = CODE_FOR_subborrowdi_0;
+ mode0 = DImode;
+ mode1 = TImode;
+ mode2 = CCmode;
+ goto handlecarry;
+
+ case IX86_BUILTIN_ADDCARRYX32:
+ icode = CODE_FOR_addcarrysi;
+ icode2 = CODE_FOR_addcarrysi_0;
+ mode0 = SImode;
+ mode1 = DImode;
+ mode2 = CCCmode;
+ goto handlecarry;
+
+ case IX86_BUILTIN_ADDCARRYX64:
+ icode = CODE_FOR_addcarrydi;
+ icode2 = CODE_FOR_addcarrydi_0;
+ mode0 = DImode;
+ mode1 = TImode;
+ mode2 = CCCmode;
+
+ handlecarry:
+ arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
+ arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
+ arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
+ arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
+
+ op1 = expand_normal (arg0);
+ if (!integer_zerop (arg0))
+ op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
+
+ op2 = expand_normal (arg1);
+ if (!register_operand (op2, mode0))
+ op2 = copy_to_mode_reg (mode0, op2);
+
+ op3 = expand_normal (arg2);
+ if (!register_operand (op3, mode0))
+ op3 = copy_to_mode_reg (mode0, op3);
+
+ op4 = expand_normal (arg3);
+ if (!address_operand (op4, VOIDmode))
+ {
+ op4 = convert_memory_address (Pmode, op4);
+ op4 = copy_addr_to_reg (op4);
+ }
+
+ op0 = gen_reg_rtx (mode0);
+ if (integer_zerop (arg0))
+ {
+ /* If arg0 is 0, optimize right away into an add or sub
+ instruction that sets the CCCmode flags. */
+ op1 = gen_rtx_REG (mode2, FLAGS_REG);
+ emit_insn (GEN_FCN (icode2) (op0, op2, op3));
+ }
+ else
+ {
+ /* Generate CF from input operand. */
+ emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
+
+ /* Generate instruction that consumes CF. */
+ op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
+ pat = gen_rtx_LTU (mode1, op1, const0_rtx);
+ pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
+ emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
+ }
+
+ /* Return current CF value. */
+ if (target == 0)
+ target = gen_reg_rtx (QImode);
+
+ pat = gen_rtx_LTU (QImode, op1, const0_rtx);
+ emit_insn (gen_rtx_SET (target, pat));
+
+ /* Store the result. */
+ emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
+
+ return target;
+
+ case IX86_BUILTIN_READ_FLAGS:
+ emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
+
+ if (optimize
+ || target == NULL_RTX
+ || !nonimmediate_operand (target, word_mode)
+ || GET_MODE (target) != word_mode)
+ target = gen_reg_rtx (word_mode);
+
+ emit_insn (gen_pop (target));
+ return target;
+
+ case IX86_BUILTIN_WRITE_FLAGS:
+
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ op0 = expand_normal (arg0);
+ if (!general_no_elim_operand (op0, word_mode))
+ op0 = copy_to_mode_reg (word_mode, op0);
+
+ emit_insn (gen_push (op0));
+ emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
+ return 0;
+
+ case IX86_BUILTIN_KTESTC8:
+ icode = CODE_FOR_ktestqi;
+ mode3 = CCCmode;
+ goto kortest;
+
+ case IX86_BUILTIN_KTESTZ8:
+ icode = CODE_FOR_ktestqi;
+ mode3 = CCZmode;
+ goto kortest;
+
+ case IX86_BUILTIN_KTESTC16:
+ icode = CODE_FOR_ktesthi;
+ mode3 = CCCmode;
+ goto kortest;
+
+ case IX86_BUILTIN_KTESTZ16:
+ icode = CODE_FOR_ktesthi;
+ mode3 = CCZmode;
+ goto kortest;
+
+ case IX86_BUILTIN_KTESTC32:
+ icode = CODE_FOR_ktestsi;
+ mode3 = CCCmode;
+ goto kortest;
+
+ case IX86_BUILTIN_KTESTZ32:
+ icode = CODE_FOR_ktestsi;
+ mode3 = CCZmode;
+ goto kortest;
+
+ case IX86_BUILTIN_KTESTC64:
+ icode = CODE_FOR_ktestdi;
+ mode3 = CCCmode;
+ goto kortest;
+
+ case IX86_BUILTIN_KTESTZ64:
+ icode = CODE_FOR_ktestdi;
+ mode3 = CCZmode;
+ goto kortest;
+
+ case IX86_BUILTIN_KORTESTC8:
+ icode = CODE_FOR_kortestqi;
+ mode3 = CCCmode;
+ goto kortest;
+
+ case IX86_BUILTIN_KORTESTZ8:
+ icode = CODE_FOR_kortestqi;
+ mode3 = CCZmode;
+ goto kortest;
+
+ case IX86_BUILTIN_KORTESTC16:
+ icode = CODE_FOR_kortesthi;
+ mode3 = CCCmode;
+ goto kortest;
+
+ case IX86_BUILTIN_KORTESTZ16:
+ icode = CODE_FOR_kortesthi;
+ mode3 = CCZmode;
+ goto kortest;
+
+ case IX86_BUILTIN_KORTESTC32:
+ icode = CODE_FOR_kortestsi;
+ mode3 = CCCmode;
+ goto kortest;
+
+ case IX86_BUILTIN_KORTESTZ32:
+ icode = CODE_FOR_kortestsi;
+ mode3 = CCZmode;
+ goto kortest;
+
+ case IX86_BUILTIN_KORTESTC64:
+ icode = CODE_FOR_kortestdi;
+ mode3 = CCCmode;
+ goto kortest;
+
+ case IX86_BUILTIN_KORTESTZ64:
+ icode = CODE_FOR_kortestdi;
+ mode3 = CCZmode;
+
+ kortest:
+ arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
+ arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
+ op0 = expand_normal (arg0);
+ op1 = expand_normal (arg1);
+
+ mode0 = insn_data[icode].operand[0].mode;
+ mode1 = insn_data[icode].operand[1].mode;
+
+ if (GET_MODE (op0) != VOIDmode)
+ op0 = force_reg (GET_MODE (op0), op0);
+
+ op0 = gen_lowpart (mode0, op0);
+
+ if (!insn_data[icode].operand[0].predicate (op0, mode0))
+ op0 = copy_to_mode_reg (mode0, op0);
+
+ if (GET_MODE (op1) != VOIDmode)
+ op1 = force_reg (GET_MODE (op1), op1);
+
+ op1 = gen_lowpart (mode1, op1);
+
+ if (!insn_data[icode].operand[1].predicate (op1, mode1))
+ op1 = copy_to_mode_reg (mode1, op1);
+
+ target = gen_reg_rtx (QImode);
+
+ /* Emit kortest. */
+ emit_insn (GEN_FCN (icode) (op0, op1));
+ /* And use setcc to return result from flags. */
+ ix86_expand_setcc (target, EQ,
+ gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
+ return target;
+
+ case IX86_BUILTIN_GATHERSIV2DF:
+ icode = CODE_FOR_avx2_gathersiv2df;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHERSIV4DF:
+ icode = CODE_FOR_avx2_gathersiv4df;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHERDIV2DF:
+ icode = CODE_FOR_avx2_gatherdiv2df;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHERDIV4DF:
+ icode = CODE_FOR_avx2_gatherdiv4df;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHERSIV4SF:
+ icode = CODE_FOR_avx2_gathersiv4sf;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHERSIV8SF:
+ icode = CODE_FOR_avx2_gathersiv8sf;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHERDIV4SF:
+ icode = CODE_FOR_avx2_gatherdiv4sf;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHERDIV8SF:
+ icode = CODE_FOR_avx2_gatherdiv8sf;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHERSIV2DI:
+ icode = CODE_FOR_avx2_gathersiv2di;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHERSIV4DI:
+ icode = CODE_FOR_avx2_gathersiv4di;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHERDIV2DI:
+ icode = CODE_FOR_avx2_gatherdiv2di;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHERDIV4DI:
+ icode = CODE_FOR_avx2_gatherdiv4di;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHERSIV4SI:
+ icode = CODE_FOR_avx2_gathersiv4si;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHERSIV8SI:
+ icode = CODE_FOR_avx2_gathersiv8si;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHERDIV4SI:
+ icode = CODE_FOR_avx2_gatherdiv4si;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHERDIV8SI:
+ icode = CODE_FOR_avx2_gatherdiv8si;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHERALTSIV4DF:
+ icode = CODE_FOR_avx2_gathersiv4df;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHERALTDIV8SF:
+ icode = CODE_FOR_avx2_gatherdiv8sf;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHERALTSIV4DI:
+ icode = CODE_FOR_avx2_gathersiv4di;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHERALTDIV8SI:
+ icode = CODE_FOR_avx2_gatherdiv8si;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3SIV16SF:
+ icode = CODE_FOR_avx512f_gathersiv16sf;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3SIV8DF:
+ icode = CODE_FOR_avx512f_gathersiv8df;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3DIV16SF:
+ icode = CODE_FOR_avx512f_gatherdiv16sf;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3DIV8DF:
+ icode = CODE_FOR_avx512f_gatherdiv8df;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3SIV16SI:
+ icode = CODE_FOR_avx512f_gathersiv16si;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3SIV8DI:
+ icode = CODE_FOR_avx512f_gathersiv8di;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3DIV16SI:
+ icode = CODE_FOR_avx512f_gatherdiv16si;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3DIV8DI:
+ icode = CODE_FOR_avx512f_gatherdiv8di;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3ALTSIV8DF:
+ icode = CODE_FOR_avx512f_gathersiv8df;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3ALTDIV16SF:
+ icode = CODE_FOR_avx512f_gatherdiv16sf;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3ALTSIV8DI:
+ icode = CODE_FOR_avx512f_gathersiv8di;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3ALTDIV16SI:
+ icode = CODE_FOR_avx512f_gatherdiv16si;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3SIV2DF:
+ icode = CODE_FOR_avx512vl_gathersiv2df;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3SIV4DF:
+ icode = CODE_FOR_avx512vl_gathersiv4df;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3DIV2DF:
+ icode = CODE_FOR_avx512vl_gatherdiv2df;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3DIV4DF:
+ icode = CODE_FOR_avx512vl_gatherdiv4df;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3SIV4SF:
+ icode = CODE_FOR_avx512vl_gathersiv4sf;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3SIV8SF:
+ icode = CODE_FOR_avx512vl_gathersiv8sf;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3DIV4SF:
+ icode = CODE_FOR_avx512vl_gatherdiv4sf;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3DIV8SF:
+ icode = CODE_FOR_avx512vl_gatherdiv8sf;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3SIV2DI:
+ icode = CODE_FOR_avx512vl_gathersiv2di;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3SIV4DI:
+ icode = CODE_FOR_avx512vl_gathersiv4di;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3DIV2DI:
+ icode = CODE_FOR_avx512vl_gatherdiv2di;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3DIV4DI:
+ icode = CODE_FOR_avx512vl_gatherdiv4di;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3SIV4SI:
+ icode = CODE_FOR_avx512vl_gathersiv4si;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3SIV8SI:
+ icode = CODE_FOR_avx512vl_gathersiv8si;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3DIV4SI:
+ icode = CODE_FOR_avx512vl_gatherdiv4si;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3DIV8SI:
+ icode = CODE_FOR_avx512vl_gatherdiv8si;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3ALTSIV4DF:
+ icode = CODE_FOR_avx512vl_gathersiv4df;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3ALTDIV8SF:
+ icode = CODE_FOR_avx512vl_gatherdiv8sf;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3ALTSIV4DI:
+ icode = CODE_FOR_avx512vl_gathersiv4di;
+ goto gather_gen;
+ case IX86_BUILTIN_GATHER3ALTDIV8SI:
+ icode = CODE_FOR_avx512vl_gatherdiv8si;
+ goto gather_gen;
+ case IX86_BUILTIN_SCATTERSIV16SF:
+ icode = CODE_FOR_avx512f_scattersiv16sf;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERSIV8DF:
+ icode = CODE_FOR_avx512f_scattersiv8df;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERDIV16SF:
+ icode = CODE_FOR_avx512f_scatterdiv16sf;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERDIV8DF:
+ icode = CODE_FOR_avx512f_scatterdiv8df;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERSIV16SI:
+ icode = CODE_FOR_avx512f_scattersiv16si;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERSIV8DI:
+ icode = CODE_FOR_avx512f_scattersiv8di;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERDIV16SI:
+ icode = CODE_FOR_avx512f_scatterdiv16si;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERDIV8DI:
+ icode = CODE_FOR_avx512f_scatterdiv8di;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERSIV8SF:
+ icode = CODE_FOR_avx512vl_scattersiv8sf;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERSIV4SF:
+ icode = CODE_FOR_avx512vl_scattersiv4sf;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERSIV4DF:
+ icode = CODE_FOR_avx512vl_scattersiv4df;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERSIV2DF:
+ icode = CODE_FOR_avx512vl_scattersiv2df;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERDIV8SF:
+ icode = CODE_FOR_avx512vl_scatterdiv8sf;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERDIV4SF:
+ icode = CODE_FOR_avx512vl_scatterdiv4sf;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERDIV4DF:
+ icode = CODE_FOR_avx512vl_scatterdiv4df;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERDIV2DF:
+ icode = CODE_FOR_avx512vl_scatterdiv2df;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERSIV8SI:
+ icode = CODE_FOR_avx512vl_scattersiv8si;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERSIV4SI:
+ icode = CODE_FOR_avx512vl_scattersiv4si;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERSIV4DI:
+ icode = CODE_FOR_avx512vl_scattersiv4di;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERSIV2DI:
+ icode = CODE_FOR_avx512vl_scattersiv2di;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERDIV8SI:
+ icode = CODE_FOR_avx512vl_scatterdiv8si;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERDIV4SI:
+ icode = CODE_FOR_avx512vl_scatterdiv4si;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERDIV4DI:
+ icode = CODE_FOR_avx512vl_scatterdiv4di;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERDIV2DI:
+ icode = CODE_FOR_avx512vl_scatterdiv2di;
+ goto scatter_gen;
+ case IX86_BUILTIN_GATHERPFDPD:
+ icode = CODE_FOR_avx512pf_gatherpfv8sidf;
+ goto vec_prefetch_gen;
+ case IX86_BUILTIN_SCATTERALTSIV8DF:
+ icode = CODE_FOR_avx512f_scattersiv8df;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERALTDIV16SF:
+ icode = CODE_FOR_avx512f_scatterdiv16sf;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERALTSIV8DI:
+ icode = CODE_FOR_avx512f_scattersiv8di;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERALTDIV16SI:
+ icode = CODE_FOR_avx512f_scatterdiv16si;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERALTSIV4DF:
+ icode = CODE_FOR_avx512vl_scattersiv4df;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERALTDIV8SF:
+ icode = CODE_FOR_avx512vl_scatterdiv8sf;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERALTSIV4DI:
+ icode = CODE_FOR_avx512vl_scattersiv4di;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERALTDIV8SI:
+ icode = CODE_FOR_avx512vl_scatterdiv8si;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERALTSIV2DF:
+ icode = CODE_FOR_avx512vl_scattersiv2df;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERALTDIV4SF:
+ icode = CODE_FOR_avx512vl_scatterdiv4sf;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERALTSIV2DI:
+ icode = CODE_FOR_avx512vl_scattersiv2di;
+ goto scatter_gen;
+ case IX86_BUILTIN_SCATTERALTDIV4SI:
+ icode = CODE_FOR_avx512vl_scatterdiv4si;
+ goto scatter_gen;
+ case IX86_BUILTIN_GATHERPFDPS:
+ icode = CODE_FOR_avx512pf_gatherpfv16sisf;
+ goto vec_prefetch_gen;
+ case IX86_BUILTIN_GATHERPFQPD:
+ icode = CODE_FOR_avx512pf_gatherpfv8didf;
+ goto vec_prefetch_gen;
+ case IX86_BUILTIN_GATHERPFQPS:
+ icode = CODE_FOR_avx512pf_gatherpfv8disf;
+ goto vec_prefetch_gen;
+ case IX86_BUILTIN_SCATTERPFDPD:
+ icode = CODE_FOR_avx512pf_scatterpfv8sidf;
+ goto vec_prefetch_gen;
+ case IX86_BUILTIN_SCATTERPFDPS:
+ icode = CODE_FOR_avx512pf_scatterpfv16sisf;
+ goto vec_prefetch_gen;
+ case IX86_BUILTIN_SCATTERPFQPD:
+ icode = CODE_FOR_avx512pf_scatterpfv8didf;
+ goto vec_prefetch_gen;
+ case IX86_BUILTIN_SCATTERPFQPS:
+ icode = CODE_FOR_avx512pf_scatterpfv8disf;
+ goto vec_prefetch_gen;
+
+ gather_gen:
+ rtx half;
+ rtx (*gen) (rtx, rtx);
+
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ arg1 = CALL_EXPR_ARG (exp, 1);
+ arg2 = CALL_EXPR_ARG (exp, 2);
+ arg3 = CALL_EXPR_ARG (exp, 3);
+ arg4 = CALL_EXPR_ARG (exp, 4);
+ op0 = expand_normal (arg0);
+ op1 = expand_normal (arg1);
+ op2 = expand_normal (arg2);
+ op3 = expand_normal (arg3);
+ op4 = expand_normal (arg4);
+ /* Note the arg order is different from the operand order. */
+ mode0 = insn_data[icode].operand[1].mode;
+ mode2 = insn_data[icode].operand[3].mode;
+ mode3 = insn_data[icode].operand[4].mode;
+ mode4 = insn_data[icode].operand[5].mode;
+
+ if (target == NULL_RTX
+ || GET_MODE (target) != insn_data[icode].operand[0].mode
+ || !insn_data[icode].operand[0].predicate (target,
+ GET_MODE (target)))
+ subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
+ else
+ subtarget = target;
+
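+ /* For the *ALT* gather variants the index vector and the other
+ operands have different element counts; extract the low half of
+ whichever operands are too wide so that the counts agree,
+ mirroring the scatter handling further below. */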
+ switch (fcode)
+ {
+ case IX86_BUILTIN_GATHER3ALTSIV8DF:
+ case IX86_BUILTIN_GATHER3ALTSIV8DI:
+ half = gen_reg_rtx (V8SImode);
+ if (!nonimmediate_operand (op2, V16SImode))
+ op2 = copy_to_mode_reg (V16SImode, op2);
+ emit_insn (gen_vec_extract_lo_v16si (half, op2));
+ op2 = half;
+ break;
+ case IX86_BUILTIN_GATHER3ALTSIV4DF:
+ case IX86_BUILTIN_GATHER3ALTSIV4DI:
+ case IX86_BUILTIN_GATHERALTSIV4DF:
+ case IX86_BUILTIN_GATHERALTSIV4DI:
+ half = gen_reg_rtx (V4SImode);
+ if (!nonimmediate_operand (op2, V8SImode))
+ op2 = copy_to_mode_reg (V8SImode, op2);
+ emit_insn (gen_vec_extract_lo_v8si (half, op2));
+ op2 = half;
+ break;
+ case IX86_BUILTIN_GATHER3ALTDIV16SF:
+ case IX86_BUILTIN_GATHER3ALTDIV16SI:
+ half = gen_reg_rtx (mode0);
+ if (mode0 == V8SFmode)
+ gen = gen_vec_extract_lo_v16sf;
+ else
+ gen = gen_vec_extract_lo_v16si;
+ if (!nonimmediate_operand (op0, GET_MODE (op0)))
+ op0 = copy_to_mode_reg (GET_MODE (op0), op0);
+ emit_insn (gen (half, op0));
+ op0 = half;
+ op3 = lowpart_subreg (QImode, op3, HImode);
+ break;
+ case IX86_BUILTIN_GATHER3ALTDIV8SF:
+ case IX86_BUILTIN_GATHER3ALTDIV8SI:
+ case IX86_BUILTIN_GATHERALTDIV8SF:
+ case IX86_BUILTIN_GATHERALTDIV8SI:
+ half = gen_reg_rtx (mode0);
+ if (mode0 == V4SFmode)
+ gen = gen_vec_extract_lo_v8sf;
+ else
+ gen = gen_vec_extract_lo_v8si;
+ if (!nonimmediate_operand (op0, GET_MODE (op0)))
+ op0 = copy_to_mode_reg (GET_MODE (op0), op0);
+ emit_insn (gen (half, op0));
+ op0 = half;
+ if (VECTOR_MODE_P (GET_MODE (op3)))
+ {
+ half = gen_reg_rtx (mode0);
+ if (!nonimmediate_operand (op3, GET_MODE (op3)))
+ op3 = copy_to_mode_reg (GET_MODE (op3), op3);
+ emit_insn (gen (half, op3));
+ op3 = half;
+ }
+ break;
+ default:
+ break;
+ }
+
+ /* Force the memory operand to be addressed through just a base
+ register here. We don't want to do this for the memory operands
+ of other builtin functions. */
+ op1 = ix86_zero_extend_to_Pmode (op1);
+
+ if (!insn_data[icode].operand[1].predicate (op0, mode0))
+ op0 = copy_to_mode_reg (mode0, op0);
+ if (!insn_data[icode].operand[2].predicate (op1, Pmode))
+ op1 = copy_to_mode_reg (Pmode, op1);
+ if (!insn_data[icode].operand[3].predicate (op2, mode2))
+ op2 = copy_to_mode_reg (mode2, op2);
+
+ op3 = fixup_modeless_constant (op3, mode3);
+
+ if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
+ {
+ if (!insn_data[icode].operand[4].predicate (op3, mode3))
+ op3 = copy_to_mode_reg (mode3, op3);
+ }
+ else
+ {
+ op3 = copy_to_reg (op3);
+ op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
+ }
+ if (!insn_data[icode].operand[5].predicate (op4, mode4))
+ {
+ error ("the last argument must be scale 1, 2, 4, 8");
+ return const0_rtx;
+ }
+
+ /* Optimize. If mask is known to have all high bits set,
+ replace op0 with pc_rtx to signal that the instruction
+ overwrites the whole destination and doesn't use its
+ previous contents. */
+ if (optimize)
+ {
+ if (TREE_CODE (arg3) == INTEGER_CST)
+ {
+ if (integer_all_onesp (arg3))
+ op0 = pc_rtx;
+ }
+ else if (TREE_CODE (arg3) == VECTOR_CST)
+ {
+ unsigned int negative = 0;
+ for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
+ {
+ tree cst = VECTOR_CST_ELT (arg3, i);
+ if (TREE_CODE (cst) == INTEGER_CST
+ && tree_int_cst_sign_bit (cst))
+ negative++;
+ else if (TREE_CODE (cst) == REAL_CST
+ && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
+ negative++;
+ }
+ if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
+ op0 = pc_rtx;
+ }
+ else if (TREE_CODE (arg3) == SSA_NAME
+ && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
+ {
+ /* Recognize also when mask is like:
+ __v2df src = _mm_setzero_pd ();
+ __v2df mask = _mm_cmpeq_pd (src, src);
+ or
+ __v8sf src = _mm256_setzero_ps ();
+ __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
+ as that is a cheaper way to load all ones into
+ a register than having to load a constant from
+ memory. */
+ gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
+ if (is_gimple_call (def_stmt))
+ {
+ tree fndecl = gimple_call_fndecl (def_stmt);
+ if (fndecl
+ && fndecl_built_in_p (fndecl, BUILT_IN_MD))
+ switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
+ {
+ case IX86_BUILTIN_CMPPD:
+ case IX86_BUILTIN_CMPPS:
+ case IX86_BUILTIN_CMPPD256:
+ case IX86_BUILTIN_CMPPS256:
+ if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
+ break;
+ /* FALLTHRU */
+ case IX86_BUILTIN_CMPEQPD:
+ case IX86_BUILTIN_CMPEQPS:
+ if (initializer_zerop (gimple_call_arg (def_stmt, 0))
+ && initializer_zerop (gimple_call_arg (def_stmt,
+ 1)))
+ op0 = pc_rtx;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ }
+
+ pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
+ if (! pat)
+ return const0_rtx;
+ emit_insn (pat);
+
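+ /* Some gathers only fill the low half of the pattern's output
+ mode; extract that half of SUBTARGET into the user-visible
+ target. */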
+ switch (fcode)
+ {
+ case IX86_BUILTIN_GATHER3DIV16SF:
+ if (target == NULL_RTX)
+ target = gen_reg_rtx (V8SFmode);
+ emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
+ break;
+ case IX86_BUILTIN_GATHER3DIV16SI:
+ if (target == NULL_RTX)
+ target = gen_reg_rtx (V8SImode);
+ emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
+ break;
+ case IX86_BUILTIN_GATHER3DIV8SF:
+ case IX86_BUILTIN_GATHERDIV8SF:
+ if (target == NULL_RTX)
+ target = gen_reg_rtx (V4SFmode);
+ emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
+ break;
+ case IX86_BUILTIN_GATHER3DIV8SI:
+ case IX86_BUILTIN_GATHERDIV8SI:
+ if (target == NULL_RTX)
+ target = gen_reg_rtx (V4SImode);
+ emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
+ break;
+ default:
+ target = subtarget;
+ break;
+ }
+ return target;
+
+ scatter_gen:
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ arg1 = CALL_EXPR_ARG (exp, 1);
+ arg2 = CALL_EXPR_ARG (exp, 2);
+ arg3 = CALL_EXPR_ARG (exp, 3);
+ arg4 = CALL_EXPR_ARG (exp, 4);
+ op0 = expand_normal (arg0);
+ op1 = expand_normal (arg1);
+ op2 = expand_normal (arg2);
+ op3 = expand_normal (arg3);
+ op4 = expand_normal (arg4);
+ mode1 = insn_data[icode].operand[1].mode;
+ mode2 = insn_data[icode].operand[2].mode;
+ mode3 = insn_data[icode].operand[3].mode;
+ mode4 = insn_data[icode].operand[4].mode;
+
+ /* The scatter instruction stores operand op3 to memory using
+ indices from op2 and scale from op4 under writemask op1.
+ If the index operand op2 has more elements than the source
+ operand op3, only its low half is used, and vice versa. */
+ switch (fcode)
+ {
+ case IX86_BUILTIN_SCATTERALTSIV8DF:
+ case IX86_BUILTIN_SCATTERALTSIV8DI:
+ half = gen_reg_rtx (V8SImode);
+ if (!nonimmediate_operand (op2, V16SImode))
+ op2 = copy_to_mode_reg (V16SImode, op2);
+ emit_insn (gen_vec_extract_lo_v16si (half, op2));
+ op2 = half;
+ break;
+ case IX86_BUILTIN_SCATTERALTDIV16SF:
+ case IX86_BUILTIN_SCATTERALTDIV16SI:
+ half = gen_reg_rtx (mode3);
+ if (mode3 == V8SFmode)
+ gen = gen_vec_extract_lo_v16sf;
+ else
+ gen = gen_vec_extract_lo_v16si;
+ if (!nonimmediate_operand (op3, GET_MODE (op3)))
+ op3 = copy_to_mode_reg (GET_MODE (op3), op3);
+ emit_insn (gen (half, op3));
+ op3 = half;
+ break;
+ case IX86_BUILTIN_SCATTERALTSIV4DF:
+ case IX86_BUILTIN_SCATTERALTSIV4DI:
+ half = gen_reg_rtx (V4SImode);
+ if (!nonimmediate_operand (op2, V8SImode))
+ op2 = copy_to_mode_reg (V8SImode, op2);
+ emit_insn (gen_vec_extract_lo_v8si (half, op2));
+ op2 = half;
+ break;
+ case IX86_BUILTIN_SCATTERALTDIV8SF:
+ case IX86_BUILTIN_SCATTERALTDIV8SI:
+ half = gen_reg_rtx (mode3);
+ if (mode3 == V4SFmode)
+ gen = gen_vec_extract_lo_v8sf;
+ else
+ gen = gen_vec_extract_lo_v8si;
+ if (!nonimmediate_operand (op3, GET_MODE (op3)))
+ op3 = copy_to_mode_reg (GET_MODE (op3), op3);
+ emit_insn (gen (half, op3));
+ op3 = half;
+ break;
+ case IX86_BUILTIN_SCATTERALTSIV2DF:
+ case IX86_BUILTIN_SCATTERALTSIV2DI:
+ if (!nonimmediate_operand (op2, V4SImode))
+ op2 = copy_to_mode_reg (V4SImode, op2);
+ break;
+ case IX86_BUILTIN_SCATTERALTDIV4SF:
+ case IX86_BUILTIN_SCATTERALTDIV4SI:
+ if (!nonimmediate_operand (op3, GET_MODE (op3)))
+ op3 = copy_to_mode_reg (GET_MODE (op3), op3);
+ break;
+ default:
+ break;
+ }
+
+ /* Force the memory operand to be addressed through just a base
+ register here. We don't want to do this for the memory operands
+ of other builtin functions. */
+ op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
+
+ if (!insn_data[icode].operand[0].predicate (op0, Pmode))
+ op0 = copy_to_mode_reg (Pmode, op0);
+
+ op1 = fixup_modeless_constant (op1, mode1);
+
+ if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
+ {
+ if (!insn_data[icode].operand[1].predicate (op1, mode1))
+ op1 = copy_to_mode_reg (mode1, op1);
+ }
+ else
+ {
+ op1 = copy_to_reg (op1);
+ op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
+ }
+
+ if (!insn_data[icode].operand[2].predicate (op2, mode2))
+ op2 = copy_to_mode_reg (mode2, op2);
+
+ if (!insn_data[icode].operand[3].predicate (op3, mode3))
+ op3 = copy_to_mode_reg (mode3, op3);
+
+ if (!insn_data[icode].operand[4].predicate (op4, mode4))
+ {
+ error ("the last argument must be scale 1, 2, 4, 8");
+ return const0_rtx;
+ }
+
+ pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
+ if (! pat)
+ return const0_rtx;
+
+ emit_insn (pat);
+ return 0;
+
+ vec_prefetch_gen:
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ arg1 = CALL_EXPR_ARG (exp, 1);
+ arg2 = CALL_EXPR_ARG (exp, 2);
+ arg3 = CALL_EXPR_ARG (exp, 3);
+ arg4 = CALL_EXPR_ARG (exp, 4);
+ op0 = expand_normal (arg0);
+ op1 = expand_normal (arg1);
+ op2 = expand_normal (arg2);
+ op3 = expand_normal (arg3);
+ op4 = expand_normal (arg4);
+ mode0 = insn_data[icode].operand[0].mode;
+ mode1 = insn_data[icode].operand[1].mode;
+ mode3 = insn_data[icode].operand[3].mode;
+ mode4 = insn_data[icode].operand[4].mode;
+
+ op0 = fixup_modeless_constant (op0, mode0);
+
+ if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
+ {
+ if (!insn_data[icode].operand[0].predicate (op0, mode0))
+ op0 = copy_to_mode_reg (mode0, op0);
+ }
+ else
+ {
+ op0 = copy_to_reg (op0);
+ op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
+ }
+
+ if (!insn_data[icode].operand[1].predicate (op1, mode1))
+ op1 = copy_to_mode_reg (mode1, op1);
+
+ /* Force the memory operand to be addressed through just a base
+ register here. We don't want to do this for the memory operands
+ of other builtin functions. */
+ op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
+
+ if (!insn_data[icode].operand[2].predicate (op2, Pmode))
+ op2 = copy_to_mode_reg (Pmode, op2);
+
+ if (!insn_data[icode].operand[3].predicate (op3, mode3))
+ {
+ error ("the forth argument must be scale 1, 2, 4, 8");
+ return const0_rtx;
+ }
+
+ if (!insn_data[icode].operand[4].predicate (op4, mode4))
+ {
+ error ("incorrect hint operand");
+ return const0_rtx;
+ }
+
+ pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
+ if (! pat)
+ return const0_rtx;
+
+ emit_insn (pat);
+
+ return 0;
+
+ case IX86_BUILTIN_XABORT:
+ icode = CODE_FOR_xabort;
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ op0 = expand_normal (arg0);
+ mode0 = insn_data[icode].operand[0].mode;
+ if (!insn_data[icode].operand[0].predicate (op0, mode0))
+ {
+ error ("the argument to %<xabort%> intrinsic must "
+ "be an 8-bit immediate");
+ return const0_rtx;
+ }
+ emit_insn (gen_xabort (op0));
+ return 0;
+
+ case IX86_BUILTIN_RSTORSSP:
+ case IX86_BUILTIN_CLRSSBSY:
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ op0 = expand_normal (arg0);
+ icode = (fcode == IX86_BUILTIN_RSTORSSP
+ ? CODE_FOR_rstorssp
+ : CODE_FOR_clrssbsy);
+ if (!address_operand (op0, VOIDmode))
+ {
+ op1 = convert_memory_address (Pmode, op0);
+ op0 = copy_addr_to_reg (op1);
+ }
+ emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
+ return 0;
+
+ case IX86_BUILTIN_WRSSD:
+ case IX86_BUILTIN_WRSSQ:
+ case IX86_BUILTIN_WRUSSD:
+ case IX86_BUILTIN_WRUSSQ:
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ op0 = expand_normal (arg0);
+ arg1 = CALL_EXPR_ARG (exp, 1);
+ op1 = expand_normal (arg1);
+ switch (fcode)
+ {
+ case IX86_BUILTIN_WRSSD:
+ icode = CODE_FOR_wrsssi;
+ mode = SImode;
+ break;
+ case IX86_BUILTIN_WRSSQ:
+ icode = CODE_FOR_wrssdi;
+ mode = DImode;
+ break;
+ case IX86_BUILTIN_WRUSSD:
+ icode = CODE_FOR_wrusssi;
+ mode = SImode;
+ break;
+ case IX86_BUILTIN_WRUSSQ:
+ icode = CODE_FOR_wrussdi;
+ mode = DImode;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ op0 = force_reg (mode, op0);
+ if (!address_operand (op1, VOIDmode))
+ {
+ op2 = convert_memory_address (Pmode, op1);
+ op1 = copy_addr_to_reg (op2);
+ }
+ emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
+ return 0;
+
+ default:
+ break;
+ }
+
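+ /* Everything not handled above is dispatched to the generic
+ expanders according to the descriptor table its function code
+ falls into. */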
+ if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
+ && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
+ {
+ i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
+ return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
+ target);
+ }
+
+ if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
+ && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
+ {
+ i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
+ rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
+ rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
+ rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
+ int masked = 1;
+ machine_mode mode, wide_mode, nar_mode;
+
+ nar_mode = V4SFmode;
+ mode = V16SFmode;
+ wide_mode = V64SFmode;
+ fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
+ fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
+
+ switch (fcode)
+ {
+ case IX86_BUILTIN_4FMAPS:
+ fcn = gen_avx5124fmaddps_4fmaddps;
+ masked = 0;
+ goto v4fma_expand;
+
+ case IX86_BUILTIN_4DPWSSD:
+ nar_mode = V4SImode;
+ mode = V16SImode;
+ wide_mode = V64SImode;
+ fcn = gen_avx5124vnniw_vp4dpwssd;
+ masked = 0;
+ goto v4fma_expand;
+
+ case IX86_BUILTIN_4DPWSSDS:
+ nar_mode = V4SImode;
+ mode = V16SImode;
+ wide_mode = V64SImode;
+ fcn = gen_avx5124vnniw_vp4dpwssds;
+ masked = 0;
+ goto v4fma_expand;
+
+ case IX86_BUILTIN_4FNMAPS:
+ fcn = gen_avx5124fmaddps_4fnmaddps;
+ masked = 0;
+ goto v4fma_expand;
+
+ case IX86_BUILTIN_4FNMAPS_MASK:
+ fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
+ fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
+ goto v4fma_expand;
+
+ case IX86_BUILTIN_4DPWSSD_MASK:
+ nar_mode = V4SImode;
+ mode = V16SImode;
+ wide_mode = V64SImode;
+ fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
+ fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
+ goto v4fma_expand;
+
+ case IX86_BUILTIN_4DPWSSDS_MASK:
+ nar_mode = V4SImode;
+ mode = V16SImode;
+ wide_mode = V64SImode;
+ fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
+ fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
+ goto v4fma_expand;
+
+ case IX86_BUILTIN_4FMAPS_MASK:
+ {
+ tree args[4];
+ rtx ops[4];
+ rtx wide_reg;
+ rtx accum;
+ rtx addr;
+ rtx mem;
+
+v4fma_expand:
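+ /* Copy the four vector arguments into the quarters of one wide
+ pseudo; the machine instruction reads a block of four
+ consecutive registers. Then combine it with the accumulator
+ and the narrow memory operand. */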
+ wide_reg = gen_reg_rtx (wide_mode);
+ for (i = 0; i < 4; i++)
+ {
+ args[i] = CALL_EXPR_ARG (exp, i);
+ ops[i] = expand_normal (args[i]);
+
+ emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
+ ops[i]);
+ }
+
+ accum = expand_normal (CALL_EXPR_ARG (exp, 4));
+ accum = force_reg (mode, accum);
+
+ addr = expand_normal (CALL_EXPR_ARG (exp, 5));
+ addr = force_reg (Pmode, addr);
+
+ mem = gen_rtx_MEM (nar_mode, addr);
+
+ target = gen_reg_rtx (mode);
+
+ emit_move_insn (target, accum);
+
+ if (! masked)
+ emit_insn (fcn (target, accum, wide_reg, mem));
+ else
+ {
+ rtx merge, mask;
+ merge = expand_normal (CALL_EXPR_ARG (exp, 6));
+
+ mask = expand_normal (CALL_EXPR_ARG (exp, 7));
+
+ if (CONST_INT_P (mask))
+ mask = fixup_modeless_constant (mask, HImode);
+
+ mask = force_reg (HImode, mask);
+
+ if (GET_MODE (mask) != HImode)
+ mask = gen_rtx_SUBREG (HImode, mask, 0);
+
+ /* If merge is 0 then we're about to emit the z-masked variant. */
+ if (const0_operand (merge, mode))
+ emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
+ /* If merge is the same as accum then emit the merge-masked variant. */
+ else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
+ {
+ merge = force_reg (mode, merge);
+ emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
+ }
+ /* Merging with something unknown can happen if we z-mask with -O0. */
+ else
+ {
+ target = gen_reg_rtx (mode);
+ emit_move_insn (target, merge);
+ emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
+ }
+ }
+ return target;
+ }
+
+ case IX86_BUILTIN_4FNMASS:
+ fcn = gen_avx5124fmaddps_4fnmaddss;
+ masked = 0;
+ goto s4fma_expand;
+
+ case IX86_BUILTIN_4FMASS:
+ fcn = gen_avx5124fmaddps_4fmaddss;
+ masked = 0;
+ goto s4fma_expand;
+
+ case IX86_BUILTIN_4FNMASS_MASK:
+ fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
+ fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
+ goto s4fma_expand;
+
+ case IX86_BUILTIN_4FMASS_MASK:
+ {
+ tree args[4];
+ rtx ops[4];
+ rtx wide_reg;
+ rtx accum;
+ rtx addr;
+ rtx mem;
+
+ fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
+ fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
+
+s4fma_expand:
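+ /* The scalar forms only use the low SF element of each vector
+ argument; move it into the corresponding quarter of the wide
+ pseudo via a paradoxical SUBREG. */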
+ mode = V4SFmode;
+ wide_reg = gen_reg_rtx (V64SFmode);
+ for (i = 0; i < 4; i++)
+ {
+ rtx tmp;
+ args[i] = CALL_EXPR_ARG (exp, i);
+ ops[i] = expand_normal (args[i]);
+
+ tmp = gen_reg_rtx (SFmode);
+ emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
+
+ emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
+ gen_rtx_SUBREG (V16SFmode, tmp, 0));
+ }
+
+ accum = expand_normal (CALL_EXPR_ARG (exp, 4));
+ accum = force_reg (V4SFmode, accum);
+
+ addr = expand_normal (CALL_EXPR_ARG (exp, 5));
+ addr = force_reg (Pmode, addr);
+
+ mem = gen_rtx_MEM (V4SFmode, addr);
+
+ target = gen_reg_rtx (V4SFmode);
+
+ emit_move_insn (target, accum);
+
+ if (! masked)
+ emit_insn (fcn (target, accum, wide_reg, mem));
+ else
+ {
+ rtx merge, mask;
+ merge = expand_normal (CALL_EXPR_ARG (exp, 6));
+
+ mask = expand_normal (CALL_EXPR_ARG (exp, 7));
+
+ if (CONST_INT_P (mask))
+ mask = fixup_modeless_constant (mask, QImode);
+
+ mask = force_reg (QImode, mask);
+
+ if (GET_MODE (mask) != QImode)
+ mask = gen_rtx_SUBREG (QImode, mask, 0);
+
+ /* If merge is 0 then we're about to emit the z-masked variant. */
+ if (const0_operand (merge, mode))
+ emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
+ /* If merge is the same as accum then emit the merge-masked
+ variant. */
+ else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
+ {
+ merge = force_reg (mode, merge);
+ emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
+ }
+ /* Merging with something unknown can happen if we z-mask
+ with -O0. */
+ else
+ {
+ target = gen_reg_rtx (mode);
+ emit_move_insn (target, merge);
+ emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
+ }
+ }
+ return target;
+ }
+ case IX86_BUILTIN_RDPID:
+ return ix86_expand_special_args_builtin (bdesc_args + i, exp,
+ target);
+ case IX86_BUILTIN_FABSQ:
+ case IX86_BUILTIN_COPYSIGNQ:
+ if (!TARGET_SSE)
+ /* Emit a normal call if SSE isn't available. */
+ return expand_call (exp, target, ignore);
+ /* FALLTHRU */
+ default:
+ return ix86_expand_args_builtin (bdesc_args + i, exp, target);
+ }
+ }
+
+ if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
+ && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
+ {
+ i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
+ return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
+ }
+
+ if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
+ && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
+ {
+ i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
+ return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
+ }
+
+ if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
+ && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
+ {
+ i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
+ return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
+ }
+
+ if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
+ && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
+ {
+ i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
+ return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
+ }
+
+ if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
+ && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
+ {
+ i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
+ const struct builtin_description *d = bdesc_multi_arg + i;
+ return ix86_expand_multi_arg_builtin (d->icode, exp, target,
+ (enum ix86_builtin_func_type)
+ d->flag, d->comparison);
+ }
+
+ if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
+ && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
+ {
+ i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
+ return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
+ target);
+ }
+
+ if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
+ && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
+ {
+ i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
+ return ix86_expand_special_args_builtin (bdesc_cet_rdssp + i, exp,
+ target);
+ }
+
+ gcc_unreachable ();
+}
+
+/* A subroutine of ix86_expand_vector_init_duplicate. Tries to
+ fill TARGET with VAL via vec_duplicate. */
+
+static bool
+ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
+{
+ bool ok;
+ rtx_insn *insn;
+ rtx dup;
+
+ /* First attempt to recognize VAL as-is. */
+ dup = gen_vec_duplicate (mode, val);
+ insn = emit_insn (gen_rtx_SET (target, dup));
+ if (recog_memoized (insn) < 0)
+ {
+ rtx_insn *seq;
+ machine_mode innermode = GET_MODE_INNER (mode);
+ rtx reg;
+
+ /* If that fails, force VAL into a register. */
+
+ start_sequence ();
+ reg = force_reg (innermode, val);
+ if (GET_MODE (reg) != innermode)
+ reg = gen_lowpart (innermode, reg);
+ SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
+ seq = get_insns ();
+ end_sequence ();
+ if (seq)
+ emit_insn_before (seq, insn);
+
+ ok = recog_memoized (insn) >= 0;
+ gcc_assert (ok);
+ }
+ return true;
+}
+
+/* Get a vector mode of the same size as the original but with elements
+ twice as wide. This is only guaranteed to apply to integral vectors. */
+
+static machine_mode
+get_mode_wider_vector (machine_mode o)
+{
+ /* ??? Rely on the ordering that genmodes.c gives to vectors. */
+ machine_mode n = GET_MODE_WIDER_MODE (o).require ();
+ gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
+ gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
+ return n;
+}
+
+static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
+static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
+
+/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
+ with all elements equal to VAR. Return true if successful. */
+
+static bool
+ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
+ rtx target, rtx val)
+{
+ bool ok;
+
+ switch (mode)
+ {
+ case E_V2SImode:
+ case E_V2SFmode:
+ if (!mmx_ok)
+ return false;
+ /* FALLTHRU */
+
+ case E_V4DFmode:
+ case E_V4DImode:
+ case E_V8SFmode:
+ case E_V8SImode:
+ case E_V2DFmode:
+ case E_V2DImode:
+ case E_V4SFmode:
+ case E_V4SImode:
+ case E_V16SImode:
+ case E_V8DImode:
+ case E_V16SFmode:
+ case E_V8DFmode:
+ return ix86_vector_duplicate_value (mode, target, val);
+
+ case E_V4HImode:
+ if (!mmx_ok)
+ return false;
+ if (TARGET_SSE || TARGET_3DNOW_A)
+ {
+ rtx x;
+
+ val = gen_lowpart (SImode, val);
+ x = gen_rtx_TRUNCATE (HImode, val);
+ x = gen_rtx_VEC_DUPLICATE (mode, x);
+ emit_insn (gen_rtx_SET (target, x));
+ return true;
+ }
+ goto widen;
+
+ case E_V8QImode:
+ if (!mmx_ok)
+ return false;
+ goto widen;
+
+ case E_V8HImode:
+ if (TARGET_AVX2)
+ return ix86_vector_duplicate_value (mode, target, val);
+
+ if (TARGET_SSE2)
+ {
+ struct expand_vec_perm_d dperm;
+ rtx tmp1, tmp2;
+
+ permute:
+ memset (&dperm, 0, sizeof (dperm));
+ dperm.target = target;
+ dperm.vmode = mode;
+ dperm.nelt = GET_MODE_NUNITS (mode);
+ dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
+ dperm.one_operand_p = true;
+
+ /* Extend to SImode using a paradoxical SUBREG. */
+ tmp1 = gen_reg_rtx (SImode);
+ emit_move_insn (tmp1, gen_lowpart (SImode, val));
+
+ /* Insert the SImode value as low element of a V4SImode vector. */
+ tmp2 = gen_reg_rtx (V4SImode);
+ emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
+ emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
+
+ ok = (expand_vec_perm_1 (&dperm)
+ || expand_vec_perm_broadcast_1 (&dperm));
+ gcc_assert (ok);
+ return ok;
+ }
+ goto widen;
+
+ case E_V16QImode:
+ if (TARGET_AVX2)
+ return ix86_vector_duplicate_value (mode, target, val);
+
+ if (TARGET_SSE2)
+ goto permute;
+ goto widen;
+
+ widen:
+ /* Replicate the value once into the next wider mode and recurse. */
+ {
+ machine_mode smode, wsmode, wvmode;
+ rtx x;
+
+ smode = GET_MODE_INNER (mode);
+ wvmode = get_mode_wider_vector (mode);
+ wsmode = GET_MODE_INNER (wvmode);
+
+ val = convert_modes (wsmode, smode, val, true);
+ x = expand_simple_binop (wsmode, ASHIFT, val,
+ GEN_INT (GET_MODE_BITSIZE (smode)),
+ NULL_RTX, 1, OPTAB_LIB_WIDEN);
+ val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
+
+ x = gen_reg_rtx (wvmode);
+ ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
+ gcc_assert (ok);
+ emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
+ return ok;
+ }
+
+ case E_V16HImode:
+ case E_V32QImode:
+ if (TARGET_AVX2)
+ return ix86_vector_duplicate_value (mode, target, val);
+ else
+ {
+ machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
+ rtx x = gen_reg_rtx (hvmode);
+
+ ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
+ gcc_assert (ok);
+
+ x = gen_rtx_VEC_CONCAT (mode, x, x);
+ emit_insn (gen_rtx_SET (target, x));
+ }
+ return true;
+
+ case E_V64QImode:
+ case E_V32HImode:
+ if (TARGET_AVX512BW)
+ return ix86_vector_duplicate_value (mode, target, val);
+ else
+ {
+ machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
+ rtx x = gen_reg_rtx (hvmode);
+
+ ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
+ gcc_assert (ok);
+
+ x = gen_rtx_VEC_CONCAT (mode, x, x);
+ emit_insn (gen_rtx_SET (target, x));
+ }
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
+ whose ONE_VAR element is VAR, and other elements are zero. Return true
+ if successful. */
+
+static bool
+ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
+ rtx target, rtx var, int one_var)
+{
+ machine_mode vsimode;
+ rtx new_target;
+ rtx x, tmp;
+ bool use_vector_set = false;
+ rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
+
+ switch (mode)
+ {
+ case E_V2DImode:
+ /* For SSE4.1, we normally use vector set. But if the second
+ element is zero and inter-unit moves are OK, we use movq
+ instead. */
+ use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
+ && !(TARGET_INTER_UNIT_MOVES_TO_VEC
+ && one_var == 0));
+ break;
+ case E_V16QImode:
+ case E_V4SImode:
+ case E_V4SFmode:
+ use_vector_set = TARGET_SSE4_1;
+ break;
+ case E_V8HImode:
+ use_vector_set = TARGET_SSE2;
+ break;
+ case E_V4HImode:
+ use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
+ break;
+ case E_V32QImode:
+ case E_V16HImode:
+ use_vector_set = TARGET_AVX;
+ break;
+ case E_V8SImode:
+ use_vector_set = TARGET_AVX;
+ gen_vec_set_0 = gen_vec_setv8si_0;
+ break;
+ case E_V8SFmode:
+ use_vector_set = TARGET_AVX;
+ gen_vec_set_0 = gen_vec_setv8sf_0;
+ break;
+ case E_V4DFmode:
+ use_vector_set = TARGET_AVX;
+ gen_vec_set_0 = gen_vec_setv4df_0;
+ break;
+ case E_V4DImode:
+ /* Use ix86_expand_vector_set in 64-bit mode only. */
+ use_vector_set = TARGET_AVX && TARGET_64BIT;
+ gen_vec_set_0 = gen_vec_setv4di_0;
+ break;
+ case E_V16SImode:
+ use_vector_set = TARGET_AVX512F && one_var == 0;
+ gen_vec_set_0 = gen_vec_setv16si_0;
+ break;
+ case E_V16SFmode:
+ use_vector_set = TARGET_AVX512F && one_var == 0;
+ gen_vec_set_0 = gen_vec_setv16sf_0;
+ break;
+ case E_V8DFmode:
+ use_vector_set = TARGET_AVX512F && one_var == 0;
+ gen_vec_set_0 = gen_vec_setv8df_0;
+ break;
+ case E_V8DImode:
+ /* Use ix86_expand_vector_set in 64-bit mode only. */
+ use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
+ gen_vec_set_0 = gen_vec_setv8di_0;
+ break;
+ default:
+ break;
+ }
+
+ if (use_vector_set)
+ {
+ if (gen_vec_set_0 && one_var == 0)
+ {
+ var = force_reg (GET_MODE_INNER (mode), var);
+ emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
+ return true;
+ }
+ emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
+ var = force_reg (GET_MODE_INNER (mode), var);
+ ix86_expand_vector_set (mmx_ok, target, var, one_var);
+ return true;
+ }
+
+ switch (mode)
+ {
+ case E_V2SFmode:
+ case E_V2SImode:
+ if (!mmx_ok)
+ return false;
+ /* FALLTHRU */
+
+ case E_V2DFmode:
+ case E_V2DImode:
+ if (one_var != 0)
+ return false;
+ var = force_reg (GET_MODE_INNER (mode), var);
+ x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
+ emit_insn (gen_rtx_SET (target, x));
+ return true;
+
+ case E_V4SFmode:
+ case E_V4SImode:
+ if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+ new_target = gen_reg_rtx (mode);
+ else
+ new_target = target;
+ var = force_reg (GET_MODE_INNER (mode), var);
+ x = gen_rtx_VEC_DUPLICATE (mode, var);
+ x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
+ emit_insn (gen_rtx_SET (new_target, x));
+ if (one_var != 0)
+ {
+ /* We need to shuffle the value to the correct position, so
+ create a new pseudo to store the intermediate result. */
+
+ /* With SSE2, we can use the integer shuffle insns. */
+ if (mode != V4SFmode && TARGET_SSE2)
+ {
+ emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
+ const1_rtx,
+ GEN_INT (one_var == 1 ? 0 : 1),
+ GEN_INT (one_var == 2 ? 0 : 1),
+ GEN_INT (one_var == 3 ? 0 : 1)));
+ if (target != new_target)
+ emit_move_insn (target, new_target);
+ return true;
+ }
+
+ /* Otherwise convert the intermediate result to V4SFmode and
+ use the SSE1 shuffle instructions. */
+ if (mode != V4SFmode)
+ {
+ tmp = gen_reg_rtx (V4SFmode);
+ emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
+ }
+ else
+ tmp = new_target;
+
+ emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
+ const1_rtx,
+ GEN_INT (one_var == 1 ? 0 : 1),
+ GEN_INT (one_var == 2 ? 0+4 : 1+4),
+ GEN_INT (one_var == 3 ? 0+4 : 1+4)));
+
+ if (mode != V4SFmode)
+ emit_move_insn (target, gen_lowpart (V4SImode, tmp));
+ else if (tmp != target)
+ emit_move_insn (target, tmp);
+ }
+ else if (target != new_target)
+ emit_move_insn (target, new_target);
+ return true;
+
+ case E_V8HImode:
+ case E_V16QImode:
+ vsimode = V4SImode;
+ goto widen;
+ case E_V4HImode:
+ case E_V8QImode:
+ if (!mmx_ok)
+ return false;
+ vsimode = V2SImode;
+ goto widen;
+ widen:
+ if (one_var != 0)
+ return false;
+
+ /* Zero extend the variable element to SImode and recurse. */
+ var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
+
+ x = gen_reg_rtx (vsimode);
+ if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
+ var, one_var))
+ gcc_unreachable ();
+
+ emit_move_insn (target, gen_lowpart (mode, x));
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
+ consisting of the values in VALS. It is known that all elements
+ except ONE_VAR are constants. Return true if successful. */
+
+static bool
+ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
+ rtx target, rtx vals, int one_var)
+{
+ rtx var = XVECEXP (vals, 0, one_var);
+ machine_mode wmode;
+ rtx const_vec, x;
+
+ const_vec = copy_rtx (vals);
+ XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
+ const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
+
+ switch (mode)
+ {
+ case E_V2DFmode:
+ case E_V2DImode:
+ case E_V2SFmode:
+ case E_V2SImode:
+ /* For the two element vectors, it's just as easy to use
+ the general case. */
+ return false;
+
+ case E_V4DImode:
+ /* Use ix86_expand_vector_set in 64-bit mode only. */
+ if (!TARGET_64BIT)
+ return false;
+ /* FALLTHRU */
+ case E_V4DFmode:
+ case E_V8SFmode:
+ case E_V8SImode:
+ case E_V16HImode:
+ case E_V32QImode:
+ case E_V4SFmode:
+ case E_V4SImode:
+ case E_V8HImode:
+ case E_V4HImode:
+ break;
+
+ case E_V16QImode:
+ if (TARGET_SSE4_1)
+ break;
+ wmode = V8HImode;
+ goto widen;
+ case E_V8QImode:
+ wmode = V4HImode;
+ goto widen;
+ widen:
+ /* There's no way to set one QImode entry easily. Combine
+ the variable value with its adjacent constant value, and
+ promote to an HImode set. */
+ x = XVECEXP (vals, 0, one_var ^ 1);
+ if (one_var & 1)
+ {
+ var = convert_modes (HImode, QImode, var, true);
+ var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
+ NULL_RTX, 1, OPTAB_LIB_WIDEN);
+ x = GEN_INT (INTVAL (x) & 0xff);
+ }
+ else
+ {
+ var = convert_modes (HImode, QImode, var, true);
+ x = gen_int_mode (UINTVAL (x) << 8, HImode);
+ }
+ if (x != const0_rtx)
+ var = expand_simple_binop (HImode, IOR, var, x, var,
+ 1, OPTAB_LIB_WIDEN);
+
+ x = gen_reg_rtx (wmode);
+ emit_move_insn (x, gen_lowpart (wmode, const_vec));
+ ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
+
+ emit_move_insn (target, gen_lowpart (mode, x));
+ return true;
+
+ default:
+ return false;
+ }
+
+ emit_move_insn (target, const_vec);
+ ix86_expand_vector_set (mmx_ok, target, var, one_var);
+ return true;
+}
+
+/* A subroutine of ix86_expand_vector_init_general. Use vector
+ concatenate to handle the most general case: all values variable,
+ and none identical. */
+
+static void
+ix86_expand_vector_init_concat (machine_mode mode,
+ rtx target, rtx *ops, int n)
+{
+ machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
+ rtx first[16], second[8], third[4];
+ rtvec v;
+ int i, j;
+
+ switch (n)
+ {
+ case 2:
+ switch (mode)
+ {
+ case E_V16SImode:
+ cmode = V8SImode;
+ break;
+ case E_V16SFmode:
+ cmode = V8SFmode;
+ break;
+ case E_V8DImode:
+ cmode = V4DImode;
+ break;
+ case E_V8DFmode:
+ cmode = V4DFmode;
+ break;
+ case E_V8SImode:
+ cmode = V4SImode;
+ break;
+ case E_V8SFmode:
+ cmode = V4SFmode;
+ break;
+ case E_V4DImode:
+ cmode = V2DImode;
+ break;
+ case E_V4DFmode:
+ cmode = V2DFmode;
+ break;
+ case E_V4SImode:
+ cmode = V2SImode;
+ break;
+ case E_V4SFmode:
+ cmode = V2SFmode;
+ break;
+ case E_V2DImode:
+ cmode = DImode;
+ break;
+ case E_V2SImode:
+ cmode = SImode;
+ break;
+ case E_V2DFmode:
+ cmode = DFmode;
+ break;
+ case E_V2SFmode:
+ cmode = SFmode;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ if (!register_operand (ops[1], cmode))
+ ops[1] = force_reg (cmode, ops[1]);
+ if (!register_operand (ops[0], cmode))
+ ops[0] = force_reg (cmode, ops[0]);
+ emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
+ ops[1])));
+ break;
+
+ case 4:
+ switch (mode)
+ {
+ case E_V4DImode:
+ cmode = V2DImode;
+ break;
+ case E_V4DFmode:
+ cmode = V2DFmode;
+ break;
+ case E_V4SImode:
+ cmode = V2SImode;
+ break;
+ case E_V4SFmode:
+ cmode = V2SFmode;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ goto half;
+
+ case 8:
+ switch (mode)
+ {
+ case E_V8DImode:
+ cmode = V2DImode;
+ hmode = V4DImode;
+ break;
+ case E_V8DFmode:
+ cmode = V2DFmode;
+ hmode = V4DFmode;
+ break;
+ case E_V8SImode:
+ cmode = V2SImode;
+ hmode = V4SImode;
+ break;
+ case E_V8SFmode:
+ cmode = V2SFmode;
+ hmode = V4SFmode;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ goto half;
+
+ case 16:
+ switch (mode)
+ {
+ case E_V16SImode:
+ cmode = V2SImode;
+ hmode = V4SImode;
+ gmode = V8SImode;
+ break;
+ case E_V16SFmode:
+ cmode = V2SFmode;
+ hmode = V4SFmode;
+ gmode = V8SFmode;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ goto half;
+
+half:
+ /* FIXME: We process inputs backward to help RA. PR 36222. */
+ i = n - 1;
+ j = (n >> 1) - 1;
+ for (; i > 0; i -= 2, j--)
+ {
+ first[j] = gen_reg_rtx (cmode);
+ v = gen_rtvec (2, ops[i - 1], ops[i]);
+ ix86_expand_vector_init (false, first[j],
+ gen_rtx_PARALLEL (cmode, v));
+ }
+
+ n >>= 1;
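+ /* Keep pairing adjacent intermediate vectors until only two
+ remain, then concatenate those into TARGET. */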
+ if (n > 4)
+ {
+ gcc_assert (hmode != VOIDmode);
+ gcc_assert (gmode != VOIDmode);
+ for (i = j = 0; i < n; i += 2, j++)
+ {
+ second[j] = gen_reg_rtx (hmode);
+ ix86_expand_vector_init_concat (hmode, second [j],
+ &first [i], 2);
+ }
+ n >>= 1;
+ for (i = j = 0; i < n; i += 2, j++)
+ {
+ third[j] = gen_reg_rtx (gmode);
+ ix86_expand_vector_init_concat (gmode, third[j],
+ &second[i], 2);
+ }
+ n >>= 1;
+ ix86_expand_vector_init_concat (mode, target, third, n);
+ }
+ else if (n > 2)
+ {
+ gcc_assert (hmode != VOIDmode);
+ for (i = j = 0; i < n; i += 2, j++)
+ {
+ second[j] = gen_reg_rtx (hmode);
+ ix86_expand_vector_init_concat (hmode, second [j],
+ &first [i], 2);
+ }
+ n >>= 1;
+ ix86_expand_vector_init_concat (mode, target, second, n);
+ }
+ else
+ ix86_expand_vector_init_concat (mode, target, first, n);
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+}
+
+/* A subroutine of ix86_expand_vector_init_general. Use vector
+ interleave to handle the most general case: all values variable,
+ and none identical. */
+
+static void
+ix86_expand_vector_init_interleave (machine_mode mode,
+ rtx target, rtx *ops, int n)
+{
+ machine_mode first_imode, second_imode, third_imode, inner_mode;
+ int i, j;
+ rtx op0, op1;
+ rtx (*gen_load_even) (rtx, rtx, rtx);
+ rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
+ rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
+
+ switch (mode)
+ {
+ case E_V8HImode:
+ gen_load_even = gen_vec_setv8hi;
+ gen_interleave_first_low = gen_vec_interleave_lowv4si;
+ gen_interleave_second_low = gen_vec_interleave_lowv2di;
+ inner_mode = HImode;
+ first_imode = V4SImode;
+ second_imode = V2DImode;
+ third_imode = VOIDmode;
+ break;
+ case E_V16QImode:
+ gen_load_even = gen_vec_setv16qi;
+ gen_interleave_first_low = gen_vec_interleave_lowv8hi;
+ gen_interleave_second_low = gen_vec_interleave_lowv4si;
+ inner_mode = QImode;
+ first_imode = V8HImode;
+ second_imode = V4SImode;
+ third_imode = V2DImode;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ for (i = 0; i < n; i++)
+ {
+ /* Extend the odd element to SImode using a paradoxical SUBREG. */
+ op0 = gen_reg_rtx (SImode);
+ emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
+
+ /* Insert the SImode value as low element of V4SImode vector. */
+ op1 = gen_reg_rtx (V4SImode);
+ op0 = gen_rtx_VEC_MERGE (V4SImode,
+ gen_rtx_VEC_DUPLICATE (V4SImode,
+ op0),
+ CONST0_RTX (V4SImode),
+ const1_rtx);
+ emit_insn (gen_rtx_SET (op1, op0));
+
+ /* Cast the V4SImode vector back to a vector in the original mode. */
+ op0 = gen_reg_rtx (mode);
+ emit_move_insn (op0, gen_lowpart (mode, op1));
+
+ /* Load even elements into the second position. */
+ emit_insn (gen_load_even (op0,
+ force_reg (inner_mode,
+ ops [i + i + 1]),
+ const1_rtx));
+
+ /* Cast vector to FIRST_IMODE vector. */
+ ops[i] = gen_reg_rtx (first_imode);
+ emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
+ }
+
+ /* Interleave low FIRST_IMODE vectors. */
+ for (i = j = 0; i < n; i += 2, j++)
+ {
+ op0 = gen_reg_rtx (first_imode);
+ emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
+
+ /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
+ ops[j] = gen_reg_rtx (second_imode);
+ emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
+ }
+
+ /* Interleave low SECOND_IMODE vectors. */
+ switch (second_imode)
+ {
+ case E_V4SImode:
+ for (i = j = 0; i < n / 2; i += 2, j++)
+ {
+ op0 = gen_reg_rtx (second_imode);
+ emit_insn (gen_interleave_second_low (op0, ops[i],
+ ops[i + 1]));
+
+ /* Cast the SECOND_IMODE vector to the THIRD_IMODE
+ vector. */
+ ops[j] = gen_reg_rtx (third_imode);
+ emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
+ }
+ second_imode = V2DImode;
+ gen_interleave_second_low = gen_vec_interleave_lowv2di;
+ /* FALLTHRU */
+
+ case E_V2DImode:
+ op0 = gen_reg_rtx (second_imode);
+ emit_insn (gen_interleave_second_low (op0, ops[0],
+ ops[1]));
+
+ /* Cast the SECOND_IMODE vector back to a vector in the original
+ mode. */
+ emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+}
+
+/* A subroutine of ix86_expand_vector_init. Handle the most general case:
+ all values variable, and none identical. */
+
+static void
+ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
+ rtx target, rtx vals)
+{
+ rtx ops[64], op0, op1, op2, op3, op4, op5;
+ machine_mode half_mode = VOIDmode;
+ machine_mode quarter_mode = VOIDmode;
+ int n, i;
+
+ switch (mode)
+ {
+ case E_V2SFmode:
+ case E_V2SImode:
+ if (!mmx_ok && !TARGET_SSE)
+ break;
+ /* FALLTHRU */
+
+ case E_V16SImode:
+ case E_V16SFmode:
+ case E_V8DFmode:
+ case E_V8DImode:
+ case E_V8SFmode:
+ case E_V8SImode:
+ case E_V4DFmode:
+ case E_V4DImode:
+ case E_V4SFmode:
+ case E_V4SImode:
+ case E_V2DFmode:
+ case E_V2DImode:
+ n = GET_MODE_NUNITS (mode);
+ for (i = 0; i < n; i++)
+ ops[i] = XVECEXP (vals, 0, i);
+ ix86_expand_vector_init_concat (mode, target, ops, n);
+ return;
+
+ case E_V2TImode:
+ for (i = 0; i < 2; i++)
+ ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
+ op0 = gen_reg_rtx (V4DImode);
+ ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
+ emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
+ return;
+
+ case E_V4TImode:
+ for (i = 0; i < 4; i++)
+ ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
+ ops[4] = gen_reg_rtx (V4DImode);
+ ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
+ ops[5] = gen_reg_rtx (V4DImode);
+ ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
+ op0 = gen_reg_rtx (V8DImode);
+ ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
+ emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
+ return;
+
+ case E_V32QImode:
+ half_mode = V16QImode;
+ goto half;
+
+ case E_V16HImode:
+ half_mode = V8HImode;
+ goto half;
+
+half:
+ n = GET_MODE_NUNITS (mode);
+ for (i = 0; i < n; i++)
+ ops[i] = XVECEXP (vals, 0, i);
+ op0 = gen_reg_rtx (half_mode);
+ op1 = gen_reg_rtx (half_mode);
+ ix86_expand_vector_init_interleave (half_mode, op0, ops,
+ n >> 2);
+ ix86_expand_vector_init_interleave (half_mode, op1,
+ &ops [n >> 1], n >> 2);
+ emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
+ return;
+
+ case E_V64QImode:
+ quarter_mode = V16QImode;
+ half_mode = V32QImode;
+ goto quarter;
+
+ case E_V32HImode:
+ quarter_mode = V8HImode;
+ half_mode = V16HImode;
+ goto quarter;
+
+quarter:
+ n = GET_MODE_NUNITS (mode);
+ for (i = 0; i < n; i++)
+ ops[i] = XVECEXP (vals, 0, i);
+ op0 = gen_reg_rtx (quarter_mode);
+ op1 = gen_reg_rtx (quarter_mode);
+ op2 = gen_reg_rtx (quarter_mode);
+ op3 = gen_reg_rtx (quarter_mode);
+ op4 = gen_reg_rtx (half_mode);
+ op5 = gen_reg_rtx (half_mode);
+ ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
+ n >> 3);
+ ix86_expand_vector_init_interleave (quarter_mode, op1,
+ &ops [n >> 2], n >> 3);
+ ix86_expand_vector_init_interleave (quarter_mode, op2,
+ &ops [n >> 1], n >> 3);
+ ix86_expand_vector_init_interleave (quarter_mode, op3,
+ &ops [(n >> 1) | (n >> 2)], n >> 3);
+ emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
+ emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
+ emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
+ return;
+
+ case E_V16QImode:
+ if (!TARGET_SSE4_1)
+ break;
+ /* FALLTHRU */
+
+ case E_V8HImode:
+ if (!TARGET_SSE2)
+ break;
+
+ /* Don't use ix86_expand_vector_init_interleave if we can't
+ move from GPR to SSE register directly. */
+ if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
+ break;
+
+ n = GET_MODE_NUNITS (mode);
+ for (i = 0; i < n; i++)
+ ops[i] = XVECEXP (vals, 0, i);
+ ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
+ return;
+
+ case E_V4HImode:
+ case E_V8QImode:
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ {
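+ /* Fallback: build each word-sized chunk of the vector in an
+ integer register by shifting and IORing the elements together,
+ then move the words into the vector. */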
+ int i, j, n_elts, n_words, n_elt_per_word;
+ machine_mode inner_mode;
+ rtx words[4], shift;
+
+ inner_mode = GET_MODE_INNER (mode);
+ n_elts = GET_MODE_NUNITS (mode);
+ n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
+ n_elt_per_word = n_elts / n_words;
+ shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
+
+ for (i = 0; i < n_words; ++i)
+ {
+ rtx word = NULL_RTX;
+
+ for (j = 0; j < n_elt_per_word; ++j)
+ {
+ rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
+ elt = convert_modes (word_mode, inner_mode, elt, true);
+
+ if (j == 0)
+ word = elt;
+ else
+ {
+ word = expand_simple_binop (word_mode, ASHIFT, word, shift,
+ word, 1, OPTAB_LIB_WIDEN);
+ word = expand_simple_binop (word_mode, IOR, word, elt,
+ word, 1, OPTAB_LIB_WIDEN);
+ }
+ }
+
+ words[i] = word;
+ }
+
+ if (n_words == 1)
+ emit_move_insn (target, gen_lowpart (mode, words[0]));
+ else if (n_words == 2)
+ {
+ rtx tmp = gen_reg_rtx (mode);
+ emit_clobber (tmp);
+ emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
+ emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
+ emit_move_insn (target, tmp);
+ }
+ else if (n_words == 4)
+ {
+ rtx tmp = gen_reg_rtx (V4SImode);
+ gcc_assert (word_mode == SImode);
+ vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
+ ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
+ emit_move_insn (target, gen_lowpart (mode, tmp));
+ }
+ else
+ gcc_unreachable ();
+ }
+}
+
+/* Initialize vector TARGET via VALS. Suppress the use of MMX
+ instructions unless MMX_OK is true. */
+
+void
+ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
+{
+ machine_mode mode = GET_MODE (target);
+ machine_mode inner_mode = GET_MODE_INNER (mode);
+ int n_elts = GET_MODE_NUNITS (mode);
+ int n_var = 0, one_var = -1;
+ bool all_same = true, all_const_zero = true;
+ int i;
+ rtx x;
+
+ /* First, handle initialization from elements that are themselves vectors. */
+ if (n_elts != XVECLEN (vals, 0))
+ {
+ rtx subtarget = target;
+ x = XVECEXP (vals, 0, 0);
+ gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
+ if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
+ {
+ rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
+ if (inner_mode == QImode || inner_mode == HImode)
+ {
+ unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
+ mode = mode_for_vector (SImode, n_bits / 4).require ();
+ inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
+ ops[0] = gen_lowpart (inner_mode, ops[0]);
+ ops[1] = gen_lowpart (inner_mode, ops[1]);
+ subtarget = gen_reg_rtx (mode);
+ }
+ ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
+ if (subtarget != target)
+ emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
+ return;
+ }
+ gcc_unreachable ();
+ }
+
+ for (i = 0; i < n_elts; ++i)
+ {
+ x = XVECEXP (vals, 0, i);
+ if (!(CONST_SCALAR_INT_P (x)
+ || CONST_DOUBLE_P (x)
+ || CONST_FIXED_P (x)))
+ n_var++, one_var = i;
+ else if (x != CONST0_RTX (inner_mode))
+ all_const_zero = false;
+ if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
+ all_same = false;
+ }
+
+ /* Constants are best loaded from the constant pool. */
+ if (n_var == 0)
+ {
+ emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
+ return;
+ }
+
+ /* If all values are identical, broadcast the value. */
+ if (all_same
+ && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
+ XVECEXP (vals, 0, 0)))
+ return;
+
+ /* Values where only one field is non-constant are best loaded from
+ the pool and overwritten via move later. */
+ if (n_var == 1)
+ {
+ if (all_const_zero
+ && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
+ XVECEXP (vals, 0, one_var),
+ one_var))
+ return;
+
+ if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
+ return;
+ }
+
+ ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
+}
+
+void
+ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
+{
+ machine_mode mode = GET_MODE (target);
+ machine_mode inner_mode = GET_MODE_INNER (mode);
+ machine_mode half_mode;
+ bool use_vec_merge = false;
+ rtx tmp;
+ static rtx (*gen_extract[6][2]) (rtx, rtx)
+ = {
+ { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
+ { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
+ { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
+ { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
+ { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
+ { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
+ };
+ static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
+ = {
+ { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
+ { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
+ { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
+ { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
+ { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
+ { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
+ };
+ int i, j, n;
+ machine_mode mmode = VOIDmode;
+ rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
+
+ switch (mode)
+ {
+ case E_V2SFmode:
+ case E_V2SImode:
+ if (mmx_ok)
+ {
+ tmp = gen_reg_rtx (GET_MODE_INNER (mode));
+ ix86_expand_vector_extract (true, tmp, target, 1 - elt);
+ if (elt == 0)
+ tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
+ else
+ tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
+ emit_insn (gen_rtx_SET (target, tmp));
+ return;
+ }
+ break;
+
+ case E_V2DImode:
+ use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
+ if (use_vec_merge)
+ break;
+
+ tmp = gen_reg_rtx (GET_MODE_INNER (mode));
+ ix86_expand_vector_extract (false, tmp, target, 1 - elt);
+ if (elt == 0)
+ tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
+ else
+ tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
+ emit_insn (gen_rtx_SET (target, tmp));
+ return;
+
+ case E_V2DFmode:
+ {
+ rtx op0, op1;
+
+ /* For the two element vectors, we implement a VEC_CONCAT with
+ the extraction of the other element. */
+
+ tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
+ tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
+
+ if (elt == 0)
+ op0 = val, op1 = tmp;
+ else
+ op0 = tmp, op1 = val;
+
+ tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
+ emit_insn (gen_rtx_SET (target, tmp));
+ }
+ return;
+
+ case E_V4SFmode:
+ use_vec_merge = TARGET_SSE4_1;
+ if (use_vec_merge)
+ break;
+
+ switch (elt)
+ {
+ case 0:
+ use_vec_merge = true;
+ break;
+
+ case 1:
+ /* tmp = target = A B C D */
+ tmp = copy_to_reg (target);
+ /* target = A A B B */
+ emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
+ /* target = X A B B */
+ ix86_expand_vector_set (false, target, val, 0);
+ /* target = A X C D */
+ emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
+ const1_rtx, const0_rtx,
+ GEN_INT (2+4), GEN_INT (3+4)));
+ return;
+
+ case 2:
+ /* tmp = target = A B C D */
+ tmp = copy_to_reg (target);
+ /* tmp = X B C D */
+ ix86_expand_vector_set (false, tmp, val, 0);
+ /* target = A B X D */
+ emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
+ const0_rtx, const1_rtx,
+ GEN_INT (0+4), GEN_INT (3+4)));
+ return;
+
+ case 3:
+ /* tmp = target = A B C D */
+ tmp = copy_to_reg (target);
+ /* tmp = X B C D */
+ ix86_expand_vector_set (false, tmp, val, 0);
+ /* target = A B C X */
+ emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
+ const0_rtx, const1_rtx,
+ GEN_INT (2+4), GEN_INT (0+4)));
+ return;
+
+ default:
+ gcc_unreachable ();
+ }
+ break;
+
+ case E_V4SImode:
+ use_vec_merge = TARGET_SSE4_1;
+ if (use_vec_merge)
+ break;
+
+ /* Element 0 handled by vec_merge below. */
+ if (elt == 0)
+ {
+ use_vec_merge = true;
+ break;
+ }
+
+ if (TARGET_SSE2)
+ {
+ /* With SSE2, use integer shuffles to swap element 0 and ELT,
+ store into element 0, then shuffle them back. */
+
+ rtx order[4];
+
+ order[0] = GEN_INT (elt);
+ order[1] = const1_rtx;
+ order[2] = const2_rtx;
+ order[3] = GEN_INT (3);
+ order[elt] = const0_rtx;
+
+ emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
+ order[1], order[2], order[3]));
+
+ ix86_expand_vector_set (false, target, val, 0);
+
+ emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
+ order[1], order[2], order[3]));
+ }
+ else
+ {
+ /* For SSE1, we have to reuse the V4SF code. */
+ rtx t = gen_reg_rtx (V4SFmode);
+ emit_move_insn (t, gen_lowpart (V4SFmode, target));
+ ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
+ emit_move_insn (target, gen_lowpart (mode, t));
+ }
+ return;
+
+ case E_V8HImode:
+ use_vec_merge = TARGET_SSE2;
+ break;
+ case E_V4HImode:
+ use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
+ break;
+
+ case E_V16QImode:
+ use_vec_merge = TARGET_SSE4_1;
+ break;
+
+ case E_V8QImode:
+ break;
+
+ case E_V32QImode:
+ half_mode = V16QImode;
+ j = 0;
+ n = 16;
+ goto half;
+
+ case E_V16HImode:
+ half_mode = V8HImode;
+ j = 1;
+ n = 8;
+ goto half;
+
+ case E_V8SImode:
+ half_mode = V4SImode;
+ j = 2;
+ n = 4;
+ goto half;
+
+ case E_V4DImode:
+ half_mode = V2DImode;
+ j = 3;
+ n = 2;
+ goto half;
+
+ case E_V8SFmode:
+ half_mode = V4SFmode;
+ j = 4;
+ n = 4;
+ goto half;
+
+ case E_V4DFmode:
+ half_mode = V2DFmode;
+ j = 5;
+ n = 2;
+ goto half;
+
+half:
+ /* Compute offset. */
+ i = elt / n;
+ elt %= n;
+
+ gcc_assert (i <= 1);
+
+ /* Extract the half. */
+ tmp = gen_reg_rtx (half_mode);
+ emit_insn (gen_extract[j][i] (tmp, target));
+
+ /* Put val in tmp at elt. */
+ ix86_expand_vector_set (false, tmp, val, elt);
+
+ /* Put it back. */
+ emit_insn (gen_insert[j][i] (target, target, tmp));
+ return;
+
+ case E_V8DFmode:
+ if (TARGET_AVX512F)
+ {
+ mmode = QImode;
+ gen_blendm = gen_avx512f_blendmv8df;
+ }
+ break;
+
+ case E_V8DImode:
+ if (TARGET_AVX512F)
+ {
+ mmode = QImode;
+ gen_blendm = gen_avx512f_blendmv8di;
+ }
+ break;
+
+ case E_V16SFmode:
+ if (TARGET_AVX512F)
+ {
+ mmode = HImode;
+ gen_blendm = gen_avx512f_blendmv16sf;
+ }
+ break;
+
+ case E_V16SImode:
+ if (TARGET_AVX512F)
+ {
+ mmode = HImode;
+ gen_blendm = gen_avx512f_blendmv16si;
+ }
+ break;
+
+ case E_V32HImode:
+ if (TARGET_AVX512BW)
+ {
+ mmode = SImode;
+ gen_blendm = gen_avx512bw_blendmv32hi;
+ }
+ else if (TARGET_AVX512F)
+ {
+ half_mode = E_V8HImode;
+ n = 8;
+ goto quarter;
+ }
+ break;
+
+ case E_V64QImode:
+ if (TARGET_AVX512BW)
+ {
+ mmode = DImode;
+ gen_blendm = gen_avx512bw_blendmv64qi;
+ }
+ else if (TARGET_AVX512F)
+ {
+ half_mode = E_V16QImode;
+ n = 16;
+ goto quarter;
+ }
+ break;
+
+quarter:
+ /* Compute offset. */
+ i = elt / n;
+ elt %= n;
+
+ gcc_assert (i <= 3);
+
+ {
+ /* Extract the quarter. */
+ tmp = gen_reg_rtx (V4SImode);
+ rtx tmp2 = gen_lowpart (V16SImode, target);
+ rtx mask = gen_reg_rtx (QImode);
+
+ emit_move_insn (mask, constm1_rtx);
+ emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
+ tmp, mask));
+
+ tmp2 = gen_reg_rtx (half_mode);
+ emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
+ tmp = tmp2;
+
+ /* Put val in tmp at elt. */
+ ix86_expand_vector_set (false, tmp, val, elt);
+
+ /* Put it back. */
+ tmp2 = gen_reg_rtx (V16SImode);
+ rtx tmp3 = gen_lowpart (V16SImode, target);
+ mask = gen_reg_rtx (HImode);
+ emit_move_insn (mask, constm1_rtx);
+ tmp = gen_lowpart (V4SImode, tmp);
+ emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
+ tmp3, mask));
+ emit_move_insn (target, gen_lowpart (mode, tmp2));
+ }
+ return;
+
+ default:
+ break;
+ }
+
+ if (mmode != VOIDmode)
+ {
+ tmp = gen_reg_rtx (mode);
+ emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
+ /* The avx512*_blendm<mode> expanders have a different operand order
+ from VEC_MERGE. In VEC_MERGE, the first input operand is used for
+ elements where the mask is set and the second input operand
+ otherwise; in {sse,avx}*_*blend* the first input operand is used
+ for elements where the mask is clear and the second input operand
+ otherwise. */
+ emit_insn (gen_blendm (target, target, tmp,
+ force_reg (mmode,
+ gen_int_mode (HOST_WIDE_INT_1U << elt,
+ mmode))));
+ }
+ else if (use_vec_merge)
+ {
+ tmp = gen_rtx_VEC_DUPLICATE (mode, val);
+ tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
+ GEN_INT (HOST_WIDE_INT_1U << elt));
+ emit_insn (gen_rtx_SET (target, tmp));
+ }
+ else
+ {
+ rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
+
+ emit_move_insn (mem, target);
+
+ tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
+ emit_move_insn (tmp, val);
+
+ emit_move_insn (target, mem);
+ }
+}
+
+void
+ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
+{
+ machine_mode mode = GET_MODE (vec);
+ machine_mode inner_mode = GET_MODE_INNER (mode);
+ bool use_vec_extr = false;
+ rtx tmp;
+
+ switch (mode)
+ {
+ case E_V2SImode:
+ case E_V2SFmode:
+ if (!mmx_ok)
+ break;
+ /* FALLTHRU */
+
+ case E_V2DFmode:
+ case E_V2DImode:
+ case E_V2TImode:
+ case E_V4TImode:
+ use_vec_extr = true;
+ break;
+
+ case E_V4SFmode:
+ use_vec_extr = TARGET_SSE4_1;
+ if (use_vec_extr)
+ break;
+
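+ /* Without SSE4.1, shuffle the desired element into position 0
+ and extract it from there. */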
+ switch (elt)
+ {
+ case 0:
+ tmp = vec;
+ break;
+
+ case 1:
+ case 3:
+ tmp = gen_reg_rtx (mode);
+ emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
+ GEN_INT (elt), GEN_INT (elt),
+ GEN_INT (elt+4), GEN_INT (elt+4)));
+ break;
+
+ case 2:
+ tmp = gen_reg_rtx (mode);
+ emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+ vec = tmp;
+ use_vec_extr = true;
+ elt = 0;
+ break;
+
+ case E_V4SImode:
+ use_vec_extr = TARGET_SSE4_1;
+ if (use_vec_extr)
+ break;
+
+ if (TARGET_SSE2)
+ {
+ switch (elt)
+ {
+ case 0:
+ tmp = vec;
+ break;
+
+ case 1:
+ case 3:
+ tmp = gen_reg_rtx (mode);
+ emit_insn (gen_sse2_pshufd_1 (tmp, vec,
+ GEN_INT (elt), GEN_INT (elt),
+ GEN_INT (elt), GEN_INT (elt)));
+ break;
+
+ case 2:
+ tmp = gen_reg_rtx (mode);
+ emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+ vec = tmp;
+ use_vec_extr = true;
+ elt = 0;
+ }
+ else
+ {
+ /* For SSE1, we have to reuse the V4SF code. */
+ ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
+ gen_lowpart (V4SFmode, vec), elt);
+ return;
+ }
+ break;
+
+ case E_V8HImode:
+ use_vec_extr = TARGET_SSE2;
+ break;
+ case E_V4HImode:
+ use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
+ break;
+
+ case E_V16QImode:
+ use_vec_extr = TARGET_SSE4_1;
+ break;
+
+ case E_V8SFmode:
+ if (TARGET_AVX)
+ {
+ tmp = gen_reg_rtx (V4SFmode);
+ if (elt < 4)
+ emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
+ else
+ emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
+ ix86_expand_vector_extract (false, target, tmp, elt & 3);
+ return;
+ }
+ break;
+
+ case E_V4DFmode:
+ if (TARGET_AVX)
+ {
+ tmp = gen_reg_rtx (V2DFmode);
+ if (elt < 2)
+ emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
+ else
+ emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
+ ix86_expand_vector_extract (false, target, tmp, elt & 1);
+ return;
+ }
+ break;
+
+ case E_V32QImode:
+ if (TARGET_AVX)
+ {
+ tmp = gen_reg_rtx (V16QImode);
+ if (elt < 16)
+ emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
+ else
+ emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
+ ix86_expand_vector_extract (false, target, tmp, elt & 15);
+ return;
+ }
+ break;
+
+ case E_V16HImode:
+ if (TARGET_AVX)
+ {
+ tmp = gen_reg_rtx (V8HImode);
+ if (elt < 8)
+ emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
+ else
+ emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
+ ix86_expand_vector_extract (false, target, tmp, elt & 7);
+ return;
+ }
+ break;
+
+ case E_V8SImode:
+ if (TARGET_AVX)
+ {
+ tmp = gen_reg_rtx (V4SImode);
+ if (elt < 4)
+ emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
+ else
+ emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
+ ix86_expand_vector_extract (false, target, tmp, elt & 3);
+ return;
+ }
+ break;
+
+ case E_V4DImode:
+ if (TARGET_AVX)
+ {
+ tmp = gen_reg_rtx (V2DImode);
+ if (elt < 2)
+ emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
+ else
+ emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
+ ix86_expand_vector_extract (false, target, tmp, elt & 1);
+ return;
+ }
+ break;
+
+ case E_V32HImode:
+ if (TARGET_AVX512BW)
+ {
+ tmp = gen_reg_rtx (V16HImode);
+ if (elt < 16)
+ emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
+ else
+ emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
+ ix86_expand_vector_extract (false, target, tmp, elt & 15);
+ return;
+ }
+ break;
+
+ case E_V64QImode:
+ if (TARGET_AVX512BW)
+ {
+ tmp = gen_reg_rtx (V32QImode);
+ if (elt < 32)
+ emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
+ else
+ emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
+ ix86_expand_vector_extract (false, target, tmp, elt & 31);
+ return;
+ }
+ break;
+
+ case E_V16SFmode:
+ tmp = gen_reg_rtx (V8SFmode);
+ if (elt < 8)
+ emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
+ else
+ emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
+ ix86_expand_vector_extract (false, target, tmp, elt & 7);
+ return;
+
+ case E_V8DFmode:
+ tmp = gen_reg_rtx (V4DFmode);
+ if (elt < 4)
+ emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
+ else
+ emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
+ ix86_expand_vector_extract (false, target, tmp, elt & 3);
+ return;
+
+ case E_V16SImode:
+ tmp = gen_reg_rtx (V8SImode);
+ if (elt < 8)
+ emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
+ else
+ emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
+ ix86_expand_vector_extract (false, target, tmp, elt & 7);
+ return;
+
+ case E_V8DImode:
+ tmp = gen_reg_rtx (V4DImode);
+ if (elt < 4)
+ emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
+ else
+ emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
+ ix86_expand_vector_extract (false, target, tmp, elt & 3);
+ return;
+
+ case E_V8QImode:
+ /* ??? Could extract the appropriate HImode element and shift. */
+ default:
+ break;
+ }
+
+ if (use_vec_extr)
+ {
+ tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
+ tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
+
+ /* Let the rtl optimizers know about the zero extension performed. */
+ if (inner_mode == QImode || inner_mode == HImode)
+ {
+ tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
+ target = gen_lowpart (SImode, target);
+ }
+
+ emit_insn (gen_rtx_SET (target, tmp));
+ }
+ else
+ {
+ rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
+
+ emit_move_insn (mem, vec);
+
+ tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
+ emit_move_insn (target, tmp);
+ }
+}
+
+/* Generate code to copy vector bits I / 2 ... I - 1 from vector SRC
+ to bits 0 ... I / 2 - 1 of vector DEST, which has the same mode.
+ The upper bits of DEST are undefined, though they shouldn't cause
+ exceptions (some bits from SRC or all zeros are OK). */
+
+static void
+emit_reduc_half (rtx dest, rtx src, int i)
+{
+ rtx tem, d = dest;
+ switch (GET_MODE (src))
+ {
+ case E_V4SFmode:
+ if (i == 128)
+ tem = gen_sse_movhlps (dest, src, src);
+ else
+ tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
+ GEN_INT (1 + 4), GEN_INT (1 + 4));
+ break;
+ case E_V2DFmode:
+ tem = gen_vec_interleave_highv2df (dest, src, src);
+ break;
+ case E_V16QImode:
+ case E_V8HImode:
+ case E_V4SImode:
+ case E_V2DImode:
+ d = gen_reg_rtx (V1TImode);
+ tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
+ GEN_INT (i / 2));
+ break;
+ case E_V8SFmode:
+ if (i == 256)
+ tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
+ else
+ tem = gen_avx_shufps256 (dest, src, src,
+ GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
+ break;
+ case E_V4DFmode:
+ if (i == 256)
+ tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
+ else
+ tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
+ break;
+ case E_V32QImode:
+ case E_V16HImode:
+ case E_V8SImode:
+ case E_V4DImode:
+ if (i == 256)
+ {
+ if (GET_MODE (dest) != V4DImode)
+ d = gen_reg_rtx (V4DImode);
+ tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
+ gen_lowpart (V4DImode, src),
+ const1_rtx);
+ }
+ else
+ {
+ d = gen_reg_rtx (V2TImode);
+ tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
+ GEN_INT (i / 2));
+ }
+ break;
+ case E_V64QImode:
+ case E_V32HImode:
+ case E_V16SImode:
+ case E_V16SFmode:
+ case E_V8DImode:
+ case E_V8DFmode:
+ if (i > 128)
+ tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
+ gen_lowpart (V16SImode, src),
+ gen_lowpart (V16SImode, src),
+ GEN_INT (0x4 + (i == 512 ? 4 : 0)),
+ GEN_INT (0x5 + (i == 512 ? 4 : 0)),
+ GEN_INT (0x6 + (i == 512 ? 4 : 0)),
+ GEN_INT (0x7 + (i == 512 ? 4 : 0)),
+ GEN_INT (0xC), GEN_INT (0xD),
+ GEN_INT (0xE), GEN_INT (0xF),
+ GEN_INT (0x10), GEN_INT (0x11),
+ GEN_INT (0x12), GEN_INT (0x13),
+ GEN_INT (0x14), GEN_INT (0x15),
+ GEN_INT (0x16), GEN_INT (0x17));
+ else
+ tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
+ gen_lowpart (V16SImode, src),
+ GEN_INT (i == 128 ? 0x2 : 0x1),
+ GEN_INT (0x3),
+ GEN_INT (0x3),
+ GEN_INT (0x3),
+ GEN_INT (i == 128 ? 0x6 : 0x5),
+ GEN_INT (0x7),
+ GEN_INT (0x7),
+ GEN_INT (0x7),
+ GEN_INT (i == 128 ? 0xA : 0x9),
+ GEN_INT (0xB),
+ GEN_INT (0xB),
+ GEN_INT (0xB),
+ GEN_INT (i == 128 ? 0xE : 0xD),
+ GEN_INT (0xF),
+ GEN_INT (0xF),
+ GEN_INT (0xF));
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ emit_insn (tem);
+ if (d != dest)
+ emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
+}
+
+/* Expand a vector reduction. FN is the binary pattern to reduce;
+ DEST is the destination; IN is the input vector. */
+
+void
+ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
+{
+ rtx half, dst, vec = in;
+ machine_mode mode = GET_MODE (in);
+ int i;
+
+ /* SSE4.1 has a special instruction (phminposuw) for V8HImode UMIN reduction. */
+ if (TARGET_SSE4_1
+ && mode == V8HImode
+ && fn == gen_uminv8hi3)
+ {
+ emit_insn (gen_sse4_1_phminposuw (dest, in));
+ return;
+ }
+
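+ /* Otherwise repeatedly fold the upper half of the vector onto the
+ lower half and combine the halves with FN, until a single
+ element remains in the low position. */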
+ for (i = GET_MODE_BITSIZE (mode);
+ i > GET_MODE_UNIT_BITSIZE (mode);
+ i >>= 1)
+ {
+ half = gen_reg_rtx (mode);
+ emit_reduc_half (half, vec, i);
+ if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
+ dst = dest;
+ else
+ dst = gen_reg_rtx (mode);
+ emit_insn (fn (dst, half, vec));
+ vec = dst;
+ }
+}
+
+/* Output code to perform a conditional jump to LABEL, if C2 flag in
+ FP status register is set. */
+
+void
+ix86_emit_fp_unordered_jump (rtx label)
+{
+ rtx reg = gen_reg_rtx (HImode);
+ rtx_insn *insn;
+ rtx temp;
+
+ emit_insn (gen_x86_fnstsw_1 (reg));
+
+ if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
+ {
+ emit_insn (gen_x86_sahf_1 (reg));
+
+ temp = gen_rtx_REG (CCmode, FLAGS_REG);
+ temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
+ }
+ else
+ {
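+ /* Without SAHF, test the C2 bit (mask 0x04 in the upper byte of
+ the FP status word) directly. */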
+ emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
+
+ temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
+ temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
+ }
+
+ temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
+ gen_rtx_LABEL_REF (VOIDmode, label),
+ pc_rtx);
+ insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
+ predict_jump (REG_BR_PROB_BASE * 10 / 100);
+ JUMP_LABEL (insn) = label;
+}
+
+/* Output code to perform a sinh XFmode calculation. */
+
+void ix86_emit_i387_sinh (rtx op0, rtx op1)
+{
+ rtx e1 = gen_reg_rtx (XFmode);
+ rtx e2 = gen_reg_rtx (XFmode);
+ rtx scratch = gen_reg_rtx (HImode);
+ rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
+ rtx half = const_double_from_real_value (dconsthalf, XFmode);
+ rtx cst1, tmp;
+ rtx_code_label *jump_label = gen_label_rtx ();
+ rtx_insn *insn;
+
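+ /* sinh(|op1|) = 0.5 * (expm1(|op1|) / (expm1(|op1|) + 1.0)
+ + expm1(|op1|)); expm1 avoids cancellation for small
+ arguments. The sign is patched up below using the sign bit
+ from fxam. */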
+ /* scratch = fxam (op1) */
+ emit_insn (gen_fxamxf2_i387 (scratch, op1));
+
+ /* e1 = expm1 (|op1|) */
+ emit_insn (gen_absxf2 (e2, op1));
+ emit_insn (gen_expm1xf2 (e1, e2));
+
+ /* e2 = e1 / (e1 + 1.0) + e1 */
+ cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
+ emit_insn (gen_addxf3 (e2, e1, cst1));
+ emit_insn (gen_divxf3 (e2, e1, e2));
+ emit_insn (gen_addxf3 (e2, e2, e1));
+
+ /* flags = signbit (op1) */
+ emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
+
+ /* if (flags) then e2 = -e2 */
+ tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
+ gen_rtx_EQ (VOIDmode, flags, const0_rtx),
+ gen_rtx_LABEL_REF (VOIDmode, jump_label),
+ pc_rtx);
+ insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
+ predict_jump (REG_BR_PROB_BASE * 50 / 100);
+ JUMP_LABEL (insn) = jump_label;
+
+ emit_insn (gen_negxf2 (e2, e2));
+
+ emit_label (jump_label);
+ LABEL_NUSES (jump_label) = 1;
+
+ /* op0 = 0.5 * e2 */
+ half = force_reg (XFmode, half);
+ emit_insn (gen_mulxf3 (op0, e2, half));
+}
+
+/* Output code to perform a cosh XFmode calculation. */
+
+void ix86_emit_i387_cosh (rtx op0, rtx op1)
+{
+ rtx e1 = gen_reg_rtx (XFmode);
+ rtx e2 = gen_reg_rtx (XFmode);
+ rtx half = const_double_from_real_value (dconsthalf, XFmode);
+ rtx cst1;
+
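+ /* cosh(op1) = 0.5 * (exp(op1) + 1.0 / exp(op1)). */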
+ /* e1 = exp (op1) */
+ emit_insn (gen_expxf2 (e1, op1));
+
+ /* e2 = e1 + 1.0 / e1 */
+ cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
+ emit_insn (gen_divxf3 (e2, cst1, e1));
+ emit_insn (gen_addxf3 (e2, e1, e2));
+
+ /* op0 = 0.5 * e2 */
+ half = force_reg (XFmode, half);
+ emit_insn (gen_mulxf3 (op0, e2, half));
+}
+
+/* Output code to perform a tanh XFmode calculation. */
+
+void ix86_emit_i387_tanh (rtx op0, rtx op1)
+{
+ rtx e1 = gen_reg_rtx (XFmode);
+ rtx e2 = gen_reg_rtx (XFmode);
+ rtx scratch = gen_reg_rtx (HImode);
+ rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
+ rtx cst2, tmp;
+ rtx_code_label *jump_label = gen_label_rtx ();
+ rtx_insn *insn;
+
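+ /* tanh(|op1|) = -expm1(-2*|op1|) / (expm1(-2*|op1|) + 2.0);
+ the sign is patched up below using the sign bit from fxam. */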
+ /* scratch = fxam (op1) */
+ emit_insn (gen_fxamxf2_i387 (scratch, op1));
+
+ /* e1 = expm1 (-|2 * op1|) */
+ emit_insn (gen_addxf3 (e2, op1, op1));
+ emit_insn (gen_absxf2 (e2, e2));
+ emit_insn (gen_negxf2 (e2, e2));
+ emit_insn (gen_expm1xf2 (e1, e2));
+
+ /* e2 = e1 / (e1 + 2.0) */
+ cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
+ emit_insn (gen_addxf3 (e2, e1, cst2));
+ emit_insn (gen_divxf3 (e2, e1, e2));
+
+ /* flags = signbit (op1) */
+ emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
+
+ /* if (!flags) then e2 = -e2 */
+ tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
+ gen_rtx_NE (VOIDmode, flags, const0_rtx),
+ gen_rtx_LABEL_REF (VOIDmode, jump_label),
+ pc_rtx);
+ insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
+ predict_jump (REG_BR_PROB_BASE * 50 / 100);
+ JUMP_LABEL (insn) = jump_label;
+
+ emit_insn (gen_negxf2 (e2, e2));
+
+ emit_label (jump_label);
+ LABEL_NUSES (jump_label) = 1;
+
+ emit_move_insn (op0, e2);
+}
+
+/* Output code to perform an asinh XFmode calculation. */
+
+void ix86_emit_i387_asinh (rtx op0, rtx op1)
+{
+ rtx e1 = gen_reg_rtx (XFmode);
+ rtx e2 = gen_reg_rtx (XFmode);
+ rtx scratch = gen_reg_rtx (HImode);
+ rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
+ rtx cst1, tmp;
+ rtx_code_label *jump_label = gen_label_rtx ();
+ rtx_insn *insn;
+
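+ /* asinh(|op1|) = log1p (|op1| + op1*op1 / (sqrt (op1*op1 + 1.0)
+ + 1.0)); the sign is patched up below using the sign bit
+ from fxam. */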
+ /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
+ emit_insn (gen_mulxf3 (e1, op1, op1));
+ cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
+ emit_insn (gen_addxf3 (e2, e1, cst1));
+ emit_insn (gen_sqrtxf2 (e2, e2));
+ emit_insn (gen_addxf3 (e2, e2, cst1));
+
+ /* e1 = e1 / e2 */
+ emit_insn (gen_divxf3 (e1, e1, e2));
+
+ /* scratch = fxam (op1) */
+ emit_insn (gen_fxamxf2_i387 (scratch, op1));
+
+ /* e1 = e1 + |op1| */
+ emit_insn (gen_absxf2 (e2, op1));
+ emit_insn (gen_addxf3 (e1, e1, e2));
+
+ /* e2 = log1p (e1) */
+ ix86_emit_i387_log1p (e2, e1);
+
+ /* flags = signbit (op1) */
+ emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
+
+ /* if (flags) then e2 = -e2 */
+ tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
+ gen_rtx_EQ (VOIDmode, flags, const0_rtx),
+ gen_rtx_LABEL_REF (VOIDmode, jump_label),
+ pc_rtx);
+ insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
+ predict_jump (REG_BR_PROB_BASE * 50 / 100);
+ JUMP_LABEL (insn) = jump_label;
+
+ emit_insn (gen_negxf2 (e2, e2));
+
+ emit_label (jump_label);
+ LABEL_NUSES (jump_label) = 1;
+
+ emit_move_insn (op0, e2);
+}
+
+/* Output code to perform an acosh XFmode calculation. */
+
+void ix86_emit_i387_acosh (rtx op0, rtx op1)
+{
+ rtx e1 = gen_reg_rtx (XFmode);
+ rtx e2 = gen_reg_rtx (XFmode);
+ rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
+
+ /* e2 = sqrt (op1 + 1.0) */
+ emit_insn (gen_addxf3 (e2, op1, cst1));
+ emit_insn (gen_sqrtxf2 (e2, e2));
+
+ /* e1 = sqrt (op1 - 1.0) */
+ emit_insn (gen_subxf3 (e1, op1, cst1));
+ emit_insn (gen_sqrtxf2 (e1, e1));
+
+ /* e1 = e1 * e2 */
+ emit_insn (gen_mulxf3 (e1, e1, e2));
+
+ /* e1 = e1 + op1 */
+ emit_insn (gen_addxf3 (e1, e1, op1));
+
+ /* op0 = log (e1) */
+ emit_insn (gen_logxf2 (op0, e1));
+}
+
+/* Output code to perform an atanh XFmode calculation. */
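+/* Editorial note: this uses, for x >= 0,
+   atanh (x) = -0.5 * log1p (-2*x / (x + 1.0)),
+   since log1p (-2*x / (x + 1.0)) = log ((1 - x) / (1 + x)); the sign is
+   restored at the end, since atanh is odd. */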
+
+void
+ix86_emit_i387_atanh (rtx op0, rtx op1)
+{
+ rtx e1 = gen_reg_rtx (XFmode);
+ rtx e2 = gen_reg_rtx (XFmode);
+ rtx scratch = gen_reg_rtx (HImode);
+ rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
+ rtx half = const_double_from_real_value (dconsthalf, XFmode);
+ rtx cst1, tmp;
+ rtx_code_label *jump_label = gen_label_rtx ();
+ rtx_insn *insn;
+
+ /* scratch = fxam (op1) */
+ emit_insn (gen_fxamxf2_i387 (scratch, op1));
+
+ /* e2 = |op1| */
+ emit_insn (gen_absxf2 (e2, op1));
+
+ /* e1 = -(e2 + e2) / (e2 + 1.0) */
+ cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
+ emit_insn (gen_addxf3 (e1, e2, cst1));
+ emit_insn (gen_addxf3 (e2, e2, e2));
+ emit_insn (gen_negxf2 (e2, e2));
+ emit_insn (gen_divxf3 (e1, e2, e1));
+
+ /* e2 = log1p (e1) */
+ ix86_emit_i387_log1p (e2, e1);
+
+ /* flags = signbit (op1) */
+ emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
+
+ /* if (!flags) then e2 = -e2 */
+ tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
+ gen_rtx_NE (VOIDmode, flags, const0_rtx),
+ gen_rtx_LABEL_REF (VOIDmode, jump_label),
+ pc_rtx);
+ insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
+ predict_jump (REG_BR_PROB_BASE * 50 / 100);
+ JUMP_LABEL (insn) = jump_label;
+
+ emit_insn (gen_negxf2 (e2, e2));
+
+ emit_label (jump_label);
+ LABEL_NUSES (jump_label) = 1;
+
+ /* op0 = 0.5 * e2 */
+ half = force_reg (XFmode, half);
+ emit_insn (gen_mulxf3 (op0, e2, half));
+}
+
+/* Output code to perform a log1p XFmode calculation. */
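+/* Editorial note: fyl2xp1 computes y * log2 (x + 1.0) but is documented
+   only for |x| below about 1 - sqrt (2)/2 ~= 0.2929; the constant below
+   is exactly that bound. Larger arguments fall back to fyl2x on x + 1.0,
+   and the fldln2 constant (ln 2) converts the base-2 logarithm into a
+   natural logarithm in both cases. */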
+
+void
+ix86_emit_i387_log1p (rtx op0, rtx op1)
+{
+ rtx_code_label *label1 = gen_label_rtx ();
+ rtx_code_label *label2 = gen_label_rtx ();
+
+ rtx tmp = gen_reg_rtx (XFmode);
+ rtx res = gen_reg_rtx (XFmode);
+ rtx cst, cstln2, cst1;
+ rtx_insn *insn;
+
+ cst = const_double_from_real_value
+ (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
+ cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
+
+ emit_insn (gen_absxf2 (tmp, op1));
+
+ cst = force_reg (XFmode, cst);
+ ix86_expand_branch (GE, tmp, cst, label1);
+ predict_jump (REG_BR_PROB_BASE * 10 / 100);
+ insn = get_last_insn ();
+ JUMP_LABEL (insn) = label1;
+
+ emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
+ emit_jump (label2);
+
+ emit_label (label1);
+ LABEL_NUSES (label1) = 1;
+
+ cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
+ emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
+ emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
+
+ emit_label (label2);
+ LABEL_NUSES (label2) = 1;
+
+ emit_move_insn (op0, res);
+}
+
+/* Emit code for round calculation. */
+void
+ix86_emit_i387_round (rtx op0, rtx op1)
+{
+ machine_mode inmode = GET_MODE (op1);
+ machine_mode outmode = GET_MODE (op0);
+ rtx e1 = gen_reg_rtx (XFmode);
+ rtx e2 = gen_reg_rtx (XFmode);
+ rtx scratch = gen_reg_rtx (HImode);
+ rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
+ rtx half = const_double_from_real_value (dconsthalf, XFmode);
+ rtx res = gen_reg_rtx (outmode);
+ rtx_code_label *jump_label = gen_label_rtx ();
+ rtx (*floor_insn) (rtx, rtx);
+ rtx (*neg_insn) (rtx, rtx);
+ rtx_insn *insn;
+ rtx tmp;
+
+ switch (inmode)
+ {
+ case E_SFmode:
+ case E_DFmode:
+ tmp = gen_reg_rtx (XFmode);
+
+ emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
+ op1 = tmp;
+ break;
+ case E_XFmode:
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ switch (outmode)
+ {
+ case E_SFmode:
+ floor_insn = gen_frndintxf2_floor;
+ neg_insn = gen_negsf2;
+ break;
+ case E_DFmode:
+ floor_insn = gen_frndintxf2_floor;
+ neg_insn = gen_negdf2;
+ break;
+ case E_XFmode:
+ floor_insn = gen_frndintxf2_floor;
+ neg_insn = gen_negxf2;
+ break;
+ case E_HImode:
+ floor_insn = gen_lfloorxfhi2;
+ neg_insn = gen_neghi2;
+ break;
+ case E_SImode:
+ floor_insn = gen_lfloorxfsi2;
+ neg_insn = gen_negsi2;
+ break;
+ case E_DImode:
+ floor_insn = gen_lfloorxfdi2;
+ neg_insn = gen_negdi2;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
+
+ /* scratch = fxam(op1) */
+ emit_insn (gen_fxamxf2_i387 (scratch, op1));
+
+ /* e1 = fabs(op1) */
+ emit_insn (gen_absxf2 (e1, op1));
+
+ /* e2 = e1 + 0.5 */
+ half = force_reg (XFmode, half);
+ emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
+
+ /* res = floor(e2) */
+ switch (outmode)
+ {
+ case E_SFmode:
+ case E_DFmode:
+ {
+ tmp = gen_reg_rtx (XFmode);
+
+ emit_insn (floor_insn (tmp, e2));
+ emit_insn (gen_rtx_SET (res,
+ gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
+ UNSPEC_TRUNC_NOOP)));
+ }
+ break;
+ default:
+ emit_insn (floor_insn (res, e2));
+ }
+
+ /* flags = signbit(a) */
+ emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
+
+ /* if (flags) then res = -res */
+ tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
+ gen_rtx_EQ (VOIDmode, flags, const0_rtx),
+ gen_rtx_LABEL_REF (VOIDmode, jump_label),
+ pc_rtx);
+ insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
+ predict_jump (REG_BR_PROB_BASE * 50 / 100);
+ JUMP_LABEL (insn) = jump_label;
+
+ emit_insn (neg_insn (res, res));
+
+ emit_label (jump_label);
+ LABEL_NUSES (jump_label) = 1;
+
+ emit_move_insn (op0, res);
+}
+
+/* Output code to perform a Newton-Raphson approximation of a single precision
+ floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
+
+void
+ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
+{
+ rtx x0, x1, e0, e1;
+
+ x0 = gen_reg_rtx (mode);
+ e0 = gen_reg_rtx (mode);
+ e1 = gen_reg_rtx (mode);
+ x1 = gen_reg_rtx (mode);
+
+ /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
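+ /* Editorial sketch of the formula: one Newton-Raphson step for
+ f(x) = 1/x - b refines an estimate x0 of 1/b as
+ x1 = x0 * (2 - b * x0) = (x0 + x0) - b * x0 * x0,
+ which with x0 = rcp(b) is exactly the expression above. */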
+
+ b = force_reg (mode, b);
+
+ /* x0 = rcp(b) estimate */
+ if (mode == V16SFmode || mode == V8DFmode)
+ {
+ if (TARGET_AVX512ER)
+ {
+ emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
+ UNSPEC_RCP28)));
+ /* res = a * x0 */
+ emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
+ return;
+ }
+ else
+ emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
+ UNSPEC_RCP14)));
+ }
+ else
+ emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
+ UNSPEC_RCP)));
+
+ /* e0 = x0 * b */
+ emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
+
+ /* e0 = x0 * e0 */
+ emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
+
+ /* e1 = x0 + x0 */
+ emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
+
+ /* x1 = e1 - e0 */
+ emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
+
+ /* res = a * x1 */
+ emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
+}
+
+/* Output code to perform a Newton-Raphson approximation of a
+ single precision floating point [reciprocal] square root. */
+
+void
+ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
+{
+ rtx x0, e0, e1, e2, e3, mthree, mhalf;
+ REAL_VALUE_TYPE r;
+ int unspec;
+
+ x0 = gen_reg_rtx (mode);
+ e0 = gen_reg_rtx (mode);
+ e1 = gen_reg_rtx (mode);
+ e2 = gen_reg_rtx (mode);
+ e3 = gen_reg_rtx (mode);
+
+ if (TARGET_AVX512ER && mode == V16SFmode)
+ {
+ if (recip)
+ /* res = rsqrt28(a) estimate */
+ emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
+ UNSPEC_RSQRT28)));
+ else
+ {
+ /* x0 = rsqrt28(a) estimate */
+ emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
+ UNSPEC_RSQRT28)));
+ /* res = rcp28(x0) estimate */
+ emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
+ UNSPEC_RCP28)));
+ }
+ return;
+ }
+
+ real_from_integer (&r, VOIDmode, -3, SIGNED);
+ mthree = const_double_from_real_value (r, SFmode);
+
+ real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
+ mhalf = const_double_from_real_value (r, SFmode);
+ unspec = UNSPEC_RSQRT;
+
+ if (VECTOR_MODE_P (mode))
+ {
+ mthree = ix86_build_const_vector (mode, true, mthree);
+ mhalf = ix86_build_const_vector (mode, true, mhalf);
+ /* There is no 512-bit rsqrt. There is however rsqrt14. */
+ if (GET_MODE_SIZE (mode) == 64)
+ unspec = UNSPEC_RSQRT14;
+ }
+
+ /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
+ rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
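+ /* Editorial sketch: one Newton-Raphson step for f(x) = 1/(x*x) - a
+ refines an estimate x0 of rsqrt(a) as
+ x1 = 0.5 * x0 * (3 - a * x0 * x0) = -0.5 * x0 * (a * x0 * x0 - 3.0),
+ and sqrt(a) = a * rsqrt(a), which gives the two formulas above. */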
+
+ a = force_reg (mode, a);
+
+ /* x0 = rsqrt(a) estimate */
+ emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
+ unspec)));
+
+ /* If a == 0.0, the rsqrt estimate is infinity; mask it out to avoid
+ a NaN result for sqrt (0.0). */
+ if (!recip)
+ {
+ rtx zero = force_reg (mode, CONST0_RTX(mode));
+ rtx mask;
+
+ /* Handle masked compare. */
+ if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
+ {
+ mask = gen_reg_rtx (HImode);
+ /* Imm value 0x4 corresponds to not-equal comparison. */
+ emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
+ emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
+ }
+ else
+ {
+ mask = gen_reg_rtx (mode);
+ emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
+ emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
+ }
+ }
+
+ /* e0 = x0 * a */
+ emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
+ /* e1 = e0 * x0 */
+ emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
+
+ /* e2 = e1 - 3. */
+ mthree = force_reg (mode, mthree);
+ emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
+
+ mhalf = force_reg (mode, mhalf);
+ if (recip)
+ /* e3 = -.5 * x0 */
+ emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
+ else
+ /* e3 = -.5 * e0 */
+ emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
+ /* ret = e2 * e3 */
+ emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
+}
+
+/* Expand fabs (OP0) and return a new rtx that holds the result. The
+ mask for masking out the sign-bit is stored in *SMASK, if that is
+ non-null. */
+
+static rtx
+ix86_expand_sse_fabs (rtx op0, rtx *smask)
+{
+ machine_mode vmode, mode = GET_MODE (op0);
+ rtx xa, mask;
+
+ xa = gen_reg_rtx (mode);
+ if (mode == SFmode)
+ vmode = V4SFmode;
+ else if (mode == DFmode)
+ vmode = V2DFmode;
+ else
+ vmode = mode;
+ mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
+ if (!VECTOR_MODE_P (mode))
+ {
+ /* We need to generate a scalar mode mask in this case. */
+ rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
+ tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
+ mask = gen_reg_rtx (mode);
+ emit_insn (gen_rtx_SET (mask, tmp));
+ }
+ emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
+
+ if (smask)
+ *smask = mask;
+
+ return xa;
+}
+
+/* Expands a comparison of OP0 with OP1 using comparison code CODE,
+ swapping the operands if SWAP_OPERANDS is true. The expanded
+ code is a forward jump to a newly created label in case the
+ comparison is true. The generated label rtx is returned. */
+static rtx_code_label *
+ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
+ bool swap_operands)
+{
+ bool unordered_compare = ix86_unordered_fp_compare (code);
+ rtx_code_label *label;
+ rtx tmp, reg;
+
+ if (swap_operands)
+ std::swap (op0, op1);
+
+ label = gen_label_rtx ();
+ tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
+ if (unordered_compare)
+ tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
+ reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
+ emit_insn (gen_rtx_SET (reg, tmp));
+ tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
+ tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
+ gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
+ tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
+ JUMP_LABEL (tmp) = label;
+
+ return label;
+}
+
+/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
+ using comparison code CODE. Operands are swapped for the comparison if
+ SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
+static rtx
+ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
+ bool swap_operands)
+{
+ rtx (*insn)(rtx, rtx, rtx, rtx);
+ machine_mode mode = GET_MODE (op0);
+ rtx mask = gen_reg_rtx (mode);
+
+ if (swap_operands)
+ std::swap (op0, op1);
+
+ insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
+
+ emit_insn (insn (mask, op0, op1,
+ gen_rtx_fmt_ee (code, mode, op0, op1)));
+ return mask;
+}
+
+/* Expand copysign: combine the sign of SIGN with the nonnegative value
+ ABS_VALUE and store the result in RESULT. If MASK is non-null, it must
+ be the mask used to clear the sign bit (i.e. all bits set except the
+ sign bit). */
+
+static void
+ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
+{
+ machine_mode mode = GET_MODE (sign);
+ rtx sgn = gen_reg_rtx (mode);
+ if (mask == NULL_RTX)
+ {
+ machine_mode vmode;
+
+ if (mode == SFmode)
+ vmode = V4SFmode;
+ else if (mode == DFmode)
+ vmode = V2DFmode;
+ else
+ vmode = mode;
+
+ mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
+ if (!VECTOR_MODE_P (mode))
+ {
+ /* We need to generate a scalar mode mask in this case. */
+ rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
+ tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
+ mask = gen_reg_rtx (mode);
+ emit_insn (gen_rtx_SET (mask, tmp));
+ }
+ }
+ else
+ mask = gen_rtx_NOT (mode, mask);
+ emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
+ emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
+}
+
+/* Expand SSE sequence for computing lround from OP1 storing
+ into OP0. */
+
+void
+ix86_expand_lround (rtx op0, rtx op1)
+{
+ /* C code for the stuff we're doing below:
+ tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
+ return (long)tmp;
+ */
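+ /* Editorial note: the predecessor of 0.5 is used instead of 0.5 itself
+ so that inputs just below 0.5 (e.g. the largest double smaller than
+ 0.5) are not pushed up to 1.0 by the addition before the truncating
+ conversion. */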
+ machine_mode mode = GET_MODE (op1);
+ const struct real_format *fmt;
+ REAL_VALUE_TYPE pred_half, half_minus_pred_half;
+ rtx adj;
+
+ /* load nextafter (0.5, 0.0) */
+ fmt = REAL_MODE_FORMAT (mode);
+ real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
+ real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
+
+ /* adj = copysign (0.5, op1) */
+ adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
+ ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
+
+ /* adj = op1 + adj */
+ adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
+
+ /* op0 = (imode)adj */
+ expand_fix (op0, adj, 0);
+}
+
+/* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
+ into OPERAND0. */
+
+void
+ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
+{
+ /* C code for the stuff we're doing below (for do_floor):
+ xi = (long)op1;
+ xi -= (double)xi > op1 ? 1 : 0;
+ return xi;
+ */
+ machine_mode fmode = GET_MODE (op1);
+ machine_mode imode = GET_MODE (op0);
+ rtx ireg, freg, tmp;
+ rtx_code_label *label;
+
+ /* reg = (long)op1 */
+ ireg = gen_reg_rtx (imode);
+ expand_fix (ireg, op1, 0);
+
+ /* freg = (double)reg */
+ freg = gen_reg_rtx (fmode);
+ expand_float (freg, ireg, 0);
+
+ /* ireg = (freg > op1) ? ireg - 1 : ireg */
+ label = ix86_expand_sse_compare_and_jump (UNLE,
+ freg, op1, !do_floor);
+ tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
+ ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
+ emit_move_insn (ireg, tmp);
+
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+
+ emit_move_insn (op0, ireg);
+}
+
+/* Generate and return a rtx of mode MODE for 2**n where n is the number
+ of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
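+/* Editorial note: adding and then subtracting this constant is the usual
+   trick for rounding to an integer in the FPU's current rounding mode:
+   once a value has been pushed up into the 2**52 (resp. 2**23) binade
+   the format has no fractional bits left, so the addition itself rounds
+   and the subtraction recovers the rounded value. Several expanders
+   below rely on this. */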
+
+static rtx
+ix86_gen_TWO52 (machine_mode mode)
+{
+ REAL_VALUE_TYPE TWO52r;
+ rtx TWO52;
+
+ real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
+ TWO52 = const_double_from_real_value (TWO52r, mode);
+ TWO52 = force_reg (mode, TWO52);
+
+ return TWO52;
+}
+
+/* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
+
+void
+ix86_expand_rint (rtx operand0, rtx operand1)
+{
+ /* C code for the stuff we're doing below:
+ xa = fabs (operand1);
+ if (!isless (xa, 2**52))
+ return operand1;
+ two52 = 2**52;
+ if (flag_rounding_math)
+ {
+ two52 = copysign (two52, operand1);
+ xa = operand1;
+ }
+ xa = xa + two52 - two52;
+ return copysign (xa, operand1);
+ */
+ machine_mode mode = GET_MODE (operand0);
+ rtx res, xa, TWO52, two52, mask;
+ rtx_code_label *label;
+
+ res = gen_reg_rtx (mode);
+ emit_move_insn (res, operand1);
+
+ /* xa = abs (operand1) */
+ xa = ix86_expand_sse_fabs (res, &mask);
+
+ /* if (!isless (xa, TWO52)) goto label; */
+ TWO52 = ix86_gen_TWO52 (mode);
+ label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+ two52 = TWO52;
+ if (flag_rounding_math)
+ {
+ two52 = gen_reg_rtx (mode);
+ ix86_sse_copysign_to_positive (two52, TWO52, res, mask);
+ xa = res;
+ }
+
+ xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT);
+ xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT);
+
+ ix86_sse_copysign_to_positive (res, xa, res, mask);
+
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+
+ emit_move_insn (operand0, res);
+}
+
+/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
+ into OPERAND0. */
+void
+ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
+{
+ /* C code for the stuff we expand below.
+ double xa = fabs (x), x2;
+ if (!isless (xa, TWO52))
+ return x;
+ xa = xa + TWO52 - TWO52;
+ x2 = copysign (xa, x);
+ Compensate. Floor:
+ if (x2 > x)
+ x2 -= 1;
+ Compensate. Ceil:
+ if (x2 < x)
+ x2 -= -1;
+ return x2;
+ */
+ machine_mode mode = GET_MODE (operand0);
+ rtx xa, TWO52, tmp, one, res, mask;
+ rtx_code_label *label;
+
+ TWO52 = ix86_gen_TWO52 (mode);
+
+ /* Temporary for holding the result, initialized to the input
+ operand to ease control flow. */
+ res = gen_reg_rtx (mode);
+ emit_move_insn (res, operand1);
+
+ /* xa = abs (operand1) */
+ xa = ix86_expand_sse_fabs (res, &mask);
+
+ /* if (!isless (xa, TWO52)) goto label; */
+ label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+ /* xa = xa + TWO52 - TWO52; */
+ xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
+ xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
+
+ /* xa = copysign (xa, operand1) */
+ ix86_sse_copysign_to_positive (xa, xa, res, mask);
+
+ /* generate 1.0 or -1.0 */
+ one = force_reg (mode,
+ const_double_from_real_value (do_floor
+ ? dconst1 : dconstm1, mode));
+
+ /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
+ tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
+ emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
+ /* We always need to subtract here to preserve signed zero. */
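+ /* (Editorial note: adding 0.0 to -0.0 would yield +0.0, whereas
+ subtracting 0.0 keeps -0.0. For the ceil case ONE is -1.0, so the
+ subtraction still adds 1.0 where the mask is set.) */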
+ tmp = expand_simple_binop (mode, MINUS,
+ xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
+ emit_move_insn (res, tmp);
+
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+
+ emit_move_insn (operand0, res);
+}
+
+/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
+ into OPERAND0. */
+void
+ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
+{
+ /* C code for the stuff we expand below.
+ double xa = fabs (x), x2;
+ if (!isless (xa, TWO52))
+ return x;
+ x2 = (double)(long)x;
+ Compensate. Floor:
+ if (x2 > x)
+ x2 -= 1;
+ Compensate. Ceil:
+ if (x2 < x)
+ x2 += 1;
+ if (HONOR_SIGNED_ZEROS (mode))
+ return copysign (x2, x);
+ return x2;
+ */
+ machine_mode mode = GET_MODE (operand0);
+ rtx xa, xi, TWO52, tmp, one, res, mask;
+ rtx_code_label *label;
+
+ TWO52 = ix86_gen_TWO52 (mode);
+
+ /* Temporary for holding the result, initialized to the input
+ operand to ease control flow. */
+ res = gen_reg_rtx (mode);
+ emit_move_insn (res, operand1);
+
+ /* xa = abs (operand1) */
+ xa = ix86_expand_sse_fabs (res, &mask);
+
+ /* if (!isless (xa, TWO52)) goto label; */
+ label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+ /* xa = (double)(long)x */
+ xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
+ expand_fix (xi, res, 0);
+ expand_float (xa, xi, 0);
+
+ /* generate 1.0 */
+ one = force_reg (mode, const_double_from_real_value (dconst1, mode));
+
+ /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
+ tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
+ emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
+ tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
+ xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
+ emit_move_insn (res, tmp);
+
+ if (HONOR_SIGNED_ZEROS (mode))
+ ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
+
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+
+ emit_move_insn (operand0, res);
+}
+
+/* Expand SSE sequence for computing round from OPERAND1 storing
+ into OPERAND0. This sequence works without relying on DImode truncation
+ via cvttsd2siq, which is only available on 64-bit targets. */
+void
+ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
+{
+ /* C code for the stuff we expand below.
+ double xa = fabs (x), xa2, x2;
+ if (!isless (xa, TWO52))
+ return x;
+ Using the absolute value and copying the sign back afterwards
+ keeps -0.0 -> -0.0 correct.
+ xa2 = xa + TWO52 - TWO52;
+ Compensate.
+ dxa = xa2 - xa;
+ if (dxa <= -0.5)
+ xa2 += 1;
+ else if (dxa > 0.5)
+ xa2 -= 1;
+ x2 = copysign (xa2, x);
+ return x2;
+ */
+ machine_mode mode = GET_MODE (operand0);
+ rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
+ rtx_code_label *label;
+
+ TWO52 = ix86_gen_TWO52 (mode);
+
+ /* Temporary for holding the result, initialized to the input
+ operand to ease control flow. */
+ res = gen_reg_rtx (mode);
+ emit_move_insn (res, operand1);
+
+ /* xa = abs (operand1) */
+ xa = ix86_expand_sse_fabs (res, &mask);
+
+ /* if (!isless (xa, TWO52)) goto label; */
+ label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+ /* xa2 = xa + TWO52 - TWO52; */
+ xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
+ xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
+
+ /* dxa = xa2 - xa; */
+ dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
+
+ /* generate 0.5, 1.0 and -0.5 */
+ half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
+ one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
+ mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
+ 0, OPTAB_DIRECT);
+
+ /* Compensate. */
+ tmp = gen_reg_rtx (mode);
+ /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
+ tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
+ emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
+ xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
+ /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
+ tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
+ emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
+ xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
+
+ /* res = copysign (xa2, operand1) */
+ ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
+
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+
+ emit_move_insn (operand0, res);
+}
+
+/* Expand SSE sequence for computing trunc from OPERAND1 storing
+ into OPERAND0. */
+void
+ix86_expand_trunc (rtx operand0, rtx operand1)
+{
+ /* C code for SSE variant we expand below.
+ double xa = fabs (x), x2;
+ if (!isless (xa, TWO52))
+ return x;
+ x2 = (double)(long)x;
+ if (HONOR_SIGNED_ZEROS (mode))
+ return copysign (x2, x);
+ return x2;
+ */
+ machine_mode mode = GET_MODE (operand0);
+ rtx xa, xi, TWO52, res, mask;
+ rtx_code_label *label;
+
+ TWO52 = ix86_gen_TWO52 (mode);
+
+ /* Temporary for holding the result, initialized to the input
+ operand to ease control flow. */
+ res = gen_reg_rtx (mode);
+ emit_move_insn (res, operand1);
+
+ /* xa = abs (operand1) */
+ xa = ix86_expand_sse_fabs (res, &mask);
+
+ /* if (!isless (xa, TWO52)) goto label; */
+ label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+ /* x = (double)(long)x */
+ xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
+ expand_fix (xi, res, 0);
+ expand_float (res, xi, 0);
+
+ if (HONOR_SIGNED_ZEROS (mode))
+ ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
+
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+
+ emit_move_insn (operand0, res);
+}
+
+/* Expand SSE sequence for computing trunc from OPERAND1 storing
+ into OPERAND0. This variant avoids the DImode truncation used by
+ ix86_expand_trunc and so does not require cvttsd2siq. */
+void
+ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
+{
+ machine_mode mode = GET_MODE (operand0);
+ rtx xa, mask, TWO52, one, res, smask, tmp;
+ rtx_code_label *label;
+
+ /* C code for SSE variant we expand below.
+ double xa = fabs (x), xa2, x2;
+ if (!isless (xa, TWO52))
+ return x;
+ xa2 = xa + TWO52 - TWO52;
+ Compensate:
+ if (xa2 > xa)
+ xa2 -= 1.0;
+ x2 = copysign (xa2, x);
+ return x2;
+ */
+
+ TWO52 = ix86_gen_TWO52 (mode);
+
+ /* Temporary for holding the result, initialized to the input
+ operand to ease control flow. */
+ res = gen_reg_rtx (mode);
+ emit_move_insn (res, operand1);
+
+ /* xa = abs (operand1) */
+ xa = ix86_expand_sse_fabs (res, &smask);
+
+ /* if (!isless (xa, TWO52)) goto label; */
+ label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+ /* res = xa + TWO52 - TWO52; */
+ tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
+ tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
+ emit_move_insn (res, tmp);
+
+ /* generate 1.0 */
+ one = force_reg (mode, const_double_from_real_value (dconst1, mode));
+
+ /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
+ mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
+ emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
+ tmp = expand_simple_binop (mode, MINUS,
+ res, mask, NULL_RTX, 0, OPTAB_DIRECT);
+ emit_move_insn (res, tmp);
+
+ /* res = copysign (res, operand1) */
+ ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
+
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+
+ emit_move_insn (operand0, res);
+}
+
+/* Expand SSE sequence for computing round from OPERAND1 storing
+ into OPERAND0. */
+void
+ix86_expand_round (rtx operand0, rtx operand1)
+{
+ /* C code for the stuff we're doing below:
+ double xa = fabs (x);
+ if (!isless (xa, TWO52))
+ return x;
+ xa = (double)(long)(xa + nextafter (0.5, 0.0));
+ return copysign (xa, x);
+ */
+ machine_mode mode = GET_MODE (operand0);
+ rtx res, TWO52, xa, xi, half, mask;
+ rtx_code_label *label;
+ const struct real_format *fmt;
+ REAL_VALUE_TYPE pred_half, half_minus_pred_half;
+
+ /* Temporary for holding the result, initialized to the input
+ operand to ease control flow. */
+ res = gen_reg_rtx (mode);
+ emit_move_insn (res, operand1);
+
+ TWO52 = ix86_gen_TWO52 (mode);
+ xa = ix86_expand_sse_fabs (res, &mask);
+ label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+ /* load nextafter (0.5, 0.0) */
+ fmt = REAL_MODE_FORMAT (mode);
+ real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
+ real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
+
+ /* xa = xa + 0.5 */
+ half = force_reg (mode, const_double_from_real_value (pred_half, mode));
+ xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
+
+ /* xa = (double)(int64_t)xa */
+ xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
+ expand_fix (xi, xa, 0);
+ expand_float (xa, xi, 0);
+
+ /* res = copysign (xa, operand1) */
+ ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
+
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+
+ emit_move_insn (operand0, res);
+}
+
+/* Expand SSE sequence for computing round
+ from OP1 storing into OP0 using sse4 round insn. */
+void
+ix86_expand_round_sse4 (rtx op0, rtx op1)
+{
+ machine_mode mode = GET_MODE (op0);
+ rtx e1, e2, res, half;
+ const struct real_format *fmt;
+ REAL_VALUE_TYPE pred_half, half_minus_pred_half;
+ rtx (*gen_copysign) (rtx, rtx, rtx);
+ rtx (*gen_round) (rtx, rtx, rtx);
+
+ switch (mode)
+ {
+ case E_SFmode:
+ gen_copysign = gen_copysignsf3;
+ gen_round = gen_sse4_1_roundsf2;
+ break;
+ case E_DFmode:
+ gen_copysign = gen_copysigndf3;
+ gen_round = gen_sse4_1_rounddf2;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ /* round (a) = trunc (a + copysign (0.5, a)) */
+
+ /* load nextafter (0.5, 0.0) */
+ fmt = REAL_MODE_FORMAT (mode);
+ real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
+ real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
+ half = const_double_from_real_value (pred_half, mode);
+
+ /* e1 = copysign (0.5, op1) */
+ e1 = gen_reg_rtx (mode);
+ emit_insn (gen_copysign (e1, half, op1));
+
+ /* e2 = op1 + e1 */
+ e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
+
+ /* res = trunc (e2) */
+ res = gen_reg_rtx (mode);
+ emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
+
+ emit_move_insn (op0, res);
+}
+
+/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
+ insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
+ insn every time. */
+
+static GTY(()) rtx_insn *vselect_insn;
+
+/* Initialize vselect_insn. */
+
+static void
+init_vselect_insn (void)
+{
+ unsigned i;
+ rtx x;
+
+ x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
+ for (i = 0; i < MAX_VECT_LEN; ++i)
+ XVECEXP (x, 0, i) = const0_rtx;
+ x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
+ const0_rtx), x);
+ x = gen_rtx_SET (const0_rtx, x);
+ start_sequence ();
+ vselect_insn = emit_insn (x);
+ end_sequence ();
+}
+
+/* Construct (set target (vec_select op0 (parallel perm))) and
+ return true if that's a valid instruction in the active ISA. */
+
+static bool
+expand_vselect (rtx target, rtx op0, const unsigned char *perm,
+ unsigned nelt, bool testing_p)
+{
+ unsigned int i;
+ rtx x, save_vconcat;
+ int icode;
+
+ if (vselect_insn == NULL_RTX)
+ init_vselect_insn ();
+
+ x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
+ PUT_NUM_ELEM (XVEC (x, 0), nelt);
+ for (i = 0; i < nelt; ++i)
+ XVECEXP (x, 0, i) = GEN_INT (perm[i]);
+ save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
+ XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
+ PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
+ SET_DEST (PATTERN (vselect_insn)) = target;
+ icode = recog_memoized (vselect_insn);
+
+ if (icode >= 0 && !testing_p)
+ emit_insn (copy_rtx (PATTERN (vselect_insn)));
+
+ SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
+ XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
+ INSN_CODE (vselect_insn) = -1;
+
+ return icode >= 0;
+}
+
+/* Similar, but generate a vec_concat from op0 and op1 as well. */
+
+static bool
+expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
+ const unsigned char *perm, unsigned nelt,
+ bool testing_p)
+{
+ machine_mode v2mode;
+ rtx x;
+ bool ok;
+
+ if (vselect_insn == NULL_RTX)
+ init_vselect_insn ();
+
+ if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
+ return false;
+ x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
+ PUT_MODE (x, v2mode);
+ XEXP (x, 0) = op0;
+ XEXP (x, 1) = op1;
+ ok = expand_vselect (target, x, perm, nelt, testing_p);
+ XEXP (x, 0) = const0_rtx;
+ XEXP (x, 1) = const0_rtx;
+ return ok;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+ using movss or movsd. */
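+/* Editorial example: for V4SFmode, the permutation {4, 1, 2, 3} takes
+   element 0 from the second operand and the rest from the first, which
+   is what movss does; {0, 5, 6, 7} is the mirrored case handled by
+   swapping the VEC_MERGE operands below. */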
+static bool
+expand_vec_perm_movs (struct expand_vec_perm_d *d)
+{
+ machine_mode vmode = d->vmode;
+ unsigned i, nelt = d->nelt;
+ rtx x;
+
+ if (d->one_operand_p)
+ return false;
+
+ if (!(TARGET_SSE && vmode == V4SFmode)
+ && !(TARGET_SSE2 && vmode == V2DFmode))
+ return false;
+
+ /* Only the first element is changed. */
+ if (d->perm[0] != nelt && d->perm[0] != 0)
+ return false;
+ for (i = 1; i < nelt; ++i)
+ if (d->perm[i] != i + nelt - d->perm[0])
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ if (d->perm[0] == nelt)
+ x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
+ else
+ x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
+
+ emit_insn (gen_rtx_SET (d->target, x));
+
+ return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+ in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
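+/* Editorial example: for V8HImode, the permutation
+   {0, 9, 2, 11, 4, 13, 6, 15} keeps every element in its lane and
+   becomes a pblendw with immediate mask 0xaa (a bit is set for each
+   element taken from op1). */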
+
+static bool
+expand_vec_perm_blend (struct expand_vec_perm_d *d)
+{
+ machine_mode mmode, vmode = d->vmode;
+ unsigned i, mask, nelt = d->nelt;
+ rtx target, op0, op1, maskop, x;
+ rtx rperm[32], vperm;
+
+ if (d->one_operand_p)
+ return false;
+ if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
+ && (TARGET_AVX512BW
+ || GET_MODE_UNIT_SIZE (vmode) >= 4))
+ ;
+ else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
+ ;
+ else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
+ ;
+ else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
+ ;
+ else
+ return false;
+
+ /* This is a blend, not a permute. Elements must stay in their
+ respective lanes. */
+ for (i = 0; i < nelt; ++i)
+ {
+ unsigned e = d->perm[i];
+ if (!(e == i || e == i + nelt))
+ return false;
+ }
+
+ if (d->testing_p)
+ return true;
+
+ /* ??? Without SSE4.1, we could implement this with and/andn/or. This
+ decision should be extracted elsewhere, so that we only try that
+ sequence once all budget==3 options have been tried. */
+ target = d->target;
+ op0 = d->op0;
+ op1 = d->op1;
+ mask = 0;
+
+ switch (vmode)
+ {
+ case E_V8DFmode:
+ case E_V16SFmode:
+ case E_V4DFmode:
+ case E_V8SFmode:
+ case E_V2DFmode:
+ case E_V4SFmode:
+ case E_V8HImode:
+ case E_V8SImode:
+ case E_V32HImode:
+ case E_V64QImode:
+ case E_V16SImode:
+ case E_V8DImode:
+ for (i = 0; i < nelt; ++i)
+ mask |= (d->perm[i] >= nelt) << i;
+ break;
+
+ case E_V2DImode:
+ for (i = 0; i < 2; ++i)
+ mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
+ vmode = V8HImode;
+ goto do_subreg;
+
+ case E_V4SImode:
+ for (i = 0; i < 4; ++i)
+ mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
+ vmode = V8HImode;
+ goto do_subreg;
+
+ case E_V16QImode:
+ /* See if bytes move in pairs so we can use pblendw with
+ an immediate argument, rather than pblendvb with a vector
+ argument. */
+ for (i = 0; i < 16; i += 2)
+ if (d->perm[i] + 1 != d->perm[i + 1])
+ {
+ use_pblendvb:
+ for (i = 0; i < nelt; ++i)
+ rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
+
+ finish_pblendvb:
+ vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
+ vperm = force_reg (vmode, vperm);
+
+ if (GET_MODE_SIZE (vmode) == 16)
+ emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
+ else
+ emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
+ if (target != d->target)
+ emit_move_insn (d->target, gen_lowpart (d->vmode, target));
+ return true;
+ }
+
+ for (i = 0; i < 8; ++i)
+ mask |= (d->perm[i * 2] >= 16) << i;
+ vmode = V8HImode;
+ /* FALLTHRU */
+
+ do_subreg:
+ target = gen_reg_rtx (vmode);
+ op0 = gen_lowpart (vmode, op0);
+ op1 = gen_lowpart (vmode, op1);
+ break;
+
+ case E_V32QImode:
+ /* See if bytes move in pairs. If not, vpblendvb must be used. */
+ for (i = 0; i < 32; i += 2)
+ if (d->perm[i] + 1 != d->perm[i + 1])
+ goto use_pblendvb;
+ /* See if bytes move in quadruplets. If yes, vpblendd
+ with immediate can be used. */
+ for (i = 0; i < 32; i += 4)
+ if (d->perm[i] + 2 != d->perm[i + 2])
+ break;
+ if (i < 32)
+ {
+ /* See if bytes move the same in both lanes. If yes,
+ vpblendw with immediate can be used. */
+ for (i = 0; i < 16; i += 2)
+ if (d->perm[i] + 16 != d->perm[i + 16])
+ goto use_pblendvb;
+
+ /* Use vpblendw. */
+ for (i = 0; i < 16; ++i)
+ mask |= (d->perm[i * 2] >= 32) << i;
+ vmode = V16HImode;
+ goto do_subreg;
+ }
+
+ /* Use vpblendd. */
+ for (i = 0; i < 8; ++i)
+ mask |= (d->perm[i * 4] >= 32) << i;
+ vmode = V8SImode;
+ goto do_subreg;
+
+ case E_V16HImode:
+ /* See if words move in pairs. If yes, vpblendd can be used. */
+ for (i = 0; i < 16; i += 2)
+ if (d->perm[i] + 1 != d->perm[i + 1])
+ break;
+ if (i < 16)
+ {
+ /* See if words move the same in both lanes. If not,
+ vpblendvb must be used. */
+ for (i = 0; i < 8; i++)
+ if (d->perm[i] + 8 != d->perm[i + 8])
+ {
+ /* Use vpblendvb. */
+ for (i = 0; i < 32; ++i)
+ rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
+
+ vmode = V32QImode;
+ nelt = 32;
+ target = gen_reg_rtx (vmode);
+ op0 = gen_lowpart (vmode, op0);
+ op1 = gen_lowpart (vmode, op1);
+ goto finish_pblendvb;
+ }
+
+ /* Use vpblendw. */
+ for (i = 0; i < 16; ++i)
+ mask |= (d->perm[i] >= 16) << i;
+ break;
+ }
+
+ /* Use vpblendd. */
+ for (i = 0; i < 8; ++i)
+ mask |= (d->perm[i * 2] >= 16) << i;
+ vmode = V8SImode;
+ goto do_subreg;
+
+ case E_V4DImode:
+ /* Use vpblendd. */
+ for (i = 0; i < 4; ++i)
+ mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
+ vmode = V8SImode;
+ goto do_subreg;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ switch (vmode)
+ {
+ case E_V8DFmode:
+ case E_V8DImode:
+ mmode = QImode;
+ break;
+ case E_V16SFmode:
+ case E_V16SImode:
+ mmode = HImode;
+ break;
+ case E_V32HImode:
+ mmode = SImode;
+ break;
+ case E_V64QImode:
+ mmode = DImode;
+ break;
+ default:
+ mmode = VOIDmode;
+ }
+
+ if (mmode != VOIDmode)
+ maskop = force_reg (mmode, gen_int_mode (mask, mmode));
+ else
+ maskop = GEN_INT (mask);
+
+ /* This matches five different patterns with the different modes. */
+ x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
+ x = gen_rtx_SET (target, x);
+ emit_insn (x);
+ if (target != d->target)
+ emit_move_insn (d->target, gen_lowpart (d->vmode, target));
+
+ return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+ in terms of the variable form of vpermilps.
+
+ Note that we will have already failed the immediate input vpermilps,
+ which requires that the high and low part shuffle be identical; the
+ variable form doesn't require that. */
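+/* Editorial example: the V8SFmode permutation {3, 2, 1, 0, 7, 6, 5, 4}
+   reverses the elements within each 128-bit lane and is handled here by
+   loading the control vector {3, 2, 1, 0, 3, 2, 1, 0} for vpermilps. */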
+
+static bool
+expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
+{
+ rtx rperm[8], vperm;
+ unsigned i;
+
+ if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
+ return false;
+
+ /* We can only permute within the 128-bit lane. */
+ for (i = 0; i < 8; ++i)
+ {
+ unsigned e = d->perm[i];
+ if (i < 4 ? e >= 4 : e < 4)
+ return false;
+ }
+
+ if (d->testing_p)
+ return true;
+
+ for (i = 0; i < 8; ++i)
+ {
+ unsigned e = d->perm[i];
+
+ /* Within each 128-bit lane, the elements of op0 are numbered
+ from 0 and the elements of op1 are numbered from 4. */
+ if (e >= 8 + 4)
+ e -= 8;
+ else if (e >= 4)
+ e -= 4;
+
+ rperm[i] = GEN_INT (e);
+ }
+
+ vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
+ vperm = force_reg (V8SImode, vperm);
+ emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
+
+ return true;
+}
+
+/* Return true if permutation D can be performed as VMODE permutation
+ instead. */
+
+static bool
+valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
+{
+ unsigned int i, j, chunk;
+
+ if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
+ || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
+ || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
+ return false;
+
+ if (GET_MODE_NUNITS (vmode) >= d->nelt)
+ return true;
+
+ chunk = d->nelt / GET_MODE_NUNITS (vmode);
+ for (i = 0; i < d->nelt; i += chunk)
+ if (d->perm[i] & (chunk - 1))
+ return false;
+ else
+ for (j = 1; j < chunk; ++j)
+ if (d->perm[i] + j != d->perm[i + j])
+ return false;
+
+ return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+ in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
+
+static bool
+expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
+{
+ unsigned i, nelt, eltsz, mask;
+ unsigned char perm[64];
+ machine_mode vmode = V16QImode;
+ rtx rperm[64], vperm, target, op0, op1;
+
+ nelt = d->nelt;
+
+ if (!d->one_operand_p)
+ {
+ if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
+ {
+ if (TARGET_AVX2
+ && valid_perm_using_mode_p (V2TImode, d))
+ {
+ if (d->testing_p)
+ return true;
+
+ /* Use vperm2i128 insn. The pattern uses
+ V4DImode instead of V2TImode. */
+ target = d->target;
+ if (d->vmode != V4DImode)
+ target = gen_reg_rtx (V4DImode);
+ op0 = gen_lowpart (V4DImode, d->op0);
+ op1 = gen_lowpart (V4DImode, d->op1);
+ rperm[0]
+ = GEN_INT ((d->perm[0] / (nelt / 2))
+ | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
+ emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
+ if (target != d->target)
+ emit_move_insn (d->target, gen_lowpart (d->vmode, target));
+ return true;
+ }
+ return false;
+ }
+ }
+ else
+ {
+ if (GET_MODE_SIZE (d->vmode) == 16)
+ {
+ if (!TARGET_SSSE3)
+ return false;
+ }
+ else if (GET_MODE_SIZE (d->vmode) == 32)
+ {
+ if (!TARGET_AVX2)
+ return false;
+
+ /* V4DImode should be already handled through
+ expand_vselect by vpermq instruction. */
+ gcc_assert (d->vmode != V4DImode);
+
+ vmode = V32QImode;
+ if (d->vmode == V8SImode
+ || d->vmode == V16HImode
+ || d->vmode == V32QImode)
+ {
+ /* First see if vpermq can be used for
+ V8SImode/V16HImode/V32QImode. */
+ if (valid_perm_using_mode_p (V4DImode, d))
+ {
+ for (i = 0; i < 4; i++)
+ perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
+ if (d->testing_p)
+ return true;
+ target = gen_reg_rtx (V4DImode);
+ if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
+ perm, 4, false))
+ {
+ emit_move_insn (d->target,
+ gen_lowpart (d->vmode, target));
+ return true;
+ }
+ return false;
+ }
+
+ /* Next see if vpermd can be used. */
+ if (valid_perm_using_mode_p (V8SImode, d))
+ vmode = V8SImode;
+ }
+ /* Or if vpermps can be used. */
+ else if (d->vmode == V8SFmode)
+ vmode = V8SImode;
+
+ if (vmode == V32QImode)
+ {
+ /* vpshufb only works intra lanes; it is not
+ possible to shuffle bytes in between the lanes. */
+ for (i = 0; i < nelt; ++i)
+ if ((d->perm[i] ^ i) & (nelt / 2))
+ return false;
+ }
+ }
+ else if (GET_MODE_SIZE (d->vmode) == 64)
+ {
+ if (!TARGET_AVX512BW)
+ return false;
+
+ /* If vpermq didn't work, vpshufb won't work either. */
+ if (d->vmode == V8DFmode || d->vmode == V8DImode)
+ return false;
+
+ vmode = V64QImode;
+ if (d->vmode == V16SImode
+ || d->vmode == V32HImode
+ || d->vmode == V64QImode)
+ {
+ /* First see if vpermq can be used for
+ V16SImode/V32HImode/V64QImode. */
+ if (valid_perm_using_mode_p (V8DImode, d))
+ {
+ for (i = 0; i < 8; i++)
+ perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
+ if (d->testing_p)
+ return true;
+ target = gen_reg_rtx (V8DImode);
+ if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
+ perm, 8, false))
+ {
+ emit_move_insn (d->target,
+ gen_lowpart (d->vmode, target));
+ return true;
+ }
+ return false;
+ }
+
+ /* Next see if vpermd can be used. */
+ if (valid_perm_using_mode_p (V16SImode, d))
+ vmode = V16SImode;
+ }
+ /* Or if vpermps can be used. */
+ else if (d->vmode == V16SFmode)
+ vmode = V16SImode;
+ if (vmode == V64QImode)
+ {
+ /* vpshufb only works intra lanes; it is not
+ possible to shuffle bytes in between the lanes. */
+ for (i = 0; i < nelt; ++i)
+ if ((d->perm[i] ^ i) & (nelt / 4))
+ return false;
+ }
+ }
+ else
+ return false;
+ }
+
+ if (d->testing_p)
+ return true;
+
+ if (vmode == V8SImode)
+ for (i = 0; i < 8; ++i)
+ rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
+ else if (vmode == V16SImode)
+ for (i = 0; i < 16; ++i)
+ rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
+ else
+ {
+ eltsz = GET_MODE_UNIT_SIZE (d->vmode);
+ if (!d->one_operand_p)
+ mask = 2 * nelt - 1;
+ else if (vmode == V16QImode)
+ mask = nelt - 1;
+ else if (vmode == V64QImode)
+ mask = nelt / 4 - 1;
+ else
+ mask = nelt / 2 - 1;
+
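+ /* Expand the element-level permutation into a byte-level pshufb/pperm
+ control vector: element E of width ELTSZ bytes becomes the control
+ bytes E*ELTSZ .. E*ELTSZ + ELTSZ - 1. */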
+ for (i = 0; i < nelt; ++i)
+ {
+ unsigned j, e = d->perm[i] & mask;
+ for (j = 0; j < eltsz; ++j)
+ rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
+ }
+ }
+
+ vperm = gen_rtx_CONST_VECTOR (vmode,
+ gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
+ vperm = force_reg (vmode, vperm);
+
+ target = d->target;
+ if (d->vmode != vmode)
+ target = gen_reg_rtx (vmode);
+ op0 = gen_lowpart (vmode, d->op0);
+ if (d->one_operand_p)
+ {
+ if (vmode == V16QImode)
+ emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
+ else if (vmode == V32QImode)
+ emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
+ else if (vmode == V64QImode)
+ emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
+ else if (vmode == V8SFmode)
+ emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
+ else if (vmode == V8SImode)
+ emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
+ else if (vmode == V16SFmode)
+ emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
+ else if (vmode == V16SImode)
+ emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
+ else
+ gcc_unreachable ();
+ }
+ else
+ {
+ op1 = gen_lowpart (vmode, d->op1);
+ emit_insn (gen_xop_pperm (target, op0, op1, vperm));
+ }
+ if (target != d->target)
+ emit_move_insn (d->target, gen_lowpart (d->vmode, target));
+
+ return true;
+}
+
+/* For V*[QHS]Imode permutations, check whether the same permutation
+ can be performed in a 2x, 4x or 8x wider inner mode; if so, describe
+ the wider permutation in *ND and return true. */
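+/* Editorial example: the V16QImode permutation
+   {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13} moves bytes in
+   aligned pairs and is rewritten as the V8HImode permutation
+   {1, 0, 3, 2, 5, 4, 7, 6}. */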
+
+static bool
+canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
+ struct expand_vec_perm_d *nd)
+{
+ int i;
+ machine_mode mode = VOIDmode;
+
+ switch (d->vmode)
+ {
+ case E_V16QImode: mode = V8HImode; break;
+ case E_V32QImode: mode = V16HImode; break;
+ case E_V64QImode: mode = V32HImode; break;
+ case E_V8HImode: mode = V4SImode; break;
+ case E_V16HImode: mode = V8SImode; break;
+ case E_V32HImode: mode = V16SImode; break;
+ case E_V4SImode: mode = V2DImode; break;
+ case E_V8SImode: mode = V4DImode; break;
+ case E_V16SImode: mode = V8DImode; break;
+ default: return false;
+ }
+ for (i = 0; i < d->nelt; i += 2)
+ if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
+ return false;
+ nd->vmode = mode;
+ nd->nelt = d->nelt / 2;
+ for (i = 0; i < nd->nelt; i++)
+ nd->perm[i] = d->perm[2 * i] / 2;
+ if (GET_MODE_INNER (mode) != DImode)
+ canonicalize_vector_int_perm (nd, nd);
+ if (nd != d)
+ {
+ nd->one_operand_p = d->one_operand_p;
+ nd->testing_p = d->testing_p;
+ if (d->op0 == d->op1)
+ nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
+ else
+ {
+ nd->op0 = gen_lowpart (nd->vmode, d->op0);
+ nd->op1 = gen_lowpart (nd->vmode, d->op1);
+ }
+ if (d->testing_p)
+ nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
+ else
+ nd->target = gen_reg_rtx (nd->vmode);
+ }
+ return true;
+}
+
+/* Try to expand one-operand permutation with constant mask. */
+
+static bool
+ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
+{
+ machine_mode mode = GET_MODE (d->op0);
+ machine_mode maskmode = mode;
+ rtx (*gen) (rtx, rtx, rtx) = NULL;
+ rtx target, op0, mask;
+ rtx vec[64];
+
+ if (!rtx_equal_p (d->op0, d->op1))
+ return false;
+
+ if (!TARGET_AVX512F)
+ return false;
+
+ switch (mode)
+ {
+ case E_V16SImode:
+ gen = gen_avx512f_permvarv16si;
+ break;
+ case E_V16SFmode:
+ gen = gen_avx512f_permvarv16sf;
+ maskmode = V16SImode;
+ break;
+ case E_V8DImode:
+ gen = gen_avx512f_permvarv8di;
+ break;
+ case E_V8DFmode:
+ gen = gen_avx512f_permvarv8df;
+ maskmode = V8DImode;
+ break;
+ default:
+ return false;
+ }
+
+ target = d->target;
+ op0 = d->op0;
+ for (int i = 0; i < d->nelt; ++i)
+ vec[i] = GEN_INT (d->perm[i]);
+ mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
+ emit_insn (gen (target, op0, force_reg (maskmode, mask)));
+ return true;
+}
+
+static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
+ in a single instruction. */
+
+static bool
+expand_vec_perm_1 (struct expand_vec_perm_d *d)
+{
+ unsigned i, nelt = d->nelt;
+ struct expand_vec_perm_d nd;
+
+ /* Check plain VEC_SELECT first, because AVX has instructions that could
+ match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
+ input where SEL+CONCAT may not. */
+ if (d->one_operand_p)
+ {
+ int mask = nelt - 1;
+ bool identity_perm = true;
+ bool broadcast_perm = true;
+
+ for (i = 0; i < nelt; i++)
+ {
+ nd.perm[i] = d->perm[i] & mask;
+ if (nd.perm[i] != i)
+ identity_perm = false;
+ if (nd.perm[i])
+ broadcast_perm = false;
+ }
+
+ if (identity_perm)
+ {
+ if (!d->testing_p)
+ emit_move_insn (d->target, d->op0);
+ return true;
+ }
+ else if (broadcast_perm && TARGET_AVX2)
+ {
+ /* Use vpbroadcast{b,w,d}. */
+ rtx (*gen) (rtx, rtx) = NULL;
+ switch (d->vmode)
+ {
+ case E_V64QImode:
+ if (TARGET_AVX512BW)
+ gen = gen_avx512bw_vec_dupv64qi_1;
+ break;
+ case E_V32QImode:
+ gen = gen_avx2_pbroadcastv32qi_1;
+ break;
+ case E_V32HImode:
+ if (TARGET_AVX512BW)
+ gen = gen_avx512bw_vec_dupv32hi_1;
+ break;
+ case E_V16HImode:
+ gen = gen_avx2_pbroadcastv16hi_1;
+ break;
+ case E_V16SImode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv16si_1;
+ break;
+ case E_V8SImode:
+ gen = gen_avx2_pbroadcastv8si_1;
+ break;
+ case E_V16QImode:
+ gen = gen_avx2_pbroadcastv16qi;
+ break;
+ case E_V8HImode:
+ gen = gen_avx2_pbroadcastv8hi;
+ break;
+ case E_V16SFmode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv16sf_1;
+ break;
+ case E_V8SFmode:
+ gen = gen_avx2_vec_dupv8sf_1;
+ break;
+ case E_V8DFmode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv8df_1;
+ break;
+ case E_V8DImode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv8di_1;
+ break;
+ /* For other modes prefer other shuffles this function creates. */
+ default: break;
+ }
+ if (gen != NULL)
+ {
+ if (!d->testing_p)
+ emit_insn (gen (d->target, d->op0));
+ return true;
+ }
+ }
+
+ if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
+ return true;
+
+ /* There are plenty of patterns in sse.md that are written for
+ SEL+CONCAT and are not replicated for a single op. Perhaps
+ that should be changed, to avoid the nastiness here. */
+
+ /* Recognize interleave style patterns, which means incrementing
+ every other permutation operand. */
+ for (i = 0; i < nelt; i += 2)
+ {
+ nd.perm[i] = d->perm[i] & mask;
+ nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
+ }
+ if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
+ d->testing_p))
+ return true;
+
+ /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
+ if (nelt >= 4)
+ {
+ for (i = 0; i < nelt; i += 4)
+ {
+ nd.perm[i + 0] = d->perm[i + 0] & mask;
+ nd.perm[i + 1] = d->perm[i + 1] & mask;
+ nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
+ nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
+ }
+
+ if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
+ d->testing_p))
+ return true;
+ }
+ }
+
+ /* Try movss/movsd instructions. */
+ if (expand_vec_perm_movs (d))
+ return true;
+
+ /* Finally, try the fully general two operand permute. */
+ if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
+ d->testing_p))
+ return true;
+
+ /* Recognize interleave style patterns with reversed operands. */
+ if (!d->one_operand_p)
+ {
+ for (i = 0; i < nelt; ++i)
+ {
+ unsigned e = d->perm[i];
+ if (e >= nelt)
+ e -= nelt;
+ else
+ e += nelt;
+ nd.perm[i] = e;
+ }
+
+ if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
+ d->testing_p))
+ return true;
+ }
+
+ /* Try the SSE4.1 blend variable merge instructions. */
+ if (expand_vec_perm_blend (d))
+ return true;
+
+ /* Try one of the AVX vpermil variable permutations. */
+ if (expand_vec_perm_vpermil (d))
+ return true;
+
+ /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
+ vpshufb, vpermd, vpermps or vpermq variable permutation. */
+ if (expand_vec_perm_pshufb (d))
+ return true;
+
+ /* Try the AVX2 vpalignr instruction. */
+ if (expand_vec_perm_palignr (d, true))
+ return true;
+
+ /* Try the AVX512F vperm{s,d} instructions. */
+ if (ix86_expand_vec_one_operand_perm_avx512 (d))
+ return true;
+
+ /* Try the AVX512F vpermt2/vpermi2 instructions. */
+ if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
+ return true;
+
+ /* See if we can get the same permutation in a different vector integer
+ mode. */
+ if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
+ {
+ if (!d->testing_p)
+ emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
+ return true;
+ }
+ return false;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+ in terms of a pair of pshuflw + pshufhw instructions. */
+
+static bool
+expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
+{
+ unsigned char perm2[MAX_VECT_LEN];
+ unsigned i;
+ bool ok;
+
+ if (d->vmode != V8HImode || !d->one_operand_p)
+ return false;
+
+ /* The two permutations only operate in 64-bit lanes. */
+ for (i = 0; i < 4; ++i)
+ if (d->perm[i] >= 4)
+ return false;
+ for (i = 4; i < 8; ++i)
+ if (d->perm[i] < 4)
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ /* Emit the pshuflw. */
+ memcpy (perm2, d->perm, 4);
+ for (i = 4; i < 8; ++i)
+ perm2[i] = i;
+ ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
+ gcc_assert (ok);
+
+ /* Emit the pshufhw. */
+ memcpy (perm2 + 4, d->perm + 4, 4);
+ for (i = 0; i < 4; ++i)
+ perm2[i] = i;
+ ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
+ gcc_assert (ok);
+
+ return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
+ the permutation using the SSSE3 palignr instruction. This succeeds
+ when all of the elements in PERM fit within one vector and we merely
+ need to shift them down so that a single vector permutation has a
+ chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only
+ the vpalignr instruction itself can perform the requested permutation. */
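+/* Editorial example: the V16QImode two-operand permutation
+   {3, 4, ..., 18} selects 16 consecutive bytes of the 32-byte
+   concatenation, so a single palignr with a 3-byte shift produces the
+   result directly (the in_order case below); other permutations in
+   range still need a one-operand shuffle after the shift. */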
+
+static bool
+expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
+{
+ unsigned i, nelt = d->nelt;
+ unsigned min, max, minswap, maxswap;
+ bool in_order, ok, swap = false;
+ rtx shift, target;
+ struct expand_vec_perm_d dcopy;
+
+ /* Even with AVX, palignr only operates on 128-bit vectors;
+ with AVX2, vpalignr operates within each 128-bit lane separately. */
+ if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
+ && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
+ return false;
+
+ min = 2 * nelt;
+ max = 0;
+ minswap = 2 * nelt;
+ maxswap = 0;
+ for (i = 0; i < nelt; ++i)
+ {
+ unsigned e = d->perm[i];
+ unsigned eswap = d->perm[i] ^ nelt;
+ if (GET_MODE_SIZE (d->vmode) == 32)
+ {
+ e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
+ eswap = e ^ (nelt / 2);
+ }
+ if (e < min)
+ min = e;
+ if (e > max)
+ max = e;
+ if (eswap < minswap)
+ minswap = eswap;
+ if (eswap > maxswap)
+ maxswap = eswap;
+ }
+ if (min == 0
+ || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
+ {
+ if (d->one_operand_p
+ || minswap == 0
+ || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
+ ? nelt / 2 : nelt))
+ return false;
+ swap = true;
+ min = minswap;
+ max = maxswap;
+ }
+
+ /* Given that we have SSSE3, we know we'll be able to implement the
+ single operand permutation after the palignr with pshufb for
+ 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
+ first. */
+ if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
+ return true;
+
+ dcopy = *d;
+ if (swap)
+ {
+ dcopy.op0 = d->op1;
+ dcopy.op1 = d->op0;
+ for (i = 0; i < nelt; ++i)
+ dcopy.perm[i] ^= nelt;
+ }
+
+ in_order = true;
+ for (i = 0; i < nelt; ++i)
+ {
+ unsigned e = dcopy.perm[i];
+ if (GET_MODE_SIZE (d->vmode) == 32
+ && e >= nelt
+ && (e & (nelt / 2 - 1)) < min)
+ e = e - min - (nelt / 2);
+ else
+ e = e - min;
+ if (e != i)
+ in_order = false;
+ dcopy.perm[i] = e;
+ }
+ dcopy.one_operand_p = true;
+
+ if (single_insn_only_p && !in_order)
+ return false;
+
+ /* For AVX2, test whether we can permute the result in one instruction. */
+ if (d->testing_p)
+ {
+ if (in_order)
+ return true;
+ dcopy.op1 = dcopy.op0;
+ return expand_vec_perm_1 (&dcopy);
+ }
+
+ shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
+ if (GET_MODE_SIZE (d->vmode) == 16)
+ {
+ target = gen_reg_rtx (TImode);
+ emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
+ gen_lowpart (TImode, dcopy.op0), shift));
+ }
+ else
+ {
+ target = gen_reg_rtx (V2TImode);
+ emit_insn (gen_avx2_palignrv2ti (target,
+ gen_lowpart (V2TImode, dcopy.op1),
+ gen_lowpart (V2TImode, dcopy.op0),
+ shift));
+ }
+
+ dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
+
+ /* Test for the degenerate case where the alignment by itself
+ produces the desired permutation. */
+ if (in_order)
+ {
+ emit_move_insn (d->target, dcopy.op0);
+ return true;
+ }
+
+ ok = expand_vec_perm_1 (&dcopy);
+ gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
+
+ return ok;
+}
+
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
+ the permutation using the SSE4_1 pblendv instruction. Potentially
+ reduces the permutation from 2 pshufbs and an or to 1 pshufb and a pblendv. */
+
+static bool
+expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
+{
+ unsigned i, which, nelt = d->nelt;
+ struct expand_vec_perm_d dcopy, dcopy1;
+ machine_mode vmode = d->vmode;
+ bool ok;
+
+ /* Use the same checks as in expand_vec_perm_blend. */
+ if (d->one_operand_p)
+ return false;
+ if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
+ ;
+ else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
+ ;
+ else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
+ ;
+ else
+ return false;
+
+ /* Figure out which permutation elements do not stay in their
+ respective lanes. */
+ for (i = 0, which = 0; i < nelt; ++i)
+ {
+ unsigned e = d->perm[i];
+ if (e != i)
+ which |= (e < nelt ? 1 : 2);
+ }
+ /* We can pblend the elements that do not stay in their respective
+ lanes only when they all come from the same operand, i.e. all from
+ one half of the permutation index range.
+ {0 1 8 3 4 5 9 7} is ok: 8 and 9 are not in their respective lanes,
+ but both are >= 8.
+ {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are not in their respective
+ lanes, and 8 >= 8 while 2 is not. */
+ if (which != 1 && which != 2)
+ return false;
+ if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
+ return true;
+
+ /* First we apply one operand permutation to the part where
+ elements stay not in their respective lanes. */
+ dcopy = *d;
+ if (which == 2)
+ dcopy.op0 = dcopy.op1 = d->op1;
+ else
+ dcopy.op0 = dcopy.op1 = d->op0;
+ if (!d->testing_p)
+ dcopy.target = gen_reg_rtx (vmode);
+ dcopy.one_operand_p = true;
+
+ for (i = 0; i < nelt; ++i)
+ dcopy.perm[i] = d->perm[i] & (nelt - 1);
+
+ ok = expand_vec_perm_1 (&dcopy);
+ if (GET_MODE_SIZE (vmode) != 16 && !ok)
+ return false;
+ else
+ gcc_assert (ok);
+ if (d->testing_p)
+ return true;
+
+ /* Next we put permuted elements into their positions. */
+ dcopy1 = *d;
+ if (which == 2)
+ dcopy1.op1 = dcopy.target;
+ else
+ dcopy1.op0 = dcopy.target;
+
+ for (i = 0; i < nelt; ++i)
+ dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
+
+ ok = expand_vec_perm_blend (&dcopy1);
+ gcc_assert (ok);
+
+ return true;
+}
+
+static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
+
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
+ a two vector permutation into a single vector permutation by using
+ an interleave operation to merge the vectors. */
+
+static bool
+expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
+{
+ struct expand_vec_perm_d dremap, dfinal;
+ unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
+ unsigned HOST_WIDE_INT contents;
+ unsigned char remap[2 * MAX_VECT_LEN];
+ rtx_insn *seq;
+ bool ok, same_halves = false;
+
+ if (GET_MODE_SIZE (d->vmode) == 16)
+ {
+ if (d->one_operand_p)
+ return false;
+ }
+ else if (GET_MODE_SIZE (d->vmode) == 32)
+ {
+ if (!TARGET_AVX)
+ return false;
+ /* For 32-byte modes allow even d->one_operand_p.
+ The lack of cross-lane shuffling in some instructions
+ might prevent a single insn shuffle. */
+ dfinal = *d;
+ dfinal.testing_p = true;
+ /* If expand_vec_perm_interleave3 can expand this into
+ a 3 insn sequence, give up and let it be expanded that
+ way instead. While that is one insn longer, it doesn't
+ need a memory operand, and in the common case where the
+ interleave low and interleave high permutations with the
+ same operands are adjacent, the pair needs only 4 insns
+ after CSE. */
+ if (expand_vec_perm_interleave3 (&dfinal))
+ return false;
+ }
+ else
+ return false;
+
+ /* Examine from whence the elements come. */
+ contents = 0;
+ for (i = 0; i < nelt; ++i)
+ contents |= HOST_WIDE_INT_1U << d->perm[i];
+
+ memset (remap, 0xff, sizeof (remap));
+ dremap = *d;
+
+ if (GET_MODE_SIZE (d->vmode) == 16)
+ {
+ unsigned HOST_WIDE_INT h1, h2, h3, h4;
+
+ /* Split the two input vectors into 4 halves. */
+ h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
+ h2 = h1 << nelt2;
+ h3 = h2 << nelt2;
+ h4 = h3 << nelt2;
+
+ /* If the elements all come from the low halves, use interleave low;
+ similarly for interleave high. If the elements come from mis-matched
+ halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
+ if ((contents & (h1 | h3)) == contents)
+ {
+ /* punpckl* */
+ for (i = 0; i < nelt2; ++i)
+ {
+ remap[i] = i * 2;
+ remap[i + nelt] = i * 2 + 1;
+ dremap.perm[i * 2] = i;
+ dremap.perm[i * 2 + 1] = i + nelt;
+ }
+ if (!TARGET_SSE2 && d->vmode == V4SImode)
+ dremap.vmode = V4SFmode;
+ }
+ else if ((contents & (h2 | h4)) == contents)
+ {
+ /* punpckh* */
+ for (i = 0; i < nelt2; ++i)
+ {
+ remap[i + nelt2] = i * 2;
+ remap[i + nelt + nelt2] = i * 2 + 1;
+ dremap.perm[i * 2] = i + nelt2;
+ dremap.perm[i * 2 + 1] = i + nelt + nelt2;
+ }
+ if (!TARGET_SSE2 && d->vmode == V4SImode)
+ dremap.vmode = V4SFmode;
+ }
+ else if ((contents & (h1 | h4)) == contents)
+ {
+ /* shufps */
+ for (i = 0; i < nelt2; ++i)
+ {
+ remap[i] = i;
+ remap[i + nelt + nelt2] = i + nelt2;
+ dremap.perm[i] = i;
+ dremap.perm[i + nelt2] = i + nelt + nelt2;
+ }
+ if (nelt != 4)
+ {
+ /* shufpd */
+ dremap.vmode = V2DImode;
+ dremap.nelt = 2;
+ dremap.perm[0] = 0;
+ dremap.perm[1] = 3;
+ }
+ }
+ else if ((contents & (h2 | h3)) == contents)
+ {
+ /* shufps */
+ for (i = 0; i < nelt2; ++i)
+ {
+ remap[i + nelt2] = i;
+ remap[i + nelt] = i + nelt2;
+ dremap.perm[i] = i + nelt2;
+ dremap.perm[i + nelt2] = i + nelt;
+ }
+ if (nelt != 4)
+ {
+ /* shufpd */
+ dremap.vmode = V2DImode;
+ dremap.nelt = 2;
+ dremap.perm[0] = 1;
+ dremap.perm[1] = 2;
+ }
+ }
+ else
+ return false;
+ }
+ else
+ {
+ unsigned int nelt4 = nelt / 4, nzcnt = 0;
+ unsigned HOST_WIDE_INT q[8];
+ unsigned int nonzero_halves[4];
+
+ /* Split the two input vectors into 8 quarters. */
+ q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
+ for (i = 1; i < 8; ++i)
+ q[i] = q[0] << (nelt4 * i);
+ for (i = 0; i < 4; ++i)
+ if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
+ {
+ nonzero_halves[nzcnt] = i;
+ ++nzcnt;
+ }
+
+ if (nzcnt == 1)
+ {
+ gcc_assert (d->one_operand_p);
+ nonzero_halves[1] = nonzero_halves[0];
+ same_halves = true;
+ }
+ else if (d->one_operand_p)
+ {
+ gcc_assert (nonzero_halves[0] == 0);
+ gcc_assert (nonzero_halves[1] == 1);
+ }
+
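+ /* If at most two of the four 16-byte halves contribute elements,
+ a single vperm2f128/vperm2i128 can gather them into one vector. */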
+ if (nzcnt <= 2)
+ {
+ if (d->perm[0] / nelt2 == nonzero_halves[1])
+ {
+ /* Attempt to increase the likelihood that dfinal
+ shuffle will be intra-lane. */
+ std::swap (nonzero_halves[0], nonzero_halves[1]);
+ }
+
+ /* vperm2f128 or vperm2i128. */
+ for (i = 0; i < nelt2; ++i)
+ {
+ remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
+ remap[i + nonzero_halves[0] * nelt2] = i;
+ dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
+ dremap.perm[i] = i + nonzero_halves[0] * nelt2;
+ }
+
+ if (d->vmode != V8SFmode
+ && d->vmode != V4DFmode
+ && d->vmode != V8SImode)
+ {
+ dremap.vmode = V8SImode;
+ dremap.nelt = 8;
+ for (i = 0; i < 4; ++i)
+ {
+ dremap.perm[i] = i + nonzero_halves[0] * 4;
+ dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
+ }
+ }
+ }
+ else if (d->one_operand_p)
+ return false;
+ else if (TARGET_AVX2
+ && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
+ {
+ /* vpunpckl* */
+ for (i = 0; i < nelt4; ++i)
+ {
+ remap[i] = i * 2;
+ remap[i + nelt] = i * 2 + 1;
+ remap[i + nelt2] = i * 2 + nelt2;
+ remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
+ dremap.perm[i * 2] = i;
+ dremap.perm[i * 2 + 1] = i + nelt;
+ dremap.perm[i * 2 + nelt2] = i + nelt2;
+ dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
+ }
+ }
+ else if (TARGET_AVX2
+ && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
+ {
+ /* vpunpckh* */
+ for (i = 0; i < nelt4; ++i)
+ {
+ remap[i + nelt4] = i * 2;
+ remap[i + nelt + nelt4] = i * 2 + 1;
+ remap[i + nelt2 + nelt4] = i * 2 + nelt2;
+ remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
+ dremap.perm[i * 2] = i + nelt4;
+ dremap.perm[i * 2 + 1] = i + nelt + nelt4;
+ dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
+ dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
+ }
+ }
+ else
+ return false;
+ }
+
+ /* Use the remapping array set up above to move the elements from their
+ swizzled locations into their final destinations. */
+ dfinal = *d;
+ for (i = 0; i < nelt; ++i)
+ {
+ unsigned e = remap[d->perm[i]];
+ gcc_assert (e < nelt);
+ /* If same_halves is true, both halves of the remapped vector are the
+ same. Avoid cross-lane accesses if possible. */
+ if (same_halves && i >= nelt2)
+ {
+ gcc_assert (e < nelt2);
+ dfinal.perm[i] = e + nelt2;
+ }
+ else
+ dfinal.perm[i] = e;
+ }
+ if (!d->testing_p)
+ {
+ dremap.target = gen_reg_rtx (dremap.vmode);
+ dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
+ }
+ dfinal.op1 = dfinal.op0;
+ dfinal.one_operand_p = true;
+
+ /* Test if the final remap can be done with a single insn. For V4SFmode or
+ V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
+ start_sequence ();
+ ok = expand_vec_perm_1 (&dfinal);
+ seq = get_insns ();
+ end_sequence ();
+
+ if (!ok)
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ if (dremap.vmode != dfinal.vmode)
+ {
+ dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
+ dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
+ }
+
+ ok = expand_vec_perm_1 (&dremap);
+ gcc_assert (ok);
+
+ emit_insn (seq);
+ return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
+ a single vector cross-lane permutation into vpermq followed
+ by any of the single insn permutations. */
+
+static bool
+expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
+{
+ struct expand_vec_perm_d dremap, dfinal;
+ unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
+ unsigned contents[2];
+ bool ok;
+
+ if (!(TARGET_AVX2
+ && (d->vmode == V32QImode || d->vmode == V16HImode)
+ && d->one_operand_p))
+ return false;
+
+ contents[0] = 0;
+ contents[1] = 0;
+ for (i = 0; i < nelt2; ++i)
+ {
+ contents[0] |= 1u << (d->perm[i] / nelt4);
+ contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
+ }
+
+ for (i = 0; i < 2; ++i)
+ {
+ unsigned int cnt = 0;
+ for (j = 0; j < 4; ++j)
+ if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
+ return false;
+ }
+
+ if (d->testing_p)
+ return true;
+
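+ /* dremap gathers, with a single vpermq, the (at most two) 64-bit
+ chunks referenced by each half of the permutation; dfinal then
+ finishes the job with an intra-lane shuffle. */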
+ dremap = *d;
+ dremap.vmode = V4DImode;
+ dremap.nelt = 4;
+ dremap.target = gen_reg_rtx (V4DImode);
+ dremap.op0 = gen_lowpart (V4DImode, d->op0);
+ dremap.op1 = dremap.op0;
+ dremap.one_operand_p = true;
+ for (i = 0; i < 2; ++i)
+ {
+ unsigned int cnt = 0;
+ for (j = 0; j < 4; ++j)
+ if ((contents[i] & (1u << j)) != 0)
+ dremap.perm[2 * i + cnt++] = j;
+ for (; cnt < 2; ++cnt)
+ dremap.perm[2 * i + cnt] = 0;
+ }
+
+ dfinal = *d;
+ dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
+ dfinal.op1 = dfinal.op0;
+ dfinal.one_operand_p = true;
+ for (i = 0, j = 0; i < nelt; ++i)
+ {
+ if (i == nelt2)
+ j = 2;
+ dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
+ if ((d->perm[i] / nelt4) == dremap.perm[j])
+ ;
+ else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
+ dfinal.perm[i] |= nelt4;
+ else
+ gcc_unreachable ();
+ }
+
+ ok = expand_vec_perm_1 (&dremap);
+ gcc_assert (ok);
+
+ ok = expand_vec_perm_1 (&dfinal);
+ gcc_assert (ok);
+
+ return true;
+}
+
+static bool canonicalize_perm (struct expand_vec_perm_d *d);
+
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
+ a vector permutation using two instructions, vperm2f128 resp.
+ vperm2i128 followed by any single in-lane permutation. */
+
+static bool
+expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
+{
+ struct expand_vec_perm_d dfirst, dsecond;
+ unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
+ bool ok;
+
+ if (!TARGET_AVX
+ || GET_MODE_SIZE (d->vmode) != 32
+ || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
+ return false;
+
+ dsecond = *d;
+ dsecond.one_operand_p = false;
+ dsecond.testing_p = true;
+
+ /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
+ immediate. For perm < 16 the second permutation uses
+ d->op0 as first operand, for perm >= 16 it uses d->op1
+ as first operand. The second operand is the result of
+ vperm2[fi]128. */
+ for (perm = 0; perm < 32; perm++)
+ {
+ /* Ignore permutations which do not move anything cross-lane. */
+ if (perm < 16)
+ {
+ /* The second shuffle for e.g. V4DFmode has
+ 0123 and ABCD operands.
+ Ignore AB23, as 23 is already in the second lane
+ of the first operand. */
+ if ((perm & 0xc) == (1 << 2)) continue;
+ /* And 01CD, as 01 is in the first lane of the first
+ operand. */
+ if ((perm & 3) == 0) continue;
+ /* And 4567, as then the vperm2[fi]128 doesn't change
+ anything on the original 4567 second operand. */
+ if ((perm & 0xf) == ((3 << 2) | 2)) continue;
+ }
+ else
+ {
+ /* The second shuffle for e.g. V4DFmode has
+ 4567 and ABCD operands.
+ Ignore AB67, as 67 is already in the second lane
+ of the first operand. */
+ if ((perm & 0xc) == (3 << 2)) continue;
+ /* And 45CD, as 45 is in the first lane of the first
+ operand. */
+ if ((perm & 3) == 2) continue;
+ /* And 0123, as then the vperm2[fi]128 doesn't change
+ anything on the original 0123 first operand. */
+ if ((perm & 0xf) == (1 << 2)) continue;
+ }
+
+ for (i = 0; i < nelt; i++)
+ {
+ j = d->perm[i] / nelt2;
+ if (j == ((perm >> (2 * (i >= nelt2))) & 3))
+ dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
+ else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
+ dsecond.perm[i] = d->perm[i] & (nelt - 1);
+ else
+ break;
+ }
+
+ if (i == nelt)
+ {
+ start_sequence ();
+ ok = expand_vec_perm_1 (&dsecond);
+ end_sequence ();
+ }
+ else
+ ok = false;
+
+ if (ok)
+ {
+ if (d->testing_p)
+ return true;
+
+ /* Found a usable second shuffle. dfirst will be
+ vperm2f128 on d->op0 and d->op1. */
+ dsecond.testing_p = false;
+ dfirst = *d;
+ dfirst.target = gen_reg_rtx (d->vmode);
+ for (i = 0; i < nelt; i++)
+ dfirst.perm[i] = (i & (nelt2 - 1))
+ + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
+
+ canonicalize_perm (&dfirst);
+ ok = expand_vec_perm_1 (&dfirst);
+ gcc_assert (ok);
+
+ /* And dsecond is some single insn shuffle, taking
+ d->op0 and result of vperm2f128 (if perm < 16) or
+ d->op1 and result of vperm2f128 (otherwise). */
+ if (perm >= 16)
+ dsecond.op0 = dsecond.op1;
+ dsecond.op1 = dfirst.target;
+
+ ok = expand_vec_perm_1 (&dsecond);
+ gcc_assert (ok);
+
+ return true;
+ }
+
+ /* For one operand, the only useful vperm2f128 permutation is 0x01
+ aka lanes swap. */
+ if (d->one_operand_p)
+ return false;
+ }
+
+ return false;
+}
+
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
+ a two vector permutation using 2 intra-lane interleave insns
+ and cross-lane shuffle for 32-byte vectors. */
+
+static bool
+expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
+{
+ unsigned i, nelt;
+ rtx (*gen) (rtx, rtx, rtx);
+
+ if (d->one_operand_p)
+ return false;
+ if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
+ ;
+ else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
+ ;
+ else
+ return false;
+
+ nelt = d->nelt;
+ if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
+ return false;
+ for (i = 0; i < nelt; i += 2)
+ if (d->perm[i] != d->perm[0] + i / 2
+ || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ switch (d->vmode)
+ {
+ case E_V32QImode:
+ if (d->perm[0])
+ gen = gen_vec_interleave_highv32qi;
+ else
+ gen = gen_vec_interleave_lowv32qi;
+ break;
+ case E_V16HImode:
+ if (d->perm[0])
+ gen = gen_vec_interleave_highv16hi;
+ else
+ gen = gen_vec_interleave_lowv16hi;
+ break;
+ case E_V8SImode:
+ if (d->perm[0])
+ gen = gen_vec_interleave_highv8si;
+ else
+ gen = gen_vec_interleave_lowv8si;
+ break;
+ case E_V4DImode:
+ if (d->perm[0])
+ gen = gen_vec_interleave_highv4di;
+ else
+ gen = gen_vec_interleave_lowv4di;
+ break;
+ case E_V8SFmode:
+ if (d->perm[0])
+ gen = gen_vec_interleave_highv8sf;
+ else
+ gen = gen_vec_interleave_lowv8sf;
+ break;
+ case E_V4DFmode:
+ if (d->perm[0])
+ gen = gen_vec_interleave_highv4df;
+ else
+ gen = gen_vec_interleave_lowv4df;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ emit_insn (gen (d->target, d->op0, d->op1));
+ return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
+ a single vector permutation using a single intra-lane vector
+ permutation, vperm2f128 swapping the lanes and vblend* insn blending
+ the non-swapped and swapped vectors together. */
+
+static bool
+expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
+{
+ struct expand_vec_perm_d dfirst, dsecond;
+ unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
+ rtx_insn *seq;
+ bool ok;
+ rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
+
+ if (!TARGET_AVX
+ || TARGET_AVX2
+ || (d->vmode != V8SFmode && d->vmode != V4DFmode)
+ || !d->one_operand_p)
+ return false;
+
+ dfirst = *d;
+ for (i = 0; i < nelt; i++)
+ dfirst.perm[i] = 0xff;
+ for (i = 0, msk = 0; i < nelt; i++)
+ {
+ j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
+ if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
+ return false;
+ dfirst.perm[j] = d->perm[i];
+ if (j != i)
+ msk |= (1 << i);
+ }
+ for (i = 0; i < nelt; i++)
+ if (dfirst.perm[i] == 0xff)
+ dfirst.perm[i] = i;
+
+ if (!d->testing_p)
+ dfirst.target = gen_reg_rtx (dfirst.vmode);
+
+ start_sequence ();
+ ok = expand_vec_perm_1 (&dfirst);
+ seq = get_insns ();
+ end_sequence ();
+
+ if (!ok)
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ emit_insn (seq);
+
+ dsecond = *d;
+ dsecond.op0 = dfirst.target;
+ dsecond.op1 = dfirst.target;
+ dsecond.one_operand_p = true;
+ dsecond.target = gen_reg_rtx (dsecond.vmode);
+ for (i = 0; i < nelt; i++)
+ dsecond.perm[i] = i ^ nelt2;
+
+ ok = expand_vec_perm_1 (&dsecond);
+ gcc_assert (ok);
+
+ blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
+ emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
+ return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
+ permutation using two vperm2f128, followed by a vshufpd insn blending
+ the two vectors together. */
+
+static bool
+expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
+{
+ struct expand_vec_perm_d dfirst, dsecond, dthird;
+ bool ok;
+
+ if (!TARGET_AVX || (d->vmode != V4DFmode))
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ dfirst = *d;
+ dsecond = *d;
+ dthird = *d;
+
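+ /* dfirst gathers the lanes that supply result elements 0 and 2,
+ dsecond the lanes that supply elements 1 and 3; dthird is the final
+ vshufpd picking the required element from each 128-bit lane of the
+ two intermediates. */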
+ dfirst.perm[0] = (d->perm[0] & ~1);
+ dfirst.perm[1] = (d->perm[0] & ~1) + 1;
+ dfirst.perm[2] = (d->perm[2] & ~1);
+ dfirst.perm[3] = (d->perm[2] & ~1) + 1;
+ dsecond.perm[0] = (d->perm[1] & ~1);
+ dsecond.perm[1] = (d->perm[1] & ~1) + 1;
+ dsecond.perm[2] = (d->perm[3] & ~1);
+ dsecond.perm[3] = (d->perm[3] & ~1) + 1;
+ dthird.perm[0] = (d->perm[0] % 2);
+ dthird.perm[1] = (d->perm[1] % 2) + 4;
+ dthird.perm[2] = (d->perm[2] % 2) + 2;
+ dthird.perm[3] = (d->perm[3] % 2) + 6;
+
+ dfirst.target = gen_reg_rtx (dfirst.vmode);
+ dsecond.target = gen_reg_rtx (dsecond.vmode);
+ dthird.op0 = dfirst.target;
+ dthird.op1 = dsecond.target;
+ dthird.one_operand_p = false;
+
+ canonicalize_perm (&dfirst);
+ canonicalize_perm (&dsecond);
+
+ ok = expand_vec_perm_1 (&dfirst)
+ && expand_vec_perm_1 (&dsecond)
+ && expand_vec_perm_1 (&dthird);
+
+ gcc_assert (ok);
+
+ return true;
+}
+
+/* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
+ permutation with two pshufb insns and an ior. We should have already
+ failed all two instruction sequences. */
+
+static bool
+expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
+{
+ rtx rperm[2][16], vperm, l, h, op, m128;
+ unsigned int i, nelt, eltsz;
+
+ if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
+ return false;
+ gcc_assert (!d->one_operand_p);
+
+ if (d->testing_p)
+ return true;
+
+ nelt = d->nelt;
+ eltsz = GET_MODE_UNIT_SIZE (d->vmode);
+
+ /* Generate two permutation masks. If the required element is within
+ the given vector it is shuffled into the proper lane. If the required
+ element is in the other vector, force a zero into the lane by setting
+ bit 7 in the permutation mask. */
+ m128 = GEN_INT (-128);
+ for (i = 0; i < nelt; ++i)
+ {
+ unsigned j, e = d->perm[i];
+ unsigned which = (e >= nelt);
+ if (e >= nelt)
+ e -= nelt;
+
+ for (j = 0; j < eltsz; ++j)
+ {
+ rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
+ rperm[1-which][i*eltsz + j] = m128;
+ }
+ }
+
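+ /* Shuffle each operand with its mask and ior the results; the lanes
+ zeroed in one result are exactly the lanes filled in the other. */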
+ vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
+ vperm = force_reg (V16QImode, vperm);
+
+ l = gen_reg_rtx (V16QImode);
+ op = gen_lowpart (V16QImode, d->op0);
+ emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
+
+ vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
+ vperm = force_reg (V16QImode, vperm);
+
+ h = gen_reg_rtx (V16QImode);
+ op = gen_lowpart (V16QImode, d->op1);
+ emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
+
+ op = d->target;
+ if (d->vmode != V16QImode)
+ op = gen_reg_rtx (V16QImode);
+ emit_insn (gen_iorv16qi3 (op, l, h));
+ if (op != d->target)
+ emit_move_insn (d->target, gen_lowpart (d->vmode, op));
+
+ return true;
+}
+
+/* Implement arbitrary permutation of one V32QImode or V16HImode operand
+ with two vpshufb insns, vpermq and vpor. We should have already failed
+ all two or three instruction sequences. */
+
+static bool
+expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
+{
+ rtx rperm[2][32], vperm, l, h, hp, op, m128;
+ unsigned int i, nelt, eltsz;
+
+ if (!TARGET_AVX2
+ || !d->one_operand_p
+ || (d->vmode != V32QImode && d->vmode != V16HImode))
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ nelt = d->nelt;
+ eltsz = GET_MODE_UNIT_SIZE (d->vmode);
+
+ /* Generate two permutation masks. If the required element is within
+ the same lane, it is shuffled in. If the required element is from
+ the other lane, force a zero by setting bit 7 in the permutation
+ mask. The other mask has non-negative elements for the elements
+ requested from the other lane, but also moves them to the other
+ lane, so that the two V2TImode halves of the vpshufb result can
+ simply be swapped. */
+ m128 = GEN_INT (-128);
+ for (i = 0; i < nelt; ++i)
+ {
+ unsigned j, e = d->perm[i] & (nelt / 2 - 1);
+ unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
+
+ for (j = 0; j < eltsz; ++j)
+ {
+ rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
+ rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
+ }
+ }
+
+ vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
+ vperm = force_reg (V32QImode, vperm);
+
+ h = gen_reg_rtx (V32QImode);
+ op = gen_lowpart (V32QImode, d->op0);
+ emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
+
+ /* Swap the 128-bit lanes of h into hp. */
+ hp = gen_reg_rtx (V4DImode);
+ op = gen_lowpart (V4DImode, h);
+ emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
+ const1_rtx));
+
+ vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
+ vperm = force_reg (V32QImode, vperm);
+
+ l = gen_reg_rtx (V32QImode);
+ op = gen_lowpart (V32QImode, d->op0);
+ emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
+
+ op = d->target;
+ if (d->vmode != V32QImode)
+ op = gen_reg_rtx (V32QImode);
+ emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
+ if (op != d->target)
+ emit_move_insn (d->target, gen_lowpart (d->vmode, op));
+
+ return true;
+}
+
+/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
+ and extract-odd permutations of two V32QImode or V16HImode operands
+ with two vpshufb insns, vpor and vpermq. We should have already
+ failed all two or three instruction sequences. */
+
+static bool
+expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
+{
+ rtx rperm[2][32], vperm, l, h, ior, op, m128;
+ unsigned int i, nelt, eltsz;
+
+ if (!TARGET_AVX2
+ || d->one_operand_p
+ || (d->vmode != V32QImode && d->vmode != V16HImode))
+ return false;
+
+ for (i = 0; i < d->nelt; ++i)
+ if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ nelt = d->nelt;
+ eltsz = GET_MODE_UNIT_SIZE (d->vmode);
+
+ /* Generate two permutation masks. In the first mask the first
+ quarter contains indexes for the first half of op0, the second
+ quarter has bit 7 set, the third quarter contains indexes for the
+ second half of op0 and the last quarter has bit 7 set. In the
+ second mask the first quarter has bit 7 set, the second quarter
+ contains indexes for the first half of op1, the third quarter has
+ bit 7 set and the last quarter contains indexes for the second
+ half of op1.
+ I.e. the first mask e.g. for V32QImode extract even will be:
+ 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
+ (all values masked with 0xf except for -128) and second mask
+ for extract even will be
+ -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
+ m128 = GEN_INT (-128);
+ for (i = 0; i < nelt; ++i)
+ {
+ unsigned j, e = d->perm[i] & (nelt / 2 - 1);
+ unsigned which = d->perm[i] >= nelt;
+ unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
+
+ for (j = 0; j < eltsz; ++j)
+ {
+ rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
+ rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
+ }
+ }
+
+ vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
+ vperm = force_reg (V32QImode, vperm);
+
+ l = gen_reg_rtx (V32QImode);
+ op = gen_lowpart (V32QImode, d->op0);
+ emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
+
+ vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
+ vperm = force_reg (V32QImode, vperm);
+
+ h = gen_reg_rtx (V32QImode);
+ op = gen_lowpart (V32QImode, d->op1);
+ emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
+
+ ior = gen_reg_rtx (V32QImode);
+ emit_insn (gen_iorv32qi3 (ior, l, h));
+
+ /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
+ op = gen_reg_rtx (V4DImode);
+ ior = gen_lowpart (V4DImode, ior);
+ emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
+ const1_rtx, GEN_INT (3)));
+ emit_move_insn (d->target, gen_lowpart (d->vmode, op));
+
+ return true;
+}
+
+/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
+ and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
+ with two "and" and "pack" or two "shift" and "pack" insns. We should
+ have already failed all two instruction sequences. */
+
+static bool
+expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
+{
+ rtx op, dop0, dop1, t;
+ unsigned i, odd, c, s, nelt = d->nelt;
+ bool end_perm = false;
+ machine_mode half_mode;
+ rtx (*gen_and) (rtx, rtx, rtx);
+ rtx (*gen_pack) (rtx, rtx, rtx);
+ rtx (*gen_shift) (rtx, rtx, rtx);
+
+ if (d->one_operand_p)
+ return false;
+
+ switch (d->vmode)
+ {
+ case E_V8HImode:
+ /* Required for "pack". */
+ if (!TARGET_SSE4_1)
+ return false;
+ c = 0xffff;
+ s = 16;
+ half_mode = V4SImode;
+ gen_and = gen_andv4si3;
+ gen_pack = gen_sse4_1_packusdw;
+ gen_shift = gen_lshrv4si3;
+ break;
+ case E_V16QImode:
+ /* No check as all instructions are SSE2. */
+ c = 0xff;
+ s = 8;
+ half_mode = V8HImode;
+ gen_and = gen_andv8hi3;
+ gen_pack = gen_sse2_packuswb;
+ gen_shift = gen_lshrv8hi3;
+ break;
+ case E_V16HImode:
+ if (!TARGET_AVX2)
+ return false;
+ c = 0xffff;
+ s = 16;
+ half_mode = V8SImode;
+ gen_and = gen_andv8si3;
+ gen_pack = gen_avx2_packusdw;
+ gen_shift = gen_lshrv8si3;
+ end_perm = true;
+ break;
+ case E_V32QImode:
+ if (!TARGET_AVX2)
+ return false;
+ c = 0xff;
+ s = 8;
+ half_mode = V16HImode;
+ gen_and = gen_andv16hi3;
+ gen_pack = gen_avx2_packuswb;
+ gen_shift = gen_lshrv16hi3;
+ end_perm = true;
+ break;
+ default:
+ /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
+ general shuffles. */
+ return false;
+ }
+
+ /* Check that permutation is even or odd. */
+ odd = d->perm[0];
+ if (odd > 1)
+ return false;
+
+ for (i = 1; i < nelt; ++i)
+ if (d->perm[i] != 2 * i + odd)
+ return false;
+
+ if (d->testing_p)
+ return true;
+
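+ /* E.g. for V16QImode extract-even, mask each 16-bit element of both
+ operands with 0x00ff and packuswb the results; for extract-odd,
+ shift each 16-bit element right by 8 instead of masking. */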
+ dop0 = gen_reg_rtx (half_mode);
+ dop1 = gen_reg_rtx (half_mode);
+ if (odd == 0)
+ {
+ t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
+ t = force_reg (half_mode, t);
+ emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
+ emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
+ }
+ else
+ {
+ emit_insn (gen_shift (dop0,
+ gen_lowpart (half_mode, d->op0),
+ GEN_INT (s)));
+ emit_insn (gen_shift (dop1,
+ gen_lowpart (half_mode, d->op1),
+ GEN_INT (s)));
+ }
+ /* For the AVX2 256-bit case we need to permute the pack result. */
+ if (TARGET_AVX2 && end_perm)
+ {
+ op = gen_reg_rtx (d->vmode);
+ t = gen_reg_rtx (V4DImode);
+ emit_insn (gen_pack (op, dop0, dop1));
+ emit_insn (gen_avx2_permv4di_1 (t,
+ gen_lowpart (V4DImode, op),
+ const0_rtx,
+ const2_rtx,
+ const1_rtx,
+ GEN_INT (3)));
+ emit_move_insn (d->target, gen_lowpart (d->vmode, t));
+ }
+ else
+ emit_insn (gen_pack (d->target, dop0, dop1));
+
+ return true;
+}
+
+/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
+ and extract-odd permutations of two V64QI operands
+ with two "shifts", two "truncs" and one "concat" insns for "odd"
+ and two "truncs" and one concat insn for "even."
+ Have already failed all two instruction sequences. */
+
+static bool
+expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
+{
+ rtx t1, t2, t3, t4;
+ unsigned i, odd, nelt = d->nelt;
+
+ if (!TARGET_AVX512BW
+ || d->one_operand_p
+ || d->vmode != V64QImode)
+ return false;
+
+ /* Check that permutation is even or odd. */
+ odd = d->perm[0];
+ if (odd > 1)
+ return false;
+
+ for (i = 1; i < nelt; ++i)
+ if (d->perm[i] != 2 * i + odd)
+ return false;
+
+ if (d->testing_p)
+ return true;
+
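+ /* For the odd extraction shift the odd bytes down into the low byte
+ of each word first; the V32HI -> V32QI truncations then pick exactly
+ the requested bytes, and the concat forms the full V64QI result. */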
+ if (odd)
+ {
+ t1 = gen_reg_rtx (V32HImode);
+ t2 = gen_reg_rtx (V32HImode);
+ emit_insn (gen_lshrv32hi3 (t1,
+ gen_lowpart (V32HImode, d->op0),
+ GEN_INT (8)));
+ emit_insn (gen_lshrv32hi3 (t2,
+ gen_lowpart (V32HImode, d->op1),
+ GEN_INT (8)));
+ }
+ else
+ {
+ t1 = gen_lowpart (V32HImode, d->op0);
+ t2 = gen_lowpart (V32HImode, d->op1);
+ }
+
+ t3 = gen_reg_rtx (V32QImode);
+ t4 = gen_reg_rtx (V32QImode);
+ emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
+ emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
+ emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
+
+ return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
+ and extract-odd permutations. */
+
+static bool
+expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
+{
+ rtx t1, t2, t3, t4, t5;
+
+ switch (d->vmode)
+ {
+ case E_V4DFmode:
+ if (d->testing_p)
+ break;
+ t1 = gen_reg_rtx (V4DFmode);
+ t2 = gen_reg_rtx (V4DFmode);
+
+ /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
+ emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
+ emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
+
+ /* Now an unpck[lh]pd will produce the result required. */
+ if (odd)
+ t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
+ else
+ t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
+ emit_insn (t3);
+ break;
+
+ case E_V8SFmode:
+ {
+ int mask = odd ? 0xdd : 0x88;
+
+ if (d->testing_p)
+ break;
+ t1 = gen_reg_rtx (V8SFmode);
+ t2 = gen_reg_rtx (V8SFmode);
+ t3 = gen_reg_rtx (V8SFmode);
+
+ /* Shuffle within the 128-bit lanes to produce:
+ { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
+ emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
+ GEN_INT (mask)));
+
+ /* Shuffle the lanes around to produce:
+ { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
+ emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
+ GEN_INT (0x3)));
+
+ /* Shuffle within the 128-bit lanes to produce:
+ { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
+ emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
+
+ /* Shuffle within the 128-bit lanes to produce:
+ { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
+ emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
+
+ /* Shuffle the lanes around to produce:
+ { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
+ emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
+ GEN_INT (0x20)));
+ }
+ break;
+
+ case E_V2DFmode:
+ case E_V4SFmode:
+ case E_V2DImode:
+ case E_V4SImode:
+ /* These are always directly implementable by expand_vec_perm_1. */
+ gcc_unreachable ();
+
+ case E_V8HImode:
+ if (TARGET_SSE4_1)
+ return expand_vec_perm_even_odd_pack (d);
+ else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
+ return expand_vec_perm_pshufb2 (d);
+ else
+ {
+ if (d->testing_p)
+ break;
+ /* We need 2*log2(N)-1 operations to achieve odd/even
+ with interleave. */
+ t1 = gen_reg_rtx (V8HImode);
+ t2 = gen_reg_rtx (V8HImode);
+ emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
+ emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
+ emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
+ emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
+ if (odd)
+ t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
+ else
+ t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
+ emit_insn (t3);
+ }
+ break;
+
+ case E_V16QImode:
+ return expand_vec_perm_even_odd_pack (d);
+
+ case E_V16HImode:
+ case E_V32QImode:
+ return expand_vec_perm_even_odd_pack (d);
+
+ case E_V64QImode:
+ return expand_vec_perm_even_odd_trunc (d);
+
+ case E_V4DImode:
+ if (!TARGET_AVX2)
+ {
+ struct expand_vec_perm_d d_copy = *d;
+ d_copy.vmode = V4DFmode;
+ if (d->testing_p)
+ d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
+ else
+ d_copy.target = gen_reg_rtx (V4DFmode);
+ d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
+ d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
+ if (expand_vec_perm_even_odd_1 (&d_copy, odd))
+ {
+ if (!d->testing_p)
+ emit_move_insn (d->target,
+ gen_lowpart (V4DImode, d_copy.target));
+ return true;
+ }
+ return false;
+ }
+
+ if (d->testing_p)
+ break;
+
+ t1 = gen_reg_rtx (V4DImode);
+ t2 = gen_reg_rtx (V4DImode);
+
+ /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
+ emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
+ emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
+
+ /* Now a vpunpck[lh]qdq will produce the required result. */
+ if (odd)
+ t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
+ else
+ t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
+ emit_insn (t3);
+ break;
+
+ case E_V8SImode:
+ if (!TARGET_AVX2)
+ {
+ struct expand_vec_perm_d d_copy = *d;
+ d_copy.vmode = V8SFmode;
+ if (d->testing_p)
+ d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
+ else
+ d_copy.target = gen_reg_rtx (V8SFmode);
+ d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
+ d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
+ if (expand_vec_perm_even_odd_1 (&d_copy, odd))
+ {
+ if (!d->testing_p)
+ emit_move_insn (d->target,
+ gen_lowpart (V8SImode, d_copy.target));
+ return true;
+ }
+ return false;
+ }
+
+ if (d->testing_p)
+ break;
+
+ t1 = gen_reg_rtx (V8SImode);
+ t2 = gen_reg_rtx (V8SImode);
+ t3 = gen_reg_rtx (V4DImode);
+ t4 = gen_reg_rtx (V4DImode);
+ t5 = gen_reg_rtx (V4DImode);
+
+ /* Shuffle the lanes around into
+ { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
+ emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
+ gen_lowpart (V4DImode, d->op1),
+ GEN_INT (0x20)));
+ emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
+ gen_lowpart (V4DImode, d->op1),
+ GEN_INT (0x31)));
+
+ /* Swap the 2nd and 3rd position in each lane into
+ { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
+ emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
+ GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
+ emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
+ GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
+
+ /* Now a vpunpck[lh]qdq will produce
+ { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
+ if (odd)
+ t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
+ gen_lowpart (V4DImode, t2));
+ else
+ t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
+ gen_lowpart (V4DImode, t2));
+ emit_insn (t3);
+ emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
+ extract-even and extract-odd permutations. */
+
+static bool
+expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
+{
+ unsigned i, odd, nelt = d->nelt;
+
+ odd = d->perm[0];
+ if (odd != 0 && odd != 1)
+ return false;
+
+ for (i = 1; i < nelt; ++i)
+ if (d->perm[i] != 2 * i + odd)
+ return false;
+
+ return expand_vec_perm_even_odd_1 (d, odd);
+}
+
+/* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
+ permutations. We assume that expand_vec_perm_1 has already failed. */
+
+static bool
+expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
+{
+ unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
+ machine_mode vmode = d->vmode;
+ unsigned char perm2[4];
+ rtx op0 = d->op0, dest;
+ bool ok;
+
+ switch (vmode)
+ {
+ case E_V4DFmode:
+ case E_V8SFmode:
+ /* These are special-cased in sse.md so that we can optionally
+ use the vbroadcast instruction. They expand to two insns
+ if the input happens to be in a register. */
+ gcc_unreachable ();
+
+ case E_V2DFmode:
+ case E_V2DImode:
+ case E_V4SFmode:
+ case E_V4SImode:
+ /* These are always implementable using standard shuffle patterns. */
+ gcc_unreachable ();
+
+ case E_V8HImode:
+ case E_V16QImode:
+ /* These can be implemented via interleave. We save one insn by
+ stopping once we have promoted to V4SImode and then use pshufd. */
+ if (d->testing_p)
+ return true;
+ do
+ {
+ rtx dest;
+ rtx (*gen) (rtx, rtx, rtx)
+ = vmode == V16QImode ? gen_vec_interleave_lowv16qi
+ : gen_vec_interleave_lowv8hi;
+
+ if (elt >= nelt2)
+ {
+ gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
+ : gen_vec_interleave_highv8hi;
+ elt -= nelt2;
+ }
+ nelt2 /= 2;
+
+ dest = gen_reg_rtx (vmode);
+ emit_insn (gen (dest, op0, op0));
+ vmode = get_mode_wider_vector (vmode);
+ op0 = gen_lowpart (vmode, dest);
+ }
+ while (vmode != V4SImode);
+
+ memset (perm2, elt, 4);
+ dest = gen_reg_rtx (V4SImode);
+ ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
+ gcc_assert (ok);
+ if (!d->testing_p)
+ emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
+ return true;
+
+ case E_V64QImode:
+ case E_V32QImode:
+ case E_V16HImode:
+ case E_V8SImode:
+ case E_V4DImode:
+ /* For AVX2 broadcasts of the first element vpbroadcast* or
+ vpermq should be used by expand_vec_perm_1. */
+ gcc_assert (!TARGET_AVX2 || d->perm[0]);
+ return false;
+
+ default:
+ gcc_unreachable ();
+ }
+}
+
+/* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
+ broadcast permutations. */
+
+static bool
+expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
+{
+ unsigned i, elt, nelt = d->nelt;
+
+ if (!d->one_operand_p)
+ return false;
+
+ elt = d->perm[0];
+ for (i = 1; i < nelt; ++i)
+ if (d->perm[i] != elt)
+ return false;
+
+ return expand_vec_perm_broadcast_1 (d);
+}
+
+/* Implement arbitrary permutations of two V64QImode operands
+ with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
+static bool
+expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
+{
+ if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ struct expand_vec_perm_d ds[2];
+ rtx rperm[128], vperm, target0, target1;
+ unsigned int i, nelt;
+ machine_mode vmode;
+
+ nelt = d->nelt;
+ vmode = V64QImode;
+
+ for (i = 0; i < 2; i++)
+ {
+ ds[i] = *d;
+ ds[i].vmode = V32HImode;
+ ds[i].nelt = 32;
+ ds[i].target = gen_reg_rtx (V32HImode);
+ ds[i].op0 = gen_lowpart (V32HImode, d->op0);
+ ds[i].op1 = gen_lowpart (V32HImode, d->op1);
+ }
+
+ /* Prepare the two word permutations such that the first one (ds[0])
+ takes care of putting each even byte into its final position or one
+ position higher, and the second one (ds[1]) takes care of putting
+ each odd byte into its final position or one position lower. */
+
+ for (i = 0; i < nelt; i++)
+ {
+ ds[i & 1].perm[i / 2] = d->perm[i] / 2;
+ if (i & 1)
+ {
+ rperm[i] = constm1_rtx;
+ rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
+ }
+ else
+ {
+ rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
+ rperm[i + 64] = constm1_rtx;
+ }
+ }
+
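+ /* Each word permutation is a two-operand V32HImode shuffle, which
+ with AVX512BW is always expandable as a single vperm[it]2w insn,
+ hence the asserts below. */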
+ bool ok = expand_vec_perm_1 (&ds[0]);
+ gcc_assert (ok);
+ ds[0].target = gen_lowpart (V64QImode, ds[0].target);
+
+ ok = expand_vec_perm_1 (&ds[1]);
+ gcc_assert (ok);
+ ds[1].target = gen_lowpart (V64QImode, ds[1].target);
+
+ vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
+ vperm = force_reg (vmode, vperm);
+ target0 = gen_reg_rtx (V64QImode);
+ emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
+
+ vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
+ vperm = force_reg (vmode, vperm);
+ target1 = gen_reg_rtx (V64QImode);
+ emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
+
+ emit_insn (gen_iorv64qi3 (d->target, target0, target1));
+ return true;
+}
+
+/* Implement arbitrary permutation of two V32QImode or V16HImode operands
+ with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
+ all the shorter instruction sequences. */
+
+static bool
+expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
+{
+ rtx rperm[4][32], vperm, l[2], h[2], op, m128;
+ unsigned int i, nelt, eltsz;
+ bool used[4];
+
+ if (!TARGET_AVX2
+ || d->one_operand_p
+ || (d->vmode != V32QImode && d->vmode != V16HImode))
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ nelt = d->nelt;
+ eltsz = GET_MODE_UNIT_SIZE (d->vmode);
+
+ /* Generate 4 permutation masks. If the required element is within
+ the same lane, it is shuffled in. If the required element is from
+ the other lane, force a zero by setting bit 7 in the permutation
+ mask. The other mask has non-negative elements for the elements
+ requested from the other lane, but also moves them to the other
+ lane, so that the two V2TImode halves of the vpshufb result can
+ simply be swapped. */
+ m128 = GEN_INT (-128);
+ for (i = 0; i < 32; ++i)
+ {
+ rperm[0][i] = m128;
+ rperm[1][i] = m128;
+ rperm[2][i] = m128;
+ rperm[3][i] = m128;
+ }
+ used[0] = false;
+ used[1] = false;
+ used[2] = false;
+ used[3] = false;
+ for (i = 0; i < nelt; ++i)
+ {
+ unsigned j, e = d->perm[i] & (nelt / 2 - 1);
+ unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
+ unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
+
+ for (j = 0; j < eltsz; ++j)
+ rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
+ used[which] = true;
+ }
+
+ for (i = 0; i < 2; ++i)
+ {
+ if (!used[2 * i + 1])
+ {
+ h[i] = NULL_RTX;
+ continue;
+ }
+ vperm = gen_rtx_CONST_VECTOR (V32QImode,
+ gen_rtvec_v (32, rperm[2 * i + 1]));
+ vperm = force_reg (V32QImode, vperm);
+ h[i] = gen_reg_rtx (V32QImode);
+ op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
+ emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
+ }
+
+ /* Swap the 128-bit lanes of h[X]. */
+ for (i = 0; i < 2; ++i)
+ {
+ if (h[i] == NULL_RTX)
+ continue;
+ op = gen_reg_rtx (V4DImode);
+ emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
+ const2_rtx, GEN_INT (3), const0_rtx,
+ const1_rtx));
+ h[i] = gen_lowpart (V32QImode, op);
+ }
+
+ for (i = 0; i < 2; ++i)
+ {
+ if (!used[2 * i])
+ {
+ l[i] = NULL_RTX;
+ continue;
+ }
+ vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
+ vperm = force_reg (V32QImode, vperm);
+ l[i] = gen_reg_rtx (V32QImode);
+ op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
+ emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
+ }
+
+ for (i = 0; i < 2; ++i)
+ {
+ if (h[i] && l[i])
+ {
+ op = gen_reg_rtx (V32QImode);
+ emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
+ l[i] = op;
+ }
+ else if (h[i])
+ l[i] = h[i];
+ }
+
+ gcc_assert (l[0] && l[1]);
+ op = d->target;
+ if (d->vmode != V32QImode)
+ op = gen_reg_rtx (V32QImode);
+ emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
+ if (op != d->target)
+ emit_move_insn (d->target, gen_lowpart (d->vmode, op));
+ return true;
+}
+
+/* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
+ taken care of, perform the expansion in D and return true on success. */
+
+static bool
+ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
+{
+ /* Try a single instruction expansion. */
+ if (expand_vec_perm_1 (d))
+ return true;
+
+ /* Try sequences of two instructions. */
+
+ if (expand_vec_perm_pshuflw_pshufhw (d))
+ return true;
+
+ if (expand_vec_perm_palignr (d, false))
+ return true;
+
+ if (expand_vec_perm_interleave2 (d))
+ return true;
+
+ if (expand_vec_perm_broadcast (d))
+ return true;
+
+ if (expand_vec_perm_vpermq_perm_1 (d))
+ return true;
+
+ if (expand_vec_perm_vperm2f128 (d))
+ return true;
+
+ if (expand_vec_perm_pblendv (d))
+ return true;
+
+ /* Try sequences of three instructions. */
+
+ if (expand_vec_perm_even_odd_pack (d))
+ return true;
+
+ if (expand_vec_perm_2vperm2f128_vshuf (d))
+ return true;
+
+ if (expand_vec_perm_pshufb2 (d))
+ return true;
+
+ if (expand_vec_perm_interleave3 (d))
+ return true;
+
+ if (expand_vec_perm_vperm2f128_vblend (d))
+ return true;
+
+ /* Try sequences of four instructions. */
+
+ if (expand_vec_perm_even_odd_trunc (d))
+ return true;
+ if (expand_vec_perm_vpshufb2_vpermq (d))
+ return true;
+
+ if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
+ return true;
+
+ if (expand_vec_perm_vpermt2_vpshub2 (d))
+ return true;
+
+ /* ??? Look for narrow permutations whose element orderings would
+ allow the promotion to a wider mode. */
+
+ /* ??? Look for sequences of interleave or a wider permute that place
+ the data into the correct lanes for a half-vector shuffle like
+ pshuf[lh]w or vpermilps. */
+
+ /* ??? Look for sequences of interleave that produce the desired results.
+ The combinatorics of punpck[lh] get pretty ugly... */
+
+ if (expand_vec_perm_even_odd (d))
+ return true;
+
+ /* Even longer sequences. */
+ if (expand_vec_perm_vpshufb4_vpermq2 (d))
+ return true;
+
+ /* See if we can get the same permutation in different vector integer
+ mode. */
+ struct expand_vec_perm_d nd;
+ if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
+ {
+ if (!d->testing_p)
+ emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
+ return true;
+ }
+
+ return false;
+}
+
+/* If a permutation only uses one operand, make it clear. Returns true
+ if the permutation references both operands. */
+
+static bool
+canonicalize_perm (struct expand_vec_perm_d *d)
+{
+ int i, which, nelt = d->nelt;
+
+ for (i = which = 0; i < nelt; ++i)
+ which |= (d->perm[i] < nelt ? 1 : 2);
+
+ d->one_operand_p = true;
+ switch (which)
+ {
+ default:
+ gcc_unreachable();
+
+ case 3:
+ if (!rtx_equal_p (d->op0, d->op1))
+ {
+ d->one_operand_p = false;
+ break;
+ }
+ /* The elements of PERM do not suggest that only the first operand
+ is used, but both operands are identical. Allow easier matching
+ of the permutation by folding the permutation into the single
+ input vector. */
+ /* FALLTHRU */
+
+ case 2:
+ for (i = 0; i < nelt; ++i)
+ d->perm[i] &= nelt - 1;
+ d->op0 = d->op1;
+ break;
+
+ case 1:
+ d->op1 = d->op0;
+ break;
+ }
+
+ return (which == 3);
+}
+
+/* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
+
+bool
+ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
+ rtx op1, const vec_perm_indices &sel)
+{
+ struct expand_vec_perm_d d;
+ unsigned char perm[MAX_VECT_LEN];
+ unsigned int i, nelt, which;
+ bool two_args;
+
+ d.target = target;
+ d.op0 = op0;
+ d.op1 = op1;
+
+ d.vmode = vmode;
+ gcc_assert (VECTOR_MODE_P (d.vmode));
+ d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+ d.testing_p = !target;
+
+ gcc_assert (sel.length () == nelt);
+ gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
+
+ /* Given sufficient ISA support we can just return true here
+ for selected vector modes. */
+ switch (d.vmode)
+ {
+ case E_V16SFmode:
+ case E_V16SImode:
+ case E_V8DImode:
+ case E_V8DFmode:
+ if (!TARGET_AVX512F)
+ return false;
+ /* All implementable with a single vperm[it]2 insn. */
+ if (d.testing_p)
+ return true;
+ break;
+ case E_V32HImode:
+ if (!TARGET_AVX512BW)
+ return false;
+ if (d.testing_p)
+ /* All implementable with a single vperm[it]2 insn. */
+ return true;
+ break;
+ case E_V64QImode:
+ if (!TARGET_AVX512BW)
+ return false;
+ if (d.testing_p)
+ /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 vpor insn. */
+ return true;
+ break;
+ case E_V8SImode:
+ case E_V8SFmode:
+ case E_V4DFmode:
+ case E_V4DImode:
+ if (!TARGET_AVX)
+ return false;
+ if (d.testing_p && TARGET_AVX512VL)
+ /* All implementable with a single vperm[it]2 insn. */
+ return true;
+ break;
+ case E_V16HImode:
+ if (!TARGET_SSE2)
+ return false;
+ if (d.testing_p && TARGET_AVX2)
+ /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
+ return true;
+ break;
+ case E_V32QImode:
+ if (!TARGET_SSE2)
+ return false;
+ if (d.testing_p && TARGET_AVX2)
+ /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
+ return true;
+ break;
+ case E_V8HImode:
+ case E_V16QImode:
+ if (!TARGET_SSE2)
+ return false;
+ /* Fall through. */
+ case E_V4SImode:
+ case E_V4SFmode:
+ if (!TARGET_SSE)
+ return false;
+ /* All implementable with a single vpperm insn. */
+ if (d.testing_p && TARGET_XOP)
+ return true;
+ /* All implementable with 2 pshufb + 1 ior. */
+ if (d.testing_p && TARGET_SSSE3)
+ return true;
+ break;
+ case E_V2DImode:
+ case E_V2DFmode:
+ if (!TARGET_SSE)
+ return false;
+ /* All implementable with shufpd or unpck[lh]pd. */
+ if (d.testing_p)
+ return true;
+ break;
+ default:
+ return false;
+ }
+
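+ /* Copy the selector into d.perm and a scratch copy, recording
+ whether it references one or both input vectors. */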
+ for (i = which = 0; i < nelt; ++i)
+ {
+ unsigned char e = sel[i];
+ gcc_assert (e < 2 * nelt);
+ d.perm[i] = e;
+ perm[i] = e;
+ which |= (e < nelt ? 1 : 2);
+ }
+
+ if (d.testing_p)
+ {
+ /* If all elements come from the second vector, fold them to the first. */
+ if (which == 2)
+ for (i = 0; i < nelt; ++i)
+ d.perm[i] -= nelt;
+
+ /* Check whether the mask can be applied to the vector type. */
+ d.one_operand_p = (which != 3);
+
+ /* Implementable with shufps or pshufd. */
+ if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
+ return true;
+
+ /* Otherwise we have to go through the motions and see if we can
+ figure out how to generate the requested permutation. */
+ d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
+ d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
+ if (!d.one_operand_p)
+ d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
+
+ start_sequence ();
+ bool ret = ix86_expand_vec_perm_const_1 (&d);
+ end_sequence ();
+
+ return ret;
+ }
+
+ two_args = canonicalize_perm (&d);
+
+ if (ix86_expand_vec_perm_const_1 (&d))
+ return true;
+
+ /* If the selector says both arguments are needed, but the operands are the
+ same, the above tried to expand with one_operand_p and flattened selector.
+ If that didn't work, retry without one_operand_p; we succeeded with that
+ during testing. */
+ if (two_args && d.one_operand_p)
+ {
+ d.one_operand_p = false;
+ memcpy (d.perm, perm, sizeof (perm));
+ return ix86_expand_vec_perm_const_1 (&d);
+ }
+
+ return false;
+}
+
+void
+ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
+{
+ struct expand_vec_perm_d d;
+ unsigned i, nelt;
+
+ d.target = targ;
+ d.op0 = op0;
+ d.op1 = op1;
+ d.vmode = GET_MODE (targ);
+ d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+ d.one_operand_p = false;
+ d.testing_p = false;
+
+ for (i = 0; i < nelt; ++i)
+ d.perm[i] = i * 2 + odd;
+
+ /* We'll either be able to implement the permutation directly... */
+ if (expand_vec_perm_1 (&d))
+ return;
+
+ /* ... or we use the special-case patterns. */
+ expand_vec_perm_even_odd_1 (&d, odd);
+}
+
+static void
+ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
+{
+ struct expand_vec_perm_d d;
+ unsigned i, nelt, base;
+ bool ok;
+
+ d.target = targ;
+ d.op0 = op0;
+ d.op1 = op1;
+ d.vmode = GET_MODE (targ);
+ d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+ d.one_operand_p = false;
+ d.testing_p = false;
+
+ base = high_p ? nelt / 2 : 0;
+ for (i = 0; i < nelt / 2; ++i)
+ {
+ d.perm[i * 2] = i + base;
+ d.perm[i * 2 + 1] = i + base + nelt;
+ }
+
+ /* Note that for AVX this isn't one instruction. */
+ ok = ix86_expand_vec_perm_const_1 (&d);
+ gcc_assert (ok);
+}
+
+
+/* Expand a vector operation CODE for a V*QImode in terms of the
+ same operation on V*HImode. */
+
+void
+ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
+{
+ machine_mode qimode = GET_MODE (dest);
+ machine_mode himode;
+ rtx (*gen_il) (rtx, rtx, rtx);
+ rtx (*gen_ih) (rtx, rtx, rtx);
+ rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
+ struct expand_vec_perm_d d;
+ bool ok, full_interleave;
+ bool uns_p = false;
+ int i;
+
+ switch (qimode)
+ {
+ case E_V16QImode:
+ himode = V8HImode;
+ gen_il = gen_vec_interleave_lowv16qi;
+ gen_ih = gen_vec_interleave_highv16qi;
+ break;
+ case E_V32QImode:
+ himode = V16HImode;
+ gen_il = gen_avx2_interleave_lowv32qi;
+ gen_ih = gen_avx2_interleave_highv32qi;
+ break;
+ case E_V64QImode:
+ himode = V32HImode;
+ gen_il = gen_avx512bw_interleave_lowv64qi;
+ gen_ih = gen_avx512bw_interleave_highv64qi;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ op2_l = op2_h = op2;
+ switch (code)
+ {
+ case MULT:
+ /* Unpack data such that we've got a source byte in each low byte of
+ each word. We don't care what goes into the high byte of each word.
+ Rather than trying to get zero in there, most convenient is to let
+ it be a copy of the low byte. */
+ op2_l = gen_reg_rtx (qimode);
+ op2_h = gen_reg_rtx (qimode);
+ emit_insn (gen_il (op2_l, op2, op2));
+ emit_insn (gen_ih (op2_h, op2, op2));
+
+ op1_l = gen_reg_rtx (qimode);
+ op1_h = gen_reg_rtx (qimode);
+ emit_insn (gen_il (op1_l, op1, op1));
+ emit_insn (gen_ih (op1_h, op1, op1));
+ full_interleave = qimode == V16QImode;
+ break;
+
+ case ASHIFT:
+ case LSHIFTRT:
+ uns_p = true;
+ /* FALLTHRU */
+ case ASHIFTRT:
+ op1_l = gen_reg_rtx (himode);
+ op1_h = gen_reg_rtx (himode);
+ ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
+ ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
+ full_interleave = true;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ /* Perform the operation. */
+ res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
+ 1, OPTAB_DIRECT);
+ res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
+ 1, OPTAB_DIRECT);
+ gcc_assert (res_l && res_h);
+
+ /* Merge the data back into the right place. */
+ d.target = dest;
+ d.op0 = gen_lowpart (qimode, res_l);
+ d.op1 = gen_lowpart (qimode, res_h);
+ d.vmode = qimode;
+ d.nelt = GET_MODE_NUNITS (qimode);
+ d.one_operand_p = false;
+ d.testing_p = false;
+
+ if (full_interleave)
+ {
+ /* For SSE2, we used a full interleave, so the desired
+ results are in the even elements. */
+ for (i = 0; i < d.nelt; ++i)
+ d.perm[i] = i * 2;
+ }
+ else
+ {
+ /* For AVX, the interleave used above was not cross-lane, so the
+ extraction is of the even elements but with the second and third
+ quarters swapped. Happily, that is even one insn shorter than a
+ plain even extraction.
+ For AVX512BW we have 4 lanes. We extract evens from within a lane,
+ always first from the first and then from the second source operand;
+ the index bits above the low 4 bits remain the same.
+ Thus, for d.nelt == 32 we want permutation
+ 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
+ and for d.nelt == 64 we want permutation
+ 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
+ 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
+ for (i = 0; i < d.nelt; ++i)
+ d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
+ }
+
+ ok = ix86_expand_vec_perm_const_1 (&d);
+ gcc_assert (ok);
+
+ set_unique_reg_note (get_last_insn (), REG_EQUAL,
+ gen_rtx_fmt_ee (code, qimode, op1, op2));
+}
+
+/* Helper function of ix86_expand_mul_widen_evenodd. Return true
+ if op is CONST_VECTOR with all odd elements equal to their
+ preceding element. */
+
+static bool
+const_vector_equal_evenodd_p (rtx op)
+{
+ machine_mode mode = GET_MODE (op);
+ int i, nunits = GET_MODE_NUNITS (mode);
+ if (GET_CODE (op) != CONST_VECTOR
+ || nunits != CONST_VECTOR_NUNITS (op))
+ return false;
+ for (i = 0; i < nunits; i += 2)
+ if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
+ return false;
+ return true;
+}
+
+void
+ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
+ bool uns_p, bool odd_p)
+{
+ machine_mode mode = GET_MODE (op1);
+ machine_mode wmode = GET_MODE (dest);
+ rtx x;
+ rtx orig_op1 = op1, orig_op2 = op2;
+
+ if (!nonimmediate_operand (op1, mode))
+ op1 = force_reg (mode, op1);
+ if (!nonimmediate_operand (op2, mode))
+ op2 = force_reg (mode, op2);
+
+ /* We only play even/odd games with vectors of SImode. */
+ gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
+
+ /* If we're looking for the odd results, shift those members down to
+ the even slots. For some cpus this is faster than a PSHUFD. */
+ if (odd_p)
+ {
+ /* For XOP use vpmacsdqh, but only for smult, as it is only
+ signed. */
+ if (TARGET_XOP && mode == V4SImode && !uns_p)
+ {
+ x = force_reg (wmode, CONST0_RTX (wmode));
+ emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
+ return;
+ }
+
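+ /* Shift each double-width element right by the narrow element size
+ so the odd SImode elements land in the even slots; constant vectors
+ whose even/odd pairs are equal can be used unchanged. */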
+ x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
+ if (!const_vector_equal_evenodd_p (orig_op1))
+ op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
+ x, NULL, 1, OPTAB_DIRECT);
+ if (!const_vector_equal_evenodd_p (orig_op2))
+ op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
+ x, NULL, 1, OPTAB_DIRECT);
+ op1 = gen_lowpart (mode, op1);
+ op2 = gen_lowpart (mode, op2);
+ }
+
+ if (mode == V16SImode)
+ {
+ if (uns_p)
+ x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
+ else
+ x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
+ }
+ else if (mode == V8SImode)
+ {
+ if (uns_p)
+ x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
+ else
+ x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
+ }
+ else if (uns_p)
+ x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
+ else if (TARGET_SSE4_1)
+ x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
+ else
+ {
+ rtx s1, s2, t0, t1, t2;
+
+ /* The easiest way to implement this without PMULDQ is to go through
+ the motions as if we are performing a full 64-bit multiply, except
+ that we need to do less shuffling of the elements. */
+
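+ /* Why this works: write the 32-bit inputs A and B with sign masks
+ SA and SB (all ones when negative, zero otherwise). Then
+ sign_extend (A) = A + 2^32 * SA (mod 2^64), so
+ sign_extend (A) * sign_extend (B)
+ = A * B + 2^32 * (SA * B + SB * A) (mod 2^64),
+ with the 2^64 * SA * SB term discarded. The code below forms the
+ three unsigned products and merges them with one shift and two adds. */
+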
+ /* Compute the sign-extension, aka highparts, of the two operands. */
+ s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
+ op1, pc_rtx, pc_rtx);
+ s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
+ op2, pc_rtx, pc_rtx);
+
+ /* Multiply LO(A) * HI(B), and vice-versa. */
+ t1 = gen_reg_rtx (wmode);
+ t2 = gen_reg_rtx (wmode);
+ emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
+ emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
+
+ /* Multiply LO(A) * LO(B). */
+ t0 = gen_reg_rtx (wmode);
+ emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
+
+ /* Combine and shift the highparts into place. */
+ t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
+ t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
+ 1, OPTAB_DIRECT);
+
+ /* Combine high and low parts. */
+ force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
+ return;
+ }
+ emit_insn (x);
+}
+
+void
+ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
+ bool uns_p, bool high_p)
+{
+ machine_mode wmode = GET_MODE (dest);
+ machine_mode mode = GET_MODE (op1);
+ rtx t1, t2, t3, t4, mask;
+
+ switch (mode)
+ {
+ case E_V4SImode:
+ t1 = gen_reg_rtx (mode);
+ t2 = gen_reg_rtx (mode);
+ if (TARGET_XOP && !uns_p)
+ {
+ /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
+ shuffle the elements once so that all elements are in the right
+ place for immediate use: { A C B D }. */
+ emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
+ const1_rtx, GEN_INT (3)));
+ emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
+ const1_rtx, GEN_INT (3)));
+ }
+ else
+ {
+ /* Put the elements into place for the multiply. */
+ ix86_expand_vec_interleave (t1, op1, op1, high_p);
+ ix86_expand_vec_interleave (t2, op2, op2, high_p);
+ high_p = false;
+ }
+ ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
+ break;
+
+ case E_V8SImode:
+ /* Shuffle the elements between the lanes. After this we
+ have { A B E F | C D G H } for each operand. */
+ t1 = gen_reg_rtx (V4DImode);
+ t2 = gen_reg_rtx (V4DImode);
+ emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
+ const0_rtx, const2_rtx,
+ const1_rtx, GEN_INT (3)));
+ emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
+ const0_rtx, const2_rtx,
+ const1_rtx, GEN_INT (3)));
+
+ /* Shuffle the elements within the lanes. After this we
+ have { A A B B | C C D D } or { E E F F | G G H H }. */
+ t3 = gen_reg_rtx (V8SImode);
+ t4 = gen_reg_rtx (V8SImode);
+ mask = GEN_INT (high_p
+ ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
+ : 0 + (0 << 2) + (1 << 4) + (1 << 6));
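+ /* The pshufd immediate packs four 2-bit element selectors, least
+ significant field first, so these masks pick elements { 2 2 3 3 } for
+ the high halves and { 0 0 1 1 } for the low halves, duplicating each
+ element as described above. */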
+ emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
+ emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
+
+ ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
+ break;
+
+ case E_V8HImode:
+ case E_V16HImode:
+ t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
+ uns_p, OPTAB_DIRECT);
+ t2 = expand_binop (mode,
+ uns_p ? umul_highpart_optab : smul_highpart_optab,
+ op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
+ gcc_assert (t1 && t2);
+
+ t3 = gen_reg_rtx (mode);
+ ix86_expand_vec_interleave (t3, t1, t2, high_p);
+ emit_move_insn (dest, gen_lowpart (wmode, t3));
+ break;
+
+ case E_V16QImode:
+ case E_V32QImode:
+ case E_V32HImode:
+ case E_V16SImode:
+ case E_V64QImode:
+ t1 = gen_reg_rtx (wmode);
+ t2 = gen_reg_rtx (wmode);
+ ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
+ ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
+
+ emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+}
+
+void
+ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
+{
+ rtx res_1, res_2, res_3, res_4;
+
+ res_1 = gen_reg_rtx (V4SImode);
+ res_2 = gen_reg_rtx (V4SImode);
+ res_3 = gen_reg_rtx (V2DImode);
+ res_4 = gen_reg_rtx (V2DImode);
+ ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
+ ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
+
+ /* Move the results in element 2 down to element 1; we don't care
+ what goes in elements 2 and 3. Then we can merge the parts
+ back together with an interleave.
+
+ Note that two other sequences were tried:
+ (1) Use interleaves at the start instead of psrldq, which allows
+ us to use a single shufps to merge things back at the end.
+ (2) Use shufps here to combine the two vectors, then pshufd to
+ put the elements in the correct order.
+ In both cases the cost of the reformatting stall was too high
+ and the overall sequence slower. */
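+
+ /* Concretely, for op1 = { a0 a1 a2 a3 } and op2 = { b0 b1 b2 b3 },
+ res_3 viewed as V4SImode is { lo(a0*b0) hi(a0*b0) lo(a2*b2) hi(a2*b2) }
+ and res_4 is { lo(a1*b1) hi(a1*b1) lo(a3*b3) hi(a3*b3) }. The pshufds
+ below pick elements { 0 2 x x } of each, and the low interleave then
+ produces { lo(a0*b0) lo(a1*b1) lo(a2*b2) lo(a3*b3) }, the product. */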
+
+ emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
+ const0_rtx, const2_rtx,
+ const0_rtx, const0_rtx));
+ emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
+ const0_rtx, const2_rtx,
+ const0_rtx, const0_rtx));
+ res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
+
+ set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
+}
+
+void
+ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
+{
+ machine_mode mode = GET_MODE (op0);
+ rtx t1, t2, t3, t4, t5, t6;
+
+ if (TARGET_AVX512DQ && mode == V8DImode)
+ emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
+ else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
+ emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
+ else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
+ emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
+ else if (TARGET_XOP && mode == V2DImode)
+ {
+ /* op1: A,B,C,D, op2: E,F,G,H */
+ op1 = gen_lowpart (V4SImode, op1);
+ op2 = gen_lowpart (V4SImode, op2);
+
+ t1 = gen_reg_rtx (V4SImode);
+ t2 = gen_reg_rtx (V4SImode);
+ t3 = gen_reg_rtx (V2DImode);
+ t4 = gen_reg_rtx (V2DImode);
+
+ /* t1: B,A,D,C */
+ emit_insn (gen_sse2_pshufd_1 (t1, op1,
+ GEN_INT (1),
+ GEN_INT (0),
+ GEN_INT (3),
+ GEN_INT (2)));
+
+ /* t2: (B*E),(A*F),(D*G),(C*H) */
+ emit_insn (gen_mulv4si3 (t2, t1, op2));
+
+ /* t3: (B*E)+(A*F), (D*G)+(C*H) */
+ emit_insn (gen_xop_phadddq (t3, t2));
+
+ /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
+ emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
+
+ /* Multiply the lower parts and add all the partial results. */
+ t5 = gen_reg_rtx (V2DImode);
+ emit_insn (gen_vec_widen_umult_even_v4si (t5,
+ gen_lowpart (V4SImode, op1),
+ gen_lowpart (V4SImode, op2)));
+ op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
+
+ }
+ else
+ {
+ machine_mode nmode;
+ rtx (*umul) (rtx, rtx, rtx);
+
+ if (mode == V2DImode)
+ {
+ umul = gen_vec_widen_umult_even_v4si;
+ nmode = V4SImode;
+ }
+ else if (mode == V4DImode)
+ {
+ umul = gen_vec_widen_umult_even_v8si;
+ nmode = V8SImode;
+ }
+ else if (mode == V8DImode)
+ {
+ umul = gen_vec_widen_umult_even_v16si;
+ nmode = V16SImode;
+ }
+ else
+ gcc_unreachable ();
+
+
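+ /* The steps below follow from writing each 64-bit element as
+ A = 2^32 * A_hi + A_lo and B = 2^32 * B_hi + B_lo, so that
+ A * B = A_lo * B_lo + 2^32 * (A_hi * B_lo + A_lo * B_hi) (mod 2^64),
+ with the 2^64 * A_hi * B_hi term discarded. */
+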
+ /* Multiply low parts. */
+ t1 = gen_reg_rtx (mode);
+ emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
+
+ /* Shift input vectors right 32 bits so we can multiply high parts. */
+ t6 = GEN_INT (32);
+ t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
+ t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
+
+ /* Multiply high parts by low parts. */
+ t4 = gen_reg_rtx (mode);
+ t5 = gen_reg_rtx (mode);
+ emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
+ emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
+
+ /* Combine and shift the highparts back. */
+ t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
+ t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
+
+ /* Combine high and low parts. */
+ force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
+ }
+
+ set_unique_reg_note (get_last_insn (), REG_EQUAL,
+ gen_rtx_MULT (mode, op1, op2));
+}
+
+/* Return true if control transfer instruction INSN
+ should be encoded with the notrack prefix. */
+
+bool
+ix86_notrack_prefixed_insn_p (rtx insn)
+{
+ if (!insn || !(flag_cf_protection & CF_BRANCH))
+ return false;
+
+ if (CALL_P (insn))
+ {
+ rtx call = get_call_rtx_from (insn);
+ gcc_assert (call != NULL_RTX);
+ rtx addr = XEXP (call, 0);
+
+ /* Do not emit 'notrack' if it's not an indirect call. */
+ if (MEM_P (addr)
+ && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
+ return false;
+ else
+ return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
+ }
+
+ if (JUMP_P (insn) && !flag_cet_switch)
+ {
+ rtx target = JUMP_LABEL (insn);
+ if (target == NULL_RTX || ANY_RETURN_P (target))
+ return false;
+
+ /* Check whether the jump is a switch table jump. */
+ rtx_insn *label = as_a<rtx_insn *> (target);
+ rtx_insn *table = next_insn (label);
+ if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
+ return false;
+ else
+ return true;
+ }
+ return false;
+}
+
+/* Calculate integer abs() using only SSE2 instructions. */
+
+void
+ix86_expand_sse2_abs (rtx target, rtx input)
+{
+ machine_mode mode = GET_MODE (target);
+ rtx tmp0, tmp1, x;
+
+ switch (mode)
+ {
+ case E_V2DImode:
+ case E_V4DImode:
+ /* For 64-bit signed integer X, with SSE4.2 use
+ pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
+ Otherwise handle it similarly to V4SImode, except use 64 as W instead
+ of 32 and, since the arithmetic right shift is unimplemented, form the
+ sign mask with a logical right shift followed by a negation. */
+ if (TARGET_SSE4_2)
+ {
+ tmp0 = gen_reg_rtx (mode);
+ tmp1 = gen_reg_rtx (mode);
+ emit_move_insn (tmp1, CONST0_RTX (mode));
+ if (mode == E_V2DImode)
+ emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
+ else
+ emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
+ }
+ else
+ {
+ tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
+ GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
+ - 1), NULL, 0, OPTAB_DIRECT);
+ tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
+ }
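+ /* In both cases tmp0 now holds the per-element sign mask: zero for
+ non-negative inputs, all ones for negative ones. Then
+ (X ^ mask) - mask is X when the mask is zero and ~X + 1 == -X when
+ it is all ones, i.e. the absolute value. */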
+
+ tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
+ NULL, 0, OPTAB_DIRECT);
+ x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
+ target, 0, OPTAB_DIRECT);
+ break;
+
+ case E_V4SImode:
+ /* For 32-bit signed integer X, the best way to calculate the absolute
+ value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
+ tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
+ GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
+ NULL, 0, OPTAB_DIRECT);
+ tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
+ NULL, 0, OPTAB_DIRECT);
+ x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
+ target, 0, OPTAB_DIRECT);
+ break;
+
+ case E_V8HImode:
+ /* For 16-bit signed integer X, the best way to calculate the absolute
+ value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
+ tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
+
+ x = expand_simple_binop (mode, SMAX, tmp0, input,
+ target, 0, OPTAB_DIRECT);
+ break;
+
+ case E_V16QImode:
+ /* For 8-bit signed integer X, the best way to calculate the absolute
+ value of X is min ((unsigned char) X, (unsigned char) (-X)),
+ as SSE2 provides the PMINUB insn. */
+ tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
+
+ x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
+ target, 0, OPTAB_DIRECT);
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ if (x != target)
+ emit_move_insn (target, x);
+}
+
+/* Expand an extract from a vector register through pextr insn.
+ Return true if successful. */
+
+bool
+ix86_expand_pextr (rtx *operands)
+{
+ rtx dst = operands[0];
+ rtx src = operands[1];
+
+ unsigned int size = INTVAL (operands[2]);
+ unsigned int pos = INTVAL (operands[3]);
+
+ if (SUBREG_P (dst))
+ {
+ /* Reject non-lowpart subregs. */
+ if (SUBREG_BYTE (dst) > 0)
+ return false;
+ dst = SUBREG_REG (dst);
+ }
+
+ if (SUBREG_P (src))
+ {
+ pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
+ src = SUBREG_REG (src);
+ }
+
+ switch (GET_MODE (src))
+ {
+ case E_V16QImode:
+ case E_V8HImode:
+ case E_V4SImode:
+ case E_V2DImode:
+ case E_V1TImode:
+ case E_TImode:
+ {
+ machine_mode srcmode, dstmode;
+ rtx d, pat;
+
+ if (!int_mode_for_size (size, 0).exists (&dstmode))
+ return false;
+
+ switch (dstmode)
+ {
+ case E_QImode:
+ if (!TARGET_SSE4_1)
+ return false;
+ srcmode = V16QImode;
+ break;
+
+ case E_HImode:
+ if (!TARGET_SSE2)
+ return false;
+ srcmode = V8HImode;
+ break;
+
+ case E_SImode:
+ if (!TARGET_SSE4_1)
+ return false;
+ srcmode = V4SImode;
+ break;
+
+ case E_DImode:
+ gcc_assert (TARGET_64BIT);
+ if (!TARGET_SSE4_1)
+ return false;
+ srcmode = V2DImode;
+ break;
+
+ default:
+ return false;
+ }
+
+ /* Reject extractions from misaligned positions. */
+ if (pos & (size-1))
+ return false;
+
+ if (GET_MODE (dst) == dstmode)
+ d = dst;
+ else
+ d = gen_reg_rtx (dstmode);
+
+ /* Construct insn pattern. */
+ pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
+ pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
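+ /* POS is a multiple of SIZE (checked above), so pos / size is simply
+ the index of the element to extract; e.g. a 16-bit extraction at bit
+ position 48 from a V8HImode source selects element 3. */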
+
+ /* Let the rtl optimizers know about the zero extension performed. */
+ if (dstmode == QImode || dstmode == HImode)
+ {
+ pat = gen_rtx_ZERO_EXTEND (SImode, pat);
+ d = gen_lowpart (SImode, d);
+ }
+
+ emit_insn (gen_rtx_SET (d, pat));
+
+ if (d != dst)
+ emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
+ return true;
+ }
+
+ default:
+ return false;
+ }
+}
+
+/* Expand an insert into a vector register through pinsr insn.
+ Return true if successful. */
+
+bool
+ix86_expand_pinsr (rtx *operands)
+{
+ rtx dst = operands[0];
+ rtx src = operands[3];
+
+ unsigned int size = INTVAL (operands[1]);
+ unsigned int pos = INTVAL (operands[2]);
+
+ if (SUBREG_P (dst))
+ {
+ pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
+ dst = SUBREG_REG (dst);
+ }
+
+ switch (GET_MODE (dst))
+ {
+ case E_V16QImode:
+ case E_V8HImode:
+ case E_V4SImode:
+ case E_V2DImode:
+ case E_V1TImode:
+ case E_TImode:
+ {
+ machine_mode srcmode, dstmode;
+ rtx (*pinsr)(rtx, rtx, rtx, rtx);
+ rtx d;
+
+ if (!int_mode_for_size (size, 0).exists (&srcmode))
+ return false;
+
+ switch (srcmode)
+ {
+ case E_QImode:
+ if (!TARGET_SSE4_1)
+ return false;
+ dstmode = V16QImode;
+ pinsr = gen_sse4_1_pinsrb;
+ break;
+
+ case E_HImode:
+ if (!TARGET_SSE2)
+ return false;
+ dstmode = V8HImode;
+ pinsr = gen_sse2_pinsrw;
+ break;
+
+ case E_SImode:
+ if (!TARGET_SSE4_1)
+ return false;
+ dstmode = V4SImode;
+ pinsr = gen_sse4_1_pinsrd;
+ break;
+
+ case E_DImode:
+ gcc_assert (TARGET_64BIT);
+ if (!TARGET_SSE4_1)
+ return false;
+ dstmode = V2DImode;
+ pinsr = gen_sse4_1_pinsrq;
+ break;
+
+ default:
+ return false;
+ }
+
+ /* Reject insertions to misaligned positions. */
+ if (pos & (size-1))
+ return false;
+
+ if (SUBREG_P (src))
+ {
+ unsigned int srcpos = SUBREG_BYTE (src);
+
+ if (srcpos > 0)
+ {
+ rtx extr_ops[4];
+
+ extr_ops[0] = gen_reg_rtx (srcmode);
+ extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
+ extr_ops[2] = GEN_INT (size);
+ extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
+
+ if (!ix86_expand_pextr (extr_ops))
+ return false;
+
+ src = extr_ops[0];
+ }
+ else
+ src = gen_lowpart (srcmode, SUBREG_REG (src));
+ }
+
+ if (GET_MODE (dst) == dstmode)
+ d = dst;
+ else
+ d = gen_reg_rtx (dstmode);
+
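+ /* The element to replace is passed to the pinsr pattern as a one-hot
+ mask, 1 << (pos / size); POS is a multiple of SIZE, so pos / size is
+ the element index. */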
+ emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
+ gen_lowpart (srcmode, src),
+ GEN_INT (1 << (pos / size))));
+ if (d != dst)
+ emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
+ return true;
+ }
+
+ default:
+ return false;
+ }
+}
+
+/* All CPUs prefer to avoid cross-lane operations, so perform reductions
+ of the upper halves against the lower halves until we reach SSE register
+ size. */
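+/* For example, a V8SImode reduction would first be narrowed to V4SImode
+ by combining its upper half with its lower half; the remaining reduction
+ then proceeds within a single SSE register. */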
+
+machine_mode
+ix86_split_reduction (machine_mode mode)
+{
+ /* Reduce lowpart against highpart until we reach SSE reg width to
+ avoid cross-lane operations. */
+ switch (mode)
+ {
+ case E_V8DImode:
+ case E_V4DImode:
+ return V2DImode;
+ case E_V16SImode:
+ case E_V8SImode:
+ return V4SImode;
+ case E_V32HImode:
+ case E_V16HImode:
+ return V8HImode;
+ case E_V64QImode:
+ case E_V32QImode:
+ return V16QImode;
+ case E_V16SFmode:
+ case E_V8SFmode:
+ return V4SFmode;
+ case E_V8DFmode:
+ case E_V4DFmode:
+ return V2DFmode;
+ default:
+ return mode;
+ }
+}
+
+/* Generate call to __divmoddi4. */
+
+void
+ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
+ rtx op0, rtx op1,
+ rtx *quot_p, rtx *rem_p)
+{
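+ /* The libgcc __divmod helpers return the quotient and store the
+ remainder through the pointer passed as their last argument, so
+ allocate a stack slot to receive it. */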
+ rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
+
+ rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
+ mode, op0, mode, op1, mode,
+ XEXP (rem, 0), Pmode);
+ *quot_p = quot;
+ *rem_p = rem;
+}
+
+#include "gt-i386-expand.h"
#include "tree-vector-builder.h"
#include "debug.h"
#include "dwarf2out.h"
+#include "i386-options.h"
+#include "i386-builtins.h"
+#include "i386-expand.h"
+#include "i386-features.h"
/* This file should be included last. */
#include "target-def.h"
-#include "x86-tune-costs.h"
-
static rtx legitimize_dllimport_symbol (rtx, bool);
static rtx legitimize_pe_coff_extern_decl (rtx, bool);
-static rtx legitimize_pe_coff_symbol (rtx, bool);
static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
-static bool ix86_save_reg (unsigned int, bool, bool);
-static bool ix86_function_naked (const_tree);
-static bool ix86_notrack_prefixed_insn_p (rtx);
static void ix86_emit_restore_reg_using_pop (rtx);
/* Set by -mtune or -Os. */
const struct processor_costs *ix86_cost = NULL;
-/* Processor feature/optimization bitmasks. */
-#define m_386 (HOST_WIDE_INT_1U<<PROCESSOR_I386)
-#define m_486 (HOST_WIDE_INT_1U<<PROCESSOR_I486)
-#define m_PENT (HOST_WIDE_INT_1U<<PROCESSOR_PENTIUM)
-#define m_LAKEMONT (HOST_WIDE_INT_1U<<PROCESSOR_LAKEMONT)
-#define m_PPRO (HOST_WIDE_INT_1U<<PROCESSOR_PENTIUMPRO)
-#define m_PENT4 (HOST_WIDE_INT_1U<<PROCESSOR_PENTIUM4)
-#define m_NOCONA (HOST_WIDE_INT_1U<<PROCESSOR_NOCONA)
-#define m_P4_NOCONA (m_PENT4 | m_NOCONA)
-#define m_CORE2 (HOST_WIDE_INT_1U<<PROCESSOR_CORE2)
-#define m_NEHALEM (HOST_WIDE_INT_1U<<PROCESSOR_NEHALEM)
-#define m_SANDYBRIDGE (HOST_WIDE_INT_1U<<PROCESSOR_SANDYBRIDGE)
-#define m_HASWELL (HOST_WIDE_INT_1U<<PROCESSOR_HASWELL)
-#define m_BONNELL (HOST_WIDE_INT_1U<<PROCESSOR_BONNELL)
-#define m_SILVERMONT (HOST_WIDE_INT_1U<<PROCESSOR_SILVERMONT)
-#define m_KNL (HOST_WIDE_INT_1U<<PROCESSOR_KNL)
-#define m_KNM (HOST_WIDE_INT_1U<<PROCESSOR_KNM)
-#define m_SKYLAKE (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE)
-#define m_SKYLAKE_AVX512 (HOST_WIDE_INT_1U<<PROCESSOR_SKYLAKE_AVX512)
-#define m_CANNONLAKE (HOST_WIDE_INT_1U<<PROCESSOR_CANNONLAKE)
-#define m_ICELAKE_CLIENT (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_CLIENT)
-#define m_ICELAKE_SERVER (HOST_WIDE_INT_1U<<PROCESSOR_ICELAKE_SERVER)
-#define m_CASCADELAKE (HOST_WIDE_INT_1U<<PROCESSOR_CASCADELAKE)
-#define m_CORE_AVX512 (m_SKYLAKE_AVX512 | m_CANNONLAKE \
- | m_ICELAKE_CLIENT | m_ICELAKE_SERVER | m_CASCADELAKE)
-#define m_CORE_AVX2 (m_HASWELL | m_SKYLAKE | m_CORE_AVX512)
-#define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2)
-#define m_GOLDMONT (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT)
-#define m_GOLDMONT_PLUS (HOST_WIDE_INT_1U<<PROCESSOR_GOLDMONT_PLUS)
-#define m_TREMONT (HOST_WIDE_INT_1U<<PROCESSOR_TREMONT)
-#define m_INTEL (HOST_WIDE_INT_1U<<PROCESSOR_INTEL)
-
-#define m_GEODE (HOST_WIDE_INT_1U<<PROCESSOR_GEODE)
-#define m_K6 (HOST_WIDE_INT_1U<<PROCESSOR_K6)
-#define m_K6_GEODE (m_K6 | m_GEODE)
-#define m_K8 (HOST_WIDE_INT_1U<<PROCESSOR_K8)
-#define m_ATHLON (HOST_WIDE_INT_1U<<PROCESSOR_ATHLON)
-#define m_ATHLON_K8 (m_K8 | m_ATHLON)
-#define m_AMDFAM10 (HOST_WIDE_INT_1U<<PROCESSOR_AMDFAM10)
-#define m_BDVER1 (HOST_WIDE_INT_1U<<PROCESSOR_BDVER1)
-#define m_BDVER2 (HOST_WIDE_INT_1U<<PROCESSOR_BDVER2)
-#define m_BDVER3 (HOST_WIDE_INT_1U<<PROCESSOR_BDVER3)
-#define m_BDVER4 (HOST_WIDE_INT_1U<<PROCESSOR_BDVER4)
-#define m_ZNVER1 (HOST_WIDE_INT_1U<<PROCESSOR_ZNVER1)
-#define m_ZNVER2 (HOST_WIDE_INT_1U<<PROCESSOR_ZNVER2)
-#define m_BTVER1 (HOST_WIDE_INT_1U<<PROCESSOR_BTVER1)
-#define m_BTVER2 (HOST_WIDE_INT_1U<<PROCESSOR_BTVER2)
-#define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
-#define m_BTVER (m_BTVER1 | m_BTVER2)
-#define m_ZNVER (m_ZNVER1 | m_ZNVER2)
-#define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
- | m_ZNVER)
-
-#define m_GENERIC (HOST_WIDE_INT_1U<<PROCESSOR_GENERIC)
-
-const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
-#undef DEF_TUNE
-#define DEF_TUNE(tune, name, selector) name,
-#include "x86-tune.def"
-#undef DEF_TUNE
-};
-
-/* Feature tests against the various tunings. */
-unsigned char ix86_tune_features[X86_TUNE_LAST];
-
-/* Feature tests against the various tunings used to create ix86_tune_features
- based on the processor mask. */
-static unsigned HOST_WIDE_INT initial_ix86_tune_features[X86_TUNE_LAST] = {
-#undef DEF_TUNE
-#define DEF_TUNE(tune, name, selector) selector,
-#include "x86-tune.def"
-#undef DEF_TUNE
-};
-
-/* Feature tests against the various architecture variations. */
-unsigned char ix86_arch_features[X86_ARCH_LAST];
-
-/* Feature tests against the various architecture variations, used to create
- ix86_arch_features based on the processor mask. */
-static unsigned HOST_WIDE_INT initial_ix86_arch_features[X86_ARCH_LAST] = {
- /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
- ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),
-
- /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
- ~m_386,
-
- /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
- ~(m_386 | m_486),
-
- /* X86_ARCH_XADD: Exchange and add was added for 80486. */
- ~m_386,
-
- /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
- ~m_386,
-};
-
/* In case the average insn count for single function invocation is
lower than this constant, emit fast (but longer) prologue and
epilogue code. */
AX_REG, DX_REG, DI_REG, SI_REG
};
-/* Additional registers that are clobbered by SYSV calls. */
-
-#define NUM_X86_64_MS_CLOBBERED_REGS 12
-static int const x86_64_ms_sysv_extra_clobbered_registers
- [NUM_X86_64_MS_CLOBBERED_REGS] =
-{
- SI_REG, DI_REG,
- XMM6_REG, XMM7_REG,
- XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
- XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
-};
-
-enum xlogue_stub {
- XLOGUE_STUB_SAVE,
- XLOGUE_STUB_RESTORE,
- XLOGUE_STUB_RESTORE_TAIL,
- XLOGUE_STUB_SAVE_HFP,
- XLOGUE_STUB_RESTORE_HFP,
- XLOGUE_STUB_RESTORE_HFP_TAIL,
-
- XLOGUE_STUB_COUNT
-};
-
-enum xlogue_stub_sets {
- XLOGUE_SET_ALIGNED,
- XLOGUE_SET_ALIGNED_PLUS_8,
- XLOGUE_SET_HFP_ALIGNED_OR_REALIGN,
- XLOGUE_SET_HFP_ALIGNED_PLUS_8,
-
- XLOGUE_SET_COUNT
-};
-
-/* Register save/restore layout used by out-of-line stubs. */
-class xlogue_layout {
-public:
- struct reginfo
- {
- unsigned regno;
- HOST_WIDE_INT offset; /* Offset used by stub base pointer (rax or
- rsi) to where each register is stored. */
- };
-
- unsigned get_nregs () const {return m_nregs;}
- HOST_WIDE_INT get_stack_align_off_in () const {return m_stack_align_off_in;}
-
- const reginfo &get_reginfo (unsigned reg) const
- {
- gcc_assert (reg < m_nregs);
- return m_regs[reg];
- }
-
- static const char *get_stub_name (enum xlogue_stub stub,
- unsigned n_extra_args);
-
- /* Returns an rtx for the stub's symbol based upon
- 1.) the specified stub (save, restore or restore_ret) and
- 2.) the value of cfun->machine->call_ms2sysv_extra_regs and
- 3.) rather or not stack alignment is being performed. */
- static rtx get_stub_rtx (enum xlogue_stub stub);
-
- /* Returns the amount of stack space (including padding) that the stub
- needs to store registers based upon data in the machine_function. */
- HOST_WIDE_INT get_stack_space_used () const
- {
- const struct machine_function *m = cfun->machine;
- unsigned last_reg = m->call_ms2sysv_extra_regs + MIN_REGS - 1;
-
- gcc_assert (m->call_ms2sysv_extra_regs <= MAX_EXTRA_REGS);
- return m_regs[last_reg].offset + STUB_INDEX_OFFSET;
- }
-
- /* Returns the offset for the base pointer used by the stub. */
- HOST_WIDE_INT get_stub_ptr_offset () const
- {
- return STUB_INDEX_OFFSET + m_stack_align_off_in;
- }
-
- static const struct xlogue_layout &get_instance ();
- static unsigned count_stub_managed_regs ();
- static bool is_stub_managed_reg (unsigned regno, unsigned count);
-
- static const HOST_WIDE_INT STUB_INDEX_OFFSET = 0x70;
- static const unsigned MIN_REGS = NUM_X86_64_MS_CLOBBERED_REGS;
- static const unsigned MAX_REGS = 18;
- static const unsigned MAX_EXTRA_REGS = MAX_REGS - MIN_REGS;
- static const unsigned VARIANT_COUNT = MAX_EXTRA_REGS + 1;
- static const unsigned STUB_NAME_MAX_LEN = 20;
- static const char * const STUB_BASE_NAMES[XLOGUE_STUB_COUNT];
- static const unsigned REG_ORDER[MAX_REGS];
- static const unsigned REG_ORDER_REALIGN[MAX_REGS];
-
-private:
- xlogue_layout ();
- xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp);
- xlogue_layout (const xlogue_layout &);
-
- /* True if hard frame pointer is used. */
- bool m_hfp;
-
- /* Max number of register this layout manages. */
- unsigned m_nregs;
-
- /* Incoming offset from 16-byte alignment. */
- HOST_WIDE_INT m_stack_align_off_in;
-
- /* Register order and offsets. */
- struct reginfo m_regs[MAX_REGS];
-
- /* Lazy-inited cache of symbol names for stubs. */
- static char s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
- [STUB_NAME_MAX_LEN];
-
- static const xlogue_layout s_instances[XLOGUE_SET_COUNT];
-};
-
-const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
- "savms64",
- "resms64",
- "resms64x",
- "savms64f",
- "resms64f",
- "resms64fx"
-};
-
-const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
-/* The below offset values are where each register is stored for the layout
- relative to incoming stack pointer. The value of each m_regs[].offset will
- be relative to the incoming base pointer (rax or rsi) used by the stub.
-
- s_instances: 0 1 2 3
- Offset: realigned or aligned + 8
- Register aligned aligned + 8 aligned w/HFP w/HFP */
- XMM15_REG, /* 0x10 0x18 0x10 0x18 */
- XMM14_REG, /* 0x20 0x28 0x20 0x28 */
- XMM13_REG, /* 0x30 0x38 0x30 0x38 */
- XMM12_REG, /* 0x40 0x48 0x40 0x48 */
- XMM11_REG, /* 0x50 0x58 0x50 0x58 */
- XMM10_REG, /* 0x60 0x68 0x60 0x68 */
- XMM9_REG, /* 0x70 0x78 0x70 0x78 */
- XMM8_REG, /* 0x80 0x88 0x80 0x88 */
- XMM7_REG, /* 0x90 0x98 0x90 0x98 */
- XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */
- SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */
- DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */
- BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */
- BP_REG, /* 0xc0 0xc8 N/A N/A */
- R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */
- R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */
- R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */
- R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */
-};
-
-/* Instantiate static const values. */
-const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
-const unsigned xlogue_layout::MIN_REGS;
-const unsigned xlogue_layout::MAX_REGS;
-const unsigned xlogue_layout::MAX_EXTRA_REGS;
-const unsigned xlogue_layout::VARIANT_COUNT;
-const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
-
-/* Initialize xlogue_layout::s_stub_names to zero. */
-char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
- [STUB_NAME_MAX_LEN];
-
-/* Instantiates all xlogue_layout instances. */
-const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
- xlogue_layout (0, false),
- xlogue_layout (8, false),
- xlogue_layout (0, true),
- xlogue_layout (8, true)
-};
-
-/* Return an appropriate const instance of xlogue_layout based upon values
- in cfun->machine and crtl. */
-const struct xlogue_layout &
-xlogue_layout::get_instance ()
-{
- enum xlogue_stub_sets stub_set;
- bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
-
- if (stack_realign_fp)
- stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
- else if (frame_pointer_needed)
- stub_set = aligned_plus_8
- ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
- : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
- else
- stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
-
- return s_instances[stub_set];
-}
-
-/* Determine how many clobbered registers can be saved by the stub.
- Returns the count of registers the stub will save and restore. */
-unsigned
-xlogue_layout::count_stub_managed_regs ()
-{
- bool hfp = frame_pointer_needed || stack_realign_fp;
- unsigned i, count;
- unsigned regno;
-
- for (count = i = MIN_REGS; i < MAX_REGS; ++i)
- {
- regno = REG_ORDER[i];
- if (regno == BP_REG && hfp)
- continue;
- if (!ix86_save_reg (regno, false, false))
- break;
- ++count;
- }
- return count;
-}
-
-/* Determine if register REGNO is a stub managed register given the
- total COUNT of stub managed registers. */
-bool
-xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
-{
- bool hfp = frame_pointer_needed || stack_realign_fp;
- unsigned i;
-
- for (i = 0; i < count; ++i)
- {
- gcc_assert (i < MAX_REGS);
- if (REG_ORDER[i] == BP_REG && hfp)
- ++count;
- else if (REG_ORDER[i] == regno)
- return true;
- }
- return false;
-}
-
-/* Constructor for xlogue_layout. */
-xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
- : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
- m_stack_align_off_in (stack_align_off_in)
-{
- HOST_WIDE_INT offset = stack_align_off_in;
- unsigned i, j;
-
- for (i = j = 0; i < MAX_REGS; ++i)
- {
- unsigned regno = REG_ORDER[i];
-
- if (regno == BP_REG && hfp)
- continue;
- if (SSE_REGNO_P (regno))
- {
- offset += 16;
- /* Verify that SSE regs are always aligned. */
- gcc_assert (!((stack_align_off_in + offset) & 15));
- }
- else
- offset += 8;
-
- m_regs[j].regno = regno;
- m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
- }
- gcc_assert (j == m_nregs);
-}
-
-const char *
-xlogue_layout::get_stub_name (enum xlogue_stub stub,
- unsigned n_extra_regs)
-{
- const int have_avx = TARGET_AVX;
- char *name = s_stub_names[!!have_avx][stub][n_extra_regs];
-
- /* Lazy init */
- if (!*name)
- {
- int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
- (have_avx ? "avx" : "sse"),
- STUB_BASE_NAMES[stub],
- MIN_REGS + n_extra_regs);
- gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
- }
-
- return name;
-}
-
-/* Return rtx of a symbol ref for the entry point (based upon
- cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */
-rtx
-xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
-{
- const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
- gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
- gcc_assert (stub < XLOGUE_STUB_COUNT);
- gcc_assert (crtl->stack_realign_finalized);
-
- return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
-}
-
/* Define the structure for the machine field in struct function. */
struct GTY(()) stack_local_entry {
/* True if processor has SSE prefetch instruction. */
unsigned char x86_prefetch_sse;
-/* -mstackrealign option */
-static const char ix86_force_align_arg_pointer_string[]
- = "force_align_arg_pointer";
-
-static rtx (*ix86_gen_leave) (void);
-static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
-static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
-static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
-static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
-static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
-static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
-static rtx (*ix86_gen_clzero) (rtx);
-static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
-static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
-static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
-static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
-static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
-static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
+rtx (*ix86_gen_leave) (void);
+rtx (*ix86_gen_add3) (rtx, rtx, rtx);
+rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
+rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
+rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
+rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
+rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
+rtx (*ix86_gen_clzero) (rtx);
+rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
+rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
+rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
+rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
+rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
+rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
/* Preferred alignment for stack boundary in bits. */
unsigned int ix86_preferred_stack_boundary;
/* Alignment for incoming stack boundary in bits specified at
command line. */
-static unsigned int ix86_user_incoming_stack_boundary;
+unsigned int ix86_user_incoming_stack_boundary;
/* Default alignment for incoming stack boundary in bits. */
-static unsigned int ix86_default_incoming_stack_boundary;
+unsigned int ix86_default_incoming_stack_boundary;
/* Alignment for incoming stack boundary in bits. */
unsigned int ix86_incoming_stack_boundary;
/* Calling abi specific va_list type nodes. */
-static GTY(()) tree sysv_va_list_type_node;
-static GTY(()) tree ms_va_list_type_node;
+tree sysv_va_list_type_node;
+tree ms_va_list_type_node;
/* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
char internal_label_prefix[16];
static bool ext_80387_constants_init;
\f
-static struct machine_function * ix86_init_machine_status (void);
static rtx ix86_function_value (const_tree, const_tree, bool);
static bool ix86_function_value_regno_p (const unsigned int);
static unsigned int ix86_function_arg_boundary (machine_mode,
static rtx ix86_static_chain (const_tree, bool);
static int ix86_function_regparm (const_tree, const_tree);
static void ix86_compute_frame_layout (void);
-static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
- rtx, rtx, int);
-static void ix86_add_new_builtins (HOST_WIDE_INT, HOST_WIDE_INT);
static tree ix86_canonical_va_list_type (tree);
-static void predict_jump (int);
static unsigned int split_stack_prologue_scratch_regno (void);
static bool i386_asm_output_addr_const_extra (FILE *, rtx);
-enum ix86_function_specific_strings
-{
- IX86_FUNCTION_SPECIFIC_ARCH,
- IX86_FUNCTION_SPECIFIC_TUNE,
- IX86_FUNCTION_SPECIFIC_MAX
-};
-
-static char *ix86_target_string (HOST_WIDE_INT, HOST_WIDE_INT, int, int,
- const char *, const char *, enum fpmath_unit,
- bool, bool);
-static void ix86_function_specific_save (struct cl_target_option *,
- struct gcc_options *opts);
-static void ix86_function_specific_restore (struct gcc_options *opts,
- struct cl_target_option *);
-static void ix86_function_specific_post_stream_in (struct cl_target_option *);
-static void ix86_function_specific_print (FILE *, int,
- struct cl_target_option *);
-static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
-static bool ix86_valid_target_attribute_inner_p (tree, tree, char *[],
- struct gcc_options *,
- struct gcc_options *,
- struct gcc_options *,
- bool);
static bool ix86_can_inline_p (tree, tree);
-static void ix86_set_current_function (tree);
static unsigned int ix86_minimum_incoming_stack_boundary (bool);
-static enum calling_abi ix86_function_abi (const_tree);
-
\f
-#ifndef SUBTARGET32_DEFAULT_CPU
-#define SUBTARGET32_DEFAULT_CPU "i386"
-#endif
-
/* Whether -mtune= or -march= were specified */
-static int ix86_tune_defaulted;
-static int ix86_arch_specified;
-
-/* Vectorization library interface and handlers. */
-static tree (*ix86_veclib_handler) (combined_fn, tree, tree);
-
-static tree ix86_veclibabi_svml (combined_fn, tree, tree);
-static tree ix86_veclibabi_acml (combined_fn, tree, tree);
-
-/* This table must be in sync with enum processor_type in i386.h. */
-static const struct processor_costs *processor_cost_table[] =
-{
- &generic_cost,
- &i386_cost,
- &i486_cost,
- &pentium_cost,
- &lakemont_cost,
- &pentiumpro_cost,
- &pentium4_cost,
- &nocona_cost,
- &core_cost,
- &core_cost,
- &core_cost,
- &core_cost,
- &atom_cost,
- &slm_cost,
- &slm_cost,
- &slm_cost,
- &slm_cost,
- &slm_cost,
- &slm_cost,
- &skylake_cost,
- &skylake_cost,
- &skylake_cost,
- &skylake_cost,
- &skylake_cost,
- &skylake_cost,
- &intel_cost,
- &geode_cost,
- &k6_cost,
- &athlon_cost,
- &k8_cost,
- &amdfam10_cost,
- &bdver_cost,
- &bdver_cost,
- &bdver_cost,
- &bdver_cost,
- &btver1_cost,
- &btver2_cost,
- &znver1_cost,
- &znver2_cost
-};
-
-/* Guarantee that the array is aligned with enum processor_type. */
-STATIC_ASSERT (ARRAY_SIZE (processor_cost_table) == PROCESSOR_max);
+int ix86_tune_defaulted;
+int ix86_arch_specified;
\f
-static unsigned int
-rest_of_handle_insert_vzeroupper (void)
-{
- int i;
-
- /* vzeroupper instructions are inserted immediately after reload to
- account for possible spills from 256bit or 512bit registers. The pass
- reuses mode switching infrastructure by re-running mode insertion
- pass, so disable entities that have already been processed. */
- for (i = 0; i < MAX_386_ENTITIES; i++)
- ix86_optimize_mode_switching[i] = 0;
+/* Return true if a red-zone is in use. We can't use the red-zone when
+ there are local indirect jumps, like "indirect_jump" or "tablejump",
+ which jump to another place in the function, since "call" in the
+ indirect thunk pushes the return address onto the stack, destroying
+ the red-zone.
- ix86_optimize_mode_switching[AVX_U128] = 1;
+ TODO: If we can reserve the first 2 WORDs, one for PUSH and another
+ for CALL, in the red-zone, we can allow local indirect jumps with
+ an indirect thunk. */
- /* Call optimize_mode_switching. */
- g->get_passes ()->execute_pass_mode_switching ();
- return 0;
+bool
+ix86_using_red_zone (void)
+{
+ return (TARGET_RED_ZONE
+ && !TARGET_64BIT_MS_ABI
+ && (!cfun->machine->has_local_indirect_jump
+ || cfun->machine->indirect_branch_type == indirect_branch_keep));
}
-
-/* Return 1 if INSN uses or defines a hard register.
- Hard register uses in a memory address are ignored.
- Clobbers and flags definitions are ignored. */
-
+\f
+/* Return true if profiling code should be emitted before the
+ prologue, false otherwise.
+ Note: For x86 with "hotfix" it is sorried. */
static bool
-has_non_address_hard_reg (rtx_insn *insn)
+ix86_profile_before_prologue (void)
{
- df_ref ref;
- FOR_EACH_INSN_DEF (ref, insn)
- if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
- && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
- && DF_REF_REGNO (ref) != FLAGS_REG)
- return true;
-
- FOR_EACH_INSN_USE (ref, insn)
- if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
- return true;
-
- return false;
+ return flag_fentry != 0;
}
-/* Check if comparison INSN may be transformed
- into vector comparison. Currently we transform
- zero checks only which look like:
-
- (set (reg:CCZ 17 flags)
- (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
- (subreg:SI (reg:DI x) 0))
- (const_int 0 [0]))) */
+/* Update register usage after having seen the compiler flags. */
-static bool
-convertible_comparison_p (rtx_insn *insn)
+static void
+ix86_conditional_register_usage (void)
{
- if (!TARGET_SSE4_1)
- return false;
-
- rtx def_set = single_set (insn);
-
- gcc_assert (def_set);
+ int i, c_mask;
- rtx src = SET_SRC (def_set);
- rtx dst = SET_DEST (def_set);
+ /* If there are no caller-saved registers, preserve all registers
+ except fixed_regs and registers used for the function return value,
+ since aggregate_value_p checks call_used_regs[regno] on the return
+ value. */
+ if (cfun && cfun->machine->no_caller_saved_registers)
+ for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
+ if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
+ call_used_regs[i] = 0;
- gcc_assert (GET_CODE (src) == COMPARE);
+ /* For 32-bit targets, squash the REX registers. */
+ if (! TARGET_64BIT)
+ {
+ for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
+ fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
+ for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
+ fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
+ for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
+ fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
+ }
- if (GET_CODE (dst) != REG
- || REGNO (dst) != FLAGS_REG
- || GET_MODE (dst) != CCZmode)
- return false;
+ /* See the definition of CALL_USED_REGISTERS in i386.h. */
+ c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
+
+ CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
- rtx op1 = XEXP (src, 0);
- rtx op2 = XEXP (src, 1);
+ for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
+ {
+ /* Set/reset conditionally defined registers from
+ CALL_USED_REGISTERS initializer. */
+ if (call_used_regs[i] > 1)
+ call_used_regs[i] = !!(call_used_regs[i] & c_mask);
- if (op2 != CONST0_RTX (GET_MODE (op2)))
- return false;
+ /* Calculate registers of CLOBBERED_REGS register set
+ as call used registers from GENERAL_REGS register set. */
+ if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
+ && call_used_regs[i])
+ SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
+ }
- if (GET_CODE (op1) != IOR)
- return false;
+ /* If MMX is disabled, squash the registers. */
+ if (! TARGET_MMX)
+ for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
+ if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
+ fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
- op2 = XEXP (op1, 1);
- op1 = XEXP (op1, 0);
-
- if (!SUBREG_P (op1)
- || !SUBREG_P (op2)
- || GET_MODE (op1) != SImode
- || GET_MODE (op2) != SImode
- || ((SUBREG_BYTE (op1) != 0
- || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
- && (SUBREG_BYTE (op2) != 0
- || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
- return false;
+ /* If SSE is disabled, squash the registers. */
+ if (! TARGET_SSE)
+ for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
+ if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
+ fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
- op1 = SUBREG_REG (op1);
- op2 = SUBREG_REG (op2);
+ /* If the FPU is disabled, squash the registers. */
+ if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
+ for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
+ if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
+ fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
- if (op1 != op2
- || !REG_P (op1)
- || GET_MODE (op1) != DImode)
- return false;
+ /* If AVX512F is disabled, squash the registers. */
+ if (! TARGET_AVX512F)
+ {
+ for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
+ fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
- return true;
+ for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
+ fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
+ }
}
-/* The DImode version of scalar_to_vector_candidate_p. */
+/* Canonicalize a comparison from one we don't have to one we do have. */
-static bool
-dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
+static void
+ix86_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
+ bool op0_preserve_value)
{
- rtx def_set = single_set (insn);
-
- if (!def_set)
- return false;
-
- if (has_non_address_hard_reg (insn))
- return false;
-
- rtx src = SET_SRC (def_set);
- rtx dst = SET_DEST (def_set);
+ /* The order of operands in the x87 ficom compare is forced by combine
+ in the simplify_comparison () function. The float operator is treated
+ as RTX_OBJ with precedence over other operators and is always put in
+ the first place. Swap the condition and operands to match the ficom
+ instruction. */
+ if (!op0_preserve_value
+ && GET_CODE (*op0) == FLOAT && MEM_P (XEXP (*op0, 0)) && REG_P (*op1))
+ {
+ enum rtx_code scode = swap_condition ((enum rtx_code) *code);
- if (GET_CODE (src) == COMPARE)
- return convertible_comparison_p (insn);
+ /* We are called only for compares that are split to the SAHF
+ instruction. Ensure that we have a setcc/jcc insn for the swapped
+ condition. */
+ if (ix86_fp_compare_code_to_integer (scode) != UNKNOWN)
+ {
+ std::swap (*op0, *op1);
+ *code = (int) scode;
+ }
+ }
+}
+\f
+\f
+/* Hook to determine if one function can safely inline another. */
- /* We are interested in DImode promotion only. */
- if ((GET_MODE (src) != DImode
- && !CONST_INT_P (src))
- || GET_MODE (dst) != DImode)
- return false;
+static bool
+ix86_can_inline_p (tree caller, tree callee)
+{
+ tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
+ tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
- if (!REG_P (dst) && !MEM_P (dst))
- return false;
+ /* Changes to these flags can be tolerated for always_inline functions.
+ Let's hope the user knows what they are doing. */
+ const unsigned HOST_WIDE_INT always_inline_safe_mask
+ = (MASK_USE_8BIT_IDIV | MASK_ACCUMULATE_OUTGOING_ARGS
+ | MASK_NO_ALIGN_STRINGOPS | MASK_AVX256_SPLIT_UNALIGNED_LOAD
+ | MASK_AVX256_SPLIT_UNALIGNED_STORE | MASK_CLD
+ | MASK_NO_FANCY_MATH_387 | MASK_IEEE_FP | MASK_INLINE_ALL_STRINGOPS
+ | MASK_INLINE_STRINGOPS_DYNAMICALLY | MASK_RECIP | MASK_STACK_PROBE
+ | MASK_STV | MASK_TLS_DIRECT_SEG_REFS | MASK_VZEROUPPER
+ | MASK_NO_PUSH_ARGS | MASK_OMIT_LEAF_FRAME_POINTER);
- switch (GET_CODE (src))
- {
- case ASHIFTRT:
- if (!TARGET_AVX512VL)
- return false;
- /* FALLTHRU */
- case ASHIFT:
- case LSHIFTRT:
- if (!CONST_INT_P (XEXP (src, 1))
- || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63))
- return false;
- break;
+ if (!callee_tree)
+ callee_tree = target_option_default_node;
+ if (!caller_tree)
+ caller_tree = target_option_default_node;
+ if (callee_tree == caller_tree)
+ return true;
- case PLUS:
- case MINUS:
- case IOR:
- case XOR:
- case AND:
- if (!REG_P (XEXP (src, 1))
- && !MEM_P (XEXP (src, 1))
- && !CONST_INT_P (XEXP (src, 1)))
- return false;
+ struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
+ struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
+ bool ret = false;
+ bool always_inline
+ = (DECL_DISREGARD_INLINE_LIMITS (callee)
+ && lookup_attribute ("always_inline",
+ DECL_ATTRIBUTES (callee)));
- if (GET_MODE (XEXP (src, 1)) != DImode
- && !CONST_INT_P (XEXP (src, 1)))
- return false;
- break;
+ cgraph_node *callee_node = cgraph_node::get (callee);
+ /* The callee's ISA options should be a subset of the caller's, i.e. an
+ SSE4 function can inline an SSE2 function but an SSE2 function can't
+ inline an SSE4 function. */
+ if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
+ != callee_opts->x_ix86_isa_flags)
+ || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2)
+ != callee_opts->x_ix86_isa_flags2))
+ ret = false;
- case NEG:
- case NOT:
- break;
+ /* See if we have the same non-isa options. */
+ else if ((!always_inline
+ && caller_opts->x_target_flags != callee_opts->x_target_flags)
+ || (caller_opts->x_target_flags & ~always_inline_safe_mask)
+ != (callee_opts->x_target_flags & ~always_inline_safe_mask))
+ ret = false;
- case REG:
- return true;
+ /* See if arch, tune, etc. are the same. */
+ else if (caller_opts->arch != callee_opts->arch)
+ ret = false;
- case MEM:
- case CONST_INT:
- return REG_P (dst);
+ else if (!always_inline && caller_opts->tune != callee_opts->tune)
+ ret = false;
- default:
- return false;
- }
+ else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath
+ /* If the callee doesn't use FP expressions, differences in
+ ix86_fpmath can be ignored. We are called from FEs
+ for multi-versioning call optimization, so beware of
+ ipa_fn_summaries not being available. */
+ && (! ipa_fn_summaries
+ || ipa_fn_summaries->get (callee_node) == NULL
+ || ipa_fn_summaries->get (callee_node)->fp_expressions))
+ ret = false;
- if (!REG_P (XEXP (src, 0))
- && !MEM_P (XEXP (src, 0))
- && !CONST_INT_P (XEXP (src, 0))
- /* Check for andnot case. */
- && (GET_CODE (src) != AND
- || GET_CODE (XEXP (src, 0)) != NOT
- || !REG_P (XEXP (XEXP (src, 0), 0))))
- return false;
+ else if (!always_inline
+ && caller_opts->branch_cost != callee_opts->branch_cost)
+ ret = false;
- if (GET_MODE (XEXP (src, 0)) != DImode
- && !CONST_INT_P (XEXP (src, 0)))
- return false;
+ else
+ ret = true;
- return true;
+ return ret;
}
-
-/* The TImode version of scalar_to_vector_candidate_p. */
+\f
+/* Return true if this goes in large data/bss. */
static bool
-timode_scalar_to_vector_candidate_p (rtx_insn *insn)
+ix86_in_large_data_p (tree exp)
{
- rtx def_set = single_set (insn);
-
- if (!def_set)
+ if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
return false;
- if (has_non_address_hard_reg (insn))
+ if (exp == NULL_TREE)
return false;
- rtx src = SET_SRC (def_set);
- rtx dst = SET_DEST (def_set);
-
- /* Only TImode load and store are allowed. */
- if (GET_MODE (dst) != TImode)
+ /* Functions are never large data. */
+ if (TREE_CODE (exp) == FUNCTION_DECL)
return false;
- if (MEM_P (dst))
- {
- /* Check for store. Memory must be aligned or unaligned store
- is optimal. Only support store from register, standard SSE
- constant or CONST_WIDE_INT generated from piecewise store.
-
- ??? Verify performance impact before enabling CONST_INT for
- __int128 store. */
- if (misaligned_operand (dst, TImode)
- && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
- return false;
-
- switch (GET_CODE (src))
- {
- default:
- return false;
-
- case REG:
- case CONST_WIDE_INT:
- return true;
+ /* Automatic variables are never large data. */
+ if (VAR_P (exp) && !is_global_var (exp))
+ return false;
- case CONST_INT:
- return standard_sse_constant_p (src, TImode);
- }
- }
- else if (MEM_P (src))
+ if (VAR_P (exp) && DECL_SECTION_NAME (exp))
{
- /* Check for load. Memory must be aligned or unaligned load is
- optimal. */
- return (REG_P (dst)
- && (!misaligned_operand (src, TImode)
- || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
+ const char *section = DECL_SECTION_NAME (exp);
+ if (strcmp (section, ".ldata") == 0
+ || strcmp (section, ".lbss") == 0)
+ return true;
+ return false;
}
-
- return false;
-}
-
-/* Return 1 if INSN may be converted into vector
- instruction. */
-
-static bool
-scalar_to_vector_candidate_p (rtx_insn *insn)
-{
- if (TARGET_64BIT)
- return timode_scalar_to_vector_candidate_p (insn);
else
- return dimode_scalar_to_vector_candidate_p (insn);
-}
-
-/* The DImode version of remove_non_convertible_regs. */
-
-static void
-dimode_remove_non_convertible_regs (bitmap candidates)
-{
- bitmap_iterator bi;
- unsigned id;
- bitmap regs = BITMAP_ALLOC (NULL);
-
- EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
- {
- rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
- rtx reg = SET_DEST (def_set);
-
- if (!REG_P (reg)
- || bitmap_bit_p (regs, REGNO (reg))
- || HARD_REGISTER_P (reg))
- continue;
-
- for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
- def;
- def = DF_REF_NEXT_REG (def))
- {
- if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
- {
- if (dump_file)
- fprintf (dump_file,
- "r%d has non convertible definition in insn %d\n",
- REGNO (reg), DF_REF_INSN_UID (def));
-
- bitmap_set_bit (regs, REGNO (reg));
- break;
- }
- }
- }
-
- EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
{
- for (df_ref def = DF_REG_DEF_CHAIN (id);
- def;
- def = DF_REF_NEXT_REG (def))
- if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
- {
- if (dump_file)
- fprintf (dump_file, "Removing insn %d from candidates list\n",
- DF_REF_INSN_UID (def));
+ HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
- bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
- }
+ /* If this is an incomplete type with size 0, then we can't put it
+ in data because it might be too big when completed. Also,
+ int_size_in_bytes returns -1 if the size can vary or is larger than
+ an integer, in which case it is also safer to assume that it goes in
+ large data. */
+ if (size <= 0 || size > ix86_section_threshold)
+ return true;
}
- BITMAP_FREE (regs);
+ return false;
}
-/* For a register REGNO, scan instructions for its defs and uses.
- Put REGNO in REGS if a def or use isn't in CANDIDATES. */
+/* i386-specific section flag to mark large sections. */
+#define SECTION_LARGE SECTION_MACH_DEP
-static void
-timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
- unsigned int regno)
+/* Switch to the appropriate section for output of DECL.
+ DECL is either a `VAR_DECL' node or a constant of some sort.
+ RELOC indicates whether forming the initial value of DECL requires
+ link-time relocations. */
+
+ATTRIBUTE_UNUSED static section *
+x86_64_elf_select_section (tree decl, int reloc,
+ unsigned HOST_WIDE_INT align)
{
- for (df_ref def = DF_REG_DEF_CHAIN (regno);
- def;
- def = DF_REF_NEXT_REG (def))
+ if (ix86_in_large_data_p (decl))
{
- if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
+ const char *sname = NULL;
+ unsigned int flags = SECTION_WRITE | SECTION_LARGE;
+ switch (categorize_decl_for_section (decl, reloc))
{
- if (dump_file)
- fprintf (dump_file,
- "r%d has non convertible def in insn %d\n",
- regno, DF_REF_INSN_UID (def));
-
- bitmap_set_bit (regs, regno);
+ case SECCAT_DATA:
+ sname = ".ldata";
+ break;
+ case SECCAT_DATA_REL:
+ sname = ".ldata.rel";
+ break;
+ case SECCAT_DATA_REL_LOCAL:
+ sname = ".ldata.rel.local";
+ break;
+ case SECCAT_DATA_REL_RO:
+ sname = ".ldata.rel.ro";
+ break;
+ case SECCAT_DATA_REL_RO_LOCAL:
+ sname = ".ldata.rel.ro.local";
+ break;
+ case SECCAT_BSS:
+ sname = ".lbss";
+ flags |= SECTION_BSS;
+ break;
+ case SECCAT_RODATA:
+ case SECCAT_RODATA_MERGE_STR:
+ case SECCAT_RODATA_MERGE_STR_INIT:
+ case SECCAT_RODATA_MERGE_CONST:
+ sname = ".lrodata";
+ flags &= ~SECTION_WRITE;
+ break;
+ case SECCAT_SRODATA:
+ case SECCAT_SDATA:
+ case SECCAT_SBSS:
+ gcc_unreachable ();
+ case SECCAT_TEXT:
+ case SECCAT_TDATA:
+ case SECCAT_TBSS:
+ /* We don't split these for the medium model. Place them into
+ default sections and hope for the best. */
break;
}
- }
-
- for (df_ref ref = DF_REG_USE_CHAIN (regno);
- ref;
- ref = DF_REF_NEXT_REG (ref))
- {
- /* Debug instructions are skipped. */
- if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
- && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
+ if (sname)
{
- if (dump_file)
- fprintf (dump_file,
- "r%d has non convertible use in insn %d\n",
- regno, DF_REF_INSN_UID (ref));
-
- bitmap_set_bit (regs, regno);
- break;
+ /* We might get called with string constants, but get_named_section
+ doesn't like them as they are not DECLs. Also, we need to set
+ flags in that case. */
+ if (!DECL_P (decl))
+ return get_section (sname, flags, NULL);
+ return get_named_section (decl, sname, reloc);
}
}
+ return default_elf_select_section (decl, reloc, align);
}
-/* The TImode version of remove_non_convertible_regs. */
+/* Select a set of attributes for section NAME based on the properties
+ of DECL and whether or not RELOC indicates that DECL's initializer
+ might contain runtime relocations. */
-static void
-timode_remove_non_convertible_regs (bitmap candidates)
+static unsigned int ATTRIBUTE_UNUSED
+x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
{
- bitmap_iterator bi;
- unsigned id;
- bitmap regs = BITMAP_ALLOC (NULL);
-
- EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
- {
- rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
- rtx dest = SET_DEST (def_set);
- rtx src = SET_SRC (def_set);
-
- if ((!REG_P (dest)
- || bitmap_bit_p (regs, REGNO (dest))
- || HARD_REGISTER_P (dest))
- && (!REG_P (src)
- || bitmap_bit_p (regs, REGNO (src))
- || HARD_REGISTER_P (src)))
- continue;
-
- if (REG_P (dest))
- timode_check_non_convertible_regs (candidates, regs,
- REGNO (dest));
-
- if (REG_P (src))
- timode_check_non_convertible_regs (candidates, regs,
- REGNO (src));
- }
-
- EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
- {
- for (df_ref def = DF_REG_DEF_CHAIN (id);
- def;
- def = DF_REF_NEXT_REG (def))
- if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
- {
- if (dump_file)
- fprintf (dump_file, "Removing insn %d from candidates list\n",
- DF_REF_INSN_UID (def));
+ unsigned int flags = default_section_type_flags (decl, name, reloc);
- bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
- }
+ if (ix86_in_large_data_p (decl))
+ flags |= SECTION_LARGE;
- for (df_ref ref = DF_REG_USE_CHAIN (id);
- ref;
- ref = DF_REF_NEXT_REG (ref))
- if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
- {
- if (dump_file)
- fprintf (dump_file, "Removing insn %d from candidates list\n",
- DF_REF_INSN_UID (ref));
+ if (decl == NULL_TREE
+ && (strcmp (name, ".ldata.rel.ro") == 0
+ || strcmp (name, ".ldata.rel.ro.local") == 0))
+ flags |= SECTION_RELRO;
- bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
- }
- }
+ if (strcmp (name, ".lbss") == 0
+ || strncmp (name, ".lbss.", 5) == 0
+ || strncmp (name, ".gnu.linkonce.lb.", 16) == 0)
+ flags |= SECTION_BSS;
- BITMAP_FREE (regs);
+ return flags;
}
-/* For a given bitmap of insn UIDs scans all instruction and
- remove insn from CANDIDATES in case it has both convertible
- and not convertible definitions.
+/* Build up a unique section name, expressed as a
+ STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
+ RELOC indicates whether the initial value of EXP requires
+ link-time relocations. */
- All insns in a bitmap are conversion candidates according to
- scalar_to_vector_candidate_p. Currently it implies all insns
- are single_set. */
-
-static void
-remove_non_convertible_regs (bitmap candidates)
-{
- if (TARGET_64BIT)
- timode_remove_non_convertible_regs (candidates);
- else
- dimode_remove_non_convertible_regs (candidates);
-}
-
-class scalar_chain
-{
- public:
- scalar_chain ();
- virtual ~scalar_chain ();
-
- static unsigned max_id;
-
- /* ID of a chain. */
- unsigned int chain_id;
- /* A queue of instructions to be included into a chain. */
- bitmap queue;
- /* Instructions included into a chain. */
- bitmap insns;
- /* All registers defined by a chain. */
- bitmap defs;
- /* Registers used in both vector and sclar modes. */
- bitmap defs_conv;
-
- void build (bitmap candidates, unsigned insn_uid);
- virtual int compute_convert_gain () = 0;
- int convert ();
-
- protected:
- void add_to_queue (unsigned insn_uid);
- void emit_conversion_insns (rtx insns, rtx_insn *pos);
-
- private:
- void add_insn (bitmap candidates, unsigned insn_uid);
- void analyze_register_chain (bitmap candidates, df_ref ref);
- virtual void mark_dual_mode_def (df_ref def) = 0;
- virtual void convert_insn (rtx_insn *insn) = 0;
- virtual void convert_registers () = 0;
-};
-
-class dimode_scalar_chain : public scalar_chain
-{
- public:
- int compute_convert_gain ();
- private:
- void mark_dual_mode_def (df_ref def);
- rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
- void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
- void convert_insn (rtx_insn *insn);
- void convert_op (rtx *op, rtx_insn *insn);
- void convert_reg (unsigned regno);
- void make_vector_copies (unsigned regno);
- void convert_registers ();
- int vector_const_cost (rtx exp);
-};
-
-class timode_scalar_chain : public scalar_chain
+static void ATTRIBUTE_UNUSED
+x86_64_elf_unique_section (tree decl, int reloc)
{
- public:
- /* Convert from TImode to V1TImode is always faster. */
- int compute_convert_gain () { return 1; }
-
- private:
- void mark_dual_mode_def (df_ref def);
- void fix_debug_reg_uses (rtx reg);
- void convert_insn (rtx_insn *insn);
- /* We don't convert registers to difference size. */
- void convert_registers () {}
-};
-
-unsigned scalar_chain::max_id = 0;
-
-/* Initialize new chain. */
+ if (ix86_in_large_data_p (decl))
+ {
+ const char *prefix = NULL;
+ /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
+ bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
-scalar_chain::scalar_chain ()
-{
- chain_id = ++max_id;
+ switch (categorize_decl_for_section (decl, reloc))
+ {
+ case SECCAT_DATA:
+ case SECCAT_DATA_REL:
+ case SECCAT_DATA_REL_LOCAL:
+ case SECCAT_DATA_REL_RO:
+ case SECCAT_DATA_REL_RO_LOCAL:
+ prefix = one_only ? ".ld" : ".ldata";
+ break;
+ case SECCAT_BSS:
+ prefix = one_only ? ".lb" : ".lbss";
+ break;
+ case SECCAT_RODATA:
+ case SECCAT_RODATA_MERGE_STR:
+ case SECCAT_RODATA_MERGE_STR_INIT:
+ case SECCAT_RODATA_MERGE_CONST:
+ prefix = one_only ? ".lr" : ".lrodata";
+ break;
+ case SECCAT_SRODATA:
+ case SECCAT_SDATA:
+ case SECCAT_SBSS:
+ gcc_unreachable ();
+ case SECCAT_TEXT:
+ case SECCAT_TDATA:
+ case SECCAT_TBSS:
+ /* We don't split these for the medium model. Place them into
+ default sections and hope for the best. */
+ break;
+ }
+ if (prefix)
+ {
+ const char *name, *linkonce;
+ char *string;
- if (dump_file)
- fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
+ name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
+ name = targetm.strip_name_encoding (name);
- bitmap_obstack_initialize (NULL);
- insns = BITMAP_ALLOC (NULL);
- defs = BITMAP_ALLOC (NULL);
- defs_conv = BITMAP_ALLOC (NULL);
- queue = NULL;
-}
+ /* If we're using one_only, then there needs to be a .gnu.linkonce
+ prefix to the section name. */
+ linkonce = one_only ? ".gnu.linkonce" : "";
-/* Free chain's data. */
+ string = ACONCAT ((linkonce, prefix, ".", name, NULL));
-scalar_chain::~scalar_chain ()
-{
- BITMAP_FREE (insns);
- BITMAP_FREE (defs);
- BITMAP_FREE (defs_conv);
- bitmap_obstack_release (NULL);
+ set_decl_section_name (decl, string);
+ return;
+ }
+ }
+ default_unique_section (decl, reloc);
}
-/* Add instruction into chains' queue. */
-
-void
-scalar_chain::add_to_queue (unsigned insn_uid)
-{
- if (bitmap_bit_p (insns, insn_uid)
- || bitmap_bit_p (queue, insn_uid))
- return;
+#ifdef COMMON_ASM_OP
- if (dump_file)
- fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
- insn_uid, chain_id);
- bitmap_set_bit (queue, insn_uid);
-}
+#ifndef LARGECOMM_SECTION_ASM_OP
+#define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
+#endif
-/* For DImode conversion, mark register defined by DEF as requiring
- conversion. */
+/* This says how to output assembler code to declare an
+ uninitialized external linkage data object.
+ For medium-model x86-64 we need to use the LARGECOMM_SECTION_ASM_OP
+ directive for large objects. */
void
-dimode_scalar_chain::mark_dual_mode_def (df_ref def)
+x86_elf_aligned_decl_common (FILE *file, tree decl,
+ const char *name, unsigned HOST_WIDE_INT size,
+ int align)
{
- gcc_assert (DF_REF_REG_DEF_P (def));
-
- if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
- return;
-
- if (dump_file)
- fprintf (dump_file,
- " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
- DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
-
- bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
+ if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
+ && size > (unsigned int)ix86_section_threshold)
+ {
+ switch_to_section (get_named_section (decl, ".lbss", 0));
+ fputs (LARGECOMM_SECTION_ASM_OP, file);
+ }
+ else
+ fputs (COMMON_ASM_OP, file);
+ assemble_name (file, name);
+ fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
+ size, align / BITS_PER_UNIT);
}
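+/* For example, with -mcmodel=medium an uninitialized common object larger
+ than -mlarge-data-threshold is declared roughly as
+ .largecomm big_buf,1048576,32
+ (size in bytes, alignment converted from bits to bytes), while smaller
+ objects keep using the ordinary .comm directive. */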
+#endif
-/* For TImode conversion, it is unused. */
+/* Utility function for targets to use in implementing
+ ASM_OUTPUT_ALIGNED_BSS. */
void
-timode_scalar_chain::mark_dual_mode_def (df_ref)
+x86_output_aligned_bss (FILE *file, tree decl, const char *name,
+ unsigned HOST_WIDE_INT size, int align)
{
- gcc_unreachable ();
+ if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
+ && size > (unsigned int)ix86_section_threshold)
+ switch_to_section (get_named_section (decl, ".lbss", 0));
+ else
+ switch_to_section (bss_section);
+ ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
+#ifdef ASM_DECLARE_OBJECT_NAME
+ last_assemble_variable_decl = decl;
+ ASM_DECLARE_OBJECT_NAME (file, name, decl);
+#else
+ /* The standard thing is to just output a label for the object. */
+ ASM_OUTPUT_LABEL (file, name);
+#endif /* ASM_DECLARE_OBJECT_NAME */
+ ASM_OUTPUT_SKIP (file, size ? size : 1);
}
+\f
+/* Decide whether we must probe the stack before any space allocation
+ on this target. It's essentially TARGET_STACK_PROBE except when
+ -fstack-check causes the stack to be already probed differently. */
-/* Check REF's chain to add new insns into a queue
- and find registers requiring conversion. */
-
-void
-scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
+bool
+ix86_target_stack_probe (void)
{
- df_link *chain;
-
- gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
- || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
- add_to_queue (DF_REF_INSN_UID (ref));
-
- for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
- {
- unsigned uid = DF_REF_INSN_UID (chain->ref);
-
- if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
- continue;
-
- if (!DF_REF_REG_MEM_P (chain->ref))
- {
- if (bitmap_bit_p (insns, uid))
- continue;
-
- if (bitmap_bit_p (candidates, uid))
- {
- add_to_queue (uid);
- continue;
- }
- }
+ /* Do not probe the stack twice if static stack checking is enabled. */
+ if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
+ return false;
- if (DF_REF_REG_DEF_P (chain->ref))
- {
- if (dump_file)
- fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
- DF_REF_REGNO (chain->ref), uid);
- mark_dual_mode_def (chain->ref);
- }
- else
- {
- if (dump_file)
- fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
- DF_REF_REGNO (chain->ref), uid);
- mark_dual_mode_def (ref);
- }
- }
+ return TARGET_STACK_PROBE;
}
+\f
+/* Decide whether we can make a sibling call to a function. DECL is the
+ declaration of the function being targeted by the call and EXP is the
+ CALL_EXPR representing the call. */
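+/* A sibling call replaces the "call f; ...; ret" tail with a direct
+ "jmp f", reusing the caller's frame and return address, so the checks
+ below reject any case where caller and callee would disagree on stack
+ layout, return-value location or clobbered registers. */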
-/* Add instruction into a chain. */
-
-void
-scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
+static bool
+ix86_function_ok_for_sibcall (tree decl, tree exp)
{
- if (bitmap_bit_p (insns, insn_uid))
- return;
-
- if (dump_file)
- fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
-
- bitmap_set_bit (insns, insn_uid);
+ tree type, decl_or_type;
+ rtx a, b;
+ bool bind_global = decl && !targetm.binds_local_p (decl);
- rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
- rtx def_set = single_set (insn);
- if (def_set && REG_P (SET_DEST (def_set))
- && !HARD_REGISTER_P (SET_DEST (def_set)))
- bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
+ if (ix86_function_naked (current_function_decl))
+ return false;
- df_ref ref;
- df_ref def;
- for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
- if (!HARD_REGISTER_P (DF_REF_REG (ref)))
- for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
- def;
- def = DF_REF_NEXT_REG (def))
- analyze_register_chain (candidates, def);
- for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
- if (!DF_REF_REG_MEM_P (ref))
- analyze_register_chain (candidates, ref);
-}
+ /* Sibling call isn't OK if there are no caller-saved registers
+ since all registers must be preserved before return. */
+ if (cfun->machine->no_caller_saved_registers)
+ return false;
-/* Build new chain starting from insn INSN_UID recursively
- adding all dependent uses and definitions. */
+ /* If we are generating position-independent code, we cannot sibcall
+ optimize direct calls to global functions, as the PLT requires
+ %ebx be live. (Darwin does not have a PLT.) */
+ if (!TARGET_MACHO
+ && !TARGET_64BIT
+ && flag_pic
+ && flag_plt
+ && bind_global)
+ return false;
-void
-scalar_chain::build (bitmap candidates, unsigned insn_uid)
-{
- queue = BITMAP_ALLOC (NULL);
- bitmap_set_bit (queue, insn_uid);
+ /* If we need to align the outgoing stack, then sibcalling would
+ unalign the stack, which may break the called function. */
+ if (ix86_minimum_incoming_stack_boundary (true)
+ < PREFERRED_STACK_BOUNDARY)
+ return false;
- if (dump_file)
- fprintf (dump_file, "Building chain #%d...\n", chain_id);
+ if (decl)
+ {
+ decl_or_type = decl;
+ type = TREE_TYPE (decl);
+ }
+ else
+ {
+ /* We're looking at the CALL_EXPR; we need the type of the function. */
+ type = CALL_EXPR_FN (exp); /* pointer expression */
+ type = TREE_TYPE (type); /* pointer type */
+ type = TREE_TYPE (type); /* function type */
+ decl_or_type = type;
+ }
- while (!bitmap_empty_p (queue))
+ /* Check that the return value locations are the same. For example,
+ if we are returning floats on the 80387 register stack, we cannot
+ make a sibcall from a function that doesn't return a float to a
+ function that does or, conversely, from a function that does return
+ a float to a function that doesn't; the necessary stack adjustment
+ would not be executed. This is also the place we notice
+ differences in the return value ABI. Note that it is ok for one
+ of the functions to have void return type as long as the return
+ value of the other is passed in a register. */
+ a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
+ b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
+ cfun->decl, false);
+ if (STACK_REG_P (a) || STACK_REG_P (b))
{
- insn_uid = bitmap_first_set_bit (queue);
- bitmap_clear_bit (queue, insn_uid);
- bitmap_clear_bit (candidates, insn_uid);
- add_insn (candidates, insn_uid);
+ if (!rtx_equal_p (a, b))
+ return false;
}
+ else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
+ ;
+ else if (!rtx_equal_p (a, b))
+ return false;
- if (dump_file)
+ if (TARGET_64BIT)
+ {
+ /* The SYSV ABI has more call-clobbered registers;
+ disallow sibcalls from MS to SYSV. */
+ if (cfun->machine->call_abi == MS_ABI
+ && ix86_function_type_abi (type) == SYSV_ABI)
+ return false;
+ }
+ else
{
- fprintf (dump_file, "Collected chain #%d...\n", chain_id);
- fprintf (dump_file, " insns: ");
- dump_bitmap (dump_file, insns);
- if (!bitmap_empty_p (defs_conv))
+ /* If this call is indirect, we'll need to be able to use a
+ call-clobbered register for the address of the target function.
+ Make sure that all such registers are not used for passing
+ parameters. Note that DLLIMPORT functions and calls to global
+ functions via the GOT slot are indirect. */
+ if (!decl
+ || (bind_global && flag_pic && !flag_plt)
+ || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl))
+ || flag_force_indirect_call)
{
- bitmap_iterator bi;
- unsigned id;
- const char *comma = "";
- fprintf (dump_file, " defs to convert: ");
- EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
- {
- fprintf (dump_file, "%sr%d", comma, id);
- comma = ", ";
- }
- fprintf (dump_file, "\n");
+ /* Check if regparm >= 3 since arg_reg_available is set to
+ false if regparm == 0. If regparm is 1 or 2, there is
+ always a call-clobbered register available.
+
+ ??? The symbol indirect call doesn't need a call-clobbered
+ register. But we don't know if this is a symbol indirect
+ call or not here. */
+ if (ix86_function_regparm (type, decl) >= 3
+ && !cfun->machine->arg_reg_available)
+ return false;
}
}
- BITMAP_FREE (queue);
+ /* Otherwise okay. That also includes certain types of indirect calls. */
+ return true;
}
-/* Return a cost of building a vector costant
- instead of using a scalar one. */
+/* This function determines the calling convention from TYPE. */
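+/* For example, on 32-bit targets a type carrying the "stdcall" attribute
+ yields IX86_CALLCVT_STDCALL, a plain METHOD_TYPE under the MS ABI falls
+ back to IX86_CALLCVT_THISCALL and, absent -mrtd, everything else
+ defaults to IX86_CALLCVT_CDECL; 64-bit targets always report
+ IX86_CALLCVT_CDECL. */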
-int
-dimode_scalar_chain::vector_const_cost (rtx exp)
+unsigned int
+ix86_get_callcvt (const_tree type)
{
- gcc_assert (CONST_INT_P (exp));
+ unsigned int ret = 0;
+ bool is_stdarg;
+ tree attrs;
- if (standard_sse_constant_p (exp, V2DImode))
- return COSTS_N_INSNS (1);
- return ix86_cost->sse_load[1];
-}
+ if (TARGET_64BIT)
+ return IX86_CALLCVT_CDECL;
-/* Compute a gain for chain conversion. */
+ attrs = TYPE_ATTRIBUTES (type);
+ if (attrs != NULL_TREE)
+ {
+ if (lookup_attribute ("cdecl", attrs))
+ ret |= IX86_CALLCVT_CDECL;
+ else if (lookup_attribute ("stdcall", attrs))
+ ret |= IX86_CALLCVT_STDCALL;
+ else if (lookup_attribute ("fastcall", attrs))
+ ret |= IX86_CALLCVT_FASTCALL;
+ else if (lookup_attribute ("thiscall", attrs))
+ ret |= IX86_CALLCVT_THISCALL;
-int
-dimode_scalar_chain::compute_convert_gain ()
-{
- bitmap_iterator bi;
- unsigned insn_uid;
- int gain = 0;
- int cost = 0;
-
- if (dump_file)
- fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
-
- EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
- {
- rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
- rtx def_set = single_set (insn);
- rtx src = SET_SRC (def_set);
- rtx dst = SET_DEST (def_set);
-
- if (REG_P (src) && REG_P (dst))
- gain += COSTS_N_INSNS (2) - ix86_cost->xmm_move;
- else if (REG_P (src) && MEM_P (dst))
- gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
- else if (MEM_P (src) && REG_P (dst))
- gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
- else if (GET_CODE (src) == ASHIFT
- || GET_CODE (src) == ASHIFTRT
- || GET_CODE (src) == LSHIFTRT)
- {
- if (CONST_INT_P (XEXP (src, 0)))
- gain -= vector_const_cost (XEXP (src, 0));
-
- gain += ix86_cost->shift_const;
- if (INTVAL (XEXP (src, 1)) >= 32)
- gain -= COSTS_N_INSNS (1);
- }
- else if (GET_CODE (src) == PLUS
- || GET_CODE (src) == MINUS
- || GET_CODE (src) == IOR
- || GET_CODE (src) == XOR
- || GET_CODE (src) == AND)
- {
- gain += ix86_cost->add;
- /* Additional gain for andnot for targets without BMI. */
- if (GET_CODE (XEXP (src, 0)) == NOT
- && !TARGET_BMI)
- gain += 2 * ix86_cost->add;
-
- if (CONST_INT_P (XEXP (src, 0)))
- gain -= vector_const_cost (XEXP (src, 0));
- if (CONST_INT_P (XEXP (src, 1)))
- gain -= vector_const_cost (XEXP (src, 1));
- }
- else if (GET_CODE (src) == NEG
- || GET_CODE (src) == NOT)
- gain += ix86_cost->add - COSTS_N_INSNS (1);
- else if (GET_CODE (src) == COMPARE)
- {
- /* Assume comparison cost is the same. */
- }
- else if (CONST_INT_P (src))
- {
- if (REG_P (dst))
- gain += COSTS_N_INSNS (2);
- else if (MEM_P (dst))
- gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
- gain -= vector_const_cost (src);
+ /* Regparm isn't allowed for thiscall and fastcall. */
+ if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
+ {
+ if (lookup_attribute ("regparm", attrs))
+ ret |= IX86_CALLCVT_REGPARM;
+ if (lookup_attribute ("sseregparm", attrs))
+ ret |= IX86_CALLCVT_SSEREGPARM;
}
- else
- gcc_unreachable ();
- }
- if (dump_file)
- fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
-
- EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
- cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
-
- if (dump_file)
- fprintf (dump_file, " Registers conversion cost: %d\n", cost);
+ if (IX86_BASE_CALLCVT(ret) != 0)
+ return ret;
+ }
- gain -= cost;
+ is_stdarg = stdarg_p (type);
+ if (TARGET_RTD && !is_stdarg)
+ return IX86_CALLCVT_STDCALL | ret;
- if (dump_file)
- fprintf (dump_file, " Total gain: %d\n", gain);
+ if (ret != 0
+ || is_stdarg
+ || TREE_CODE (type) != METHOD_TYPE
+ || ix86_function_type_abi (type) != MS_ABI)
+ return IX86_CALLCVT_CDECL | ret;
- return gain;
+ return IX86_CALLCVT_THISCALL;
}
-/* Replace REG in X with a V2DI subreg of NEW_REG. */
+/* Return 0 if the attributes for two types are incompatible, 1 if they
+ are compatible, and 2 if they are nearly compatible (which causes a
+ warning to be generated). */
-rtx
-dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
+static int
+ix86_comp_type_attributes (const_tree type1, const_tree type2)
{
- if (x == reg)
- return gen_rtx_SUBREG (V2DImode, new_reg, 0);
+ unsigned int ccvt1, ccvt2;
- const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
- int i, j;
- for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
- {
- if (fmt[i] == 'e')
- XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
- else if (fmt[i] == 'E')
- for (j = XVECLEN (x, i) - 1; j >= 0; j--)
- XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
- reg, new_reg);
- }
+ if (TREE_CODE (type1) != FUNCTION_TYPE
+ && TREE_CODE (type1) != METHOD_TYPE)
+ return 1;
- return x;
-}
+ ccvt1 = ix86_get_callcvt (type1);
+ ccvt2 = ix86_get_callcvt (type2);
+ if (ccvt1 != ccvt2)
+ return 0;
+ if (ix86_function_regparm (type1, NULL)
+ != ix86_function_regparm (type2, NULL))
+ return 0;
-/* Replace REG in INSN with a V2DI subreg of NEW_REG. */
+ return 1;
+}
+\f
+/* Return the regparm value for a function with the indicated TYPE and DECL.
+ DECL may be NULL when the function is called indirectly
+ or considering a libcall. */
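+/* As a rough guide: an explicit regparm(N) attribute yields N (arguments
+ then go in %eax, %edx and %ecx), fastcall yields 2 and thiscall 1,
+ while 64-bit targets simply report the ABI register count (6 for the
+ SysV ABI, 4 for the MS ABI). */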
-void
-dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn,
- rtx reg, rtx new_reg)
+static int
+ix86_function_regparm (const_tree type, const_tree decl)
{
- replace_with_subreg (single_set (insn), reg, new_reg);
-}
+ tree attr;
+ int regparm;
+ unsigned int ccvt;
-/* Insert generated conversion instruction sequence INSNS
- after instruction AFTER. New BB may be required in case
- instruction has EH region attached. */
+ if (TARGET_64BIT)
+ return (ix86_function_type_abi (type) == SYSV_ABI
+ ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
+ ccvt = ix86_get_callcvt (type);
+ regparm = ix86_regparm;
-void
-scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
-{
- if (!control_flow_insn_p (after))
+ if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
{
- emit_insn_after (insns, after);
- return;
+ attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
+ if (attr)
+ {
+ regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
+ return regparm;
+ }
}
+ else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
+ return 2;
+ else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
+ return 1;
- basic_block bb = BLOCK_FOR_INSN (after);
- edge e = find_fallthru_edge (bb->succs);
- gcc_assert (e);
+ /* Use register calling convention for local functions when possible. */
+ if (decl
+ && TREE_CODE (decl) == FUNCTION_DECL)
+ {
+ cgraph_node *target = cgraph_node::get (decl);
+ if (target)
+ target = target->function_symbol ();
- basic_block new_bb = split_edge (e);
- emit_insn_after (insns, BB_HEAD (new_bb));
-}
+ /* Caller and callee must agree on the calling convention, so
+ checking just `optimize' here would mean that with
+ __attribute__((optimize (...))) the caller could use the regparm
+ convention while the callee does not, or vice versa. Instead look
+ at whether the callee is optimized or not. */
+ if (target && opt_for_fn (target->decl, optimize)
+ && !(profile_flag && !flag_fentry))
+ {
+ cgraph_local_info *i = &target->local;
+ if (i && i->local && i->can_change_signature)
+ {
+ int local_regparm, globals = 0, regno;
-/* Make vector copies for all register REGNO definitions
- and replace its uses in a chain. */
+ /* Make sure no regparm register is taken by a
+ fixed register variable. */
+ for (local_regparm = 0; local_regparm < REGPARM_MAX;
+ local_regparm++)
+ if (fixed_regs[local_regparm])
+ break;
-void
-dimode_scalar_chain::make_vector_copies (unsigned regno)
-{
- rtx reg = regno_reg_rtx[regno];
- rtx vreg = gen_reg_rtx (DImode);
- df_ref ref;
+ /* We don't want to use regparm(3) for nested functions as
+ these use a static chain pointer in the third argument. */
+ if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
+ local_regparm = 2;
- for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
- if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
- {
- start_sequence ();
+ /* Save a register for the split stack. */
+ if (flag_split_stack)
+ {
+ if (local_regparm == 3)
+ local_regparm = 2;
+ else if (local_regparm == 2
+ && DECL_STATIC_CHAIN (target->decl))
+ local_regparm = 1;
+ }
- if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
- {
- rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
- emit_move_insn (adjust_address (tmp, SImode, 0),
- gen_rtx_SUBREG (SImode, reg, 0));
- emit_move_insn (adjust_address (tmp, SImode, 4),
- gen_rtx_SUBREG (SImode, reg, 4));
- emit_move_insn (vreg, tmp);
- }
- else if (TARGET_SSE4_1)
- {
- emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
- CONST0_RTX (V4SImode),
- gen_rtx_SUBREG (SImode, reg, 0)));
- emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
- gen_rtx_SUBREG (V4SImode, vreg, 0),
- gen_rtx_SUBREG (SImode, reg, 4),
- GEN_INT (2)));
- }
- else
- {
- rtx tmp = gen_reg_rtx (DImode);
- emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
- CONST0_RTX (V4SImode),
- gen_rtx_SUBREG (SImode, reg, 0)));
- emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
- CONST0_RTX (V4SImode),
- gen_rtx_SUBREG (SImode, reg, 4)));
- emit_insn (gen_vec_interleave_lowv4si
- (gen_rtx_SUBREG (V4SImode, vreg, 0),
- gen_rtx_SUBREG (V4SImode, vreg, 0),
- gen_rtx_SUBREG (V4SImode, tmp, 0)));
- }
- rtx_insn *seq = get_insns ();
- end_sequence ();
- rtx_insn *insn = DF_REF_INSN (ref);
- emit_conversion_insns (seq, insn);
-
- if (dump_file)
- fprintf (dump_file,
- " Copied r%d to a vector register r%d for insn %d\n",
- regno, REGNO (vreg), INSN_UID (insn));
- }
+ /* Each fixed register usage increases register pressure,
+ so fewer registers should be used for argument passing.
+ This functionality can be overridden by an explicit
+ regparm value. */
+ for (regno = AX_REG; regno <= DI_REG; regno++)
+ if (fixed_regs[regno])
+ globals++;
- for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
- if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
- {
- rtx_insn *insn = DF_REF_INSN (ref);
+ local_regparm
+ = globals < local_regparm ? local_regparm - globals : 0;
- replace_with_subreg_in_insn (insn, reg, vreg);
+ if (local_regparm > regparm)
+ regparm = local_regparm;
+ }
+ }
+ }
- if (dump_file)
- fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
- regno, REGNO (vreg), INSN_UID (insn));
- }
+ return regparm;
}
-/* Convert all definitions of register REGNO
- and fix its uses. Scalar copies may be created
- in case register is used in not convertible insn. */
+/* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
+ DFmode (2) arguments in SSE registers for a function with the
+ indicated TYPE and DECL. DECL may be NULL when the function is
+ called indirectly or when considering a libcall. Return -1 if any
+ FP parameter should be rejected by error. This is used in situations
+ where we imply the SSE calling convention but the function is called
+ from another function with SSE disabled. Otherwise return 0. */
-void
-dimode_scalar_chain::convert_reg (unsigned regno)
+static int
+ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
{
- bool scalar_copy = bitmap_bit_p (defs_conv, regno);
- rtx reg = regno_reg_rtx[regno];
- rtx scopy = NULL_RTX;
- df_ref ref;
- bitmap conv;
-
- conv = BITMAP_ALLOC (NULL);
- bitmap_copy (conv, insns);
-
- if (scalar_copy)
- scopy = gen_reg_rtx (DImode);
+ gcc_assert (!TARGET_64BIT);
- for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
+ /* Use SSE registers to pass SFmode and DFmode arguments if requested
+ by the sseregparm attribute. */
+ if (TARGET_SSEREGPARM
+ || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
{
- rtx_insn *insn = DF_REF_INSN (ref);
- rtx def_set = single_set (insn);
- rtx src = SET_SRC (def_set);
- rtx reg = DF_REF_REG (ref);
-
- if (!MEM_P (src))
- {
- replace_with_subreg_in_insn (insn, reg, reg);
- bitmap_clear_bit (conv, INSN_UID (insn));
- }
-
- if (scalar_copy)
+ if (!TARGET_SSE)
{
- start_sequence ();
- if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
- {
- rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
- emit_move_insn (tmp, reg);
- emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
- adjust_address (tmp, SImode, 0));
- emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
- adjust_address (tmp, SImode, 4));
- }
- else if (TARGET_SSE4_1)
- {
- rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
- emit_insn
- (gen_rtx_SET
- (gen_rtx_SUBREG (SImode, scopy, 0),
- gen_rtx_VEC_SELECT (SImode,
- gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
-
- tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
- emit_insn
- (gen_rtx_SET
- (gen_rtx_SUBREG (SImode, scopy, 4),
- gen_rtx_VEC_SELECT (SImode,
- gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
- }
- else
+ if (warn)
{
- rtx vcopy = gen_reg_rtx (V2DImode);
- emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
- emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
- gen_rtx_SUBREG (SImode, vcopy, 0));
- emit_move_insn (vcopy,
- gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
- emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
- gen_rtx_SUBREG (SImode, vcopy, 0));
+ if (decl)
+ error ("calling %qD with attribute sseregparm without "
+ "SSE/SSE2 enabled", decl);
+ else
+ error ("calling %qT with attribute sseregparm without "
+ "SSE/SSE2 enabled", type);
}
- rtx_insn *seq = get_insns ();
- end_sequence ();
- emit_conversion_insns (seq, insn);
-
- if (dump_file)
- fprintf (dump_file,
- " Copied r%d to a scalar register r%d for insn %d\n",
- regno, REGNO (scopy), INSN_UID (insn));
+ return 0;
}
- }
-
- for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
- if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
- {
- if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
- {
- rtx_insn *insn = DF_REF_INSN (ref);
- rtx def_set = single_set (insn);
- gcc_assert (def_set);
+ return 2;
+ }
- rtx src = SET_SRC (def_set);
- rtx dst = SET_DEST (def_set);
+ if (!decl)
+ return 0;
- if (!MEM_P (dst) || !REG_P (src))
- replace_with_subreg_in_insn (insn, reg, reg);
+ cgraph_node *target = cgraph_node::get (decl);
+ if (target)
+ target = target->function_symbol ();
- bitmap_clear_bit (conv, INSN_UID (insn));
- }
- }
- /* Skip debug insns and uninitialized uses. */
- else if (DF_REF_CHAIN (ref)
- && NONDEBUG_INSN_P (DF_REF_INSN (ref)))
- {
- gcc_assert (scopy);
- replace_rtx (DF_REF_INSN (ref), reg, scopy);
- df_insn_rescan (DF_REF_INSN (ref));
- }
+ /* For local functions, pass up to SSE_REGPARM_MAX SFmode
+ (and DFmode for SSE2) arguments in SSE registers. */
+ if (target
+ /* TARGET_SSE_MATH */
+ && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
+ && opt_for_fn (target->decl, optimize)
+ && !(profile_flag && !flag_fentry))
+ {
+ cgraph_local_info *i = &target->local;
+ if (i && i->local && i->can_change_signature)
+ {
+ /* Refuse to produce wrong code when a local function with SSE enabled
+ is called from an SSE-disabled function.
+ FIXME: We need a way to detect these cases across ltrans partitions
+ and avoid using SSE calling conventions on local functions called
+ from functions with SSE disabled. For now at least delay the
+ warning until we know we are going to produce wrong code.
+ See PR66047. */
+ if (!TARGET_SSE && warn)
+ return -1;
+ return TARGET_SSE2_P (target_opts_for_fn (target->decl)
+ ->x_ix86_isa_flags) ? 2 : 1;
+ }
+ }
- BITMAP_FREE (conv);
+ return 0;
}
-/* Convert operand OP in INSN. We should handle
- memory operands and uninitialized registers.
- All other register uses are converted during
- registers conversion. */
+/* Return true if EAX is live at the start of the function. Used by
+ ix86_expand_prologue to determine if we need special help before
+ calling allocate_stack_worker. */
-void
-dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
+static bool
+ix86_eax_live_at_start_p (void)
{
- *op = copy_rtx_if_shared (*op);
+ /* Cheat. Don't bother working forward from ix86_function_regparm
+ to the function type to whether an actual argument is located in
+ eax. Instead just look at cfg info, which is still close enough
+ to correct at this point. This gives false positives for broken
+ functions that might use uninitialized data that happens to be
+ allocated in eax, but who cares? */
+ return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
+}
- if (GET_CODE (*op) == NOT)
- {
- convert_op (&XEXP (*op, 0), insn);
- PUT_MODE (*op, V2DImode);
- }
- else if (MEM_P (*op))
- {
- rtx tmp = gen_reg_rtx (DImode);
-
- emit_insn_before (gen_move_insn (tmp, *op), insn);
- *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
+static bool
+ix86_keep_aggregate_return_pointer (tree fntype)
+{
+ tree attr;
- if (dump_file)
- fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
- INSN_UID (insn), REGNO (tmp));
- }
- else if (REG_P (*op))
- {
- /* We may have not converted register usage in case
- this register has no definition. Otherwise it
- should be converted in convert_reg. */
- df_ref ref;
- FOR_EACH_INSN_USE (ref, insn)
- if (DF_REF_REGNO (ref) == REGNO (*op))
- {
- gcc_assert (!DF_REF_CHAIN (ref));
- break;
- }
- *op = gen_rtx_SUBREG (V2DImode, *op, 0);
- }
- else if (CONST_INT_P (*op))
+ if (!TARGET_64BIT)
{
- rtx vec_cst;
- rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
-
- /* Prefer all ones vector in case of -1. */
- if (constm1_operand (*op, GET_MODE (*op)))
- vec_cst = CONSTM1_RTX (V2DImode);
- else
- vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
- gen_rtvec (2, *op, const0_rtx));
-
- if (!standard_sse_constant_p (vec_cst, V2DImode))
- {
- start_sequence ();
- vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
- rtx_insn *seq = get_insns ();
- end_sequence ();
- emit_insn_before (seq, insn);
- }
+ attr = lookup_attribute ("callee_pop_aggregate_return",
+ TYPE_ATTRIBUTES (fntype));
+ if (attr)
+ return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
- emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
- *op = tmp;
- }
- else
- {
- gcc_assert (SUBREG_P (*op));
- gcc_assert (GET_MODE (*op) == V2DImode);
+ /* For the 32-bit MS ABI the default is to keep the aggregate
+ return pointer. */
+ if (ix86_function_type_abi (fntype) == MS_ABI)
+ return true;
}
+ return KEEP_AGGREGATE_RETURN_POINTER != 0;
}
-/* Convert INSN to vector mode. */
-
-void
-dimode_scalar_chain::convert_insn (rtx_insn *insn)
-{
- rtx def_set = single_set (insn);
- rtx src = SET_SRC (def_set);
- rtx dst = SET_DEST (def_set);
- rtx subreg;
+/* Value is the number of bytes of arguments automatically
+ popped when returning from a subroutine call.
+ FUNDECL is the declaration node of the function (as a tree),
+ FUNTYPE is the data type of the function (as a tree),
+ or for a library call it is an identifier node for the subroutine name.
+ SIZE is the number of bytes of arguments passed on the stack.
- if (MEM_P (dst) && !REG_P (src))
- {
- /* There are no scalar integer instructions and therefore
- temporary register usage is required. */
- rtx tmp = gen_reg_rtx (DImode);
- emit_conversion_insns (gen_move_insn (dst, tmp), insn);
- dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
- }
+ On the 80386, the RTD insn may be used to pop them if the number
+ of args is fixed, but if the number is variable then the caller
+ must pop them all. RTD can't be used for library calls now
+ because the library is compiled with the Unix compiler.
+ Use of RTD is a selectable option, since it is incompatible with
+ standard Unix calling sequences. If the option is not selected,
+ the caller must always pop the args.
- switch (GET_CODE (src))
- {
- case ASHIFT:
- case ASHIFTRT:
- case LSHIFTRT:
- convert_op (&XEXP (src, 0), insn);
- PUT_MODE (src, V2DImode);
- break;
+ The attribute stdcall is equivalent to RTD on a per module basis. */
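+/* For example, a 32-bit stdcall function taking two ints returns with
+ "ret $8", so this hook reports 8 bytes; a plain cdecl function reports
+ 0 and the caller adjusts %esp itself. */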
- case PLUS:
- case MINUS:
- case IOR:
- case XOR:
- case AND:
- convert_op (&XEXP (src, 0), insn);
- convert_op (&XEXP (src, 1), insn);
- PUT_MODE (src, V2DImode);
- break;
+static poly_int64
+ix86_return_pops_args (tree fundecl, tree funtype, poly_int64 size)
+{
+ unsigned int ccvt;
- case NEG:
- src = XEXP (src, 0);
- convert_op (&src, insn);
- subreg = gen_reg_rtx (V2DImode);
- emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn);
- src = gen_rtx_MINUS (V2DImode, subreg, src);
- break;
+ /* None of the 64-bit ABIs pop arguments. */
+ if (TARGET_64BIT)
+ return 0;
- case NOT:
- src = XEXP (src, 0);
- convert_op (&src, insn);
- subreg = gen_reg_rtx (V2DImode);
- emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn);
- src = gen_rtx_XOR (V2DImode, src, subreg);
- break;
+ ccvt = ix86_get_callcvt (funtype);
- case MEM:
- if (!REG_P (dst))
- convert_op (&src, insn);
- break;
+ if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
+ | IX86_CALLCVT_THISCALL)) != 0
+ && ! stdarg_p (funtype))
+ return size;
- case REG:
- if (!MEM_P (dst))
- convert_op (&src, insn);
- break;
+ /* Lose any fake structure return argument if it is passed on the stack. */
+ if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
+ && !ix86_keep_aggregate_return_pointer (funtype))
+ {
+ int nregs = ix86_function_regparm (funtype, fundecl);
+ if (nregs == 0)
+ return GET_MODE_SIZE (Pmode);
+ }
- case SUBREG:
- gcc_assert (GET_MODE (src) == V2DImode);
- break;
+ return 0;
+}
- case COMPARE:
- src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
+/* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
- gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
- || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
+static bool
+ix86_legitimate_combined_insn (rtx_insn *insn)
+{
+ int i;
- if (REG_P (src))
- subreg = gen_rtx_SUBREG (V2DImode, src, 0);
- else
- subreg = copy_rtx_if_shared (src);
- emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
- copy_rtx_if_shared (subreg),
- copy_rtx_if_shared (subreg)),
- insn);
- dst = gen_rtx_REG (CCmode, FLAGS_REG);
- src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
- copy_rtx_if_shared (src)),
- UNSPEC_PTEST);
- break;
+ /* Check operand constraints in case hard registers were propagated
+ into the insn pattern. This check prevents the combine pass from
+ generating insn patterns with invalid hard register operands.
+ These invalid insns can eventually cause reload to error out
+ with a spill failure. See also PRs 46829 and 46843. */
- case CONST_INT:
- convert_op (&src, insn);
- break;
+ gcc_assert (INSN_CODE (insn) >= 0);
- default:
- gcc_unreachable ();
- }
+ extract_insn (insn);
+ preprocess_constraints (insn);
- SET_SRC (def_set) = src;
- SET_DEST (def_set) = dst;
+ int n_operands = recog_data.n_operands;
+ int n_alternatives = recog_data.n_alternatives;
+ for (i = 0; i < n_operands; i++)
+ {
+ rtx op = recog_data.operand[i];
+ machine_mode mode = GET_MODE (op);
+ const operand_alternative *op_alt;
+ int offset = 0;
+ bool win;
+ int j;
- /* Drop possible dead definitions. */
- PATTERN (insn) = def_set;
+ /* A unary operator may be accepted by the predicate, but it
+ is irrelevant for matching constraints. */
+ if (UNARY_P (op))
+ op = XEXP (op, 0);
- INSN_CODE (insn) = -1;
- recog_memoized (insn);
- df_insn_rescan (insn);
-}
+ if (SUBREG_P (op))
+ {
+ if (REG_P (SUBREG_REG (op))
+ && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
+ offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
+ GET_MODE (SUBREG_REG (op)),
+ SUBREG_BYTE (op),
+ GET_MODE (op));
+ op = SUBREG_REG (op);
+ }
-/* Fix uses of converted REG in debug insns. */
+ if (!(REG_P (op) && HARD_REGISTER_P (op)))
+ continue;
-void
-timode_scalar_chain::fix_debug_reg_uses (rtx reg)
-{
- if (!flag_var_tracking)
- return;
+ op_alt = recog_op_alt;
- df_ref ref, next;
- for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
- {
- rtx_insn *insn = DF_REF_INSN (ref);
- /* Make sure the next ref is for a different instruction,
- so that we're not affected by the rescan. */
- next = DF_REF_NEXT_REG (ref);
- while (next && DF_REF_INSN (next) == insn)
- next = DF_REF_NEXT_REG (next);
+ /* Operand has no constraints, anything is OK. */
+ win = !n_alternatives;
- if (DEBUG_INSN_P (insn))
+ alternative_mask preferred = get_preferred_alternatives (insn);
+ for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
{
- /* It may be a debug insn with a TImode variable in
- register. */
- bool changed = false;
- for (; ref != next; ref = DF_REF_NEXT_REG (ref))
+ if (!TEST_BIT (preferred, j))
+ continue;
+ if (op_alt[i].anything_ok
+ || (op_alt[i].matches != -1
+ && operands_match_p
+ (recog_data.operand[i],
+ recog_data.operand[op_alt[i].matches]))
+ || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
{
- rtx *loc = DF_REF_LOC (ref);
- if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
- {
- *loc = gen_rtx_SUBREG (TImode, *loc, 0);
- changed = true;
- }
+ win = true;
+ break;
}
- if (changed)
- df_insn_rescan (insn);
}
+
+ if (!win)
+ return false;
}
+
+ return true;
}
+\f
+/* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
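+/* ASan maps an address to its shadow byte roughly as
+ shadow = (addr >> 3) + ix86_asan_shadow_offset (),
+ i.e. 0x7fff8000 for 64-bit LP64, 1 << 44 for LP64 Mach-O and 1 << 29
+ for 32-bit targets, matching the values returned below. */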
-/* Convert INSN from TImode to V1T1mode. */
+static unsigned HOST_WIDE_INT
+ix86_asan_shadow_offset (void)
+{
+ return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
+ : HOST_WIDE_INT_C (0x7fff8000))
+ : (HOST_WIDE_INT_1 << 29);
+}
+\f
+/* Argument support functions. */
-void
-timode_scalar_chain::convert_insn (rtx_insn *insn)
+/* Return true if REGNO may be used to pass function parameters. */
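+/* As a reminder, the 64-bit SysV integer parameter registers are
+ %rdi, %rsi, %rdx, %rcx, %r8 and %r9 (with %rax passed as a hidden
+ argument to varargs functions), while the MS ABI uses %rcx, %rdx,
+ %r8 and %r9; the tables consulted below encode exactly these sets. */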
+bool
+ix86_function_arg_regno_p (int regno)
{
- rtx def_set = single_set (insn);
- rtx src = SET_SRC (def_set);
- rtx dst = SET_DEST (def_set);
+ int i;
+ enum calling_abi call_abi;
+ const int *parm_regs;
- switch (GET_CODE (dst))
+ if (!TARGET_64BIT)
{
- case REG:
- {
- rtx tmp = find_reg_equal_equiv_note (insn);
- if (tmp)
- PUT_MODE (XEXP (tmp, 0), V1TImode);
- PUT_MODE (dst, V1TImode);
- fix_debug_reg_uses (dst);
- }
- break;
- case MEM:
- PUT_MODE (dst, V1TImode);
- break;
-
- default:
- gcc_unreachable ();
+ if (TARGET_MACHO)
+ return (regno < REGPARM_MAX
+ || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
+ else
+ return (regno < REGPARM_MAX
+ || (TARGET_MMX && MMX_REGNO_P (regno)
+ && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
+ || (TARGET_SSE && SSE_REGNO_P (regno)
+ && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
}
- switch (GET_CODE (src))
- {
- case REG:
- PUT_MODE (src, V1TImode);
- /* Call fix_debug_reg_uses only if SRC is never defined. */
- if (!DF_REG_DEF_CHAIN (REGNO (src)))
- fix_debug_reg_uses (src);
- break;
-
- case MEM:
- PUT_MODE (src, V1TImode);
- break;
-
- case CONST_WIDE_INT:
- if (NONDEBUG_INSN_P (insn))
- {
- /* Since there are no instructions to store 128-bit constant,
- temporary register usage is required. */
- rtx tmp = gen_reg_rtx (V1TImode);
- start_sequence ();
- src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
- src = validize_mem (force_const_mem (V1TImode, src));
- rtx_insn *seq = get_insns ();
- end_sequence ();
- if (seq)
- emit_insn_before (seq, insn);
- emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
- dst = tmp;
- }
- break;
-
- case CONST_INT:
- switch (standard_sse_constant_p (src, TImode))
- {
- case 1:
- src = CONST0_RTX (GET_MODE (dst));
- break;
- case 2:
- src = CONSTM1_RTX (GET_MODE (dst));
- break;
- default:
- gcc_unreachable ();
- }
- if (NONDEBUG_INSN_P (insn))
- {
- rtx tmp = gen_reg_rtx (V1TImode);
- /* Since there are no instructions to store standard SSE
- constant, temporary register usage is required. */
- emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
- dst = tmp;
- }
- break;
+ if (TARGET_SSE && SSE_REGNO_P (regno)
+ && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
+ return true;
- default:
- gcc_unreachable ();
- }
+ /* TODO: The function should depend on current function ABI but
+ builtins.c would need updating then. Therefore we use the
+ default ABI. */
+ call_abi = ix86_cfun_abi ();
- SET_SRC (def_set) = src;
- SET_DEST (def_set) = dst;
+ /* RAX is used as hidden argument to va_arg functions. */
+ if (call_abi == SYSV_ABI && regno == AX_REG)
+ return true;
- /* Drop possible dead definitions. */
- PATTERN (insn) = def_set;
+ if (call_abi == MS_ABI)
+ parm_regs = x86_64_ms_abi_int_parameter_registers;
+ else
+ parm_regs = x86_64_int_parameter_registers;
- INSN_CODE (insn) = -1;
- recog_memoized (insn);
- df_insn_rescan (insn);
+ for (i = 0; i < (call_abi == MS_ABI
+ ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
+ if (regno == parm_regs[i])
+ return true;
+ return false;
}
-void
-dimode_scalar_chain::convert_registers ()
+/* Return true if we do not know how to pass TYPE solely in registers. */
+
+static bool
+ix86_must_pass_in_stack (machine_mode mode, const_tree type)
{
- bitmap_iterator bi;
- unsigned id;
+ if (must_pass_in_stack_var_size_or_pad (mode, type))
+ return true;
- EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
- convert_reg (id);
-
- EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
- make_vector_copies (id);
+ /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
+ The layout_type routine is crafty and tries to trick us into passing
+ currently unsupported vector types on the stack by using TImode. */
+ return (!TARGET_64BIT && mode == TImode
+ && type && TREE_CODE (type) != VECTOR_TYPE);
}
-/* Convert whole chain creating required register
- conversions and copies. */
-
+/* Return the size, in bytes, of the area reserved for arguments passed
+ in registers for the function represented by FNDECL, which depends on
+ the ABI used. */
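+/* The 32 bytes reported for the 64-bit MS ABI are the "home area" the
+ caller must reserve on the stack for the four register arguments
+ (%rcx, %rdx, %r8 and %r9); the SysV ABI reserves no such area. */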
int
-scalar_chain::convert ()
+ix86_reg_parm_stack_space (const_tree fndecl)
{
- bitmap_iterator bi;
- unsigned id;
- int converted_insns = 0;
+ enum calling_abi call_abi = SYSV_ABI;
+ if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
+ call_abi = ix86_function_abi (fndecl);
+ else
+ call_abi = ix86_function_type_abi (fndecl);
+ if (TARGET_64BIT && call_abi == MS_ABI)
+ return 32;
+ return 0;
+}
- if (!dbg_cnt (stv_conversion))
- return 0;
+/* We add this as a workaround in order to use the libc_has_function
+ hook in i386.md. */
+bool
+ix86_libc_has_function (enum function_class fn_class)
+{
+ return targetm.libc_has_function (fn_class);
+}
- if (dump_file)
- fprintf (dump_file, "Converting chain #%d...\n", chain_id);
+/* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
+ calling ABI used. */
+enum calling_abi
+ix86_function_type_abi (const_tree fntype)
+{
+ enum calling_abi abi = ix86_abi;
- convert_registers ();
+ if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
+ return abi;
- EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
+ if (abi == SYSV_ABI
+ && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
{
- convert_insn (DF_INSN_UID_GET (id)->insn);
- converted_insns++;
+ static int warned;
+ if (TARGET_X32 && !warned)
+ {
+ error ("X32 does not support ms_abi attribute");
+ warned = 1;
+ }
+
+ abi = MS_ABI;
}
+ else if (abi == MS_ABI
+ && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
+ abi = SYSV_ABI;
- return converted_insns;
+ return abi;
}
-/* Main STV pass function. Find and convert scalar
- instructions into vector mode when profitable. */
-
-static unsigned int
-convert_scalars_to_vector ()
+enum calling_abi
+ix86_function_abi (const_tree fndecl)
{
- basic_block bb;
- bitmap candidates;
- int converted_insns = 0;
-
- bitmap_obstack_initialize (NULL);
- candidates = BITMAP_ALLOC (NULL);
-
- calculate_dominance_info (CDI_DOMINATORS);
- df_set_flags (DF_DEFER_INSN_RESCAN);
- df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
- df_md_add_problem ();
- df_analyze ();
+ return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
+}
- /* Find all instructions we want to convert into vector mode. */
- if (dump_file)
- fprintf (dump_file, "Searching for mode conversion candidates...\n");
+/* Return SYSV_ABI or MS_ABI, depending on cfun, specifying the
+ calling ABI used. */
+enum calling_abi
+ix86_cfun_abi (void)
+{
+ return cfun ? cfun->machine->call_abi : ix86_abi;
+}
- FOR_EACH_BB_FN (bb, cfun)
+bool
+ix86_function_ms_hook_prologue (const_tree fn)
+{
+ if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
{
- rtx_insn *insn;
- FOR_BB_INSNS (bb, insn)
- if (scalar_to_vector_candidate_p (insn))
- {
- if (dump_file)
- fprintf (dump_file, " insn %d is marked as a candidate\n",
- INSN_UID (insn));
-
- bitmap_set_bit (candidates, INSN_UID (insn));
- }
+ if (decl_function_context (fn) != NULL_TREE)
+ error_at (DECL_SOURCE_LOCATION (fn),
+ "ms_hook_prologue is not compatible with nested function");
+ else
+ return true;
}
+ return false;
+}
- remove_non_convertible_regs (candidates);
-
- if (bitmap_empty_p (candidates))
- if (dump_file)
- fprintf (dump_file, "There are no candidates for optimization.\n");
+bool
+ix86_function_naked (const_tree fn)
+{
+ if (fn && lookup_attribute ("naked", DECL_ATTRIBUTES (fn)))
+ return true;
- while (!bitmap_empty_p (candidates))
- {
- unsigned uid = bitmap_first_set_bit (candidates);
- scalar_chain *chain;
+ return false;
+}
- if (TARGET_64BIT)
- chain = new timode_scalar_chain;
- else
- chain = new dimode_scalar_chain;
+/* Write the extra assembler code needed to declare a function properly. */
- /* Find instructions chain we want to convert to vector mode.
- Check all uses and definitions to estimate all required
- conversions. */
- chain->build (candidates, uid);
+void
+ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
+ tree decl)
+{
+ bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
- if (chain->compute_convert_gain () > 0)
- converted_insns += chain->convert ();
- else
- if (dump_file)
- fprintf (dump_file, "Chain #%d conversion is not profitable\n",
- chain->chain_id);
+ if (is_ms_hook)
+ {
+ int i, filler_count = (TARGET_64BIT ? 32 : 16);
+ unsigned int filler_cc = 0xcccccccc;
- delete chain;
+ for (i = 0; i < filler_count; i += 4)
+ fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
}
- if (dump_file)
- fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
+#ifdef SUBTARGET_ASM_UNWIND_INIT
+ SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
+#endif
- BITMAP_FREE (candidates);
- bitmap_obstack_release (NULL);
- df_process_deferred_rescans ();
+ ASM_OUTPUT_LABEL (asm_out_file, fname);
- /* Conversion means we may have 128bit register spills/fills
- which require aligned stack. */
- if (converted_insns)
+ /* Output magic byte marker, if hot-patch attribute is set. */
+ if (is_ms_hook)
{
- if (crtl->stack_alignment_needed < 128)
- crtl->stack_alignment_needed = 128;
- if (crtl->stack_alignment_estimated < 128)
- crtl->stack_alignment_estimated = 128;
- /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
if (TARGET_64BIT)
- for (tree parm = DECL_ARGUMENTS (current_function_decl);
- parm; parm = DECL_CHAIN (parm))
- {
- if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
- continue;
- if (DECL_RTL_SET_P (parm)
- && GET_MODE (DECL_RTL (parm)) == V1TImode)
- {
- rtx r = DECL_RTL (parm);
- if (REG_P (r))
- SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
- }
- if (DECL_INCOMING_RTL (parm)
- && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
- {
- rtx r = DECL_INCOMING_RTL (parm);
- if (REG_P (r))
- DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
- }
- }
+ {
+ /* leaq [%rsp + 0], %rsp */
+ fputs (ASM_BYTE "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n",
+ asm_out_file);
+ }
+ else
+ {
+ /* movl.s %edi, %edi
+ push %ebp
+ movl.s %esp, %ebp */
+ fputs (ASM_BYTE "0x8b, 0xff, 0x55, 0x8b, 0xec\n", asm_out_file);
+ }
}
-
- return 0;
}
-namespace {
-
-const pass_data pass_data_insert_vzeroupper =
+/* Implementation of the call ABI switching target hook. For FNDECL
+ the appropriate call register sets are selected. See also
+ ix86_conditional_register_usage for more details. */
+void
+ix86_call_abi_override (const_tree fndecl)
{
- RTL_PASS, /* type */
- "vzeroupper", /* name */
- OPTGROUP_NONE, /* optinfo_flags */
- TV_MACH_DEP, /* tv_id */
- 0, /* properties_required */
- 0, /* properties_provided */
- 0, /* properties_destroyed */
- 0, /* todo_flags_start */
- TODO_df_finish, /* todo_flags_finish */
-};
+ cfun->machine->call_abi = ix86_function_abi (fndecl);
+}
-class pass_insert_vzeroupper : public rtl_opt_pass
+/* Return true if a pseudo register should be created and used to hold
+ the GOT address for PIC code. */
+bool
+ix86_use_pseudo_pic_reg (void)
{
-public:
- pass_insert_vzeroupper(gcc::context *ctxt)
- : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
- {}
-
- /* opt_pass methods: */
- virtual bool gate (function *)
- {
- return TARGET_AVX
- && TARGET_VZEROUPPER && flag_expensive_optimizations
- && !optimize_size;
- }
-
- virtual unsigned int execute (function *)
- {
- return rest_of_handle_insert_vzeroupper ();
- }
+ if ((TARGET_64BIT
+ && (ix86_cmodel == CM_SMALL_PIC
+ || TARGET_PECOFF))
+ || !flag_pic)
+ return false;
+ return true;
+}
-}; // class pass_insert_vzeroupper
+/* Initialize large model PIC register. */
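+/* The emitted sequence is roughly
+ .LPIC: leaq .LPIC(%rip), %pic
+ movabsq $_GLOBAL_OFFSET_TABLE_-.LPIC, %tmp
+ addq %tmp, %pic
+ with %pic standing for the pseudo PIC register and %tmp for the scratch
+ register passed in; the label and register names are illustrative. */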
-const pass_data pass_data_stv =
+static void
+ix86_init_large_pic_reg (unsigned int tmp_regno)
{
- RTL_PASS, /* type */
- "stv", /* name */
- OPTGROUP_NONE, /* optinfo_flags */
- TV_MACH_DEP, /* tv_id */
- 0, /* properties_required */
- 0, /* properties_provided */
- 0, /* properties_destroyed */
- 0, /* todo_flags_start */
- TODO_df_finish, /* todo_flags_finish */
-};
+ rtx_code_label *label;
+ rtx tmp_reg;
+
+ gcc_assert (Pmode == DImode);
+ label = gen_label_rtx ();
+ emit_label (label);
+ LABEL_PRESERVE_P (label) = 1;
+ tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
+ gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
+ emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
+ label));
+ emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
+ emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
+ pic_offset_table_rtx, tmp_reg));
+ const char *name = LABEL_NAME (label);
+ PUT_CODE (label, NOTE);
+ NOTE_KIND (label) = NOTE_INSN_DELETED_LABEL;
+ NOTE_DELETED_LABEL_NAME (label) = name;
+}
-class pass_stv : public rtl_opt_pass
+/* Create and initialize PIC register if required. */
+static void
+ix86_init_pic_reg (void)
{
-public:
- pass_stv (gcc::context *ctxt)
- : rtl_opt_pass (pass_data_stv, ctxt),
- timode_p (false)
- {}
+ edge entry_edge;
+ rtx_insn *seq;
- /* opt_pass methods: */
- virtual bool gate (function *)
- {
- return (timode_p == !!TARGET_64BIT
- && TARGET_STV && TARGET_SSE2 && optimize > 1);
- }
+ if (!ix86_use_pseudo_pic_reg ())
+ return;
- virtual unsigned int execute (function *)
- {
- return convert_scalars_to_vector ();
- }
+ start_sequence ();
- opt_pass *clone ()
+ if (TARGET_64BIT)
{
- return new pass_stv (m_ctxt);
+ if (ix86_cmodel == CM_LARGE_PIC)
+ ix86_init_large_pic_reg (R11_REG);
+ else
+ emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
}
-
- void set_pass_param (unsigned int n, bool param)
+ else
{
- gcc_assert (n == 0);
- timode_p = param;
+ /* If there will be an mcount call in the function, it is more profitable
+ to emit SET_GOT into the ABI-defined REAL_PIC_OFFSET_TABLE_REGNUM. */
+ rtx reg = crtl->profile
+ ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
+ : pic_offset_table_rtx;
+ rtx_insn *insn = emit_insn (gen_set_got (reg));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ if (crtl->profile)
+ emit_move_insn (pic_offset_table_rtx, reg);
+ add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
}
-private:
- bool timode_p;
-}; // class pass_stv
-
-} // anon namespace
-
-rtl_opt_pass *
-make_pass_insert_vzeroupper (gcc::context *ctxt)
-{
- return new pass_insert_vzeroupper (ctxt);
-}
+ seq = get_insns ();
+ end_sequence ();
-rtl_opt_pass *
-make_pass_stv (gcc::context *ctxt)
-{
- return new pass_stv (ctxt);
+ entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
+ insert_insn_on_edge (seq, entry_edge);
+ commit_one_edge_insertion (entry_edge);
}
-/* Inserting ENDBRANCH instructions. */
+/* Initialize a variable CUM of type CUMULATIVE_ARGS
+ for a call to a function whose data type is FNTYPE.
+ For a library call, FNTYPE is 0. */
-static unsigned int
-rest_of_insert_endbranch (void)
+void
+init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
+ tree fntype, /* tree ptr for function decl */
+ rtx libname, /* SYMBOL_REF of library name or 0 */
+ tree fndecl,
+ int caller)
{
- timevar_push (TV_MACH_DEP);
+ struct cgraph_local_info *i = NULL;
+ struct cgraph_node *target = NULL;
- rtx cet_eb;
- rtx_insn *insn;
- basic_block bb;
+ memset (cum, 0, sizeof (*cum));
- /* Currently emit EB if it's a tracking function, i.e. 'nocf_check' is
- absent among function attributes. Later an optimization will be
- introduced to make analysis if an address of a static function is
- taken. A static function whose address is not taken will get a
- nocf_check attribute. This will allow to reduce the number of EB. */
-
- if (!lookup_attribute ("nocf_check",
- TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
- && (!flag_manual_endbr
- || lookup_attribute ("cf_check",
- DECL_ATTRIBUTES (cfun->decl)))
- && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
- {
- /* Queue ENDBR insertion to x86_function_profiler. */
- if (crtl->profile && flag_fentry)
- cfun->machine->endbr_queued_at_entrance = true;
- else
+ if (fndecl)
+ {
+ target = cgraph_node::get (fndecl);
+ if (target)
{
- cet_eb = gen_nop_endbr ();
-
- bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
- insn = BB_HEAD (bb);
- emit_insn_before (cet_eb, insn);
+ target = target->function_symbol ();
+ i = cgraph_node::local_info (target->decl);
+ cum->call_abi = ix86_function_abi (target->decl);
}
+ else
+ cum->call_abi = ix86_function_abi (fndecl);
}
+ else
+ cum->call_abi = ix86_function_type_abi (fntype);
- bb = 0;
- FOR_EACH_BB_FN (bb, cfun)
+ cum->caller = caller;
+
+ /* Set up the number of registers to use for passing arguments. */
+ cum->nregs = ix86_regparm;
+ if (TARGET_64BIT)
{
- for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
- insn = NEXT_INSN (insn))
- {
- if (CALL_P (insn))
- {
- bool need_endbr;
- need_endbr = find_reg_note (insn, REG_SETJMP, NULL) != NULL;
- if (!need_endbr && !SIBLING_CALL_P (insn))
- {
- rtx call = get_call_rtx_from (insn);
- rtx fnaddr = XEXP (call, 0);
- tree fndecl = NULL_TREE;
-
- /* Also generate ENDBRANCH for non-tail call which
- may return via indirect branch. */
- if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
- fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
- if (fndecl == NULL_TREE)
- fndecl = MEM_EXPR (fnaddr);
- if (fndecl
- && TREE_CODE (TREE_TYPE (fndecl)) != FUNCTION_TYPE
- && TREE_CODE (TREE_TYPE (fndecl)) != METHOD_TYPE)
- fndecl = NULL_TREE;
- if (fndecl && TYPE_ARG_TYPES (TREE_TYPE (fndecl)))
- {
- tree fntype = TREE_TYPE (fndecl);
- if (lookup_attribute ("indirect_return",
- TYPE_ATTRIBUTES (fntype)))
- need_endbr = true;
- }
- }
- if (!need_endbr)
- continue;
- /* Generate ENDBRANCH after CALL, which can return more than
- twice, setjmp-like functions. */
-
- cet_eb = gen_nop_endbr ();
- emit_insn_after_setloc (cet_eb, insn, INSN_LOCATION (insn));
- continue;
- }
-
- if (JUMP_P (insn) && flag_cet_switch)
- {
- rtx target = JUMP_LABEL (insn);
- if (target == NULL_RTX || ANY_RETURN_P (target))
- continue;
-
- /* Check the jump is a switch table. */
- rtx_insn *label = as_a<rtx_insn *> (target);
- rtx_insn *table = next_insn (label);
- if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
- continue;
-
- /* For the indirect jump find out all places it jumps and insert
- ENDBRANCH there. It should be done under a special flag to
- control ENDBRANCH generation for switch stmts. */
- edge_iterator ei;
- edge e;
- basic_block dest_blk;
-
- FOR_EACH_EDGE (e, ei, bb->succs)
- {
- rtx_insn *insn;
-
- dest_blk = e->dest;
- insn = BB_HEAD (dest_blk);
- gcc_assert (LABEL_P (insn));
- cet_eb = gen_nop_endbr ();
- emit_insn_after (cet_eb, insn);
- }
- continue;
- }
-
- if ((LABEL_P (insn) && LABEL_PRESERVE_P (insn))
- || (NOTE_P (insn)
- && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
- /* TODO. Check /s bit also. */
- {
- cet_eb = gen_nop_endbr ();
- emit_insn_after (cet_eb, insn);
- continue;
- }
- }
- }
-
- timevar_pop (TV_MACH_DEP);
- return 0;
-}
-
-namespace {
-
-const pass_data pass_data_insert_endbranch =
-{
- RTL_PASS, /* type. */
- "cet", /* name. */
- OPTGROUP_NONE, /* optinfo_flags. */
- TV_MACH_DEP, /* tv_id. */
- 0, /* properties_required. */
- 0, /* properties_provided. */
- 0, /* properties_destroyed. */
- 0, /* todo_flags_start. */
- 0, /* todo_flags_finish. */
-};
-
-class pass_insert_endbranch : public rtl_opt_pass
-{
-public:
- pass_insert_endbranch (gcc::context *ctxt)
- : rtl_opt_pass (pass_data_insert_endbranch, ctxt)
- {}
-
- /* opt_pass methods: */
- virtual bool gate (function *)
- {
- return ((flag_cf_protection & CF_BRANCH));
+ cum->nregs = (cum->call_abi == SYSV_ABI
+ ? X86_64_REGPARM_MAX
+ : X86_64_MS_REGPARM_MAX);
}
-
- virtual unsigned int execute (function *)
+ if (TARGET_SSE)
{
- return rest_of_insert_endbranch ();
+ cum->sse_nregs = SSE_REGPARM_MAX;
+ if (TARGET_64BIT)
+ {
+ cum->sse_nregs = (cum->call_abi == SYSV_ABI
+ ? X86_64_SSE_REGPARM_MAX
+ : X86_64_MS_SSE_REGPARM_MAX);
+ }
}
+ if (TARGET_MMX)
+ cum->mmx_nregs = MMX_REGPARM_MAX;
+ cum->warn_avx512f = true;
+ cum->warn_avx = true;
+ cum->warn_sse = true;
+ cum->warn_mmx = true;
-}; // class pass_insert_endbranch
-
-} // anon namespace
-
-rtl_opt_pass *
-make_pass_insert_endbranch (gcc::context *ctxt)
-{
- return new pass_insert_endbranch (ctxt);
-}
-
-/* At entry of the nearest common dominator for basic blocks with
- conversions, generate a single
- vxorps %xmmN, %xmmN, %xmmN
- for all
- vcvtss2sd op, %xmmN, %xmmX
- vcvtsd2ss op, %xmmN, %xmmX
- vcvtsi2ss op, %xmmN, %xmmX
- vcvtsi2sd op, %xmmN, %xmmX
-
- NB: We want to generate only a single vxorps to cover the whole
- function. The LCM algorithm isn't appropriate here since it may
- place a vxorps inside the loop. */
-
-static unsigned int
-remove_partial_avx_dependency (void)
-{
- timevar_push (TV_MACH_DEP);
-
- bitmap_obstack_initialize (NULL);
- bitmap convert_bbs = BITMAP_ALLOC (NULL);
-
- basic_block bb;
- rtx_insn *insn, *set_insn;
- rtx set;
- rtx v4sf_const0 = NULL_RTX;
+ /* Because the type might mismatch between caller and callee, we need to
+ use the actual type of the function for local calls.
+ FIXME: cgraph_analyze can be told to actually record whether a function
+ uses va_start, so for local functions maybe_vaarg could be made more
+ aggressive, helping K&R code.
+ FIXME: once the type system is fixed, we won't need this code anymore. */
+ if (i && i->local && i->can_change_signature)
+ fntype = TREE_TYPE (target->decl);
+ cum->stdarg = stdarg_p (fntype);
+ cum->maybe_vaarg = (fntype
+ ? (!prototype_p (fntype) || stdarg_p (fntype))
+ : !libname);
- auto_vec<rtx_insn *> control_flow_insns;
+ cum->decl = fndecl;
- FOR_EACH_BB_FN (bb, cfun)
+ cum->warn_empty = !warn_abi || cum->stdarg;
+ if (!cum->warn_empty && fntype)
{
- FOR_BB_INSNS (bb, insn)
+ function_args_iterator iter;
+ tree argtype;
+ bool seen_empty_type = false;
+ FOREACH_FUNCTION_ARGS (fntype, argtype, iter)
{
- if (!NONDEBUG_INSN_P (insn))
- continue;
-
- set = single_set (insn);
- if (!set)
- continue;
-
- if (get_attr_avx_partial_xmm_update (insn)
- != AVX_PARTIAL_XMM_UPDATE_TRUE)
- continue;
-
- if (!v4sf_const0)
- {
- calculate_dominance_info (CDI_DOMINATORS);
- df_set_flags (DF_DEFER_INSN_RESCAN);
- df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
- df_md_add_problem ();
- df_analyze ();
- v4sf_const0 = gen_reg_rtx (V4SFmode);
- }
-
- /* Convert PARTIAL_XMM_UPDATE_TRUE insns, DF -> SF, SF -> DF,
- SI -> SF, SI -> DF, DI -> SF, DI -> DF, to vec_dup and
- vec_merge with subreg. */
- rtx src = SET_SRC (set);
- rtx dest = SET_DEST (set);
- machine_mode dest_mode = GET_MODE (dest);
-
- rtx zero;
- machine_mode dest_vecmode;
- if (dest_mode == E_SFmode)
- {
- dest_vecmode = V4SFmode;
- zero = v4sf_const0;
- }
- else
- {
- dest_vecmode = V2DFmode;
- zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0);
- }
-
- /* Change source to vector mode. */
- src = gen_rtx_VEC_DUPLICATE (dest_vecmode, src);
- src = gen_rtx_VEC_MERGE (dest_vecmode, src, zero,
- GEN_INT (HOST_WIDE_INT_1U));
- /* Change destination to vector mode. */
- rtx vec = gen_reg_rtx (dest_vecmode);
- /* Generate an XMM vector SET. */
- set = gen_rtx_SET (vec, src);
- set_insn = emit_insn_before (set, insn);
- df_insn_rescan (set_insn);
-
- if (cfun->can_throw_non_call_exceptions)
+ if (argtype == error_mark_node || VOID_TYPE_P (argtype))
+ break;
+ if (TYPE_EMPTY_P (argtype))
+ seen_empty_type = true;
+ else if (seen_empty_type)
{
- /* Handle REG_EH_REGION note. */
- rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
- if (note)
- {
- control_flow_insns.safe_push (set_insn);
- add_reg_note (set_insn, REG_EH_REGION, XEXP (note, 0));
- }
+ cum->warn_empty = true;
+ break;
}
-
- src = gen_rtx_SUBREG (dest_mode, vec, 0);
- set = gen_rtx_SET (dest, src);
-
- /* Drop possible dead definitions. */
- PATTERN (insn) = set;
-
- INSN_CODE (insn) = -1;
- recog_memoized (insn);
- df_insn_rescan (insn);
- bitmap_set_bit (convert_bbs, bb->index);
}
}
- if (v4sf_const0)
+ if (!TARGET_64BIT)
{
- /* (Re-)discover loops so that bb->loop_father can be used in the
- analysis below. */
- loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
-
- /* Generate a vxorps at entry of the nearest dominator for basic
- blocks with conversions, which is in the the fake loop that
- contains the whole function, so that there is only a single
- vxorps in the whole function. */
- bb = nearest_common_dominator_for_set (CDI_DOMINATORS,
- convert_bbs);
- while (bb->loop_father->latch
- != EXIT_BLOCK_PTR_FOR_FN (cfun))
- bb = get_immediate_dominator (CDI_DOMINATORS,
- bb->loop_father->header);
-
- set = gen_rtx_SET (v4sf_const0, CONST0_RTX (V4SFmode));
+ /* If there are variable arguments, then we won't pass anything
+ in registers in 32-bit mode. */
+ if (stdarg_p (fntype))
+ {
+ cum->nregs = 0;
+ /* Since in 32-bit mode variable arguments are always passed on the
+ stack, a scratch register is available for an indirect
+ sibcall. */
+ cfun->machine->arg_reg_available = true;
+ cum->sse_nregs = 0;
+ cum->mmx_nregs = 0;
+ cum->warn_avx512f = false;
+ cum->warn_avx = false;
+ cum->warn_sse = false;
+ cum->warn_mmx = false;
+ return;
+ }
- insn = BB_HEAD (bb);
- while (insn && !NONDEBUG_INSN_P (insn))
+ /* Use the ecx and edx registers if the function has the fastcall
+ attribute; otherwise look for regparm information. */
+ if (fntype)
{
- if (insn == BB_END (bb))
+ unsigned int ccvt = ix86_get_callcvt (fntype);
+ if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
{
- insn = NULL;
- break;
+ cum->nregs = 1;
+ cum->fastcall = 1; /* Same first register as in fastcall. */
+ }
+ else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
+ {
+ cum->nregs = 2;
+ cum->fastcall = 1;
}
- insn = NEXT_INSN (insn);
+ else
+ cum->nregs = ix86_function_regparm (fntype, fndecl);
}
- if (insn == BB_HEAD (bb))
- set_insn = emit_insn_before (set, insn);
- else
- set_insn = emit_insn_after (set,
- insn ? PREV_INSN (insn) : BB_END (bb));
- df_insn_rescan (set_insn);
- df_process_deferred_rescans ();
- loop_optimizer_finalize ();
- if (!control_flow_insns.is_empty ())
- {
- free_dominance_info (CDI_DOMINATORS);
-
- unsigned int i;
- FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
- if (control_flow_insn_p (insn))
- {
- /* Split the block after insn. There will be a fallthru
- edge, which is OK so we keep it. We have to create
- the exception edges ourselves. */
- bb = BLOCK_FOR_INSN (insn);
- split_block (bb, insn);
- rtl_make_eh_edge (NULL, bb, BB_END (bb));
- }
- }
+ /* Set up the number of SSE registers used for passing SFmode
+ and DFmode arguments. Warn for mismatching ABI. */
+ cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
}
- bitmap_obstack_release (NULL);
- BITMAP_FREE (convert_bbs);
-
- timevar_pop (TV_MACH_DEP);
- return 0;
+ cfun->machine->arg_reg_available = (cum->nregs > 0);
}
-namespace {
-
-const pass_data pass_data_remove_partial_avx_dependency =
-{
- RTL_PASS, /* type */
- "rpad", /* name */
- OPTGROUP_NONE, /* optinfo_flags */
- TV_MACH_DEP, /* tv_id */
- 0, /* properties_required */
- 0, /* properties_provided */
- 0, /* properties_destroyed */
- 0, /* todo_flags_start */
- TODO_df_finish, /* todo_flags_finish */
-};
-
-class pass_remove_partial_avx_dependency : public rtl_opt_pass
-{
-public:
- pass_remove_partial_avx_dependency (gcc::context *ctxt)
- : rtl_opt_pass (pass_data_remove_partial_avx_dependency, ctxt)
- {}
-
- /* opt_pass methods: */
- virtual bool gate (function *)
- {
- return (TARGET_AVX
- && TARGET_SSE_PARTIAL_REG_DEPENDENCY
- && TARGET_SSE_MATH
- && optimize
- && optimize_function_for_speed_p (cfun));
- }
-
- virtual unsigned int execute (function *)
- {
- return remove_partial_avx_dependency ();
- }
-}; // class pass_rpad
-
-} // anon namespace
-
-rtl_opt_pass *
-make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
-{
- return new pass_remove_partial_avx_dependency (ctxt);
-}
+/* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
+ But in the case of vector types, it is some vector mode.
-/* Return true if a red-zone is in use. We can't use red-zone when
- there are local indirect jumps, like "indirect_jump" or "tablejump",
- which jumps to another place in the function, since "call" in the
- indirect thunk pushes the return address onto stack, destroying
- red-zone.
+ When only some of our vector ISA extensions are enabled, there are some
+ modes for which vector_mode_supported_p is false. For these modes, the
+ generic vector support in GCC will choose some non-vector mode in order
+ to implement the type. By computing the natural mode, we'll select the
+ proper ABI location for the operand and not depend on whatever the
+ middle-end decides to do with these vector types.
- TODO: If we can reserve the first 2 WORDs, for PUSH and, another
- for CALL, in red-zone, we can allow local indirect jumps with
- indirect thunk. */
+ The middle-end can't deal with vector types larger than 16 bytes. In
+ this case, we return the original mode and warn about the ABI change if
+ CUM isn't NULL.
-bool
-ix86_using_red_zone (void)
-{
- return (TARGET_RED_ZONE
- && !TARGET_64BIT_MS_ABI
- && (!cfun->machine->has_local_indirect_jump
- || cfun->machine->indirect_branch_type == indirect_branch_keep));
-}
-\f
-/* Return a string that documents the current -m options. The caller is
- responsible for freeing the string. */
+ If IN_RETURN is true, warn about the ABI change if the vector mode isn't
+ available for the function return value. */
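+
+/* Illustrative example (added note, not in the original comment): with
+ -mno-avx, a GNU vector type of eight floats (32 bytes) has no supported
+ vector mode, so the middle-end implements it with some other mode;
+ type_natural_mode still reports V8SFmode so that the ABI location is
+ chosen consistently, and the AVX -Wpsabi note in the body below is
+ emitted. */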
-static char *
-ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
- int flags, int flags2,
- const char *arch, const char *tune,
- enum fpmath_unit fpmath, bool add_nl_p, bool add_abi_p)
+static machine_mode
+type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
+ bool in_return)
{
- struct ix86_target_opts
- {
- const char *option; /* option string */
- HOST_WIDE_INT mask; /* isa mask options */
- };
-
- /* This table is ordered so that options like -msse4.2 that imply other
- ISAs come first. Target string will be displayed in the same order. */
- static struct ix86_target_opts isa2_opts[] =
- {
- { "-mcx16", OPTION_MASK_ISA_CX16 },
- { "-mvaes", OPTION_MASK_ISA_VAES },
- { "-mrdpid", OPTION_MASK_ISA_RDPID },
- { "-mpconfig", OPTION_MASK_ISA_PCONFIG },
- { "-mwbnoinvd", OPTION_MASK_ISA_WBNOINVD },
- { "-msgx", OPTION_MASK_ISA_SGX },
- { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW },
- { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS },
- { "-mhle", OPTION_MASK_ISA_HLE },
- { "-mmovbe", OPTION_MASK_ISA_MOVBE },
- { "-mclzero", OPTION_MASK_ISA_CLZERO },
- { "-mmwaitx", OPTION_MASK_ISA_MWAITX },
- { "-mmovdir64b", OPTION_MASK_ISA_MOVDIR64B },
- { "-mwaitpkg", OPTION_MASK_ISA_WAITPKG },
- { "-mcldemote", OPTION_MASK_ISA_CLDEMOTE },
- { "-mptwrite", OPTION_MASK_ISA_PTWRITE }
- };
- static struct ix86_target_opts isa_opts[] =
- {
- { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ },
- { "-mavx512bitalg", OPTION_MASK_ISA_AVX512BITALG },
- { "-mvpclmulqdq", OPTION_MASK_ISA_VPCLMULQDQ },
- { "-mgfni", OPTION_MASK_ISA_GFNI },
- { "-mavx512vnni", OPTION_MASK_ISA_AVX512VNNI },
- { "-mavx512vbmi2", OPTION_MASK_ISA_AVX512VBMI2 },
- { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI },
- { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA },
- { "-mavx512vl", OPTION_MASK_ISA_AVX512VL },
- { "-mavx512bw", OPTION_MASK_ISA_AVX512BW },
- { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ },
- { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
- { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
- { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
- { "-mavx512f", OPTION_MASK_ISA_AVX512F },
- { "-mavx2", OPTION_MASK_ISA_AVX2 },
- { "-mfma", OPTION_MASK_ISA_FMA },
- { "-mxop", OPTION_MASK_ISA_XOP },
- { "-mfma4", OPTION_MASK_ISA_FMA4 },
- { "-mf16c", OPTION_MASK_ISA_F16C },
- { "-mavx", OPTION_MASK_ISA_AVX },
-/* { "-msse4" OPTION_MASK_ISA_SSE4 }, */
- { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
- { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
- { "-msse4a", OPTION_MASK_ISA_SSE4A },
- { "-mssse3", OPTION_MASK_ISA_SSSE3 },
- { "-msse3", OPTION_MASK_ISA_SSE3 },
- { "-maes", OPTION_MASK_ISA_AES },
- { "-msha", OPTION_MASK_ISA_SHA },
- { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
- { "-msse2", OPTION_MASK_ISA_SSE2 },
- { "-msse", OPTION_MASK_ISA_SSE },
- { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
- { "-m3dnow", OPTION_MASK_ISA_3DNOW },
- { "-mmmx", OPTION_MASK_ISA_MMX },
- { "-mrtm", OPTION_MASK_ISA_RTM },
- { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
- { "-mrdseed", OPTION_MASK_ISA_RDSEED },
- { "-madx", OPTION_MASK_ISA_ADX },
- { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 },
- { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT },
- { "-mxsaves", OPTION_MASK_ISA_XSAVES },
- { "-mxsavec", OPTION_MASK_ISA_XSAVEC },
- { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
- { "-mxsave", OPTION_MASK_ISA_XSAVE },
- { "-mabm", OPTION_MASK_ISA_ABM },
- { "-mbmi", OPTION_MASK_ISA_BMI },
- { "-mbmi2", OPTION_MASK_ISA_BMI2 },
- { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
- { "-mtbm", OPTION_MASK_ISA_TBM },
- { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
- { "-msahf", OPTION_MASK_ISA_SAHF },
- { "-mcrc32", OPTION_MASK_ISA_CRC32 },
- { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
- { "-mrdrnd", OPTION_MASK_ISA_RDRND },
- { "-mpku", OPTION_MASK_ISA_PKU },
- { "-mlwp", OPTION_MASK_ISA_LWP },
- { "-mfxsr", OPTION_MASK_ISA_FXSR },
- { "-mclwb", OPTION_MASK_ISA_CLWB },
- { "-mshstk", OPTION_MASK_ISA_SHSTK },
- { "-mmovdiri", OPTION_MASK_ISA_MOVDIRI }
- };
-
- /* Flag options. */
- static struct ix86_target_opts flag_opts[] =
- {
- { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
- { "-mlong-double-128", MASK_LONG_DOUBLE_128 },
- { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
- { "-m80387", MASK_80387 },
- { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
- { "-malign-double", MASK_ALIGN_DOUBLE },
- { "-mcld", MASK_CLD },
- { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
- { "-mieee-fp", MASK_IEEE_FP },
- { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
- { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
- { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
- { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
- { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
- { "-mno-push-args", MASK_NO_PUSH_ARGS },
- { "-mno-red-zone", MASK_NO_RED_ZONE },
- { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
- { "-mrecip", MASK_RECIP },
- { "-mrtd", MASK_RTD },
- { "-msseregparm", MASK_SSEREGPARM },
- { "-mstack-arg-probe", MASK_STACK_PROBE },
- { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
- { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
- { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
- { "-mvzeroupper", MASK_VZEROUPPER },
- { "-mstv", MASK_STV },
- { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
- { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE },
- { "-mcall-ms2sysv-xlogues", MASK_CALL_MS2SYSV_XLOGUES }
- };
-
- /* Additional flag options. */
- static struct ix86_target_opts flag2_opts[] =
- {
- { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY }
- };
-
- const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa2_opts)
- + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (flag2_opts) + 6][2];
-
- char isa_other[40];
- char isa2_other[40];
- char flags_other[40];
- char flags2_other[40];
- unsigned num = 0;
- unsigned i, j;
- char *ret;
- char *ptr;
- size_t len;
- size_t line_len;
- size_t sep_len;
- const char *abi;
-
- memset (opts, '\0', sizeof (opts));
+ machine_mode mode = TYPE_MODE (type);
- /* Add -march= option. */
- if (arch)
+ if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
{
- opts[num][0] = "-march=";
- opts[num++][1] = arch;
- }
+ HOST_WIDE_INT size = int_size_in_bytes (type);
+ if ((size == 8 || size == 16 || size == 32 || size == 64)
+ /* ??? Generic code allows us to create width 1 vectors. Ignore. */
+ && TYPE_VECTOR_SUBPARTS (type) > 1)
+ {
+ machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
- /* Add -mtune= option. */
- if (tune)
- {
- opts[num][0] = "-mtune=";
- opts[num++][1] = tune;
- }
+ /* There are no XFmode vector modes. */
+ if (innermode == XFmode)
+ return mode;
- /* Add -m32/-m64/-mx32. */
- if (add_abi_p)
- {
- if ((isa & OPTION_MASK_ISA_64BIT) != 0)
- {
- if ((isa & OPTION_MASK_ABI_64) != 0)
- abi = "-m64";
+ if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
+ mode = MIN_MODE_VECTOR_FLOAT;
else
- abi = "-mx32";
- }
- else
- abi = "-m32";
- opts[num++][0] = abi;
- }
- isa &= ~(OPTION_MASK_ISA_64BIT | OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
+ mode = MIN_MODE_VECTOR_INT;
- /* Pick out the options in isa2 options. */
- for (i = 0; i < ARRAY_SIZE (isa2_opts); i++)
- {
- if ((isa2 & isa2_opts[i].mask) != 0)
- {
- opts[num++][0] = isa2_opts[i].option;
- isa2 &= ~ isa2_opts[i].mask;
- }
- }
-
- if (isa2 && add_nl_p)
- {
- opts[num++][0] = isa2_other;
- sprintf (isa2_other, "(other isa2: %#" HOST_WIDE_INT_PRINT "x)", isa2);
- }
-
- /* Pick out the options in isa options. */
- for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
- {
- if ((isa & isa_opts[i].mask) != 0)
- {
- opts[num++][0] = isa_opts[i].option;
- isa &= ~ isa_opts[i].mask;
- }
- }
-
- if (isa && add_nl_p)
- {
- opts[num++][0] = isa_other;
- sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)", isa);
- }
-
- /* Add flag options. */
- for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
- {
- if ((flags & flag_opts[i].mask) != 0)
- {
- opts[num++][0] = flag_opts[i].option;
- flags &= ~ flag_opts[i].mask;
- }
- }
+ /* Get the mode which has this inner mode and number of units. */
+ FOR_EACH_MODE_FROM (mode, mode)
+ if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
+ && GET_MODE_INNER (mode) == innermode)
+ {
+ if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
+ {
+ static bool warnedavx512f;
+ static bool warnedavx512f_ret;
- if (flags && add_nl_p)
- {
- opts[num++][0] = flags_other;
- sprintf (flags_other, "(other flags: %#x)", flags);
- }
+ if (cum && cum->warn_avx512f && !warnedavx512f)
+ {
+ if (warning (OPT_Wpsabi, "AVX512F vector argument "
+ "without AVX512F enabled changes the ABI"))
+ warnedavx512f = true;
+ }
+ else if (in_return && !warnedavx512f_ret)
+ {
+ if (warning (OPT_Wpsabi, "AVX512F vector return "
+ "without AVX512F enabled changes the ABI"))
+ warnedavx512f_ret = true;
+ }
- /* Add additional flag options. */
- for (i = 0; i < ARRAY_SIZE (flag2_opts); i++)
- {
- if ((flags2 & flag2_opts[i].mask) != 0)
- {
- opts[num++][0] = flag2_opts[i].option;
- flags2 &= ~ flag2_opts[i].mask;
- }
- }
+ return TYPE_MODE (type);
+ }
+ else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
+ {
+ static bool warnedavx;
+ static bool warnedavx_ret;
- if (flags2 && add_nl_p)
- {
- opts[num++][0] = flags2_other;
- sprintf (flags2_other, "(other flags2: %#x)", flags2);
- }
+ if (cum && cum->warn_avx && !warnedavx)
+ {
+ if (warning (OPT_Wpsabi, "AVX vector argument "
+ "without AVX enabled changes the ABI"))
+ warnedavx = true;
+ }
+ else if (in_return && !warnedavx_ret)
+ {
+ if (warning (OPT_Wpsabi, "AVX vector return "
+ "without AVX enabled changes the ABI"))
+ warnedavx_ret = true;
+ }
- /* Add -fpmath= option. */
- if (fpmath)
- {
- opts[num][0] = "-mfpmath=";
- switch ((int) fpmath)
- {
- case FPMATH_387:
- opts[num++][1] = "387";
- break;
+ return TYPE_MODE (type);
+ }
+ else if (((size == 8 && TARGET_64BIT) || size == 16)
+ && !TARGET_SSE
+ && !TARGET_IAMCU)
+ {
+ static bool warnedsse;
+ static bool warnedsse_ret;
- case FPMATH_SSE:
- opts[num++][1] = "sse";
- break;
+ if (cum && cum->warn_sse && !warnedsse)
+ {
+ if (warning (OPT_Wpsabi, "SSE vector argument "
+ "without SSE enabled changes the ABI"))
+ warnedsse = true;
+ }
+ else if (!TARGET_64BIT && in_return && !warnedsse_ret)
+ {
+ if (warning (OPT_Wpsabi, "SSE vector return "
+ "without SSE enabled changes the ABI"))
+ warnedsse_ret = true;
+ }
+ }
+ else if ((size == 8 && !TARGET_64BIT)
+ && (!cfun
+ || cfun->machine->func_type == TYPE_NORMAL)
+ && !TARGET_MMX
+ && !TARGET_IAMCU)
+ {
+ static bool warnedmmx;
+ static bool warnedmmx_ret;
- case FPMATH_387 | FPMATH_SSE:
- opts[num++][1] = "sse+387";
- break;
+ if (cum && cum->warn_mmx && !warnedmmx)
+ {
+ if (warning (OPT_Wpsabi, "MMX vector argument "
+ "without MMX enabled changes the ABI"))
+ warnedmmx = true;
+ }
+ else if (in_return && !warnedmmx_ret)
+ {
+ if (warning (OPT_Wpsabi, "MMX vector return "
+ "without MMX enabled changes the ABI"))
+ warnedmmx_ret = true;
+ }
+ }
+ return mode;
+ }
- default:
gcc_unreachable ();
}
}
- /* Any options? */
- if (num == 0)
- return NULL;
-
- gcc_assert (num < ARRAY_SIZE (opts));
-
- /* Size the string. */
- len = 0;
- sep_len = (add_nl_p) ? 3 : 1;
- for (i = 0; i < num; i++)
- {
- len += sep_len;
- for (j = 0; j < 2; j++)
- if (opts[i][j])
- len += strlen (opts[i][j]);
- }
-
- /* Build the string. */
- ret = ptr = (char *) xmalloc (len);
- line_len = 0;
-
- for (i = 0; i < num; i++)
- {
- size_t len2[2];
-
- for (j = 0; j < 2; j++)
- len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
-
- if (i != 0)
- {
- *ptr++ = ' ';
- line_len++;
-
- if (add_nl_p && line_len + len2[0] + len2[1] > 70)
- {
- *ptr++ = '\\';
- *ptr++ = '\n';
- line_len = 0;
- }
- }
-
- for (j = 0; j < 2; j++)
- if (opts[i][j])
- {
- memcpy (ptr, opts[i][j], len2[j]);
- ptr += len2[j];
- line_len += len2[j];
- }
- }
-
- *ptr = '\0';
- gcc_assert (ret + len >= ptr);
-
- return ret;
+ return mode;
}
-/* Return true, if profiling code should be emitted before
- prologue. Otherwise it returns false.
- Note: For x86 with "hotfix" it is sorried. */
-static bool
-ix86_profile_before_prologue (void)
-{
- return flag_fentry != 0;
-}
+/* We want to pass a value in REGNO whose "natural" mode is MODE. However,
+ this may not agree with the mode that the type system has chosen for the
+ register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
+ go ahead and use it. Otherwise we have to build a PARALLEL instead. */
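+
+/* Added note: the PARALLEL built below is a one-element parallel that
+ places the whole MODE-sized value in hard register REGNO at byte offset
+ 0, which is the standard RTL way of describing a BLKmode value that
+ lives in a register. */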
-/* Function that is callable from the debugger to print the current
- options. */
-void ATTRIBUTE_UNUSED
-ix86_debug_options (void)
+static rtx
+gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
+ unsigned int regno)
{
- char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2,
- target_flags, ix86_target_flags,
- ix86_arch_string,ix86_tune_string,
- ix86_fpmath, true, true);
+ rtx tmp;
- if (opts)
+ if (orig_mode != BLKmode)
+ tmp = gen_rtx_REG (orig_mode, regno);
+ else
{
- fprintf (stderr, "%s\n\n", opts);
- free (opts);
+ tmp = gen_rtx_REG (mode, regno);
+ tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
+ tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
}
- else
- fputs ("<no options>\n\n", stderr);
- return;
+ return tmp;
}
-static const char *stringop_alg_names[] = {
-#define DEF_ENUM
-#define DEF_ALG(alg, name) #name,
-#include "stringop.def"
-#undef DEF_ENUM
-#undef DEF_ALG
-};
+/* x86-64 register passing implementation. See the x86-64 PS ABI for
+ details. The goal of this code is to classify each eightbyte of an
+ incoming argument by register class and assign registers accordingly. */
-/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
- The string is of the following form (or comma separated list of it):
+/* Return the union class of CLASS1 and CLASS2.
+ See the x86-64 PS ABI for details. */
- strategy_alg:max_size:[align|noalign]
+static enum x86_64_reg_class
+merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
+{
+ /* Rule #1: If both classes are equal, this is the resulting class. */
+ if (class1 == class2)
+ return class1;
- where the full size range for the strategy is either [0, max_size] or
- [min_size, max_size], in which min_size is the max_size + 1 of the
- preceding range. The last size range must have max_size == -1.
+ /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
+ the other class. */
+ if (class1 == X86_64_NO_CLASS)
+ return class2;
+ if (class2 == X86_64_NO_CLASS)
+ return class1;
- Examples:
+ /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
+ if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
+ return X86_64_MEMORY_CLASS;
- 1.
- -mmemcpy-strategy=libcall:-1:noalign
+ /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
+ if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
+ || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
+ return X86_64_INTEGERSI_CLASS;
+ if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
+ || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
+ return X86_64_INTEGER_CLASS;
- this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
+ /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
+ MEMORY is used. */
+ if (class1 == X86_64_X87_CLASS
+ || class1 == X86_64_X87UP_CLASS
+ || class1 == X86_64_COMPLEX_X87_CLASS
+ || class2 == X86_64_X87_CLASS
+ || class2 == X86_64_X87UP_CLASS
+ || class2 == X86_64_COMPLEX_X87_CLASS)
+ return X86_64_MEMORY_CLASS;
+ /* Rule #6: Otherwise class SSE is used. */
+ return X86_64_SSE_CLASS;
+}
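+
+/* Examples (illustrative): merging X86_64_INTEGERSI_CLASS with
+ X86_64_SSESF_CLASS gives X86_64_INTEGERSI_CLASS (the special case under
+ rule #4), while merging X86_64_X87_CLASS with X86_64_SSE_CLASS gives
+ X86_64_MEMORY_CLASS by rule #5. */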
- 2.
- -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
+/* Classify the argument of type TYPE and mode MODE.
+ CLASSES will be filled by the register class used to pass each word
+ of the operand. The number of words is returned. In case the parameter
+ should be passed in memory, 0 is returned. As a special case for
+ zero-sized containers, classes[0] will be NO_CLASS and 1 is returned.
- This is to tell the compiler to use the following strategy for memset
- 1) when the expected size is between [1, 16], use rep_8byte strategy;
- 2) when the size is between [17, 2048], use vector_loop;
- 3) when the size is > 2048, use libcall. */
+ BIT_OFFSET is used internally for handling records; it gives the offset
+ of the sub-object in bits, taken modulo 512 to avoid overflow.
-struct stringop_size_range
-{
- int max;
- stringop_alg alg;
- bool noalign;
-};
+ See the x86-64 PS ABI for details. */
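+
+/* Worked example (illustrative): struct { double d; int i; } spans two
+ eightbytes; the first is classified SSE (the double) and the second
+ INTEGER (the int), so the struct is passed in one SSE register and one
+ general-purpose register. */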
-static void
-ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
+static int
+classify_argument (machine_mode mode, const_tree type,
+ enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
{
- const struct stringop_algs *default_algs;
- stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
- char *curr_range_str, *next_range_str;
- const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=";
- int i = 0, n = 0;
+ HOST_WIDE_INT bytes
+ = mode == BLKmode ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
+ int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
- if (is_memset)
- default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
- else
- default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
+ /* Variable-sized entities are always passed/returned in memory. */
+ if (bytes < 0)
+ return 0;
- curr_range_str = strategy_str;
+ if (mode != VOIDmode
+ && targetm.calls.must_pass_in_stack (mode, type))
+ return 0;
- do
+ if (type && AGGREGATE_TYPE_P (type))
{
- int maxs;
- char alg_name[128];
- char align[16];
- next_range_str = strchr (curr_range_str, ',');
- if (next_range_str)
- *next_range_str++ = '\0';
-
- if (sscanf (curr_range_str, "%20[^:]:%d:%10s", alg_name, &maxs,
- align) != 3)
- {
- error ("wrong argument %qs to option %qs", curr_range_str, opt);
- return;
- }
-
- if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
- {
- error ("size ranges of option %qs should be increasing", opt);
- return;
- }
+ int i;
+ tree field;
+ enum x86_64_reg_class subclasses[MAX_CLASSES];
- for (i = 0; i < last_alg; i++)
- if (!strcmp (alg_name, stringop_alg_names[i]))
- break;
+ /* On x86-64 we pass structures larger than 64 bytes on the stack. */
+ if (bytes > 64)
+ return 0;
- if (i == last_alg)
- {
- error ("wrong strategy name %qs specified for option %qs",
- alg_name, opt);
-
- auto_vec <const char *> candidates;
- for (i = 0; i < last_alg; i++)
- if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT)
- candidates.safe_push (stringop_alg_names[i]);
-
- char *s;
- const char *hint
- = candidates_list_and_hint (alg_name, s, candidates);
- if (hint)
- inform (input_location,
- "valid arguments to %qs are: %s; did you mean %qs?",
- opt, s, hint);
- else
- inform (input_location, "valid arguments to %qs are: %s",
- opt, s);
- XDELETEVEC (s);
- return;
- }
+ for (i = 0; i < words; i++)
+ classes[i] = X86_64_NO_CLASS;
- if ((stringop_alg) i == rep_prefix_8_byte
- && !TARGET_64BIT)
+ /* Zero-sized arrays or structures are NO_CLASS. Returning 0 would
+ signal the memory class, so handle them as a special case. */
+ if (!words)
{
- /* rep; movq isn't available in 32-bit code. */
- error ("strategy name %qs specified for option %qs "
- "not supported for 32-bit code", alg_name, opt);
- return;
+ classes[0] = X86_64_NO_CLASS;
+ return 1;
}
- input_ranges[n].max = maxs;
- input_ranges[n].alg = (stringop_alg) i;
- if (!strcmp (align, "align"))
- input_ranges[n].noalign = false;
- else if (!strcmp (align, "noalign"))
- input_ranges[n].noalign = true;
- else
- {
- error ("unknown alignment %qs specified for option %qs", align, opt);
- return;
- }
- n++;
- curr_range_str = next_range_str;
- }
- while (curr_range_str);
-
- if (input_ranges[n - 1].max != -1)
- {
- error ("the max value for the last size range should be -1"
- " for option %qs", opt);
- return;
- }
-
- if (n > MAX_STRINGOP_ALGS)
- {
- error ("too many size ranges specified in option %qs", opt);
- return;
- }
-
- /* Now override the default algs array. */
- for (i = 0; i < n; i++)
- {
- *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
- *const_cast<stringop_alg *>(&default_algs->size[i].alg)
- = input_ranges[i].alg;
- *const_cast<int *>(&default_algs->size[i].noalign)
- = input_ranges[i].noalign;
- }
-}
-
-\f
-/* parse -mtune-ctrl= option. When DUMP is true,
- print the features that are explicitly set. */
+ /* Classify each field of record and merge classes. */
+ switch (TREE_CODE (type))
+ {
+ case RECORD_TYPE:
+ /* And now merge the fields of the structure. */
+ for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
+ {
+ if (TREE_CODE (field) == FIELD_DECL)
+ {
+ int num;
-static void
-parse_mtune_ctrl_str (bool dump)
-{
- if (!ix86_tune_ctrl_string)
- return;
+ if (TREE_TYPE (field) == error_mark_node)
+ continue;
- char *next_feature_string = NULL;
- char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
- char *orig = curr_feature_string;
- int i;
- do
- {
- bool clear = false;
+ /* Bitfields are always classified as integer. Handle them
+ early, since later code would consider them to be
+ misaligned integers. */
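+ /* Added note: dividing a bit position by 8 and then by 8 again
+ converts bits to bytes and bytes to eightbytes, i.e. the index of
+ the 64-bit chunk being classified. */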
+ if (DECL_BIT_FIELD (field))
+ {
+ for (i = (int_bit_position (field)
+ + (bit_offset % 64)) / 8 / 8;
+ i < ((int_bit_position (field) + (bit_offset % 64))
+ + tree_to_shwi (DECL_SIZE (field))
+ + 63) / 8 / 8; i++)
+ classes[i]
+ = merge_classes (X86_64_INTEGER_CLASS, classes[i]);
+ }
+ else
+ {
+ int pos;
- next_feature_string = strchr (curr_feature_string, ',');
- if (next_feature_string)
- *next_feature_string++ = '\0';
- if (*curr_feature_string == '^')
- {
- curr_feature_string++;
- clear = true;
- }
- for (i = 0; i < X86_TUNE_LAST; i++)
- {
- if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
- {
- ix86_tune_features[i] = !clear;
- if (dump)
- fprintf (stderr, "Explicitly %s feature %s\n",
- clear ? "clear" : "set", ix86_tune_feature_names[i]);
- break;
- }
- }
- if (i == X86_TUNE_LAST)
- error ("unknown parameter to option %<-mtune-ctrl%>: %s",
- clear ? curr_feature_string - 1 : curr_feature_string);
- curr_feature_string = next_feature_string;
- }
- while (curr_feature_string);
- free (orig);
-}
+ type = TREE_TYPE (field);
-/* Helper function to set ix86_tune_features. IX86_TUNE is the
- processor type. */
+ /* A flexible array member is ignored. */
+ if (TYPE_MODE (type) == BLKmode
+ && TREE_CODE (type) == ARRAY_TYPE
+ && TYPE_SIZE (type) == NULL_TREE
+ && TYPE_DOMAIN (type) != NULL_TREE
+ && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
+ == NULL_TREE))
+ {
+ static bool warned;
-static void
-set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
-{
- unsigned HOST_WIDE_INT ix86_tune_mask = HOST_WIDE_INT_1U << ix86_tune;
- int i;
+ if (!warned && warn_psabi)
+ {
+ warned = true;
+ inform (input_location,
+ "the ABI of passing struct with"
+ " a flexible array member has"
+ " changed in GCC 4.4");
+ }
+ continue;
+ }
+ num = classify_argument (TYPE_MODE (type), type,
+ subclasses,
+ (int_bit_position (field)
+ + bit_offset) % 512);
+ if (!num)
+ return 0;
+ pos = (int_bit_position (field)
+ + (bit_offset % 64)) / 8 / 8;
+ for (i = 0; i < num && (i + pos) < words; i++)
+ classes[i + pos]
+ = merge_classes (subclasses[i], classes[i + pos]);
+ }
+ }
+ }
+ break;
- for (i = 0; i < X86_TUNE_LAST; ++i)
- {
- if (ix86_tune_no_default)
- ix86_tune_features[i] = 0;
- else
- ix86_tune_features[i]
- = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
- }
+ case ARRAY_TYPE:
+ /* Arrays are handled as small records. */
+ {
+ int num;
+ num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
+ TREE_TYPE (type), subclasses, bit_offset);
+ if (!num)
+ return 0;
- if (dump)
- {
- fprintf (stderr, "List of x86 specific tuning parameter names:\n");
- for (i = 0; i < X86_TUNE_LAST; i++)
- fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
- ix86_tune_features[i] ? "on" : "off");
- }
+ /* The partial classes are now full classes. */
+ if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
+ subclasses[0] = X86_64_SSE_CLASS;
+ if (subclasses[0] == X86_64_INTEGERSI_CLASS
+ && !((bit_offset % 64) == 0 && bytes == 4))
+ subclasses[0] = X86_64_INTEGER_CLASS;
- parse_mtune_ctrl_str (dump);
-}
+ for (i = 0; i < words; i++)
+ classes[i] = subclasses[i % num];
+ break;
+ }
+ case UNION_TYPE:
+ case QUAL_UNION_TYPE:
+ /* Unions are similar to RECORD_TYPE but the offset is always 0. */
+ for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
+ {
+ if (TREE_CODE (field) == FIELD_DECL)
+ {
+ int num;
-/* Default align_* from the processor table. */
+ if (TREE_TYPE (field) == error_mark_node)
+ continue;
-static void
-ix86_default_align (struct gcc_options *opts)
-{
- /* -falign-foo without argument: supply one. */
- if (opts->x_flag_align_loops && !opts->x_str_align_loops)
- opts->x_str_align_loops = processor_cost_table[ix86_tune]->align_loop;
- if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
- opts->x_str_align_jumps = processor_cost_table[ix86_tune]->align_jump;
- if (opts->x_flag_align_labels && !opts->x_str_align_labels)
- opts->x_str_align_labels = processor_cost_table[ix86_tune]->align_label;
- if (opts->x_flag_align_functions && !opts->x_str_align_functions)
- opts->x_str_align_functions = processor_cost_table[ix86_tune]->align_func;
-}
+ num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
+ TREE_TYPE (field), subclasses,
+ bit_offset);
+ if (!num)
+ return 0;
+ for (i = 0; i < num && i < words; i++)
+ classes[i] = merge_classes (subclasses[i], classes[i]);
+ }
+ }
+ break;
-/* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */
+ default:
+ gcc_unreachable ();
+ }
-static void
-ix86_override_options_after_change (void)
-{
- ix86_default_align (&global_options);
-}
+ if (words > 2)
+ {
+ /* When the size is larger than 16 bytes, if the first class isn't
+ X86_64_SSE_CLASS or any of the remaining classes isn't
+ X86_64_SSEUP_CLASS, everything should be passed in
+ memory. */
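+ /* For instance (illustrative), a 32-byte AVX vector classifies as SSE
+ followed by three SSEUP eightbytes and stays in a register, whereas
+ struct { __m128 a; int b; } has an integer-class eightbyte after the
+ SSE/SSEUP pair and is therefore passed in memory. */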
+ if (classes[0] != X86_64_SSE_CLASS)
+ return 0;
+ for (i = 1; i < words; i++)
+ if (classes[i] != X86_64_SSEUP_CLASS)
+ return 0;
+ }
+ /* Final merger cleanup. */
+ for (i = 0; i < words; i++)
+ {
+ /* If one class is MEMORY, everything should be passed in
+ memory. */
+ if (classes[i] == X86_64_MEMORY_CLASS)
+ return 0;
-/* Override various settings based on options. If MAIN_ARGS_P, the
- options are from the command line, otherwise they are from
- attributes. Return true if there's an error related to march
- option. */
+ /* X86_64_SSEUP_CLASS should always be preceded by
+ X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
+ if (classes[i] == X86_64_SSEUP_CLASS
+ && classes[i - 1] != X86_64_SSE_CLASS
+ && classes[i - 1] != X86_64_SSEUP_CLASS)
+ {
+ /* The first one should never be X86_64_SSEUP_CLASS. */
+ gcc_assert (i != 0);
+ classes[i] = X86_64_SSE_CLASS;
+ }
-static bool
-ix86_option_override_internal (bool main_args_p,
- struct gcc_options *opts,
- struct gcc_options *opts_set)
-{
- int i;
- unsigned HOST_WIDE_INT ix86_arch_mask;
- const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
+ /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
+ everything should be passed in memory. */
+ if (classes[i] == X86_64_X87UP_CLASS
+ && (classes[i - 1] != X86_64_X87_CLASS))
+ {
+ static bool warned;
- /* -mrecip options. */
- static struct
- {
- const char *string; /* option name */
- unsigned int mask; /* mask bits to set */
+ /* The first one should never be X86_64_X87UP_CLASS. */
+ gcc_assert (i != 0);
+ if (!warned && warn_psabi)
+ {
+ warned = true;
+ inform (input_location,
+ "the ABI of passing union with long double"
+ " has changed in GCC 4.4");
+ }
+ return 0;
+ }
+ }
+ return words;
}
- const recip_options[] =
- {
- { "all", RECIP_MASK_ALL },
- { "none", RECIP_MASK_NONE },
- { "div", RECIP_MASK_DIV },
- { "sqrt", RECIP_MASK_SQRT },
- { "vec-div", RECIP_MASK_VEC_DIV },
- { "vec-sqrt", RECIP_MASK_VEC_SQRT },
- };
-
- /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
- TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
- if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
- opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
-#ifdef TARGET_BI_ARCH
- else
+ /* Compute the alignment needed. We align all types to their natural
+ boundaries, with the exception of XFmode, which is aligned to 64 bits. */
+ if (mode != VOIDmode && mode != BLKmode)
{
-#if TARGET_BI_ARCH == 1
- /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
- is on and OPTION_MASK_ABI_X32 is off. We turn off
- OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
- -mx32. */
- if (TARGET_X32_P (opts->x_ix86_isa_flags))
- opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
-#else
- /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
- on and OPTION_MASK_ABI_64 is off. We turn off
- OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
- -m64 or OPTION_MASK_CODE16 is turned on by -m16. */
- if (TARGET_LP64_P (opts->x_ix86_isa_flags)
- || TARGET_16BIT_P (opts->x_ix86_isa_flags))
- opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
-#endif
- if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
- && TARGET_IAMCU_P (opts->x_target_flags))
- sorry ("Intel MCU psABI isn%'t supported in %s mode",
- TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit");
- }
-#endif
+ int mode_alignment = GET_MODE_BITSIZE (mode);
- if (TARGET_X32_P (opts->x_ix86_isa_flags))
- {
- /* Always turn on OPTION_MASK_ISA_64BIT and turn off
- OPTION_MASK_ABI_64 for TARGET_X32. */
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
- opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
- }
- else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
- opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
- | OPTION_MASK_ABI_X32
- | OPTION_MASK_ABI_64);
- else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
- {
- /* Always turn on OPTION_MASK_ISA_64BIT and turn off
- OPTION_MASK_ABI_X32 for TARGET_LP64. */
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
- opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
+ if (mode == XFmode)
+ mode_alignment = 128;
+ else if (mode == XCmode)
+ mode_alignment = 256;
+ if (COMPLEX_MODE_P (mode))
+ mode_alignment /= 2;
+ /* Misaligned fields are always returned in memory. */
+ if (bit_offset % mode_alignment)
+ return 0;
}
-#ifdef SUBTARGET_OVERRIDE_OPTIONS
- SUBTARGET_OVERRIDE_OPTIONS;
-#endif
-
-#ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
- SUBSUBTARGET_OVERRIDE_OPTIONS;
-#endif
-
- /* -fPIC is the default for x86_64. */
- if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
- opts->x_flag_pic = 2;
+ /* For V1xx modes, just use the base mode. */
+ if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
+ && GET_MODE_UNIT_SIZE (mode) == bytes)
+ mode = GET_MODE_INNER (mode);
- /* Need to check -mtune=generic first. */
- if (opts->x_ix86_tune_string)
- {
- /* As special support for cross compilers we read -mtune=native
- as -mtune=generic. With native compilers we won't see the
- -mtune=native, as it was changed by the driver. */
- if (!strcmp (opts->x_ix86_tune_string, "native"))
- {
- opts->x_ix86_tune_string = "generic";
- }
- else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
- warning (OPT_Wdeprecated,
- main_args_p
- ? G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> "
- "or %<-mtune=generic%> instead as appropriate")
- : G_("%<target(\"tune=x86-64\")%> is deprecated; use "
- "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%>"
- " instead as appropriate"));
- }
- else
+ /* Classification of atomic types. */
+ switch (mode)
{
- if (opts->x_ix86_arch_string)
- opts->x_ix86_tune_string = opts->x_ix86_arch_string;
- if (!opts->x_ix86_tune_string)
- {
- opts->x_ix86_tune_string = processor_names[TARGET_CPU_DEFAULT];
- ix86_tune_defaulted = 1;
- }
+ case E_SDmode:
+ case E_DDmode:
+ classes[0] = X86_64_SSE_CLASS;
+ return 1;
+ case E_TDmode:
+ classes[0] = X86_64_SSE_CLASS;
+ classes[1] = X86_64_SSEUP_CLASS;
+ return 2;
+ case E_DImode:
+ case E_SImode:
+ case E_HImode:
+ case E_QImode:
+ case E_CSImode:
+ case E_CHImode:
+ case E_CQImode:
+ {
+ int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
- /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
- or defaulted. We need to use a sensible tune option. */
- if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
- {
- opts->x_ix86_tune_string = "generic";
- }
- }
-
- if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
- && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
- {
- /* rep; movq isn't available in 32-bit code. */
- error ("%<-mstringop-strategy=rep_8byte%> not supported for 32-bit code");
- opts->x_ix86_stringop_alg = no_stringop;
- }
-
- if (!opts->x_ix86_arch_string)
- opts->x_ix86_arch_string
- = TARGET_64BIT_P (opts->x_ix86_isa_flags)
- ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
- else
- ix86_arch_specified = 1;
-
- if (opts_set->x_ix86_pmode)
- {
- if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
- && opts->x_ix86_pmode == PMODE_SI)
- || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
- && opts->x_ix86_pmode == PMODE_DI))
- error ("address mode %qs not supported in the %s bit mode",
- TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
- TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
- }
- else
- opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
- ? PMODE_DI : PMODE_SI;
-
- if (!opts_set->x_ix86_abi)
- opts->x_ix86_abi = DEFAULT_ABI;
-
- if (opts->x_ix86_abi == MS_ABI && TARGET_X32_P (opts->x_ix86_isa_flags))
- error ("%<-mabi=ms%> not supported with X32 ABI");
- gcc_assert (opts->x_ix86_abi == SYSV_ABI || opts->x_ix86_abi == MS_ABI);
-
- if ((opts->x_flag_sanitize & SANITIZE_USER_ADDRESS) && opts->x_ix86_abi == MS_ABI)
- error ("%<-mabi=ms%> not supported with %<-fsanitize=address%>");
- if ((opts->x_flag_sanitize & SANITIZE_KERNEL_ADDRESS) && opts->x_ix86_abi == MS_ABI)
- error ("%<-mabi=ms%> not supported with %<-fsanitize=kernel-address%>");
- if ((opts->x_flag_sanitize & SANITIZE_THREAD) && opts->x_ix86_abi == MS_ABI)
- error ("%<-mabi=ms%> not supported with %<-fsanitize=thread%>");
-
- /* For targets using ms ABI enable ms-extensions, if not
- explicit turned off. For non-ms ABI we turn off this
- option. */
- if (!opts_set->x_flag_ms_extensions)
- opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
-
- if (opts_set->x_ix86_cmodel)
- {
- switch (opts->x_ix86_cmodel)
- {
- case CM_SMALL:
- case CM_SMALL_PIC:
- if (opts->x_flag_pic)
- opts->x_ix86_cmodel = CM_SMALL_PIC;
- if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
- error ("code model %qs not supported in the %s bit mode",
- "small", "32");
- break;
-
- case CM_MEDIUM:
- case CM_MEDIUM_PIC:
- if (opts->x_flag_pic)
- opts->x_ix86_cmodel = CM_MEDIUM_PIC;
- if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
- error ("code model %qs not supported in the %s bit mode",
- "medium", "32");
- else if (TARGET_X32_P (opts->x_ix86_isa_flags))
- error ("code model %qs not supported in x32 mode",
- "medium");
- break;
-
- case CM_LARGE:
- case CM_LARGE_PIC:
- if (opts->x_flag_pic)
- opts->x_ix86_cmodel = CM_LARGE_PIC;
- if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
- error ("code model %qs not supported in the %s bit mode",
- "large", "32");
- else if (TARGET_X32_P (opts->x_ix86_isa_flags))
- error ("code model %qs not supported in x32 mode",
- "large");
- break;
-
- case CM_32:
- if (opts->x_flag_pic)
- error ("code model %s does not support PIC mode", "32");
- if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
- error ("code model %qs not supported in the %s bit mode",
- "32", "64");
- break;
-
- case CM_KERNEL:
- if (opts->x_flag_pic)
- {
- error ("code model %s does not support PIC mode", "kernel");
- opts->x_ix86_cmodel = CM_32;
- }
- if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
- error ("code model %qs not supported in the %s bit mode",
- "kernel", "32");
- break;
-
- default:
- gcc_unreachable ();
- }
- }
- else
- {
- /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
- use of rip-relative addressing. This eliminates fixups that
- would otherwise be needed if this object is to be placed in a
- DLL, and is essentially just as efficient as direct addressing. */
- if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
- && (TARGET_RDOS || TARGET_PECOFF))
- opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
- else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
- opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
- else
- opts->x_ix86_cmodel = CM_32;
- }
- if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
- {
- error ("%<-masm=intel%> not supported in this configuration");
- opts->x_ix86_asm_dialect = ASM_ATT;
- }
- if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
- != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
- sorry ("%i-bit mode not compiled in",
- (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
+ /* Analyze last 128 bits only. */
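+ /* Added note: (size - 1) & 0x7f is the zero-based position of the
+ value's last bit within its 16-byte window; the checks below then
+ pick INTEGERSI for an eightbyte when the value ends in its low four
+ bytes and INTEGER otherwise. */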
+ size = (size - 1) & 0x7f;
- for (i = 0; i < pta_size; i++)
- if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
- {
- if (!strcmp (opts->x_ix86_arch_string, "generic"))
+ if (size < 32)
{
- error (main_args_p
- ? G_("%<generic%> CPU can be used only for %<-mtune=%> "
- "switch")
- : G_("%<generic%> CPU can be used only for "
- "%<target(\"tune=\")%> attribute"));
- return false;
+ classes[0] = X86_64_INTEGERSI_CLASS;
+ return 1;
}
- else if (!strcmp (opts->x_ix86_arch_string, "intel"))
+ else if (size < 64)
{
- error (main_args_p
- ? G_("%<intel%> CPU can be used only for %<-mtune=%> "
- "switch")
- : G_("%<intel%> CPU can be used only for "
- "%<target(\"tune=\")%> attribute"));
- return false;
+ classes[0] = X86_64_INTEGER_CLASS;
+ return 1;
}
-
- if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
- && !((processor_alias_table[i].flags & PTA_64BIT) != 0))
+ else if (size < 64+32)
{
- error ("CPU you selected does not support x86-64 "
- "instruction set");
- return false;
+ classes[0] = X86_64_INTEGER_CLASS;
+ classes[1] = X86_64_INTEGERSI_CLASS;
+ return 2;
}
-
- ix86_schedule = processor_alias_table[i].schedule;
- ix86_arch = processor_alias_table[i].processor;
- /* Default cpu tuning to the architecture. */
- ix86_tune = ix86_arch;
-
- if (((processor_alias_table[i].flags & PTA_MMX) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
- if (((processor_alias_table[i].flags & PTA_3DNOW) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
- if (((processor_alias_table[i].flags & PTA_3DNOW_A) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
- if (((processor_alias_table[i].flags & PTA_SSE) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
- if (((processor_alias_table[i].flags & PTA_SSE2) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
- if (((processor_alias_table[i].flags & PTA_SSE3) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
- if (((processor_alias_table[i].flags & PTA_SSSE3) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
- if (((processor_alias_table[i].flags & PTA_SSE4_1) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
- if (((processor_alias_table[i].flags & PTA_SSE4_2) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
- if (((processor_alias_table[i].flags & PTA_AVX) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
- if (((processor_alias_table[i].flags & PTA_AVX2) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
- if (((processor_alias_table[i].flags & PTA_FMA) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
- if (((processor_alias_table[i].flags & PTA_SSE4A) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
- if (((processor_alias_table[i].flags & PTA_FMA4) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
- if (((processor_alias_table[i].flags & PTA_XOP) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
- if (((processor_alias_table[i].flags & PTA_LWP) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
- if (((processor_alias_table[i].flags & PTA_ABM) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
- if (((processor_alias_table[i].flags & PTA_BMI) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
- if (((processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
- if (((processor_alias_table[i].flags & PTA_TBM) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
- if (((processor_alias_table[i].flags & PTA_BMI2) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
- if (((processor_alias_table[i].flags & PTA_CX16) != 0)
- && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CX16))
- opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CX16;
- if (((processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
- if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
- && ((processor_alias_table[i].flags & PTA_NO_SAHF) != 0))
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
- if (((processor_alias_table[i].flags & PTA_MOVBE) != 0)
- && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MOVBE))
- opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MOVBE;
- if (((processor_alias_table[i].flags & PTA_AES) != 0)
- && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
- ix86_isa_flags |= OPTION_MASK_ISA_AES;
- if (((processor_alias_table[i].flags & PTA_SHA) != 0)
- && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
- ix86_isa_flags |= OPTION_MASK_ISA_SHA;
- if (((processor_alias_table[i].flags & PTA_PCLMUL) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
- if (((processor_alias_table[i].flags & PTA_FSGSBASE) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
- if (((processor_alias_table[i].flags & PTA_RDRND) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
- if (((processor_alias_table[i].flags & PTA_F16C) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
- if (((processor_alias_table[i].flags & PTA_RTM) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
- if (((processor_alias_table[i].flags & PTA_HLE) != 0)
- && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_HLE))
- opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_HLE;
- if (((processor_alias_table[i].flags & PTA_PRFCHW) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
- if (((processor_alias_table[i].flags & PTA_RDSEED) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
- if (((processor_alias_table[i].flags & PTA_ADX) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
- if (((processor_alias_table[i].flags & PTA_FXSR) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
- if (((processor_alias_table[i].flags & PTA_XSAVE) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
- if (((processor_alias_table[i].flags & PTA_XSAVEOPT) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
- if (((processor_alias_table[i].flags & PTA_AVX512F) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
- if (((processor_alias_table[i].flags & PTA_AVX512ER) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
- if (((processor_alias_table[i].flags & PTA_AVX512PF) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
- if (((processor_alias_table[i].flags & PTA_AVX512CD) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
- if (((processor_alias_table[i].flags & PTA_PREFETCHWT1) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
- if (((processor_alias_table[i].flags & PTA_CLWB) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB;
- if (((processor_alias_table[i].flags & PTA_CLFLUSHOPT) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT;
- if (((processor_alias_table[i].flags & PTA_CLZERO) != 0)
- && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CLZERO))
- opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CLZERO;
- if (((processor_alias_table[i].flags & PTA_XSAVEC) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC;
- if (((processor_alias_table[i].flags & PTA_XSAVES) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES;
- if (((processor_alias_table[i].flags & PTA_AVX512DQ) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ;
- if (((processor_alias_table[i].flags & PTA_AVX512BW) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW;
- if (((processor_alias_table[i].flags & PTA_AVX512VL) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL;
- if (((processor_alias_table[i].flags & PTA_AVX512VBMI) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI;
- if (((processor_alias_table[i].flags & PTA_AVX512IFMA) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
- if (((processor_alias_table[i].flags & PTA_AVX512VNNI) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VNNI))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VNNI;
- if (((processor_alias_table[i].flags & PTA_GFNI) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_GFNI))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_GFNI;
- if (((processor_alias_table[i].flags & PTA_AVX512VBMI2) != 0)
- && !(opts->x_ix86_isa_flags_explicit
- & OPTION_MASK_ISA_AVX512VBMI2))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI2;
- if (((processor_alias_table[i].flags & PTA_VPCLMULQDQ) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_VPCLMULQDQ))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_VPCLMULQDQ;
- if (((processor_alias_table[i].flags & PTA_AVX512BITALG) != 0)
- && !(opts->x_ix86_isa_flags_explicit
- & OPTION_MASK_ISA_AVX512BITALG))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BITALG;
-
- if (((processor_alias_table[i].flags & PTA_AVX5124VNNIW) != 0)
- && !(opts->x_ix86_isa_flags2_explicit
- & OPTION_MASK_ISA_AVX5124VNNIW))
- opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW;
- if (((processor_alias_table[i].flags & PTA_AVX5124FMAPS) != 0)
- && !(opts->x_ix86_isa_flags2_explicit
- & OPTION_MASK_ISA_AVX5124FMAPS))
- opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS;
- if (((processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ) != 0)
- && !(opts->x_ix86_isa_flags_explicit
- & OPTION_MASK_ISA_AVX512VPOPCNTDQ))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VPOPCNTDQ;
- if (((processor_alias_table[i].flags & PTA_SGX) != 0)
- && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX))
- opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX;
- if (((processor_alias_table[i].flags & PTA_VAES) != 0)
- && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_VAES))
- opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_VAES;
- if (((processor_alias_table[i].flags & PTA_RDPID) != 0)
- && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_RDPID))
- opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_RDPID;
- if (((processor_alias_table[i].flags & PTA_PCONFIG) != 0)
- && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_PCONFIG))
- opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_PCONFIG;
- if (((processor_alias_table[i].flags & PTA_WBNOINVD) != 0)
- && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_WBNOINVD))
- opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_WBNOINVD;
- if (((processor_alias_table[i].flags & PTA_PTWRITE) != 0)
- && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_PTWRITE))
- opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_PTWRITE;
-
- if ((processor_alias_table[i].flags
- & (PTA_PREFETCH_SSE | PTA_SSE)) != 0)
- x86_prefetch_sse = true;
- if (((processor_alias_table[i].flags & PTA_MWAITX) != 0)
- && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MWAITX))
- opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MWAITX;
- if (((processor_alias_table[i].flags & PTA_PKU) != 0)
- && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
- opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
-
- /* Don't enable x87 instructions if only
- general registers are allowed. */
- if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
- && !(opts_set->x_target_flags & MASK_80387))
+ else if (size < 64+64)
{
- if (((processor_alias_table[i].flags & PTA_NO_80387) != 0))
- opts->x_target_flags &= ~MASK_80387;
- else
- opts->x_target_flags |= MASK_80387;
+ classes[0] = classes[1] = X86_64_INTEGER_CLASS;
+ return 2;
}
- break;
+ else
+ gcc_unreachable ();
}
+ case E_CDImode:
+ case E_TImode:
+ classes[0] = classes[1] = X86_64_INTEGER_CLASS;
+ return 2;
+ case E_COImode:
+ case E_OImode:
+ /* OImode shouldn't be used directly. */
+ gcc_unreachable ();
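+ /* Complex TImode is 32 bytes, more than two eightbytes, so it is
+ passed in memory (classification returns 0). */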
+ case E_CTImode:
+ return 0;
+ case E_SFmode:
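+ /* A float that starts an eightbyte is SSESF so it can be passed in
+ SFmode; a float in the upper half is plain SSE so that it merges
+ with the lower half into a single SSE eightbyte. */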
+ if (!(bit_offset % 64))
+ classes[0] = X86_64_SSESF_CLASS;
+ else
+ classes[0] = X86_64_SSE_CLASS;
+ return 1;
+ case E_DFmode:
+ classes[0] = X86_64_SSEDF_CLASS;
+ return 1;
+ case E_XFmode:
+ classes[0] = X86_64_X87_CLASS;
+ classes[1] = X86_64_X87UP_CLASS;
+ return 2;
+ case E_TFmode:
+ classes[0] = X86_64_SSE_CLASS;
+ classes[1] = X86_64_SSEUP_CLASS;
+ return 2;
+ case E_SCmode:
+ classes[0] = X86_64_SSE_CLASS;
+ if (!(bit_offset % 64))
+ return 1;
+ else
+ {
+ static bool warned;
- if (i == pta_size)
- {
- error (main_args_p
- ? G_("bad value (%qs) for %<-march=%> switch")
- : G_("bad value (%qs) for %<target(\"arch=\")%> attribute"),
- opts->x_ix86_arch_string);
+ if (!warned && warn_psabi)
+ {
+ warned = true;
+ inform (input_location,
+ "the ABI of passing structure with complex float"
+ " member has changed in GCC 4.4");
+ }
+ classes[1] = X86_64_SSESF_CLASS;
+ return 2;
+ }
+ case E_DCmode:
+ classes[0] = X86_64_SSEDF_CLASS;
+ classes[1] = X86_64_SSEDF_CLASS;
+ return 2;
+ case E_XCmode:
+ classes[0] = X86_64_COMPLEX_X87_CLASS;
+ return 1;
+ case E_TCmode:
+ /* This mode is larger than 16 bytes. */
+ return 0;
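+ /* 256-bit vectors occupy four eightbytes, one SSE class followed by
+ three SSEUP, so the whole value stays in a single YMM register. */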
+ case E_V8SFmode:
+ case E_V8SImode:
+ case E_V32QImode:
+ case E_V16HImode:
+ case E_V4DFmode:
+ case E_V4DImode:
+ classes[0] = X86_64_SSE_CLASS;
+ classes[1] = X86_64_SSEUP_CLASS;
+ classes[2] = X86_64_SSEUP_CLASS;
+ classes[3] = X86_64_SSEUP_CLASS;
+ return 4;
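+ /* 512-bit vectors occupy eight eightbytes, one SSE class followed by
+ seven SSEUP, i.e. a single ZMM register. */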
+ case E_V8DFmode:
+ case E_V16SFmode:
+ case E_V8DImode:
+ case E_V16SImode:
+ case E_V32HImode:
+ case E_V64QImode:
+ classes[0] = X86_64_SSE_CLASS;
+ classes[1] = X86_64_SSEUP_CLASS;
+ classes[2] = X86_64_SSEUP_CLASS;
+ classes[3] = X86_64_SSEUP_CLASS;
+ classes[4] = X86_64_SSEUP_CLASS;
+ classes[5] = X86_64_SSEUP_CLASS;
+ classes[6] = X86_64_SSEUP_CLASS;
+ classes[7] = X86_64_SSEUP_CLASS;
+ return 8;
+ case E_V4SFmode:
+ case E_V4SImode:
+ case E_V16QImode:
+ case E_V8HImode:
+ case E_V2DFmode:
+ case E_V2DImode:
+ classes[0] = X86_64_SSE_CLASS;
+ classes[1] = X86_64_SSEUP_CLASS;
+ return 2;
+ case E_V1TImode:
+ case E_V1DImode:
+ case E_V2SFmode:
+ case E_V2SImode:
+ case E_V4HImode:
+ case E_V8QImode:
+ classes[0] = X86_64_SSE_CLASS;
+ return 1;
+ case E_BLKmode:
+ case E_VOIDmode:
+ return 0;
+ default:
+ gcc_assert (VECTOR_MODE_P (mode));
- auto_vec <const char *> candidates;
- for (i = 0; i < pta_size; i++)
- if (strcmp (processor_alias_table[i].name, "generic")
- && strcmp (processor_alias_table[i].name, "intel")
- && (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
- || ((processor_alias_table[i].flags & PTA_64BIT) != 0)))
- candidates.safe_push (processor_alias_table[i].name);
+ if (bytes > 16)
+ return 0;
-#ifdef HAVE_LOCAL_CPU_DETECT
- /* Add also "native" as possible value. */
- candidates.safe_push ("native");
-#endif
+ gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
- char *s;
- const char *hint
- = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates);
- if (hint)
- inform (input_location,
- main_args_p
- ? G_("valid arguments to %<-march=%> switch are: "
- "%s; did you mean %qs?")
- : G_("valid arguments to %<target(\"arch=\")%> attribute are: "
- "%s; did you mean %qs?"), s, hint);
+ if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
+ classes[0] = X86_64_INTEGERSI_CLASS;
else
- inform (input_location,
- main_args_p
- ? G_("valid arguments to %<-march=%> switch are: %s")
- : G_("valid arguments to %<target(\"arch=\")%> attribute "
- "are: %s"), s);
- XDELETEVEC (s);
+ classes[0] = X86_64_INTEGER_CLASS;
+ classes[1] = X86_64_INTEGER_CLASS;
+ return 1 + (bytes > 8);
}
+}
+
+/* Examine the argument and set the number of registers required in each
+ class. Return true iff the parameter should be passed in memory. */
+
+static bool
+examine_argument (machine_mode mode, const_tree type, int in_return,
+ int *int_nregs, int *sse_nregs)
+{
+ enum x86_64_reg_class regclass[MAX_CLASSES];
+ int n = classify_argument (mode, type, regclass, 0);
- ix86_arch_mask = HOST_WIDE_INT_1U << ix86_arch;
- for (i = 0; i < X86_ARCH_LAST; ++i)
- ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
+ *int_nregs = 0;
+ *sse_nregs = 0;
- for (i = 0; i < pta_size; i++)
- if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
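+ /* A zero return from classify_argument means the MEMORY class:
+ the argument must be passed on the stack. */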
+ if (!n)
+ return true;
+ for (n--; n >= 0; n--)
+ switch (regclass[n])
{
- ix86_schedule = processor_alias_table[i].schedule;
- ix86_tune = processor_alias_table[i].processor;
- if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
- {
- if (!((processor_alias_table[i].flags & PTA_64BIT) != 0))
- {
- if (ix86_tune_defaulted)
- {
- opts->x_ix86_tune_string = "x86-64";
- for (i = 0; i < pta_size; i++)
- if (! strcmp (opts->x_ix86_tune_string,
- processor_alias_table[i].name))
- break;
- ix86_schedule = processor_alias_table[i].schedule;
- ix86_tune = processor_alias_table[i].processor;
- }
- else
- error ("CPU you selected does not support x86-64 "
- "instruction set");
- }
- }
- /* Intel CPUs have always interpreted SSE prefetch instructions as
- NOPs; so, we can enable SSE prefetch instructions even when
- -mtune (rather than -march) points us to a processor that has them.
- However, the VIA C3 gives a SIGILL, so we only do that for i686 and
- higher processors. */
- if (TARGET_CMOV
- && ((processor_alias_table[i].flags
- & (PTA_PREFETCH_SSE | PTA_SSE)) != 0))
- x86_prefetch_sse = true;
+ case X86_64_INTEGER_CLASS:
+ case X86_64_INTEGERSI_CLASS:
+ (*int_nregs)++;
+ break;
+ case X86_64_SSE_CLASS:
+ case X86_64_SSESF_CLASS:
+ case X86_64_SSEDF_CLASS:
+ (*sse_nregs)++;
+ break;
+ case X86_64_NO_CLASS:
+ case X86_64_SSEUP_CLASS:
+ break;
+ case X86_64_X87_CLASS:
+ case X86_64_X87UP_CLASS:
+ case X86_64_COMPLEX_X87_CLASS:
+ if (!in_return)
+ return true;
break;
+ case X86_64_MEMORY_CLASS:
+ gcc_unreachable ();
}
- if (ix86_tune_specified && i == pta_size)
- {
- error (main_args_p
- ? G_("bad value (%qs) for %<-mtune=%> switch")
- : G_("bad value (%qs) for %<target(\"tune=\")%> attribute"),
- opts->x_ix86_tune_string);
-
- auto_vec <const char *> candidates;
- for (i = 0; i < pta_size; i++)
- if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
- || ((processor_alias_table[i].flags & PTA_64BIT) != 0))
- candidates.safe_push (processor_alias_table[i].name);
-
-#ifdef HAVE_LOCAL_CPU_DETECT
- /* Add also "native" as possible value. */
- candidates.safe_push ("native");
-#endif
+ return false;
+}
- char *s;
- const char *hint
- = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates);
- if (hint)
- inform (input_location,
- main_args_p
- ? G_("valid arguments to %<-mtune=%> switch are: "
- "%s; did you mean %qs?")
- : G_("valid arguments to %<target(\"tune=\")%> attribute are: "
- "%s; did you mean %qs?"), s, hint);
- else
- inform (input_location,
- main_args_p
- ? G_("valid arguments to %<-mtune=%> switch are: %s")
- : G_("valid arguments to %<target(\"tune=\")%> attribute "
- "are: %s"), s);
- XDELETEVEC (s);
- }
+/* Construct container for the argument used by GCC interface. See
+ FUNCTION_ARG for the detailed description. */
- set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
+static rtx
+construct_container (machine_mode mode, machine_mode orig_mode,
+ const_tree type, int in_return, int nintregs, int nsseregs,
+ const int *intreg, int sse_regno)
+{
+ /* Remember which diagnostics have already been issued so each is
+ reported only once. */
+ static bool issued_sse_arg_error;
+ static bool issued_sse_ret_error;
+ static bool issued_x87_ret_error;
-#ifndef USE_IX86_FRAME_POINTER
-#define USE_IX86_FRAME_POINTER 0
-#endif
+ machine_mode tmpmode;
+ int bytes
+ = mode == BLKmode ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
+ enum x86_64_reg_class regclass[MAX_CLASSES];
+ int n;
+ int i;
+ int nexps = 0;
+ int needed_sseregs, needed_intregs;
+ rtx exp[MAX_CLASSES];
+ rtx ret;
-#ifndef USE_X86_64_FRAME_POINTER
-#define USE_X86_64_FRAME_POINTER 0
-#endif
+ n = classify_argument (mode, type, regclass, 0);
+ if (!n)
+ return NULL;
+ if (examine_argument (mode, type, in_return, &needed_intregs,
+ &needed_sseregs))
+ return NULL;
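+ /* If the argument needs more integer or SSE registers than remain
+ available, the whole argument is passed in memory. */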
+ if (needed_intregs > nintregs || needed_sseregs > nsseregs)
+ return NULL;
- /* Set the default values for switches whose default depends on TARGET_64BIT
- in case they weren't overwritten by command line options. */
- if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
- {
- if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
- opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
- if (opts->x_flag_asynchronous_unwind_tables
- && !opts_set->x_flag_unwind_tables
- && TARGET_64BIT_MS_ABI)
- opts->x_flag_unwind_tables = 1;
- if (opts->x_flag_asynchronous_unwind_tables == 2)
- opts->x_flag_unwind_tables
- = opts->x_flag_asynchronous_unwind_tables = 1;
- if (opts->x_flag_pcc_struct_return == 2)
- opts->x_flag_pcc_struct_return = 0;
- }
- else
- {
- if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
- opts->x_flag_omit_frame_pointer
- = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
- if (opts->x_flag_asynchronous_unwind_tables == 2)
- opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
- if (opts->x_flag_pcc_struct_return == 2)
- {
- /* Intel MCU psABI specifies that -freg-struct-return should
- be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1,
- we check -miamcu so that -freg-struct-return is always
- turned on if -miamcu is used. */
- if (TARGET_IAMCU_P (opts->x_target_flags))
- opts->x_flag_pcc_struct_return = 0;
- else
- opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
- }
- }
-
- ix86_tune_cost = processor_cost_table[ix86_tune];
- /* TODO: ix86_cost should be chosen at instruction or function granuality
- so for cold code we use size_cost even in !optimize_size compilation. */
- if (opts->x_optimize_size)
- ix86_cost = &ix86_size_cost;
- else
- ix86_cost = ix86_tune_cost;
-
- /* Arrange to set up i386_stack_locals for all functions. */
- init_machine_status = ix86_init_machine_status;
-
- /* Validate -mregparm= value. */
- if (opts_set->x_ix86_regparm)
+ /* We allowed the user to turn off SSE for kernel mode. Don't crash if
+ some less clueful developer tries to use floating-point anyway. */
+ if (needed_sseregs && !TARGET_SSE)
{
- if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
- warning (0, "%<-mregparm%> is ignored in 64-bit mode");
- else if (TARGET_IAMCU_P (opts->x_target_flags))
- warning (0, "%<-mregparm%> is ignored for Intel MCU psABI");
- if (opts->x_ix86_regparm > REGPARM_MAX)
+ if (in_return)
{
- error ("%<-mregparm=%d%> is not between 0 and %d",
- opts->x_ix86_regparm, REGPARM_MAX);
- opts->x_ix86_regparm = 0;
- }
- }
- if (TARGET_IAMCU_P (opts->x_target_flags)
- || TARGET_64BIT_P (opts->x_ix86_isa_flags))
- opts->x_ix86_regparm = REGPARM_MAX;
-
- /* Default align_* from the processor table. */
- ix86_default_align (opts);
-
- /* Provide default for -mbranch-cost= value. */
- if (!opts_set->x_ix86_branch_cost)
- opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost;
-
- if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
- {
- opts->x_target_flags
- |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
-
- if (!ix86_arch_specified)
- opts->x_ix86_isa_flags
- |= TARGET_SUBTARGET64_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
-
- if (TARGET_RTD_P (opts->x_target_flags))
- warning (0,
- main_args_p
- ? G_("%<-mrtd%> is ignored in 64bit mode")
- : G_("%<target(\"rtd\")%> is ignored in 64bit mode"));
- }
- else
- {
- opts->x_target_flags
- |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
-
- if (!ix86_arch_specified)
- opts->x_ix86_isa_flags
- |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
-
- /* i386 ABI does not specify red zone. It still makes sense to use it
- when programmer takes care to stack from being destroyed. */
- if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
- opts->x_target_flags |= MASK_NO_RED_ZONE;
- }
-
- /* Keep nonleaf frame pointers. */
- if (opts->x_flag_omit_frame_pointer)
- opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
- else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
- opts->x_flag_omit_frame_pointer = 1;
-
- /* If we're doing fast math, we don't care about comparison order
- wrt NaNs. This lets us use a shorter comparison sequence. */
- if (opts->x_flag_finite_math_only)
- opts->x_target_flags &= ~MASK_IEEE_FP;
-
- /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
- since the insns won't need emulation. */
- if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
- opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
-
- /* Likewise, if the target doesn't have a 387, or we've specified
- software floating point, don't use 387 inline intrinsics. */
- if (!TARGET_80387_P (opts->x_target_flags))
- opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
-
- /* Turn on MMX builtins for -msse. */
- if (TARGET_SSE_P (opts->x_ix86_isa_flags))
- opts->x_ix86_isa_flags
- |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
-
- /* Enable SSE prefetch. */
- if (TARGET_SSE_P (opts->x_ix86_isa_flags)
- || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags)
- && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))
- || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
- x86_prefetch_sse = true;
-
- /* Enable popcnt instruction for -msse4.2 or -mabm. */
- if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
- || TARGET_ABM_P (opts->x_ix86_isa_flags))
- opts->x_ix86_isa_flags
- |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
-
- /* Enable lzcnt instruction for -mabm. */
- if (TARGET_ABM_P(opts->x_ix86_isa_flags))
- opts->x_ix86_isa_flags
- |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
-
- /* Disable BMI, BMI2 and TBM instructions for -m16. */
- if (TARGET_16BIT_P(opts->x_ix86_isa_flags))
- opts->x_ix86_isa_flags
- &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM)
- & ~opts->x_ix86_isa_flags_explicit);
-
- /* Validate -mpreferred-stack-boundary= value or default it to
- PREFERRED_STACK_BOUNDARY_DEFAULT. */
- ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
- if (opts_set->x_ix86_preferred_stack_boundary_arg)
- {
- int min = TARGET_64BIT_P (opts->x_ix86_isa_flags)? 3 : 2;
- int max = TARGET_SEH ? 4 : 12;
-
- if (opts->x_ix86_preferred_stack_boundary_arg < min
- || opts->x_ix86_preferred_stack_boundary_arg > max)
- {
- if (min == max)
- error ("%<-mpreferred-stack-boundary%> is not supported "
- "for this target");
- else
- error ("%<-mpreferred-stack-boundary=%d%> is not between %d and %d",
- opts->x_ix86_preferred_stack_boundary_arg, min, max);
+ if (!issued_sse_ret_error)
+ {
+ error ("SSE register return with SSE disabled");
+ issued_sse_ret_error = true;
+ }
}
- else
- ix86_preferred_stack_boundary
- = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
- }
-
- /* Set the default value for -mstackrealign. */
- if (!opts_set->x_ix86_force_align_arg_pointer)
- opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
-
- ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
-
- /* Validate -mincoming-stack-boundary= value or default it to
- MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
- ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
- if (opts_set->x_ix86_incoming_stack_boundary_arg)
- {
- int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
-
- if (opts->x_ix86_incoming_stack_boundary_arg < min
- || opts->x_ix86_incoming_stack_boundary_arg > 12)
- error ("%<-mincoming-stack-boundary=%d%> is not between %d and 12",
- opts->x_ix86_incoming_stack_boundary_arg, min);
- else
+ else if (!issued_sse_arg_error)
{
- ix86_user_incoming_stack_boundary
- = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
- ix86_incoming_stack_boundary
- = ix86_user_incoming_stack_boundary;
+ error ("SSE register argument with SSE disabled");
+ issued_sse_arg_error = true;
}
+ return NULL;
}
-#ifndef NO_PROFILE_COUNTERS
- if (flag_nop_mcount)
- error ("%<-mnop-mcount%> is not compatible with this target");
-#endif
- if (flag_nop_mcount && flag_pic)
- error ("%<-mnop-mcount%> is not implemented for %<-fPIC%>");
-
- /* Accept -msseregparm only if at least SSE support is enabled. */
- if (TARGET_SSEREGPARM_P (opts->x_target_flags)
- && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
- error (main_args_p
- ? G_("%<-msseregparm%> used without SSE enabled")
- : G_("%<target(\"sseregparm\")%> used without SSE enabled"));
-
- if (opts_set->x_ix86_fpmath)
- {
- if (opts->x_ix86_fpmath & FPMATH_SSE)
+ /* Likewise, error if the ABI requires us to return values in the
+ x87 registers and the user specified -mno-80387. */
+ if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
+ for (i = 0; i < n; i++)
+ if (regclass[i] == X86_64_X87_CLASS
+ || regclass[i] == X86_64_X87UP_CLASS
+ || regclass[i] == X86_64_COMPLEX_X87_CLASS)
{
- if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
- {
- if (TARGET_80387_P (opts->x_target_flags))
- {
- warning (0, "SSE instruction set disabled, using 387 arithmetics");
- opts->x_ix86_fpmath = FPMATH_387;
- }
- }
- else if ((opts->x_ix86_fpmath & FPMATH_387)
- && !TARGET_80387_P (opts->x_target_flags))
+ if (!issued_x87_ret_error)
{
- warning (0, "387 instruction set disabled, using SSE arithmetics");
- opts->x_ix86_fpmath = FPMATH_SSE;
+ error ("x87 register return with x87 disabled");
+ issued_x87_ret_error = true;
}
+ return NULL;
}
- }
- /* For all chips supporting SSE2, -mfpmath=sse performs better than
- fpmath=387. The second is however default at many targets since the
- extra 80bit precision of temporaries is considered to be part of ABI.
- Overwrite the default at least for -ffast-math.
- TODO: -mfpmath=both seems to produce same performing code with bit
- smaller binaries. It is however not clear if register allocation is
- ready for this setting.
- Also -mfpmath=387 is overall a lot more compact (bout 4-5%) than SSE
- codegen. We may switch to 387 with -ffast-math for size optimized
- functions. */
- else if (fast_math_flags_set_p (&global_options)
- && TARGET_SSE2_P (opts->x_ix86_isa_flags))
- opts->x_ix86_fpmath = FPMATH_SSE;
- else
- opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
- /* Use external vectorized library in vectorizing intrinsics. */
- if (opts_set->x_ix86_veclibabi_type)
- switch (opts->x_ix86_veclibabi_type)
+ /* First construct simple cases. Avoid SCmode, since we want to use
+ a single register to pass this type. */
+ if (n == 1 && mode != SCmode)
+ switch (regclass[0])
{
- case ix86_veclibabi_type_svml:
- ix86_veclib_handler = ix86_veclibabi_svml;
- break;
-
- case ix86_veclibabi_type_acml:
- ix86_veclib_handler = ix86_veclibabi_acml;
+ case X86_64_INTEGER_CLASS:
+ case X86_64_INTEGERSI_CLASS:
+ return gen_rtx_REG (mode, intreg[0]);
+ case X86_64_SSE_CLASS:
+ case X86_64_SSESF_CLASS:
+ case X86_64_SSEDF_CLASS:
+ if (mode != BLKmode)
+ return gen_reg_or_parallel (mode, orig_mode,
+ GET_SSE_REGNO (sse_regno));
break;
-
+ case X86_64_X87_CLASS:
+ case X86_64_COMPLEX_X87_CLASS:
+ return gen_rtx_REG (mode, FIRST_STACK_REG);
+ case X86_64_NO_CLASS:
+ /* Zero sized array, struct or class. */
+ return NULL;
default:
gcc_unreachable ();
}
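+ /* A value classified as one SSE eightbyte followed only by SSEUP
+ eightbytes fits in a single XMM, YMM or ZMM register. */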
+ if (n == 2
+ && regclass[0] == X86_64_SSE_CLASS
+ && regclass[1] == X86_64_SSEUP_CLASS
+ && mode != BLKmode)
+ return gen_reg_or_parallel (mode, orig_mode,
+ GET_SSE_REGNO (sse_regno));
+ if (n == 4
+ && regclass[0] == X86_64_SSE_CLASS
+ && regclass[1] == X86_64_SSEUP_CLASS
+ && regclass[2] == X86_64_SSEUP_CLASS
+ && regclass[3] == X86_64_SSEUP_CLASS
+ && mode != BLKmode)
+ return gen_reg_or_parallel (mode, orig_mode,
+ GET_SSE_REGNO (sse_regno));
+ if (n == 8
+ && regclass[0] == X86_64_SSE_CLASS
+ && regclass[1] == X86_64_SSEUP_CLASS
+ && regclass[2] == X86_64_SSEUP_CLASS
+ && regclass[3] == X86_64_SSEUP_CLASS
+ && regclass[4] == X86_64_SSEUP_CLASS
+ && regclass[5] == X86_64_SSEUP_CLASS
+ && regclass[6] == X86_64_SSEUP_CLASS
+ && regclass[7] == X86_64_SSEUP_CLASS
+ && mode != BLKmode)
+ return gen_reg_or_parallel (mode, orig_mode,
+ GET_SSE_REGNO (sse_regno));
+ if (n == 2
+ && regclass[0] == X86_64_X87_CLASS
+ && regclass[1] == X86_64_X87UP_CLASS)
+ return gen_rtx_REG (XFmode, FIRST_STACK_REG);
- if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
- && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
- opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
-
- /* If stack probes are required, the space used for large function
- arguments on the stack must also be probed, so enable
- -maccumulate-outgoing-args so this happens in the prologue. */
- if (TARGET_STACK_PROBE_P (opts->x_target_flags)
- && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
- {
- if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
- warning (0,
- main_args_p
- ? G_("stack probing requires %<-maccumulate-outgoing-args%> "
- "for correctness")
- : G_("stack probing requires "
- "%<target(\"accumulate-outgoing-args\")%> for "
- "correctness"));
- opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
- }
-
- /* Stack realignment without -maccumulate-outgoing-args requires %ebp,
- so enable -maccumulate-outgoing-args when %ebp is fixed. */
- if (fixed_regs[BP_REG]
- && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
- {
- if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
- warning (0,
- main_args_p
- ? G_("fixed ebp register requires "
- "%<-maccumulate-outgoing-args%>")
- : G_("fixed ebp register requires "
- "%<target(\"accumulate-outgoing-args\")%>"));
- opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
- }
-
- /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
- {
- char *p;
- ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
- p = strchr (internal_label_prefix, 'X');
- internal_label_prefix_len = p - internal_label_prefix;
- *p = '\0';
- }
-
- /* When scheduling description is not available, disable scheduler pass
- so it won't slow down the compilation and make x87 code slower. */
- if (!TARGET_SCHEDULE)
- opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
-
- maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
- ix86_tune_cost->simultaneous_prefetches,
- opts->x_param_values,
- opts_set->x_param_values);
- maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
- ix86_tune_cost->prefetch_block,
- opts->x_param_values,
- opts_set->x_param_values);
- maybe_set_param_value (PARAM_L1_CACHE_SIZE,
- ix86_tune_cost->l1_cache_size,
- opts->x_param_values,
- opts_set->x_param_values);
- maybe_set_param_value (PARAM_L2_CACHE_SIZE,
- ix86_tune_cost->l2_cache_size,
- opts->x_param_values,
- opts_set->x_param_values);
-
- /* Enable sw prefetching at -O3 for CPUS that prefetching is helpful. */
- if (opts->x_flag_prefetch_loop_arrays < 0
- && HAVE_prefetch
- && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
- && !opts->x_optimize_size
- && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
- opts->x_flag_prefetch_loop_arrays = 1;
-
- /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
- can be opts->x_optimized to ap = __builtin_next_arg (0). */
- if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
- targetm.expand_builtin_va_start = NULL;
-
- if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
- {
- ix86_gen_leave = gen_leave_rex64;
- if (Pmode == DImode)
- {
- ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
- ix86_gen_tls_local_dynamic_base_64
- = gen_tls_local_dynamic_base_64_di;
- }
- else
+ if (n == 2
+ && regclass[0] == X86_64_INTEGER_CLASS
+ && regclass[1] == X86_64_INTEGER_CLASS
+ && (mode == CDImode || mode == TImode || mode == BLKmode)
+ && intreg[0] + 1 == intreg[1])
+ {
+ if (mode == BLKmode)
{
- ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
- ix86_gen_tls_local_dynamic_base_64
- = gen_tls_local_dynamic_base_64_si;
+ /* Use TImode for BLKmode values in 2 integer registers. */
+ exp[0] = gen_rtx_EXPR_LIST (VOIDmode,
+ gen_rtx_REG (TImode, intreg[0]),
+ GEN_INT (0));
+ ret = gen_rtx_PARALLEL (mode, rtvec_alloc (1));
+ XVECEXP (ret, 0, 0) = exp[0];
+ return ret;
}
+ else
+ return gen_rtx_REG (mode, intreg[0]);
}
- else
- ix86_gen_leave = gen_leave;
-
- if (Pmode == DImode)
- {
- ix86_gen_add3 = gen_adddi3;
- ix86_gen_sub3 = gen_subdi3;
- ix86_gen_sub3_carry = gen_subdi3_carry;
- ix86_gen_one_cmpl2 = gen_one_cmpldi2;
- ix86_gen_andsp = gen_anddi3;
- ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
- ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
- ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
- ix86_gen_monitor = gen_sse3_monitor_di;
- ix86_gen_monitorx = gen_monitorx_di;
- ix86_gen_clzero = gen_clzero_di;
- }
- else
- {
- ix86_gen_add3 = gen_addsi3;
- ix86_gen_sub3 = gen_subsi3;
- ix86_gen_sub3_carry = gen_subsi3_carry;
- ix86_gen_one_cmpl2 = gen_one_cmplsi2;
- ix86_gen_andsp = gen_andsi3;
- ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
- ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
- ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
- ix86_gen_monitor = gen_sse3_monitor_si;
- ix86_gen_monitorx = gen_monitorx_si;
- ix86_gen_clzero = gen_clzero_si;
- }
-
-#ifdef USE_IX86_CLD
- /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
- if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
- opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
-#endif
- /* Set the default value for -mfentry. */
- if (!opts_set->x_flag_fentry)
- opts->x_flag_fentry = TARGET_SEH;
- else
+ /* Otherwise figure out the entries of the PARALLEL. */
+ for (i = 0; i < n; i++)
{
- if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic
- && opts->x_flag_fentry)
- sorry ("%<-mfentry%> isn%'t supported for 32-bit in combination "
- "with %<-fpic%>");
- else if (TARGET_SEH && !opts->x_flag_fentry)
- sorry ("%<-mno-fentry%> isn%'t compatible with SEH");
- }
-
- if (TARGET_SEH && TARGET_CALL_MS2SYSV_XLOGUES)
- sorry ("%<-mcall-ms2sysv-xlogues%> isn%'t currently supported with SEH");
-
- if (!(opts_set->x_target_flags & MASK_VZEROUPPER)
- && TARGET_EMIT_VZEROUPPER)
- opts->x_target_flags |= MASK_VZEROUPPER;
- if (!(opts_set->x_target_flags & MASK_STV))
- opts->x_target_flags |= MASK_STV;
- /* Disable STV if -mpreferred-stack-boundary={2,3} or
- -mincoming-stack-boundary={2,3} or -mstackrealign - the needed
- stack realignment will be extra cost the pass doesn't take into
- account and the pass can't realign the stack. */
- if (ix86_preferred_stack_boundary < 128
- || ix86_incoming_stack_boundary < 128
- || opts->x_ix86_force_align_arg_pointer)
- opts->x_target_flags &= ~MASK_STV;
- if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
- && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
- opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
- if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
- && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
- opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
-
- /* Enable 128-bit AVX instruction generation
- for the auto-vectorizer. */
- if (TARGET_AVX128_OPTIMAL
- && (opts_set->x_prefer_vector_width_type == PVW_NONE))
- opts->x_prefer_vector_width_type = PVW_AVX128;
-
- /* Use 256-bit AVX instruction generation
- in the auto-vectorizer. */
- if (ix86_tune_features[X86_TUNE_AVX256_OPTIMAL]
- && (opts_set->x_prefer_vector_width_type == PVW_NONE))
- opts->x_prefer_vector_width_type = PVW_AVX256;
-
- if (opts->x_ix86_recip_name)
- {
- char *p = ASTRDUP (opts->x_ix86_recip_name);
- char *q;
- unsigned int mask, i;
- bool invert;
-
- while ((q = strtok (p, ",")) != NULL)
- {
- p = NULL;
- if (*q == '!')
- {
- invert = true;
- q++;
- }
- else
- invert = false;
+ int pos;
- if (!strcmp (q, "default"))
- mask = RECIP_MASK_ALL;
- else
- {
- for (i = 0; i < ARRAY_SIZE (recip_options); i++)
- if (!strcmp (q, recip_options[i].string))
+ switch (regclass[i])
+ {
+ case X86_64_NO_CLASS:
+ break;
+ case X86_64_INTEGER_CLASS:
+ case X86_64_INTEGERSI_CLASS:
+ /* Merge TImodes on aligned occasions here too. */
+ if (i * 8 + 8 > bytes)
+ {
+ unsigned int tmpbits = (bytes - i * 8) * BITS_PER_UNIT;
+ if (!int_mode_for_size (tmpbits, 0).exists (&tmpmode))
+ /* There is no integer mode of exactly this many bits;
+ fall back to DImode. */
+ tmpmode = DImode;
+ }
+ else if (regclass[i] == X86_64_INTEGERSI_CLASS)
+ tmpmode = SImode;
+ else
+ tmpmode = DImode;
+ exp [nexps++]
+ = gen_rtx_EXPR_LIST (VOIDmode,
+ gen_rtx_REG (tmpmode, *intreg),
+ GEN_INT (i*8));
+ intreg++;
+ break;
+ case X86_64_SSESF_CLASS:
+ exp [nexps++]
+ = gen_rtx_EXPR_LIST (VOIDmode,
+ gen_rtx_REG (SFmode,
+ GET_SSE_REGNO (sse_regno)),
+ GEN_INT (i*8));
+ sse_regno++;
+ break;
+ case X86_64_SSEDF_CLASS:
+ exp [nexps++]
+ = gen_rtx_EXPR_LIST (VOIDmode,
+ gen_rtx_REG (DFmode,
+ GET_SSE_REGNO (sse_regno)),
+ GEN_INT (i*8));
+ sse_regno++;
+ break;
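+ /* An SSE eightbyte followed by SSEUP eightbytes describes one wide
+ vector register; merge them into a single TImode, OImode or
+ XImode chunk. */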
+ case X86_64_SSE_CLASS:
+ pos = i;
+ switch (n)
+ {
+ case 1:
+ tmpmode = DImode;
+ break;
+ case 2:
+ if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
{
- mask = recip_options[i].mask;
- break;
+ tmpmode = TImode;
+ i++;
}
-
- if (i == ARRAY_SIZE (recip_options))
- {
- error ("unknown option for %<-mrecip=%s%>", q);
- invert = false;
- mask = RECIP_MASK_NONE;
- }
- }
-
- opts->x_recip_mask_explicit |= mask;
- if (invert)
- opts->x_recip_mask &= ~mask;
- else
- opts->x_recip_mask |= mask;
+ else
+ tmpmode = DImode;
+ break;
+ case 4:
+ gcc_assert (i == 0
+ && regclass[1] == X86_64_SSEUP_CLASS
+ && regclass[2] == X86_64_SSEUP_CLASS
+ && regclass[3] == X86_64_SSEUP_CLASS);
+ tmpmode = OImode;
+ i += 3;
+ break;
+ case 8:
+ gcc_assert (i == 0
+ && regclass[1] == X86_64_SSEUP_CLASS
+ && regclass[2] == X86_64_SSEUP_CLASS
+ && regclass[3] == X86_64_SSEUP_CLASS
+ && regclass[4] == X86_64_SSEUP_CLASS
+ && regclass[5] == X86_64_SSEUP_CLASS
+ && regclass[6] == X86_64_SSEUP_CLASS
+ && regclass[7] == X86_64_SSEUP_CLASS);
+ tmpmode = XImode;
+ i += 7;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ exp [nexps++]
+ = gen_rtx_EXPR_LIST (VOIDmode,
+ gen_rtx_REG (tmpmode,
+ GET_SSE_REGNO (sse_regno)),
+ GEN_INT (pos*8));
+ sse_regno++;
+ break;
+ default:
+ gcc_unreachable ();
}
}
- if (TARGET_RECIP_P (opts->x_target_flags))
- opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
- else if (opts_set->x_target_flags & MASK_RECIP)
- opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
+ /* Empty aligned struct, union or class. */
+ if (nexps == 0)
+ return NULL;
- /* Default long double to 64-bit for 32-bit Bionic and to __float128
- for 64-bit Bionic. Also default long double to 64-bit for Intel
- MCU psABI. */
- if ((TARGET_HAS_BIONIC || TARGET_IAMCU)
- && !(opts_set->x_target_flags
- & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
- opts->x_target_flags |= (TARGET_64BIT
- ? MASK_LONG_DOUBLE_128
- : MASK_LONG_DOUBLE_64);
+ ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
+ for (i = 0; i < nexps; i++)
+ XVECEXP (ret, 0, i) = exp [i];
+ return ret;
+}
- /* Only one of them can be active. */
- gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
- || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
+/* Update the data in CUM to advance over an argument of mode MODE
+ and data type TYPE. (TYPE is null for libcalls where that information
+ may not be available.)
- /* Handle stack protector */
- if (!opts_set->x_ix86_stack_protector_guard)
+ Return the number of integer registers advanced over. */
+
+static int
+function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
+ const_tree type, HOST_WIDE_INT bytes,
+ HOST_WIDE_INT words)
+{
+ int res = 0;
+ bool error_p = false;
+
+ if (TARGET_IAMCU)
{
-#ifdef TARGET_THREAD_SSP_OFFSET
- if (!TARGET_HAS_BIONIC)
- opts->x_ix86_stack_protector_guard = SSP_TLS;
- else
-#endif
- opts->x_ix86_stack_protector_guard = SSP_GLOBAL;
+ /* Intel MCU psABI passes scalars and aggregates no larger than 8
+ bytes in registers. */
+ if (!VECTOR_MODE_P (mode) && bytes <= 8)
+ goto pass_in_reg;
+ return res;
}
- if (opts_set->x_ix86_stack_protector_guard_offset_str)
+ switch (mode)
{
- char *endp;
- const char *str = opts->x_ix86_stack_protector_guard_offset_str;
+ default:
+ break;
- errno = 0;
- int64_t offset;
+ case E_BLKmode:
+ if (bytes < 0)
+ break;
+ /* FALLTHRU */
-#if defined(INT64_T_IS_LONG)
- offset = strtol (str, &endp, 0);
-#else
- offset = strtoll (str, &endp, 0);
-#endif
+ case E_DImode:
+ case E_SImode:
+ case E_HImode:
+ case E_QImode:
+pass_in_reg:
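+ /* Consume integer registers word by word; once they run out, record
+ that no argument registers remain. */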
+ cum->words += words;
+ cum->nregs -= words;
+ cum->regno += words;
+ if (cum->nregs >= 0)
+ res = words;
+ if (cum->nregs <= 0)
+ {
+ cum->nregs = 0;
+ cfun->machine->arg_reg_available = false;
+ cum->regno = 0;
+ }
+ break;
- if (!*str || *endp || errno)
- error ("%qs is not a valid number "
- "in %<-mstack-protector-guard-offset=%>", str);
+ case E_OImode:
+ /* OImode shouldn't be used directly. */
+ gcc_unreachable ();
- if (!IN_RANGE (offset, HOST_WIDE_INT_C (-0x80000000),
- HOST_WIDE_INT_C (0x7fffffff)))
- error ("%qs is not a valid offset "
- "in %<-mstack-protector-guard-offset=%>", str);
+ case E_DFmode:
+ if (cum->float_in_sse == -1)
+ error_p = true;
+ if (cum->float_in_sse < 2)
+ break;
+ /* FALLTHRU */
+ case E_SFmode:
+ if (cum->float_in_sse == -1)
+ error_p = true;
+ if (cum->float_in_sse < 1)
+ break;
+ /* FALLTHRU */
- opts->x_ix86_stack_protector_guard_offset = offset;
- }
-#ifdef TARGET_THREAD_SSP_OFFSET
- else
- opts->x_ix86_stack_protector_guard_offset = TARGET_THREAD_SSP_OFFSET;
-#endif
+ case E_V8SFmode:
+ case E_V8SImode:
+ case E_V64QImode:
+ case E_V32HImode:
+ case E_V16SImode:
+ case E_V8DImode:
+ case E_V16SFmode:
+ case E_V8DFmode:
+ case E_V32QImode:
+ case E_V16HImode:
+ case E_V4DFmode:
+ case E_V4DImode:
+ case E_TImode:
+ case E_V16QImode:
+ case E_V8HImode:
+ case E_V4SImode:
+ case E_V2DImode:
+ case E_V4SFmode:
+ case E_V2DFmode:
+ if (!type || !AGGREGATE_TYPE_P (type))
+ {
+ cum->sse_words += words;
+ cum->sse_nregs -= 1;
+ cum->sse_regno += 1;
+ if (cum->sse_nregs <= 0)
+ {
+ cum->sse_nregs = 0;
+ cum->sse_regno = 0;
+ }
+ }
+ break;
- if (opts_set->x_ix86_stack_protector_guard_reg_str)
+ case E_V8QImode:
+ case E_V4HImode:
+ case E_V2SImode:
+ case E_V2SFmode:
+ case E_V1TImode:
+ case E_V1DImode:
+ if (!type || !AGGREGATE_TYPE_P (type))
+ {
+ cum->mmx_words += words;
+ cum->mmx_nregs -= 1;
+ cum->mmx_regno += 1;
+ if (cum->mmx_nregs <= 0)
+ {
+ cum->mmx_nregs = 0;
+ cum->mmx_regno = 0;
+ }
+ }
+ break;
+ }
+ if (error_p)
{
- const char *str = opts->x_ix86_stack_protector_guard_reg_str;
- addr_space_t seg = ADDR_SPACE_GENERIC;
+ cum->float_in_sse = 0;
+ error ("calling %qD with SSE calling convention without "
+ "SSE/SSE2 enabled", cum->decl);
+ sorry ("this is a GCC bug that can be worked around by adding "
+ "attribute used to function called");
+ }
- /* Discard optional register prefix. */
- if (str[0] == '%')
- str++;
+ return res;
+}
- if (strlen (str) == 2 && str[1] == 's')
- {
- if (str[0] == 'f')
- seg = ADDR_SPACE_SEG_FS;
- else if (str[0] == 'g')
- seg = ADDR_SPACE_SEG_GS;
- }
+static int
+function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
+ const_tree type, HOST_WIDE_INT words, bool named)
+{
+ int int_nregs, sse_nregs;
- if (seg == ADDR_SPACE_GENERIC)
- error ("%qs is not a valid base register "
- "in %<-mstack-protector-guard-reg=%>",
- opts->x_ix86_stack_protector_guard_reg_str);
+ /* Unnamed 512-bit and 256-bit vector mode parameters are passed on the stack. */
+ if (!named && (VALID_AVX512F_REG_MODE (mode)
+ || VALID_AVX256_REG_MODE (mode)))
+ return 0;
- opts->x_ix86_stack_protector_guard_reg = seg;
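+ /* If the whole argument fits in the remaining integer and SSE
+ registers, consume them; otherwise it goes on the stack, aligned
+ to its argument boundary. */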
+ if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
+ && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
+ {
+ cum->nregs -= int_nregs;
+ cum->sse_nregs -= sse_nregs;
+ cum->regno += int_nregs;
+ cum->sse_regno += sse_nregs;
+ return int_nregs;
}
else
{
- opts->x_ix86_stack_protector_guard_reg = DEFAULT_TLS_SEG_REG;
-
- /* The kernel uses a different segment register for performance
- reasons; a system call would not have to trash the userspace
- segment register, which would be expensive. */
- if (opts->x_ix86_cmodel == CM_KERNEL)
- opts->x_ix86_stack_protector_guard_reg = ADDR_SPACE_SEG_GS;
+ int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
+ cum->words = ROUND_UP (cum->words, align);
+ cum->words += words;
+ return 0;
}
+}
- /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
- if (opts->x_ix86_tune_memcpy_strategy)
- {
- char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
- ix86_parse_stringop_strategy_string (str, false);
- free (str);
- }
+static int
+function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
+ HOST_WIDE_INT words)
+{
+ /* Anything else should have been passed indirectly (by reference). */
+ gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
- if (opts->x_ix86_tune_memset_strategy)
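+ /* The MS ABI gives each of the first four arguments its own register
+ slot regardless of type, so advancing consumes at most one
+ integer register. */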
+ cum->words += words;
+ if (cum->nregs > 0)
{
- char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
- ix86_parse_stringop_strategy_string (str, true);
- free (str);
+ cum->nregs -= 1;
+ cum->regno += 1;
+ return 1;
}
+ return 0;
+}
- /* Save the initial options in case the user does function specific
- options. */
- if (main_args_p)
- target_option_default_node = target_option_current_node
- = build_target_option_node (opts);
+/* Update the data in CUM to advance over an argument of mode MODE and
+ data type TYPE. (TYPE is null for libcalls where that information
+ may not be available.) */
- if (opts->x_flag_cf_protection != CF_NONE)
- opts->x_flag_cf_protection
- = (cf_protection_level) (opts->x_flag_cf_protection | CF_SET);
+static void
+ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
+ const_tree type, bool named)
+{
+ CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
+ HOST_WIDE_INT bytes, words;
+ int nregs;
- if (ix86_tune_features [X86_TUNE_AVOID_128FMA_CHAINS])
- maybe_set_param_value (PARAM_AVOID_FMA_MAX_BITS, 128,
- opts->x_param_values,
- opts_set->x_param_values);
+ /* The argument of an interrupt handler is a special case and is
+ handled in ix86_function_arg. */
+ if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
+ return;
- /* PR86952: jump table usage with retpolines is slow.
- The PR provides some numbers about the slowness. */
- if (ix86_indirect_branch != indirect_branch_keep
- && !opts_set->x_flag_jump_tables)
- opts->x_flag_jump_tables = 0;
+ if (mode == BLKmode)
+ bytes = int_size_in_bytes (type);
+ else
+ bytes = GET_MODE_SIZE (mode);
+ words = CEIL (bytes, UNITS_PER_WORD);
- return true;
-}
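+ /* Re-derive the natural mode of the type so vector and aggregate
+ arguments are classified by the mode actually used to pass them. */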
+ if (type)
+ mode = type_natural_mode (type, NULL, false);
-/* Implement the TARGET_OPTION_OVERRIDE hook. */
+ if (TARGET_64BIT)
+ {
+ enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
-static void
-ix86_option_override (void)
-{
- ix86_option_override_internal (true, &global_options, &global_options_set);
-}
+ if (call_abi == MS_ABI)
+ nregs = function_arg_advance_ms_64 (cum, bytes, words);
+ else
+ nregs = function_arg_advance_64 (cum, mode, type, words, named);
+ }
+ else
+ nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
-/* Implement the TARGET_OFFLOAD_OPTIONS hook. */
-static char *
-ix86_offload_options (void)
-{
- if (TARGET_LP64)
- return xstrdup ("-foffload-abi=lp64");
- return xstrdup ("-foffload-abi=ilp32");
+ if (!nregs)
+ {
+ /* Track whether there are outgoing arguments on the stack. */
+ if (cum->caller)
+ cfun->machine->outgoing_args_on_stack = true;
+ }
}
-/* Update register usage after having seen the compiler flags. */
+/* Define where to put the arguments to a function.
+ Value is zero to push the argument on the stack,
+ or a hard register in which to store the argument.
-static void
-ix86_conditional_register_usage (void)
+ MODE is the argument's machine mode.
+ TYPE is the data type of the argument (as a tree).
+ This is null for libcalls where that information may
+ not be available.
+ CUM is a variable of type CUMULATIVE_ARGS which gives info about
+ the preceding args and about the function being called.
+ NAMED is nonzero if this argument is a named parameter
+ (otherwise it is an extra parameter matching an ellipsis). */
+
+static rtx
+function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
+ machine_mode orig_mode, const_tree type,
+ HOST_WIDE_INT bytes, HOST_WIDE_INT words)
{
- int i, c_mask;
+ bool error_p = false;
- /* If there are no caller-saved registers, preserve all registers.
- except fixed_regs and registers used for function return value
- since aggregate_value_p checks call_used_regs[regno] on return
- value. */
- if (cfun && cfun->machine->no_caller_saved_registers)
- for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
- if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
- call_used_regs[i] = 0;
+ /* Avoid the AL settings for the Unix64 ABI. */
+ if (mode == VOIDmode)
+ return constm1_rtx;
- /* For 32-bit targets, squash the REX registers. */
- if (! TARGET_64BIT)
+ if (TARGET_IAMCU)
{
- for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
- fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
- for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
- fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
- for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
- fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
+ /* Intel MCU psABI passes scalars and aggregates no larger than 8
+ bytes in registers. */
+ if (!VECTOR_MODE_P (mode) && bytes <= 8)
+ goto pass_in_reg;
+ return NULL_RTX;
}
- /* See the definition of CALL_USED_REGISTERS in i386.h. */
- c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
-
- CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
-
- for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
+ switch (mode)
{
- /* Set/reset conditionally defined registers from
- CALL_USED_REGISTERS initializer. */
- if (call_used_regs[i] > 1)
- call_used_regs[i] = !!(call_used_regs[i] & c_mask);
-
- /* Calculate registers of CLOBBERED_REGS register set
- as call used registers from GENERAL_REGS register set. */
- if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
- && call_used_regs[i])
- SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
- }
-
- /* If MMX is disabled, squash the registers. */
- if (! TARGET_MMX)
- for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
- if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
- fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
+ default:
+ break;
- /* If SSE is disabled, squash the registers. */
- if (! TARGET_SSE)
- for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
- if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
- fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
+ case E_BLKmode:
+ if (bytes < 0)
+ break;
+ /* FALLTHRU */
+ case E_DImode:
+ case E_SImode:
+ case E_HImode:
+ case E_QImode:
+pass_in_reg:
+ if (words <= cum->nregs)
+ {
+ int regno = cum->regno;
- /* If the FPU is disabled, squash the registers. */
- if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
- for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
- if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
- fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
+ /* Fastcall allocates the first two DWORD (SImode) or
+ smaller arguments to ECX and EDX if they are not
+ aggregate types. */
+ if (cum->fastcall)
+ {
+ if (mode == BLKmode
+ || mode == DImode
+ || (type && AGGREGATE_TYPE_P (type)))
+ break;
- /* If AVX512F is disabled, squash the registers. */
- if (! TARGET_AVX512F)
- {
- for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
- fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
+ /* ECX, not EAX, is the first allocated register. */
+ if (regno == AX_REG)
+ regno = CX_REG;
+ }
+ return gen_rtx_REG (mode, regno);
+ }
+ break;
- for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
- fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
- }
-}
+ case E_DFmode:
+ if (cum->float_in_sse == -1)
+ error_p = true;
+ if (cum->float_in_sse < 2)
+ break;
+ /* FALLTHRU */
+ case E_SFmode:
+ if (cum->float_in_sse == -1)
+ error_p = true;
+ if (cum->float_in_sse < 1)
+ break;
+ /* FALLTHRU */
+ case E_TImode:
+ /* In 32-bit mode, we pass TImode in xmm registers. */
+ case E_V16QImode:
+ case E_V8HImode:
+ case E_V4SImode:
+ case E_V2DImode:
+ case E_V4SFmode:
+ case E_V2DFmode:
+ if (!type || !AGGREGATE_TYPE_P (type))
+ {
+ if (cum->sse_nregs)
+ return gen_reg_or_parallel (mode, orig_mode,
+ cum->sse_regno + FIRST_SSE_REG);
+ }
+ break;
-/* Canonicalize a comparison from one we don't have to one we do have. */
+ case E_OImode:
+ case E_XImode:
+ /* OImode and XImode shouldn't be used directly. */
+ gcc_unreachable ();
-static void
-ix86_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
- bool op0_preserve_value)
-{
- /* The order of operands in x87 ficom compare is forced by combine in
- simplify_comparison () function. Float operator is treated as RTX_OBJ
- with a precedence over other operators and is always put in the first
- place. Swap condition and operands to match ficom instruction. */
- if (!op0_preserve_value
- && GET_CODE (*op0) == FLOAT && MEM_P (XEXP (*op0, 0)) && REG_P (*op1))
- {
- enum rtx_code scode = swap_condition ((enum rtx_code) *code);
+ case E_V64QImode:
+ case E_V32HImode:
+ case E_V16SImode:
+ case E_V8DImode:
+ case E_V16SFmode:
+ case E_V8DFmode:
+ case E_V8SFmode:
+ case E_V8SImode:
+ case E_V32QImode:
+ case E_V16HImode:
+ case E_V4DFmode:
+ case E_V4DImode:
+ if (!type || !AGGREGATE_TYPE_P (type))
+ {
+ if (cum->sse_nregs)
+ return gen_reg_or_parallel (mode, orig_mode,
+ cum->sse_regno + FIRST_SSE_REG);
+ }
+ break;
- /* We are called only for compares that are split to SAHF instruction.
- Ensure that we have setcc/jcc insn for the swapped condition. */
- if (ix86_fp_compare_code_to_integer (scode) != UNKNOWN)
+ case E_V8QImode:
+ case E_V4HImode:
+ case E_V2SImode:
+ case E_V2SFmode:
+ case E_V1TImode:
+ case E_V1DImode:
+ if (!type || !AGGREGATE_TYPE_P (type))
{
- std::swap (*op0, *op1);
- *code = (int) scode;
+ if (cum->mmx_nregs)
+ return gen_reg_or_parallel (mode, orig_mode,
+ cum->mmx_regno + FIRST_MMX_REG);
}
+ break;
}
-}
-\f
-/* Save the current options */
-
-static void
-ix86_function_specific_save (struct cl_target_option *ptr,
- struct gcc_options *opts)
-{
- ptr->arch = ix86_arch;
- ptr->schedule = ix86_schedule;
- ptr->prefetch_sse = x86_prefetch_sse;
- ptr->tune = ix86_tune;
- ptr->branch_cost = ix86_branch_cost;
- ptr->tune_defaulted = ix86_tune_defaulted;
- ptr->arch_specified = ix86_arch_specified;
- ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
- ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit;
- ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
- ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
- ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
- ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
- ptr->x_ix86_abi = opts->x_ix86_abi;
- ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
- ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
- ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
- ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
- ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
- ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
- ptr->x_ix86_pmode = opts->x_ix86_pmode;
- ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
- ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
- ptr->x_ix86_regparm = opts->x_ix86_regparm;
- ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
- ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
- ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
- ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
- ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
- ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
- ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
- ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
- ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
- ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
-
- /* The fields are char but the variables are not; make sure the
- values fit in the fields. */
- gcc_assert (ptr->arch == ix86_arch);
- gcc_assert (ptr->schedule == ix86_schedule);
- gcc_assert (ptr->tune == ix86_tune);
- gcc_assert (ptr->branch_cost == ix86_branch_cost);
-}
-
-/* Restore the current options */
-
-static void
-ix86_function_specific_restore (struct gcc_options *opts,
- struct cl_target_option *ptr)
-{
- enum processor_type old_tune = ix86_tune;
- enum processor_type old_arch = ix86_arch;
- unsigned HOST_WIDE_INT ix86_arch_mask;
- int i;
-
- /* We don't change -fPIC. */
- opts->x_flag_pic = flag_pic;
-
- ix86_arch = (enum processor_type) ptr->arch;
- ix86_schedule = (enum attr_cpu) ptr->schedule;
- ix86_tune = (enum processor_type) ptr->tune;
- x86_prefetch_sse = ptr->prefetch_sse;
- opts->x_ix86_branch_cost = ptr->branch_cost;
- ix86_tune_defaulted = ptr->tune_defaulted;
- ix86_arch_specified = ptr->arch_specified;
- opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
- opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit;
- opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
- opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
- opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
- opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
- opts->x_ix86_abi = ptr->x_ix86_abi;
- opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
- opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
- opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
- opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
- opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
- opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
- opts->x_ix86_pmode = ptr->x_ix86_pmode;
- opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
- opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
- opts->x_ix86_regparm = ptr->x_ix86_regparm;
- opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
- opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
- opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
- opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
- opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
- opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
- opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
- opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
- opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
- opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
- ix86_tune_cost = processor_cost_table[ix86_tune];
- /* TODO: ix86_cost should be chosen at instruction or function granuality
- so for cold code we use size_cost even in !optimize_size compilation. */
- if (opts->x_optimize_size)
- ix86_cost = &ix86_size_cost;
- else
- ix86_cost = ix86_tune_cost;
-
- /* Recreate the arch feature tests if the arch changed */
- if (old_arch != ix86_arch)
+ if (error_p)
{
- ix86_arch_mask = HOST_WIDE_INT_1U << ix86_arch;
- for (i = 0; i < X86_ARCH_LAST; ++i)
- ix86_arch_features[i]
- = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
+ cum->float_in_sse = 0;
+ error ("calling %qD with SSE calling convention without "
+ "SSE/SSE2 enabled", cum->decl);
+ sorry ("this is a GCC bug that can be worked around by adding "
+ "attribute used to function called");
}
- /* Recreate the tune optimization tests */
- if (old_tune != ix86_tune)
- set_ix86_tune_features (ix86_tune, false);
+ return NULL_RTX;
}
-/* Adjust target options after streaming them in. This is mainly about
- reconciling them with global options. */
-
-static void
-ix86_function_specific_post_stream_in (struct cl_target_option *ptr)
+static rtx
+function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
+ machine_mode orig_mode, const_tree type, bool named)
{
- /* flag_pic is a global option, but ix86_cmodel is target saved option
- partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel
- for PIC, or error out. */
- if (flag_pic)
- switch (ptr->x_ix86_cmodel)
- {
- case CM_SMALL:
- ptr->x_ix86_cmodel = CM_SMALL_PIC;
- break;
-
- case CM_MEDIUM:
- ptr->x_ix86_cmodel = CM_MEDIUM_PIC;
- break;
-
- case CM_LARGE:
- ptr->x_ix86_cmodel = CM_LARGE_PIC;
- break;
-
- case CM_KERNEL:
- error ("code model %s does not support PIC mode", "kernel");
- break;
-
- default:
- break;
- }
- else
- switch (ptr->x_ix86_cmodel)
- {
- case CM_SMALL_PIC:
- ptr->x_ix86_cmodel = CM_SMALL;
- break;
+ /* Handle a hidden AL argument containing the number of registers
+ for varargs x86-64 functions. */
+ if (mode == VOIDmode)
+ return GEN_INT (cum->maybe_vaarg
+ ? (cum->sse_nregs < 0
+ ? X86_64_SSE_REGPARM_MAX
+ : cum->sse_regno)
+ : -1);
- case CM_MEDIUM_PIC:
- ptr->x_ix86_cmodel = CM_MEDIUM;
- break;
+ switch (mode)
+ {
+ default:
+ break;
- case CM_LARGE_PIC:
- ptr->x_ix86_cmodel = CM_LARGE;
- break;
+ case E_V8SFmode:
+ case E_V8SImode:
+ case E_V32QImode:
+ case E_V16HImode:
+ case E_V4DFmode:
+ case E_V4DImode:
+ case E_V16SFmode:
+ case E_V16SImode:
+ case E_V64QImode:
+ case E_V32HImode:
+ case E_V8DFmode:
+ case E_V8DImode:
+ /* Unnamed 256-bit and 512-bit vector mode parameters are passed on
+ the stack. */
+ if (!named)
+ return NULL;
+ break;
+ }
- default:
- break;
- }
+ return construct_container (mode, orig_mode, type, 0, cum->nregs,
+ cum->sse_nregs,
+ &x86_64_int_parameter_registers [cum->regno],
+ cum->sse_regno);
}
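+
+/* For example, for a variadic SysV call such as printf ("%f", 1.0), the
+ hidden VOIDmode argument above yields the number of SSE argument
+ registers used (here 1); the caller loads it into %al before the call
+ so the callee knows how many vector registers it may have to save. */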
-/* Print the current options */
-
-static void
-ix86_function_specific_print (FILE *file, int indent,
- struct cl_target_option *ptr)
+static rtx
+function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
+ machine_mode orig_mode, bool named,
+ HOST_WIDE_INT bytes)
{
- char *target_string
- = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2,
- ptr->x_target_flags, ptr->x_ix86_target_flags,
- NULL, NULL, ptr->x_ix86_fpmath, false, true);
+ unsigned int regno;
- gcc_assert (ptr->arch < PROCESSOR_max);
- fprintf (file, "%*sarch = %d (%s)\n",
- indent, "",
- ptr->arch, processor_names[ptr->arch]);
+ /* We need to add a clobber for MS_ABI -> SYSV ABI calls in expand_call.
+ We use the value -2 to specify that the current function call uses
+ the MS ABI. */
+ if (mode == VOIDmode)
+ return GEN_INT (-2);
- gcc_assert (ptr->tune < PROCESSOR_max);
- fprintf (file, "%*stune = %d (%s)\n",
- indent, "",
- ptr->tune, processor_names[ptr->tune]);
+ /* If we've run out of registers, it goes on the stack. */
+ if (cum->nregs == 0)
+ return NULL_RTX;
- fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
+ regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
- if (target_string)
+ /* Only floating point modes are passed in anything but integer regs. */
+ if (TARGET_SSE && (mode == SFmode || mode == DFmode))
{
- fprintf (file, "%*s%s\n", indent, "", target_string);
- free (target_string);
- }
-}
-
-\f
-/* Inner function to process the attribute((target(...))), take an argument and
- set the current options from the argument. If we have a list, recursively go
- over the list. */
-
-static bool
-ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[],
- struct gcc_options *opts,
- struct gcc_options *opts_set,
- struct gcc_options *enum_opts_set,
- bool target_clone_attr)
-{
- char *next_optstr;
- bool ret = true;
-
-#define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
-#define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
-#define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
-#define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
-#define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
-
- enum ix86_opt_type
- {
- ix86_opt_unknown,
- ix86_opt_yes,
- ix86_opt_no,
- ix86_opt_str,
- ix86_opt_enum,
- ix86_opt_isa
- };
+ if (named)
+ regno = cum->regno + FIRST_SSE_REG;
+ else
+ {
+ rtx t1, t2;
- static const struct
- {
- const char *string;
- size_t len;
- enum ix86_opt_type type;
- int opt;
- int mask;
- } attrs[] = {
- /* isa options */
- IX86_ATTR_ISA ("pconfig", OPT_mpconfig),
- IX86_ATTR_ISA ("wbnoinvd", OPT_mwbnoinvd),
- IX86_ATTR_ISA ("sgx", OPT_msgx),
- IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps),
- IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw),
- IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq),
- IX86_ATTR_ISA ("avx512vbmi2", OPT_mavx512vbmi2),
- IX86_ATTR_ISA ("avx512vnni", OPT_mavx512vnni),
- IX86_ATTR_ISA ("avx512bitalg", OPT_mavx512bitalg),
-
- IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
- IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
- IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl),
- IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw),
- IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq),
- IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
- IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
- IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
- IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
- IX86_ATTR_ISA ("avx2", OPT_mavx2),
- IX86_ATTR_ISA ("fma", OPT_mfma),
- IX86_ATTR_ISA ("xop", OPT_mxop),
- IX86_ATTR_ISA ("fma4", OPT_mfma4),
- IX86_ATTR_ISA ("f16c", OPT_mf16c),
- IX86_ATTR_ISA ("avx", OPT_mavx),
- IX86_ATTR_ISA ("sse4", OPT_msse4),
- IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
- IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
- IX86_ATTR_ISA ("sse4a", OPT_msse4a),
- IX86_ATTR_ISA ("ssse3", OPT_mssse3),
- IX86_ATTR_ISA ("sse3", OPT_msse3),
- IX86_ATTR_ISA ("aes", OPT_maes),
- IX86_ATTR_ISA ("sha", OPT_msha),
- IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
- IX86_ATTR_ISA ("sse2", OPT_msse2),
- IX86_ATTR_ISA ("sse", OPT_msse),
- IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa),
- IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
- IX86_ATTR_ISA ("mmx", OPT_mmmx),
- IX86_ATTR_ISA ("rtm", OPT_mrtm),
- IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
- IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
- IX86_ATTR_ISA ("adx", OPT_madx),
- IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
- IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt),
- IX86_ATTR_ISA ("xsaves", OPT_mxsaves),
- IX86_ATTR_ISA ("xsavec", OPT_mxsavec),
- IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
- IX86_ATTR_ISA ("xsave", OPT_mxsave),
- IX86_ATTR_ISA ("abm", OPT_mabm),
- IX86_ATTR_ISA ("bmi", OPT_mbmi),
- IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
- IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
- IX86_ATTR_ISA ("tbm", OPT_mtbm),
- IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
- IX86_ATTR_ISA ("cx16", OPT_mcx16),
- IX86_ATTR_ISA ("sahf", OPT_msahf),
- IX86_ATTR_ISA ("movbe", OPT_mmovbe),
- IX86_ATTR_ISA ("crc32", OPT_mcrc32),
- IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
- IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
- IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx),
- IX86_ATTR_ISA ("clzero", OPT_mclzero),
- IX86_ATTR_ISA ("pku", OPT_mpku),
- IX86_ATTR_ISA ("lwp", OPT_mlwp),
- IX86_ATTR_ISA ("hle", OPT_mhle),
- IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
- IX86_ATTR_ISA ("clwb", OPT_mclwb),
- IX86_ATTR_ISA ("rdpid", OPT_mrdpid),
- IX86_ATTR_ISA ("gfni", OPT_mgfni),
- IX86_ATTR_ISA ("shstk", OPT_mshstk),
- IX86_ATTR_ISA ("vaes", OPT_mvaes),
- IX86_ATTR_ISA ("vpclmulqdq", OPT_mvpclmulqdq),
- IX86_ATTR_ISA ("movdiri", OPT_mmovdiri),
- IX86_ATTR_ISA ("movdir64b", OPT_mmovdir64b),
- IX86_ATTR_ISA ("waitpkg", OPT_mwaitpkg),
- IX86_ATTR_ISA ("cldemote", OPT_mcldemote),
- IX86_ATTR_ISA ("ptwrite", OPT_mptwrite),
-
- /* enum options */
- IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
-
- /* string options */
- IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
- IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
-
- /* flag options */
- IX86_ATTR_YES ("cld",
- OPT_mcld,
- MASK_CLD),
-
- IX86_ATTR_NO ("fancy-math-387",
- OPT_mfancy_math_387,
- MASK_NO_FANCY_MATH_387),
-
- IX86_ATTR_YES ("ieee-fp",
- OPT_mieee_fp,
- MASK_IEEE_FP),
-
- IX86_ATTR_YES ("inline-all-stringops",
- OPT_minline_all_stringops,
- MASK_INLINE_ALL_STRINGOPS),
-
- IX86_ATTR_YES ("inline-stringops-dynamically",
- OPT_minline_stringops_dynamically,
- MASK_INLINE_STRINGOPS_DYNAMICALLY),
-
- IX86_ATTR_NO ("align-stringops",
- OPT_mno_align_stringops,
- MASK_NO_ALIGN_STRINGOPS),
-
- IX86_ATTR_YES ("recip",
- OPT_mrecip,
- MASK_RECIP),
- };
-
- location_t loc
- = fndecl == NULL ? UNKNOWN_LOCATION : DECL_SOURCE_LOCATION (fndecl);
- const char *attr_name = target_clone_attr ? "target_clone" : "target";
-
- /* If this is a list, recurse to get the options. */
- if (TREE_CODE (args) == TREE_LIST)
- {
- bool ret = true;
-
- for (; args; args = TREE_CHAIN (args))
- if (TREE_VALUE (args)
- && !ix86_valid_target_attribute_inner_p (fndecl, TREE_VALUE (args),
- p_strings, opts, opts_set,
- enum_opts_set,
- target_clone_attr))
- ret = false;
-
- return ret;
+ /* Unnamed floating-point parameters are passed in both the
+ SSE and integer registers. */
+ t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
+ t2 = gen_rtx_REG (mode, regno);
+ t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
+ t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
+ return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
+ }
}
-
- else if (TREE_CODE (args) != STRING_CST)
+ /* Handle aggregate types passed in registers. */
+ if (orig_mode == BLKmode)
{
- error_at (loc, "attribute %qs argument is not a string", attr_name);
- return false;
+ if (bytes > 0 && bytes <= 8)
+ mode = (bytes > 4 ? DImode : SImode);
+ if (mode == BLKmode)
+ mode = DImode;
}
- /* Handle multiple arguments separated by commas. */
- next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
-
- while (next_optstr && *next_optstr != '\0')
- {
- char *p = next_optstr;
- char *orig_p = p;
- char *comma = strchr (next_optstr, ',');
- size_t len, opt_len;
- int opt;
- bool opt_set_p;
- char ch;
- unsigned i;
- enum ix86_opt_type type = ix86_opt_unknown;
- int mask = 0;
+ return gen_reg_or_parallel (mode, orig_mode, regno);
+}
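+
+/* For example, under the MS x64 convention a named double passed as the
+ second argument goes in %xmm1 only, while the same argument in the
+ variadic part of a call is passed in both %xmm1 and %rdx (the PARALLEL
+ built above), since the callee may read it from either register file. */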
- if (comma)
- {
- *comma = '\0';
- len = comma - next_optstr;
- next_optstr = comma + 1;
- }
- else
- {
- len = strlen (p);
- next_optstr = NULL;
- }
+/* Return where to put the arguments to a function.
+ Return zero to push the argument on the stack, or a hard register in
+ which to store the argument.
- /* Recognize no-xxx. */
- if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
- {
- opt_set_p = false;
- p += 3;
- len -= 3;
- }
- else
- opt_set_p = true;
-
- /* Find the option. */
- ch = *p;
- opt = N_OPTS;
- for (i = 0; i < ARRAY_SIZE (attrs); i++)
- {
- type = attrs[i].type;
- opt_len = attrs[i].len;
- if (ch == attrs[i].string[0]
- && ((type != ix86_opt_str && type != ix86_opt_enum)
- ? len == opt_len
- : len > opt_len)
- && memcmp (p, attrs[i].string, opt_len) == 0)
- {
- opt = attrs[i].opt;
- mask = attrs[i].mask;
- break;
- }
- }
+ MODE is the argument's machine mode. TYPE is the data type of the
+ argument. It is null for libcalls where that information may not be
+ available. CUM gives information about the preceding args and about
+ the function being called. NAMED is nonzero if this argument is a
+ named parameter (otherwise it is an extra parameter matching an
+ ellipsis). */
- /* Process the option. */
- if (opt == N_OPTS)
- {
- error_at (loc, "attribute %qs argument %qs is unknown",
- orig_p, attr_name);
- ret = false;
- }
+static rtx
+ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
+ const_tree type, bool named)
+{
+ CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
+ machine_mode mode = omode;
+ HOST_WIDE_INT bytes, words;
+ rtx arg;
- else if (type == ix86_opt_isa)
+ if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
+ {
+ gcc_assert (type != NULL_TREE);
+ if (POINTER_TYPE_P (type))
{
- struct cl_decoded_option decoded;
-
- generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
- ix86_handle_option (opts, opts_set,
- &decoded, input_location);
+ /* This is the pointer argument. */
+ gcc_assert (TYPE_MODE (type) == Pmode);
+ /* It is at -WORD(AP) in the current frame in interrupt and
+ exception handlers. */
+ arg = plus_constant (Pmode, arg_pointer_rtx, -UNITS_PER_WORD);
}
-
- else if (type == ix86_opt_yes || type == ix86_opt_no)
+ else
{
- if (type == ix86_opt_no)
- opt_set_p = !opt_set_p;
-
- if (opt_set_p)
- opts->x_target_flags |= mask;
- else
- opts->x_target_flags &= ~mask;
+ gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
+ && TREE_CODE (type) == INTEGER_TYPE
+ && TYPE_MODE (type) == word_mode);
+ /* The error code is the word-mode integer argument at
+ -2 * WORD(AP) in the current frame of the exception
+ handler. */
+ arg = gen_rtx_MEM (word_mode,
+ plus_constant (Pmode,
+ arg_pointer_rtx,
+ -2 * UNITS_PER_WORD));
}
+ return arg;
+ }
- else if (type == ix86_opt_str)
- {
- if (p_strings[opt])
- {
- error_at (loc, "attribute value %qs was already specified "
- "in %qs attribute", orig_p, attr_name);
- ret = false;
- }
- else
- p_strings[opt] = xstrdup (p + opt_len);
- }
+ if (mode == BLKmode)
+ bytes = int_size_in_bytes (type);
+ else
+ bytes = GET_MODE_SIZE (mode);
+ words = CEIL (bytes, UNITS_PER_WORD);
- else if (type == ix86_opt_enum)
- {
- bool arg_ok;
- int value;
+ /* To simplify the code below, represent vector types with a vector mode
+ even if MMX/SSE are not active. */
+ if (type && TREE_CODE (type) == VECTOR_TYPE)
+ mode = type_natural_mode (type, cum, false);
- arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
- if (arg_ok)
- set_option (opts, enum_opts_set, opt, value,
- p + opt_len, DK_UNSPECIFIED, input_location,
- global_dc);
- else
- {
- error_at (loc, "attribute value %qs is unknown in %qs attribute",
- orig_p, attr_name);
- ret = false;
- }
- }
+ if (TARGET_64BIT)
+ {
+ enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
+ if (call_abi == MS_ABI)
+ arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
else
- gcc_unreachable ();
+ arg = function_arg_64 (cum, mode, omode, type, named);
}
+ else
+ arg = function_arg_32 (cum, mode, omode, type, bytes, words);
- return ret;
-}
+ /* Track whether there are outgoing arguments on the stack. */
+ if (arg == NULL_RTX && cum->caller)
+ cfun->machine->outgoing_args_on_stack = true;
-/* Release allocated strings. */
-static void
-release_options_strings (char **option_strings)
-{
- /* Free up memory allocated to hold the strings */
- for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
- free (option_strings[i]);
+ return arg;
}
-/* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
+/* A C expression that indicates when an argument must be passed by
+ reference. If nonzero for an argument, a copy of that argument is
+ made in memory and a pointer to the argument is passed instead of
+ the argument itself. The pointer is passed in whatever way is
+ appropriate for passing a pointer to that type. */
-tree
-ix86_valid_target_attribute_tree (tree fndecl, tree args,
- struct gcc_options *opts,
- struct gcc_options *opts_set,
- bool target_clone_attr)
+static bool
+ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
+ const_tree type, bool)
{
- const char *orig_arch_string = opts->x_ix86_arch_string;
- const char *orig_tune_string = opts->x_ix86_tune_string;
- enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
- int orig_tune_defaulted = ix86_tune_defaulted;
- int orig_arch_specified = ix86_arch_specified;
- char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
- tree t = NULL_TREE;
- struct cl_target_option *def
- = TREE_TARGET_OPTION (target_option_default_node);
- struct gcc_options enum_opts_set;
-
- memset (&enum_opts_set, 0, sizeof (enum_opts_set));
-
- /* Process each of the options on the chain. */
- if (!ix86_valid_target_attribute_inner_p (fndecl, args, option_strings, opts,
- opts_set, &enum_opts_set,
- target_clone_attr))
- return error_mark_node;
+ CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
- /* If the changed options are different from the default, rerun
- ix86_option_override_internal, and then save the options away.
- The string options are attribute options, and will be undone
- when we copy the save structure. */
- if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
- || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2
- || opts->x_target_flags != def->x_target_flags
- || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
- || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
- || enum_opts_set.x_ix86_fpmath)
+ if (TARGET_64BIT)
{
- /* If we are using the default tune= or arch=, undo the string assigned,
- and use the default. */
- if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
- {
- opts->x_ix86_arch_string
- = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]);
-
- /* If arch= is set, clear all bits in x_ix86_isa_flags,
- except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. */
- opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT
- | OPTION_MASK_ABI_64
- | OPTION_MASK_ABI_X32
- | OPTION_MASK_CODE16);
- opts->x_ix86_isa_flags2 = 0;
- }
- else if (!orig_arch_specified)
- opts->x_ix86_arch_string = NULL;
-
- if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
- opts->x_ix86_tune_string
- = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
- else if (orig_tune_defaulted)
- opts->x_ix86_tune_string = NULL;
-
- /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
- if (enum_opts_set.x_ix86_fpmath)
- opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
+ enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
- /* Do any overrides, such as arch=xxx, or tune=xxx support. */
- bool r = ix86_option_override_internal (false, opts, opts_set);
- if (!r)
+ /* See Windows x64 Software Convention. */
+ if (call_abi == MS_ABI)
{
- release_options_strings (option_strings);
- return error_mark_node;
- }
-
- /* Add any builtin functions with the new isa if any. */
- ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2);
+ HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
- /* Save the current options unless we are validating options for
- #pragma. */
- t = build_target_option_node (opts);
+ if (type)
+ {
+ /* Arrays are passed by reference. */
+ if (TREE_CODE (type) == ARRAY_TYPE)
+ return true;
- opts->x_ix86_arch_string = orig_arch_string;
- opts->x_ix86_tune_string = orig_tune_string;
- opts_set->x_ix86_fpmath = orig_fpmath_set;
+ if (RECORD_OR_UNION_TYPE_P (type))
+ {
+ /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
+ are passed by reference. */
+ msize = int_size_in_bytes (type);
+ }
+ }
- release_options_strings (option_strings);
+ /* __m128 is passed by reference. */
+ return msize != 1 && msize != 2 && msize != 4 && msize != 8;
+ }
+ else if (type && int_size_in_bytes (type) == -1)
+ return true;
}
- return t;
+ return false;
}
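+
+/* For example, with the MS ABI an 8-byte struct is passed by value in a
+ register, while a 24-byte struct or any array type is passed by
+ reference (the caller passes a pointer to a copy); under the SysV
+ 64-bit ABI only variable-sized types take the by-reference path here. */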
-/* Hook to validate attribute((target("string"))). */
+/* Return true when TYPE should be 128-bit aligned for the 32-bit argument
+ passing ABI. XXX: This function is obsolete and is only used for
+ checking psABI compatibility with previous versions of GCC. */
static bool
-ix86_valid_target_attribute_p (tree fndecl,
- tree ARG_UNUSED (name),
- tree args,
- int flags)
-{
- struct gcc_options func_options;
- tree new_target, new_optimize;
- bool ret = true;
-
- /* attribute((target("default"))) does nothing, beyond
- affecting multi-versioning. */
- if (TREE_VALUE (args)
- && TREE_CODE (TREE_VALUE (args)) == STRING_CST
- && TREE_CHAIN (args) == NULL_TREE
- && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
+ix86_compat_aligned_value_p (const_tree type)
+{
+ machine_mode mode = TYPE_MODE (type);
+ if (((TARGET_SSE && SSE_REG_MODE_P (mode))
+ || mode == TDmode
+ || mode == TFmode
+ || mode == TCmode)
+ && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
return true;
+ if (TYPE_ALIGN (type) < 128)
+ return false;
- tree old_optimize = build_optimization_node (&global_options);
-
- /* Get the optimization options of the current function. */
- tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
-
- if (!func_optimize)
- func_optimize = old_optimize;
-
- /* Init func_options. */
- memset (&func_options, 0, sizeof (func_options));
- init_options_struct (&func_options, NULL);
- lang_hooks.init_options_struct (&func_options);
-
- cl_optimization_restore (&func_options,
- TREE_OPTIMIZATION (func_optimize));
+ if (AGGREGATE_TYPE_P (type))
+ {
+ /* Walk the aggregates recursively. */
+ switch (TREE_CODE (type))
+ {
+ case RECORD_TYPE:
+ case UNION_TYPE:
+ case QUAL_UNION_TYPE:
+ {
+ tree field;
- /* Initialize func_options to the default before its target options can
- be set. */
- cl_target_option_restore (&func_options,
- TREE_TARGET_OPTION (target_option_default_node));
+ /* Walk all the structure fields. */
+ for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
+ {
+ if (TREE_CODE (field) == FIELD_DECL
+ && ix86_compat_aligned_value_p (TREE_TYPE (field)))
+ return true;
+ }
+ break;
+ }
- /* FLAGS == 1 is used for target_clones attribute. */
- new_target
- = ix86_valid_target_attribute_tree (fndecl, args, &func_options,
- &global_options_set, flags == 1);
+ case ARRAY_TYPE:
+ /* Just for use if some language passes arrays by value. */
+ if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
+ return true;
+ break;
- new_optimize = build_optimization_node (&func_options);
+ default:
+ gcc_unreachable ();
+ }
+ }
+ return false;
+}
- if (new_target == error_mark_node)
- ret = false;
+/* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
+ XXX: This function is obsolete and is only used for checking psABI
+ compatibility with previous versions of GCC. */
- else if (fndecl && new_target)
+static unsigned int
+ix86_compat_function_arg_boundary (machine_mode mode,
+ const_tree type, unsigned int align)
+{
+ /* In 32-bit mode, only _Decimal128 and __float128 are aligned to their
+ natural boundaries. */
+ if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
{
- DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
+ /* The i386 ABI defines all arguments to be 4-byte aligned. We have to
+ make an exception for SSE modes since these require 128-bit
+ alignment.
- if (old_optimize != new_optimize)
- DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
+ The handling here differs from field_alignment. ICC aligns MMX
+ arguments to 4-byte boundaries, while structure fields are aligned
+ to 8-byte boundaries. */
+ if (!type)
+ {
+ if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
+ align = PARM_BOUNDARY;
+ }
+ else
+ {
+ if (!ix86_compat_aligned_value_p (type))
+ align = PARM_BOUNDARY;
+ }
}
-
- finalize_options_struct (&func_options);
-
- return ret;
+ if (align > BIGGEST_ALIGNMENT)
+ align = BIGGEST_ALIGNMENT;
+ return align;
}
-\f
-/* Hook to determine if one function can safely inline another. */
+/* Return true when TYPE should be 128-bit aligned for the 32-bit argument
+ passing ABI. */
static bool
-ix86_can_inline_p (tree caller, tree callee)
+ix86_contains_aligned_value_p (const_tree type)
{
- tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
- tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
+ machine_mode mode = TYPE_MODE (type);
- /* Changes of those flags can be tolerated for always inlines. Lets hope
- user knows what he is doing. */
- const unsigned HOST_WIDE_INT always_inline_safe_mask
- = (MASK_USE_8BIT_IDIV | MASK_ACCUMULATE_OUTGOING_ARGS
- | MASK_NO_ALIGN_STRINGOPS | MASK_AVX256_SPLIT_UNALIGNED_LOAD
- | MASK_AVX256_SPLIT_UNALIGNED_STORE | MASK_CLD
- | MASK_NO_FANCY_MATH_387 | MASK_IEEE_FP | MASK_INLINE_ALL_STRINGOPS
- | MASK_INLINE_STRINGOPS_DYNAMICALLY | MASK_RECIP | MASK_STACK_PROBE
- | MASK_STV | MASK_TLS_DIRECT_SEG_REFS | MASK_VZEROUPPER
- | MASK_NO_PUSH_ARGS | MASK_OMIT_LEAF_FRAME_POINTER);
-
-
- if (!callee_tree)
- callee_tree = target_option_default_node;
- if (!caller_tree)
- caller_tree = target_option_default_node;
- if (callee_tree == caller_tree)
- return true;
-
- struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
- struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
- bool ret = false;
- bool always_inline
- = (DECL_DISREGARD_INLINE_LIMITS (callee)
- && lookup_attribute ("always_inline",
- DECL_ATTRIBUTES (callee)));
-
- cgraph_node *callee_node = cgraph_node::get (callee);
- /* Callee's isa options should be a subset of the caller's, i.e. a SSE4
- function can inline a SSE2 function but a SSE2 function can't inline
- a SSE4 function. */
- if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
- != callee_opts->x_ix86_isa_flags)
- || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2)
- != callee_opts->x_ix86_isa_flags2))
- ret = false;
-
- /* See if we have the same non-isa options. */
- else if ((!always_inline
- && caller_opts->x_target_flags != callee_opts->x_target_flags)
- || (caller_opts->x_target_flags & ~always_inline_safe_mask)
- != (callee_opts->x_target_flags & ~always_inline_safe_mask))
- ret = false;
-
- /* See if arch, tune, etc. are the same. */
- else if (caller_opts->arch != callee_opts->arch)
- ret = false;
-
- else if (!always_inline && caller_opts->tune != callee_opts->tune)
- ret = false;
-
- else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath
- /* If the calle doesn't use FP expressions differences in
- ix86_fpmath can be ignored. We are called from FEs
- for multi-versioning call optimization, so beware of
- ipa_fn_summaries not available. */
- && (! ipa_fn_summaries
- || ipa_fn_summaries->get (callee_node) == NULL
- || ipa_fn_summaries->get (callee_node)->fp_expressions))
- ret = false;
-
- else if (!always_inline
- && caller_opts->branch_cost != callee_opts->branch_cost)
- ret = false;
-
- else
- ret = true;
-
- return ret;
-}
-
-\f
-/* Remember the last target of ix86_set_current_function. */
-static GTY(()) tree ix86_previous_fndecl;
-
-/* Set targets globals to the default (or current #pragma GCC target
- if active). Invalidate ix86_previous_fndecl cache. */
-
-void
-ix86_reset_previous_fndecl (void)
-{
- tree new_tree = target_option_current_node;
- cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
- if (TREE_TARGET_GLOBALS (new_tree))
- restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
- else if (new_tree == target_option_default_node)
- restore_target_globals (&default_target_globals);
- else
- TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
- ix86_previous_fndecl = NULL_TREE;
-}
+ if (mode == XFmode || mode == XCmode)
+ return false;
-/* Set the func_type field from the function FNDECL. */
+ if (TYPE_ALIGN (type) < 128)
+ return false;
-static void
-ix86_set_func_type (tree fndecl)
-{
- if (cfun->machine->func_type == TYPE_UNKNOWN)
+ if (AGGREGATE_TYPE_P (type))
{
- if (lookup_attribute ("interrupt",
- TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
+ /* Walk the aggregates recursively. */
+ switch (TREE_CODE (type))
{
- if (ix86_function_naked (fndecl))
- error_at (DECL_SOURCE_LOCATION (fndecl),
- "interrupt and naked attributes are not compatible");
+ case RECORD_TYPE:
+ case UNION_TYPE:
+ case QUAL_UNION_TYPE:
+ {
+ tree field;
- int nargs = 0;
- for (tree arg = DECL_ARGUMENTS (fndecl);
- arg;
- arg = TREE_CHAIN (arg))
- nargs++;
- cfun->machine->no_caller_saved_registers = true;
- cfun->machine->func_type
- = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT;
+ /* Walk all the structure fields. */
+ for (field = TYPE_FIELDS (type);
+ field;
+ field = DECL_CHAIN (field))
+ {
+ if (TREE_CODE (field) == FIELD_DECL
+ && ix86_contains_aligned_value_p (TREE_TYPE (field)))
+ return true;
+ }
+ break;
+ }
- ix86_optimize_mode_switching[X86_DIRFLAG] = 1;
+ case ARRAY_TYPE:
+ /* Just for use if some language passes arrays by value. */
+ if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
+ return true;
+ break;
- /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */
- if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG)
- sorry ("only DWARF debug format is supported for interrupt "
- "service routine");
- }
- else
- {
- cfun->machine->func_type = TYPE_NORMAL;
- if (lookup_attribute ("no_caller_saved_registers",
- TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
- cfun->machine->no_caller_saved_registers = true;
+ default:
+ gcc_unreachable ();
}
}
-}
-
-/* Set the indirect_branch_type field from the function FNDECL. */
+ else
+ return TYPE_ALIGN (type) >= 128;
-static void
-ix86_set_indirect_branch_type (tree fndecl)
-{
- if (cfun->machine->indirect_branch_type == indirect_branch_unset)
- {
- tree attr = lookup_attribute ("indirect_branch",
- DECL_ATTRIBUTES (fndecl));
- if (attr != NULL)
- {
- tree args = TREE_VALUE (attr);
- if (args == NULL)
- gcc_unreachable ();
- tree cst = TREE_VALUE (args);
- if (strcmp (TREE_STRING_POINTER (cst), "keep") == 0)
- cfun->machine->indirect_branch_type = indirect_branch_keep;
- else if (strcmp (TREE_STRING_POINTER (cst), "thunk") == 0)
- cfun->machine->indirect_branch_type = indirect_branch_thunk;
- else if (strcmp (TREE_STRING_POINTER (cst), "thunk-inline") == 0)
- cfun->machine->indirect_branch_type = indirect_branch_thunk_inline;
- else if (strcmp (TREE_STRING_POINTER (cst), "thunk-extern") == 0)
- cfun->machine->indirect_branch_type = indirect_branch_thunk_extern;
- else
- gcc_unreachable ();
- }
- else
- cfun->machine->indirect_branch_type = ix86_indirect_branch;
+ return false;
+}
- /* -mcmodel=large is not compatible with -mindirect-branch=thunk
- nor -mindirect-branch=thunk-extern. */
- if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
- && ((cfun->machine->indirect_branch_type
- == indirect_branch_thunk_extern)
- || (cfun->machine->indirect_branch_type
- == indirect_branch_thunk)))
- error ("%<-mindirect-branch=%s%> and %<-mcmodel=large%> are not "
- "compatible",
- ((cfun->machine->indirect_branch_type
- == indirect_branch_thunk_extern)
- ? "thunk-extern" : "thunk"));
-
- if (cfun->machine->indirect_branch_type != indirect_branch_keep
- && (flag_cf_protection & CF_RETURN))
- error ("%<-mindirect-branch%> and %<-fcf-protection%> are not "
- "compatible");
- }
-
- if (cfun->machine->function_return_type == indirect_branch_unset)
- {
- tree attr = lookup_attribute ("function_return",
- DECL_ATTRIBUTES (fndecl));
- if (attr != NULL)
- {
- tree args = TREE_VALUE (attr);
- if (args == NULL)
- gcc_unreachable ();
- tree cst = TREE_VALUE (args);
- if (strcmp (TREE_STRING_POINTER (cst), "keep") == 0)
- cfun->machine->function_return_type = indirect_branch_keep;
- else if (strcmp (TREE_STRING_POINTER (cst), "thunk") == 0)
- cfun->machine->function_return_type = indirect_branch_thunk;
- else if (strcmp (TREE_STRING_POINTER (cst), "thunk-inline") == 0)
- cfun->machine->function_return_type = indirect_branch_thunk_inline;
- else if (strcmp (TREE_STRING_POINTER (cst), "thunk-extern") == 0)
- cfun->machine->function_return_type = indirect_branch_thunk_extern;
- else
- gcc_unreachable ();
- }
- else
- cfun->machine->function_return_type = ix86_function_return;
+/* Gives the alignment boundary, in bits, of an argument with the
+ specified mode and type. */
- /* -mcmodel=large is not compatible with -mfunction-return=thunk
- nor -mfunction-return=thunk-extern. */
- if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
- && ((cfun->machine->function_return_type
- == indirect_branch_thunk_extern)
- || (cfun->machine->function_return_type
- == indirect_branch_thunk)))
- error ("%<-mfunction-return=%s%> and %<-mcmodel=large%> are not "
- "compatible",
- ((cfun->machine->function_return_type
- == indirect_branch_thunk_extern)
- ? "thunk-extern" : "thunk"));
-
- if (cfun->machine->function_return_type != indirect_branch_keep
- && (flag_cf_protection & CF_RETURN))
- error ("%<-mfunction-return%> and %<-fcf-protection%> are not "
- "compatible");
- }
-}
-
-/* Establish appropriate back-end context for processing the function
- FNDECL. The argument might be NULL to indicate processing at top
- level, outside of any function scope. */
-static void
-ix86_set_current_function (tree fndecl)
+static unsigned int
+ix86_function_arg_boundary (machine_mode mode, const_tree type)
{
- /* Only change the context if the function changes. This hook is called
- several times in the course of compiling a function, and we don't want to
- slow things down too much or call target_reinit when it isn't safe. */
- if (fndecl == ix86_previous_fndecl)
+ unsigned int align;
+ if (type)
{
- /* There may be 2 function bodies for the same function FNDECL,
- one is extern inline and one isn't. Call ix86_set_func_type
- to set the func_type field. */
- if (fndecl != NULL_TREE)
- {
- ix86_set_func_type (fndecl);
- ix86_set_indirect_branch_type (fndecl);
- }
- return;
+ /* Since the main variant type is used for the call, convert the
+ type to its main variant. */
+ type = TYPE_MAIN_VARIANT (type);
+ align = TYPE_ALIGN (type);
+ if (TYPE_EMPTY_P (type))
+ return PARM_BOUNDARY;
}
-
- tree old_tree;
- if (ix86_previous_fndecl == NULL_TREE)
- old_tree = target_option_current_node;
- else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl))
- old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl);
else
- old_tree = target_option_default_node;
-
- if (fndecl == NULL_TREE)
+ align = GET_MODE_ALIGNMENT (mode);
+ if (align < PARM_BOUNDARY)
+ align = PARM_BOUNDARY;
+ else
{
- if (old_tree != target_option_current_node)
- ix86_reset_previous_fndecl ();
- return;
- }
-
- ix86_set_func_type (fndecl);
- ix86_set_indirect_branch_type (fndecl);
-
- tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
- if (new_tree == NULL_TREE)
- new_tree = target_option_default_node;
+ static bool warned;
+ unsigned int saved_align = align;
- if (old_tree != new_tree)
- {
- cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
- if (TREE_TARGET_GLOBALS (new_tree))
- restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
- else if (new_tree == target_option_default_node)
- restore_target_globals (&default_target_globals);
- else
- TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
- }
- ix86_previous_fndecl = fndecl;
+ if (!TARGET_64BIT)
+ {
+ /* The i386 ABI defines XFmode arguments to be 4-byte aligned. */
+ if (!type)
+ {
+ if (mode == XFmode || mode == XCmode)
+ align = PARM_BOUNDARY;
+ }
+ else if (!ix86_contains_aligned_value_p (type))
+ align = PARM_BOUNDARY;
- static bool prev_no_caller_saved_registers;
+ if (align < 128)
+ align = PARM_BOUNDARY;
+ }
- /* 64-bit MS and SYSV ABI have different set of call used registers.
- Avoid expensive re-initialization of init_regs each time we switch
- function context. */
- if (TARGET_64BIT
- && (call_used_regs[SI_REG]
- == (cfun->machine->call_abi == MS_ABI)))
- reinit_regs ();
- /* Need to re-initialize init_regs if caller-saved registers are
- changed. */
- else if (prev_no_caller_saved_registers
- != cfun->machine->no_caller_saved_registers)
- reinit_regs ();
-
- if (cfun->machine->func_type != TYPE_NORMAL
- || cfun->machine->no_caller_saved_registers)
- {
- /* Don't allow SSE, MMX nor x87 instructions since they
- may change processor state. */
- const char *isa;
- if (TARGET_SSE)
- isa = "SSE";
- else if (TARGET_MMX)
- isa = "MMX/3Dnow";
- else if (TARGET_80387)
- isa = "80387";
- else
- isa = NULL;
- if (isa != NULL)
- {
- if (cfun->machine->func_type != TYPE_NORMAL)
- sorry (cfun->machine->func_type == TYPE_EXCEPTION
- ? G_("%s instructions aren%'t allowed in an"
- " exception service routine")
- : G_("%s instructions aren%'t allowed in an"
- " interrupt service routine"),
- isa);
- else
- sorry ("%s instructions aren%'t allowed in a function with "
- "the %<no_caller_saved_registers%> attribute", isa);
- /* Don't issue the same error twice. */
- cfun->machine->func_type = TYPE_NORMAL;
- cfun->machine->no_caller_saved_registers = false;
+ if (warn_psabi
+ && !warned
+ && align != ix86_compat_function_arg_boundary (mode, type,
+ saved_align))
+ {
+ warned = true;
+ inform (input_location,
+ "the ABI for passing parameters with %d-byte"
+ " alignment has changed in GCC 4.6",
+ align / BITS_PER_UNIT);
}
}
- prev_no_caller_saved_registers
- = cfun->machine->no_caller_saved_registers;
+ return align;
}
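+
+/* For example, on ia32 a long double (XFmode) argument is aligned to only
+ 4 bytes, whereas an __m128 argument gets 16-byte alignment; the psABI
+ note above is issued when the GCC 4.6 alignment change would make a
+ difference for the type at hand. */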
-\f
-/* Return true if this goes in large data/bss. */
+/* Return true if REGNO is a possible register number for a function
+ value. */
static bool
-ix86_in_large_data_p (tree exp)
+ix86_function_value_regno_p (const unsigned int regno)
{
- if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
- return false;
-
- if (exp == NULL_TREE)
- return false;
-
- /* Functions are never large data. */
- if (TREE_CODE (exp) == FUNCTION_DECL)
- return false;
+ switch (regno)
+ {
+ case AX_REG:
+ return true;
+ case DX_REG:
+ return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
+ case DI_REG:
+ case SI_REG:
+ return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
- /* Automatic variables are never large data. */
- if (VAR_P (exp) && !is_global_var (exp))
- return false;
+ /* Complex values are returned in %st(0)/%st(1) pair. */
+ case ST0_REG:
+ case ST1_REG:
+ /* TODO: The function should depend on the current function ABI, but
+ builtins.c would need updating then. Therefore we use the
+ default ABI. */
+ if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
+ return false;
+ return TARGET_FLOAT_RETURNS_IN_80387;
- if (VAR_P (exp) && DECL_SECTION_NAME (exp))
- {
- const char *section = DECL_SECTION_NAME (exp);
- if (strcmp (section, ".ldata") == 0
- || strcmp (section, ".lbss") == 0)
- return true;
- return false;
- }
- else
- {
- HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
+ /* Complex values are returned in %xmm0/%xmm1 pair. */
+ case XMM0_REG:
+ case XMM1_REG:
+ return TARGET_SSE;
- /* If this is an incomplete type with size 0, then we can't put it
- in data because it might be too big when completed. Also,
- int_size_in_bytes returns -1 if size can vary or is larger than
- an integer in which case also it is safer to assume that it goes in
- large data. */
- if (size <= 0 || size > ix86_section_threshold)
- return true;
+ case MM0_REG:
+ if (TARGET_MACHO || TARGET_64BIT)
+ return false;
+ return TARGET_MMX;
}
return false;
}
-/* i386-specific section flag to mark large sections. */
-#define SECTION_LARGE SECTION_MACH_DEP
+/* Define how to find the value returned by a function.
+ VALTYPE is the data type of the value (as a tree).
+ If the precise function being called is known, FUNC is its FUNCTION_DECL;
+ otherwise, FUNC is 0. */
-/* Switch to the appropriate section for output of DECL.
- DECL is either a `VAR_DECL' node or a constant of some sort.
- RELOC indicates whether forming the initial value of DECL requires
- link-time relocations. */
-
-ATTRIBUTE_UNUSED static section *
-x86_64_elf_select_section (tree decl, int reloc,
- unsigned HOST_WIDE_INT align)
+static rtx
+function_value_32 (machine_mode orig_mode, machine_mode mode,
+ const_tree fntype, const_tree fn)
{
- if (ix86_in_large_data_p (decl))
- {
- const char *sname = NULL;
- unsigned int flags = SECTION_WRITE | SECTION_LARGE;
- switch (categorize_decl_for_section (decl, reloc))
- {
- case SECCAT_DATA:
- sname = ".ldata";
- break;
- case SECCAT_DATA_REL:
- sname = ".ldata.rel";
- break;
- case SECCAT_DATA_REL_LOCAL:
- sname = ".ldata.rel.local";
- break;
- case SECCAT_DATA_REL_RO:
- sname = ".ldata.rel.ro";
- break;
- case SECCAT_DATA_REL_RO_LOCAL:
- sname = ".ldata.rel.ro.local";
- break;
- case SECCAT_BSS:
- sname = ".lbss";
- flags |= SECTION_BSS;
- break;
- case SECCAT_RODATA:
- case SECCAT_RODATA_MERGE_STR:
- case SECCAT_RODATA_MERGE_STR_INIT:
- case SECCAT_RODATA_MERGE_CONST:
- sname = ".lrodata";
- flags &= ~SECTION_WRITE;
- break;
- case SECCAT_SRODATA:
- case SECCAT_SDATA:
- case SECCAT_SBSS:
- gcc_unreachable ();
- case SECCAT_TEXT:
- case SECCAT_TDATA:
- case SECCAT_TBSS:
- /* We don't split these for medium model. Place them into
- default sections and hope for best. */
- break;
- }
- if (sname)
- {
- /* We might get called with string constants, but get_named_section
- doesn't like them as they are not DECLs. Also, we need to set
- flags in that case. */
- if (!DECL_P (decl))
- return get_section (sname, flags, NULL);
- return get_named_section (decl, sname, reloc);
- }
- }
- return default_elf_select_section (decl, reloc, align);
-}
+ unsigned int regno;
-/* Select a set of attributes for section NAME based on the properties
- of DECL and whether or not RELOC indicates that DECL's initializer
- might contain runtime relocations. */
+ /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
+ we normally prevent this case when MMX is not available. However,
+ some ABIs may require the result to be returned like DImode. */
+ if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
+ regno = FIRST_MMX_REG;
-static unsigned int ATTRIBUTE_UNUSED
-x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
-{
- unsigned int flags = default_section_type_flags (decl, name, reloc);
+ /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
+ we prevent this case when SSE is not available. However, some ABIs
+ may require the result to be returned like integer TImode. */
+ else if (mode == TImode
+ || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
+ regno = FIRST_SSE_REG;
- if (ix86_in_large_data_p (decl))
- flags |= SECTION_LARGE;
+ /* 32-byte vector modes in %ymm0. */
+ else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
+ regno = FIRST_SSE_REG;
- if (decl == NULL_TREE
- && (strcmp (name, ".ldata.rel.ro") == 0
- || strcmp (name, ".ldata.rel.ro.local") == 0))
- flags |= SECTION_RELRO;
+ /* 64-byte vector modes in %zmm0. */
+ else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
+ regno = FIRST_SSE_REG;
- if (strcmp (name, ".lbss") == 0
- || strncmp (name, ".lbss.", 5) == 0
- || strncmp (name, ".gnu.linkonce.lb.", 16) == 0)
- flags |= SECTION_BSS;
+ /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
+ else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
+ regno = FIRST_FLOAT_REG;
+ else
+ /* Most things go in %eax. */
+ regno = AX_REG;
- return flags;
-}
+ /* Override FP return register with %xmm0 for local functions when
+ SSE math is enabled or for functions with sseregparm attribute. */
+ if ((fn || fntype) && (mode == SFmode || mode == DFmode))
+ {
+ int sse_level = ix86_function_sseregparm (fntype, fn, false);
+ if (sse_level == -1)
+ {
+ error ("calling %qD with SSE calling convention without "
+ "SSE/SSE2 enabled", fn);
+ sorry ("this is a GCC bug that can be worked around by adding "
+ "attribute used to function called");
+ }
+ else if ((sse_level >= 1 && mode == SFmode)
+ || (sse_level == 2 && mode == DFmode))
+ regno = FIRST_SSE_REG;
+ }
-/* Build up a unique section name, expressed as a
- STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
- RELOC indicates whether the initial value of EXP requires
- link-time relocations. */
+ /* OImode shouldn't be used directly. */
+ gcc_assert (mode != OImode);
-static void ATTRIBUTE_UNUSED
-x86_64_elf_unique_section (tree decl, int reloc)
+ return gen_rtx_REG (orig_mode, regno);
+}
+
+static rtx
+function_value_64 (machine_mode orig_mode, machine_mode mode,
+ const_tree valtype)
{
- if (ix86_in_large_data_p (decl))
+ rtx ret;
+
+ /* Handle libcalls, which don't provide a type node. */
+ if (valtype == NULL)
{
- const char *prefix = NULL;
- /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
- bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
+ unsigned int regno;
- switch (categorize_decl_for_section (decl, reloc))
+ switch (mode)
{
- case SECCAT_DATA:
- case SECCAT_DATA_REL:
- case SECCAT_DATA_REL_LOCAL:
- case SECCAT_DATA_REL_RO:
- case SECCAT_DATA_REL_RO_LOCAL:
- prefix = one_only ? ".ld" : ".ldata";
- break;
- case SECCAT_BSS:
- prefix = one_only ? ".lb" : ".lbss";
- break;
- case SECCAT_RODATA:
- case SECCAT_RODATA_MERGE_STR:
- case SECCAT_RODATA_MERGE_STR_INIT:
- case SECCAT_RODATA_MERGE_CONST:
- prefix = one_only ? ".lr" : ".lrodata";
+ case E_SFmode:
+ case E_SCmode:
+ case E_DFmode:
+ case E_DCmode:
+ case E_TFmode:
+ case E_SDmode:
+ case E_DDmode:
+ case E_TDmode:
+ regno = FIRST_SSE_REG;
break;
- case SECCAT_SRODATA:
- case SECCAT_SDATA:
- case SECCAT_SBSS:
- gcc_unreachable ();
- case SECCAT_TEXT:
- case SECCAT_TDATA:
- case SECCAT_TBSS:
- /* We don't split these for medium model. Place them into
- default sections and hope for best. */
+ case E_XFmode:
+ case E_XCmode:
+ regno = FIRST_FLOAT_REG;
break;
+ case E_TCmode:
+ return NULL;
+ default:
+ regno = AX_REG;
}
- if (prefix)
- {
- const char *name, *linkonce;
- char *string;
- name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
- name = targetm.strip_name_encoding (name);
+ return gen_rtx_REG (mode, regno);
+ }
+ else if (POINTER_TYPE_P (valtype))
+ {
+ /* Pointers are always returned in word_mode. */
+ mode = word_mode;
+ }
- /* If we're using one_only, then there needs to be a .gnu.linkonce
- prefix to the section name. */
- linkonce = one_only ? ".gnu.linkonce" : "";
+ ret = construct_container (mode, orig_mode, valtype, 1,
+ X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
+ x86_64_int_return_registers, 0);
- string = ACONCAT ((linkonce, prefix, ".", name, NULL));
+ /* For zero-sized structures, construct_container returns NULL, but we
+ need to keep the rest of the compiler happy by returning a
+ meaningful value. */
+ if (!ret)
+ ret = gen_rtx_REG (orig_mode, AX_REG);
- set_decl_section_name (decl, string);
- return;
- }
- }
- default_unique_section (decl, reloc);
+ return ret;
}
-#ifdef COMMON_ASM_OP
-
-#ifndef LARGECOMM_SECTION_ASM_OP
-#define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
-#endif
+static rtx
+function_value_ms_32 (machine_mode orig_mode, machine_mode mode,
+ const_tree fntype, const_tree fn, const_tree valtype)
+{
+ unsigned int regno;
-/* This says how to output assembler code to declare an
- uninitialized external linkage data object.
+ /* Floating point return values in %st(0)
+ (unless -mno-fp-ret-in-387 is used or the value is an aggregate
+ type of up to 8 bytes). */
+ if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387
+ && (GET_MODE_SIZE (mode) > 8
+ || valtype == NULL_TREE || !AGGREGATE_TYPE_P (valtype)))
+ {
+ regno = FIRST_FLOAT_REG;
+ return gen_rtx_REG (orig_mode, regno);
+ }
+ else
+ return function_value_32 (orig_mode, mode, fntype, fn);
+}
- For medium model x86-64 we need to use LARGECOMM_SECTION_ASM_OP opcode for
- large objects. */
-void
-x86_elf_aligned_decl_common (FILE *file, tree decl,
- const char *name, unsigned HOST_WIDE_INT size,
- int align)
+static rtx
+function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
+ const_tree valtype)
{
- if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
- && size > (unsigned int)ix86_section_threshold)
+ unsigned int regno = AX_REG;
+
+ if (TARGET_SSE)
{
- switch_to_section (get_named_section (decl, ".lbss", 0));
- fputs (LARGECOMM_SECTION_ASM_OP, file);
+ switch (GET_MODE_SIZE (mode))
+ {
+ case 16:
+ if (valtype != NULL_TREE
+ && !VECTOR_INTEGER_TYPE_P (valtype)
+ && !INTEGRAL_TYPE_P (valtype)
+ && !VECTOR_FLOAT_TYPE_P (valtype))
+ break;
+ if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
+ && !COMPLEX_MODE_P (mode))
+ regno = FIRST_SSE_REG;
+ break;
+ case 8:
+ case 4:
+ if (valtype != NULL_TREE && AGGREGATE_TYPE_P (valtype))
+ break;
+ if (mode == SFmode || mode == DFmode)
+ regno = FIRST_SSE_REG;
+ break;
+ default:
+ break;
+ }
}
- else
- fputs (COMMON_ASM_OP, file);
- assemble_name (file, name);
- fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
- size, align / BITS_PER_UNIT);
+ return gen_rtx_REG (orig_mode, regno);
}
-#endif
-
-/* Utility function for targets to use in implementing
- ASM_OUTPUT_ALIGNED_BSS. */
-void
-x86_output_aligned_bss (FILE *file, tree decl, const char *name,
- unsigned HOST_WIDE_INT size, int align)
+static rtx
+ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
+ machine_mode orig_mode, machine_mode mode)
{
- if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
- && size > (unsigned int)ix86_section_threshold)
- switch_to_section (get_named_section (decl, ".lbss", 0));
+ const_tree fn, fntype;
+
+ fn = NULL_TREE;
+ if (fntype_or_decl && DECL_P (fntype_or_decl))
+ fn = fntype_or_decl;
+ fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
+
+ if (ix86_function_type_abi (fntype) == MS_ABI)
+ {
+ if (TARGET_64BIT)
+ return function_value_ms_64 (orig_mode, mode, valtype);
+ else
+ return function_value_ms_32 (orig_mode, mode, fntype, fn, valtype);
+ }
+ else if (TARGET_64BIT)
+ return function_value_64 (orig_mode, mode, valtype);
else
- switch_to_section (bss_section);
- ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
-#ifdef ASM_DECLARE_OBJECT_NAME
- last_assemble_variable_decl = decl;
- ASM_DECLARE_OBJECT_NAME (file, name, decl);
-#else
- /* Standard thing is just output label for the object. */
- ASM_OUTPUT_LABEL (file, name);
-#endif /* ASM_DECLARE_OBJECT_NAME */
- ASM_OUTPUT_SKIP (file, size ? size : 1);
+ return function_value_32 (orig_mode, mode, fntype, fn);
}
-\f
-/* Decide whether we must probe the stack before any space allocation
- on this target. It's essentially TARGET_STACK_PROBE except when
- -fstack-check causes the stack to be already probed differently. */
-bool
-ix86_target_stack_probe (void)
+static rtx
+ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
{
- /* Do not probe the stack twice if static stack checking is enabled. */
- if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
- return false;
+ machine_mode mode, orig_mode;
- return TARGET_STACK_PROBE;
+ orig_mode = TYPE_MODE (valtype);
+ mode = type_natural_mode (valtype, NULL, true);
+ return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
}
-\f
-/* Decide whether we can make a sibling call to a function. DECL is the
- declaration of the function being targeted by the call and EXP is the
- CALL_EXPR representing the call. */
-
-static bool
-ix86_function_ok_for_sibcall (tree decl, tree exp)
-{
- tree type, decl_or_type;
- rtx a, b;
- bool bind_global = decl && !targetm.binds_local_p (decl);
- if (ix86_function_naked (current_function_decl))
- return false;
-
- /* Sibling call isn't OK if there are no caller-saved registers
- since all registers must be preserved before return. */
- if (cfun->machine->no_caller_saved_registers)
- return false;
-
- /* If we are generating position-independent code, we cannot sibcall
- optimize direct calls to global functions, as the PLT requires
- %ebx be live. (Darwin does not have a PLT.) */
- if (!TARGET_MACHO
- && !TARGET_64BIT
- && flag_pic
- && flag_plt
- && bind_global)
- return false;
-
- /* If we need to align the outgoing stack, then sibcalling would
- unalign the stack, which may break the called function. */
- if (ix86_minimum_incoming_stack_boundary (true)
- < PREFERRED_STACK_BOUNDARY)
- return false;
-
- if (decl)
- {
- decl_or_type = decl;
- type = TREE_TYPE (decl);
- }
- else
- {
- /* We're looking at the CALL_EXPR, we need the type of the function. */
- type = CALL_EXPR_FN (exp); /* pointer expression */
- type = TREE_TYPE (type); /* pointer type */
- type = TREE_TYPE (type); /* function type */
- decl_or_type = type;
- }
+/* Pointer function arguments and return values are promoted to
+ word_mode for normal functions. */
- /* Check that the return value locations are the same. Like
- if we are returning floats on the 80387 register stack, we cannot
- make a sibcall from a function that doesn't return a float to a
- function that does or, conversely, from a function that does return
- a float to a function that doesn't; the necessary stack adjustment
- would not be executed. This is also the place we notice
- differences in the return value ABI. Note that it is ok for one
- of the functions to have void return type as long as the return
- value of the other is passed in a register. */
- a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
- b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
- cfun->decl, false);
- if (STACK_REG_P (a) || STACK_REG_P (b))
+static machine_mode
+ix86_promote_function_mode (const_tree type, machine_mode mode,
+ int *punsignedp, const_tree fntype,
+ int for_return)
+{
+ if (cfun->machine->func_type == TYPE_NORMAL
+ && type != NULL_TREE
+ && POINTER_TYPE_P (type))
{
- if (!rtx_equal_p (a, b))
- return false;
+ *punsignedp = POINTERS_EXTEND_UNSIGNED;
+ return word_mode;
}
- else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
- ;
- else if (!rtx_equal_p (a, b))
- return false;
+ return default_promote_function_mode (type, mode, punsignedp, fntype,
+ for_return);
+}
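+
+/* For example, on x32, where Pmode is SImode but word_mode is DImode, a
+ pointer argument or return value of a normal function is zero-extended
+ to 64 bits by the word_mode promotion above; interrupt and exception
+ handlers keep the unpromoted pointer mode. */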
- if (TARGET_64BIT)
- {
- /* The SYSV ABI has more call-clobbered registers;
- disallow sibcalls from MS to SYSV. */
- if (cfun->machine->call_abi == MS_ABI
- && ix86_function_type_abi (type) == SYSV_ABI)
- return false;
- }
- else
- {
- /* If this call is indirect, we'll need to be able to use a
- call-clobbered register for the address of the target function.
- Make sure that all such registers are not used for passing
- parameters. Note that DLLIMPORT functions and call to global
- function via GOT slot are indirect. */
- if (!decl
- || (bind_global && flag_pic && !flag_plt)
- || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl))
- || flag_force_indirect_call)
- {
- /* Check if regparm >= 3 since arg_reg_available is set to
- false if regparm == 0. If regparm is 1 or 2, there is
- always a call-clobbered register available.
+/* Return true if a structure, union or array with MODE containing FIELD
+ should be accessed using BLKmode. */
- ??? The symbol indirect call doesn't need a call-clobbered
- register. But we don't know if this is a symbol indirect
- call or not here. */
- if (ix86_function_regparm (type, decl) >= 3
- && !cfun->machine->arg_reg_available)
- return false;
- }
- }
+static bool
+ix86_member_type_forces_blk (const_tree field, machine_mode mode)
+{
+ /* Union with XFmode must be in BLKmode. */
+ return (mode == XFmode
+ && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
+ || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
+}
- /* Otherwise okay. That also includes certain types of indirect calls. */
- return true;
+rtx
+ix86_libcall_value (machine_mode mode)
+{
+ return ix86_function_value_1 (NULL, NULL, mode, mode);
}
-/* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
- and "sseregparm" calling convention attributes;
- arguments as in struct attribute_spec.handler. */
+/* Return true iff type is returned in memory. */
-static tree
-ix86_handle_cconv_attribute (tree *node, tree name, tree args, int,
- bool *no_add_attrs)
+static bool
+ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
{
- if (TREE_CODE (*node) != FUNCTION_TYPE
- && TREE_CODE (*node) != METHOD_TYPE
- && TREE_CODE (*node) != FIELD_DECL
- && TREE_CODE (*node) != TYPE_DECL)
- {
- warning (OPT_Wattributes, "%qE attribute only applies to functions",
- name);
- *no_add_attrs = true;
- return NULL_TREE;
- }
+#ifdef SUBTARGET_RETURN_IN_MEMORY
+ return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
+#else
+ const machine_mode mode = type_natural_mode (type, NULL, true);
+ HOST_WIDE_INT size;
- /* Can combine regparm with all attributes but fastcall, and thiscall. */
- if (is_attribute_p ("regparm", name))
+ if (TARGET_64BIT)
{
- tree cst;
-
- if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
- {
- error ("fastcall and regparm attributes are not compatible");
- }
-
- if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
+ if (ix86_function_type_abi (fntype) == MS_ABI)
{
- error ("regparam and thiscall attributes are not compatible");
- }
+ size = int_size_in_bytes (type);
- cst = TREE_VALUE (args);
- if (TREE_CODE (cst) != INTEGER_CST)
- {
- warning (OPT_Wattributes,
- "%qE attribute requires an integer constant argument",
- name);
- *no_add_attrs = true;
+ /* __m128 is returned in xmm0. */
+ if ((!type || VECTOR_INTEGER_TYPE_P (type)
+ || INTEGRAL_TYPE_P (type)
+ || VECTOR_FLOAT_TYPE_P (type))
+ && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
+ && !COMPLEX_MODE_P (mode)
+ && (GET_MODE_SIZE (mode) == 16 || size == 16))
+ return false;
+
+ /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
+ return size != 1 && size != 2 && size != 4 && size != 8;
}
- else if (compare_tree_int (cst, REGPARM_MAX) > 0)
+ else
{
- warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
- name, REGPARM_MAX);
- *no_add_attrs = true;
- }
+ int needed_intregs, needed_sseregs;
- return NULL_TREE;
+ return examine_argument (mode, type, 1,
+ &needed_intregs, &needed_sseregs);
+ }
}
-
- if (TARGET_64BIT)
+ else
{
- /* Do not warn when emulating the MS ABI. */
- if ((TREE_CODE (*node) != FUNCTION_TYPE
- && TREE_CODE (*node) != METHOD_TYPE)
- || ix86_function_type_abi (*node) != MS_ABI)
- warning (OPT_Wattributes, "%qE attribute ignored",
- name);
- *no_add_attrs = true;
- return NULL_TREE;
- }
+ size = int_size_in_bytes (type);
- /* Can combine fastcall with stdcall (redundant) and sseregparm. */
- if (is_attribute_p ("fastcall", name))
- {
- if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
- {
- error ("fastcall and cdecl attributes are not compatible");
- }
- if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
- {
- error ("fastcall and stdcall attributes are not compatible");
- }
- if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
- {
- error ("fastcall and regparm attributes are not compatible");
- }
- if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
- {
- error ("fastcall and thiscall attributes are not compatible");
- }
- }
+ /* Intel MCU psABI returns scalars and aggregates no larger than 8
+ bytes in registers. */
+ if (TARGET_IAMCU)
+ return VECTOR_MODE_P (mode) || size < 0 || size > 8;
- /* Can combine stdcall with fastcall (redundant), regparm and
- sseregparm. */
- else if (is_attribute_p ("stdcall", name))
- {
- if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
- {
- error ("stdcall and cdecl attributes are not compatible");
- }
- if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
- {
- error ("stdcall and fastcall attributes are not compatible");
- }
- if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
- {
- error ("stdcall and thiscall attributes are not compatible");
- }
- }
+ if (mode == BLKmode)
+ return true;
- /* Can combine cdecl with regparm and sseregparm. */
- else if (is_attribute_p ("cdecl", name))
- {
- if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
- {
- error ("stdcall and cdecl attributes are not compatible");
- }
- if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
- {
- error ("fastcall and cdecl attributes are not compatible");
- }
- if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
- {
- error ("cdecl and thiscall attributes are not compatible");
- }
- }
- else if (is_attribute_p ("thiscall", name))
- {
- if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
- warning (OPT_Wattributes, "%qE attribute is used for non-class method",
- name);
- if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
- {
- error ("stdcall and thiscall attributes are not compatible");
- }
- if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
- {
- error ("fastcall and thiscall attributes are not compatible");
- }
- if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
+ if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
+ return false;
+
+ if (VECTOR_MODE_P (mode) || mode == TImode)
{
- error ("cdecl and thiscall attributes are not compatible");
- }
- }
+ /* User-created vectors small enough to fit in EAX. */
+ if (size < 8)
+ return false;
- /* Can combine sseregparm with all attributes. */
+ /* Unless the ABI prescribes otherwise,
+ MMX/3dNow values are returned in MM0 if available. */
+
+ if (size == 8)
+ return TARGET_VECT8_RETURNS || !TARGET_MMX;
- return NULL_TREE;
-}
+ /* SSE values are returned in XMM0 if available. */
+ if (size == 16)
+ return !TARGET_SSE;
-/* The transactional memory builtins are implicitly regparm or fastcall
- depending on the ABI. Override the generic do-nothing attribute that
- these builtins were declared with, and replace it with one of the two
- attributes that we expect elsewhere. */
+ /* AVX values are returned in YMM0 if available. */
+ if (size == 32)
+ return !TARGET_AVX;
-static tree
-ix86_handle_tm_regparm_attribute (tree *node, tree, tree,
- int flags, bool *no_add_attrs)
-{
- tree alt;
+ /* AVX512F values are returned in ZMM0 if available. */
+ if (size == 64)
+ return !TARGET_AVX512F;
+ }
- /* In no case do we want to add the placeholder attribute. */
- *no_add_attrs = true;
+ if (mode == XFmode)
+ return false;
- /* The 64-bit ABI is unchanged for transactional memory. */
- if (TARGET_64BIT)
- return NULL_TREE;
+ if (size > 12)
+ return true;
- /* ??? Is there a better way to validate 32-bit windows? We have
- cfun->machine->call_abi, but that seems to be set only for 64-bit. */
- if (CHECK_STACK_LIMIT > 0)
- alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
- else
- {
- alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
- alt = tree_cons (get_identifier ("regparm"), alt, NULL);
- }
- decl_attributes (node, alt, flags);
+ /* OImode shouldn't be used directly. */
+ gcc_assert (mode != OImode);
- return NULL_TREE;
+ return false;
+ }
+#endif
}
-/* This function determines from TYPE the calling-convention. */
+\f
+/* Create the va_list data type. */
-unsigned int
-ix86_get_callcvt (const_tree type)
+static tree
+ix86_build_builtin_va_list_64 (void)
{
- unsigned int ret = 0;
- bool is_stdarg;
- tree attrs;
+ tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
- if (TARGET_64BIT)
- return IX86_CALLCVT_CDECL;
+ record = lang_hooks.types.make_type (RECORD_TYPE);
+ type_decl = build_decl (BUILTINS_LOCATION,
+ TYPE_DECL, get_identifier ("__va_list_tag"), record);
- attrs = TYPE_ATTRIBUTES (type);
- if (attrs != NULL_TREE)
- {
- if (lookup_attribute ("cdecl", attrs))
- ret |= IX86_CALLCVT_CDECL;
- else if (lookup_attribute ("stdcall", attrs))
- ret |= IX86_CALLCVT_STDCALL;
- else if (lookup_attribute ("fastcall", attrs))
- ret |= IX86_CALLCVT_FASTCALL;
- else if (lookup_attribute ("thiscall", attrs))
- ret |= IX86_CALLCVT_THISCALL;
+ f_gpr = build_decl (BUILTINS_LOCATION,
+ FIELD_DECL, get_identifier ("gp_offset"),
+ unsigned_type_node);
+ f_fpr = build_decl (BUILTINS_LOCATION,
+ FIELD_DECL, get_identifier ("fp_offset"),
+ unsigned_type_node);
+ f_ovf = build_decl (BUILTINS_LOCATION,
+ FIELD_DECL, get_identifier ("overflow_arg_area"),
+ ptr_type_node);
+ f_sav = build_decl (BUILTINS_LOCATION,
+ FIELD_DECL, get_identifier ("reg_save_area"),
+ ptr_type_node);
- /* Regparam isn't allowed for thiscall and fastcall. */
- if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
- {
- if (lookup_attribute ("regparm", attrs))
- ret |= IX86_CALLCVT_REGPARM;
- if (lookup_attribute ("sseregparm", attrs))
- ret |= IX86_CALLCVT_SSEREGPARM;
- }
+ va_list_gpr_counter_field = f_gpr;
+ va_list_fpr_counter_field = f_fpr;
- if (IX86_BASE_CALLCVT(ret) != 0)
- return ret;
- }
+ DECL_FIELD_CONTEXT (f_gpr) = record;
+ DECL_FIELD_CONTEXT (f_fpr) = record;
+ DECL_FIELD_CONTEXT (f_ovf) = record;
+ DECL_FIELD_CONTEXT (f_sav) = record;
- is_stdarg = stdarg_p (type);
- if (TARGET_RTD && !is_stdarg)
- return IX86_CALLCVT_STDCALL | ret;
+ TYPE_STUB_DECL (record) = type_decl;
+ TYPE_NAME (record) = type_decl;
+ TYPE_FIELDS (record) = f_gpr;
+ DECL_CHAIN (f_gpr) = f_fpr;
+ DECL_CHAIN (f_fpr) = f_ovf;
+ DECL_CHAIN (f_ovf) = f_sav;
- if (ret != 0
- || is_stdarg
- || TREE_CODE (type) != METHOD_TYPE
- || ix86_function_type_abi (type) != MS_ABI)
- return IX86_CALLCVT_CDECL | ret;
+ layout_type (record);
- return IX86_CALLCVT_THISCALL;
+ TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
+ NULL_TREE, TYPE_ATTRIBUTES (record));
+
+ /* The correct type is an array type of one element. */
+ return build_array_type (record, build_index_type (size_zero_node));
}
-/* Return 0 if the attributes for two types are incompatible, 1 if they
- are compatible, and 2 if they are nearly compatible (which causes a
- warning to be generated). */
+/* Set up the builtin va_list data type and, for 64-bit, the additional
+ calling-convention-specific va_list data types. */
-static int
-ix86_comp_type_attributes (const_tree type1, const_tree type2)
+static tree
+ix86_build_builtin_va_list (void)
{
- unsigned int ccvt1, ccvt2;
-
- if (TREE_CODE (type1) != FUNCTION_TYPE
- && TREE_CODE (type1) != METHOD_TYPE)
- return 1;
-
- ccvt1 = ix86_get_callcvt (type1);
- ccvt2 = ix86_get_callcvt (type2);
- if (ccvt1 != ccvt2)
- return 0;
- if (ix86_function_regparm (type1, NULL)
- != ix86_function_regparm (type2, NULL))
- return 0;
+ if (TARGET_64BIT)
+ {
+ /* Initialize ABI specific va_list builtin types.
- return 1;
-}
-\f
-/* Return the regparm value for a function with the indicated TYPE and DECL.
- DECL may be NULL when calling function indirectly
- or considering a libcall. */
+ In lto1, we can encounter two va_list types:
+ - one as a result of the type-merge across TUs, and
+ - the one constructed here.
+ These two types will not have the same TYPE_MAIN_VARIANT, and therefore
+ a type identity check in canonical_va_list_type based on
+ TYPE_MAIN_VARIANT (which we used to have) will not work.
+ Instead, we tag each va_list_type_node with its unique attribute, and
+ look for the attribute in the type identity check in
+ canonical_va_list_type.
-static int
-ix86_function_regparm (const_tree type, const_tree decl)
-{
- tree attr;
- int regparm;
- unsigned int ccvt;
+ Tagging sysv_va_list_type_node directly with the attribute is
+ problematic since it is an array of one record, which will decay into a
+ pointer to the record when used as a parameter (see the build_va_arg
+ comments for an example), dropping the attribute in the process. So we tag the
+ record instead. */
- if (TARGET_64BIT)
- return (ix86_function_type_abi (type) == SYSV_ABI
- ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
- ccvt = ix86_get_callcvt (type);
- regparm = ix86_regparm;
+ /* For SYSV_ABI we use an array of one record. */
+ sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
+
+ /* For MS_ABI we use plain pointer to argument area. */
+ tree char_ptr_type = build_pointer_type (char_type_node);
+ tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
+ TYPE_ATTRIBUTES (char_ptr_type));
+ ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
- if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
+ return ((ix86_abi == MS_ABI)
+ ? ms_va_list_type_node
+ : sysv_va_list_type_node);
+ }
+ else
{
- attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
- if (attr)
- {
- regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
- return regparm;
- }
+ /* For i386 we use plain pointer to argument area. */
+ return build_pointer_type (char_type_node);
}
- else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
- return 2;
- else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
- return 1;
+}
- /* Use register calling convention for local functions when possible. */
- if (decl
- && TREE_CODE (decl) == FUNCTION_DECL)
- {
- cgraph_node *target = cgraph_node::get (decl);
- if (target)
- target = target->function_symbol ();
+/* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
- /* Caller and callee must agree on the calling convention, so
- checking here just optimize means that with
- __attribute__((optimize (...))) caller could use regparm convention
- and callee not, or vice versa. Instead look at whether the callee
- is optimized or not. */
- if (target && opt_for_fn (target->decl, optimize)
- && !(profile_flag && !flag_fentry))
- {
- cgraph_local_info *i = &target->local;
- if (i && i->local && i->can_change_signature)
- {
- int local_regparm, globals = 0, regno;
+static void
+setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
+{
+ rtx save_area, mem;
+ alias_set_type set;
+ int i, max;
- /* Make sure no regparm register is taken by a
- fixed register variable. */
- for (local_regparm = 0; local_regparm < REGPARM_MAX;
- local_regparm++)
- if (fixed_regs[local_regparm])
- break;
+ /* GPR size of varargs save area. */
+ if (cfun->va_list_gpr_size)
+ ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
+ else
+ ix86_varargs_gpr_size = 0;
- /* We don't want to use regparm(3) for nested functions as
- these use a static chain pointer in the third argument. */
- if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
- local_regparm = 2;
+ /* FPR size of varargs save area. We don't need it if we don't pass
+ anything in SSE registers. */
+ if (TARGET_SSE && cfun->va_list_fpr_size)
+ ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
+ else
+ ix86_varargs_fpr_size = 0;
- /* Save a register for the split stack. */
- if (flag_split_stack)
- {
- if (local_regparm == 3)
- local_regparm = 2;
- else if (local_regparm == 2
- && DECL_STATIC_CHAIN (target->decl))
- local_regparm = 1;
- }
+ if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
+ return;
- /* Each fixed register usage increases register pressure,
- so less registers should be used for argument passing.
- This functionality can be overriden by an explicit
- regparm value. */
- for (regno = AX_REG; regno <= DI_REG; regno++)
- if (fixed_regs[regno])
- globals++;
+ save_area = frame_pointer_rtx;
+ set = get_varargs_alias_set ();
- local_regparm
- = globals < local_regparm ? local_regparm - globals : 0;
+ max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
+ if (max > X86_64_REGPARM_MAX)
+ max = X86_64_REGPARM_MAX;
- if (local_regparm > regparm)
- regparm = local_regparm;
- }
- }
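+ /* Save the integer argument registers, starting with the first one that
+ may hold an unnamed argument, into the register save area. */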
+ for (i = cum->regno; i < max; i++)
+ {
+ mem = gen_rtx_MEM (word_mode,
+ plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
+ MEM_NOTRAP_P (mem) = 1;
+ set_mem_alias_set (mem, set);
+ emit_move_insn (mem,
+ gen_rtx_REG (word_mode,
+ x86_64_int_parameter_registers[i]));
}
- return regparm;
-}
-
-/* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
- DFmode (2) arguments in SSE registers for a function with the
- indicated TYPE and DECL. DECL may be NULL when calling function
- indirectly or considering a libcall. Return -1 if any FP parameter
- should be rejected by error. This is used in siutation we imply SSE
- calling convetion but the function is called from another function with
- SSE disabled. Otherwise return 0. */
-
-static int
-ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
-{
- gcc_assert (!TARGET_64BIT);
-
- /* Use SSE registers to pass SFmode and DFmode arguments if requested
- by the sseregparm attribute. */
- if (TARGET_SSEREGPARM
- || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
+ if (ix86_varargs_fpr_size)
{
- if (!TARGET_SSE)
- {
- if (warn)
- {
- if (decl)
- error ("calling %qD with attribute sseregparm without "
- "SSE/SSE2 enabled", decl);
- else
- error ("calling %qT with attribute sseregparm without "
- "SSE/SSE2 enabled", type);
- }
- return 0;
- }
+ machine_mode smode;
+ rtx_code_label *label;
+ rtx test;
- return 2;
- }
+ /* Now emit code to save SSE registers. The AX parameter contains the
+ number of SSE parameter registers used to call this function, though all
+ we actually check here is the zero/non-zero status. */
- if (!decl)
- return 0;
+ label = gen_label_rtx ();
+ test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
+ emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
+ label));
- cgraph_node *target = cgraph_node::get (decl);
- if (target)
- target = target->function_symbol ();
+ /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
+ we used movdqa (i.e. TImode) instead? Perhaps even better would
+ be if we could determine the real mode of the data, via a hook
+ into pass_stdarg. Ignore all that for now. */
+ smode = V4SFmode;
+ if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
+ crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
- /* For local functions, pass up to SSE_REGPARM_MAX SFmode
- (and DFmode for SSE2) arguments in SSE registers. */
- if (target
- /* TARGET_SSE_MATH */
- && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
- && opt_for_fn (target->decl, optimize)
- && !(profile_flag && !flag_fentry))
- {
- cgraph_local_info *i = &target->local;
- if (i && i->local && i->can_change_signature)
+ max = cum->sse_regno + cfun->va_list_fpr_size / 16;
+ if (max > X86_64_SSE_REGPARM_MAX)
+ max = X86_64_SSE_REGPARM_MAX;
+
+ for (i = cum->sse_regno; i < max; ++i)
{
- /* Refuse to produce wrong code when local function with SSE enabled
- is called from SSE disabled function.
- FIXME: We need a way to detect these cases cross-ltrans partition
- and avoid using SSE calling conventions on local functions called
- from function with SSE disabled. For now at least delay the
- warning until we know we are going to produce wrong code.
- See PR66047 */
- if (!TARGET_SSE && warn)
- return -1;
- return TARGET_SSE2_P (target_opts_for_fn (target->decl)
- ->x_ix86_isa_flags) ? 2 : 1;
+ mem = plus_constant (Pmode, save_area,
+ i * 16 + ix86_varargs_gpr_size);
+ mem = gen_rtx_MEM (smode, mem);
+ MEM_NOTRAP_P (mem) = 1;
+ set_mem_alias_set (mem, set);
+ set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
+
+ emit_move_insn (mem, gen_rtx_REG (smode, GET_SSE_REGNO (i)));
}
- }
- return 0;
+ emit_label (label);
+ }
}
-/* Return true if EAX is live at the start of the function. Used by
- ix86_expand_prologue to determine if we need special help before
- calling allocate_stack_worker. */
-
-static bool
-ix86_eax_live_at_start_p (void)
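+/* Likewise for the 64-bit MS ABI: store the remaining integer argument
+ registers into their home slots in the incoming argument area. */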
+static void
+setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
{
- /* Cheat. Don't bother working forward from ix86_function_regparm
- to the function type to whether an actual argument is located in
- eax. Instead just look at cfg info, which is still close enough
- to correct at this point. This gives false positives for broken
- functions that might use uninitialized data that happens to be
- allocated in eax, but who cares? */
- return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
-}
+ alias_set_type set = get_varargs_alias_set ();
+ int i;
-static bool
-ix86_keep_aggregate_return_pointer (tree fntype)
-{
- tree attr;
+ /* Reset to zero, as a SYSV va_arg may have been used
+ before. */
+ ix86_varargs_gpr_size = 0;
+ ix86_varargs_fpr_size = 0;
- if (!TARGET_64BIT)
+ for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
{
- attr = lookup_attribute ("callee_pop_aggregate_return",
- TYPE_ATTRIBUTES (fntype));
- if (attr)
- return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
+ rtx reg, mem;
- /* For 32-bit MS-ABI the default is to keep aggregate
- return pointer. */
- if (ix86_function_type_abi (fntype) == MS_ABI)
- return true;
+ mem = gen_rtx_MEM (Pmode,
+ plus_constant (Pmode, virtual_incoming_args_rtx,
+ i * UNITS_PER_WORD));
+ MEM_NOTRAP_P (mem) = 1;
+ set_mem_alias_set (mem, set);
+
+ reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
+ emit_move_insn (mem, reg);
}
- return KEEP_AGGREGATE_RETURN_POINTER != 0;
}
-/* Value is the number of bytes of arguments automatically
- popped when returning from a subroutine call.
- FUNDECL is the declaration node of the function (as a tree),
- FUNTYPE is the data type of the function (as a tree),
- or for a library call it is an identifier node for the subroutine name.
- SIZE is the number of bytes of arguments passed on the stack.
-
- On the 80386, the RTD insn may be used to pop them if the number
- of args is fixed, but if the number is variable then the caller
- must pop them all. RTD can't be used for library calls now
- because the library is compiled with the Unix compiler.
- Use of RTD is a selectable option, since it is incompatible with
- standard Unix calling sequences. If the option is not selected,
- the caller must always pop the args.
-
- The attribute stdcall is equivalent to RTD on a per module basis. */
-
-static poly_int64
-ix86_return_pops_args (tree fundecl, tree funtype, poly_int64 size)
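+/* Set up the register save area for a varargs function, dispatching to
+ the MS or SYSV worker above; only 64-bit targets need this. */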
+static void
+ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
+ tree type, int *, int no_rtl)
{
- unsigned int ccvt;
+ CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
+ CUMULATIVE_ARGS next_cum;
+ tree fntype;
- /* None of the 64-bit ABIs pop arguments. */
- if (TARGET_64BIT)
- return 0;
+ /* This argument doesn't appear to be used anymore. Which is good,
+ because the old code here didn't suppress rtl generation. */
+ gcc_assert (!no_rtl);
- ccvt = ix86_get_callcvt (funtype);
+ if (!TARGET_64BIT)
+ return;
- if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
- | IX86_CALLCVT_THISCALL)) != 0
- && ! stdarg_p (funtype))
- return size;
+ fntype = TREE_TYPE (current_function_decl);
- /* Lose any fake structure return argument if it is passed on the stack. */
- if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
- && !ix86_keep_aggregate_return_pointer (funtype))
- {
- int nregs = ix86_function_regparm (funtype, fundecl);
- if (nregs == 0)
- return GET_MODE_SIZE (Pmode);
- }
+ /* For varargs, we do not want to skip the dummy va_dcl argument.
+ For stdargs, we do want to skip the last named argument. */
+ next_cum = *cum;
+ if (stdarg_p (fntype))
+ ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
+ true);
- return 0;
+ if (cum->call_abi == MS_ABI)
+ setup_incoming_varargs_ms_64 (&next_cum);
+ else
+ setup_incoming_varargs_64 (&next_cum);
}
-/* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
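+/* Companion of ix86_setup_incoming_varargs for argument bounds; it only
+ computes the GPR save-area bound and emits no RTL. */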
+static void
+ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
+ machine_mode mode,
+ tree type,
+ int *pretend_size ATTRIBUTE_UNUSED,
+ int no_rtl)
+{
+ CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
+ CUMULATIVE_ARGS next_cum;
+ tree fntype;
+ int max;
+
+ gcc_assert (!no_rtl);
+
+ /* Do nothing if we use plain pointer to argument area. */
+ if (!TARGET_64BIT || cum->call_abi == MS_ABI)
+ return;
+
+ fntype = TREE_TYPE (current_function_decl);
+
+ /* For varargs, we do not want to skip the dummy va_dcl argument.
+ For stdargs, we do want to skip the last named argument. */
+ next_cum = *cum;
+ if (stdarg_p (fntype))
+ ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
+ true);
+
+ max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
+ if (max > X86_64_REGPARM_MAX)
+ max = X86_64_REGPARM_MAX;
+}
+
+
+/* Return true if TYPE is a va_list that is represented as a plain char pointer. */
static bool
-ix86_legitimate_combined_insn (rtx_insn *insn)
+is_va_list_char_pointer (tree type)
{
- int i;
+ tree canonic;
- /* Check operand constraints in case hard registers were propagated
- into insn pattern. This check prevents combine pass from
- generating insn patterns with invalid hard register operands.
- These invalid insns can eventually confuse reload to error out
- with a spill failure. See also PRs 46829 and 46843. */
+ /* For 32-bit it is always true. */
+ if (!TARGET_64BIT)
+ return true;
+ canonic = ix86_canonical_va_list_type (type);
+ return (canonic == ms_va_list_type_node
+ || (ix86_abi == MS_ABI && canonic == va_list_type_node));
+}
- gcc_assert (INSN_CODE (insn) >= 0);
+/* Implement va_start. */
- extract_insn (insn);
- preprocess_constraints (insn);
+static void
+ix86_va_start (tree valist, rtx nextarg)
+{
+ HOST_WIDE_INT words, n_gpr, n_fpr;
+ tree f_gpr, f_fpr, f_ovf, f_sav;
+ tree gpr, fpr, ovf, sav, t;
+ tree type;
+ rtx ovf_rtx;
- int n_operands = recog_data.n_operands;
- int n_alternatives = recog_data.n_alternatives;
- for (i = 0; i < n_operands; i++)
+ if (flag_split_stack
+ && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
{
- rtx op = recog_data.operand[i];
- machine_mode mode = GET_MODE (op);
- const operand_alternative *op_alt;
- int offset = 0;
- bool win;
- int j;
+ unsigned int scratch_regno;
- /* A unary operator may be accepted by the predicate, but it
- is irrelevant for matching constraints. */
- if (UNARY_P (op))
- op = XEXP (op, 0);
+ /* When we are splitting the stack, we can't refer to the stack
+ arguments using internal_arg_pointer, because they may be on
+ the old stack. The split stack prologue will arrange to
+ leave a pointer to the old stack arguments in a scratch
+ register, which we here copy to a pseudo-register. The split
+ stack prologue can't set the pseudo-register directly because
+ it (the prologue) runs before any registers have been saved. */
- if (SUBREG_P (op))
+ scratch_regno = split_stack_prologue_scratch_regno ();
+ if (scratch_regno != INVALID_REGNUM)
{
- if (REG_P (SUBREG_REG (op))
- && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
- offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
- GET_MODE (SUBREG_REG (op)),
- SUBREG_BYTE (op),
- GET_MODE (op));
- op = SUBREG_REG (op);
- }
+ rtx reg;
+ rtx_insn *seq;
- if (!(REG_P (op) && HARD_REGISTER_P (op)))
- continue;
+ reg = gen_reg_rtx (Pmode);
+ cfun->machine->split_stack_varargs_pointer = reg;
- op_alt = recog_op_alt;
+ start_sequence ();
+ emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
+ seq = get_insns ();
+ end_sequence ();
- /* Operand has no constraints, anything is OK. */
- win = !n_alternatives;
+ push_topmost_sequence ();
+ emit_insn_after (seq, entry_of_function ());
+ pop_topmost_sequence ();
+ }
+ }
- alternative_mask preferred = get_preferred_alternatives (insn);
- for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
+ /* Only the 64-bit target needs something special. */
+ if (is_va_list_char_pointer (TREE_TYPE (valist)))
+ {
+ if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
+ std_expand_builtin_va_start (valist, nextarg);
+ else
{
- if (!TEST_BIT (preferred, j))
- continue;
- if (op_alt[i].anything_ok
- || (op_alt[i].matches != -1
- && operands_match_p
- (recog_data.operand[i],
- recog_data.operand[op_alt[i].matches]))
- || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
- {
- win = true;
- break;
- }
- }
+ rtx va_r, next;
- if (!win)
- return false;
+ va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
+ next = expand_binop (ptr_mode, add_optab,
+ cfun->machine->split_stack_varargs_pointer,
+ crtl->args.arg_offset_rtx,
+ NULL_RTX, 0, OPTAB_LIB_WIDEN);
+ convert_move (va_r, next, 0);
+ }
+ return;
}
- return true;
-}
-\f
-/* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
+ f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
+ f_fpr = DECL_CHAIN (f_gpr);
+ f_ovf = DECL_CHAIN (f_fpr);
+ f_sav = DECL_CHAIN (f_ovf);
-static unsigned HOST_WIDE_INT
-ix86_asan_shadow_offset (void)
-{
- return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
- : HOST_WIDE_INT_C (0x7fff8000))
- : (HOST_WIDE_INT_1 << 29);
-}
-\f
-/* Argument support functions. */
+ valist = build_simple_mem_ref (valist);
+ TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
+ /* The following should be folded into the MEM_REF offset. */
+ gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
+ f_gpr, NULL_TREE);
+ fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
+ f_fpr, NULL_TREE);
+ ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
+ f_ovf, NULL_TREE);
+ sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
+ f_sav, NULL_TREE);
-/* Return true when register may be used to pass function parameters. */
-bool
-ix86_function_arg_regno_p (int regno)
-{
- int i;
- enum calling_abi call_abi;
- const int *parm_regs;
+ /* Count number of gp and fp argument registers used. */
+ words = crtl->args.info.words;
+ n_gpr = crtl->args.info.regno;
+ n_fpr = crtl->args.info.sse_regno;
- if (!TARGET_64BIT)
+ if (cfun->va_list_gpr_size)
{
- if (TARGET_MACHO)
- return (regno < REGPARM_MAX
- || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
- else
- return (regno < REGPARM_MAX
- || (TARGET_MMX && MMX_REGNO_P (regno)
- && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
- || (TARGET_SSE && SSE_REGNO_P (regno)
- && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
+ type = TREE_TYPE (gpr);
+ t = build2 (MODIFY_EXPR, type,
+ gpr, build_int_cst (type, n_gpr * 8));
+ TREE_SIDE_EFFECTS (t) = 1;
+ expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
}
- if (TARGET_SSE && SSE_REGNO_P (regno)
- && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
- return true;
-
- /* TODO: The function should depend on current function ABI but
- builtins.c would need updating then. Therefore we use the
- default ABI. */
- call_abi = ix86_cfun_abi ();
-
- /* RAX is used as hidden argument to va_arg functions. */
- if (call_abi == SYSV_ABI && regno == AX_REG)
- return true;
+ if (TARGET_SSE && cfun->va_list_fpr_size)
+ {
+ type = TREE_TYPE (fpr);
+ t = build2 (MODIFY_EXPR, type, fpr,
+ build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
+ TREE_SIDE_EFFECTS (t) = 1;
+ expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
+ }
- if (call_abi == MS_ABI)
- parm_regs = x86_64_ms_abi_int_parameter_registers;
+ /* Find the overflow area. */
+ type = TREE_TYPE (ovf);
+ if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
+ ovf_rtx = crtl->args.internal_arg_pointer;
else
- parm_regs = x86_64_int_parameter_registers;
+ ovf_rtx = cfun->machine->split_stack_varargs_pointer;
+ t = make_tree (type, ovf_rtx);
+ if (words != 0)
+ t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
- for (i = 0; i < (call_abi == MS_ABI
- ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
- if (regno == parm_regs[i])
- return true;
- return false;
-}
-
-/* Return if we do not know how to pass TYPE solely in registers. */
+ t = build2 (MODIFY_EXPR, type, ovf, t);
+ TREE_SIDE_EFFECTS (t) = 1;
+ expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
-static bool
-ix86_must_pass_in_stack (machine_mode mode, const_tree type)
-{
- if (must_pass_in_stack_var_size_or_pad (mode, type))
- return true;
+ if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
+ {
+ /* Find the register save area.
+ The function prologue saves it right above the stack frame. */
+ type = TREE_TYPE (sav);
+ t = make_tree (type, frame_pointer_rtx);
+ if (!ix86_varargs_gpr_size)
+ t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
- /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
- The layout_type routine is crafty and tries to trick us into passing
- currently unsupported vector types on the stack by using TImode. */
- return (!TARGET_64BIT && mode == TImode
- && type && TREE_CODE (type) != VECTOR_TYPE);
+ t = build2 (MODIFY_EXPR, type, sav, t);
+ TREE_SIDE_EFFECTS (t) = 1;
+ expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
+ }
}
-/* It returns the size, in bytes, of the area reserved for arguments passed
- in registers for the function represented by fndecl dependent to the used
- abi format. */
-int
-ix86_reg_parm_stack_space (const_tree fndecl)
-{
- enum calling_abi call_abi = SYSV_ABI;
- if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
- call_abi = ix86_function_abi (fndecl);
- else
- call_abi = ix86_function_type_abi (fndecl);
- if (TARGET_64BIT && call_abi == MS_ABI)
- return 32;
- return 0;
-}
+/* Implement va_arg. */
-/* We add this as a workaround in order to use libc_has_function
- hook in i386.md. */
-bool
-ix86_libc_has_function (enum function_class fn_class)
+static tree
+ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
+ gimple_seq *post_p)
{
- return targetm.libc_has_function (fn_class);
-}
+ static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
+ tree f_gpr, f_fpr, f_ovf, f_sav;
+ tree gpr, fpr, ovf, sav, t;
+ int size, rsize;
+ tree lab_false, lab_over = NULL_TREE;
+ tree addr, t2;
+ rtx container;
+ int indirect_p = 0;
+ tree ptrtype;
+ machine_mode nat_mode;
+ unsigned int arg_boundary;
-/* Returns value SYSV_ABI, MS_ABI dependent on fntype,
- specifying the call abi used. */
-enum calling_abi
-ix86_function_type_abi (const_tree fntype)
-{
- enum calling_abi abi = ix86_abi;
+ /* Only the 64-bit target needs something special. */
+ if (is_va_list_char_pointer (TREE_TYPE (valist)))
+ return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
- if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
- return abi;
+ f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
+ f_fpr = DECL_CHAIN (f_gpr);
+ f_ovf = DECL_CHAIN (f_fpr);
+ f_sav = DECL_CHAIN (f_ovf);
- if (abi == SYSV_ABI
- && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
+ gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
+ valist, f_gpr, NULL_TREE);
+
+ fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
+ ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
+ sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
+
+ indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
+ if (indirect_p)
+ type = build_pointer_type (type);
+ size = arg_int_size_in_bytes (type);
+ rsize = CEIL (size, UNITS_PER_WORD);
+
+ nat_mode = type_natural_mode (type, NULL, false);
+ switch (nat_mode)
{
- static int warned;
- if (TARGET_X32 && !warned)
+ case E_V8SFmode:
+ case E_V8SImode:
+ case E_V32QImode:
+ case E_V16HImode:
+ case E_V4DFmode:
+ case E_V4DImode:
+ case E_V16SFmode:
+ case E_V16SImode:
+ case E_V64QImode:
+ case E_V32HImode:
+ case E_V8DFmode:
+ case E_V8DImode:
+ /* Unnamed 256-bit and 512-bit vector mode parameters are passed on the stack. */
+ if (!TARGET_64BIT_MS_ABI)
{
- error ("X32 does not support ms_abi attribute");
- warned = 1;
+ container = NULL;
+ break;
}
+ /* FALLTHRU */
- abi = MS_ABI;
+ default:
+ container = construct_container (nat_mode, TYPE_MODE (type),
+ type, 0, X86_64_REGPARM_MAX,
+ X86_64_SSE_REGPARM_MAX, intreg,
+ 0);
+ break;
}
- else if (abi == MS_ABI
- && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
- abi = SYSV_ABI;
-
- return abi;
-}
-static enum calling_abi
-ix86_function_abi (const_tree fndecl)
-{
- return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
-}
+ /* Pull the value out of the saved registers. */
-/* Returns value SYSV_ABI, MS_ABI dependent on cfun,
- specifying the call abi used. */
-enum calling_abi
-ix86_cfun_abi (void)
-{
- return cfun ? cfun->machine->call_abi : ix86_abi;
-}
+ addr = create_tmp_var (ptr_type_node, "addr");
-static bool
-ix86_function_ms_hook_prologue (const_tree fn)
-{
- if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
+ if (container)
{
- if (decl_function_context (fn) != NULL_TREE)
- error_at (DECL_SOURCE_LOCATION (fn),
- "ms_hook_prologue is not compatible with nested function");
- else
- return true;
- }
- return false;
-}
-
-static bool
-ix86_function_naked (const_tree fn)
-{
- if (fn && lookup_attribute ("naked", DECL_ATTRIBUTES (fn)))
- return true;
-
- return false;
-}
-
-/* Write the extra assembler code needed to declare a function properly. */
+ int needed_intregs, needed_sseregs;
+ bool need_temp;
+ tree int_addr, sse_addr;
-void
-ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
- tree decl)
-{
- bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
+ lab_false = create_artificial_label (UNKNOWN_LOCATION);
+ lab_over = create_artificial_label (UNKNOWN_LOCATION);
- if (is_ms_hook)
- {
- int i, filler_count = (TARGET_64BIT ? 32 : 16);
- unsigned int filler_cc = 0xcccccccc;
+ examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
- for (i = 0; i < filler_count; i += 4)
- fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
- }
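+ /* Decide whether the value has to be assembled in a stack temporary:
+ this is needed when CONTAINER is not a single register and the type's
+ alignment exceeds what the register save area slots guarantee. */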
+ need_temp = (!REG_P (container)
+ && ((needed_intregs && TYPE_ALIGN (type) > 64)
+ || TYPE_ALIGN (type) > 128));
-#ifdef SUBTARGET_ASM_UNWIND_INIT
- SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
-#endif
+ /* In case we are passing a structure, verify that it is a consecutive
+ block in the register save area. If not, we need to do moves. */
+ if (!need_temp && !REG_P (container))
+ {
+ /* Verify that all registers are strictly consecutive. */
+ if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
+ {
+ int i;
- ASM_OUTPUT_LABEL (asm_out_file, fname);
+ for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
+ {
+ rtx slot = XVECEXP (container, 0, i);
+ if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
+ || INTVAL (XEXP (slot, 1)) != i * 16)
+ need_temp = true;
+ }
+ }
+ else
+ {
+ int i;
- /* Output magic byte marker, if hot-patch attribute is set. */
- if (is_ms_hook)
- {
- if (TARGET_64BIT)
+ for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
+ {
+ rtx slot = XVECEXP (container, 0, i);
+ if (REGNO (XEXP (slot, 0)) != (unsigned int) i
+ || INTVAL (XEXP (slot, 1)) != i * 8)
+ need_temp = true;
+ }
+ }
+ }
+ if (!need_temp)
{
- /* leaq [%rsp + 0], %rsp */
- fputs (ASM_BYTE "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n",
- asm_out_file);
+ int_addr = addr;
+ sse_addr = addr;
}
else
{
- /* movl.s %edi, %edi
- push %ebp
- movl.s %esp, %ebp */
- fputs (ASM_BYTE "0x8b, 0xff, 0x55, 0x8b, 0xec\n", asm_out_file);
+ int_addr = create_tmp_var (ptr_type_node, "int_addr");
+ sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
}
- }
-}
-
-/* Implementation of call abi switching target hook. Specific to FNDECL
- the specific call register sets are set. See also
- ix86_conditional_register_usage for more details. */
-void
-ix86_call_abi_override (const_tree fndecl)
-{
- cfun->machine->call_abi = ix86_function_abi (fndecl);
-}
-/* Return 1 if pseudo register should be created and used to hold
- GOT address for PIC code. */
-bool
-ix86_use_pseudo_pic_reg (void)
-{
- if ((TARGET_64BIT
- && (ix86_cmodel == CM_SMALL_PIC
- || TARGET_PECOFF))
- || !flag_pic)
- return false;
- return true;
-}
+ /* First ensure that we fit completely in registers. */
+ if (needed_intregs)
+ {
+ t = build_int_cst (TREE_TYPE (gpr),
+ (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
+ t = build2 (GE_EXPR, boolean_type_node, gpr, t);
+ t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
+ t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
+ gimplify_and_add (t, pre_p);
+ }
+ if (needed_sseregs)
+ {
+ t = build_int_cst (TREE_TYPE (fpr),
+ (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
+ + X86_64_REGPARM_MAX * 8);
+ t = build2 (GE_EXPR, boolean_type_node, fpr, t);
+ t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
+ t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
+ gimplify_and_add (t, pre_p);
+ }
-/* Initialize large model PIC register. */
+ /* Compute index to start of area used for integer regs. */
+ if (needed_intregs)
+ {
+ /* int_addr = gpr + sav; */
+ t = fold_build_pointer_plus (sav, gpr);
+ gimplify_assign (int_addr, t, pre_p);
+ }
+ if (needed_sseregs)
+ {
+ /* sse_addr = fpr + sav; */
+ t = fold_build_pointer_plus (sav, fpr);
+ gimplify_assign (sse_addr, t, pre_p);
+ }
+ if (need_temp)
+ {
+ int i, prev_size = 0;
+ tree temp = create_tmp_var (type, "va_arg_tmp");
-static void
-ix86_init_large_pic_reg (unsigned int tmp_regno)
-{
- rtx_code_label *label;
- rtx tmp_reg;
+ /* addr = &temp; */
+ t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
+ gimplify_assign (addr, t, pre_p);
- gcc_assert (Pmode == DImode);
- label = gen_label_rtx ();
- emit_label (label);
- LABEL_PRESERVE_P (label) = 1;
- tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
- gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
- emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
- label));
- emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
- emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
- pic_offset_table_rtx, tmp_reg));
- const char *name = LABEL_NAME (label);
- PUT_CODE (label, NOTE);
- NOTE_KIND (label) = NOTE_INSN_DELETED_LABEL;
- NOTE_DELETED_LABEL_NAME (label) = name;
-}
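+ /* Copy the value piece by piece from the integer and SSE save areas
+ into the temporary, using memcpy for a trailing partial piece. */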
+ for (i = 0; i < XVECLEN (container, 0); i++)
+ {
+ rtx slot = XVECEXP (container, 0, i);
+ rtx reg = XEXP (slot, 0);
+ machine_mode mode = GET_MODE (reg);
+ tree piece_type;
+ tree addr_type;
+ tree daddr_type;
+ tree src_addr, src;
+ int src_offset;
+ tree dest_addr, dest;
+ int cur_size = GET_MODE_SIZE (mode);
-/* Create and initialize PIC register if required. */
-static void
-ix86_init_pic_reg (void)
-{
- edge entry_edge;
- rtx_insn *seq;
+ gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
+ prev_size = INTVAL (XEXP (slot, 1));
+ if (prev_size + cur_size > size)
+ {
+ cur_size = size - prev_size;
+ unsigned int nbits = cur_size * BITS_PER_UNIT;
+ if (!int_mode_for_size (nbits, 1).exists (&mode))
+ mode = QImode;
+ }
+ piece_type = lang_hooks.types.type_for_mode (mode, 1);
+ if (mode == GET_MODE (reg))
+ addr_type = build_pointer_type (piece_type);
+ else
+ addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
+ true);
+ daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
+ true);
- if (!ix86_use_pseudo_pic_reg ())
- return;
+ if (SSE_REGNO_P (REGNO (reg)))
+ {
+ src_addr = sse_addr;
+ src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
+ }
+ else
+ {
+ src_addr = int_addr;
+ src_offset = REGNO (reg) * 8;
+ }
+ src_addr = fold_convert (addr_type, src_addr);
+ src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
- start_sequence ();
+ dest_addr = fold_convert (daddr_type, addr);
+ dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
+ if (cur_size == GET_MODE_SIZE (mode))
+ {
+ src = build_va_arg_indirect_ref (src_addr);
+ dest = build_va_arg_indirect_ref (dest_addr);
- if (TARGET_64BIT)
- {
- if (ix86_cmodel == CM_LARGE_PIC)
- ix86_init_large_pic_reg (R11_REG);
- else
- emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
+ gimplify_assign (dest, src, pre_p);
+ }
+ else
+ {
+ tree copy
+ = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
+ 3, dest_addr, src_addr,
+ size_int (cur_size));
+ gimplify_and_add (copy, pre_p);
+ }
+ prev_size += cur_size;
+ }
+ }
+
+ if (needed_intregs)
+ {
+ t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
+ build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
+ gimplify_assign (gpr, t, pre_p);
+ }
+
+ if (needed_sseregs)
+ {
+ t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
+ build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
+ gimplify_assign (unshare_expr (fpr), t, pre_p);
+ }
+
+ gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
+
+ gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
}
- else
+
+ /* ... otherwise out of the overflow area. */
+
+ /* When the caller aligns a parameter on the stack, any alignment
+ beyond MAX_SUPPORTED_STACK_ALIGNMENT is clamped to
+ MAX_SUPPORTED_STACK_ALIGNMENT. Match the caller's behaviour here
+ in the callee. */
+ arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
+ if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
+ arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
+
+ /* Care for on-stack alignment if needed. */
+ if (arg_boundary <= 64 || size == 0)
+ t = ovf;
+ else
{
- /* If there is future mcount call in the function it is more profitable
- to emit SET_GOT into ABI defined REAL_PIC_OFFSET_TABLE_REGNUM. */
- rtx reg = crtl->profile
- ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
- : pic_offset_table_rtx;
- rtx_insn *insn = emit_insn (gen_set_got (reg));
- RTX_FRAME_RELATED_P (insn) = 1;
- if (crtl->profile)
- emit_move_insn (pic_offset_table_rtx, reg);
- add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
+ HOST_WIDE_INT align = arg_boundary / 8;
+ t = fold_build_pointer_plus_hwi (ovf, align - 1);
+ t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
+ build_int_cst (TREE_TYPE (t), -align));
}
- seq = get_insns ();
- end_sequence ();
+ gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
+ gimplify_assign (addr, t, pre_p);
- entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
- insert_insn_on_edge (seq, entry_edge);
- commit_one_edge_insertion (entry_edge);
-}
+ t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
+ gimplify_assign (unshare_expr (ovf), t, pre_p);
-/* Initialize a variable CUM of type CUMULATIVE_ARGS
- for a call to a function whose data type is FNTYPE.
- For a library call, FNTYPE is 0. */
+ if (container)
+ gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
-void
-init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
- tree fntype, /* tree ptr for function decl */
- rtx libname, /* SYMBOL_REF of library name or 0 */
- tree fndecl,
- int caller)
+ ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
+ addr = fold_convert (ptrtype, addr);
+
+ if (indirect_p)
+ addr = build_va_arg_indirect_ref (addr);
+ return build_va_arg_indirect_ref (addr);
+}
+\f
+/* Return true if OPNUM's MEM should be matched
+ in movabs* patterns. */
+
+bool
+ix86_check_movabs (rtx insn, int opnum)
{
- struct cgraph_local_info *i = NULL;
- struct cgraph_node *target = NULL;
+ rtx set, mem;
- memset (cum, 0, sizeof (*cum));
+ set = PATTERN (insn);
+ if (GET_CODE (set) == PARALLEL)
+ set = XVECEXP (set, 0, 0);
+ gcc_assert (GET_CODE (set) == SET);
+ mem = XEXP (set, opnum);
+ while (SUBREG_P (mem))
+ mem = SUBREG_REG (mem);
+ gcc_assert (MEM_P (mem));
+ return volatile_ok || !MEM_VOLATILE_P (mem);
+}
- if (fndecl)
+/* Return false if INSN contains a MEM with a non-default address space. */
+bool
+ix86_check_no_addr_space (rtx insn)
+{
+ subrtx_var_iterator::array_type array;
+ FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
{
- target = cgraph_node::get (fndecl);
- if (target)
- {
- target = target->function_symbol ();
- i = cgraph_node::local_info (target->decl);
- cum->call_abi = ix86_function_abi (target->decl);
- }
- else
- cum->call_abi = ix86_function_abi (fndecl);
+ rtx x = *iter;
+ if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
+ return false;
}
- else
- cum->call_abi = ix86_function_type_abi (fntype);
+ return true;
+}
+\f
+/* Initialize the table of extra 80387 mathematical constants. */
- cum->caller = caller;
+static void
+init_ext_80387_constants (void)
+{
+ static const char * cst[5] =
+ {
+ "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
+ "0.6931471805599453094286904741849753009", /* 1: fldln2 */
+ "1.4426950408889634073876517827983434472", /* 2: fldl2e */
+ "3.3219280948873623478083405569094566090", /* 3: fldl2t */
+ "3.1415926535897932385128089594061862044", /* 4: fldpi */
+ };
+ int i;
- /* Set up the number of registers to use for passing arguments. */
- cum->nregs = ix86_regparm;
- if (TARGET_64BIT)
- {
- cum->nregs = (cum->call_abi == SYSV_ABI
- ? X86_64_REGPARM_MAX
- : X86_64_MS_REGPARM_MAX);
- }
- if (TARGET_SSE)
+ for (i = 0; i < 5; i++)
{
- cum->sse_nregs = SSE_REGPARM_MAX;
- if (TARGET_64BIT)
- {
- cum->sse_nregs = (cum->call_abi == SYSV_ABI
- ? X86_64_SSE_REGPARM_MAX
- : X86_64_MS_SSE_REGPARM_MAX);
- }
+ real_from_string (&ext_80387_constants_table[i], cst[i]);
+ /* Ensure each constant is rounded to XFmode precision. */
+ real_convert (&ext_80387_constants_table[i],
+ XFmode, &ext_80387_constants_table[i]);
}
- if (TARGET_MMX)
- cum->mmx_nregs = MMX_REGPARM_MAX;
- cum->warn_avx512f = true;
- cum->warn_avx = true;
- cum->warn_sse = true;
- cum->warn_mmx = true;
- /* Because type might mismatch in between caller and callee, we need to
- use actual type of function for local calls.
- FIXME: cgraph_analyze can be told to actually record if function uses
- va_start so for local functions maybe_vaarg can be made aggressive
- helping K&R code.
- FIXME: once typesytem is fixed, we won't need this code anymore. */
- if (i && i->local && i->can_change_signature)
- fntype = TREE_TYPE (target->decl);
- cum->stdarg = stdarg_p (fntype);
- cum->maybe_vaarg = (fntype
- ? (!prototype_p (fntype) || stdarg_p (fntype))
- : !libname);
+ ext_80387_constants_init = 1;
+}
- cum->decl = fndecl;
+/* Return non-zero if the constant is something that
+ can be loaded with a special instruction. */
- cum->warn_empty = !warn_abi || cum->stdarg;
- if (!cum->warn_empty && fntype)
- {
- function_args_iterator iter;
- tree argtype;
- bool seen_empty_type = false;
- FOREACH_FUNCTION_ARGS (fntype, argtype, iter)
- {
- if (argtype == error_mark_node || VOID_TYPE_P (argtype))
- break;
- if (TYPE_EMPTY_P (argtype))
- seen_empty_type = true;
- else if (seen_empty_type)
- {
- cum->warn_empty = true;
- break;
- }
- }
- }
+int
+standard_80387_constant_p (rtx x)
+{
+ machine_mode mode = GET_MODE (x);
- if (!TARGET_64BIT)
+ const REAL_VALUE_TYPE *r;
+
+ if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
+ return -1;
+
+ if (x == CONST0_RTX (mode))
+ return 1;
+ if (x == CONST1_RTX (mode))
+ return 2;
+
+ r = CONST_DOUBLE_REAL_VALUE (x);
+
+ /* For XFmode constants, try to find a special 80387 instruction when
+ optimizing for size or on those CPUs that benefit from them. */
+ if (mode == XFmode
+ && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
{
- /* If there are variable arguments, then we won't pass anything
- in registers in 32-bit mode. */
- if (stdarg_p (fntype))
- {
- cum->nregs = 0;
- /* Since in 32-bit, variable arguments are always passed on
- stack, there is scratch register available for indirect
- sibcall. */
- cfun->machine->arg_reg_available = true;
- cum->sse_nregs = 0;
- cum->mmx_nregs = 0;
- cum->warn_avx512f = false;
- cum->warn_avx = false;
- cum->warn_sse = false;
- cum->warn_mmx = false;
- return;
- }
+ int i;
- /* Use ecx and edx registers if function has fastcall attribute,
- else look for regparm information. */
- if (fntype)
- {
- unsigned int ccvt = ix86_get_callcvt (fntype);
- if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
- {
- cum->nregs = 1;
- cum->fastcall = 1; /* Same first register as in fastcall. */
- }
- else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
- {
- cum->nregs = 2;
- cum->fastcall = 1;
- }
- else
- cum->nregs = ix86_function_regparm (fntype, fndecl);
- }
+ if (! ext_80387_constants_init)
+ init_ext_80387_constants ();
- /* Set up the number of SSE registers used for passing SFmode
- and DFmode arguments. Warn for mismatching ABI. */
- cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
+ for (i = 0; i < 5; i++)
+ if (real_identical (r, &ext_80387_constants_table[i]))
+ return i + 3;
}
- cfun->machine->arg_reg_available = (cum->nregs > 0);
-}
+ /* Load of the constant -0.0 or -1.0 will be split as
+ fldz;fchs or fld1;fchs sequence. */
+ if (real_isnegzero (r))
+ return 8;
+ if (real_identical (r, &dconstm1))
+ return 9;
-/* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
- But in the case of vector types, it is some vector mode.
+ return 0;
+}
- When we have only some of our vector isa extensions enabled, then there
- are some modes for which vector_mode_supported_p is false. For these
- modes, the generic vector support in gcc will choose some non-vector mode
- in order to implement the type. By computing the natural mode, we'll
- select the proper ABI location for the operand and not depend on whatever
- the middle-end decides to do with these vector types.
+/* Return the opcode of the special instruction to be used to load
+ the constant X. */
- The midde-end can't deal with the vector types > 16 bytes. In this
- case, we return the original mode and warn ABI change if CUM isn't
- NULL.
+const char *
+standard_80387_constant_opcode (rtx x)
+{
+ switch (standard_80387_constant_p (x))
+ {
+ case 1:
+ return "fldz";
+ case 2:
+ return "fld1";
+ case 3:
+ return "fldlg2";
+ case 4:
+ return "fldln2";
+ case 5:
+ return "fldl2e";
+ case 6:
+ return "fldl2t";
+ case 7:
+ return "fldpi";
+ case 8:
+ case 9:
+ return "#";
+ default:
+ gcc_unreachable ();
+ }
+}
- If INT_RETURN is true, warn ABI change if the vector mode isn't
- available for function return value. */
+/* Return the CONST_DOUBLE representing the 80387 constant that is
+ loaded by the specified special instruction. The argument IDX
+ matches the return value from standard_80387_constant_p. */
-static machine_mode
-type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
- bool in_return)
+rtx
+standard_80387_constant_rtx (int idx)
{
- machine_mode mode = TYPE_MODE (type);
+ int i;
- if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
+ if (! ext_80387_constants_init)
+ init_ext_80387_constants ();
+
+ switch (idx)
{
- HOST_WIDE_INT size = int_size_in_bytes (type);
- if ((size == 8 || size == 16 || size == 32 || size == 64)
- /* ??? Generic code allows us to create width 1 vectors. Ignore. */
- && TYPE_VECTOR_SUBPARTS (type) > 1)
- {
- machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
+ case 3:
+ case 4:
+ case 5:
+ case 6:
+ case 7:
+ i = idx - 3;
+ break;
- /* There are no XFmode vector modes. */
- if (innermode == XFmode)
- return mode;
+ default:
+ gcc_unreachable ();
+ }
- if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
- mode = MIN_MODE_VECTOR_FLOAT;
- else
- mode = MIN_MODE_VECTOR_INT;
+ return const_double_from_real_value (ext_80387_constants_table[i],
+ XFmode);
+}
- /* Get the mode which has this inner mode and number of units. */
- FOR_EACH_MODE_FROM (mode, mode)
- if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
- && GET_MODE_INNER (mode) == innermode)
- {
- if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
- {
- static bool warnedavx512f;
- static bool warnedavx512f_ret;
+/* Return 1 if X is all bits 0 and 2 if X is all bits 1
+ in a supported SSE/AVX vector mode; return 0 otherwise. */
- if (cum && cum->warn_avx512f && !warnedavx512f)
- {
- if (warning (OPT_Wpsabi, "AVX512F vector argument "
- "without AVX512F enabled changes the ABI"))
- warnedavx512f = true;
- }
- else if (in_return && !warnedavx512f_ret)
- {
- if (warning (OPT_Wpsabi, "AVX512F vector return "
- "without AVX512F enabled changes the ABI"))
- warnedavx512f_ret = true;
- }
-
- return TYPE_MODE (type);
- }
- else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
- {
- static bool warnedavx;
- static bool warnedavx_ret;
+int
+standard_sse_constant_p (rtx x, machine_mode pred_mode)
+{
+ machine_mode mode;
- if (cum && cum->warn_avx && !warnedavx)
- {
- if (warning (OPT_Wpsabi, "AVX vector argument "
- "without AVX enabled changes the ABI"))
- warnedavx = true;
- }
- else if (in_return && !warnedavx_ret)
- {
- if (warning (OPT_Wpsabi, "AVX vector return "
- "without AVX enabled changes the ABI"))
- warnedavx_ret = true;
- }
+ if (!TARGET_SSE)
+ return 0;
- return TYPE_MODE (type);
- }
- else if (((size == 8 && TARGET_64BIT) || size == 16)
- && !TARGET_SSE
- && !TARGET_IAMCU)
- {
- static bool warnedsse;
- static bool warnedsse_ret;
+ mode = GET_MODE (x);
- if (cum && cum->warn_sse && !warnedsse)
- {
- if (warning (OPT_Wpsabi, "SSE vector argument "
- "without SSE enabled changes the ABI"))
- warnedsse = true;
- }
- else if (!TARGET_64BIT && in_return && !warnedsse_ret)
- {
- if (warning (OPT_Wpsabi, "SSE vector return "
- "without SSE enabled changes the ABI"))
- warnedsse_ret = true;
- }
- }
- else if ((size == 8 && !TARGET_64BIT)
- && (!cfun
- || cfun->machine->func_type == TYPE_NORMAL)
- && !TARGET_MMX
- && !TARGET_IAMCU)
- {
- static bool warnedmmx;
- static bool warnedmmx_ret;
+ if (x == const0_rtx || const0_operand (x, mode))
+ return 1;
- if (cum && cum->warn_mmx && !warnedmmx)
- {
- if (warning (OPT_Wpsabi, "MMX vector argument "
- "without MMX enabled changes the ABI"))
- warnedmmx = true;
- }
- else if (in_return && !warnedmmx_ret)
- {
- if (warning (OPT_Wpsabi, "MMX vector return "
- "without MMX enabled changes the ABI"))
- warnedmmx_ret = true;
- }
- }
- return mode;
- }
+ if (x == constm1_rtx || vector_all_ones_operand (x, mode))
+ {
+ /* VOIDmode integer constant, get mode from the predicate. */
+ if (mode == VOIDmode)
+ mode = pred_mode;
+ switch (GET_MODE_SIZE (mode))
+ {
+ case 64:
+ if (TARGET_AVX512F)
+ return 2;
+ break;
+ case 32:
+ if (TARGET_AVX2)
+ return 2;
+ break;
+ case 16:
+ if (TARGET_SSE2)
+ return 2;
+ break;
+ case 0:
+ /* VOIDmode */
gcc_unreachable ();
+ default:
+ break;
}
}
- return mode;
+ return 0;
}
-/* We want to pass a value in REGNO whose "natural" mode is MODE. However,
- this may not agree with the mode that the type system has chosen for the
- register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
- go ahead and use it. Otherwise we have to build a PARALLEL instead. */
+/* Return the opcode of the special instruction to be used to load
+ the constant operands[1] into operands[0]. */
-static rtx
-gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
- unsigned int regno)
+const char *
+standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
{
- rtx tmp;
+ machine_mode mode;
+ rtx x = operands[1];
- if (orig_mode != BLKmode)
- tmp = gen_rtx_REG (orig_mode, regno);
- else
+ gcc_assert (TARGET_SSE);
+
+ mode = GET_MODE (x);
+
+ if (x == const0_rtx || const0_operand (x, mode))
{
- tmp = gen_rtx_REG (mode, regno);
- tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
- tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
+ switch (get_attr_mode (insn))
+ {
+ case MODE_TI:
+ if (!EXT_REX_SSE_REG_P (operands[0]))
+ return "%vpxor\t%0, %d0";
+ /* FALLTHRU */
+ case MODE_XI:
+ case MODE_OI:
+ if (EXT_REX_SSE_REG_P (operands[0]))
+ return (TARGET_AVX512VL
+ ? "vpxord\t%x0, %x0, %x0"
+ : "vpxord\t%g0, %g0, %g0");
+ return "vpxor\t%x0, %x0, %x0";
+
+ case MODE_V2DF:
+ if (!EXT_REX_SSE_REG_P (operands[0]))
+ return "%vxorpd\t%0, %d0";
+ /* FALLTHRU */
+ case MODE_V8DF:
+ case MODE_V4DF:
+ if (!EXT_REX_SSE_REG_P (operands[0]))
+ return "vxorpd\t%x0, %x0, %x0";
+ else if (TARGET_AVX512DQ)
+ return (TARGET_AVX512VL
+ ? "vxorpd\t%x0, %x0, %x0"
+ : "vxorpd\t%g0, %g0, %g0");
+ else
+ return (TARGET_AVX512VL
+ ? "vpxorq\t%x0, %x0, %x0"
+ : "vpxorq\t%g0, %g0, %g0");
+
+ case MODE_V4SF:
+ if (!EXT_REX_SSE_REG_P (operands[0]))
+ return "%vxorps\t%0, %d0";
+ /* FALLTHRU */
+ case MODE_V16SF:
+ case MODE_V8SF:
+ if (!EXT_REX_SSE_REG_P (operands[0]))
+ return "vxorps\t%x0, %x0, %x0";
+ else if (TARGET_AVX512DQ)
+ return (TARGET_AVX512VL
+ ? "vxorps\t%x0, %x0, %x0"
+ : "vxorps\t%g0, %g0, %g0");
+ else
+ return (TARGET_AVX512VL
+ ? "vpxord\t%x0, %x0, %x0"
+ : "vpxord\t%g0, %g0, %g0");
+
+ default:
+ gcc_unreachable ();
+ }
}
+ else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
+ {
+ enum attr_mode insn_mode = get_attr_mode (insn);
+
+ switch (insn_mode)
+ {
+ case MODE_XI:
+ case MODE_V8DF:
+ case MODE_V16SF:
+ gcc_assert (TARGET_AVX512F);
+ return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
- return tmp;
-}
+ case MODE_OI:
+ case MODE_V4DF:
+ case MODE_V8SF:
+ gcc_assert (TARGET_AVX2);
+ /* FALLTHRU */
+ case MODE_TI:
+ case MODE_V2DF:
+ case MODE_V4SF:
+ gcc_assert (TARGET_SSE2);
+ if (!EXT_REX_SSE_REG_P (operands[0]))
+ return (TARGET_AVX
+ ? "vpcmpeqd\t%0, %0, %0"
+ : "pcmpeqd\t%0, %0");
+ else if (TARGET_AVX512VL)
+ return "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}";
+ else
+ return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
-/* x86-64 register passing implementation. See x86-64 ABI for details. Goal
- of this code is to classify each 8bytes of incoming argument by the register
- class and assign registers accordingly. */
+ default:
+ gcc_unreachable ();
+ }
+ }
-/* Return the union class of CLASS1 and CLASS2.
- See the x86-64 PS ABI for details. */
+ gcc_unreachable ();
+}
-static enum x86_64_reg_class
-merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
+/* Returns true if INSN can be transformed from a memory load
+ to a supported FP constant load. */
+
+bool
+ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
{
- /* Rule #1: If both classes are equal, this is the resulting class. */
- if (class1 == class2)
- return class1;
+ rtx src = find_constant_src (insn);
- /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
- the other class. */
- if (class1 == X86_64_NO_CLASS)
- return class2;
- if (class2 == X86_64_NO_CLASS)
- return class1;
+ gcc_assert (REG_P (dst));
- /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
- if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
- return X86_64_MEMORY_CLASS;
+ if (src == NULL
+ || (SSE_REGNO_P (REGNO (dst))
+ && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
+ || (STACK_REGNO_P (REGNO (dst))
+ && standard_80387_constant_p (src) < 1))
+ return false;
- /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
- if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
- || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
- return X86_64_INTEGERSI_CLASS;
- if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
- || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
- return X86_64_INTEGER_CLASS;
+ return true;
+}
- /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
- MEMORY is used. */
- if (class1 == X86_64_X87_CLASS
- || class1 == X86_64_X87UP_CLASS
- || class1 == X86_64_COMPLEX_X87_CLASS
- || class2 == X86_64_X87_CLASS
- || class2 == X86_64_X87UP_CLASS
- || class2 == X86_64_COMPLEX_X87_CLASS)
- return X86_64_MEMORY_CLASS;
+/* Returns true if OP contains a symbol reference. */
- /* Rule #6: Otherwise class SSE is used. */
- return X86_64_SSE_CLASS;
-}
+bool
+symbolic_reference_mentioned_p (rtx op)
+{
+ const char *fmt;
+ int i;
-/* Classify the argument of type TYPE and mode MODE.
- CLASSES will be filled by the register class used to pass each word
- of the operand. The number of words is returned. In case the parameter
- should be passed in memory, 0 is returned. As a special case for zero
- sized containers, classes[0] will be NO_CLASS and 1 is returned.
+ if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
+ return true;
- BIT_OFFSET is used internally for handling records and specifies offset
- of the offset in bits modulo 512 to avoid overflow cases.
+ fmt = GET_RTX_FORMAT (GET_CODE (op));
+ for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
+ {
+ if (fmt[i] == 'E')
+ {
+ int j;
- See the x86-64 PS ABI for details.
-*/
+ for (j = XVECLEN (op, i) - 1; j >= 0; j--)
+ if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
+ return true;
+ }
-static int
-classify_argument (machine_mode mode, const_tree type,
- enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
+ else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
+ return true;
+ }
+
+ return false;
+}
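/* A minimal sketch of the same check written with the rtl-iter.h walker
   that this file already includes; the name is hypothetical and this is an
   illustration of the recursion above, not code the change installs.  */

static bool
symbolic_reference_mentioned_p_sketch (const_rtx op)
{
  subrtx_iterator::array_type array;
  /* Walk OP and every sub-rtx of OP, including constants.  */
  FOR_EACH_SUBRTX (iter, array, op, ALL)
    if (GET_CODE (*iter) == SYMBOL_REF || GET_CODE (*iter) == LABEL_REF)
      return true;
  return false;
}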
+
+/* Return true if it is appropriate to emit `ret' instructions in the
+ body of a function. Do this only if the epilogue is simple, needing a
+ couple of insns. Prior to reloading, we can't tell how many registers
+ must be saved, so return false then. Return false if there is no frame
+ marker to de-allocate. */
+
+bool
+ix86_can_use_return_insn_p (void)
{
- HOST_WIDE_INT bytes
- = mode == BLKmode ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
- int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
+ if (ix86_function_naked (current_function_decl))
+ return false;
- /* Variable sized entities are always passed/returned in memory. */
- if (bytes < 0)
+ /* Don't use `ret' instruction in interrupt handler. */
+ if (! reload_completed
+ || frame_pointer_needed
+ || cfun->machine->func_type != TYPE_NORMAL)
return 0;
- if (mode != VOIDmode
- && targetm.calls.must_pass_in_stack (mode, type))
+ /* Don't allow more than 32k pop, since that's all we can do
+ with one instruction. */
+ if (crtl->args.pops_args && crtl->args.size >= 32768)
return 0;
- if (type && AGGREGATE_TYPE_P (type))
- {
- int i;
- tree field;
- enum x86_64_reg_class subclasses[MAX_CLASSES];
+ struct ix86_frame &frame = cfun->machine->frame;
+ return (frame.stack_pointer_offset == UNITS_PER_WORD
+ && (frame.nregs + frame.nsseregs) == 0);
+}
+\f
+/* Value should be nonzero if functions must have frame pointers.
+ Zero means the frame pointer need not be set up (and parms may
+ be accessed via the stack pointer) in functions that seem suitable. */
- /* On x86-64 we pass structures larger than 64 bytes on the stack. */
- if (bytes > 64)
- return 0;
+static bool
+ix86_frame_pointer_required (void)
+{
+ /* If we accessed previous frames, then the generated code expects
+ to be able to access the saved ebp value in our frame. */
+ if (cfun->machine->accesses_prev_frame)
+ return true;
- for (i = 0; i < words; i++)
- classes[i] = X86_64_NO_CLASS;
+ /* Several x86 OSes need a frame pointer for other reasons,
+ usually pertaining to setjmp. */
+ if (SUBTARGET_FRAME_POINTER_REQUIRED)
+ return true;
- /* Zero sized arrays or structures are NO_CLASS. We return 0 to
- signalize memory class, so handle it as special case. */
- if (!words)
- {
- classes[0] = X86_64_NO_CLASS;
- return 1;
- }
+ /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
+ if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
+ return true;
- /* Classify each field of record and merge classes. */
- switch (TREE_CODE (type))
- {
- case RECORD_TYPE:
- /* And now merge the fields of structure. */
- for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
- {
- if (TREE_CODE (field) == FIELD_DECL)
- {
- int num;
+ /* Under Win64 SEH, very large frames need a frame pointer, as the
+ maximum stack allocation is 4GB. */
+ if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
+ return true;
- if (TREE_TYPE (field) == error_mark_node)
- continue;
+ /* SSE saves require frame-pointer when stack is misaligned. */
+ if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
+ return true;
+
+ /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
+ turns off the frame pointer by default. Turn it back on now if
+ we've not got a leaf function. */
+ if (TARGET_OMIT_LEAF_FRAME_POINTER
+ && (!crtl->is_leaf
+ || ix86_current_function_calls_tls_descriptor))
+ return true;
- /* Bitfields are always classified as integer. Handle them
- early, since later code would consider them to be
- misaligned integers. */
- if (DECL_BIT_FIELD (field))
- {
- for (i = (int_bit_position (field)
- + (bit_offset % 64)) / 8 / 8;
- i < ((int_bit_position (field) + (bit_offset % 64))
- + tree_to_shwi (DECL_SIZE (field))
- + 63) / 8 / 8; i++)
- classes[i]
- = merge_classes (X86_64_INTEGER_CLASS, classes[i]);
- }
- else
- {
- int pos;
+ if (crtl->profile && !flag_fentry)
+ return true;
- type = TREE_TYPE (field);
+ return false;
+}
- /* Flexible array member is ignored. */
- if (TYPE_MODE (type) == BLKmode
- && TREE_CODE (type) == ARRAY_TYPE
- && TYPE_SIZE (type) == NULL_TREE
- && TYPE_DOMAIN (type) != NULL_TREE
- && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
- == NULL_TREE))
- {
- static bool warned;
+/* Record that the current function accesses previous call frames. */
- if (!warned && warn_psabi)
- {
- warned = true;
- inform (input_location,
- "the ABI of passing struct with"
- " a flexible array member has"
- " changed in GCC 4.4");
- }
- continue;
- }
- num = classify_argument (TYPE_MODE (type), type,
- subclasses,
- (int_bit_position (field)
- + bit_offset) % 512);
- if (!num)
- return 0;
- pos = (int_bit_position (field)
- + (bit_offset % 64)) / 8 / 8;
- for (i = 0; i < num && (i + pos) < words; i++)
- classes[i + pos]
- = merge_classes (subclasses[i], classes[i + pos]);
- }
- }
- }
- break;
+void
+ix86_setup_frame_addresses (void)
+{
+ cfun->machine->accesses_prev_frame = 1;
+}
+\f
+#ifndef USE_HIDDEN_LINKONCE
+# if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
+# define USE_HIDDEN_LINKONCE 1
+# else
+# define USE_HIDDEN_LINKONCE 0
+# endif
+#endif
- case ARRAY_TYPE:
- /* Arrays are handled as small records. */
- {
- int num;
- num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
- TREE_TYPE (type), subclasses, bit_offset);
- if (!num)
- return 0;
+/* Label count for call and return thunks. It is used to make unique
+ labels in call and return thunks. */
+static int indirectlabelno;
- /* The partial classes are now full classes. */
- if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
- subclasses[0] = X86_64_SSE_CLASS;
- if (subclasses[0] == X86_64_INTEGERSI_CLASS
- && !((bit_offset % 64) == 0 && bytes == 4))
- subclasses[0] = X86_64_INTEGER_CLASS;
+/* True if call thunk function is needed. */
+static bool indirect_thunk_needed = false;
- for (i = 0; i < words; i++)
- classes[i] = subclasses[i % num];
+/* Bit mask of the integer registers that hold a branch target and are
+ used by call thunk functions. */
+static int indirect_thunks_used;
- break;
- }
- case UNION_TYPE:
- case QUAL_UNION_TYPE:
- /* Unions are similar to RECORD_TYPE but offset is always 0.
- */
- for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
- {
- if (TREE_CODE (field) == FIELD_DECL)
- {
- int num;
+/* True if return thunk function is needed. */
+static bool indirect_return_needed = false;
- if (TREE_TYPE (field) == error_mark_node)
- continue;
+/* True if return thunk function via CX is needed. */
+static bool indirect_return_via_cx;
- num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
- TREE_TYPE (field), subclasses,
- bit_offset);
- if (!num)
- return 0;
- for (i = 0; i < num && i < words; i++)
- classes[i] = merge_classes (subclasses[i], classes[i]);
- }
- }
- break;
+#ifndef INDIRECT_LABEL
+# define INDIRECT_LABEL "LIND"
+#endif
- default:
- gcc_unreachable ();
- }
+/* Indicate what prefix is needed for an indirect branch. */
+enum indirect_thunk_prefix
+{
+ indirect_thunk_prefix_none,
+ indirect_thunk_prefix_nt
+};
- if (words > 2)
- {
- /* When size > 16 bytes, if the first one isn't
- X86_64_SSE_CLASS or any other ones aren't
- X86_64_SSEUP_CLASS, everything should be passed in
- memory. */
- if (classes[0] != X86_64_SSE_CLASS)
- return 0;
+/* Return the prefix needed for an indirect branch INSN. */
- for (i = 1; i < words; i++)
- if (classes[i] != X86_64_SSEUP_CLASS)
- return 0;
+enum indirect_thunk_prefix
+indirect_thunk_need_prefix (rtx_insn *insn)
+{
+ enum indirect_thunk_prefix need_prefix;
+ if ((cfun->machine->indirect_branch_type
+ == indirect_branch_thunk_extern)
+ && ix86_notrack_prefixed_insn_p (insn))
+ {
+ /* NOTRACK prefix is only used with external thunk so that it
+ can be properly updated to support CET at run-time. */
+ need_prefix = indirect_thunk_prefix_nt;
+ }
+ else
+ need_prefix = indirect_thunk_prefix_none;
+ return need_prefix;
+}
+
+/* Fills in the label name that should be used for the indirect thunk. */
+
+static void
+indirect_thunk_name (char name[32], unsigned int regno,
+ enum indirect_thunk_prefix need_prefix,
+ bool ret_p)
+{
+ if (regno != INVALID_REGNUM && regno != CX_REG && ret_p)
+ gcc_unreachable ();
+
+ if (USE_HIDDEN_LINKONCE)
+ {
+ const char *prefix;
+
+ if (need_prefix == indirect_thunk_prefix_nt
+ && regno != INVALID_REGNUM)
+ {
+ /* NOTRACK prefix is only used with external thunk via
+ register so that NOTRACK prefix can be added to indirect
+ branch via register to support CET at run-time. */
+ prefix = "_nt";
}
+ else
+ prefix = "";
- /* Final merger cleanup. */
- for (i = 0; i < words; i++)
+ const char *ret = ret_p ? "return" : "indirect";
+
+ if (regno != INVALID_REGNUM)
{
- /* If one class is MEMORY, everything should be passed in
- memory. */
- if (classes[i] == X86_64_MEMORY_CLASS)
- return 0;
+ const char *reg_prefix;
+ if (LEGACY_INT_REGNO_P (regno))
+ reg_prefix = TARGET_64BIT ? "r" : "e";
+ else
+ reg_prefix = "";
+ sprintf (name, "__x86_%s_thunk%s_%s%s",
+ ret, prefix, reg_prefix, reg_names[regno]);
+ }
+ else
+ sprintf (name, "__x86_%s_thunk%s", ret, prefix);
+ }
+ else
+ {
+ if (regno != INVALID_REGNUM)
+ ASM_GENERATE_INTERNAL_LABEL (name, "LITR", regno);
+ else
+ {
+ if (ret_p)
+ ASM_GENERATE_INTERNAL_LABEL (name, "LRT", 0);
+ else
+ ASM_GENERATE_INTERNAL_LABEL (name, "LIT", 0);
+ }
+ }
+}
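/* To illustrate the naming scheme above (assuming x86-64 and
   USE_HIDDEN_LINKONCE; illustration only, not part of the change):

     indirect_thunk_name (name, AX_REG, indirect_thunk_prefix_none, false)
       -> "__x86_indirect_thunk_rax"
     indirect_thunk_name (name, AX_REG, indirect_thunk_prefix_nt, false)
       -> "__x86_indirect_thunk_nt_rax"
     indirect_thunk_name (name, INVALID_REGNUM, indirect_thunk_prefix_none, true)
       -> "__x86_return_thunk"  */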
- /* The X86_64_SSEUP_CLASS should be always preceded by
- X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
- if (classes[i] == X86_64_SSEUP_CLASS
- && classes[i - 1] != X86_64_SSE_CLASS
- && classes[i - 1] != X86_64_SSEUP_CLASS)
- {
- /* The first one should never be X86_64_SSEUP_CLASS. */
- gcc_assert (i != 0);
- classes[i] = X86_64_SSE_CLASS;
- }
+/* Output a call and return thunk for indirect branch.
+ If REGNO != INVALID_REGNUM, the function address is in REGNO and the
+ call and return thunk looks like:
- /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
- everything should be passed in memory. */
- if (classes[i] == X86_64_X87UP_CLASS
- && (classes[i - 1] != X86_64_X87_CLASS))
- {
- static bool warned;
+ call L2
+ L1:
+ pause
+ lfence
+ jmp L1
+ L2:
+ mov %REG, (%sp)
+ ret
- /* The first one should never be X86_64_X87UP_CLASS. */
- gcc_assert (i != 0);
- if (!warned && warn_psabi)
- {
- warned = true;
- inform (input_location,
- "the ABI of passing union with long double"
- " has changed in GCC 4.4");
- }
- return 0;
- }
+ Otherwise, the function address is on top of the stack and the
+ call and return thunk looks like:
+
+ call L2
+ L1:
+ pause
+ lfence
+ jmp L1
+ L2:
+ lea WORD_SIZE(%sp), %sp
+ ret
+ */
+
+static void
+output_indirect_thunk (unsigned int regno)
+{
+ char indirectlabel1[32];
+ char indirectlabel2[32];
+
+ ASM_GENERATE_INTERNAL_LABEL (indirectlabel1, INDIRECT_LABEL,
+ indirectlabelno++);
+ ASM_GENERATE_INTERNAL_LABEL (indirectlabel2, INDIRECT_LABEL,
+ indirectlabelno++);
+
+ /* Call */
+ fputs ("\tcall\t", asm_out_file);
+ assemble_name_raw (asm_out_file, indirectlabel2);
+ fputc ('\n', asm_out_file);
+
+ ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1);
+
+ /* AMD and Intel CPUs each prefer a different instruction as a loop
+ filler; emitting both pause and lfence is a compromise. */
+ fprintf (asm_out_file, "\tpause\n\tlfence\n");
+
+ /* Jump. */
+ fputs ("\tjmp\t", asm_out_file);
+ assemble_name_raw (asm_out_file, indirectlabel1);
+ fputc ('\n', asm_out_file);
+
+ ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2);
+
+ /* The above call insn pushed a word to stack. Adjust CFI info. */
+ if (flag_asynchronous_unwind_tables && dwarf2out_do_frame ())
+ {
+ if (! dwarf2out_do_cfi_asm ())
+ {
+ dw_cfi_ref xcfi = ggc_cleared_alloc<dw_cfi_node> ();
+ xcfi->dw_cfi_opc = DW_CFA_advance_loc4;
+ xcfi->dw_cfi_oprnd1.dw_cfi_addr = ggc_strdup (indirectlabel2);
+ vec_safe_push (cfun->fde->dw_fde_cfi, xcfi);
}
- return words;
+ dw_cfi_ref xcfi = ggc_cleared_alloc<dw_cfi_node> ();
+ xcfi->dw_cfi_opc = DW_CFA_def_cfa_offset;
+ xcfi->dw_cfi_oprnd1.dw_cfi_offset = 2 * UNITS_PER_WORD;
+ vec_safe_push (cfun->fde->dw_fde_cfi, xcfi);
+ dwarf2out_emit_cfi (xcfi);
}
- /* Compute alignment needed. We align all types to natural boundaries with
- exception of XFmode that is aligned to 64bits. */
- if (mode != VOIDmode && mode != BLKmode)
+ if (regno != INVALID_REGNUM)
{
- int mode_alignment = GET_MODE_BITSIZE (mode);
-
- if (mode == XFmode)
- mode_alignment = 128;
- else if (mode == XCmode)
- mode_alignment = 256;
- if (COMPLEX_MODE_P (mode))
- mode_alignment /= 2;
- /* Misaligned fields are always returned in memory. */
- if (bit_offset % mode_alignment)
- return 0;
+ /* MOV. */
+ rtx xops[2];
+ xops[0] = gen_rtx_MEM (word_mode, stack_pointer_rtx);
+ xops[1] = gen_rtx_REG (word_mode, regno);
+ output_asm_insn ("mov\t{%1, %0|%0, %1}", xops);
}
-
- /* for V1xx modes, just use the base mode */
- if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
- && GET_MODE_UNIT_SIZE (mode) == bytes)
- mode = GET_MODE_INNER (mode);
-
- /* Classification of atomic types. */
- switch (mode)
+ else
{
- case E_SDmode:
- case E_DDmode:
- classes[0] = X86_64_SSE_CLASS;
- return 1;
- case E_TDmode:
- classes[0] = X86_64_SSE_CLASS;
- classes[1] = X86_64_SSEUP_CLASS;
- return 2;
- case E_DImode:
- case E_SImode:
- case E_HImode:
- case E_QImode:
- case E_CSImode:
- case E_CHImode:
- case E_CQImode:
- {
- int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
+ /* LEA. */
+ rtx xops[2];
+ xops[0] = stack_pointer_rtx;
+ xops[1] = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
+ output_asm_insn ("lea\t{%E1, %0|%0, %E1}", xops);
+ }
- /* Analyze last 128 bits only. */
- size = (size - 1) & 0x7f;
+ fputs ("\tret\n", asm_out_file);
+}
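/* For reference, with REGNO == AX_REG the routine above emits roughly the
   following AT&T-syntax sequence (label numbers depend on indirectlabelno;
   illustration only, not part of the change):

	call	.LIND1
   .LIND0:
	pause
	lfence
	jmp	.LIND0
   .LIND1:
	mov	%rax, (%rsp)
	ret  */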
- if (size < 32)
- {
- classes[0] = X86_64_INTEGERSI_CLASS;
- return 1;
- }
- else if (size < 64)
- {
- classes[0] = X86_64_INTEGER_CLASS;
- return 1;
- }
- else if (size < 64+32)
- {
- classes[0] = X86_64_INTEGER_CLASS;
- classes[1] = X86_64_INTEGERSI_CLASS;
- return 2;
- }
- else if (size < 64+64)
- {
- classes[0] = classes[1] = X86_64_INTEGER_CLASS;
- return 2;
- }
- else
- gcc_unreachable ();
- }
- case E_CDImode:
- case E_TImode:
- classes[0] = classes[1] = X86_64_INTEGER_CLASS;
- return 2;
- case E_COImode:
- case E_OImode:
- /* OImode shouldn't be used directly. */
- gcc_unreachable ();
- case E_CTImode:
- return 0;
- case E_SFmode:
- if (!(bit_offset % 64))
- classes[0] = X86_64_SSESF_CLASS;
- else
- classes[0] = X86_64_SSE_CLASS;
- return 1;
- case E_DFmode:
- classes[0] = X86_64_SSEDF_CLASS;
- return 1;
- case E_XFmode:
- classes[0] = X86_64_X87_CLASS;
- classes[1] = X86_64_X87UP_CLASS;
- return 2;
- case E_TFmode:
- classes[0] = X86_64_SSE_CLASS;
- classes[1] = X86_64_SSEUP_CLASS;
- return 2;
- case E_SCmode:
- classes[0] = X86_64_SSE_CLASS;
- if (!(bit_offset % 64))
- return 1;
- else
- {
- static bool warned;
-
- if (!warned && warn_psabi)
- {
- warned = true;
- inform (input_location,
- "the ABI of passing structure with complex float"
- " member has changed in GCC 4.4");
- }
- classes[1] = X86_64_SSESF_CLASS;
- return 2;
- }
- case E_DCmode:
- classes[0] = X86_64_SSEDF_CLASS;
- classes[1] = X86_64_SSEDF_CLASS;
- return 2;
- case E_XCmode:
- classes[0] = X86_64_COMPLEX_X87_CLASS;
- return 1;
- case E_TCmode:
- /* This modes is larger than 16 bytes. */
- return 0;
- case E_V8SFmode:
- case E_V8SImode:
- case E_V32QImode:
- case E_V16HImode:
- case E_V4DFmode:
- case E_V4DImode:
- classes[0] = X86_64_SSE_CLASS;
- classes[1] = X86_64_SSEUP_CLASS;
- classes[2] = X86_64_SSEUP_CLASS;
- classes[3] = X86_64_SSEUP_CLASS;
- return 4;
- case E_V8DFmode:
- case E_V16SFmode:
- case E_V8DImode:
- case E_V16SImode:
- case E_V32HImode:
- case E_V64QImode:
- classes[0] = X86_64_SSE_CLASS;
- classes[1] = X86_64_SSEUP_CLASS;
- classes[2] = X86_64_SSEUP_CLASS;
- classes[3] = X86_64_SSEUP_CLASS;
- classes[4] = X86_64_SSEUP_CLASS;
- classes[5] = X86_64_SSEUP_CLASS;
- classes[6] = X86_64_SSEUP_CLASS;
- classes[7] = X86_64_SSEUP_CLASS;
- return 8;
- case E_V4SFmode:
- case E_V4SImode:
- case E_V16QImode:
- case E_V8HImode:
- case E_V2DFmode:
- case E_V2DImode:
- classes[0] = X86_64_SSE_CLASS;
- classes[1] = X86_64_SSEUP_CLASS;
- return 2;
- case E_V1TImode:
- case E_V1DImode:
- case E_V2SFmode:
- case E_V2SImode:
- case E_V4HImode:
- case E_V8QImode:
- classes[0] = X86_64_SSE_CLASS;
- return 1;
- case E_BLKmode:
- case E_VOIDmode:
- return 0;
- default:
- gcc_assert (VECTOR_MODE_P (mode));
+/* Output a function with a call and return thunk for indirect branch.
+ If REGNO != INVALID_REGNUM, the function address is in REGNO.
+ Otherwise, the function address is on top of the stack. The thunk is
+ used for function return if RET_P is true. */
- if (bytes > 16)
- return 0;
+static void
+output_indirect_thunk_function (enum indirect_thunk_prefix need_prefix,
+ unsigned int regno, bool ret_p)
+{
+ char name[32];
+ tree decl;
- gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
+ /* Create __x86_indirect_thunk. */
+ indirect_thunk_name (name, regno, need_prefix, ret_p);
+ decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
+ get_identifier (name),
+ build_function_type_list (void_type_node, NULL_TREE));
+ DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
+ NULL_TREE, void_type_node);
+ TREE_PUBLIC (decl) = 1;
+ TREE_STATIC (decl) = 1;
+ DECL_IGNORED_P (decl) = 1;
- if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
- classes[0] = X86_64_INTEGERSI_CLASS;
- else
- classes[0] = X86_64_INTEGER_CLASS;
- classes[1] = X86_64_INTEGER_CLASS;
- return 1 + (bytes > 8);
+#if TARGET_MACHO
+ if (TARGET_MACHO)
+ {
+ switch_to_section (darwin_sections[picbase_thunk_section]);
+ fputs ("\t.weak_definition\t", asm_out_file);
+ assemble_name (asm_out_file, name);
+ fputs ("\n\t.private_extern\t", asm_out_file);
+ assemble_name (asm_out_file, name);
+ putc ('\n', asm_out_file);
+ ASM_OUTPUT_LABEL (asm_out_file, name);
+ DECL_WEAK (decl) = 1;
}
-}
-
-/* Examine the argument and return set number of register required in each
- class. Return true iff parameter should be passed in memory. */
-
-static bool
-examine_argument (machine_mode mode, const_tree type, int in_return,
- int *int_nregs, int *sse_nregs)
-{
- enum x86_64_reg_class regclass[MAX_CLASSES];
- int n = classify_argument (mode, type, regclass, 0);
+ else
+#endif
+ if (USE_HIDDEN_LINKONCE)
+ {
+ cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
- *int_nregs = 0;
- *sse_nregs = 0;
+ targetm.asm_out.unique_section (decl, 0);
+ switch_to_section (get_named_section (decl, NULL, 0));
- if (!n)
- return true;
- for (n--; n >= 0; n--)
- switch (regclass[n])
+ targetm.asm_out.globalize_label (asm_out_file, name);
+ fputs ("\t.hidden\t", asm_out_file);
+ assemble_name (asm_out_file, name);
+ putc ('\n', asm_out_file);
+ ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
+ }
+ else
{
- case X86_64_INTEGER_CLASS:
- case X86_64_INTEGERSI_CLASS:
- (*int_nregs)++;
- break;
- case X86_64_SSE_CLASS:
- case X86_64_SSESF_CLASS:
- case X86_64_SSEDF_CLASS:
- (*sse_nregs)++;
- break;
- case X86_64_NO_CLASS:
- case X86_64_SSEUP_CLASS:
- break;
- case X86_64_X87_CLASS:
- case X86_64_X87UP_CLASS:
- case X86_64_COMPLEX_X87_CLASS:
- if (!in_return)
- return true;
- break;
- case X86_64_MEMORY_CLASS:
- gcc_unreachable ();
+ switch_to_section (text_section);
+ ASM_OUTPUT_LABEL (asm_out_file, name);
}
- return false;
+ DECL_INITIAL (decl) = make_node (BLOCK);
+ current_function_decl = decl;
+ allocate_struct_function (decl, false);
+ init_function_start (decl);
+ /* We're about to hide the function body from callees of final_* by
+ emitting it directly; tell them we're a thunk, if they care. */
+ cfun->is_thunk = true;
+ first_function_block_is_cold = false;
+ /* Make sure unwind info is emitted for the thunk if needed. */
+ final_start_function (emit_barrier (), asm_out_file, 1);
+
+ output_indirect_thunk (regno);
+
+ final_end_function ();
+ init_insn_lengths ();
+ free_after_compilation (cfun);
+ set_cfun (NULL);
+ current_function_decl = NULL;
}
-/* Construct container for the argument used by GCC interface. See
- FUNCTION_ARG for the detailed description. */
+static int pic_labels_used;
-static rtx
-construct_container (machine_mode mode, machine_mode orig_mode,
- const_tree type, int in_return, int nintregs, int nsseregs,
- const int *intreg, int sse_regno)
+/* Fills in the label name that should be used for a pc thunk for
+ the given register. */
+
+static void
+get_pc_thunk_name (char name[32], unsigned int regno)
{
- /* The following variables hold the static issued_error state. */
- static bool issued_sse_arg_error;
- static bool issued_sse_ret_error;
- static bool issued_x87_ret_error;
+ gcc_assert (!TARGET_64BIT);
- machine_mode tmpmode;
- int bytes
- = mode == BLKmode ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
- enum x86_64_reg_class regclass[MAX_CLASSES];
- int n;
- int i;
- int nexps = 0;
- int needed_sseregs, needed_intregs;
- rtx exp[MAX_CLASSES];
- rtx ret;
+ if (USE_HIDDEN_LINKONCE)
+ sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
+ else
+ ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
+}
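/* E.g. get_pc_thunk_name (name, BX_REG) produces "__x86.get_pc_thunk.bx"
   under USE_HIDDEN_LINKONCE (illustration only).  */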
- n = classify_argument (mode, type, regclass, 0);
- if (!n)
- return NULL;
- if (examine_argument (mode, type, in_return, &needed_intregs,
- &needed_sseregs))
- return NULL;
- if (needed_intregs > nintregs || needed_sseregs > nsseregs)
- return NULL;
- /* We allowed the user to turn off SSE for kernel mode. Don't crash if
- some less clueful developer tries to use floating-point anyway. */
- if (needed_sseregs && !TARGET_SSE)
+/* Emit the helper functions deferred to the end of compilation: the
+ indirect-branch and return thunks, and the -fpic pc-thunks that load a
+ register with the return address of the caller and then return. */
+
+static void
+ix86_code_end (void)
+{
+ rtx xops[2];
+ unsigned int regno;
+
+ if (indirect_return_needed)
+ output_indirect_thunk_function (indirect_thunk_prefix_none,
+ INVALID_REGNUM, true);
+ if (indirect_return_via_cx)
+ output_indirect_thunk_function (indirect_thunk_prefix_none,
+ CX_REG, true);
+ if (indirect_thunk_needed)
+ output_indirect_thunk_function (indirect_thunk_prefix_none,
+ INVALID_REGNUM, false);
+
+ for (regno = FIRST_REX_INT_REG; regno <= LAST_REX_INT_REG; regno++)
{
- if (in_return)
+ unsigned int i = regno - FIRST_REX_INT_REG + LAST_INT_REG + 1;
+ if ((indirect_thunks_used & (1 << i)))
+ output_indirect_thunk_function (indirect_thunk_prefix_none,
+ regno, false);
+ }
+
+ for (regno = FIRST_INT_REG; regno <= LAST_INT_REG; regno++)
+ {
+ char name[32];
+ tree decl;
+
+ if ((indirect_thunks_used & (1 << regno)))
+ output_indirect_thunk_function (indirect_thunk_prefix_none,
+ regno, false);
+
+ if (!(pic_labels_used & (1 << regno)))
+ continue;
+
+ get_pc_thunk_name (name, regno);
+
+ decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
+ get_identifier (name),
+ build_function_type_list (void_type_node, NULL_TREE));
+ DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
+ NULL_TREE, void_type_node);
+ TREE_PUBLIC (decl) = 1;
+ TREE_STATIC (decl) = 1;
+ DECL_IGNORED_P (decl) = 1;
+
+#if TARGET_MACHO
+ if (TARGET_MACHO)
{
- if (!issued_sse_ret_error)
- {
- error ("SSE register return with SSE disabled");
- issued_sse_ret_error = true;
- }
+ switch_to_section (darwin_sections[picbase_thunk_section]);
+ fputs ("\t.weak_definition\t", asm_out_file);
+ assemble_name (asm_out_file, name);
+ fputs ("\n\t.private_extern\t", asm_out_file);
+ assemble_name (asm_out_file, name);
+ putc ('\n', asm_out_file);
+ ASM_OUTPUT_LABEL (asm_out_file, name);
+ DECL_WEAK (decl) = 1;
}
- else if (!issued_sse_arg_error)
+ else
+#endif
+ if (USE_HIDDEN_LINKONCE)
{
- error ("SSE register argument with SSE disabled");
- issued_sse_arg_error = true;
- }
- return NULL;
- }
+ cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
- /* Likewise, error if the ABI requires us to return values in the
- x87 registers and the user specified -mno-80387. */
- if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
- for (i = 0; i < n; i++)
- if (regclass[i] == X86_64_X87_CLASS
- || regclass[i] == X86_64_X87UP_CLASS
- || regclass[i] == X86_64_COMPLEX_X87_CLASS)
+ targetm.asm_out.unique_section (decl, 0);
+ switch_to_section (get_named_section (decl, NULL, 0));
+
+ targetm.asm_out.globalize_label (asm_out_file, name);
+ fputs ("\t.hidden\t", asm_out_file);
+ assemble_name (asm_out_file, name);
+ putc ('\n', asm_out_file);
+ ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
+ }
+ else
{
- if (!issued_x87_ret_error)
- {
- error ("x87 register return with x87 disabled");
- issued_x87_ret_error = true;
- }
- return NULL;
+ switch_to_section (text_section);
+ ASM_OUTPUT_LABEL (asm_out_file, name);
}
- /* First construct simple cases. Avoid SCmode, since we want to use
- single register to pass this type. */
- if (n == 1 && mode != SCmode)
- switch (regclass[0])
- {
- case X86_64_INTEGER_CLASS:
- case X86_64_INTEGERSI_CLASS:
- return gen_rtx_REG (mode, intreg[0]);
- case X86_64_SSE_CLASS:
- case X86_64_SSESF_CLASS:
- case X86_64_SSEDF_CLASS:
- if (mode != BLKmode)
- return gen_reg_or_parallel (mode, orig_mode,
- GET_SSE_REGNO (sse_regno));
- break;
- case X86_64_X87_CLASS:
- case X86_64_COMPLEX_X87_CLASS:
- return gen_rtx_REG (mode, FIRST_STACK_REG);
- case X86_64_NO_CLASS:
- /* Zero sized array, struct or class. */
- return NULL;
- default:
- gcc_unreachable ();
- }
- if (n == 2
- && regclass[0] == X86_64_SSE_CLASS
- && regclass[1] == X86_64_SSEUP_CLASS
- && mode != BLKmode)
- return gen_reg_or_parallel (mode, orig_mode,
- GET_SSE_REGNO (sse_regno));
- if (n == 4
- && regclass[0] == X86_64_SSE_CLASS
- && regclass[1] == X86_64_SSEUP_CLASS
- && regclass[2] == X86_64_SSEUP_CLASS
- && regclass[3] == X86_64_SSEUP_CLASS
- && mode != BLKmode)
- return gen_reg_or_parallel (mode, orig_mode,
- GET_SSE_REGNO (sse_regno));
- if (n == 8
- && regclass[0] == X86_64_SSE_CLASS
- && regclass[1] == X86_64_SSEUP_CLASS
- && regclass[2] == X86_64_SSEUP_CLASS
- && regclass[3] == X86_64_SSEUP_CLASS
- && regclass[4] == X86_64_SSEUP_CLASS
- && regclass[5] == X86_64_SSEUP_CLASS
- && regclass[6] == X86_64_SSEUP_CLASS
- && regclass[7] == X86_64_SSEUP_CLASS
- && mode != BLKmode)
- return gen_reg_or_parallel (mode, orig_mode,
- GET_SSE_REGNO (sse_regno));
- if (n == 2
- && regclass[0] == X86_64_X87_CLASS
- && regclass[1] == X86_64_X87UP_CLASS)
- return gen_rtx_REG (XFmode, FIRST_STACK_REG);
+ DECL_INITIAL (decl) = make_node (BLOCK);
+ current_function_decl = decl;
+ allocate_struct_function (decl, false);
+ init_function_start (decl);
+ /* We're about to hide the function body from callees of final_* by
+ emitting it directly; tell them we're a thunk, if they care. */
+ cfun->is_thunk = true;
+ first_function_block_is_cold = false;
+ /* Make sure unwind info is emitted for the thunk if needed. */
+ final_start_function (emit_barrier (), asm_out_file, 1);
- if (n == 2
- && regclass[0] == X86_64_INTEGER_CLASS
- && regclass[1] == X86_64_INTEGER_CLASS
- && (mode == CDImode || mode == TImode || mode == BLKmode)
- && intreg[0] + 1 == intreg[1])
- {
- if (mode == BLKmode)
+ /* Pad stack IP move with 4 instructions (two NOPs count
+ as one instruction). */
+ if (TARGET_PAD_SHORT_FUNCTION)
{
- /* Use TImode for BLKmode values in 2 integer registers. */
- exp[0] = gen_rtx_EXPR_LIST (VOIDmode,
- gen_rtx_REG (TImode, intreg[0]),
- GEN_INT (0));
- ret = gen_rtx_PARALLEL (mode, rtvec_alloc (1));
- XVECEXP (ret, 0, 0) = exp[0];
- return ret;
+ int i = 8;
+
+ while (i--)
+ fputs ("\tnop\n", asm_out_file);
}
- else
- return gen_rtx_REG (mode, intreg[0]);
+
+ xops[0] = gen_rtx_REG (Pmode, regno);
+ xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
+ output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
+ output_asm_insn ("%!ret", NULL);
+ final_end_function ();
+ init_insn_lengths ();
+ free_after_compilation (cfun);
+ set_cfun (NULL);
+ current_function_decl = NULL;
}
- /* Otherwise figure out the entries of the PARALLEL. */
- for (i = 0; i < n; i++)
+ if (flag_split_stack)
+ file_end_indicate_split_stack ();
+}
+
+/* Emit code for the SET_GOT patterns. */
+
+const char *
+output_set_got (rtx dest, rtx label)
+{
+ rtx xops[3];
+
+ xops[0] = dest;
+
+ if (TARGET_VXWORKS_RTP && flag_pic)
{
- int pos;
+ /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
+ xops[2] = gen_rtx_MEM (Pmode,
+ gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
+ output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
- switch (regclass[i])
- {
- case X86_64_NO_CLASS:
- break;
- case X86_64_INTEGER_CLASS:
- case X86_64_INTEGERSI_CLASS:
- /* Merge TImodes on aligned occasions here too. */
- if (i * 8 + 8 > bytes)
- {
- unsigned int tmpbits = (bytes - i * 8) * BITS_PER_UNIT;
- if (!int_mode_for_size (tmpbits, 0).exists (&tmpmode))
- /* We've requested 24 bytes we
- don't have mode for. Use DImode. */
- tmpmode = DImode;
- }
- else if (regclass[i] == X86_64_INTEGERSI_CLASS)
- tmpmode = SImode;
- else
- tmpmode = DImode;
- exp [nexps++]
- = gen_rtx_EXPR_LIST (VOIDmode,
- gen_rtx_REG (tmpmode, *intreg),
- GEN_INT (i*8));
- intreg++;
- break;
- case X86_64_SSESF_CLASS:
- exp [nexps++]
- = gen_rtx_EXPR_LIST (VOIDmode,
- gen_rtx_REG (SFmode,
- GET_SSE_REGNO (sse_regno)),
- GEN_INT (i*8));
- sse_regno++;
- break;
- case X86_64_SSEDF_CLASS:
- exp [nexps++]
- = gen_rtx_EXPR_LIST (VOIDmode,
- gen_rtx_REG (DFmode,
- GET_SSE_REGNO (sse_regno)),
- GEN_INT (i*8));
- sse_regno++;
- break;
- case X86_64_SSE_CLASS:
- pos = i;
- switch (n)
- {
- case 1:
- tmpmode = DImode;
- break;
- case 2:
- if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
- {
- tmpmode = TImode;
- i++;
- }
- else
- tmpmode = DImode;
- break;
- case 4:
- gcc_assert (i == 0
- && regclass[1] == X86_64_SSEUP_CLASS
- && regclass[2] == X86_64_SSEUP_CLASS
- && regclass[3] == X86_64_SSEUP_CLASS);
- tmpmode = OImode;
- i += 3;
- break;
- case 8:
- gcc_assert (i == 0
- && regclass[1] == X86_64_SSEUP_CLASS
- && regclass[2] == X86_64_SSEUP_CLASS
- && regclass[3] == X86_64_SSEUP_CLASS
- && regclass[4] == X86_64_SSEUP_CLASS
- && regclass[5] == X86_64_SSEUP_CLASS
- && regclass[6] == X86_64_SSEUP_CLASS
- && regclass[7] == X86_64_SSEUP_CLASS);
- tmpmode = XImode;
- i += 7;
- break;
- default:
- gcc_unreachable ();
- }
- exp [nexps++]
- = gen_rtx_EXPR_LIST (VOIDmode,
- gen_rtx_REG (tmpmode,
- GET_SSE_REGNO (sse_regno)),
- GEN_INT (pos*8));
- sse_regno++;
- break;
- default:
- gcc_unreachable ();
- }
+ /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
+ Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
+ an unadorned address. */
+ xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
+ SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
+ output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
+ return "";
}
- /* Empty aligned struct, union or class. */
- if (nexps == 0)
- return NULL;
-
- ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
- for (i = 0; i < nexps; i++)
- XVECEXP (ret, 0, i) = exp [i];
- return ret;
-}
+ xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
-/* Update the data in CUM to advance over an argument of mode MODE
- and data type TYPE. (TYPE is null for libcalls where that information
- may not be available.)
+ if (flag_pic)
+ {
+ char name[32];
+ get_pc_thunk_name (name, REGNO (dest));
+ pic_labels_used |= 1 << REGNO (dest);
- Return a number of integer regsiters advanced over. */
+ xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
+ xops[2] = gen_rtx_MEM (QImode, xops[2]);
+ output_asm_insn ("%!call\t%X2", xops);
-static int
-function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
- const_tree type, HOST_WIDE_INT bytes,
- HOST_WIDE_INT words)
-{
- int res = 0;
- bool error_p = false;
+#if TARGET_MACHO
+ /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
+ This is what will be referenced by the Mach-O PIC subsystem. */
+ if (machopic_should_output_picbase_label () || !label)
+ ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
- if (TARGET_IAMCU)
- {
- /* Intel MCU psABI passes scalars and aggregates no larger than 8
- bytes in registers. */
- if (!VECTOR_MODE_P (mode) && bytes <= 8)
- goto pass_in_reg;
- return res;
+ /* When we are restoring the pic base at the site of a nonlocal label,
+ and we decided to emit the pic base above, we will still output a
+ local label used for calculating the correction offset (even though
+ the offset will be 0 in that case). */
+ if (label)
+ targetm.asm_out.internal_label (asm_out_file, "L",
+ CODE_LABEL_NUMBER (label));
+#endif
}
-
- switch (mode)
+ else
{
- default:
- break;
+ if (TARGET_MACHO)
+ /* We don't need a pic base, we're not producing pic. */
+ gcc_unreachable ();
- case E_BLKmode:
- if (bytes < 0)
- break;
- /* FALLTHRU */
+ xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
+ output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
+ targetm.asm_out.internal_label (asm_out_file, "L",
+ CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
+ }
- case E_DImode:
- case E_SImode:
- case E_HImode:
- case E_QImode:
-pass_in_reg:
- cum->words += words;
- cum->nregs -= words;
- cum->regno += words;
- if (cum->nregs >= 0)
- res = words;
- if (cum->nregs <= 0)
- {
- cum->nregs = 0;
- cfun->machine->arg_reg_available = false;
- cum->regno = 0;
- }
- break;
+ if (!TARGET_MACHO)
+ output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
- case E_OImode:
- /* OImode shouldn't be used directly. */
- gcc_unreachable ();
+ return "";
+}
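/* In the common 32-bit ELF PIC case (DEST is %ebx, no VxWorks RTP, no
   Mach-O), the routine above therefore emits approximately:

	call	__x86.get_pc_thunk.bx
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx

   This is an illustration of the output, not text the change adds.  */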
- case E_DFmode:
- if (cum->float_in_sse == -1)
- error_p = true;
- if (cum->float_in_sse < 2)
- break;
- /* FALLTHRU */
- case E_SFmode:
- if (cum->float_in_sse == -1)
- error_p = true;
- if (cum->float_in_sse < 1)
- break;
- /* FALLTHRU */
+/* Generate an "push" pattern for input ARG. */
- case E_V8SFmode:
- case E_V8SImode:
- case E_V64QImode:
- case E_V32HImode:
- case E_V16SImode:
- case E_V8DImode:
- case E_V16SFmode:
- case E_V8DFmode:
- case E_V32QImode:
- case E_V16HImode:
- case E_V4DFmode:
- case E_V4DImode:
- case E_TImode:
- case E_V16QImode:
- case E_V8HImode:
- case E_V4SImode:
- case E_V2DImode:
- case E_V4SFmode:
- case E_V2DFmode:
- if (!type || !AGGREGATE_TYPE_P (type))
- {
- cum->sse_words += words;
- cum->sse_nregs -= 1;
- cum->sse_regno += 1;
- if (cum->sse_nregs <= 0)
- {
- cum->sse_nregs = 0;
- cum->sse_regno = 0;
- }
- }
- break;
+rtx
+gen_push (rtx arg)
+{
+ struct machine_function *m = cfun->machine;
- case E_V8QImode:
- case E_V4HImode:
- case E_V2SImode:
- case E_V2SFmode:
- case E_V1TImode:
- case E_V1DImode:
- if (!type || !AGGREGATE_TYPE_P (type))
- {
- cum->mmx_words += words;
- cum->mmx_nregs -= 1;
- cum->mmx_regno += 1;
- if (cum->mmx_nregs <= 0)
- {
- cum->mmx_nregs = 0;
- cum->mmx_regno = 0;
- }
- }
- break;
- }
- if (error_p)
- {
- cum->float_in_sse = 0;
- error ("calling %qD with SSE calling convention without "
- "SSE/SSE2 enabled", cum->decl);
- sorry ("this is a GCC bug that can be worked around by adding "
- "attribute used to function called");
- }
+ if (m->fs.cfa_reg == stack_pointer_rtx)
+ m->fs.cfa_offset += UNITS_PER_WORD;
+ m->fs.sp_offset += UNITS_PER_WORD;
- return res;
+ if (REG_P (arg) && GET_MODE (arg) != word_mode)
+ arg = gen_rtx_REG (word_mode, REGNO (arg));
+
+ return gen_rtx_SET (gen_rtx_MEM (word_mode,
+ gen_rtx_PRE_DEC (Pmode,
+ stack_pointer_rtx)),
+ arg);
}
-static int
-function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
- const_tree type, HOST_WIDE_INT words, bool named)
-{
- int int_nregs, sse_nregs;
+/* Generate an "pop" pattern for input ARG. */
- /* Unnamed 512 and 256bit vector mode parameters are passed on stack. */
- if (!named && (VALID_AVX512F_REG_MODE (mode)
- || VALID_AVX256_REG_MODE (mode)))
- return 0;
+rtx
+gen_pop (rtx arg)
+{
+ if (REG_P (arg) && GET_MODE (arg) != word_mode)
+ arg = gen_rtx_REG (word_mode, REGNO (arg));
- if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
- && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
- {
- cum->nregs -= int_nregs;
- cum->sse_nregs -= sse_nregs;
- cum->regno += int_nregs;
- cum->sse_regno += sse_nregs;
- return int_nregs;
- }
- else
- {
- int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
- cum->words = ROUND_UP (cum->words, align);
- cum->words += words;
- return 0;
- }
+ return gen_rtx_SET (arg,
+ gen_rtx_MEM (word_mode,
+ gen_rtx_POST_INC (Pmode,
+ stack_pointer_rtx)));
}
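/* A sketch of the RTL the two helpers above return, assuming 64-bit LP64
   (word_mode == Pmode == DImode); illustration only:

     gen_push (hard_frame_pointer_rtx)
       -> (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI bp))
     gen_pop (hard_frame_pointer_rtx)
       -> (set (reg:DI bp) (mem:DI (post_inc:DI (reg:DI sp))))

   gen_push additionally bumps the tracked CFA and SP offsets by
   UNITS_PER_WORD, as the code above shows.  */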
-static int
-function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
- HOST_WIDE_INT words)
+/* Return the number of an unused call-clobbered register if one is
+ available for the entire function, or INVALID_REGNUM otherwise. */
+
+static unsigned int
+ix86_select_alt_pic_regnum (void)
{
- /* Otherwise, this should be passed indirect. */
- gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
+ if (ix86_use_pseudo_pic_reg ())
+ return INVALID_REGNUM;
- cum->words += words;
- if (cum->nregs > 0)
+ if (crtl->is_leaf
+ && !crtl->profile
+ && !ix86_current_function_calls_tls_descriptor)
{
- cum->nregs -= 1;
- cum->regno += 1;
- return 1;
+ int i, drap;
+ /* Can't use the same register for both PIC and DRAP. */
+ if (crtl->drap_reg)
+ drap = REGNO (crtl->drap_reg);
+ else
+ drap = -1;
+ for (i = 2; i >= 0; --i)
+ if (i != drap && !df_regs_ever_live_p (i))
+ return i;
}
- return 0;
+
+ return INVALID_REGNUM;
}
-/* Update the data in CUM to advance over an argument of mode MODE and
- data type TYPE. (TYPE is null for libcalls where that information
- may not be available.) */
+/* Return true if REGNO is used by the epilogue. */
-static void
-ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
- const_tree type, bool named)
+bool
+ix86_epilogue_uses (int regno)
{
- CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
- HOST_WIDE_INT bytes, words;
- int nregs;
+ /* If there are no caller-saved registers, we preserve all registers,
+ except for MMX and x87 registers which aren't supported when saving
+ and restoring registers. Don't explicitly save SP register since
+ it is always preserved. */
+ return (epilogue_completed
+ && cfun->machine->no_caller_saved_registers
+ && !fixed_regs[regno]
+ && !STACK_REGNO_P (regno)
+ && !MMX_REGNO_P (regno));
+}
- /* The argument of interrupt handler is a special case and is
- handled in ix86_function_arg. */
- if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
- return;
+/* Return nonzero if register REGNO can be used as a scratch register
+ in peephole2. */
- if (mode == BLKmode)
- bytes = int_size_in_bytes (type);
- else
- bytes = GET_MODE_SIZE (mode);
- words = CEIL (bytes, UNITS_PER_WORD);
+static bool
+ix86_hard_regno_scratch_ok (unsigned int regno)
+{
+ /* If there are no caller-saved registers, we can't use any register
+ as a scratch register after epilogue and use REGNO as scratch
+ register only if it has been used before to avoid saving and
+ restoring it. */
+ return (!cfun->machine->no_caller_saved_registers
+ || (!epilogue_completed
+ && df_regs_ever_live_p (regno)));
+}
- if (type)
- mode = type_natural_mode (type, NULL, false);
+/* Return TRUE if we need to save REGNO. */
- if (TARGET_64BIT)
+bool
+ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined)
+{
+ /* If there are no caller-saved registers, we preserve all registers,
+ except for MMX and x87 registers which aren't supported when saving
+ and restoring registers. Don't explicitly save SP register since
+ it is always preserved. */
+ if (cfun->machine->no_caller_saved_registers)
{
- enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
+ /* Don't preserve registers used for function return value. */
+ rtx reg = crtl->return_rtx;
+ if (reg)
+ {
+ unsigned int i = REGNO (reg);
+ unsigned int nregs = REG_NREGS (reg);
+ while (nregs-- > 0)
+ if ((i + nregs) == regno)
+ return false;
+ }
- if (call_abi == MS_ABI)
- nregs = function_arg_advance_ms_64 (cum, bytes, words);
- else
- nregs = function_arg_advance_64 (cum, mode, type, words, named);
+ return (df_regs_ever_live_p (regno)
+ && !fixed_regs[regno]
+ && !STACK_REGNO_P (regno)
+ && !MMX_REGNO_P (regno)
+ && (regno != HARD_FRAME_POINTER_REGNUM
+ || !frame_pointer_needed));
}
- else
- nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
- if (!nregs)
+ if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
+ && pic_offset_table_rtx)
{
- /* Track if there are outgoing arguments on stack. */
- if (cum->caller)
- cfun->machine->outgoing_args_on_stack = true;
+ if (ix86_use_pseudo_pic_reg ())
+ {
+ /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to
+ _mcount in prologue. */
+ if (!TARGET_64BIT && flag_pic && crtl->profile)
+ return true;
+ }
+ else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
+ || crtl->profile
+ || crtl->calls_eh_return
+ || crtl->uses_const_pool
+ || cfun->has_nonlocal_label)
+ return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
}
-}
-
-/* Define where to put the arguments to a function.
- Value is zero to push the argument on the stack,
- or a hard register in which to store the argument.
-
- MODE is the argument's machine mode.
- TYPE is the data type of the argument (as a tree).
- This is null for libcalls where that information may
- not be available.
- CUM is a variable of type CUMULATIVE_ARGS which gives info about
- the preceding args and about the function being called.
- NAMED is nonzero if this argument is a named parameter
- (otherwise it is an extra parameter matching an ellipsis). */
-static rtx
-function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
- machine_mode orig_mode, const_tree type,
- HOST_WIDE_INT bytes, HOST_WIDE_INT words)
-{
- bool error_p = false;
-
- /* Avoid the AL settings for the Unix64 ABI. */
- if (mode == VOIDmode)
- return constm1_rtx;
-
- if (TARGET_IAMCU)
+ if (crtl->calls_eh_return && maybe_eh_return)
{
- /* Intel MCU psABI passes scalars and aggregates no larger than 8
- bytes in registers. */
- if (!VECTOR_MODE_P (mode) && bytes <= 8)
- goto pass_in_reg;
- return NULL_RTX;
+ unsigned i;
+ for (i = 0; ; i++)
+ {
+ unsigned test = EH_RETURN_DATA_REGNO (i);
+ if (test == INVALID_REGNUM)
+ break;
+ if (test == regno)
+ return true;
+ }
}
- switch (mode)
+ if (ignore_outlined && cfun->machine->call_ms2sysv)
{
- default:
- break;
+ unsigned count = cfun->machine->call_ms2sysv_extra_regs
+ + xlogue_layout::MIN_REGS;
+ if (xlogue_layout::is_stub_managed_reg (regno, count))
+ return false;
+ }
- case E_BLKmode:
- if (bytes < 0)
- break;
- /* FALLTHRU */
- case E_DImode:
- case E_SImode:
- case E_HImode:
- case E_QImode:
-pass_in_reg:
- if (words <= cum->nregs)
- {
- int regno = cum->regno;
+ if (crtl->drap_reg
+ && regno == REGNO (crtl->drap_reg)
+ && !cfun->machine->no_drap_save_restore)
+ return true;
- /* Fastcall allocates the first two DWORD (SImode) or
- smaller arguments to ECX and EDX if it isn't an
- aggregate type . */
- if (cum->fastcall)
- {
- if (mode == BLKmode
- || mode == DImode
- || (type && AGGREGATE_TYPE_P (type)))
- break;
+ return (df_regs_ever_live_p (regno)
+ && !call_used_regs[regno]
+ && !fixed_regs[regno]
+ && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
+}
- /* ECX not EAX is the first allocated register. */
- if (regno == AX_REG)
- regno = CX_REG;
- }
- return gen_rtx_REG (mode, regno);
- }
- break;
+/* Return number of saved general purpose registers. */
- case E_DFmode:
- if (cum->float_in_sse == -1)
- error_p = true;
- if (cum->float_in_sse < 2)
- break;
- /* FALLTHRU */
- case E_SFmode:
- if (cum->float_in_sse == -1)
- error_p = true;
- if (cum->float_in_sse < 1)
- break;
- /* FALLTHRU */
- case E_TImode:
- /* In 32bit, we pass TImode in xmm registers. */
- case E_V16QImode:
- case E_V8HImode:
- case E_V4SImode:
- case E_V2DImode:
- case E_V4SFmode:
- case E_V2DFmode:
- if (!type || !AGGREGATE_TYPE_P (type))
- {
- if (cum->sse_nregs)
- return gen_reg_or_parallel (mode, orig_mode,
- cum->sse_regno + FIRST_SSE_REG);
- }
- break;
+static int
+ix86_nsaved_regs (void)
+{
+ int nregs = 0;
+ int regno;
- case E_OImode:
- case E_XImode:
- /* OImode and XImode shouldn't be used directly. */
- gcc_unreachable ();
+ for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
+ nregs ++;
+ return nregs;
+}
- case E_V64QImode:
- case E_V32HImode:
- case E_V16SImode:
- case E_V8DImode:
- case E_V16SFmode:
- case E_V8DFmode:
- case E_V8SFmode:
- case E_V8SImode:
- case E_V32QImode:
- case E_V16HImode:
- case E_V4DFmode:
- case E_V4DImode:
- if (!type || !AGGREGATE_TYPE_P (type))
- {
- if (cum->sse_nregs)
- return gen_reg_or_parallel (mode, orig_mode,
- cum->sse_regno + FIRST_SSE_REG);
- }
- break;
+/* Return number of saved SSE registers. */
- case E_V8QImode:
- case E_V4HImode:
- case E_V2SImode:
- case E_V2SFmode:
- case E_V1TImode:
- case E_V1DImode:
- if (!type || !AGGREGATE_TYPE_P (type))
- {
- if (cum->mmx_nregs)
- return gen_reg_or_parallel (mode, orig_mode,
- cum->mmx_regno + FIRST_MMX_REG);
- }
- break;
- }
- if (error_p)
- {
- cum->float_in_sse = 0;
- error ("calling %qD with SSE calling convention without "
- "SSE/SSE2 enabled", cum->decl);
- sorry ("this is a GCC bug that can be worked around by adding "
- "attribute used to function called");
- }
+static int
+ix86_nsaved_sseregs (void)
+{
+ int nregs = 0;
+ int regno;
- return NULL_RTX;
+ if (!TARGET_64BIT_MS_ABI)
+ return 0;
+ for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
+ nregs ++;
+ return nregs;
}
-static rtx
-function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
- machine_mode orig_mode, const_tree type, bool named)
+/* Given FROM and TO register numbers, say whether this elimination is
+ allowed. If stack alignment is needed, we can only replace argument
+ pointer with hard frame pointer, or replace frame pointer with stack
+ pointer. Otherwise, frame pointer elimination is automatically
+ handled and all other eliminations are valid. */
+
+static bool
+ix86_can_eliminate (const int from, const int to)
{
- /* Handle a hidden AL argument containing number of registers
- for varargs x86-64 functions. */
- if (mode == VOIDmode)
- return GEN_INT (cum->maybe_vaarg
- ? (cum->sse_nregs < 0
- ? X86_64_SSE_REGPARM_MAX
- : cum->sse_regno)
- : -1);
+ if (stack_realign_fp)
+ return ((from == ARG_POINTER_REGNUM
+ && to == HARD_FRAME_POINTER_REGNUM)
+ || (from == FRAME_POINTER_REGNUM
+ && to == STACK_POINTER_REGNUM));
+ else
+ return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
+}
- switch (mode)
+/* Return the offset between two registers, one to be eliminated, and the other
+ its replacement, at the start of a routine. */
+
+HOST_WIDE_INT
+ix86_initial_elimination_offset (int from, int to)
+{
+ struct ix86_frame &frame = cfun->machine->frame;
+
+ if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
+ return frame.hard_frame_pointer_offset;
+ else if (from == FRAME_POINTER_REGNUM
+ && to == HARD_FRAME_POINTER_REGNUM)
+ return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
+ else
{
- default:
- break;
+ gcc_assert (to == STACK_POINTER_REGNUM);
- case E_V8SFmode:
- case E_V8SImode:
- case E_V32QImode:
- case E_V16HImode:
- case E_V4DFmode:
- case E_V4DImode:
- case E_V16SFmode:
- case E_V16SImode:
- case E_V64QImode:
- case E_V32HImode:
- case E_V8DFmode:
- case E_V8DImode:
- /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */
- if (!named)
- return NULL;
- break;
- }
+ if (from == ARG_POINTER_REGNUM)
+ return frame.stack_pointer_offset;
- return construct_container (mode, orig_mode, type, 0, cum->nregs,
- cum->sse_nregs,
- &x86_64_int_parameter_registers [cum->regno],
- cum->sse_regno);
+ gcc_assert (from == FRAME_POINTER_REGNUM);
+ return frame.stack_pointer_offset - frame.frame_pointer_offset;
+ }
}
+/* In a dynamically-aligned function, we can't know the offset from
+ stack pointer to frame pointer, so we must ensure that setjmp
+ eliminates fp against the hard fp (%ebp) rather than trying to
+ index from %esp up to the top of the frame across a gap that is
+ of unknown (at compile-time) size. */
static rtx
-function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
- machine_mode orig_mode, bool named,
- HOST_WIDE_INT bytes)
+ix86_builtin_setjmp_frame_value (void)
{
- unsigned int regno;
-
- /* We need to add clobber for MS_ABI->SYSV ABI calls in expand_call.
- We use value of -2 to specify that current function call is MSABI. */
- if (mode == VOIDmode)
- return GEN_INT (-2);
-
- /* If we've run out of registers, it goes on the stack. */
- if (cum->nregs == 0)
- return NULL_RTX;
-
- regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
-
- /* Only floating point modes are passed in anything but integer regs. */
- if (TARGET_SSE && (mode == SFmode || mode == DFmode))
- {
- if (named)
- regno = cum->regno + FIRST_SSE_REG;
- else
- {
- rtx t1, t2;
+ return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
+}
- /* Unnamed floating parameters are passed in both the
- SSE and integer registers. */
- t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
- t2 = gen_rtx_REG (mode, regno);
- t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
- t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
- return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
- }
- }
- /* Handle aggregated types passed in register. */
- if (orig_mode == BLKmode)
+/* Emits a warning for unsupported msabi to sysv pro/epilogues. */
+void warn_once_call_ms2sysv_xlogues (const char *feature)
+{
+ static bool warned_once = false;
+ if (!warned_once)
{
- if (bytes > 0 && bytes <= 8)
- mode = (bytes > 4 ? DImode : SImode);
- if (mode == BLKmode)
- mode = DImode;
+ warning (0, "%<-mcall-ms2sysv-xlogues%> is not compatible with %s",
+ feature);
+ warned_once = true;
}
+}
- return gen_reg_or_parallel (mode, orig_mode, regno);
+/* Return the probing interval for -fstack-clash-protection. */
+
+static HOST_WIDE_INT
+get_probe_interval (void)
+{
+ if (flag_stack_clash_protection)
+ return (HOST_WIDE_INT_1U
+ << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL));
+ else
+ return (HOST_WIDE_INT_1U << STACK_CHECK_PROBE_INTERVAL_EXP);
}
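/* With the usual defaults (both the stack-clash parameter and
   STACK_CHECK_PROBE_INTERVAL_EXP are assumed to be 12 here), either branch
   above evaluates to HOST_WIDE_INT_1U << 12, i.e. a 4 KiB probing
   interval.  */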
-/* Return where to put the arguments to a function.
- Return zero to push the argument on the stack, or a hard register in which to store the argument.
+/* When using -fsplit-stack, the allocation routines set a field in
+ the TCB to the bottom of the stack plus this much space, measured
+ in bytes. */
- MODE is the argument's machine mode. TYPE is the data type of the
- argument. It is null for libcalls where that information may not be
- available. CUM gives information about the preceding args and about
- the function being called. NAMED is nonzero if this argument is a
- named parameter (otherwise it is an extra parameter matching an
- ellipsis). */
+#define SPLIT_STACK_AVAILABLE 256
-static rtx
-ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
- const_tree type, bool named)
+/* Fill structure ix86_frame about frame of currently computed function. */
+
+static void
+ix86_compute_frame_layout (void)
{
- CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
- machine_mode mode = omode;
- HOST_WIDE_INT bytes, words;
- rtx arg;
+ struct ix86_frame *frame = &cfun->machine->frame;
+ struct machine_function *m = cfun->machine;
+ unsigned HOST_WIDE_INT stack_alignment_needed;
+ HOST_WIDE_INT offset;
+ unsigned HOST_WIDE_INT preferred_alignment;
+ HOST_WIDE_INT size = get_frame_size ();
+ HOST_WIDE_INT to_allocate;
- if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
+ /* m->call_ms2sysv is initially enabled in ix86_expand_call for all 64-bit
+ * ms_abi functions that call a sysv function. We now need to prune away
+ * cases where it should be disabled. */
+ if (TARGET_64BIT && m->call_ms2sysv)
{
- gcc_assert (type != NULL_TREE);
- if (POINTER_TYPE_P (type))
+ gcc_assert (TARGET_64BIT_MS_ABI);
+ gcc_assert (TARGET_CALL_MS2SYSV_XLOGUES);
+ gcc_assert (!TARGET_SEH);
+ gcc_assert (TARGET_SSE);
+ gcc_assert (!ix86_using_red_zone ());
+
+ if (crtl->calls_eh_return)
{
- /* This is the pointer argument. */
- gcc_assert (TYPE_MODE (type) == Pmode);
- /* It is at -WORD(AP) in the current frame in interrupt and
- exception handlers. */
- arg = plus_constant (Pmode, arg_pointer_rtx, -UNITS_PER_WORD);
+ gcc_assert (!reload_completed);
+ m->call_ms2sysv = false;
+ warn_once_call_ms2sysv_xlogues ("__builtin_eh_return");
+ }
+
+ else if (ix86_static_chain_on_stack)
+ {
+ gcc_assert (!reload_completed);
+ m->call_ms2sysv = false;
+ warn_once_call_ms2sysv_xlogues ("static call chains");
}
+
+ /* Finally, compute which registers the stub will manage. */
else
{
- gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
- && TREE_CODE (type) == INTEGER_TYPE
- && TYPE_MODE (type) == word_mode);
- /* The error code is the word-mode integer argument at
- -2 * WORD(AP) in the current frame of the exception
- handler. */
- arg = gen_rtx_MEM (word_mode,
- plus_constant (Pmode,
- arg_pointer_rtx,
- -2 * UNITS_PER_WORD));
+ unsigned count = xlogue_layout::count_stub_managed_regs ();
+ m->call_ms2sysv_extra_regs = count - xlogue_layout::MIN_REGS;
+ m->call_ms2sysv_pad_in = 0;
}
- return arg;
}
- if (mode == BLKmode)
- bytes = int_size_in_bytes (type);
- else
- bytes = GET_MODE_SIZE (mode);
- words = CEIL (bytes, UNITS_PER_WORD);
+ frame->nregs = ix86_nsaved_regs ();
+ frame->nsseregs = ix86_nsaved_sseregs ();
- /* To simplify the code below, represent vector types with a vector mode
- even if MMX/SSE are not active. */
- if (type && TREE_CODE (type) == VECTOR_TYPE)
- mode = type_natural_mode (type, cum, false);
+ /* The 64-bit MS ABI seems to require stack alignment to always be 16,
+ except for function prologues, leaf functions, and when the default
+ incoming stack boundary is overridden at the command line or via the
+ force_align_arg_pointer attribute.
- if (TARGET_64BIT)
+ Darwin's ABI specifies 128-bit alignment for both the 32-bit and
+ 64-bit variants at call sites, including profile function calls.
+ */
+ if (((TARGET_64BIT_MS_ABI || TARGET_MACHO)
+ && crtl->preferred_stack_boundary < 128)
+ && (!crtl->is_leaf || cfun->calls_alloca != 0
+ || ix86_current_function_calls_tls_descriptor
+ || (TARGET_MACHO && crtl->profile)
+ || ix86_incoming_stack_boundary < 128))
{
- enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
-
- if (call_abi == MS_ABI)
- arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
- else
- arg = function_arg_64 (cum, mode, omode, type, named);
+ crtl->preferred_stack_boundary = 128;
+ crtl->stack_alignment_needed = 128;
}
- else
- arg = function_arg_32 (cum, mode, omode, type, bytes, words);
- /* Track if there are outgoing arguments on stack. */
- if (arg == NULL_RTX && cum->caller)
- cfun->machine->outgoing_args_on_stack = true;
-
- return arg;
-}
-
-/* A C expression that indicates when an argument must be passed by
- reference. If nonzero for an argument, a copy of that argument is
- made in memory and a pointer to the argument is passed instead of
- the argument itself. The pointer is passed in whatever way is
- appropriate for passing a pointer to that type. */
+ stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
+ preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
-static bool
-ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
- const_tree type, bool)
-{
- CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
+ gcc_assert (!size || stack_alignment_needed);
+ gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
+ gcc_assert (preferred_alignment <= stack_alignment_needed);
- if (TARGET_64BIT)
+ /* The only ABI saving SSE regs should be 64-bit ms_abi. */
+ gcc_assert (TARGET_64BIT || !frame->nsseregs);
+ if (TARGET_64BIT && m->call_ms2sysv)
{
- enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
-
- /* See Windows x64 Software Convention. */
- if (call_abi == MS_ABI)
- {
- HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
+ gcc_assert (stack_alignment_needed >= 16);
+ gcc_assert (!frame->nsseregs);
+ }
- if (type)
- {
- /* Arrays are passed by reference. */
- if (TREE_CODE (type) == ARRAY_TYPE)
- return true;
+ /* For SEH we have to limit the amount of code movement into the prologue.
+ At present we do this via a BLOCKAGE, at which point there's very little
+ scheduling that can be done, which means that there's very little point
+ in doing anything except PUSHs. */
+ if (TARGET_SEH)
+ m->use_fast_prologue_epilogue = false;
+ else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun)))
+ {
+ int count = frame->nregs;
+ struct cgraph_node *node = cgraph_node::get (current_function_decl);
- if (RECORD_OR_UNION_TYPE_P (type))
- {
- /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
- are passed by reference. */
- msize = int_size_in_bytes (type);
- }
- }
+ /* The fast prologue uses move instead of push to save registers. This
+ is significantly longer, but also executes faster as modern hardware
+ can execute the moves in parallel, but can't do that for push/pop.
- /* __m128 is passed by reference. */
- return msize != 1 && msize != 2 && msize != 4 && msize != 8;
- }
- else if (type && int_size_in_bytes (type) == -1)
- return true;
+ Be careful about choosing which prologue to emit: when the function
+ takes many instructions to execute, we may as well use the slow
+ version, and likewise when the function is known to be outside a hot
+ spot (this is known with feedback only). Weight the size of the
+ function by the number of registers to save, as it is cheap to use
+ one or two push instructions but very slow to use many of them. */
+ if (count)
+ count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
+ if (node->frequency < NODE_FREQUENCY_NORMAL
+ || (flag_branch_probabilities
+ && node->frequency < NODE_FREQUENCY_HOT))
+ m->use_fast_prologue_epilogue = false;
+ else
+ m->use_fast_prologue_epilogue
+ = !expensive_function_p (count);
}
- return false;
-}
+ frame->save_regs_using_mov
+ = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue
+ /* If static stack checking is enabled and done with probes,
+ the registers need to be saved before allocating the frame. */
+ && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
-/* Return true when TYPE should be 128bit aligned for 32bit argument
- passing ABI. XXX: This function is obsolete and is only used for
- checking psABI compatibility with previous versions of GCC. */
+ /* Skip return address and error code in exception handler. */
+ offset = INCOMING_FRAME_SP_OFFSET;
-static bool
-ix86_compat_aligned_value_p (const_tree type)
-{
- machine_mode mode = TYPE_MODE (type);
- if (((TARGET_SSE && SSE_REG_MODE_P (mode))
- || mode == TDmode
- || mode == TFmode
- || mode == TCmode)
- && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
- return true;
- if (TYPE_ALIGN (type) < 128)
- return false;
+ /* Skip pushed static chain. */
+ if (ix86_static_chain_on_stack)
+ offset += UNITS_PER_WORD;
- if (AGGREGATE_TYPE_P (type))
- {
- /* Walk the aggregates recursively. */
- switch (TREE_CODE (type))
- {
- case RECORD_TYPE:
- case UNION_TYPE:
- case QUAL_UNION_TYPE:
- {
- tree field;
+ /* Skip saved base pointer. */
+ if (frame_pointer_needed)
+ offset += UNITS_PER_WORD;
+ frame->hfp_save_offset = offset;
- /* Walk all the structure fields. */
- for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
- {
- if (TREE_CODE (field) == FIELD_DECL
- && ix86_compat_aligned_value_p (TREE_TYPE (field)))
- return true;
- }
- break;
- }
+ /* The traditional frame pointer location is at the top of the frame. */
+ frame->hard_frame_pointer_offset = offset;
- case ARRAY_TYPE:
- /* Just for use if some languages passes arrays by value. */
- if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
- return true;
- break;
+ /* Register save area */
+ offset += frame->nregs * UNITS_PER_WORD;
+ frame->reg_save_offset = offset;
- default:
- gcc_unreachable ();
- }
- }
- return false;
-}
+ /* On SEH target, registers are pushed just before the frame pointer
+ location. */
+ if (TARGET_SEH)
+ frame->hard_frame_pointer_offset = offset;
-/* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
- XXX: This function is obsolete and is only used for checking psABI
- compatibility with previous versions of GCC. */
+ /* Calculate the size of the va-arg area (not including padding, if any). */
+ frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
-static unsigned int
-ix86_compat_function_arg_boundary (machine_mode mode,
- const_tree type, unsigned int align)
-{
- /* In 32bit, only _Decimal128 and __float128 are aligned to their
- natural boundaries. */
- if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
+ /* Also adjust stack_realign_offset for the largest alignment of
+ stack slot actually used. */
+ if (stack_realign_fp
+ || (cfun->machine->max_used_stack_alignment != 0
+ && (offset % cfun->machine->max_used_stack_alignment) != 0))
{
- /* i386 ABI defines all arguments to be 4 byte aligned. We have to
- make an exception for SSE modes since these require 128bit
- alignment.
-
- The handling here differs from field_alignment. ICC aligns MMX
- arguments to 4 byte boundaries, while structure fields are aligned
- to 8 byte boundaries. */
- if (!type)
- {
- if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
- align = PARM_BOUNDARY;
- }
- else
- {
- if (!ix86_compat_aligned_value_p (type))
- align = PARM_BOUNDARY;
- }
- }
- if (align > BIGGEST_ALIGNMENT)
- align = BIGGEST_ALIGNMENT;
- return align;
-}
-
-/* Return true when TYPE should be 128bit aligned for 32bit argument
- passing ABI. */
-
-static bool
-ix86_contains_aligned_value_p (const_tree type)
-{
- machine_mode mode = TYPE_MODE (type);
-
- if (mode == XFmode || mode == XCmode)
- return false;
-
- if (TYPE_ALIGN (type) < 128)
- return false;
+ /* We may need a 16-byte aligned stack for the remainder of the
+ register save area, but the stack frame for the local function
+ may require a greater alignment if using AVX/2/512. In order
+ to avoid wasting space, we first calculate the space needed for
+ the rest of the register saves, add that to the stack pointer,
+ and then realign the stack to the boundary of the start of the
+ frame for the local function. */
+ HOST_WIDE_INT space_needed = 0;
+ HOST_WIDE_INT sse_reg_space_needed = 0;
- if (AGGREGATE_TYPE_P (type))
- {
- /* Walk the aggregates recursively. */
- switch (TREE_CODE (type))
+ if (TARGET_64BIT)
{
- case RECORD_TYPE:
- case UNION_TYPE:
- case QUAL_UNION_TYPE:
- {
- tree field;
+ if (m->call_ms2sysv)
+ {
+ m->call_ms2sysv_pad_in = 0;
+ space_needed = xlogue_layout::get_instance ().get_stack_space_used ();
+ }
- /* Walk all the structure fields. */
- for (field = TYPE_FIELDS (type);
- field;
- field = DECL_CHAIN (field))
- {
- if (TREE_CODE (field) == FIELD_DECL
- && ix86_contains_aligned_value_p (TREE_TYPE (field)))
- return true;
- }
- break;
- }
+ else if (frame->nsseregs)
+ /* The only ABI that has saved SSE registers (Win64) also has a
+ 16-byte aligned default stack. However, many programs violate
+ the ABI, and Wine64 forces stack realignment to compensate. */
+ space_needed = frame->nsseregs * 16;
- case ARRAY_TYPE:
- /* Just for use if some languages passes arrays by value. */
- if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
- return true;
- break;
+ sse_reg_space_needed = space_needed = ROUND_UP (space_needed, 16);
- default:
- gcc_unreachable ();
+ /* 64-bit frame->va_arg_size should always be a multiple of 16, but
+ round up anyway to be pedantic. */
+ space_needed = ROUND_UP (space_needed + frame->va_arg_size, 16);
}
- }
- else
- return TYPE_ALIGN (type) >= 128;
+ else
+ space_needed = frame->va_arg_size;
- return false;
-}
+ /* Record the allocation size required prior to the realignment AND. */
+ frame->stack_realign_allocate = space_needed;
-/* Gives the alignment boundary, in bits, of an argument with the
- specified mode and type. */
+ /* The re-aligned stack starts at frame->stack_realign_offset. Values
+ before this point are not directly comparable with values below
+ this point. Use sp_valid_at to determine if the stack pointer is
+ valid for a given offset, fp_valid_at for the frame pointer, or
+ choose_baseaddr to have a base register chosen for you.
-static unsigned int
-ix86_function_arg_boundary (machine_mode mode, const_tree type)
-{
- unsigned int align;
- if (type)
- {
- /* Since the main variant type is used for call, we convert it to
- the main variant type. */
- type = TYPE_MAIN_VARIANT (type);
- align = TYPE_ALIGN (type);
- if (TYPE_EMPTY_P (type))
- return PARM_BOUNDARY;
+ Note that the result of (frame->stack_realign_offset
+ & (stack_alignment_needed - 1)) may not equal zero. */
+ offset = ROUND_UP (offset + space_needed, stack_alignment_needed);
+ frame->stack_realign_offset = offset - space_needed;
+ frame->sse_reg_save_offset = frame->stack_realign_offset
+ + sse_reg_space_needed;
}
- else
- align = GET_MODE_ALIGNMENT (mode);
- if (align < PARM_BOUNDARY)
- align = PARM_BOUNDARY;
else
{
- static bool warned;
- unsigned int saved_align = align;
+ frame->stack_realign_offset = offset;
- if (!TARGET_64BIT)
+ if (TARGET_64BIT && m->call_ms2sysv)
{
- /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
- if (!type)
- {
- if (mode == XFmode || mode == XCmode)
- align = PARM_BOUNDARY;
- }
- else if (!ix86_contains_aligned_value_p (type))
- align = PARM_BOUNDARY;
-
- if (align < 128)
- align = PARM_BOUNDARY;
+ m->call_ms2sysv_pad_in = !!(offset & UNITS_PER_WORD);
+ offset += xlogue_layout::get_instance ().get_stack_space_used ();
}
- if (warn_psabi
- && !warned
- && align != ix86_compat_function_arg_boundary (mode, type,
- saved_align))
+ /* Align and set SSE register save area. */
+ else if (frame->nsseregs)
{
- warned = true;
- inform (input_location,
- "the ABI for passing parameters with %d-byte"
- " alignment has changed in GCC 4.6",
- align / BITS_PER_UNIT);
+ /* If the incoming stack boundary is at least 16 bytes, or DRAP is
+ required and the DRAP re-alignment boundary is at least 16 bytes,
+ then we want the SSE register save area properly aligned. */
+ if (ix86_incoming_stack_boundary >= 128
+ || (stack_realign_drap && stack_alignment_needed >= 16))
+ offset = ROUND_UP (offset, 16);
+ offset += frame->nsseregs * 16;
}
+ frame->sse_reg_save_offset = offset;
+ offset += frame->va_arg_size;
}
- return align;
-}
-
-/* Return true if N is a possible register number of function value. */
-
-static bool
-ix86_function_value_regno_p (const unsigned int regno)
-{
- switch (regno)
- {
- case AX_REG:
- return true;
- case DX_REG:
- return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
- case DI_REG:
- case SI_REG:
- return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
+ /* Align the start of the frame for the local function. When a
+ function call is removed, it may become a leaf function. But if
+ arguments may be passed on the stack, we need to align the stack
+ when there is no tail call. */
+ if (m->call_ms2sysv
+ || frame->va_arg_size != 0
+ || size != 0
+ || !crtl->is_leaf
+ || (!crtl->tail_call_emit
+ && cfun->machine->outgoing_args_on_stack)
+ || cfun->calls_alloca
+ || ix86_current_function_calls_tls_descriptor)
+ offset = ROUND_UP (offset, stack_alignment_needed);
- /* Complex values are returned in %st(0)/%st(1) pair. */
- case ST0_REG:
- case ST1_REG:
- /* TODO: The function should depend on current function ABI but
- builtins.c would need updating then. Therefore we use the
- default ABI. */
- if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
- return false;
- return TARGET_FLOAT_RETURNS_IN_80387;
+ /* Frame pointer points here. */
+ frame->frame_pointer_offset = offset;
- /* Complex values are returned in %xmm0/%xmm1 pair. */
- case XMM0_REG:
- case XMM1_REG:
- return TARGET_SSE;
+ offset += size;
- case MM0_REG:
- if (TARGET_MACHO || TARGET_64BIT)
- return false;
- return TARGET_MMX;
+ /* Add the outgoing arguments area. Can be skipped if we eliminated
+ all the function calls as dead code.
+ Skipping is however impossible when the function calls alloca. The
+ alloca expander assumes that the last crtl->outgoing_args_size
+ bytes of the stack frame are unused. */
+ if (ACCUMULATE_OUTGOING_ARGS
+ && (!crtl->is_leaf || cfun->calls_alloca
+ || ix86_current_function_calls_tls_descriptor))
+ {
+ offset += crtl->outgoing_args_size;
+ frame->outgoing_arguments_size = crtl->outgoing_args_size;
}
+ else
+ frame->outgoing_arguments_size = 0;
- return false;
-}
+ /* Align stack boundary. Only needed if we're calling another function
+ or using alloca. */
+ if (!crtl->is_leaf || cfun->calls_alloca
+ || ix86_current_function_calls_tls_descriptor)
+ offset = ROUND_UP (offset, preferred_alignment);
-/* Define how to find the value returned by a function.
- VALTYPE is the data type of the value (as a tree).
- If the precise function being called is known, FUNC is its FUNCTION_DECL;
- otherwise, FUNC is 0. */
+ /* We've reached end of stack frame. */
+ frame->stack_pointer_offset = offset;
-static rtx
-function_value_32 (machine_mode orig_mode, machine_mode mode,
- const_tree fntype, const_tree fn)
-{
- unsigned int regno;
+ /* Size prologue needs to allocate. */
+ to_allocate = offset - frame->sse_reg_save_offset;
- /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
- we normally prevent this case when mmx is not available. However
- some ABIs may require the result to be returned like DImode. */
- if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
- regno = FIRST_MMX_REG;
+ if ((!to_allocate && frame->nregs <= 1)
+ || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000))
+ /* If stack clash probing needs a loop, then it needs a
+ scratch register. But the returned register is only guaranteed
+ to be safe to use after register saves are complete. So if
+ stack clash protections are enabled and the allocated frame is
+ larger than the probe interval, then use pushes to save
+ callee saved registers. */
+ || (flag_stack_clash_protection && to_allocate > get_probe_interval ()))
+ frame->save_regs_using_mov = false;
- /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
- we prevent this case when sse is not available. However some ABIs
- may require the result to be returned like integer TImode. */
- else if (mode == TImode
- || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
- regno = FIRST_SSE_REG;
-
- /* 32-byte vector modes in %ymm0. */
- else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
- regno = FIRST_SSE_REG;
-
- /* 64-byte vector modes in %zmm0. */
- else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
- regno = FIRST_SSE_REG;
-
- /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
- else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
- regno = FIRST_FLOAT_REG;
+ if (ix86_using_red_zone ()
+ && crtl->sp_is_unchanging
+ && crtl->is_leaf
+ && !ix86_pc_thunk_call_expanded
+ && !ix86_current_function_calls_tls_descriptor)
+ {
+ frame->red_zone_size = to_allocate;
+ if (frame->save_regs_using_mov)
+ frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
+ if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
+ frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
+ }
else
- /* Most things go in %eax. */
- regno = AX_REG;
+ frame->red_zone_size = 0;
+ frame->stack_pointer_offset -= frame->red_zone_size;
- /* Override FP return register with %xmm0 for local functions when
- SSE math is enabled or for functions with sseregparm attribute. */
- if ((fn || fntype) && (mode == SFmode || mode == DFmode))
+ /* The SEH frame pointer location is near the bottom of the frame.
+ This is enforced by the fact that the difference between the
+ stack pointer and the frame pointer is limited to 240 bytes in
+ the unwind data structure. */
+ if (TARGET_SEH)
{
- int sse_level = ix86_function_sseregparm (fntype, fn, false);
- if (sse_level == -1)
+ HOST_WIDE_INT diff;
+
+ /* If we can leave the frame pointer where it is, do so. Also, returns
+ the establisher frame for __builtin_frame_address (0). */
+ diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
+ if (diff <= SEH_MAX_FRAME_SIZE
+ && (diff > 240 || (diff & 15) != 0)
+ && !crtl->accesses_prior_frames)
{
- error ("calling %qD with SSE calling convention without "
- "SSE/SSE2 enabled", fn);
- sorry ("this is a GCC bug that can be worked around by adding "
- "attribute used to function called");
+ /* Ideally we'd determine what portion of the local stack frame
+ (within the constraint of the lowest 240) is most heavily used.
+ But without that complication, simply bias the frame pointer
+ by 128 bytes so as to maximize the amount of the local stack
+ frame that is addressable with 8-bit offsets. */
+ frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
}
- else if ((sse_level >= 1 && mode == SFmode)
- || (sse_level == 2 && mode == DFmode))
- regno = FIRST_SSE_REG;
}
-
- /* OImode shouldn't be used directly. */
- gcc_assert (mode != OImode);
-
- return gen_rtx_REG (orig_mode, regno);
}
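+
+/* Illustrative summary of the layout computed above, from the CFA
+ downward (which areas exist and their exact offsets depend on the ABI,
+ options and function): return address (plus error code for exception
+ handlers), optional pushed static chain, optional saved frame pointer,
+ the GP register save area, the stub-managed or SSE register save area,
+ the va_arg register save area, the local frame proper, and finally the
+ outgoing argument area, ending at frame->stack_pointer_offset. */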
-static rtx
-function_value_64 (machine_mode orig_mode, machine_mode mode,
- const_tree valtype)
-{
- rtx ret;
-
- /* Handle libcalls, which don't provide a type node. */
- if (valtype == NULL)
- {
- unsigned int regno;
+/* This is semi-inlined memory_address_length, but simplified
+ since we know that we're always dealing with reg+offset, and
+ to avoid having to create and discard all that rtl. */
- switch (mode)
- {
- case E_SFmode:
- case E_SCmode:
- case E_DFmode:
- case E_DCmode:
- case E_TFmode:
- case E_SDmode:
- case E_DDmode:
- case E_TDmode:
- regno = FIRST_SSE_REG;
- break;
- case E_XFmode:
- case E_XCmode:
- regno = FIRST_FLOAT_REG;
- break;
- case E_TCmode:
- return NULL;
- default:
- regno = AX_REG;
- }
+static inline int
+choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
+{
+ int len = 4;
- return gen_rtx_REG (mode, regno);
- }
- else if (POINTER_TYPE_P (valtype))
+ if (offset == 0)
{
- /* Pointers are always returned in word_mode. */
- mode = word_mode;
+ /* EBP and R13 cannot be encoded without an offset. */
+ len = (regno == BP_REG || regno == R13_REG);
}
+ else if (IN_RANGE (offset, -128, 127))
+ len = 1;
- ret = construct_container (mode, orig_mode, valtype, 1,
- X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
- x86_64_int_return_registers, 0);
-
- /* For zero sized structures, construct_container returns NULL, but we
- need to keep rest of compiler happy by returning meaningful value. */
- if (!ret)
- ret = gen_rtx_REG (orig_mode, AX_REG);
+ /* ESP and R12 must be encoded with a SIB byte. */
+ if (regno == SP_REG || regno == R12_REG)
+ len++;
- return ret;
+ return len;
}
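+
+/* Some illustrative encoding costs under this scheme: (%rax) adds 0
+ bytes, (%rbp) and (%r13) add 1 (a forced disp8), (%rsp) adds 1 (the
+ SIB byte), -8(%rbp) adds 1, and 0x1000(%rax) adds 4; the SIB penalty
+ for %rsp/%r12 is added on top of any displacement bytes. */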
-static rtx
-function_value_ms_32 (machine_mode orig_mode, machine_mode mode,
- const_tree fntype, const_tree fn, const_tree valtype)
-{
- unsigned int regno;
-
- /* Floating point return values in %st(0)
- (unless -mno-fp-ret-in-387 or aggregate type of up to 8 bytes). */
- if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387
- && (GET_MODE_SIZE (mode) > 8
- || valtype == NULL_TREE || !AGGREGATE_TYPE_P (valtype)))
- {
- regno = FIRST_FLOAT_REG;
- return gen_rtx_REG (orig_mode, regno);
- }
- else
- return function_value_32(orig_mode, mode, fntype,fn);
-}
+/* Determine if the stack pointer is valid for accessing the CFA_OFFSET in
+ the frame save area. The register is saved at CFA - CFA_OFFSET. */
-static rtx
-function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
- const_tree valtype)
+static bool
+sp_valid_at (HOST_WIDE_INT cfa_offset)
{
- unsigned int regno = AX_REG;
-
- if (TARGET_SSE)
+ const struct machine_frame_state &fs = cfun->machine->fs;
+ if (fs.sp_realigned && cfa_offset <= fs.sp_realigned_offset)
{
- switch (GET_MODE_SIZE (mode))
- {
- case 16:
- if (valtype != NULL_TREE
- && !VECTOR_INTEGER_TYPE_P (valtype)
- && !VECTOR_INTEGER_TYPE_P (valtype)
- && !INTEGRAL_TYPE_P (valtype)
- && !VECTOR_FLOAT_TYPE_P (valtype))
- break;
- if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
- && !COMPLEX_MODE_P (mode))
- regno = FIRST_SSE_REG;
- break;
- case 8:
- case 4:
- if (valtype != NULL_TREE && AGGREGATE_TYPE_P (valtype))
- break;
- if (mode == SFmode || mode == DFmode)
- regno = FIRST_SSE_REG;
- break;
- default:
- break;
- }
+ /* Validate that the cfa_offset isn't in a "no-man's land". */
+ gcc_assert (cfa_offset <= fs.sp_realigned_fp_last);
+ return false;
}
- return gen_rtx_REG (orig_mode, regno);
+ return fs.sp_valid;
}
-static rtx
-ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
- machine_mode orig_mode, machine_mode mode)
-{
- const_tree fn, fntype;
+/* Determine if the frame pointer is valid for accessing the CFA_OFFSET in
+ the frame save area. The register is saved at CFA - CFA_OFFSET. */
- fn = NULL_TREE;
- if (fntype_or_decl && DECL_P (fntype_or_decl))
- fn = fntype_or_decl;
- fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
-
- if (ix86_function_type_abi (fntype) == MS_ABI)
+static inline bool
+fp_valid_at (HOST_WIDE_INT cfa_offset)
+{
+ const struct machine_frame_state &fs = cfun->machine->fs;
+ if (fs.sp_realigned && cfa_offset > fs.sp_realigned_fp_last)
{
- if (TARGET_64BIT)
- return function_value_ms_64 (orig_mode, mode, valtype);
- else
- return function_value_ms_32 (orig_mode, mode, fntype, fn, valtype);
+ /* Validate that the cfa_offset isn't in a "no-man's land". */
+ gcc_assert (cfa_offset >= fs.sp_realigned_offset);
+ return false;
}
- else if (TARGET_64BIT)
- return function_value_64 (orig_mode, mode, valtype);
- else
- return function_value_32 (orig_mode, mode, fntype, fn);
+ return fs.fp_valid;
}
-static rtx
-ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
-{
- machine_mode mode, orig_mode;
+/* Choose a base register based upon alignment requested, speed and/or
+ size. */
- orig_mode = TYPE_MODE (valtype);
- mode = type_natural_mode (valtype, NULL, true);
- return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
-}
+static void
+choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg,
+ HOST_WIDE_INT &base_offset,
+ unsigned int align_requested, unsigned int *align)
+{
+ const struct machine_function *m = cfun->machine;
+ unsigned int hfp_align;
+ unsigned int drap_align;
+ unsigned int sp_align;
+ bool hfp_ok = fp_valid_at (cfa_offset);
+ bool drap_ok = m->fs.drap_valid;
+ bool sp_ok = sp_valid_at (cfa_offset);
-/* Pointer function arguments and return values are promoted to
- word_mode for normal functions. */
+ hfp_align = drap_align = sp_align = INCOMING_STACK_BOUNDARY;
-static machine_mode
-ix86_promote_function_mode (const_tree type, machine_mode mode,
- int *punsignedp, const_tree fntype,
- int for_return)
-{
- if (cfun->machine->func_type == TYPE_NORMAL
- && type != NULL_TREE
- && POINTER_TYPE_P (type))
+ /* Filter out any registers that don't meet the requested alignment
+ criteria. */
+ if (align_requested)
{
- *punsignedp = POINTERS_EXTEND_UNSIGNED;
- return word_mode;
- }
- return default_promote_function_mode (type, mode, punsignedp, fntype,
- for_return);
-}
-
-/* Return true if a structure, union or array with MODE containing FIELD
- should be accessed using BLKmode. */
-
-static bool
-ix86_member_type_forces_blk (const_tree field, machine_mode mode)
-{
- /* Union with XFmode must be in BLKmode. */
- return (mode == XFmode
- && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
- || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
-}
-
-rtx
-ix86_libcall_value (machine_mode mode)
-{
- return ix86_function_value_1 (NULL, NULL, mode, mode);
-}
-
-/* Return true iff type is returned in memory. */
+ if (m->fs.realigned)
+ hfp_align = drap_align = sp_align = crtl->stack_alignment_needed;
+ /* SEH unwind code does not currently support REG_CFA_EXPRESSION
+ notes (which we would need in order to use a realigned stack
+ pointer), so disable this on SEH targets. */
+ else if (m->fs.sp_realigned)
+ sp_align = crtl->stack_alignment_needed;
-static bool
-ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
-{
-#ifdef SUBTARGET_RETURN_IN_MEMORY
- return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
-#else
- const machine_mode mode = type_natural_mode (type, NULL, true);
- HOST_WIDE_INT size;
+ hfp_ok = hfp_ok && hfp_align >= align_requested;
+ drap_ok = drap_ok && drap_align >= align_requested;
+ sp_ok = sp_ok && sp_align >= align_requested;
+ }
- if (TARGET_64BIT)
+ if (m->use_fast_prologue_epilogue)
{
- if (ix86_function_type_abi (fntype) == MS_ABI)
- {
- size = int_size_in_bytes (type);
-
- /* __m128 is returned in xmm0. */
- if ((!type || VECTOR_INTEGER_TYPE_P (type)
- || INTEGRAL_TYPE_P (type)
- || VECTOR_FLOAT_TYPE_P (type))
- && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
- && !COMPLEX_MODE_P (mode)
- && (GET_MODE_SIZE (mode) == 16 || size == 16))
- return false;
+ /* Choose the base register most likely to allow the most scheduling
+ opportunities. Generally FP is valid throughout the function,
+ while DRAP must be reloaded within the epilogue. But choose either
+ over the SP due to increased encoding size. */
- /* Otherwise, the size must be exactly in [1248]. */
- return size != 1 && size != 2 && size != 4 && size != 8;
+ if (hfp_ok)
+ {
+ base_reg = hard_frame_pointer_rtx;
+ base_offset = m->fs.fp_offset - cfa_offset;
}
- else
+ else if (drap_ok)
{
- int needed_intregs, needed_sseregs;
-
- return examine_argument (mode, type, 1,
- &needed_intregs, &needed_sseregs);
+ base_reg = crtl->drap_reg;
+ base_offset = 0 - cfa_offset;
+ }
+ else if (sp_ok)
+ {
+ base_reg = stack_pointer_rtx;
+ base_offset = m->fs.sp_offset - cfa_offset;
}
}
else
{
- size = int_size_in_bytes (type);
+ HOST_WIDE_INT toffset;
+ int len = 16, tlen;
- /* Intel MCU psABI returns scalars and aggregates no larger than 8
- bytes in registers. */
- if (TARGET_IAMCU)
- return VECTOR_MODE_P (mode) || size < 0 || size > 8;
+ /* Choose the base register with the smallest address encoding.
+ With a tie, choose FP > DRAP > SP. */
+ if (sp_ok)
+ {
+ base_reg = stack_pointer_rtx;
+ base_offset = m->fs.sp_offset - cfa_offset;
+ len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
+ }
+ if (drap_ok)
+ {
+ toffset = 0 - cfa_offset;
+ tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
+ if (tlen <= len)
+ {
+ base_reg = crtl->drap_reg;
+ base_offset = toffset;
+ len = tlen;
+ }
+ }
+ if (hfp_ok)
+ {
+ toffset = m->fs.fp_offset - cfa_offset;
+ tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
+ if (tlen <= len)
+ {
+ base_reg = hard_frame_pointer_rtx;
+ base_offset = toffset;
+ len = tlen;
+ }
+ }
+ }
- if (mode == BLKmode)
- return true;
+ /* Set the align return value. */
+ if (align)
+ {
+ if (base_reg == stack_pointer_rtx)
+ *align = sp_align;
+ else if (base_reg == crtl->drap_reg)
+ *align = drap_align;
+ else if (base_reg == hard_frame_pointer_rtx)
+ *align = hfp_align;
+ }
+}
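+
+/* An illustrative consequence of the size-optimized path above: if the
+ SP-relative displacement of CFA_OFFSET fits in 8 bits while the
+ FP-relative one needs 32, the stack pointer is kept (4 <= 1 fails);
+ with equal lengths the "tlen <= len" tests let the later candidates
+ win, giving the FP > DRAP > SP tie-break described above. */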
- if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
- return false;
+/* Return an RTX that points to CFA_OFFSET within the stack frame and
+ the alignment of address. If ALIGN is non-null, it should point to
+ an alignment value (in bits) that is preferred or zero and will
+ receive the alignment of the base register that was selected,
+ irrespective of whether or not CFA_OFFSET is a multiple of that
+ alignment value. If it is possible for the base register offset to be
+ non-immediate then SCRATCH_REGNO should specify a scratch register to
+ use.
- if (VECTOR_MODE_P (mode) || mode == TImode)
- {
- /* User-created vectors small enough to fit in EAX. */
- if (size < 8)
- return false;
+ The valid base registers are taken from CFUN->MACHINE->FS. */
- /* Unless ABI prescibes otherwise,
- MMX/3dNow values are returned in MM0 if available. */
-
- if (size == 8)
- return TARGET_VECT8_RETURNS || !TARGET_MMX;
+static rtx
+choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align,
+ unsigned int scratch_regno = INVALID_REGNUM)
+{
+ rtx base_reg = NULL;
+ HOST_WIDE_INT base_offset = 0;
- /* SSE values are returned in XMM0 if available. */
- if (size == 16)
- return !TARGET_SSE;
+ /* If a specific alignment is requested, try to get a base register
+ with that alignment first. */
+ if (align && *align)
+ choose_basereg (cfa_offset, base_reg, base_offset, *align, align);
- /* AVX values are returned in YMM0 if available. */
- if (size == 32)
- return !TARGET_AVX;
+ if (!base_reg)
+ choose_basereg (cfa_offset, base_reg, base_offset, 0, align);
- /* AVX512F values are returned in ZMM0 if available. */
- if (size == 64)
- return !TARGET_AVX512F;
- }
+ gcc_assert (base_reg != NULL);
- if (mode == XFmode)
- return false;
+ rtx base_offset_rtx = GEN_INT (base_offset);
- if (size > 12)
- return true;
+ if (!x86_64_immediate_operand (base_offset_rtx, Pmode))
+ {
+ gcc_assert (scratch_regno != INVALID_REGNUM);
- /* OImode shouldn't be used directly. */
- gcc_assert (mode != OImode);
+ rtx scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
+ emit_move_insn (scratch_reg, base_offset_rtx);
- return false;
+ return gen_rtx_PLUS (Pmode, base_reg, scratch_reg);
}
-#endif
+
+ return plus_constant (Pmode, base_reg, base_offset);
}
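+
+/* Usage sketch (illustrative): ix86_emit_save_reg_using_mov below calls
+ choose_baseaddr (cfa_offset, &align) and builds a frame MEM whose
+ alignment is capped by the returned ALIGN; when the resulting offset
+ is not a valid immediate operand, callers must pass a usable
+ SCRATCH_REGNO so the offset can first be loaded into a register (see
+ the gcc_assert above). */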
-\f
-/* Create the va_list data type. */
+/* Emit code to save registers in the prologue. */
-static tree
-ix86_build_builtin_va_list_64 (void)
+static void
+ix86_emit_save_regs (void)
{
- tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
-
- record = lang_hooks.types.make_type (RECORD_TYPE);
- type_decl = build_decl (BUILTINS_LOCATION,
- TYPE_DECL, get_identifier ("__va_list_tag"), record);
+ unsigned int regno;
+ rtx_insn *insn;
- f_gpr = build_decl (BUILTINS_LOCATION,
- FIELD_DECL, get_identifier ("gp_offset"),
- unsigned_type_node);
- f_fpr = build_decl (BUILTINS_LOCATION,
- FIELD_DECL, get_identifier ("fp_offset"),
- unsigned_type_node);
- f_ovf = build_decl (BUILTINS_LOCATION,
- FIELD_DECL, get_identifier ("overflow_arg_area"),
- ptr_type_node);
- f_sav = build_decl (BUILTINS_LOCATION,
- FIELD_DECL, get_identifier ("reg_save_area"),
- ptr_type_node);
+ for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
+ if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
+ {
+ insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ }
+}
- va_list_gpr_counter_field = f_gpr;
- va_list_fpr_counter_field = f_fpr;
+/* Emit a single register save at CFA - CFA_OFFSET. */
- DECL_FIELD_CONTEXT (f_gpr) = record;
- DECL_FIELD_CONTEXT (f_fpr) = record;
- DECL_FIELD_CONTEXT (f_ovf) = record;
- DECL_FIELD_CONTEXT (f_sav) = record;
+static void
+ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
+ HOST_WIDE_INT cfa_offset)
+{
+ struct machine_function *m = cfun->machine;
+ rtx reg = gen_rtx_REG (mode, regno);
+ rtx mem, addr, base, insn;
+ unsigned int align = GET_MODE_ALIGNMENT (mode);
- TYPE_STUB_DECL (record) = type_decl;
- TYPE_NAME (record) = type_decl;
- TYPE_FIELDS (record) = f_gpr;
- DECL_CHAIN (f_gpr) = f_fpr;
- DECL_CHAIN (f_fpr) = f_ovf;
- DECL_CHAIN (f_ovf) = f_sav;
+ addr = choose_baseaddr (cfa_offset, &align);
+ mem = gen_frame_mem (mode, addr);
- layout_type (record);
+ /* The location alignment depends upon the base register. */
+ align = MIN (GET_MODE_ALIGNMENT (mode), align);
+ gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
+ set_mem_align (mem, align);
- TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
- NULL_TREE, TYPE_ATTRIBUTES (record));
+ insn = emit_insn (gen_rtx_SET (mem, reg));
+ RTX_FRAME_RELATED_P (insn) = 1;
- /* The correct type is an array type of one element. */
- return build_array_type (record, build_index_type (size_zero_node));
-}
-
-/* Setup the builtin va_list data type and for 64-bit the additional
- calling convention specific va_list data types. */
+ base = addr;
+ if (GET_CODE (base) == PLUS)
+ base = XEXP (base, 0);
+ gcc_checking_assert (REG_P (base));
-static tree
-ix86_build_builtin_va_list (void)
-{
- if (TARGET_64BIT)
+ /* When saving registers into a re-aligned local stack frame, avoid
+ any tricky guessing by dwarf2out. */
+ if (m->fs.realigned)
{
- /* Initialize ABI specific va_list builtin types.
-
- In lto1, we can encounter two va_list types:
- - one as a result of the type-merge across TUs, and
- - the one constructed here.
- These two types will not have the same TYPE_MAIN_VARIANT, and therefore
- a type identity check in canonical_va_list_type based on
- TYPE_MAIN_VARIANT (which we used to have) will not work.
- Instead, we tag each va_list_type_node with its unique attribute, and
- look for the attribute in the type identity check in
- canonical_va_list_type.
-
- Tagging sysv_va_list_type_node directly with the attribute is
- problematic since it's a array of one record, which will degrade into a
- pointer to record when used as parameter (see build_va_arg comments for
- an example), dropping the attribute in the process. So we tag the
- record instead. */
+ gcc_checking_assert (stack_realign_drap);
- /* For SYSV_ABI we use an array of one record. */
- sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
-
- /* For MS_ABI we use plain pointer to argument area. */
- tree char_ptr_type = build_pointer_type (char_type_node);
- tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
- TYPE_ATTRIBUTES (char_ptr_type));
- ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
+ if (regno == REGNO (crtl->drap_reg))
+ {
+ /* A bit of a hack. We force the DRAP register to be saved in
+ the re-aligned stack frame, which provides us with a copy
+ of the CFA that will last past the prologue. Install it. */
+ gcc_checking_assert (cfun->machine->fs.fp_valid);
+ addr = plus_constant (Pmode, hard_frame_pointer_rtx,
+ cfun->machine->fs.fp_offset - cfa_offset);
+ mem = gen_rtx_MEM (mode, addr);
+ add_reg_note (insn, REG_CFA_DEF_CFA, mem);
+ }
+ else
+ {
+ /* The frame pointer is a stable reference within the
+ aligned frame. Use it. */
+ gcc_checking_assert (cfun->machine->fs.fp_valid);
+ addr = plus_constant (Pmode, hard_frame_pointer_rtx,
+ cfun->machine->fs.fp_offset - cfa_offset);
+ mem = gen_rtx_MEM (mode, addr);
+ add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
+ }
+ }
- return ((ix86_abi == MS_ABI)
- ? ms_va_list_type_node
- : sysv_va_list_type_node);
+ else if (base == stack_pointer_rtx && m->fs.sp_realigned
+ && cfa_offset >= m->fs.sp_realigned_offset)
+ {
+ gcc_checking_assert (stack_realign_fp);
+ add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
}
- else
+
+ /* The memory may not be relative to the current CFA register,
+ which means that we may need to generate a new pattern for
+ use by the unwind info. */
+ else if (base != m->fs.cfa_reg)
{
- /* For i386 we use plain pointer to argument area. */
- return build_pointer_type (char_type_node);
+ addr = plus_constant (Pmode, m->fs.cfa_reg,
+ m->fs.cfa_offset - cfa_offset);
+ mem = gen_rtx_MEM (mode, addr);
+ add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
}
}
-/* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
-
+/* Emit code to save registers using MOV insns.
+ First register is stored at CFA - CFA_OFFSET. */
static void
-setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
+ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
{
- rtx save_area, mem;
- alias_set_type set;
- int i, max;
+ unsigned int regno;
- /* GPR size of varargs save area. */
- if (cfun->va_list_gpr_size)
- ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
- else
- ix86_varargs_gpr_size = 0;
+ for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
+ {
+ ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
+ cfa_offset -= UNITS_PER_WORD;
+ }
+}
- /* FPR size of varargs save area. We don't need it if we don't pass
- anything in SSE registers. */
- if (TARGET_SSE && cfun->va_list_fpr_size)
- ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
- else
- ix86_varargs_fpr_size = 0;
+/* Emit code to save SSE registers using MOV insns.
+ First register is stored at CFA - CFA_OFFSET. */
+static void
+ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
+{
+ unsigned int regno;
- if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
- return;
+ for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
+ {
+ ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
+ cfa_offset -= GET_MODE_SIZE (V4SFmode);
+ }
+}
- save_area = frame_pointer_rtx;
- set = get_varargs_alias_set ();
+static GTY(()) rtx queued_cfa_restores;
- max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
- if (max > X86_64_REGPARM_MAX)
- max = X86_64_REGPARM_MAX;
+/* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
+ manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
+ Don't add the note if the previously saved value will be left untouched
+ within the stack red-zone until return, as unwinders can find the same
+ value in the register and on the stack. */
- for (i = cum->regno; i < max; i++)
- {
- mem = gen_rtx_MEM (word_mode,
- plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
- MEM_NOTRAP_P (mem) = 1;
- set_mem_alias_set (mem, set);
- emit_move_insn (mem,
- gen_rtx_REG (word_mode,
- x86_64_int_parameter_registers[i]));
- }
+static void
+ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
+{
+ if (!crtl->shrink_wrapped
+ && cfa_offset <= cfun->machine->fs.red_zone_offset)
+ return;
- if (ix86_varargs_fpr_size)
+ if (insn)
{
- machine_mode smode;
- rtx_code_label *label;
- rtx test;
+ add_reg_note (insn, REG_CFA_RESTORE, reg);
+ RTX_FRAME_RELATED_P (insn) = 1;
+ }
+ else
+ queued_cfa_restores
+ = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
+}
- /* Now emit code to save SSE registers. The AX parameter contains number
- of SSE parameter registers used to call this function, though all we
- actually check here is the zero/non-zero status. */
+/* Add queued REG_CFA_RESTORE notes if any to INSN. */
- label = gen_label_rtx ();
- test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
- emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
- label));
+static void
+ix86_add_queued_cfa_restore_notes (rtx insn)
+{
+ rtx last;
+ if (!queued_cfa_restores)
+ return;
+ for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
+ ;
+ XEXP (last, 1) = REG_NOTES (insn);
+ REG_NOTES (insn) = queued_cfa_restores;
+ queued_cfa_restores = NULL_RTX;
+ RTX_FRAME_RELATED_P (insn) = 1;
+}
- /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
- we used movdqa (i.e. TImode) instead? Perhaps even better would
- be if we could determine the real mode of the data, via a hook
- into pass_stdarg. Ignore all that for now. */
- smode = V4SFmode;
- if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
- crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
+/* Expand prologue or epilogue stack adjustment.
+ The pattern exists to put a dependency on all ebp-based memory accesses.
+ STYLE should be negative if instructions should be marked as frame related,
+ zero if the %r11 register is live and cannot be freely used, and positive
+ otherwise. */
- max = cum->sse_regno + cfun->va_list_fpr_size / 16;
- if (max > X86_64_SSE_REGPARM_MAX)
- max = X86_64_SSE_REGPARM_MAX;
+static rtx
+pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
+ int style, bool set_cfa)
+{
+ struct machine_function *m = cfun->machine;
+ rtx insn;
+ bool add_frame_related_expr = false;
- for (i = cum->sse_regno; i < max; ++i)
+ if (Pmode == SImode)
+ insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
+ else if (x86_64_immediate_operand (offset, DImode))
+ insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
+ else
+ {
+ rtx tmp;
+ /* r11 is used by indirect sibcall return as well, set before the
+ epilogue and used after the epilogue. */
+ if (style)
+ tmp = gen_rtx_REG (DImode, R11_REG);
+ else
{
- mem = plus_constant (Pmode, save_area,
- i * 16 + ix86_varargs_gpr_size);
- mem = gen_rtx_MEM (smode, mem);
- MEM_NOTRAP_P (mem) = 1;
- set_mem_alias_set (mem, set);
- set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
-
- emit_move_insn (mem, gen_rtx_REG (smode, GET_SSE_REGNO (i)));
+ gcc_assert (src != hard_frame_pointer_rtx
+ && dest != hard_frame_pointer_rtx);
+ tmp = hard_frame_pointer_rtx;
}
+ insn = emit_insn (gen_rtx_SET (tmp, offset));
+ if (style < 0)
+ add_frame_related_expr = true;
- emit_label (label);
+ insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
}
-}
-
-static void
-setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
-{
- alias_set_type set = get_varargs_alias_set ();
- int i;
- /* Reset to zero, as there might be a sysv vaarg used
- before. */
- ix86_varargs_gpr_size = 0;
- ix86_varargs_fpr_size = 0;
+ insn = emit_insn (insn);
+ if (style >= 0)
+ ix86_add_queued_cfa_restore_notes (insn);
- for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
+ if (set_cfa)
{
- rtx reg, mem;
+ rtx r;
- mem = gen_rtx_MEM (Pmode,
- plus_constant (Pmode, virtual_incoming_args_rtx,
- i * UNITS_PER_WORD));
- MEM_NOTRAP_P (mem) = 1;
- set_mem_alias_set (mem, set);
+ gcc_assert (m->fs.cfa_reg == src);
+ m->fs.cfa_offset += INTVAL (offset);
+ m->fs.cfa_reg = dest;
- reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
- emit_move_insn (mem, reg);
+ r = gen_rtx_PLUS (Pmode, src, offset);
+ r = gen_rtx_SET (dest, r);
+ add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
+ RTX_FRAME_RELATED_P (insn) = 1;
+ }
+ else if (style < 0)
+ {
+ RTX_FRAME_RELATED_P (insn) = 1;
+ if (add_frame_related_expr)
+ {
+ rtx r = gen_rtx_PLUS (Pmode, src, offset);
+ r = gen_rtx_SET (dest, r);
+ add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
+ }
}
-}
-static void
-ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
- tree type, int *, int no_rtl)
-{
- CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
- CUMULATIVE_ARGS next_cum;
- tree fntype;
+ if (dest == stack_pointer_rtx)
+ {
+ HOST_WIDE_INT ooffset = m->fs.sp_offset;
+ bool valid = m->fs.sp_valid;
+ bool realigned = m->fs.sp_realigned;
- /* This argument doesn't appear to be used anymore. Which is good,
- because the old code here didn't suppress rtl generation. */
- gcc_assert (!no_rtl);
+ if (src == hard_frame_pointer_rtx)
+ {
+ valid = m->fs.fp_valid;
+ realigned = false;
+ ooffset = m->fs.fp_offset;
+ }
+ else if (src == crtl->drap_reg)
+ {
+ valid = m->fs.drap_valid;
+ realigned = false;
+ ooffset = 0;
+ }
+ else
+ {
+ /* Else there are two possibilities: SP itself, which we set
+ up as the default above, or EH_RETURN_STACKADJ_RTX, which is
+ handled by hand along the eh_return path. */
+ gcc_checking_assert (src == stack_pointer_rtx
+ || offset == const0_rtx);
+ }
- if (!TARGET_64BIT)
- return;
+ m->fs.sp_offset = ooffset - INTVAL (offset);
+ m->fs.sp_valid = valid;
+ m->fs.sp_realigned = realigned;
+ }
+ return insn;
+}
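+
+/* Illustrative call with hypothetical values: a prologue allocating
+ "allocate" bytes would typically use
+ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+ GEN_INT (-allocate), -1,
+ m->fs.cfa_reg == stack_pointer_rtx);
+ i.e. a negative STYLE so the insn is frame related, and SET_CFA only
+ while the CFA is still tracked through the stack pointer. */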
- fntype = TREE_TYPE (current_function_decl);
+/* Find an available register to be used as the dynamic realign argument
+ pointer register. Such a register will be written in the prologue and
+ used at the beginning of the body, so it must not be
+ 1. a parameter passing register.
+ 2. the GOT pointer.
+ We reuse the static-chain register if it is available. Otherwise, we
+ use DI for i386 and R13 for x86-64. We chose R13 since it has a
+ shorter encoding.
- /* For varargs, we do not want to skip the dummy va_dcl argument.
- For stdargs, we do want to skip the last named argument. */
- next_cum = *cum;
- if (stdarg_p (fntype))
- ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
- true);
+ Return: the regno of the chosen register. */
- if (cum->call_abi == MS_ABI)
- setup_incoming_varargs_ms_64 (&next_cum);
+static unsigned int
+find_drap_reg (void)
+{
+ tree decl = cfun->decl;
+
+ /* Always use callee-saved register if there are no caller-saved
+ registers. */
+ if (TARGET_64BIT)
+ {
+ /* Use R13 for nested functions or functions that need a static chain.
+ Since a function with a tail call may use any caller-saved
+ register in the epilogue, DRAP must not use a caller-saved
+ register in that case. */
+ if (DECL_STATIC_CHAIN (decl)
+ || cfun->machine->no_caller_saved_registers
+ || crtl->tail_call_emit)
+ return R13_REG;
+
+ return R10_REG;
+ }
else
- setup_incoming_varargs_64 (&next_cum);
+ {
+ /* Use DI for nested functions or functions that need a static chain.
+ Since a function with a tail call may use any caller-saved
+ register in the epilogue, DRAP must not use a caller-saved
+ register in that case. */
+ if (DECL_STATIC_CHAIN (decl)
+ || cfun->machine->no_caller_saved_registers
+ || crtl->tail_call_emit)
+ return DI_REG;
+
+ /* Reuse static chain register if it isn't used for parameter
+ passing. */
+ if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
+ {
+ unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
+ if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
+ return CX_REG;
+ }
+ return DI_REG;
+ }
}
-static void
-ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
- machine_mode mode,
- tree type,
- int *pretend_size ATTRIBUTE_UNUSED,
- int no_rtl)
+/* Return minimum incoming stack alignment. */
+
+static unsigned int
+ix86_minimum_incoming_stack_boundary (bool sibcall)
{
- CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
- CUMULATIVE_ARGS next_cum;
- tree fntype;
- int max;
+ unsigned int incoming_stack_boundary;
- gcc_assert (!no_rtl);
+ /* The stack of an interrupt handler is aligned to 128 bits in 64-bit mode. */
+ if (cfun->machine->func_type != TYPE_NORMAL)
+ incoming_stack_boundary = TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY;
+ /* Prefer the one specified at command line. */
+ else if (ix86_user_incoming_stack_boundary)
+ incoming_stack_boundary = ix86_user_incoming_stack_boundary;
+ /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack
+ boundary if -mstackrealign is used, this is not a sibcall check,
+ and the estimated stack alignment is 128 bits. */
+ else if (!sibcall
+ && ix86_force_align_arg_pointer
+ && crtl->stack_alignment_estimated == 128)
+ incoming_stack_boundary = MIN_STACK_BOUNDARY;
+ else
+ incoming_stack_boundary = ix86_default_incoming_stack_boundary;
- /* Do nothing if we use plain pointer to argument area. */
- if (!TARGET_64BIT || cum->call_abi == MS_ABI)
- return;
+ /* Incoming stack alignment can be changed on individual functions
+ via force_align_arg_pointer attribute. We use the smallest
+ incoming stack boundary. */
+ if (incoming_stack_boundary > MIN_STACK_BOUNDARY
+ && lookup_attribute ("force_align_arg_pointer",
+ TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
+ incoming_stack_boundary = MIN_STACK_BOUNDARY;
- fntype = TREE_TYPE (current_function_decl);
+ /* The incoming stack frame has to be aligned at least at
+ parm_stack_boundary. */
+ if (incoming_stack_boundary < crtl->parm_stack_boundary)
+ incoming_stack_boundary = crtl->parm_stack_boundary;
- /* For varargs, we do not want to skip the dummy va_dcl argument.
- For stdargs, we do want to skip the last named argument. */
- next_cum = *cum;
- if (stdarg_p (fntype))
- ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
- true);
+ /* The stack at the entry of main is aligned by the runtime. We use
+ the smallest incoming stack boundary. */
+ if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
+ && DECL_NAME (current_function_decl)
+ && MAIN_NAME_P (DECL_NAME (current_function_decl))
+ && DECL_FILE_SCOPE_P (current_function_decl))
+ incoming_stack_boundary = MAIN_STACK_BOUNDARY;
- max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
- if (max > X86_64_REGPARM_MAX)
- max = X86_64_REGPARM_MAX;
+ return incoming_stack_boundary;
}
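+
+/* For example (illustrative; defaults vary by target and options): a
+ normal 64-bit function with no relevant attribute or command-line
+ override typically ends up with the 128-bit (16-byte) default here,
+ while the force_align_arg_pointer attribute and main can lower the
+ result toward MIN_STACK_BOUNDARY / MAIN_STACK_BOUNDARY as handled
+ above. */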
+/* Update incoming stack boundary and estimated stack alignment. */
-/* Checks if TYPE is of kind va_list char *. */
-
-static bool
-is_va_list_char_pointer (tree type)
+static void
+ix86_update_stack_boundary (void)
{
- tree canonic;
+ ix86_incoming_stack_boundary
+ = ix86_minimum_incoming_stack_boundary (false);
- /* For 32-bit it is always true. */
- if (!TARGET_64BIT)
- return true;
- canonic = ix86_canonical_va_list_type (type);
- return (canonic == ms_va_list_type_node
- || (ix86_abi == MS_ABI && canonic == va_list_type_node));
+ /* x86_64 varargs need 16-byte stack alignment for the register save area. */
+ if (TARGET_64BIT
+ && cfun->stdarg
+ && crtl->stack_alignment_estimated < 128)
+ crtl->stack_alignment_estimated = 128;
+
+ /* __tls_get_addr needs to be called with 16-byte aligned stack. */
+ if (ix86_tls_descriptor_calls_expanded_in_cfun
+ && crtl->preferred_stack_boundary < 128)
+ crtl->preferred_stack_boundary = 128;
}
-/* Implement va_start. */
+/* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
+ needed or an rtx for DRAP otherwise. */
-static void
-ix86_va_start (tree valist, rtx nextarg)
+static rtx
+ix86_get_drap_rtx (void)
{
- HOST_WIDE_INT words, n_gpr, n_fpr;
- tree f_gpr, f_fpr, f_ovf, f_sav;
- tree gpr, fpr, ovf, sav, t;
- tree type;
- rtx ovf_rtx;
+ /* We must use DRAP if there are outgoing arguments on stack and
+ ACCUMULATE_OUTGOING_ARGS is false. */
+ if (ix86_force_drap
+ || (cfun->machine->outgoing_args_on_stack
+ && !ACCUMULATE_OUTGOING_ARGS))
+ crtl->need_drap = true;
- if (flag_split_stack
- && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
+ if (stack_realign_drap)
{
- unsigned int scratch_regno;
-
- /* When we are splitting the stack, we can't refer to the stack
- arguments using internal_arg_pointer, because they may be on
- the old stack. The split stack prologue will arrange to
- leave a pointer to the old stack arguments in a scratch
- register, which we here copy to a pseudo-register. The split
- stack prologue can't set the pseudo-register directly because
- it (the prologue) runs before any registers have been saved. */
+ /* Assign DRAP to vDRAP and return vDRAP. */
+ unsigned int regno = find_drap_reg ();
+ rtx drap_vreg;
+ rtx arg_ptr;
+ rtx_insn *seq, *insn;
- scratch_regno = split_stack_prologue_scratch_regno ();
- if (scratch_regno != INVALID_REGNUM)
- {
- rtx reg;
- rtx_insn *seq;
+ arg_ptr = gen_rtx_REG (Pmode, regno);
+ crtl->drap_reg = arg_ptr;
- reg = gen_reg_rtx (Pmode);
- cfun->machine->split_stack_varargs_pointer = reg;
-
- start_sequence ();
- emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
- seq = get_insns ();
- end_sequence ();
+ start_sequence ();
+ drap_vreg = copy_to_reg (arg_ptr);
+ seq = get_insns ();
+ end_sequence ();
- push_topmost_sequence ();
- emit_insn_after (seq, entry_of_function ());
- pop_topmost_sequence ();
+ insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
+ if (!optimize)
+ {
+ add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
+ RTX_FRAME_RELATED_P (insn) = 1;
}
+ return drap_vreg;
}
+ else
+ return NULL;
+}
- /* Only 64bit target needs something special. */
- if (is_va_list_char_pointer (TREE_TYPE (valist)))
- {
- if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
- std_expand_builtin_va_start (valist, nextarg);
- else
- {
- rtx va_r, next;
+/* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
- va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
- next = expand_binop (ptr_mode, add_optab,
- cfun->machine->split_stack_varargs_pointer,
- crtl->args.arg_offset_rtx,
- NULL_RTX, 0, OPTAB_LIB_WIDEN);
- convert_move (va_r, next, 0);
- }
- return;
- }
+static rtx
+ix86_internal_arg_pointer (void)
+{
+ return virtual_incoming_args_rtx;
+}
- f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
- f_fpr = DECL_CHAIN (f_gpr);
- f_ovf = DECL_CHAIN (f_fpr);
- f_sav = DECL_CHAIN (f_ovf);
+struct scratch_reg {
+ rtx reg;
+ bool saved;
+};
- valist = build_simple_mem_ref (valist);
- TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
- /* The following should be folded into the MEM_REF offset. */
- gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
- f_gpr, NULL_TREE);
- fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
- f_fpr, NULL_TREE);
- ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
- f_ovf, NULL_TREE);
- sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
- f_sav, NULL_TREE);
+/* Return a short-lived scratch register for use on function entry.
+ In 32-bit mode, it is valid only after the registers are saved
+ in the prologue. This register must be released by means of
+ release_scratch_register_on_entry once it is dead. */
- /* Count number of gp and fp argument registers used. */
- words = crtl->args.info.words;
- n_gpr = crtl->args.info.regno;
- n_fpr = crtl->args.info.sse_regno;
+static void
+get_scratch_register_on_entry (struct scratch_reg *sr)
+{
+ int regno;
- if (cfun->va_list_gpr_size)
+ sr->saved = false;
+
+ if (TARGET_64BIT)
{
- type = TREE_TYPE (gpr);
- t = build2 (MODIFY_EXPR, type,
- gpr, build_int_cst (type, n_gpr * 8));
- TREE_SIDE_EFFECTS (t) = 1;
- expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
+ /* We always use R11 in 64-bit mode. */
+ regno = R11_REG;
}
+ else
+ {
+ tree decl = current_function_decl, fntype = TREE_TYPE (decl);
+ bool fastcall_p
+ = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
+ bool thiscall_p
+ = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
+ bool static_chain_p = DECL_STATIC_CHAIN (decl);
+ int regparm = ix86_function_regparm (fntype, decl);
+ int drap_regno
+ = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
- if (TARGET_SSE && cfun->va_list_fpr_size)
+ /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
+ for the static chain register. */
+ if ((regparm < 1 || (fastcall_p && !static_chain_p))
+ && drap_regno != AX_REG)
+ regno = AX_REG;
+ /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
+ for the static chain register. */
+ else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
+ regno = AX_REG;
+ else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
+ regno = DX_REG;
+ /* ecx is the static chain register. */
+ else if (regparm < 3 && !fastcall_p && !thiscall_p
+ && !static_chain_p
+ && drap_regno != CX_REG)
+ regno = CX_REG;
+ else if (ix86_save_reg (BX_REG, true, false))
+ regno = BX_REG;
+ /* esi is the static chain register. */
+ else if (!(regparm == 3 && static_chain_p)
+ && ix86_save_reg (SI_REG, true, false))
+ regno = SI_REG;
+ else if (ix86_save_reg (DI_REG, true, false))
+ regno = DI_REG;
+ else
+ {
+ regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
+ sr->saved = true;
+ }
+ }
+
+ sr->reg = gen_rtx_REG (Pmode, regno);
+ if (sr->saved)
{
- type = TREE_TYPE (fpr);
- t = build2 (MODIFY_EXPR, type, fpr,
- build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
- TREE_SIDE_EFFECTS (t) = 1;
- expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
+ rtx_insn *insn = emit_insn (gen_push (sr->reg));
+ RTX_FRAME_RELATED_P (insn) = 1;
}
+}
- /* Find the overflow area. */
- type = TREE_TYPE (ovf);
- if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
- ovf_rtx = crtl->args.internal_arg_pointer;
- else
- ovf_rtx = cfun->machine->split_stack_varargs_pointer;
- t = make_tree (type, ovf_rtx);
- if (words != 0)
- t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
+/* Release a scratch register obtained from the preceding function.
- t = build2 (MODIFY_EXPR, type, ovf, t);
- TREE_SIDE_EFFECTS (t) = 1;
- expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
+ If RELEASE_VIA_POP is true, we just pop the register off the stack
+ to release it. This is what non-Linux systems use with -fstack-check.
- if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
+ Otherwise we use OFFSET to locate the saved register and the
+ allocated stack space becomes part of the local frame and is
+ deallocated by the epilogue. */
+
+static void
+release_scratch_register_on_entry (struct scratch_reg *sr, HOST_WIDE_INT offset,
+ bool release_via_pop)
+{
+ if (sr->saved)
{
- /* Find the register save area.
- Prologue of the function save it right above stack frame. */
- type = TREE_TYPE (sav);
- t = make_tree (type, frame_pointer_rtx);
- if (!ix86_varargs_gpr_size)
- t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
+ if (release_via_pop)
+ {
+ struct machine_function *m = cfun->machine;
+ rtx x, insn = emit_insn (gen_pop (sr->reg));
- t = build2 (MODIFY_EXPR, type, sav, t);
- TREE_SIDE_EFFECTS (t) = 1;
- expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
+ /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
+ RTX_FRAME_RELATED_P (insn) = 1;
+ x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
+ x = gen_rtx_SET (stack_pointer_rtx, x);
+ add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
+ m->fs.sp_offset -= UNITS_PER_WORD;
+ }
+ else
+ {
+ rtx x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (offset));
+ x = gen_rtx_SET (sr->reg, gen_rtx_MEM (word_mode, x));
+ emit_insn (x);
+ }
}
}
-/* Implement va_arg. */
+/* Emit code to adjust the stack pointer by SIZE bytes while probing it.
-static tree
-ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
- gimple_seq *post_p)
-{
- static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
- tree f_gpr, f_fpr, f_ovf, f_sav;
- tree gpr, fpr, ovf, sav, t;
- int size, rsize;
- tree lab_false, lab_over = NULL_TREE;
- tree addr, t2;
- rtx container;
- int indirect_p = 0;
- tree ptrtype;
- machine_mode nat_mode;
- unsigned int arg_boundary;
+ This differs from the next routine in that it tries hard to prevent
+ attacks that jump the stack guard. Thus it is never allowed to allocate
+ more than PROBE_INTERVAL bytes of stack space without a suitable
+ probe.
- /* Only 64bit target needs something special. */
- if (is_va_list_char_pointer (TREE_TYPE (valist)))
- return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
+ INT_REGISTERS_SAVED is true if integer registers have already been
+ pushed on the stack. */
- f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
- f_fpr = DECL_CHAIN (f_gpr);
- f_ovf = DECL_CHAIN (f_fpr);
- f_sav = DECL_CHAIN (f_ovf);
+static void
+ix86_adjust_stack_and_probe_stack_clash (HOST_WIDE_INT size,
+ const bool int_registers_saved)
+{
+ struct machine_function *m = cfun->machine;
- gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
- valist, f_gpr, NULL_TREE);
+ /* If this function does not statically allocate stack space, then
+ no probes are needed. */
+ if (!size)
+ {
+ /* However, the allocation of space via pushes for register
+ saves could be viewed as allocating space, but without the
+ need to probe. */
+ if (m->frame.nregs || m->frame.nsseregs || frame_pointer_needed)
+ dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
+ else
+ dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
+ return;
+ }
- fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
- ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
- sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
+ /* If we are a noreturn function, then we have to consider the
+ possibility that we're called via a jump rather than a call.
- indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
- if (indirect_p)
- type = build_pointer_type (type);
- size = arg_int_size_in_bytes (type);
- rsize = CEIL (size, UNITS_PER_WORD);
+ Thus we don't have the implicit probe generated by saving the
+ return address into the stack at the call. Thus, the stack
+ pointer could be anywhere in the guard page. The safe thing
+ to do is emit a probe now.
- nat_mode = type_natural_mode (type, NULL, false);
- switch (nat_mode)
+ The probe can be avoided if we have already emitted any callee
+ register saves into the stack or have a frame pointer (which will
+ have been saved as well). Those saves will function as implicit
+ probes.
+
+ ?!? This should be revamped to work like aarch64 and s390 where
+ we track the offset from the most recent probe. Normally that
+ offset would be zero. For a noreturn function we would reset
+ it to PROBE_INTERVAL - (STACK_BOUNDARY / BITS_PER_UNIT). Then
+ we just probe when we cross PROBE_INTERVAL. */
+ if (TREE_THIS_VOLATILE (cfun->decl)
+ && !(m->frame.nregs || m->frame.nsseregs || frame_pointer_needed))
{
- case E_V8SFmode:
- case E_V8SImode:
- case E_V32QImode:
- case E_V16HImode:
- case E_V4DFmode:
- case E_V4DImode:
- case E_V16SFmode:
- case E_V16SImode:
- case E_V64QImode:
- case E_V32HImode:
- case E_V8DFmode:
- case E_V8DImode:
- /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */
- if (!TARGET_64BIT_MS_ABI)
+ /* We can safely use any register here since we're just going to push
+ its value and immediately pop it back. But we do try and avoid
+ argument passing registers so as not to introduce dependencies in
+ the pipeline. For 32 bit we use %esi and for 64 bit we use %rax. */
+ rtx dummy_reg = gen_rtx_REG (word_mode, TARGET_64BIT ? AX_REG : SI_REG);
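+ /* The push itself writes the word just below the incoming SP and
+ thus serves as the probe; the pop immediately releases that word. */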
+ rtx_insn *insn_push = emit_insn (gen_push (dummy_reg));
+ rtx_insn *insn_pop = emit_insn (gen_pop (dummy_reg));
+ m->fs.sp_offset -= UNITS_PER_WORD;
+ if (m->fs.cfa_reg == stack_pointer_rtx)
{
- container = NULL;
- break;
+ m->fs.cfa_offset -= UNITS_PER_WORD;
+ rtx x = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
+ x = gen_rtx_SET (stack_pointer_rtx, x);
+ add_reg_note (insn_push, REG_CFA_ADJUST_CFA, x);
+ RTX_FRAME_RELATED_P (insn_push) = 1;
+ x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
+ x = gen_rtx_SET (stack_pointer_rtx, x);
+ add_reg_note (insn_pop, REG_CFA_ADJUST_CFA, x);
+ RTX_FRAME_RELATED_P (insn_pop) = 1;
}
- /* FALLTHRU */
-
- default:
- container = construct_container (nat_mode, TYPE_MODE (type),
- type, 0, X86_64_REGPARM_MAX,
- X86_64_SSE_REGPARM_MAX, intreg,
- 0);
- break;
+ emit_insn (gen_blockage ());
}
- /* Pull the value out of the saved registers. */
-
- addr = create_tmp_var (ptr_type_node, "addr");
+ /* If we allocate less than the size of the guard statically,
+ then no probing is necessary, but we do need to allocate
+ the stack. */
+ if (size < (1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE)))
+ {
+ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+ GEN_INT (-size), -1,
+ m->fs.cfa_reg == stack_pointer_rtx);
+ dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
+ return;
+ }
- if (container)
+ /* We're allocating a large enough stack frame that we need to
+ emit probes. Either emit them inline or in a loop depending
+ on the size. */
+ HOST_WIDE_INT probe_interval = get_probe_interval ();
+ if (size <= 4 * probe_interval)
{
- int needed_intregs, needed_sseregs;
- bool need_temp;
- tree int_addr, sse_addr;
+ HOST_WIDE_INT i;
+ for (i = probe_interval; i <= size; i += probe_interval)
+ {
+ /* Allocate PROBE_INTERVAL bytes. */
+ rtx insn
+ = pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+ GEN_INT (-probe_interval), -1,
+ m->fs.cfa_reg == stack_pointer_rtx);
+ add_reg_note (insn, REG_STACK_CHECK, const0_rtx);
- lab_false = create_artificial_label (UNKNOWN_LOCATION);
- lab_over = create_artificial_label (UNKNOWN_LOCATION);
+ /* And probe at *sp. */
+ emit_stack_probe (stack_pointer_rtx);
+ emit_insn (gen_blockage ());
+ }
- examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
+ /* We need to allocate space for the residual, but we do not need
+ to probe the residual. */
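+ /* On loop exit I is the first multiple of PROBE_INTERVAL greater
+ than SIZE, so RESIDUAL is zero or negative and its absolute value
+ is the amount left to allocate. */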
+ HOST_WIDE_INT residual = (i - probe_interval - size);
+ if (residual)
+ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+ GEN_INT (residual), -1,
+ m->fs.cfa_reg == stack_pointer_rtx);
+ dump_stack_clash_frame_info (PROBE_INLINE, residual != 0);
+ }
+ else
+ {
+ /* We expect the GP registers to be saved when probes are used
+ as the probing sequences might need a scratch register and
+ the routine to allocate one assumes the integer registers
+ have already been saved. */
+ gcc_assert (int_registers_saved);
- need_temp = (!REG_P (container)
- && ((needed_intregs && TYPE_ALIGN (type) > 64)
- || TYPE_ALIGN (type) > 128));
+ struct scratch_reg sr;
+ get_scratch_register_on_entry (&sr);
- /* In case we are passing structure, verify that it is consecutive block
- on the register save area. If not we need to do moves. */
- if (!need_temp && !REG_P (container))
- {
- /* Verify that all registers are strictly consecutive */
- if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
- {
- int i;
+ /* If we needed to save a register, then account for any space
+ that was pushed (we are not going to pop the register when
+ we do the restore). */
+ if (sr.saved)
+ size -= UNITS_PER_WORD;
- for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
- {
- rtx slot = XVECEXP (container, 0, i);
- if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
- || INTVAL (XEXP (slot, 1)) != i * 16)
- need_temp = true;
- }
- }
- else
- {
- int i;
+ /* Step 1: round SIZE down to a multiple of the interval. */
+ HOST_WIDE_INT rounded_size = size & -probe_interval;
- for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
- {
- rtx slot = XVECEXP (container, 0, i);
- if (REGNO (XEXP (slot, 0)) != (unsigned int) i
- || INTVAL (XEXP (slot, 1)) != i * 8)
- need_temp = true;
- }
- }
- }
- if (!need_temp)
- {
- int_addr = addr;
- sse_addr = addr;
- }
+ /* Step 2: compute final value of the loop counter. Use lea if
+ possible. */
+ rtx addr = plus_constant (Pmode, stack_pointer_rtx, -rounded_size);
+ rtx insn;
+ if (address_no_seg_operand (addr, Pmode))
+ insn = emit_insn (gen_rtx_SET (sr.reg, addr));
else
{
- int_addr = create_tmp_var (ptr_type_node, "int_addr");
- sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
- }
-
- /* First ensure that we fit completely in registers. */
- if (needed_intregs)
- {
- t = build_int_cst (TREE_TYPE (gpr),
- (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
- t = build2 (GE_EXPR, boolean_type_node, gpr, t);
- t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
- t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
- gimplify_and_add (t, pre_p);
+ emit_move_insn (sr.reg, GEN_INT (-rounded_size));
+ insn = emit_insn (gen_rtx_SET (sr.reg,
+ gen_rtx_PLUS (Pmode, sr.reg,
+ stack_pointer_rtx)));
}
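+ /* While the loop below adjusts SP, describe the CFA in terms of
+ SR.REG (the loop's final SP value) so the unwind info remains
+ correct at every probe. */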
- if (needed_sseregs)
+ if (m->fs.cfa_reg == stack_pointer_rtx)
{
- t = build_int_cst (TREE_TYPE (fpr),
- (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
- + X86_64_REGPARM_MAX * 8);
- t = build2 (GE_EXPR, boolean_type_node, fpr, t);
- t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
- t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
- gimplify_and_add (t, pre_p);
+ add_reg_note (insn, REG_CFA_DEF_CFA,
+ plus_constant (Pmode, sr.reg,
+ m->fs.cfa_offset + rounded_size));
+ RTX_FRAME_RELATED_P (insn) = 1;
}
- /* Compute index to start of area used for integer regs. */
- if (needed_intregs)
- {
- /* int_addr = gpr + sav; */
- t = fold_build_pointer_plus (sav, gpr);
- gimplify_assign (int_addr, t, pre_p);
- }
- if (needed_sseregs)
+ /* Step 3: the loop. */
+ rtx size_rtx = GEN_INT (rounded_size);
+ insn = emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg,
+ size_rtx));
+ if (m->fs.cfa_reg == stack_pointer_rtx)
{
- /* sse_addr = fpr + sav; */
- t = fold_build_pointer_plus (sav, fpr);
- gimplify_assign (sse_addr, t, pre_p);
+ m->fs.cfa_offset += rounded_size;
+ add_reg_note (insn, REG_CFA_DEF_CFA,
+ plus_constant (Pmode, stack_pointer_rtx,
+ m->fs.cfa_offset));
+ RTX_FRAME_RELATED_P (insn) = 1;
}
- if (need_temp)
- {
- int i, prev_size = 0;
- tree temp = create_tmp_var (type, "va_arg_tmp");
+ m->fs.sp_offset += rounded_size;
+ emit_insn (gen_blockage ());
- /* addr = &temp; */
- t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
- gimplify_assign (addr, t, pre_p);
+ /* Step 4: adjust SP if we cannot assert at compile-time that SIZE
+ is equal to ROUNDED_SIZE. */
- for (i = 0; i < XVECLEN (container, 0); i++)
- {
- rtx slot = XVECEXP (container, 0, i);
- rtx reg = XEXP (slot, 0);
- machine_mode mode = GET_MODE (reg);
- tree piece_type;
- tree addr_type;
- tree daddr_type;
- tree src_addr, src;
- int src_offset;
- tree dest_addr, dest;
- int cur_size = GET_MODE_SIZE (mode);
+ if (size != rounded_size)
+ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+ GEN_INT (rounded_size - size), -1,
+ m->fs.cfa_reg == stack_pointer_rtx);
+ dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
- gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
- prev_size = INTVAL (XEXP (slot, 1));
- if (prev_size + cur_size > size)
- {
- cur_size = size - prev_size;
- unsigned int nbits = cur_size * BITS_PER_UNIT;
- if (!int_mode_for_size (nbits, 1).exists (&mode))
- mode = QImode;
- }
- piece_type = lang_hooks.types.type_for_mode (mode, 1);
- if (mode == GET_MODE (reg))
- addr_type = build_pointer_type (piece_type);
- else
- addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
- true);
- daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
- true);
+ /* This does not deallocate the space reserved for the scratch
+ register. That will be deallocated in the epilogue. */
+ release_scratch_register_on_entry (&sr, size, false);
+ }
- if (SSE_REGNO_P (REGNO (reg)))
- {
- src_addr = sse_addr;
- src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
- }
- else
- {
- src_addr = int_addr;
- src_offset = REGNO (reg) * 8;
- }
- src_addr = fold_convert (addr_type, src_addr);
- src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
+ /* Make sure nothing is scheduled before we are done. */
+ emit_insn (gen_blockage ());
+}
- dest_addr = fold_convert (daddr_type, addr);
- dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
- if (cur_size == GET_MODE_SIZE (mode))
- {
- src = build_va_arg_indirect_ref (src_addr);
- dest = build_va_arg_indirect_ref (dest_addr);
+/* Emit code to adjust the stack pointer by SIZE bytes while probing it.
- gimplify_assign (dest, src, pre_p);
- }
- else
- {
- tree copy
- = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
- 3, dest_addr, src_addr,
- size_int (cur_size));
- gimplify_and_add (copy, pre_p);
- }
- prev_size += cur_size;
- }
- }
+ INT_REGISTERS_SAVED is true if integer registers have already been
+ pushed on the stack. */
- if (needed_intregs)
- {
- t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
- build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
- gimplify_assign (gpr, t, pre_p);
- }
+static void
+ix86_adjust_stack_and_probe (HOST_WIDE_INT size,
+ const bool int_registers_saved)
+{
+ /* We skip the probe for the first interval + a small dope of 4 words and
+ probe that many bytes past the specified size to maintain a protection
+ area at the bottom of the stack. */
+ const int dope = 4 * UNITS_PER_WORD;
+ rtx size_rtx = GEN_INT (size), last;
- if (needed_sseregs)
- {
- t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
- build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
- gimplify_assign (unshare_expr (fpr), t, pre_p);
- }
+ /* See if we have a constant small number of probes to generate. If so,
+ that's the easy case. The run-time loop is made up of 9 insns in the
+ generic case while the compile-time loop is made up of 3+2*(n-1) insns
+ for n # of intervals. */
+ if (size <= 4 * get_probe_interval ())
+ {
+ HOST_WIDE_INT i, adjust;
+ bool first_probe = true;
- gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
+ /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
+ values of N from 1 until it exceeds SIZE. If only one probe is
+ needed, this will not generate any code. Then adjust and probe
+ to PROBE_INTERVAL + SIZE. */
+ for (i = get_probe_interval (); i < size; i += get_probe_interval ())
+ {
+ if (first_probe)
+ {
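+ /* The first adjustment also moves past the unprobed first
+ interval and dope area, in addition to this interval. */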
+ adjust = 2 * get_probe_interval () + dope;
+ first_probe = false;
+ }
+ else
+ adjust = get_probe_interval ();
- gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
- }
+ emit_insn (gen_rtx_SET (stack_pointer_rtx,
+ plus_constant (Pmode, stack_pointer_rtx,
+ -adjust)));
+ emit_stack_probe (stack_pointer_rtx);
+ }
- /* ... otherwise out of the overflow area. */
+ if (first_probe)
+ adjust = size + get_probe_interval () + dope;
+ else
+ adjust = size + get_probe_interval () - i;
- /* When we align parameter on stack for caller, if the parameter
- alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
- aligned at MAX_SUPPORTED_STACK_ALIGNMENT. We will match callee
- here with caller. */
- arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
- if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
- arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
+ emit_insn (gen_rtx_SET (stack_pointer_rtx,
+ plus_constant (Pmode, stack_pointer_rtx,
+ -adjust)));
+ emit_stack_probe (stack_pointer_rtx);
- /* Care for on-stack alignment if needed. */
- if (arg_boundary <= 64 || size == 0)
- t = ovf;
- else
- {
- HOST_WIDE_INT align = arg_boundary / 8;
- t = fold_build_pointer_plus_hwi (ovf, align - 1);
- t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
- build_int_cst (TREE_TYPE (t), -align));
+ /* Adjust back to account for the additional first interval. */
+ last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
+ plus_constant (Pmode, stack_pointer_rtx,
+ (get_probe_interval ()
+ + dope))));
}
- gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
- gimplify_assign (addr, t, pre_p);
+ /* Otherwise, do the same as above, but in a loop. Note that we must be
+ extra careful with variables wrapping around because we might be at
+ the very top (or the very bottom) of the address space and we have
+ to be able to handle this case properly; in particular, we use an
+ equality test for the loop condition. */
+ else
+ {
+ /* We expect the GP registers to be saved when probes are used
+ as the probing sequences might need a scratch register and
+ the routine to allocate one assumes the integer registers
+ have already been saved. */
+ gcc_assert (int_registers_saved);
- t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
- gimplify_assign (unshare_expr (ovf), t, pre_p);
+ HOST_WIDE_INT rounded_size;
+ struct scratch_reg sr;
- if (container)
- gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
+ get_scratch_register_on_entry (&sr);
- ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
- addr = fold_convert (ptrtype, addr);
+ /* If we needed to save a register, then account for any space
+ that was pushed (we are not going to pop the register when
+ we do the restore). */
+ if (sr.saved)
+ size -= UNITS_PER_WORD;
- if (indirect_p)
- addr = build_va_arg_indirect_ref (addr);
- return build_va_arg_indirect_ref (addr);
-}
-\f
-/* Return true if OPNUM's MEM should be matched
- in movabs* patterns. */
+ /* Step 1: round SIZE to the previous multiple of the interval. */
-bool
-ix86_check_movabs (rtx insn, int opnum)
-{
- rtx set, mem;
+ rounded_size = ROUND_DOWN (size, get_probe_interval ());
- set = PATTERN (insn);
- if (GET_CODE (set) == PARALLEL)
- set = XVECEXP (set, 0, 0);
- gcc_assert (GET_CODE (set) == SET);
- mem = XEXP (set, opnum);
- while (SUBREG_P (mem))
- mem = SUBREG_REG (mem);
- gcc_assert (MEM_P (mem));
- return volatile_ok || !MEM_VOLATILE_P (mem);
-}
-/* Return false if INSN contains a MEM with a non-default address space. */
-bool
-ix86_check_no_addr_space (rtx insn)
-{
- subrtx_var_iterator::array_type array;
- FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
- {
- rtx x = *iter;
- if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
- return false;
- }
- return true;
-}
-\f
-/* Initialize the table of extra 80387 mathematical constants. */
+ /* Step 2: compute initial and final value of the loop counter. */
-static void
-init_ext_80387_constants (void)
-{
- static const char * cst[5] =
- {
- "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
- "0.6931471805599453094286904741849753009", /* 1: fldln2 */
- "1.4426950408889634073876517827983434472", /* 2: fldl2e */
- "3.3219280948873623478083405569094566090", /* 3: fldl2t */
- "3.1415926535897932385128089594061862044", /* 4: fldpi */
- };
- int i;
+ /* SP = SP_0 + PROBE_INTERVAL. */
+ emit_insn (gen_rtx_SET (stack_pointer_rtx,
+ plus_constant (Pmode, stack_pointer_rtx,
+ - (get_probe_interval () + dope))));
- for (i = 0; i < 5; i++)
- {
- real_from_string (&ext_80387_constants_table[i], cst[i]);
- /* Ensure each constant is rounded to XFmode precision. */
- real_convert (&ext_80387_constants_table[i],
- XFmode, &ext_80387_constants_table[i]);
- }
+ /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
+ if (rounded_size <= (HOST_WIDE_INT_1 << 31))
+ emit_insn (gen_rtx_SET (sr.reg,
+ plus_constant (Pmode, stack_pointer_rtx,
+ -rounded_size)));
+ else
+ {
+ emit_move_insn (sr.reg, GEN_INT (-rounded_size));
+ emit_insn (gen_rtx_SET (sr.reg,
+ gen_rtx_PLUS (Pmode, sr.reg,
+ stack_pointer_rtx)));
+ }
- ext_80387_constants_init = 1;
-}
-/* Return non-zero if the constant is something that
- can be loaded with a special instruction. */
+ /* Step 3: the loop
-int
-standard_80387_constant_p (rtx x)
-{
- machine_mode mode = GET_MODE (x);
+ do
+ {
+ SP = SP + PROBE_INTERVAL
+ probe at SP
+ }
+ while (SP != LAST_ADDR)
- const REAL_VALUE_TYPE *r;
+ adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
+ values of N from 1 until it is equal to ROUNDED_SIZE. */
- if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
- return -1;
+ emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
- if (x == CONST0_RTX (mode))
- return 1;
- if (x == CONST1_RTX (mode))
- return 2;
- r = CONST_DOUBLE_REAL_VALUE (x);
+ /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
+ assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
- /* For XFmode constants, try to find a special 80387 instruction when
- optimizing for size or on those CPUs that benefit from them. */
- if (mode == XFmode
- && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
- {
- int i;
+ if (size != rounded_size)
+ {
+ emit_insn (gen_rtx_SET (stack_pointer_rtx,
+ plus_constant (Pmode, stack_pointer_rtx,
+ rounded_size - size)));
+ emit_stack_probe (stack_pointer_rtx);
+ }
- if (! ext_80387_constants_init)
- init_ext_80387_constants ();
+ /* Adjust back to account for the additional first interval. */
+ last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
+ plus_constant (Pmode, stack_pointer_rtx,
+ (get_probe_interval ()
+ + dope))));
- for (i = 0; i < 5; i++)
- if (real_identical (r, &ext_80387_constants_table[i]))
- return i + 3;
+ /* This does not deallocate the space reserved for the scratch
+ register. That will be deallocated in the epilogue. */
+ release_scratch_register_on_entry (&sr, size, false);
}
- /* Load of the constant -0.0 or -1.0 will be split as
- fldz;fchs or fld1;fchs sequence. */
- if (real_isnegzero (r))
- return 8;
- if (real_identical (r, &dconstm1))
- return 9;
+ /* Even if the stack pointer isn't the CFA register, we need to correctly
+ describe the adjustments made to it, in particular differentiate the
+ frame-related ones from the frame-unrelated ones. */
+ if (size > 0)
+ {
+ rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
+ XVECEXP (expr, 0, 0)
+ = gen_rtx_SET (stack_pointer_rtx,
+ plus_constant (Pmode, stack_pointer_rtx, -size));
+ XVECEXP (expr, 0, 1)
+ = gen_rtx_SET (stack_pointer_rtx,
+ plus_constant (Pmode, stack_pointer_rtx,
+ get_probe_interval () + dope + size));
+ add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
+ RTX_FRAME_RELATED_P (last) = 1;
- return 0;
+ cfun->machine->fs.sp_offset += size;
+ }
+
+ /* Make sure nothing is scheduled before we are done. */
+ emit_insn (gen_blockage ());
}
-/* Return the opcode of the special instruction to be used to load
- the constant X. */
+/* Adjust the stack pointer up to REG while probing it. */
const char *
-standard_80387_constant_opcode (rtx x)
+output_adjust_stack_and_probe (rtx reg)
{
- switch (standard_80387_constant_p (x))
- {
- case 1:
- return "fldz";
- case 2:
- return "fld1";
- case 3:
- return "fldlg2";
- case 4:
- return "fldln2";
- case 5:
- return "fldl2e";
- case 6:
- return "fldl2t";
- case 7:
- return "fldpi";
- case 8:
- case 9:
- return "#";
- default:
- gcc_unreachable ();
- }
+ static int labelno = 0;
+ char loop_lab[32];
+ rtx xops[2];
+
+ ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
+
+ /* Loop. */
+ ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
+
+ /* SP = SP + PROBE_INTERVAL. */
+ xops[0] = stack_pointer_rtx;
+ xops[1] = GEN_INT (get_probe_interval ());
+ output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
+
+ /* Probe at SP. */
+ xops[1] = const0_rtx;
+ output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
+
+ /* Test if SP == LAST_ADDR. */
+ xops[0] = stack_pointer_rtx;
+ xops[1] = reg;
+ output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
+
+ /* Branch. */
+ fputs ("\tjne\t", asm_out_file);
+ assemble_name_raw (asm_out_file, loop_lab);
+ fputc ('\n', asm_out_file);
+
+ return "";
}
-/* Return the CONST_DOUBLE representing the 80387 constant that is
- loaded by the specified special instruction. The argument IDX
- matches the return value from standard_80387_constant_p. */
+/* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
+ inclusive. These are offsets from the current stack pointer.
-rtx
-standard_80387_constant_rtx (int idx)
+ INT_REGISTERS_SAVED is true if integer registers have already been
+ pushed on the stack. */
+
+static void
+ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size,
+ const bool int_registers_saved)
{
- int i;
+ /* See if we have a constant small number of probes to generate. If so,
+ that's the easy case. The run-time loop is made up of 6 insns in the
+ generic case while the compile-time loop is made up of n insns for n #
+ of intervals. */
+ if (size <= 6 * get_probe_interval ())
+ {
+ HOST_WIDE_INT i;
- if (! ext_80387_constants_init)
- init_ext_80387_constants ();
+ /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
+ it exceeds SIZE. If only one probe is needed, this will not
+ generate any code. Then probe at FIRST + SIZE. */
+ for (i = get_probe_interval (); i < size; i += get_probe_interval ())
+ emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
+ -(first + i)));
- switch (idx)
+ emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
+ -(first + size)));
+ }
+
+ /* Otherwise, do the same as above, but in a loop. Note that we must be
+ extra careful with variables wrapping around because we might be at
+ the very top (or the very bottom) of the address space and we have
+ to be able to handle this case properly; in particular, we use an
+ equality test for the loop condition. */
+ else
{
- case 3:
- case 4:
- case 5:
- case 6:
- case 7:
- i = idx - 3;
- break;
+ /* We expect the GP registers to be saved when probes are used
+ as the probing sequences might need a scratch register and
+ the routine to allocate one assumes the integer registers
+ have already been saved. */
+ gcc_assert (int_registers_saved);
- default:
- gcc_unreachable ();
- }
+ HOST_WIDE_INT rounded_size, last;
+ struct scratch_reg sr;
- return const_double_from_real_value (ext_80387_constants_table[i],
- XFmode);
-}
+ get_scratch_register_on_entry (&sr);
-/* Return 1 if X is all bits 0 and 2 if X is all bits 1
- in supported SSE/AVX vector mode. */
-int
-standard_sse_constant_p (rtx x, machine_mode pred_mode)
-{
- machine_mode mode;
+ /* Step 1: round SIZE to the previous multiple of the interval. */
- if (!TARGET_SSE)
- return 0;
+ rounded_size = ROUND_DOWN (size, get_probe_interval ());
- mode = GET_MODE (x);
- if (x == const0_rtx || const0_operand (x, mode))
- return 1;
+ /* Step 2: compute initial and final value of the loop counter. */
- if (x == constm1_rtx || vector_all_ones_operand (x, mode))
- {
- /* VOIDmode integer constant, get mode from the predicate. */
- if (mode == VOIDmode)
- mode = pred_mode;
+ /* TEST_OFFSET = FIRST. */
+ emit_move_insn (sr.reg, GEN_INT (-first));
- switch (GET_MODE_SIZE (mode))
- {
- case 64:
- if (TARGET_AVX512F)
- return 2;
- break;
- case 32:
- if (TARGET_AVX2)
- return 2;
- break;
- case 16:
- if (TARGET_SSE2)
- return 2;
- break;
- case 0:
- /* VOIDmode */
- gcc_unreachable ();
- default:
- break;
- }
+ /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
+ last = first + rounded_size;
+
+
+ /* Step 3: the loop
+
+ do
+ {
+ TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
+ probe at TEST_ADDR
+ }
+ while (TEST_ADDR != LAST_ADDR)
+
+ probes at FIRST + N * PROBE_INTERVAL for values of N from 1
+ until it is equal to ROUNDED_SIZE. */
+
+ emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
+
+
+ /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
+ that SIZE is equal to ROUNDED_SIZE. */
+
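+ /* After the loop SR.REG contains -(FIRST + ROUNDED_SIZE), so the
+ address below works out to SP - (FIRST + SIZE). */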
+ if (size != rounded_size)
+ emit_stack_probe (plus_constant (Pmode,
+ gen_rtx_PLUS (Pmode,
+ stack_pointer_rtx,
+ sr.reg),
+ rounded_size - size));
+
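+ /* SP itself was never adjusted here, so the scratch register, if it
+ was saved, can simply be popped to restore both it and SP. */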
+ release_scratch_register_on_entry (&sr, size, true);
}
- return 0;
+ /* Make sure nothing is scheduled before we are done. */
+ emit_insn (gen_blockage ());
}
-/* Return the opcode of the special instruction to be used to load
- the constant operands[1] into operands[0]. */
+/* Probe a range of stack addresses from REG to END, inclusive. These are
+ offsets from the current stack pointer. */
const char *
-standard_sse_constant_opcode (rtx_insn *insn, rtx *operands)
+output_probe_stack_range (rtx reg, rtx end)
{
- machine_mode mode;
- rtx x = operands[1];
+ static int labelno = 0;
+ char loop_lab[32];
+ rtx xops[3];
- gcc_assert (TARGET_SSE);
+ ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
- mode = GET_MODE (x);
-
- if (x == const0_rtx || const0_operand (x, mode))
- {
- switch (get_attr_mode (insn))
- {
- case MODE_TI:
- if (!EXT_REX_SSE_REG_P (operands[0]))
- return "%vpxor\t%0, %d0";
- /* FALLTHRU */
- case MODE_XI:
- case MODE_OI:
- if (EXT_REX_SSE_REG_P (operands[0]))
- return (TARGET_AVX512VL
- ? "vpxord\t%x0, %x0, %x0"
- : "vpxord\t%g0, %g0, %g0");
- return "vpxor\t%x0, %x0, %x0";
-
- case MODE_V2DF:
- if (!EXT_REX_SSE_REG_P (operands[0]))
- return "%vxorpd\t%0, %d0";
- /* FALLTHRU */
- case MODE_V8DF:
- case MODE_V4DF:
- if (!EXT_REX_SSE_REG_P (operands[0]))
- return "vxorpd\t%x0, %x0, %x0";
- else if (TARGET_AVX512DQ)
- return (TARGET_AVX512VL
- ? "vxorpd\t%x0, %x0, %x0"
- : "vxorpd\t%g0, %g0, %g0");
- else
- return (TARGET_AVX512VL
- ? "vpxorq\t%x0, %x0, %x0"
- : "vpxorq\t%g0, %g0, %g0");
+ /* Loop. */
+ ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
- case MODE_V4SF:
- if (!EXT_REX_SSE_REG_P (operands[0]))
- return "%vxorps\t%0, %d0";
- /* FALLTHRU */
- case MODE_V16SF:
- case MODE_V8SF:
- if (!EXT_REX_SSE_REG_P (operands[0]))
- return "vxorps\t%x0, %x0, %x0";
- else if (TARGET_AVX512DQ)
- return (TARGET_AVX512VL
- ? "vxorps\t%x0, %x0, %x0"
- : "vxorps\t%g0, %g0, %g0");
- else
- return (TARGET_AVX512VL
- ? "vpxord\t%x0, %x0, %x0"
- : "vpxord\t%g0, %g0, %g0");
+ /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
+ xops[0] = reg;
+ xops[1] = GEN_INT (get_probe_interval ());
+ output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
- default:
- gcc_unreachable ();
- }
- }
- else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
- {
- enum attr_mode insn_mode = get_attr_mode (insn);
-
- switch (insn_mode)
- {
- case MODE_XI:
- case MODE_V8DF:
- case MODE_V16SF:
- gcc_assert (TARGET_AVX512F);
- return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
+ /* Probe at TEST_ADDR. */
+ xops[0] = stack_pointer_rtx;
+ xops[1] = reg;
+ xops[2] = const0_rtx;
+ output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
- case MODE_OI:
- case MODE_V4DF:
- case MODE_V8SF:
- gcc_assert (TARGET_AVX2);
- /* FALLTHRU */
- case MODE_TI:
- case MODE_V2DF:
- case MODE_V4SF:
- gcc_assert (TARGET_SSE2);
- if (!EXT_REX_SSE_REG_P (operands[0]))
- return (TARGET_AVX
- ? "vpcmpeqd\t%0, %0, %0"
- : "pcmpeqd\t%0, %0");
- else if (TARGET_AVX512VL)
- return "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}";
- else
- return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
+ /* Test if TEST_ADDR == LAST_ADDR. */
+ xops[0] = reg;
+ xops[1] = end;
+ output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
- default:
- gcc_unreachable ();
- }
- }
+ /* Branch. */
+ fputs ("\tjne\t", asm_out_file);
+ assemble_name_raw (asm_out_file, loop_lab);
+ fputc ('\n', asm_out_file);
- gcc_unreachable ();
+ return "";
}
-/* Returns true if INSN can be transformed from a memory load
- to a supported FP constant load. */
+/* Return true if a stack frame is required. If one is required and
+ CHECK_STACK_SLOT is true, update STACK_ALIGNMENT to the largest
+ alignment, in bits, of any stack slot used. */
-bool
-ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
+static bool
+ix86_find_max_used_stack_alignment (unsigned int &stack_alignment,
+ bool check_stack_slot)
{
- rtx src = find_constant_src (insn);
-
- gcc_assert (REG_P (dst));
-
- if (src == NULL
- || (SSE_REGNO_P (REGNO (dst))
- && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
- || (STACK_REGNO_P (REGNO (dst))
- && standard_80387_constant_p (src) < 1))
- return false;
-
- return true;
-}
+ HARD_REG_SET set_up_by_prologue, prologue_used;
+ basic_block bb;
-/* Returns true if OP contains a symbol reference */
+ CLEAR_HARD_REG_SET (prologue_used);
+ CLEAR_HARD_REG_SET (set_up_by_prologue);
+ add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
+ add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
+ add_to_hard_reg_set (&set_up_by_prologue, Pmode,
+ HARD_FRAME_POINTER_REGNUM);
-bool
-symbolic_reference_mentioned_p (rtx op)
-{
- const char *fmt;
- int i;
+ /* The preferred stack alignment is the minimum stack alignment. */
+ if (stack_alignment > crtl->preferred_stack_boundary)
+ stack_alignment = crtl->preferred_stack_boundary;
- if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
- return true;
+ bool require_stack_frame = false;
- fmt = GET_RTX_FORMAT (GET_CODE (op));
- for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
+ FOR_EACH_BB_FN (bb, cfun)
{
- if (fmt[i] == 'E')
- {
- int j;
-
- for (j = XVECLEN (op, i) - 1; j >= 0; j--)
- if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
- return true;
- }
+ rtx_insn *insn;
+ FOR_BB_INSNS (bb, insn)
+ if (NONDEBUG_INSN_P (insn)
+ && requires_stack_frame_p (insn, prologue_used,
+ set_up_by_prologue))
+ {
+ require_stack_frame = true;
- else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
- return true;
+ if (check_stack_slot)
+ {
+ /* Find the maximum stack alignment. */
+ subrtx_iterator::array_type array;
+ FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL)
+ if (MEM_P (*iter)
+ && (reg_mentioned_p (stack_pointer_rtx,
+ *iter)
+ || reg_mentioned_p (frame_pointer_rtx,
+ *iter)))
+ {
+ unsigned int alignment = MEM_ALIGN (*iter);
+ if (alignment > stack_alignment)
+ stack_alignment = alignment;
+ }
+ }
+ }
}
- return false;
+ return require_stack_frame;
}
-/* Return true if it is appropriate to emit `ret' instructions in the
- body of a function. Do this only if the epilogue is simple, needing a
- couple of insns. Prior to reloading, we can't tell how many registers
- must be saved, so return false then. Return false if there is no frame
- marker to de-allocate. */
-
-bool
-ix86_can_use_return_insn_p (void)
-{
- if (ix86_function_naked (current_function_decl))
- return false;
-
- /* Don't use `ret' instruction in interrupt handler. */
- if (! reload_completed
- || frame_pointer_needed
- || cfun->machine->func_type != TYPE_NORMAL)
- return 0;
-
- /* Don't allow more than 32k pop, since that's all we can do
- with one instruction. */
- if (crtl->args.pops_args && crtl->args.size >= 32768)
- return 0;
-
- struct ix86_frame &frame = cfun->machine->frame;
- return (frame.stack_pointer_offset == UNITS_PER_WORD
- && (frame.nregs + frame.nsseregs) == 0);
-}
-\f
-/* Value should be nonzero if functions must have frame pointers.
- Zero means the frame pointer need not be set up (and parms may
- be accessed via the stack pointer) in functions that seem suitable. */
+/* Finalize the stack_realign_needed and frame_pointer_needed flags, which
+ guide generation of the prologue/epilogue in the correct form. */
-static bool
-ix86_frame_pointer_required (void)
+static void
+ix86_finalize_stack_frame_flags (void)
{
- /* If we accessed previous frames, then the generated code expects
- to be able to access the saved ebp value in our frame. */
- if (cfun->machine->accesses_prev_frame)
- return true;
-
- /* Several x86 os'es need a frame pointer for other reasons,
- usually pertaining to setjmp. */
- if (SUBTARGET_FRAME_POINTER_REQUIRED)
- return true;
+ /* Check if stack realignment is really needed after reload, and
+ store the result in cfun. */
+ unsigned int incoming_stack_boundary
+ = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
+ ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
+ unsigned int stack_alignment
+ = (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
+ ? crtl->max_used_stack_slot_alignment
+ : crtl->stack_alignment_needed);
+ unsigned int stack_realign
+ = (incoming_stack_boundary < stack_alignment);
+ bool recompute_frame_layout_p = false;
- /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
- if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
- return true;
+ if (crtl->stack_realign_finalized)
+ {
+ /* After stack_realign_needed is finalized, we can no longer
+ change it. */
+ gcc_assert (crtl->stack_realign_needed == stack_realign);
+ return;
+ }
- /* Win64 SEH, very large frames need a frame-pointer as maximum stack
- allocation is 4GB. */
- if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
- return true;
-
- /* SSE saves require frame-pointer when stack is misaligned. */
- if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
- return true;
-
- /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
- turns off the frame pointer by default. Turn it back on now if
- we've not got a leaf function. */
- if (TARGET_OMIT_LEAF_FRAME_POINTER
- && (!crtl->is_leaf
- || ix86_current_function_calls_tls_descriptor))
- return true;
-
- if (crtl->profile && !flag_fentry)
- return true;
-
- return false;
-}
+ /* If the only reason for frame_pointer_needed is that we conservatively
+ assumed stack realignment might be needed or -fno-omit-frame-pointer
+ is used, but in the end nothing that needed the stack alignment has
+ been spilled and no stack access needs it, clear frame_pointer_needed
+ and say we don't need stack realignment. */
+ if ((stack_realign || (!flag_omit_frame_pointer && optimize))
+ && frame_pointer_needed
+ && crtl->is_leaf
+ && crtl->sp_is_unchanging
+ && !ix86_current_function_calls_tls_descriptor
+ && !crtl->accesses_prior_frames
+ && !cfun->calls_alloca
+ && !crtl->calls_eh_return
+ /* See ira_setup_eliminable_regset for the rationale. */
+ && !(STACK_CHECK_MOVING_SP
+ && flag_stack_check
+ && flag_exceptions
+ && cfun->can_throw_non_call_exceptions)
+ && !ix86_frame_pointer_required ()
+ && get_frame_size () == 0
+ && ix86_nsaved_sseregs () == 0
+ && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
+ {
+ if (ix86_find_max_used_stack_alignment (stack_alignment,
+ stack_realign))
+ {
+ /* Stack frame is required. If stack alignment needed is less
+ than incoming stack boundary, don't realign stack. */
+ stack_realign = incoming_stack_boundary < stack_alignment;
+ if (!stack_realign)
+ {
+ crtl->max_used_stack_slot_alignment
+ = incoming_stack_boundary;
+ crtl->stack_alignment_needed
+ = incoming_stack_boundary;
+ /* Also update preferred_stack_boundary for leaf
+ functions. */
+ crtl->preferred_stack_boundary
+ = incoming_stack_boundary;
+ }
+ }
+ else
+ {
+ /* If drap has been set, but it actually isn't live at the
+ start of the function, there is no reason to set it up. */
+ if (crtl->drap_reg)
+ {
+ basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
+ if (! REGNO_REG_SET_P (DF_LR_IN (bb),
+ REGNO (crtl->drap_reg)))
+ {
+ crtl->drap_reg = NULL_RTX;
+ crtl->need_drap = false;
+ }
+ }
+ else
+ cfun->machine->no_drap_save_restore = true;
-/* Record that the current function accesses previous call frames. */
+ frame_pointer_needed = false;
+ stack_realign = false;
+ crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
+ crtl->stack_alignment_needed = incoming_stack_boundary;
+ crtl->stack_alignment_estimated = incoming_stack_boundary;
+ if (crtl->preferred_stack_boundary > incoming_stack_boundary)
+ crtl->preferred_stack_boundary = incoming_stack_boundary;
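+ /* The hard frame pointer is now available as an ordinary register;
+ rescan and recompute dataflow so liveness reflects that. */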
+ df_finish_pass (true);
+ df_scan_alloc (NULL);
+ df_scan_blocks ();
+ df_compute_regs_ever_live (true);
+ df_analyze ();
-void
-ix86_setup_frame_addresses (void)
-{
- cfun->machine->accesses_prev_frame = 1;
-}
-\f
-#ifndef USE_HIDDEN_LINKONCE
-# if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
-# define USE_HIDDEN_LINKONCE 1
-# else
-# define USE_HIDDEN_LINKONCE 0
-# endif
-#endif
+ if (flag_var_tracking)
+ {
+ /* Since frame pointer is no longer available, replace it with
+ stack pointer - UNITS_PER_WORD in debug insns. */
+ df_ref ref, next;
+ for (ref = DF_REG_USE_CHAIN (HARD_FRAME_POINTER_REGNUM);
+ ref; ref = next)
+ {
+ next = DF_REF_NEXT_REG (ref);
+ if (!DF_REF_INSN_INFO (ref))
+ continue;
-/* Label count for call and return thunks. It is used to make unique
- labels in call and return thunks. */
-static int indirectlabelno;
+ /* Make sure the next ref is for a different instruction,
+ so that we're not affected by the rescan. */
+ rtx_insn *insn = DF_REF_INSN (ref);
+ while (next && DF_REF_INSN (next) == insn)
+ next = DF_REF_NEXT_REG (next);
-/* True if call thunk function is needed. */
-static bool indirect_thunk_needed = false;
+ if (DEBUG_INSN_P (insn))
+ {
+ bool changed = false;
+ for (; ref != next; ref = DF_REF_NEXT_REG (ref))
+ {
+ rtx *loc = DF_REF_LOC (ref);
+ if (*loc == hard_frame_pointer_rtx)
+ {
+ *loc = plus_constant (Pmode,
+ stack_pointer_rtx,
+ -UNITS_PER_WORD);
+ changed = true;
+ }
+ }
+ if (changed)
+ df_insn_rescan (insn);
+ }
+ }
+ }
-/* Bit masks of integer registers, which contain branch target, used
- by call thunk functions. */
-static int indirect_thunks_used;
+ recompute_frame_layout_p = true;
+ }
+ }
+ else if (crtl->max_used_stack_slot_alignment >= 128)
+ {
+ /* We don't need to realign the stack. max_used_stack_alignment is
+ used to decide how the stack frame should be aligned. This is
+ independent of any psABI and of 32-bit vs 64-bit. It is always
+ safe to compute max_used_stack_alignment. We compute it only
+ if a 128-bit aligned load/store may be generated on a misaligned
+ stack slot, which would lead to a segfault. */
+ if (ix86_find_max_used_stack_alignment (stack_alignment, true))
+ cfun->machine->max_used_stack_alignment
+ = stack_alignment / BITS_PER_UNIT;
+ }
-/* True if return thunk function is needed. */
-static bool indirect_return_needed = false;
+ if (crtl->stack_realign_needed != stack_realign)
+ recompute_frame_layout_p = true;
+ crtl->stack_realign_needed = stack_realign;
+ crtl->stack_realign_finalized = true;
+ if (recompute_frame_layout_p)
+ ix86_compute_frame_layout ();
+}
-/* True if return thunk function via CX is needed. */
-static bool indirect_return_via_cx;
+/* Delete SET_GOT right after entry block if it is allocated to reg. */
-#ifndef INDIRECT_LABEL
-# define INDIRECT_LABEL "LIND"
-#endif
+static void
+ix86_elim_entry_set_got (rtx reg)
+{
+ basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
+ rtx_insn *c_insn = BB_HEAD (bb);
+ if (!NONDEBUG_INSN_P (c_insn))
+ c_insn = next_nonnote_nondebug_insn (c_insn);
+ if (c_insn && NONJUMP_INSN_P (c_insn))
+ {
+ rtx pat = PATTERN (c_insn);
+ if (GET_CODE (pat) == PARALLEL)
+ {
+ rtx vec = XVECEXP (pat, 0, 0);
+ if (GET_CODE (vec) == SET
+ && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
+ && REGNO (XEXP (vec, 0)) == REGNO (reg))
+ delete_insn (c_insn);
+ }
+ }
+}
-/* Indicate what prefix is needed for an indirect branch. */
-enum indirect_thunk_prefix
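+/* Return a SET rtx that stores REG into the frame slot at FRAME_REG +
+ OFFSET when STORE is true, or loads REG from that slot otherwise. */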
+static rtx
+gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store)
{
- indirect_thunk_prefix_none,
- indirect_thunk_prefix_nt
-};
+ rtx addr, mem;
-/* Return the prefix needed for an indirect branch INSN. */
+ if (offset)
+ addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset));
+ mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg);
+ return gen_rtx_SET (store ? mem : reg, store ? reg : mem);
+}
-enum indirect_thunk_prefix
-indirect_thunk_need_prefix (rtx_insn *insn)
+static inline rtx
+gen_frame_load (rtx reg, rtx frame_reg, int offset)
{
- enum indirect_thunk_prefix need_prefix;
- if ((cfun->machine->indirect_branch_type
- == indirect_branch_thunk_extern)
- && ix86_notrack_prefixed_insn_p (insn))
- {
- /* NOTRACK prefix is only used with external thunk so that it
- can be properly updated to support CET at run-time. */
- need_prefix = indirect_thunk_prefix_nt;
- }
- else
- need_prefix = indirect_thunk_prefix_none;
- return need_prefix;
+ return gen_frame_set (reg, frame_reg, offset, false);
}
-/* Fills in the label name that should be used for the indirect thunk. */
+static inline rtx
+gen_frame_store (rtx reg, rtx frame_reg, int offset)
+{
+ return gen_frame_set (reg, frame_reg, offset, true);
+}
static void
-indirect_thunk_name (char name[32], unsigned int regno,
- enum indirect_thunk_prefix need_prefix,
- bool ret_p)
+ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame)
{
- if (regno != INVALID_REGNUM && regno != CX_REG && ret_p)
- gcc_unreachable ();
+ struct machine_function *m = cfun->machine;
+ const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
+ + m->call_ms2sysv_extra_regs;
+ rtvec v = rtvec_alloc (ncregs + 1);
+ unsigned int align, i, vi = 0;
+ rtx_insn *insn;
+ rtx sym, addr;
+ rtx rax = gen_rtx_REG (word_mode, AX_REG);
+ const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
- if (USE_HIDDEN_LINKONCE)
- {
- const char *prefix;
+ /* AL should only be live with sysv_abi. */
+ gcc_assert (!ix86_eax_live_at_start_p ());
+ gcc_assert (m->fs.sp_offset >= frame.sse_reg_save_offset);
- if (need_prefix == indirect_thunk_prefix_nt
- && regno != INVALID_REGNUM)
- {
- /* NOTRACK prefix is only used with external thunk via
- register so that NOTRACK prefix can be added to indirect
- branch via register to support CET at run-time. */
- prefix = "_nt";
- }
- else
- prefix = "";
+ /* Set up RAX as the stub's base pointer. We use stack_realign_offset so
+ the address is correct whether or not we've actually realigned the
+ stack. */
+ align = GET_MODE_ALIGNMENT (V4SFmode);
+ addr = choose_baseaddr (frame.stack_realign_offset
+ + xlogue.get_stub_ptr_offset (), &align, AX_REG);
+ gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
- const char *ret = ret_p ? "return" : "indirect";
+ emit_insn (gen_rtx_SET (rax, addr));
- if (regno != INVALID_REGNUM)
- {
- const char *reg_prefix;
- if (LEGACY_INT_REGNO_P (regno))
- reg_prefix = TARGET_64BIT ? "r" : "e";
- else
- reg_prefix = "";
- sprintf (name, "__x86_%s_thunk%s_%s%s",
- ret, prefix, reg_prefix, reg_names[regno]);
- }
- else
- sprintf (name, "__x86_%s_thunk%s", ret, prefix);
- }
- else
+ /* Get the stub symbol. */
+ sym = xlogue.get_stub_rtx (frame_pointer_needed ? XLOGUE_STUB_SAVE_HFP
+ : XLOGUE_STUB_SAVE);
+ RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
+
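+ /* Describe the stores performed by the stub: each clobbered register
+ is saved at a negative offset from the stub pointer in RAX. */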
+ for (i = 0; i < ncregs; ++i)
{
- if (regno != INVALID_REGNUM)
- ASM_GENERATE_INTERNAL_LABEL (name, "LITR", regno);
- else
- {
- if (ret_p)
- ASM_GENERATE_INTERNAL_LABEL (name, "LRT", 0);
- else
- ASM_GENERATE_INTERNAL_LABEL (name, "LIT", 0);
- }
+ const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
+ rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? V4SFmode : word_mode),
+ r.regno);
+ RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset);
}
-}
-/* Output a call and return thunk for indirect branch. If REGNO != -1,
- the function address is in REGNO and the call and return thunk looks like:
-
- call L2
- L1:
- pause
- lfence
- jmp L1
- L2:
- mov %REG, (%sp)
- ret
+ gcc_assert (vi == (unsigned)GET_NUM_ELEM (v));
- Otherwise, the function address is on the top of stack and the
- call and return thunk looks like:
+ insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v));
+ RTX_FRAME_RELATED_P (insn) = true;
+}
- call L2
- L1:
- pause
- lfence
- jmp L1
- L2:
- lea WORD_SIZE(%sp), %sp
- ret
- */
+/* Expand the prologue into a bunch of separate insns. */
-static void
-output_indirect_thunk (unsigned int regno)
+void
+ix86_expand_prologue (void)
{
- char indirectlabel1[32];
- char indirectlabel2[32];
+ struct machine_function *m = cfun->machine;
+ rtx insn, t;
+ HOST_WIDE_INT allocate;
+ bool int_registers_saved;
+ bool sse_registers_saved;
+ bool save_stub_call_needed;
+ rtx static_chain = NULL_RTX;
- ASM_GENERATE_INTERNAL_LABEL (indirectlabel1, INDIRECT_LABEL,
- indirectlabelno++);
- ASM_GENERATE_INTERNAL_LABEL (indirectlabel2, INDIRECT_LABEL,
- indirectlabelno++);
+ if (ix86_function_naked (current_function_decl))
+ return;
- /* Call */
- fputs ("\tcall\t", asm_out_file);
- assemble_name_raw (asm_out_file, indirectlabel2);
- fputc ('\n', asm_out_file);
+ ix86_finalize_stack_frame_flags ();
- ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1);
+ /* DRAP should not coexist with stack_realign_fp. */
+ gcc_assert (!(crtl->drap_reg && stack_realign_fp));
- /* AMD and Intel CPUs prefer each a different instruction as loop filler.
- Usage of both pause + lfence is compromise solution. */
- fprintf (asm_out_file, "\tpause\n\tlfence\n");
+ memset (&m->fs, 0, sizeof (m->fs));
- /* Jump. */
- fputs ("\tjmp\t", asm_out_file);
- assemble_name_raw (asm_out_file, indirectlabel1);
- fputc ('\n', asm_out_file);
+ /* Initialize CFA state for before the prologue. */
+ m->fs.cfa_reg = stack_pointer_rtx;
+ m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
- ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2);
+ /* Track SP offset to the CFA. We continue tracking this after we've
+ swapped the CFA register away from SP. In the case of re-alignment
+ this is fudged; we're interested in offsets within the local frame. */
+ m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
+ m->fs.sp_valid = true;
+ m->fs.sp_realigned = false;
- /* The above call insn pushed a word to stack. Adjust CFI info. */
- if (flag_asynchronous_unwind_tables && dwarf2out_do_frame ())
- {
- if (! dwarf2out_do_cfi_asm ())
- {
- dw_cfi_ref xcfi = ggc_cleared_alloc<dw_cfi_node> ();
- xcfi->dw_cfi_opc = DW_CFA_advance_loc4;
- xcfi->dw_cfi_oprnd1.dw_cfi_addr = ggc_strdup (indirectlabel2);
- vec_safe_push (cfun->fde->dw_fde_cfi, xcfi);
- }
- dw_cfi_ref xcfi = ggc_cleared_alloc<dw_cfi_node> ();
- xcfi->dw_cfi_opc = DW_CFA_def_cfa_offset;
- xcfi->dw_cfi_oprnd1.dw_cfi_offset = 2 * UNITS_PER_WORD;
- vec_safe_push (cfun->fde->dw_fde_cfi, xcfi);
- dwarf2out_emit_cfi (xcfi);
- }
+ const struct ix86_frame &frame = cfun->machine->frame;
- if (regno != INVALID_REGNUM)
- {
- /* MOV. */
- rtx xops[2];
- xops[0] = gen_rtx_MEM (word_mode, stack_pointer_rtx);
- xops[1] = gen_rtx_REG (word_mode, regno);
- output_asm_insn ("mov\t{%1, %0|%0, %1}", xops);
- }
- else
+ if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
{
- /* LEA. */
- rtx xops[2];
- xops[0] = stack_pointer_rtx;
- xops[1] = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
- output_asm_insn ("lea\t{%E1, %0|%0, %E1}", xops);
- }
+ /* We should have already generated an error for any use of
+ ms_hook on a nested function. */
+ gcc_checking_assert (!ix86_static_chain_on_stack);
- fputs ("\tret\n", asm_out_file);
-}
+ /* Check if profiling is active and we shall use the profiling-before-
+ prologue variant. If so, sorry. */
+ if (crtl->profile && flag_fentry != 0)
+ sorry ("ms_hook_prologue attribute isn%'t compatible "
+ "with %<-mfentry%> for 32-bit");
-/* Output a funtion with a call and return thunk for indirect branch.
- If REGNO != INVALID_REGNUM, the function address is in REGNO.
- Otherwise, the function address is on the top of stack. Thunk is
- used for function return if RET_P is true. */
+ /* In ix86_asm_output_function_label we emitted:
+ 8b ff movl.s %edi,%edi
+ 55 push %ebp
+ 8b ec movl.s %esp,%ebp
-static void
-output_indirect_thunk_function (enum indirect_thunk_prefix need_prefix,
- unsigned int regno, bool ret_p)
-{
- char name[32];
- tree decl;
+ This matches the hookable function prologue in Win32 API
+ functions in Microsoft Windows XP Service Pack 2 and newer.
+ Wine uses this to enable Windows apps to hook the Win32 API
+ functions provided by Wine.
- /* Create __x86_indirect_thunk. */
- indirect_thunk_name (name, regno, need_prefix, ret_p);
- decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
- get_identifier (name),
- build_function_type_list (void_type_node, NULL_TREE));
- DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
- NULL_TREE, void_type_node);
- TREE_PUBLIC (decl) = 1;
- TREE_STATIC (decl) = 1;
- DECL_IGNORED_P (decl) = 1;
+ What that means is that we've already set up the frame pointer. */
-#if TARGET_MACHO
- if (TARGET_MACHO)
- {
- switch_to_section (darwin_sections[picbase_thunk_section]);
- fputs ("\t.weak_definition\t", asm_out_file);
- assemble_name (asm_out_file, name);
- fputs ("\n\t.private_extern\t", asm_out_file);
- assemble_name (asm_out_file, name);
- putc ('\n', asm_out_file);
- ASM_OUTPUT_LABEL (asm_out_file, name);
- DECL_WEAK (decl) = 1;
- }
- else
-#endif
- if (USE_HIDDEN_LINKONCE)
- {
- cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
+ if (frame_pointer_needed
+ && !(crtl->drap_reg && crtl->stack_realign_needed))
+ {
+ rtx push, mov;
- targetm.asm_out.unique_section (decl, 0);
- switch_to_section (get_named_section (decl, NULL, 0));
+ /* We've decided to use the frame pointer already set up.
+ Describe this to the unwinder by pretending that both
+ push and mov insns happen right here.
- targetm.asm_out.globalize_label (asm_out_file, name);
- fputs ("\t.hidden\t", asm_out_file);
- assemble_name (asm_out_file, name);
- putc ('\n', asm_out_file);
- ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
- }
- else
- {
- switch_to_section (text_section);
- ASM_OUTPUT_LABEL (asm_out_file, name);
- }
+ Putting the unwind info here at the end of the ms_hook
+ is done so that we can make absolutely certain we get
+ the required byte sequence at the start of the function,
+ rather than relying on an assembler that can produce
+ the exact encoding required.
- DECL_INITIAL (decl) = make_node (BLOCK);
- current_function_decl = decl;
- allocate_struct_function (decl, false);
- init_function_start (decl);
- /* We're about to hide the function body from callees of final_* by
- emitting it directly; tell them we're a thunk, if they care. */
- cfun->is_thunk = true;
- first_function_block_is_cold = false;
- /* Make sure unwind info is emitted for the thunk if needed. */
- final_start_function (emit_barrier (), asm_out_file, 1);
+ However it does mean (in the unpatched case) that we have
+ a 1 insn window where the asynchronous unwind info is
+ incorrect. However, if we placed the unwind info at
+ its correct location we would have incorrect unwind info
+ in the patched case. Which is probably all moot since
+ I don't expect Wine generates dwarf2 unwind info for the
+ system libraries that use this feature. */
- output_indirect_thunk (regno);
+ insn = emit_insn (gen_blockage ());
- final_end_function ();
- init_insn_lengths ();
- free_after_compilation (cfun);
- set_cfun (NULL);
- current_function_decl = NULL;
-}
+ push = gen_push (hard_frame_pointer_rtx);
+ mov = gen_rtx_SET (hard_frame_pointer_rtx,
+ stack_pointer_rtx);
+ RTX_FRAME_RELATED_P (push) = 1;
+ RTX_FRAME_RELATED_P (mov) = 1;
-static int pic_labels_used;
+ RTX_FRAME_RELATED_P (insn) = 1;
+ add_reg_note (insn, REG_FRAME_RELATED_EXPR,
+ gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
-/* Fills in the label name that should be used for a pc thunk for
- the given register. */
+ /* Note that gen_push incremented m->fs.cfa_offset, even
+ though we didn't emit the push insn here. */
+ m->fs.cfa_reg = hard_frame_pointer_rtx;
+ m->fs.fp_offset = m->fs.cfa_offset;
+ m->fs.fp_valid = true;
+ }
+ else
+ {
+ /* The frame pointer is not needed so pop %ebp again.
+ This leaves us with a pristine state. */
+ emit_insn (gen_pop (hard_frame_pointer_rtx));
+ }
+ }
-static void
-get_pc_thunk_name (char name[32], unsigned int regno)
-{
- gcc_assert (!TARGET_64BIT);
+ /* The first insn of a function that accepts its static chain on the
+ stack is to push the register that would be filled in by a direct
+ call. This insn will be skipped by the trampoline. */
+ else if (ix86_static_chain_on_stack)
+ {
+ static_chain = ix86_static_chain (cfun->decl, false);
+ insn = emit_insn (gen_push (static_chain));
+ emit_insn (gen_blockage ());
- if (USE_HIDDEN_LINKONCE)
- sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
- else
- ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
-}
+ /* We don't want to interpret this push insn as a register save,
+ only as a stack adjustment. The real copy of the register as
+ a save will be done later, if needed. */
+ t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
+ t = gen_rtx_SET (stack_pointer_rtx, t);
+ add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
+ RTX_FRAME_RELATED_P (insn) = 1;
+ }
+ /* Emit prologue code to adjust stack alignment and set up DRAP, in
+ case DRAP is needed and stack realignment is really needed after
+ reload. */
+ if (stack_realign_drap)
+ {
+ int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
-/* This function generates code for -fpic that loads %ebx with
- the return address of the caller and then returns. */
+ /* Can't use DRAP in interrupt function. */
+ if (cfun->machine->func_type != TYPE_NORMAL)
+ sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
+ "in interrupt service routine. This may be worked "
+ "around by avoiding functions with aggregate return.");
-static void
-ix86_code_end (void)
-{
- rtx xops[2];
- unsigned int regno;
+ /* Only need to push parameter pointer reg if it is caller saved. */
+ if (!call_used_regs[REGNO (crtl->drap_reg)])
+ {
+ /* Push arg pointer reg */
+ insn = emit_insn (gen_push (crtl->drap_reg));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ }
- if (indirect_return_needed)
- output_indirect_thunk_function (indirect_thunk_prefix_none,
- INVALID_REGNUM, true);
- if (indirect_return_via_cx)
- output_indirect_thunk_function (indirect_thunk_prefix_none,
- CX_REG, true);
- if (indirect_thunk_needed)
- output_indirect_thunk_function (indirect_thunk_prefix_none,
- INVALID_REGNUM, false);
+ /* Grab the argument pointer. */
+ t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
+ insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ m->fs.cfa_reg = crtl->drap_reg;
+ m->fs.cfa_offset = 0;
- for (regno = FIRST_REX_INT_REG; regno <= LAST_REX_INT_REG; regno++)
- {
- unsigned int i = regno - FIRST_REX_INT_REG + LAST_INT_REG + 1;
- if ((indirect_thunks_used & (1 << i)))
- output_indirect_thunk_function (indirect_thunk_prefix_none,
- regno, false);
- }
+ /* Align the stack. */
+ insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
+ stack_pointer_rtx,
+ GEN_INT (-align_bytes)));
+ RTX_FRAME_RELATED_P (insn) = 1;
- for (regno = FIRST_INT_REG; regno <= LAST_INT_REG; regno++)
- {
- char name[32];
- tree decl;
+ /* Replicate the return address on the stack so that return
+ address can be reached via (argp - 1) slot. This is needed
+ to implement macro RETURN_ADDR_RTX and intrinsic function
+ expand_builtin_return_addr etc. */
+ t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
+ t = gen_frame_mem (word_mode, t);
+ insn = emit_insn (gen_push (t));
+ RTX_FRAME_RELATED_P (insn) = 1;
- if ((indirect_thunks_used & (1 << regno)))
- output_indirect_thunk_function (indirect_thunk_prefix_none,
- regno, false);
+ /* For the purposes of frame and register save area addressing,
+ we've started over with a new frame. */
+ m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
+ m->fs.realigned = true;
- if (!(pic_labels_used & (1 << regno)))
- continue;
+ if (static_chain)
+ {
+ /* Replicate static chain on the stack so that static chain
+ can be reached via (argp - 2) slot. This is needed for
+ nested function with stack realignment. */
+ insn = emit_insn (gen_push (static_chain));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ }
+ }
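+ /* To summarize the DRAP setup just emitted: crtl->drap_reg now plays
+ the role of the argument pointer for the incoming frame, the stack
+ pointer has been aligned with an AND, and the pushes above place
+ copies of the return address and (when it was passed on the stack)
+ the static chain where the (argp - 1) and (argp - 2) slots of the
+ realigned frame expect to find them. */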
- get_pc_thunk_name (name, regno);
+ int_registers_saved = (frame.nregs == 0);
+ sse_registers_saved = (frame.nsseregs == 0);
+ save_stub_call_needed = (m->call_ms2sysv);
+ gcc_assert (sse_registers_saved || !save_stub_call_needed);
- decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
- get_identifier (name),
- build_function_type_list (void_type_node, NULL_TREE));
- DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
- NULL_TREE, void_type_node);
- TREE_PUBLIC (decl) = 1;
- TREE_STATIC (decl) = 1;
- DECL_IGNORED_P (decl) = 1;
+ if (frame_pointer_needed && !m->fs.fp_valid)
+ {
+ /* Note: AT&T enter does NOT have reversed args. Enter is probably
+ slower on all targets. Also sdb didn't like it. */
+ insn = emit_insn (gen_push (hard_frame_pointer_rtx));
+ RTX_FRAME_RELATED_P (insn) = 1;
-#if TARGET_MACHO
- if (TARGET_MACHO)
+ /* Push registers now, before setting the frame pointer
+ on SEH target. */
+ if (!int_registers_saved
+ && TARGET_SEH
+ && !frame.save_regs_using_mov)
{
- switch_to_section (darwin_sections[picbase_thunk_section]);
- fputs ("\t.weak_definition\t", asm_out_file);
- assemble_name (asm_out_file, name);
- fputs ("\n\t.private_extern\t", asm_out_file);
- assemble_name (asm_out_file, name);
- putc ('\n', asm_out_file);
- ASM_OUTPUT_LABEL (asm_out_file, name);
- DECL_WEAK (decl) = 1;
+ ix86_emit_save_regs ();
+ int_registers_saved = true;
+ gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
}
- else
-#endif
- if (USE_HIDDEN_LINKONCE)
- {
- cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
- targetm.asm_out.unique_section (decl, 0);
- switch_to_section (get_named_section (decl, NULL, 0));
+ if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
+ {
+ insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
+ RTX_FRAME_RELATED_P (insn) = 1;
- targetm.asm_out.globalize_label (asm_out_file, name);
- fputs ("\t.hidden\t", asm_out_file);
- assemble_name (asm_out_file, name);
- putc ('\n', asm_out_file);
- ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
+ if (m->fs.cfa_reg == stack_pointer_rtx)
+ m->fs.cfa_reg = hard_frame_pointer_rtx;
+ m->fs.fp_offset = m->fs.sp_offset;
+ m->fs.fp_valid = true;
}
- else
+ }
+
+ if (!int_registers_saved)
+ {
+ /* If saving registers via PUSH, do so now. */
+ if (!frame.save_regs_using_mov)
{
- switch_to_section (text_section);
- ASM_OUTPUT_LABEL (asm_out_file, name);
+ ix86_emit_save_regs ();
+ int_registers_saved = true;
+ gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
}
- DECL_INITIAL (decl) = make_node (BLOCK);
- current_function_decl = decl;
- allocate_struct_function (decl, false);
- init_function_start (decl);
- /* We're about to hide the function body from callees of final_* by
- emitting it directly; tell them we're a thunk, if they care. */
- cfun->is_thunk = true;
- first_function_block_is_cold = false;
- /* Make sure unwind info is emitted for the thunk if needed. */
- final_start_function (emit_barrier (), asm_out_file, 1);
-
- /* Pad stack IP move with 4 instructions (two NOPs count
- as one instruction). */
- if (TARGET_PAD_SHORT_FUNCTION)
+ /* When using the red zone we may start saving registers before
+ allocating the stack frame, saving one cycle of the prologue.
+ However, avoid doing this if we have to probe the stack; at least
+ on x86_64 the stack probe can turn into a call that clobbers a
+ red zone location. */
+ else if (ix86_using_red_zone ()
+ && (! TARGET_STACK_PROBE
+ || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
{
- int i = 8;
-
- while (i--)
- fputs ("\tnop\n", asm_out_file);
+ ix86_emit_save_regs_using_mov (frame.reg_save_offset);
+ int_registers_saved = true;
}
-
- xops[0] = gen_rtx_REG (Pmode, regno);
- xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
- output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
- output_asm_insn ("%!ret", NULL);
- final_end_function ();
- init_insn_lengths ();
- free_after_compilation (cfun);
- set_cfun (NULL);
- current_function_decl = NULL;
}
- if (flag_split_stack)
- file_end_indicate_split_stack ();
-}
+ if (stack_realign_fp)
+ {
+ int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
+ gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
-/* Emit code for the SET_GOT patterns. */
+ /* Record last valid frame pointer offset. */
+ m->fs.sp_realigned_fp_last = frame.reg_save_offset;
-const char *
-output_set_got (rtx dest, rtx label)
-{
- rtx xops[3];
+ /* The computation of the size of the re-aligned stack frame means
+ that we must allocate the size of the register save area before
+ performing the actual alignment. Otherwise we cannot guarantee
+ that there's enough storage above the realignment point. */
+ allocate = frame.reg_save_offset - m->fs.sp_offset
+ + frame.stack_realign_allocate;
+ if (allocate)
+ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+ GEN_INT (-allocate), -1, false);
- xops[0] = dest;
+ /* Align the stack. */
+ insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
+ stack_pointer_rtx,
+ GEN_INT (-align_bytes)));
+ m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
+ m->fs.sp_realigned_offset = m->fs.sp_offset
+ - frame.stack_realign_allocate;
+ /* The stack pointer may no longer be equal to CFA - m->fs.sp_offset.
+ Beyond this point, stack access should be done via choose_baseaddr or
+ by using sp_valid_at and fp_valid_at to determine the correct base
+ register. Henceforth, any CFA offset should be thought of as logical
+ and not physical. */
+ gcc_assert (m->fs.sp_realigned_offset >= m->fs.sp_realigned_fp_last);
+ gcc_assert (m->fs.sp_realigned_offset == frame.stack_realign_offset);
+ m->fs.sp_realigned = true;
- if (TARGET_VXWORKS_RTP && flag_pic)
- {
- /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
- xops[2] = gen_rtx_MEM (Pmode,
- gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
- output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
+ /* SEH unwind emit doesn't currently support REG_CFA_EXPRESSION, which
+ is needed to describe where a register is saved using a realigned
+ stack pointer, so we need to invalidate the stack pointer for that
+ target. */
+ if (TARGET_SEH)
+ m->fs.sp_valid = false;
- /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
- Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
- an unadorned address. */
- xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
- SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
- output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
- return "";
+ /* If SP offset is non-immediate after allocation of the stack frame,
+ then emit SSE saves or stub call prior to allocating the rest of the
+ stack frame. This is less efficient for the out-of-line stub because
+ we can't combine allocations across the call barrier, but it's better
+ than using a scratch register. */
+ else if (!x86_64_immediate_operand (GEN_INT (frame.stack_pointer_offset
+ - m->fs.sp_realigned_offset),
+ Pmode))
+ {
+ if (!sse_registers_saved)
+ {
+ ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
+ sse_registers_saved = true;
+ }
+ else if (save_stub_call_needed)
+ {
+ ix86_emit_outlined_ms2sysv_save (frame);
+ save_stub_call_needed = false;
+ }
+ }
}
- xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
+ allocate = frame.stack_pointer_offset - m->fs.sp_offset;
- if (flag_pic)
+ if (flag_stack_usage_info)
{
- char name[32];
- get_pc_thunk_name (name, REGNO (dest));
- pic_labels_used |= 1 << REGNO (dest);
+ /* We start to count from ARG_POINTER. */
+ HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
- xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
- xops[2] = gen_rtx_MEM (QImode, xops[2]);
- output_asm_insn ("%!call\t%X2", xops);
+ /* If it was realigned, take into account the fake frame. */
+ if (stack_realign_drap)
+ {
+ if (ix86_static_chain_on_stack)
+ stack_size += UNITS_PER_WORD;
-#if TARGET_MACHO
- /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
- This is what will be referenced by the Mach-O PIC subsystem. */
- if (machopic_should_output_picbase_label () || !label)
- ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
+ if (!call_used_regs[REGNO (crtl->drap_reg)])
+ stack_size += UNITS_PER_WORD;
- /* When we are restoring the pic base at the site of a nonlocal label,
- and we decided to emit the pic base above, we will still output a
- local label used for calculating the correction offset (even though
- the offset will be 0 in that case). */
- if (label)
- targetm.asm_out.internal_label (asm_out_file, "L",
- CODE_LABEL_NUMBER (label));
-#endif
- }
- else
- {
- if (TARGET_MACHO)
- /* We don't need a pic base, we're not producing pic. */
- gcc_unreachable ();
+ /* This over-estimates by 1 minimal-stack-alignment-unit but
+ mitigates that by counting in the new return address slot. */
+ current_function_dynamic_stack_size
+ += crtl->stack_alignment_needed / BITS_PER_UNIT;
+ }
- xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
- output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
- targetm.asm_out.internal_label (asm_out_file, "L",
- CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
+ current_function_static_stack_size = stack_size;
}
- if (!TARGET_MACHO)
- output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
-
- return "";
-}
-
-/* Generate an "push" pattern for input ARG. */
-
-static rtx
-gen_push (rtx arg)
-{
- struct machine_function *m = cfun->machine;
-
- if (m->fs.cfa_reg == stack_pointer_rtx)
- m->fs.cfa_offset += UNITS_PER_WORD;
- m->fs.sp_offset += UNITS_PER_WORD;
-
- if (REG_P (arg) && GET_MODE (arg) != word_mode)
- arg = gen_rtx_REG (word_mode, REGNO (arg));
-
- return gen_rtx_SET (gen_rtx_MEM (word_mode,
- gen_rtx_PRE_DEC (Pmode,
- stack_pointer_rtx)),
- arg);
-}
+ /* On SEH target with very large frame size, allocate an area to save
+ SSE registers (as the very large allocation won't be described). */
+ if (TARGET_SEH
+ && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
+ && !sse_registers_saved)
+ {
+ HOST_WIDE_INT sse_size
+ = frame.sse_reg_save_offset - frame.reg_save_offset;
-/* Generate an "pop" pattern for input ARG. */
+ gcc_assert (int_registers_saved);
-static rtx
-gen_pop (rtx arg)
-{
- if (REG_P (arg) && GET_MODE (arg) != word_mode)
- arg = gen_rtx_REG (word_mode, REGNO (arg));
+ /* No need to do stack checking as the area will be immediately
+ written. */
+ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+ GEN_INT (-sse_size), -1,
+ m->fs.cfa_reg == stack_pointer_rtx);
+ allocate -= sse_size;
+ ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
+ sse_registers_saved = true;
+ }
- return gen_rtx_SET (arg,
- gen_rtx_MEM (word_mode,
- gen_rtx_POST_INC (Pmode,
- stack_pointer_rtx)));
-}
+ /* The stack has already been decremented by the instruction calling us
+ so probe if the size is non-negative to preserve the protection area. */
+ if (allocate >= 0
+ && (flag_stack_check == STATIC_BUILTIN_STACK_CHECK
+ || flag_stack_clash_protection))
+ {
+ if (flag_stack_clash_protection)
+ {
+ ix86_adjust_stack_and_probe_stack_clash (allocate,
+ int_registers_saved);
+ allocate = 0;
+ }
+ else if (STACK_CHECK_MOVING_SP)
+ {
+ if (!(crtl->is_leaf && !cfun->calls_alloca
+ && allocate <= get_probe_interval ()))
+ {
+ ix86_adjust_stack_and_probe (allocate, int_registers_saved);
+ allocate = 0;
+ }
+ }
+ else
+ {
+ HOST_WIDE_INT size = allocate;
-/* Return >= 0 if there is an unused call-clobbered register available
- for the entire function. */
+ if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
+ size = 0x80000000 - get_stack_check_protect () - 1;
-static unsigned int
-ix86_select_alt_pic_regnum (void)
-{
- if (ix86_use_pseudo_pic_reg ())
- return INVALID_REGNUM;
+ if (TARGET_STACK_PROBE)
+ {
+ if (crtl->is_leaf && !cfun->calls_alloca)
+ {
+ if (size > get_probe_interval ())
+ ix86_emit_probe_stack_range (0, size, int_registers_saved);
+ }
+ else
+ ix86_emit_probe_stack_range (0,
+ size + get_stack_check_protect (),
+ int_registers_saved);
+ }
+ else
+ {
+ if (crtl->is_leaf && !cfun->calls_alloca)
+ {
+ if (size > get_probe_interval ()
+ && size > get_stack_check_protect ())
+ ix86_emit_probe_stack_range (get_stack_check_protect (),
+ (size
+ - get_stack_check_protect ()),
+ int_registers_saved);
+ }
+ else
+ ix86_emit_probe_stack_range (get_stack_check_protect (), size,
+ int_registers_saved);
+ }
+ }
+ }
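+ /* To recap the three probing strategies above: with
+ -fstack-clash-protection the allocation and the probing are
+ combined (and ALLOCATE drops to zero); with STACK_CHECK_MOVING_SP
+ the same is done unless a small leaf frame makes it unnecessary;
+ otherwise a range of the would-be frame is probed without moving
+ the stack pointer, and the allocation itself still happens
+ below. */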
- if (crtl->is_leaf
- && !crtl->profile
- && !ix86_current_function_calls_tls_descriptor)
+ if (allocate == 0)
+ ;
+ else if (!ix86_target_stack_probe ()
+ || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
{
- int i, drap;
- /* Can't use the same register for both PIC and DRAP. */
- if (crtl->drap_reg)
- drap = REGNO (crtl->drap_reg);
- else
- drap = -1;
- for (i = 2; i >= 0; --i)
- if (i != drap && !df_regs_ever_live_p (i))
- return i;
+ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+ GEN_INT (-allocate), -1,
+ m->fs.cfa_reg == stack_pointer_rtx);
}
+ else
+ {
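+ /* Illustrative sketch (the exact worker symbol and its probing
+ behaviour are target-dependent): the code below emits roughly
+
+ mov $allocate, %rax
+ call <stack allocation / probing worker>
+ sub %rax, %rsp
+
+ on 64-bit targets, pushing %rax and/or %r10 beforehand and
+ reloading them from the new frame afterwards when they are
+ live at function entry. */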
+ rtx eax = gen_rtx_REG (Pmode, AX_REG);
+ rtx r10 = NULL;
+ rtx (*adjust_stack_insn)(rtx, rtx, rtx);
+ const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
+ bool eax_live = ix86_eax_live_at_start_p ();
+ bool r10_live = false;
- return INVALID_REGNUM;
-}
+ if (TARGET_64BIT)
+ r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
-/* Return true if REGNO is used by the epilogue. */
+ if (eax_live)
+ {
+ insn = emit_insn (gen_push (eax));
+ allocate -= UNITS_PER_WORD;
+ /* Note that SEH directives need to continue tracking the stack
+ pointer even after the frame pointer has been set up. */
+ if (sp_is_cfa_reg || TARGET_SEH)
+ {
+ if (sp_is_cfa_reg)
+ m->fs.cfa_offset += UNITS_PER_WORD;
+ RTX_FRAME_RELATED_P (insn) = 1;
+ add_reg_note (insn, REG_FRAME_RELATED_EXPR,
+ gen_rtx_SET (stack_pointer_rtx,
+ plus_constant (Pmode, stack_pointer_rtx,
+ -UNITS_PER_WORD)));
+ }
+ }
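+ /* %eax/%rax is pushed above only because it is live at function
+ entry (ix86_eax_live_at_start_p) and is about to be clobbered
+ with the allocation size; its incoming value is reloaded from
+ the newly allocated frame further below. The note therefore
+ describes a pure stack adjustment, not a register save. */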
-bool
-ix86_epilogue_uses (int regno)
-{
- /* If there are no caller-saved registers, we preserve all registers,
- except for MMX and x87 registers which aren't supported when saving
- and restoring registers. Don't explicitly save SP register since
- it is always preserved. */
- return (epilogue_completed
- && cfun->machine->no_caller_saved_registers
- && !fixed_regs[regno]
- && !STACK_REGNO_P (regno)
- && !MMX_REGNO_P (regno));
-}
+ if (r10_live)
+ {
+ r10 = gen_rtx_REG (Pmode, R10_REG);
+ insn = emit_insn (gen_push (r10));
+ allocate -= UNITS_PER_WORD;
+ if (sp_is_cfa_reg || TARGET_SEH)
+ {
+ if (sp_is_cfa_reg)
+ m->fs.cfa_offset += UNITS_PER_WORD;
+ RTX_FRAME_RELATED_P (insn) = 1;
+ add_reg_note (insn, REG_FRAME_RELATED_EXPR,
+ gen_rtx_SET (stack_pointer_rtx,
+ plus_constant (Pmode, stack_pointer_rtx,
+ -UNITS_PER_WORD)));
+ }
+ }
-/* Return nonzero if register REGNO can be used as a scratch register
- in peephole2. */
+ emit_move_insn (eax, GEN_INT (allocate));
+ emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
-static bool
-ix86_hard_regno_scratch_ok (unsigned int regno)
-{
- /* If there are no caller-saved registers, we can't use any register
- as a scratch register after epilogue and use REGNO as scratch
- register only if it has been used before to avoid saving and
- restoring it. */
- return (!cfun->machine->no_caller_saved_registers
- || (!epilogue_completed
- && df_regs_ever_live_p (regno)));
-}
+ /* Use the fact that AX still contains ALLOCATE. */
+ adjust_stack_insn = (Pmode == DImode
+ ? gen_pro_epilogue_adjust_stack_di_sub
+ : gen_pro_epilogue_adjust_stack_si_sub);
-/* Return TRUE if we need to save REGNO. */
+ insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
+ stack_pointer_rtx, eax));
-static bool
-ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined)
-{
- /* If there are no caller-saved registers, we preserve all registers,
- except for MMX and x87 registers which aren't supported when saving
- and restoring registers. Don't explicitly save SP register since
- it is always preserved. */
- if (cfun->machine->no_caller_saved_registers)
- {
- /* Don't preserve registers used for function return value. */
- rtx reg = crtl->return_rtx;
- if (reg)
+ if (sp_is_cfa_reg || TARGET_SEH)
{
- unsigned int i = REGNO (reg);
- unsigned int nregs = REG_NREGS (reg);
- while (nregs-- > 0)
- if ((i + nregs) == regno)
- return false;
+ if (sp_is_cfa_reg)
+ m->fs.cfa_offset += allocate;
+ RTX_FRAME_RELATED_P (insn) = 1;
+ add_reg_note (insn, REG_FRAME_RELATED_EXPR,
+ gen_rtx_SET (stack_pointer_rtx,
+ plus_constant (Pmode, stack_pointer_rtx,
+ -allocate)));
}
+ m->fs.sp_offset += allocate;
- return (df_regs_ever_live_p (regno)
- && !fixed_regs[regno]
- && !STACK_REGNO_P (regno)
- && !MMX_REGNO_P (regno)
- && (regno != HARD_FRAME_POINTER_REGNUM
- || !frame_pointer_needed));
+ /* Use stack_pointer_rtx for relative addressing so that the code works
+ for a realigned stack. But this means we need a blockage to prevent
+ stores based on the frame pointer from being scheduled before this
+ point. */
+ if (r10_live && eax_live)
+ {
+ t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
+ emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
+ gen_frame_mem (word_mode, t));
+ t = plus_constant (Pmode, t, UNITS_PER_WORD);
+ emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
+ gen_frame_mem (word_mode, t));
+ emit_insn (gen_memory_blockage ());
+ }
+ else if (eax_live || r10_live)
+ {
+ t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
+ emit_move_insn (gen_rtx_REG (word_mode,
+ (eax_live ? AX_REG : R10_REG)),
+ gen_frame_mem (word_mode, t));
+ emit_insn (gen_memory_blockage ());
+ }
}
+ gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
- if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
- && pic_offset_table_rtx)
+ /* If we haven't already set up the frame pointer, do so now. */
+ if (frame_pointer_needed && !m->fs.fp_valid)
{
- if (ix86_use_pseudo_pic_reg ())
- {
- /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to
- _mcount in prologue. */
- if (!TARGET_64BIT && flag_pic && crtl->profile)
- return true;
- }
- else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
- || crtl->profile
- || crtl->calls_eh_return
- || crtl->uses_const_pool
- || cfun->has_nonlocal_label)
- return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
+ insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
+ GEN_INT (frame.stack_pointer_offset
+ - frame.hard_frame_pointer_offset));
+ insn = emit_insn (insn);
+ RTX_FRAME_RELATED_P (insn) = 1;
+ add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
+
+ if (m->fs.cfa_reg == stack_pointer_rtx)
+ m->fs.cfa_reg = hard_frame_pointer_rtx;
+ m->fs.fp_offset = frame.hard_frame_pointer_offset;
+ m->fs.fp_valid = true;
}
- if (crtl->calls_eh_return && maybe_eh_return)
+ if (!int_registers_saved)
+ ix86_emit_save_regs_using_mov (frame.reg_save_offset);
+ if (!sse_registers_saved)
+ ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
+ else if (save_stub_call_needed)
+ ix86_emit_outlined_ms2sysv_save (frame);
+
+ /* For mcount profiling in 32-bit PIC mode we need to emit SET_GOT
+ in the prologue. */
+ if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
{
- unsigned i;
- for (i = 0; ; i++)
- {
- unsigned test = EH_RETURN_DATA_REGNO (i);
- if (test == INVALID_REGNUM)
- break;
- if (test == regno)
- return true;
- }
+ rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
+ insn = emit_insn (gen_set_got (pic));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
+ emit_insn (gen_prologue_use (pic));
+ /* Delete an already emitted SET_GOT if it exists and was allocated
+ to REAL_PIC_OFFSET_TABLE_REGNUM. */
+ ix86_elim_entry_set_got (pic);
}
- if (ignore_outlined && cfun->machine->call_ms2sysv)
+ if (crtl->drap_reg && !crtl->stack_realign_needed)
{
- unsigned count = cfun->machine->call_ms2sysv_extra_regs
- + xlogue_layout::MIN_REGS;
- if (xlogue_layout::is_stub_managed_reg (regno, count))
- return false;
+ /* vDRAP is set up, but after reload it turns out stack realignment
+ isn't necessary; emit prologue code here to set up DRAP without
+ the stack realignment adjustment. */
+ t = choose_baseaddr (0, NULL);
+ emit_insn (gen_rtx_SET (crtl->drap_reg, t));
}
- if (crtl->drap_reg
- && regno == REGNO (crtl->drap_reg)
- && !cfun->machine->no_drap_save_restore)
- return true;
+ /* Prevent instructions from being scheduled into register save push
+ sequence when access to the redzone area is done through frame pointer.
+ The offset between the frame pointer and the stack pointer is calculated
+ relative to the value of the stack pointer at the end of the function
+ prologue, and moving instructions that access redzone area via frame
+ pointer inside push sequence violates this assumption. */
+ if (frame_pointer_needed && frame.red_zone_size)
+ emit_insn (gen_memory_blockage ());
- return (df_regs_ever_live_p (regno)
- && !call_used_regs[regno]
- && !fixed_regs[regno]
- && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
+ /* SEH requires that the prologue end within 256 bytes of the start of
+ the function. Prevent instruction schedules that would extend that.
+ Further, prevent alloca modifications to the stack pointer from being
+ combined with prologue modifications. */
+ if (TARGET_SEH)
+ emit_insn (gen_prologue_use (stack_pointer_rtx));
}
-/* Return number of saved general prupose registers. */
+/* Emit code to restore REG using a POP insn. */
-static int
-ix86_nsaved_regs (void)
+static void
+ix86_emit_restore_reg_using_pop (rtx reg)
{
- int nregs = 0;
- int regno;
+ struct machine_function *m = cfun->machine;
+ rtx_insn *insn = emit_insn (gen_pop (reg));
- for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
- if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
- nregs ++;
- return nregs;
-}
+ ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
+ m->fs.sp_offset -= UNITS_PER_WORD;
-/* Return number of saved SSE registers. */
+ if (m->fs.cfa_reg == crtl->drap_reg
+ && REGNO (reg) == REGNO (crtl->drap_reg))
+ {
+ /* Previously we'd represented the CFA as an expression
+ like *(%ebp - 8). We've just popped that value from
+ the stack, which means we need to reset the CFA to
+ the drap register. This will remain until we restore
+ the stack pointer. */
+ add_reg_note (insn, REG_CFA_DEF_CFA, reg);
+ RTX_FRAME_RELATED_P (insn) = 1;
-static int
-ix86_nsaved_sseregs (void)
-{
- int nregs = 0;
- int regno;
+ /* This means that the DRAP register is valid for addressing too. */
+ m->fs.drap_valid = true;
+ return;
+ }
- if (!TARGET_64BIT_MS_ABI)
- return 0;
- for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
- if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
- nregs ++;
- return nregs;
+ if (m->fs.cfa_reg == stack_pointer_rtx)
+ {
+ rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
+ x = gen_rtx_SET (stack_pointer_rtx, x);
+ add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
+ RTX_FRAME_RELATED_P (insn) = 1;
+
+ m->fs.cfa_offset -= UNITS_PER_WORD;
+ }
+
+ /* When the frame pointer is the CFA, and we pop it, we are
+ swapping back to the stack pointer as the CFA. This happens
+ for stack frames that don't allocate other data, so we assume
+ the stack pointer is now pointing at the return address, i.e.
+ the function entry state, which makes the offset be 1 word. */
+ if (reg == hard_frame_pointer_rtx)
+ {
+ m->fs.fp_valid = false;
+ if (m->fs.cfa_reg == hard_frame_pointer_rtx)
+ {
+ m->fs.cfa_reg = stack_pointer_rtx;
+ m->fs.cfa_offset -= UNITS_PER_WORD;
+
+ add_reg_note (insn, REG_CFA_DEF_CFA,
+ gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+ GEN_INT (m->fs.cfa_offset)));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ }
+ }
}
-/* Given FROM and TO register numbers, say whether this elimination is
- allowed. If stack alignment is needed, we can only replace argument
- pointer with hard frame pointer, or replace frame pointer with stack
- pointer. Otherwise, frame pointer elimination is automatically
- handled and all other eliminations are valid. */
+/* Emit code to restore saved registers using POP insns. */
-static bool
-ix86_can_eliminate (const int from, const int to)
+static void
+ix86_emit_restore_regs_using_pop (void)
{
- if (stack_realign_fp)
- return ((from == ARG_POINTER_REGNUM
- && to == HARD_FRAME_POINTER_REGNUM)
- || (from == FRAME_POINTER_REGNUM
- && to == STACK_POINTER_REGNUM));
- else
- return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
+ unsigned int regno;
+
+ for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true))
+ ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
}
-/* Return the offset between two registers, one to be eliminated, and the other
- its replacement, at the start of a routine. */
+/* Emit code and notes for the LEAVE instruction. If INSN is non-null,
+ skip emitting the insn and only attach the notes to it. */
-HOST_WIDE_INT
-ix86_initial_elimination_offset (int from, int to)
+static void
+ix86_emit_leave (rtx_insn *insn)
{
- struct ix86_frame &frame = cfun->machine->frame;
+ struct machine_function *m = cfun->machine;
+ if (!insn)
+ insn = emit_insn (ix86_gen_leave ());
- if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
- return frame.hard_frame_pointer_offset;
- else if (from == FRAME_POINTER_REGNUM
- && to == HARD_FRAME_POINTER_REGNUM)
- return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
- else
- {
- gcc_assert (to == STACK_POINTER_REGNUM);
+ ix86_add_queued_cfa_restore_notes (insn);
- if (from == ARG_POINTER_REGNUM)
- return frame.stack_pointer_offset;
+ gcc_assert (m->fs.fp_valid);
+ m->fs.sp_valid = true;
+ m->fs.sp_realigned = false;
+ m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
+ m->fs.fp_valid = false;
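+ /* These updates mirror the effect of LEAVE itself: it copies the
+ frame pointer into the stack pointer and then pops the saved
+ frame pointer, leaving the stack pointer one word above where
+ the frame pointer pointed and the frame pointer no longer valid
+ for addressing. */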
- gcc_assert (from == FRAME_POINTER_REGNUM);
- return frame.stack_pointer_offset - frame.frame_pointer_offset;
+ if (m->fs.cfa_reg == hard_frame_pointer_rtx)
+ {
+ m->fs.cfa_reg = stack_pointer_rtx;
+ m->fs.cfa_offset = m->fs.sp_offset;
+
+ add_reg_note (insn, REG_CFA_DEF_CFA,
+ plus_constant (Pmode, stack_pointer_rtx,
+ m->fs.sp_offset));
+ RTX_FRAME_RELATED_P (insn) = 1;
}
+ ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
+ m->fs.fp_offset);
}
-/* In a dynamically-aligned function, we can't know the offset from
- stack pointer to frame pointer, so we must ensure that setjmp
- eliminates fp against the hard fp (%ebp) rather than trying to
- index from %esp up to the top of the frame across a gap that is
- of unknown (at compile-time) size. */
-static rtx
-ix86_builtin_setjmp_frame_value (void)
+/* Emit code to restore saved registers using MOV insns.
+ First register is restored from CFA - CFA_OFFSET. */
+static void
+ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
+ bool maybe_eh_return)
{
- return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
+ struct machine_function *m = cfun->machine;
+ unsigned int regno;
+
+ for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
+ {
+ rtx reg = gen_rtx_REG (word_mode, regno);
+ rtx mem;
+ rtx_insn *insn;
+
+ mem = choose_baseaddr (cfa_offset, NULL);
+ mem = gen_frame_mem (word_mode, mem);
+ insn = emit_move_insn (reg, mem);
+
+ if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
+ {
+ /* Previously we'd represented the CFA as an expression
+ like *(%ebp - 8). We've just reloaded that value from
+ the stack, which means we need to reset the CFA to
+ the drap register. This will remain until we restore
+ the stack pointer. */
+ add_reg_note (insn, REG_CFA_DEF_CFA, reg);
+ RTX_FRAME_RELATED_P (insn) = 1;
+
+ /* This means that the DRAP register is valid for addressing. */
+ m->fs.drap_valid = true;
+ }
+ else
+ ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
+
+ cfa_offset -= UNITS_PER_WORD;
+ }
}
-/* Emits a warning for unsupported msabi to sysv pro/epilogues. */
-static void warn_once_call_ms2sysv_xlogues (const char *feature)
+/* Emit code to restore saved SSE registers using MOV insns.
+ The first register is restored from CFA - CFA_OFFSET. */
+static void
+ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
+ bool maybe_eh_return)
{
- static bool warned_once = false;
- if (!warned_once)
- {
- warning (0, "%<-mcall-ms2sysv-xlogues%> is not compatible with %s",
- feature);
- warned_once = true;
- }
-}
+ unsigned int regno;
-/* Return the probing interval for -fstack-clash-protection. */
+ for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
+ {
+ rtx reg = gen_rtx_REG (V4SFmode, regno);
+ rtx mem;
+ unsigned int align = GET_MODE_ALIGNMENT (V4SFmode);
-static HOST_WIDE_INT
-get_probe_interval (void)
-{
- if (flag_stack_clash_protection)
- return (HOST_WIDE_INT_1U
- << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL));
- else
- return (HOST_WIDE_INT_1U << STACK_CHECK_PROBE_INTERVAL_EXP);
-}
+ mem = choose_baseaddr (cfa_offset, &align);
+ mem = gen_rtx_MEM (V4SFmode, mem);
-/* When using -fsplit-stack, the allocation routines set a field in
- the TCB to the bottom of the stack plus this much space, measured
- in bytes. */
+ /* The location alignment depends upon the base register. */
+ align = MIN (GET_MODE_ALIGNMENT (V4SFmode), align);
+ gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
+ set_mem_align (mem, align);
+ emit_insn (gen_rtx_SET (reg, mem));
-#define SPLIT_STACK_AVAILABLE 256
+ ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
-/* Fill structure ix86_frame about frame of currently computed function. */
+ cfa_offset -= GET_MODE_SIZE (V4SFmode);
+ }
+}
static void
-ix86_compute_frame_layout (void)
+ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame,
+ bool use_call, int style)
{
- struct ix86_frame *frame = &cfun->machine->frame;
struct machine_function *m = cfun->machine;
- unsigned HOST_WIDE_INT stack_alignment_needed;
- HOST_WIDE_INT offset;
- unsigned HOST_WIDE_INT preferred_alignment;
- HOST_WIDE_INT size = get_frame_size ();
- HOST_WIDE_INT to_allocate;
-
- /* m->call_ms2sysv is initially enabled in ix86_expand_call for all 64-bit
- * ms_abi functions that call a sysv function. We now need to prune away
- * cases where it should be disabled. */
- if (TARGET_64BIT && m->call_ms2sysv)
- {
- gcc_assert (TARGET_64BIT_MS_ABI);
- gcc_assert (TARGET_CALL_MS2SYSV_XLOGUES);
- gcc_assert (!TARGET_SEH);
- gcc_assert (TARGET_SSE);
- gcc_assert (!ix86_using_red_zone ());
-
- if (crtl->calls_eh_return)
- {
- gcc_assert (!reload_completed);
- m->call_ms2sysv = false;
- warn_once_call_ms2sysv_xlogues ("__builtin_eh_return");
- }
+ const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
+ + m->call_ms2sysv_extra_regs;
+ rtvec v;
+ unsigned int elems_needed, align, i, vi = 0;
+ rtx_insn *insn;
+ rtx sym, tmp;
+ rtx rsi = gen_rtx_REG (word_mode, SI_REG);
+ rtx r10 = NULL_RTX;
+ const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
+ HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset ();
+ HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset;
+ rtx rsi_frame_load = NULL_RTX;
+ HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1;
+ enum xlogue_stub stub;
- else if (ix86_static_chain_on_stack)
- {
- gcc_assert (!reload_completed);
- m->call_ms2sysv = false;
- warn_once_call_ms2sysv_xlogues ("static call chains");
- }
+ gcc_assert (!m->fs.fp_valid || frame_pointer_needed);
- /* Finally, compute which registers the stub will manage. */
- else
- {
- unsigned count = xlogue_layout::count_stub_managed_regs ();
- m->call_ms2sysv_extra_regs = count - xlogue_layout::MIN_REGS;
- m->call_ms2sysv_pad_in = 0;
- }
- }
+ /* If using a realigned stack, we should never start with padding. */
+ gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ());
- frame->nregs = ix86_nsaved_regs ();
- frame->nsseregs = ix86_nsaved_sseregs ();
+ /* Setup RSI as the stub's base pointer. */
+ align = GET_MODE_ALIGNMENT (V4SFmode);
+ tmp = choose_baseaddr (rsi_offset, &align, SI_REG);
+ gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
- /* 64-bit MS ABI seem to require stack alignment to be always 16,
- except for function prologues, leaf functions and when the defult
- incoming stack boundary is overriden at command line or via
- force_align_arg_pointer attribute.
+ emit_insn (gen_rtx_SET (rsi, tmp));
- Darwin's ABI specifies 128b alignment for both 32 and 64 bit variants
- at call sites, including profile function calls.
- */
- if (((TARGET_64BIT_MS_ABI || TARGET_MACHO)
- && crtl->preferred_stack_boundary < 128)
- && (!crtl->is_leaf || cfun->calls_alloca != 0
- || ix86_current_function_calls_tls_descriptor
- || (TARGET_MACHO && crtl->profile)
- || ix86_incoming_stack_boundary < 128))
+ /* Get a symbol for the stub. */
+ if (frame_pointer_needed)
+ stub = use_call ? XLOGUE_STUB_RESTORE_HFP
+ : XLOGUE_STUB_RESTORE_HFP_TAIL;
+ else
+ stub = use_call ? XLOGUE_STUB_RESTORE
+ : XLOGUE_STUB_RESTORE_TAIL;
+ sym = xlogue.get_stub_rtx (stub);
+
+ elems_needed = ncregs;
+ if (use_call)
+ elems_needed += 1;
+ else
+ elems_needed += frame_pointer_needed ? 5 : 3;
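+ /* The extra vector elements correspond to what is filled in below:
+ for a call, just the USE of the stub symbol; for a tail call, the
+ return, the USE, the stack pointer restore and, with a frame
+ pointer, the frame pointer reload plus a memory clobber. */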
+ v = rtvec_alloc (elems_needed);
+
+ /* We call the epilogue stub when we need to pop incoming args or when
+ a sibling call will be the actual tail call. Otherwise, we emit a
+ jmp to the epilogue stub and that jmp is the tail call. */
+ if (use_call)
+ RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
+ else
{
- crtl->preferred_stack_boundary = 128;
- crtl->stack_alignment_needed = 128;
- }
+ RTVEC_ELT (v, vi++) = ret_rtx;
+ RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
+ if (frame_pointer_needed)
+ {
+ rtx rbp = gen_rtx_REG (DImode, BP_REG);
+ gcc_assert (m->fs.fp_valid);
+ gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx);
- stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
- preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
+ tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8));
+ RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp);
+ RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp));
+ tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode));
+ RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp);
+ }
+ else
+ {
+ /* If no hard frame pointer, we set R10 to the SP restore value. */
+ gcc_assert (!m->fs.fp_valid);
+ gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
+ gcc_assert (m->fs.sp_valid);
- gcc_assert (!size || stack_alignment_needed);
- gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
- gcc_assert (preferred_alignment <= stack_alignment_needed);
+ r10 = gen_rtx_REG (DImode, R10_REG);
+ tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset));
+ emit_insn (gen_rtx_SET (r10, tmp));
- /* The only ABI saving SSE regs should be 64-bit ms_abi. */
- gcc_assert (TARGET_64BIT || !frame->nsseregs);
- if (TARGET_64BIT && m->call_ms2sysv)
- {
- gcc_assert (stack_alignment_needed >= 16);
- gcc_assert (!frame->nsseregs);
+ RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10);
+ }
}
- /* For SEH we have to limit the amount of code movement into the prologue.
- At present we do this via a BLOCKAGE, at which point there's very little
- scheduling that can be done, which means that there's very little point
- in doing anything except PUSHs. */
- if (TARGET_SEH)
- m->use_fast_prologue_epilogue = false;
- else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun)))
+ /* Generate frame load insns and restore notes. */
+ for (i = 0; i < ncregs; ++i)
{
- int count = frame->nregs;
- struct cgraph_node *node = cgraph_node::get (current_function_decl);
+ const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
+ machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode;
+ rtx reg, frame_load;
- /* The fast prologue uses move instead of push to save registers. This
- is significantly longer, but also executes faster as modern hardware
- can execute the moves in parallel, but can't do that for push/pop.
+ reg = gen_rtx_REG (mode, r.regno);
+ frame_load = gen_frame_load (reg, rsi, r.offset);
- Be careful about choosing what prologue to emit: When function takes
- many instructions to execute we may use slow version as well as in
- case function is known to be outside hot spot (this is known with
- feedback only). Weight the size of function by number of registers
- to save as it is cheap to use one or two push instructions but very
- slow to use many of them. */
- if (count)
- count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
- if (node->frequency < NODE_FREQUENCY_NORMAL
- || (flag_branch_probabilities
- && node->frequency < NODE_FREQUENCY_HOT))
- m->use_fast_prologue_epilogue = false;
+ /* Save RSI frame load insn & note to add last. */
+ if (r.regno == SI_REG)
+ {
+ gcc_assert (!rsi_frame_load);
+ rsi_frame_load = frame_load;
+ rsi_restore_offset = r.offset;
+ }
else
- m->use_fast_prologue_epilogue
- = !expensive_function_p (count);
+ {
+ RTVEC_ELT (v, vi++) = frame_load;
+ ix86_add_cfa_restore_note (NULL, reg, r.offset);
+ }
}
- frame->save_regs_using_mov
- = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue
- /* If static stack checking is enabled and done with probes,
- the registers need to be saved before allocating the frame. */
- && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
+ /* Add RSI frame load & restore note at the end. */
+ gcc_assert (rsi_frame_load);
+ gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1);
+ RTVEC_ELT (v, vi++) = rsi_frame_load;
+ ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG),
+ rsi_restore_offset);
- /* Skip return address and error code in exception handler. */
- offset = INCOMING_FRAME_SP_OFFSET;
+ /* Finally, for tail-call w/o a hard frame pointer, set SP to R10. */
+ if (!use_call && !frame_pointer_needed)
+ {
+ gcc_assert (m->fs.sp_valid);
+ gcc_assert (!m->fs.sp_realigned);
- /* Skip pushed static chain. */
- if (ix86_static_chain_on_stack)
- offset += UNITS_PER_WORD;
+ /* At this point, R10 should point to frame.stack_realign_offset. */
+ if (m->fs.cfa_reg == stack_pointer_rtx)
+ m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset;
+ m->fs.sp_offset = frame.stack_realign_offset;
+ }
- /* Skip saved base pointer. */
- if (frame_pointer_needed)
- offset += UNITS_PER_WORD;
- frame->hfp_save_offset = offset;
+ gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v));
+ tmp = gen_rtx_PARALLEL (VOIDmode, v);
+ if (use_call)
+ insn = emit_insn (tmp);
+ else
+ {
+ insn = emit_jump_insn (tmp);
+ JUMP_LABEL (insn) = ret_rtx;
- /* The traditional frame pointer location is at the top of the frame. */
- frame->hard_frame_pointer_offset = offset;
+ if (frame_pointer_needed)
+ ix86_emit_leave (insn);
+ else
+ {
+ /* Need CFA adjust note. */
+ tmp = gen_rtx_SET (stack_pointer_rtx, r10);
+ add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp);
+ }
+ }
- /* Register save area */
- offset += frame->nregs * UNITS_PER_WORD;
- frame->reg_save_offset = offset;
+ RTX_FRAME_RELATED_P (insn) = true;
+ ix86_add_queued_cfa_restore_notes (insn);
- /* On SEH target, registers are pushed just before the frame pointer
- location. */
- if (TARGET_SEH)
- frame->hard_frame_pointer_offset = offset;
+ /* If we're not doing a tail-call, we need to adjust the stack. */
+ if (use_call && m->fs.sp_valid)
+ {
+ HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset;
+ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+ GEN_INT (dealloc), style,
+ m->fs.cfa_reg == stack_pointer_rtx);
+ }
+}
- /* Calculate the size of the va-arg area (not including padding, if any). */
- frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
+/* Restore function stack, frame, and registers. */
- /* Also adjust stack_realign_offset for the largest alignment of
- stack slot actually used. */
- if (stack_realign_fp
- || (cfun->machine->max_used_stack_alignment != 0
- && (offset % cfun->machine->max_used_stack_alignment) != 0))
- {
- /* We may need a 16-byte aligned stack for the remainder of the
- register save area, but the stack frame for the local function
- may require a greater alignment if using AVX/2/512. In order
- to avoid wasting space, we first calculate the space needed for
- the rest of the register saves, add that to the stack pointer,
- and then realign the stack to the boundary of the start of the
- frame for the local function. */
- HOST_WIDE_INT space_needed = 0;
- HOST_WIDE_INT sse_reg_space_needed = 0;
+void
+ix86_expand_epilogue (int style)
+{
+ struct machine_function *m = cfun->machine;
+ struct machine_frame_state frame_state_save = m->fs;
+ bool restore_regs_via_mov;
+ bool using_drap;
+ bool restore_stub_is_tail = false;
- if (TARGET_64BIT)
- {
- if (m->call_ms2sysv)
- {
- m->call_ms2sysv_pad_in = 0;
- space_needed = xlogue_layout::get_instance ().get_stack_space_used ();
- }
+ if (ix86_function_naked (current_function_decl))
+ {
+ /* The program should not reach this point. */
+ emit_insn (gen_ud2 ());
+ return;
+ }
- else if (frame->nsseregs)
- /* The only ABI that has saved SSE registers (Win64) also has a
- 16-byte aligned default stack. However, many programs violate
- the ABI, and Wine64 forces stack realignment to compensate. */
- space_needed = frame->nsseregs * 16;
+ ix86_finalize_stack_frame_flags ();
+ const struct ix86_frame &frame = cfun->machine->frame;
- sse_reg_space_needed = space_needed = ROUND_UP (space_needed, 16);
+ m->fs.sp_realigned = stack_realign_fp;
+ m->fs.sp_valid = stack_realign_fp
+ || !frame_pointer_needed
+ || crtl->sp_is_unchanging;
+ gcc_assert (!m->fs.sp_valid
+ || m->fs.sp_offset == frame.stack_pointer_offset);
- /* 64-bit frame->va_arg_size should always be a multiple of 16, but
- rounding to be pedantic. */
- space_needed = ROUND_UP (space_needed + frame->va_arg_size, 16);
- }
- else
- space_needed = frame->va_arg_size;
+ /* The FP must be valid if the frame pointer is present. */
+ gcc_assert (frame_pointer_needed == m->fs.fp_valid);
+ gcc_assert (!m->fs.fp_valid
+ || m->fs.fp_offset == frame.hard_frame_pointer_offset);
- /* Record the allocation size required prior to the realignment AND. */
- frame->stack_realign_allocate = space_needed;
+ /* We must have *some* valid pointer to the stack frame. */
+ gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
- /* The re-aligned stack starts at frame->stack_realign_offset. Values
- before this point are not directly comparable with values below
- this point. Use sp_valid_at to determine if the stack pointer is
- valid for a given offset, fp_valid_at for the frame pointer, or
- choose_baseaddr to have a base register chosen for you.
+ /* The DRAP is never valid at this point. */
+ gcc_assert (!m->fs.drap_valid);
- Note that the result of (frame->stack_realign_offset
- & (stack_alignment_needed - 1)) may not equal zero. */
- offset = ROUND_UP (offset + space_needed, stack_alignment_needed);
- frame->stack_realign_offset = offset - space_needed;
- frame->sse_reg_save_offset = frame->stack_realign_offset
- + sse_reg_space_needed;
- }
- else
- {
- frame->stack_realign_offset = offset;
+ /* See the comment about red zone and frame
+ pointer usage in ix86_expand_prologue. */
+ if (frame_pointer_needed && frame.red_zone_size)
+ emit_insn (gen_memory_blockage ());
- if (TARGET_64BIT && m->call_ms2sysv)
- {
- m->call_ms2sysv_pad_in = !!(offset & UNITS_PER_WORD);
- offset += xlogue_layout::get_instance ().get_stack_space_used ();
- }
+ using_drap = crtl->drap_reg && crtl->stack_realign_needed;
+ gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
- /* Align and set SSE register save area. */
- else if (frame->nsseregs)
- {
- /* If the incoming stack boundary is at least 16 bytes, or DRAP is
- required and the DRAP re-alignment boundary is at least 16 bytes,
- then we want the SSE register save area properly aligned. */
- if (ix86_incoming_stack_boundary >= 128
- || (stack_realign_drap && stack_alignment_needed >= 16))
- offset = ROUND_UP (offset, 16);
- offset += frame->nsseregs * 16;
- }
- frame->sse_reg_save_offset = offset;
- offset += frame->va_arg_size;
+ /* Determine the CFA offset of the end of the red-zone. */
+ m->fs.red_zone_offset = 0;
+ if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
+ {
+ /* The red zone begins below the return address (and, in an
+ exception handler, below the error code as well). */
+ m->fs.red_zone_offset = RED_ZONE_SIZE + INCOMING_FRAME_SP_OFFSET;
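+ /* As a concrete example (x86-64 SysV assumptions): RED_ZONE_SIZE is
+ 128 and INCOMING_FRAME_SP_OFFSET is typically one word (the return
+ address), so red_zone_offset starts at 136 before the DRAP
+ adjustment below. */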
+
+ /* When the register save area is in the aligned portion of
+ the stack, determine the maximum runtime displacement that
+ matches up with the aligned frame. */
+ if (stack_realign_drap)
+ m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
+ + UNITS_PER_WORD);
}
- /* Align start of frame for local function. When a function call
- is removed, it may become a leaf function. But if argument may
- be passed on stack, we need to align the stack when there is no
- tail call. */
- if (m->call_ms2sysv
- || frame->va_arg_size != 0
- || size != 0
- || !crtl->is_leaf
- || (!crtl->tail_call_emit
- && cfun->machine->outgoing_args_on_stack)
- || cfun->calls_alloca
- || ix86_current_function_calls_tls_descriptor)
- offset = ROUND_UP (offset, stack_alignment_needed);
+ HOST_WIDE_INT reg_save_offset = frame.reg_save_offset;
- /* Frame pointer points here. */
- frame->frame_pointer_offset = offset;
+ /* Special care must be taken for the normal return case of a function
+ using eh_return: the eax and edx registers are marked as saved, but
+ not restored along this path. Adjust the save location to match. */
+ if (crtl->calls_eh_return && style != 2)
+ reg_save_offset -= 2 * UNITS_PER_WORD;
- offset += size;
+ /* EH_RETURN requires the use of moves to function properly. */
+ if (crtl->calls_eh_return)
+ restore_regs_via_mov = true;
+ /* SEH requires the use of pops to identify the epilogue. */
+ else if (TARGET_SEH)
+ restore_regs_via_mov = false;
+ /* If we're only restoring one register and sp cannot be used, then
+ use a move instruction to restore the register, since it's less
+ work than reloading sp and popping the register. */
+ else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1)
+ restore_regs_via_mov = true;
+ else if (TARGET_EPILOGUE_USING_MOVE
+ && cfun->machine->use_fast_prologue_epilogue
+ && (frame.nregs > 1
+ || m->fs.sp_offset != reg_save_offset))
+ restore_regs_via_mov = true;
+ else if (frame_pointer_needed
+ && !frame.nregs
+ && m->fs.sp_offset != reg_save_offset)
+ restore_regs_via_mov = true;
+ else if (frame_pointer_needed
+ && TARGET_USE_LEAVE
+ && cfun->machine->use_fast_prologue_epilogue
+ && frame.nregs == 1)
+ restore_regs_via_mov = true;
+ else
+ restore_regs_via_mov = false;
- /* Add outgoing arguments area. Can be skipped if we eliminated
- all the function calls as dead code.
- Skipping is however impossible when function calls alloca. Alloca
- expander assumes that last crtl->outgoing_args_size
- of stack frame are unused. */
- if (ACCUMULATE_OUTGOING_ARGS
- && (!crtl->is_leaf || cfun->calls_alloca
- || ix86_current_function_calls_tls_descriptor))
+ if (restore_regs_via_mov || frame.nsseregs)
{
- offset += crtl->outgoing_args_size;
- frame->outgoing_arguments_size = crtl->outgoing_args_size;
+ /* Ensure that the entire register save area is addressable via
+ the stack pointer, if we will restore SSE regs via sp. */
+ if (TARGET_64BIT
+ && m->fs.sp_offset > 0x7fffffff
+ && sp_valid_at (frame.stack_realign_offset + 1)
+ && (frame.nsseregs + frame.nregs) != 0)
+ {
+ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+ GEN_INT (m->fs.sp_offset
+ - frame.sse_reg_save_offset),
+ style,
+ m->fs.cfa_reg == stack_pointer_rtx);
+ }
}
- else
- frame->outgoing_arguments_size = 0;
- /* Align stack boundary. Only needed if we're calling another function
- or using alloca. */
- if (!crtl->is_leaf || cfun->calls_alloca
- || ix86_current_function_calls_tls_descriptor)
- offset = ROUND_UP (offset, preferred_alignment);
+ /* If there are any SSE registers to restore, then we have to do it
+ via moves, since there's obviously no pop for SSE regs. */
+ if (frame.nsseregs)
+ ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
+ style == 2);
- /* We've reached end of stack frame. */
- frame->stack_pointer_offset = offset;
+ if (m->call_ms2sysv)
+ {
+ int pop_incoming_args = crtl->args.pops_args && crtl->args.size;
- /* Size prologue needs to allocate. */
- to_allocate = offset - frame->sse_reg_save_offset;
+ /* We cannot use a tail-call for the stub if:
+ 1. We have to pop incoming args,
+ 2. We have additional int regs to restore,
+ 3. A sibling call will be the tail-call, or
+ 4. We are emitting an eh_return_internal epilogue.
- if ((!to_allocate && frame->nregs <= 1)
- || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000))
- /* If stack clash probing needs a loop, then it needs a
- scratch register. But the returned register is only guaranteed
- to be safe to use after register saves are complete. So if
- stack clash protections are enabled and the allocated frame is
- larger than the probe interval, then use pushes to save
- callee saved registers. */
- || (flag_stack_clash_protection && to_allocate > get_probe_interval ()))
- frame->save_regs_using_mov = false;
+ TODO: Item 4 has not yet been tested!
- if (ix86_using_red_zone ()
- && crtl->sp_is_unchanging
- && crtl->is_leaf
- && !ix86_pc_thunk_call_expanded
- && !ix86_current_function_calls_tls_descriptor)
- {
- frame->red_zone_size = to_allocate;
- if (frame->save_regs_using_mov)
- frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
- if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
- frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
+ If any of the above are true, we will call the stub rather than
+ jump to it. */
+ restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1);
+ ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style);
}
- else
- frame->red_zone_size = 0;
- frame->stack_pointer_offset -= frame->red_zone_size;
- /* The SEH frame pointer location is near the bottom of the frame.
- This is enforced by the fact that the difference between the
- stack pointer and the frame pointer is limited to 240 bytes in
- the unwind data structure. */
- if (TARGET_SEH)
+ /* If using an out-of-line stub that is a tail call, then... */
+ if (m->call_ms2sysv && restore_stub_is_tail)
{
- HOST_WIDE_INT diff;
-
- /* If we can leave the frame pointer where it is, do so. Also, returns
- the establisher frame for __builtin_frame_address (0). */
- diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
- if (diff <= SEH_MAX_FRAME_SIZE
- && (diff > 240 || (diff & 15) != 0)
- && !crtl->accesses_prior_frames)
- {
- /* Ideally we'd determine what portion of the local stack frame
- (within the constraint of the lowest 240) is most heavily used.
- But without that complication, simply bias the frame pointer
- by 128 bytes so as to maximize the amount of the local stack
- frame that is addressable with 8-bit offsets. */
- frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
- }
+ /* TODO: paranoid tests. (remove eventually) */
+ gcc_assert (m->fs.sp_valid);
+ gcc_assert (!m->fs.sp_realigned);
+ gcc_assert (!m->fs.fp_valid);
+ gcc_assert (!m->fs.realigned);
+ gcc_assert (m->fs.sp_offset == UNITS_PER_WORD);
+ gcc_assert (!crtl->drap_reg);
+ gcc_assert (!frame.nregs);
}
-}
+ else if (restore_regs_via_mov)
+ {
+ rtx t;
-/* This is semi-inlined memory_address_length, but simplified
- since we know that we're always dealing with reg+offset, and
- to avoid having to create and discard all that rtl. */
+ if (frame.nregs)
+ ix86_emit_restore_regs_using_mov (reg_save_offset, style == 2);
-static inline int
-choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
-{
- int len = 4;
+ /* eh_return epilogues need %ecx added to the stack pointer. */
+ if (style == 2)
+ {
+ rtx sa = EH_RETURN_STACKADJ_RTX;
+ rtx_insn *insn;
- if (offset == 0)
- {
- /* EBP and R13 cannot be encoded without an offset. */
- len = (regno == BP_REG || regno == R13_REG);
- }
- else if (IN_RANGE (offset, -128, 127))
- len = 1;
+ /* %ecx can't be used for both DRAP register and eh_return. */
+ if (crtl->drap_reg)
+ gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
- /* ESP and R12 must be encoded with a SIB byte. */
- if (regno == SP_REG || regno == R12_REG)
- len++;
+ /* regparm nested functions don't work with eh_return. */
+ gcc_assert (!ix86_static_chain_on_stack);
- return len;
-}
+ if (frame_pointer_needed)
+ {
+ t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
+ t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
+ emit_insn (gen_rtx_SET (sa, t));
-/* Determine if the stack pointer is valid for accessing the CFA_OFFSET in
- the frame save area. The register is saved at CFA - CFA_OFFSET. */
+ t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
+ insn = emit_move_insn (hard_frame_pointer_rtx, t);
-static bool
-sp_valid_at (HOST_WIDE_INT cfa_offset)
-{
- const struct machine_frame_state &fs = cfun->machine->fs;
- if (fs.sp_realigned && cfa_offset <= fs.sp_realigned_offset)
- {
- /* Validate that the cfa_offset isn't in a "no-man's land". */
- gcc_assert (cfa_offset <= fs.sp_realigned_fp_last);
- return false;
- }
- return fs.sp_valid;
-}
-
-/* Determine if the frame pointer is valid for accessing the CFA_OFFSET in
- the frame save area. The register is saved at CFA - CFA_OFFSET. */
-
-static inline bool
-fp_valid_at (HOST_WIDE_INT cfa_offset)
-{
- const struct machine_frame_state &fs = cfun->machine->fs;
- if (fs.sp_realigned && cfa_offset > fs.sp_realigned_fp_last)
- {
- /* Validate that the cfa_offset isn't in a "no-man's land". */
- gcc_assert (cfa_offset >= fs.sp_realigned_offset);
- return false;
- }
- return fs.fp_valid;
-}
-
-/* Choose a base register based upon alignment requested, speed and/or
- size. */
-
-static void
-choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg,
- HOST_WIDE_INT &base_offset,
- unsigned int align_reqested, unsigned int *align)
-{
- const struct machine_function *m = cfun->machine;
- unsigned int hfp_align;
- unsigned int drap_align;
- unsigned int sp_align;
- bool hfp_ok = fp_valid_at (cfa_offset);
- bool drap_ok = m->fs.drap_valid;
- bool sp_ok = sp_valid_at (cfa_offset);
-
- hfp_align = drap_align = sp_align = INCOMING_STACK_BOUNDARY;
-
- /* Filter out any registers that don't meet the requested alignment
- criteria. */
- if (align_reqested)
- {
- if (m->fs.realigned)
- hfp_align = drap_align = sp_align = crtl->stack_alignment_needed;
- /* SEH unwind code does not currently support REG_CFA_EXPRESSION
- notes (which we would need to use a realigned stack pointer),
- so disable on SEH targets. */
- else if (m->fs.sp_realigned)
- sp_align = crtl->stack_alignment_needed;
+ /* Note that we use SA as a temporary CFA, as the return
+ address is at the proper place relative to it. We
+ pretend this happens at the FP restore insn because
+ prior to this insn the FP would be stored at the wrong
+ offset relative to SA, and after this insn we have no
+ other reasonable register to use for the CFA. We don't
+ bother resetting the CFA to the SP for the duration of
+ the return insn, unless the control flow instrumentation
+ is done. In this case the SP is used later and we have
+ to reset CFA to SP. */
+ add_reg_note (insn, REG_CFA_DEF_CFA,
+ plus_constant (Pmode, sa, UNITS_PER_WORD));
+ ix86_add_queued_cfa_restore_notes (insn);
+ add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
+ RTX_FRAME_RELATED_P (insn) = 1;
- hfp_ok = hfp_ok && hfp_align >= align_reqested;
- drap_ok = drap_ok && drap_align >= align_reqested;
- sp_ok = sp_ok && sp_align >= align_reqested;
- }
+ m->fs.cfa_reg = sa;
+ m->fs.cfa_offset = UNITS_PER_WORD;
+ m->fs.fp_valid = false;
- if (m->use_fast_prologue_epilogue)
- {
- /* Choose the base register most likely to allow the most scheduling
- opportunities. Generally FP is valid throughout the function,
- while DRAP must be reloaded within the epilogue. But choose either
- over the SP due to increased encoding size. */
+ pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
+ const0_rtx, style,
+ flag_cf_protection);
+ }
+ else
+ {
+ t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
+ t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
+ insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
+ ix86_add_queued_cfa_restore_notes (insn);
- if (hfp_ok)
- {
- base_reg = hard_frame_pointer_rtx;
- base_offset = m->fs.fp_offset - cfa_offset;
- }
- else if (drap_ok)
- {
- base_reg = crtl->drap_reg;
- base_offset = 0 - cfa_offset;
- }
- else if (sp_ok)
- {
- base_reg = stack_pointer_rtx;
- base_offset = m->fs.sp_offset - cfa_offset;
+ gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
+ if (m->fs.cfa_offset != UNITS_PER_WORD)
+ {
+ m->fs.cfa_offset = UNITS_PER_WORD;
+ add_reg_note (insn, REG_CFA_DEF_CFA,
+ plus_constant (Pmode, stack_pointer_rtx,
+ UNITS_PER_WORD));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ }
+ }
+ m->fs.sp_offset = UNITS_PER_WORD;
+ m->fs.sp_valid = true;
+ m->fs.sp_realigned = false;
}
}
else
{
- HOST_WIDE_INT toffset;
- int len = 16, tlen;
-
- /* Choose the base register with the smallest address encoding.
- With a tie, choose FP > DRAP > SP. */
- if (sp_ok)
+ /* SEH requires that the function end with (1) a stack adjustment
+ if necessary, (2) a sequence of pops, and (3) a return or
+ jump instruction. Prevent insns from the function body from
+ being scheduled into this sequence. */
+ if (TARGET_SEH)
{
- base_reg = stack_pointer_rtx;
- base_offset = m->fs.sp_offset - cfa_offset;
- len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
+ /* Prevent a catch region from being adjacent to the standard
+ epilogue sequence. Unfortunately neither crtl->uses_eh_lsda
+ nor several other flags that would be interesting to test are
+ set up yet. */
+ if (flag_non_call_exceptions)
+ emit_insn (gen_nops (const1_rtx));
+ else
+ emit_insn (gen_blockage ());
}
- if (drap_ok)
+
+ /* First step is to deallocate the stack frame so that we can
+ pop the registers. If the stack pointer was realigned, it needs
+ to be restored now. Also do it on SEH targets for a very large
+ frame, as the emitted instructions aren't allowed by the ABI
+ in epilogues. */
+ if (!m->fs.sp_valid || m->fs.sp_realigned
+ || (TARGET_SEH
+ && (m->fs.sp_offset - reg_save_offset
+ >= SEH_MAX_FRAME_SIZE)))
{
- toffset = 0 - cfa_offset;
- tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
- if (tlen <= len)
- {
- base_reg = crtl->drap_reg;
- base_offset = toffset;
- len = tlen;
- }
+ pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
+ GEN_INT (m->fs.fp_offset
+ - reg_save_offset),
+ style, false);
}
- if (hfp_ok)
+ else if (m->fs.sp_offset != reg_save_offset)
{
- toffset = m->fs.fp_offset - cfa_offset;
- tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
- if (tlen <= len)
- {
- base_reg = hard_frame_pointer_rtx;
- base_offset = toffset;
- len = tlen;
- }
+ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+ GEN_INT (m->fs.sp_offset
+ - reg_save_offset),
+ style,
+ m->fs.cfa_reg == stack_pointer_rtx);
}
- }
- /* Set the align return value. */
- if (align)
- {
- if (base_reg == stack_pointer_rtx)
- *align = sp_align;
- else if (base_reg == crtl->drap_reg)
- *align = drap_align;
- else if (base_reg == hard_frame_pointer_rtx)
- *align = hfp_align;
- }
-}
+ ix86_emit_restore_regs_using_pop ();
+ }
-/* Return an RTX that points to CFA_OFFSET within the stack frame and
- the alignment of address. If ALIGN is non-null, it should point to
- an alignment value (in bits) that is preferred or zero and will
- recieve the alignment of the base register that was selected,
- irrespective of rather or not CFA_OFFSET is a multiple of that
- alignment value. If it is possible for the base register offset to be
- non-immediate then SCRATCH_REGNO should specify a scratch register to
- use.
+ /* If we used a frame pointer and haven't already got rid of it,
+ then do so now. */
+ if (m->fs.fp_valid)
+ {
+ /* If the stack pointer is valid and pointing at the frame
+ pointer store address, then we only need a pop. */
+ if (sp_valid_at (frame.hfp_save_offset)
+ && m->fs.sp_offset == frame.hfp_save_offset)
+ ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
+ /* Leave results in shorter dependency chains on CPUs that are
+ able to grok it fast. */
+ else if (TARGET_USE_LEAVE
+ || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
+ || !cfun->machine->use_fast_prologue_epilogue)
+ ix86_emit_leave (NULL);
+ else
+ {
+ pro_epilogue_adjust_stack (stack_pointer_rtx,
+ hard_frame_pointer_rtx,
+ const0_rtx, style, !using_drap);
+ ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
+ }
+ }
- The valid base registers are taken from CFUN->MACHINE->FS. */
+ if (using_drap)
+ {
+ int param_ptr_offset = UNITS_PER_WORD;
+ rtx_insn *insn;
-static rtx
-choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align,
- unsigned int scratch_regno = INVALID_REGNUM)
-{
- rtx base_reg = NULL;
- HOST_WIDE_INT base_offset = 0;
+ gcc_assert (stack_realign_drap);
- /* If a specific alignment is requested, try to get a base register
- with that alignment first. */
- if (align && *align)
- choose_basereg (cfa_offset, base_reg, base_offset, *align, align);
+ if (ix86_static_chain_on_stack)
+ param_ptr_offset += UNITS_PER_WORD;
+ if (!call_used_regs[REGNO (crtl->drap_reg)])
+ param_ptr_offset += UNITS_PER_WORD;
- if (!base_reg)
- choose_basereg (cfa_offset, base_reg, base_offset, 0, align);
+ insn = emit_insn (gen_rtx_SET
+ (stack_pointer_rtx,
+ gen_rtx_PLUS (Pmode,
+ crtl->drap_reg,
+ GEN_INT (-param_ptr_offset))));
+ m->fs.cfa_reg = stack_pointer_rtx;
+ m->fs.cfa_offset = param_ptr_offset;
+ m->fs.sp_offset = param_ptr_offset;
+ m->fs.realigned = false;
- gcc_assert (base_reg != NULL);
+ add_reg_note (insn, REG_CFA_DEF_CFA,
+ gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+ GEN_INT (param_ptr_offset)));
+ RTX_FRAME_RELATED_P (insn) = 1;
- rtx base_offset_rtx = GEN_INT (base_offset);
+ if (!call_used_regs[REGNO (crtl->drap_reg)])
+ ix86_emit_restore_reg_using_pop (crtl->drap_reg);
+ }
- if (!x86_64_immediate_operand (base_offset_rtx, Pmode))
+ /* At this point the stack pointer must be valid, and we must have
+ restored all of the registers. We may not have deallocated the
+ entire stack frame. We've delayed this until now because it may
+ be possible to merge the local stack deallocation with the
+ deallocation forced by ix86_static_chain_on_stack. */
+ gcc_assert (m->fs.sp_valid);
+ gcc_assert (!m->fs.sp_realigned);
+ gcc_assert (!m->fs.fp_valid);
+ gcc_assert (!m->fs.realigned);
+ if (m->fs.sp_offset != UNITS_PER_WORD)
{
- gcc_assert (scratch_regno != INVALID_REGNUM);
-
- rtx scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
- emit_move_insn (scratch_reg, base_offset_rtx);
-
- return gen_rtx_PLUS (Pmode, base_reg, scratch_reg);
+ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+ GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
+ style, true);
}
+ else
+ ix86_add_queued_cfa_restore_notes (get_last_insn ());
- return plus_constant (Pmode, base_reg, base_offset);
-}
-
-/* Emit code to save registers in the prologue. */
-
-static void
-ix86_emit_save_regs (void)
-{
- unsigned int regno;
- rtx_insn *insn;
-
- for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
- if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
- {
- insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
- RTX_FRAME_RELATED_P (insn) = 1;
- }
-}
-
-/* Emit a single register save at CFA - CFA_OFFSET. */
+ /* Sibcall epilogues don't want a return instruction. */
+ if (style == 0)
+ {
+ m->fs = frame_state_save;
+ return;
+ }
-static void
-ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
- HOST_WIDE_INT cfa_offset)
-{
- struct machine_function *m = cfun->machine;
- rtx reg = gen_rtx_REG (mode, regno);
- rtx mem, addr, base, insn;
- unsigned int align = GET_MODE_ALIGNMENT (mode);
+ if (cfun->machine->func_type != TYPE_NORMAL)
+ emit_jump_insn (gen_interrupt_return ());
+ else if (crtl->args.pops_args && crtl->args.size)
+ {
+ rtx popc = GEN_INT (crtl->args.pops_args);
- addr = choose_baseaddr (cfa_offset, &align);
- mem = gen_frame_mem (mode, addr);
+ /* i386 can only pop 64K bytes. If asked to pop more, pop the return
+ address, do an explicit add, and jump indirectly to the caller. */
- /* The location aligment depends upon the base register. */
- align = MIN (GET_MODE_ALIGNMENT (mode), align);
- gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
- set_mem_align (mem, align);
+ if (crtl->args.pops_args >= 65536)
+ {
+ rtx ecx = gen_rtx_REG (SImode, CX_REG);
+ rtx_insn *insn;
- insn = emit_insn (gen_rtx_SET (mem, reg));
- RTX_FRAME_RELATED_P (insn) = 1;
+ /* There is no "pascal" calling convention in any 64bit ABI. */
+ gcc_assert (!TARGET_64BIT);
- base = addr;
- if (GET_CODE (base) == PLUS)
- base = XEXP (base, 0);
- gcc_checking_assert (REG_P (base));
+ insn = emit_insn (gen_pop (ecx));
+ m->fs.cfa_offset -= UNITS_PER_WORD;
+ m->fs.sp_offset -= UNITS_PER_WORD;
- /* When saving registers into a re-aligned local stack frame, avoid
- any tricky guessing by dwarf2out. */
- if (m->fs.realigned)
- {
- gcc_checking_assert (stack_realign_drap);
+ rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
+ x = gen_rtx_SET (stack_pointer_rtx, x);
+ add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
+ add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
+ RTX_FRAME_RELATED_P (insn) = 1;
- if (regno == REGNO (crtl->drap_reg))
- {
- /* A bit of a hack. We force the DRAP register to be saved in
- the re-aligned stack frame, which provides us with a copy
- of the CFA that will last past the prologue. Install it. */
- gcc_checking_assert (cfun->machine->fs.fp_valid);
- addr = plus_constant (Pmode, hard_frame_pointer_rtx,
- cfun->machine->fs.fp_offset - cfa_offset);
- mem = gen_rtx_MEM (mode, addr);
- add_reg_note (insn, REG_CFA_DEF_CFA, mem);
+ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+ popc, -1, true);
+ emit_jump_insn (gen_simple_return_indirect_internal (ecx));
}
else
- {
- /* The frame pointer is a stable reference within the
- aligned frame. Use it. */
- gcc_checking_assert (cfun->machine->fs.fp_valid);
- addr = plus_constant (Pmode, hard_frame_pointer_rtx,
- cfun->machine->fs.fp_offset - cfa_offset);
- mem = gen_rtx_MEM (mode, addr);
- add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
- }
- }
-
- else if (base == stack_pointer_rtx && m->fs.sp_realigned
- && cfa_offset >= m->fs.sp_realigned_offset)
- {
- gcc_checking_assert (stack_realign_fp);
- add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
+ emit_jump_insn (gen_simple_return_pop_internal (popc));
}
-
- /* The memory may not be relative to the current CFA register,
- which means that we may need to generate a new pattern for
- use by the unwind info. */
- else if (base != m->fs.cfa_reg)
+ else if (!m->call_ms2sysv || !restore_stub_is_tail)
{
- addr = plus_constant (Pmode, m->fs.cfa_reg,
- m->fs.cfa_offset - cfa_offset);
- mem = gen_rtx_MEM (mode, addr);
- add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
- }
-}
+ /* When returning from EH, a simple return cannot be used, because the
+ return address will be compared against a shadow stack return
+ address. Use an indirect jump instead. */
+ if (style == 2 && flag_cf_protection)
+ {
+ /* Register used in indirect jump must be in word_mode. But
+ Pmode may not be the same as word_mode for x32. */
+ rtx ecx = gen_rtx_REG (word_mode, CX_REG);
+ rtx_insn *insn;
-/* Emit code to save registers using MOV insns.
- First register is stored at CFA - CFA_OFFSET. */
-static void
-ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
-{
- unsigned int regno;
+ insn = emit_insn (gen_pop (ecx));
+ m->fs.cfa_offset -= UNITS_PER_WORD;
+ m->fs.sp_offset -= UNITS_PER_WORD;
- for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
- if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
- {
- ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
- cfa_offset -= UNITS_PER_WORD;
- }
-}
+ rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
+ x = gen_rtx_SET (stack_pointer_rtx, x);
+ add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
+ add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
+ RTX_FRAME_RELATED_P (insn) = 1;
-/* Emit code to save SSE registers using MOV insns.
- First register is stored at CFA - CFA_OFFSET. */
-static void
-ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
-{
- unsigned int regno;
+ emit_jump_insn (gen_simple_return_indirect_internal (ecx));
+ }
+ else
+ emit_jump_insn (gen_simple_return_internal ());
+ }
- for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
- if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
- {
- ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
- cfa_offset -= GET_MODE_SIZE (V4SFmode);
- }
+ /* Restore the state back to the state from the prologue,
+ so that it's correct for the next epilogue. */
+ m->fs = frame_state_save;
}
-static GTY(()) rtx queued_cfa_restores;
-
-/* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack
- manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
- Don't add the note if the previously saved value will be left untouched
- within stack red-zone till return, as unwinders can find the same value
- in the register and on the stack. */
+/* Reset state modified while emitting the function body (the hard register
+ number of the PIC register) and handle Mach-O's restriction on labels at
+ the end of objects. */
static void
-ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
+ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED)
{
- if (!crtl->shrink_wrapped
- && cfa_offset <= cfun->machine->fs.red_zone_offset)
- return;
+ if (pic_offset_table_rtx
+ && !ix86_use_pseudo_pic_reg ())
+ SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
- if (insn)
+ if (TARGET_MACHO)
{
- add_reg_note (insn, REG_CFA_RESTORE, reg);
- RTX_FRAME_RELATED_P (insn) = 1;
- }
- else
- queued_cfa_restores
- = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
-}
-
-/* Add queued REG_CFA_RESTORE notes if any to INSN. */
+ rtx_insn *insn = get_last_insn ();
+ rtx_insn *deleted_debug_label = NULL;
-static void
-ix86_add_queued_cfa_restore_notes (rtx insn)
-{
- rtx last;
- if (!queued_cfa_restores)
- return;
- for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
- ;
- XEXP (last, 1) = REG_NOTES (insn);
- REG_NOTES (insn) = queued_cfa_restores;
- queued_cfa_restores = NULL_RTX;
- RTX_FRAME_RELATED_P (insn) = 1;
-}
+ /* Mach-O doesn't support labels at the end of objects, so if
+ it looks like we might want one, take special action.
+ First, collect any sequence of deleted debug labels. */
+ while (insn
+ && NOTE_P (insn)
+ && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
+ {
+ /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
+ notes only, instead set their CODE_LABEL_NUMBER to -1,
+ otherwise there would be code generation differences
+ between -g and -g0. */
+ if (NOTE_P (insn) && NOTE_KIND (insn)
+ == NOTE_INSN_DELETED_DEBUG_LABEL)
+ deleted_debug_label = insn;
+ insn = PREV_INSN (insn);
+ }
-/* Expand prologue or epilogue stack adjustment.
- The pattern exists to put a dependency on all ebp-based memory accesses.
- STYLE should be negative if instructions should be marked as frame related,
- zero if %r11 register is live and cannot be freely used and positive
- otherwise. */
+ /* If we have:
+ label:
+ barrier
+ then this needs to be detected, so skip past the barrier. */
-static rtx
-pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
- int style, bool set_cfa)
-{
- struct machine_function *m = cfun->machine;
- rtx insn;
- bool add_frame_related_expr = false;
+ if (insn && BARRIER_P (insn))
+ insn = PREV_INSN (insn);
- if (Pmode == SImode)
- insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
- else if (x86_64_immediate_operand (offset, DImode))
- insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
- else
- {
- rtx tmp;
- /* r11 is used by indirect sibcall return as well, set before the
- epilogue and used after the epilogue. */
- if (style)
- tmp = gen_rtx_REG (DImode, R11_REG);
- else
+ /* Up to now we've only seen notes or barriers. */
+ if (insn)
{
- gcc_assert (src != hard_frame_pointer_rtx
- && dest != hard_frame_pointer_rtx);
- tmp = hard_frame_pointer_rtx;
+ if (LABEL_P (insn)
+ || (NOTE_P (insn)
+ && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
+ /* Trailing label. */
+ fputs ("\tnop\n", file);
+ else if (cfun && ! cfun->is_thunk)
+ {
+ /* See if we have a completely empty function body, skipping
+ the special case of the picbase thunk emitted as asm. */
+ while (insn && ! INSN_P (insn))
+ insn = PREV_INSN (insn);
+ /* If we don't find any insns, we've got an empty function body;
+ i.e. completely empty, without a return or branch. This is
+ taken as the case where a function body has been removed
+ because it contains an inline __builtin_unreachable(). GCC
+ declares that reaching __builtin_unreachable() means UB so
+ we're not obliged to do anything special; however, we want
+ non-zero-sized function bodies. To meet this, and help the
+ user out, let's trap the case. */
+ if (insn == NULL)
+ fputs ("\tud2\n", file);
+ }
}
- insn = emit_insn (gen_rtx_SET (tmp, offset));
- if (style < 0)
- add_frame_related_expr = true;
-
- insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
+ else if (deleted_debug_label)
+ for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
+ if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
+ CODE_LABEL_NUMBER (insn) = -1;
}
+}
- insn = emit_insn (insn);
- if (style >= 0)
- ix86_add_queued_cfa_restore_notes (insn);
+/* Return a scratch register to use in the split stack prologue. The
+ split stack prologue is used for -fsplit-stack. It consists of the
+ first instructions in the function, even before the regular prologue.
+ The scratch register can be any caller-saved register which is not
+ used for parameters or for the static chain. */
- if (set_cfa)
+static unsigned int
+split_stack_prologue_scratch_regno (void)
+{
+ if (TARGET_64BIT)
+ return R11_REG;
+ else
{
- rtx r;
+ bool is_fastcall, is_thiscall;
+ int regparm;
- gcc_assert (m->fs.cfa_reg == src);
- m->fs.cfa_offset += INTVAL (offset);
- m->fs.cfa_reg = dest;
+ is_fastcall = (lookup_attribute ("fastcall",
+ TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
+ != NULL);
+ is_thiscall = (lookup_attribute ("thiscall",
+ TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
+ != NULL);
+ regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
- r = gen_rtx_PLUS (Pmode, src, offset);
- r = gen_rtx_SET (dest, r);
- add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
- RTX_FRAME_RELATED_P (insn) = 1;
- }
- else if (style < 0)
- {
- RTX_FRAME_RELATED_P (insn) = 1;
- if (add_frame_related_expr)
+ if (is_fastcall)
{
- rtx r = gen_rtx_PLUS (Pmode, src, offset);
- r = gen_rtx_SET (dest, r);
- add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
+ if (DECL_STATIC_CHAIN (cfun->decl))
+ {
+ sorry ("%<-fsplit-stack%> does not support fastcall with "
+ "nested function");
+ return INVALID_REGNUM;
+ }
+ return AX_REG;
}
- }
-
- if (dest == stack_pointer_rtx)
- {
- HOST_WIDE_INT ooffset = m->fs.sp_offset;
- bool valid = m->fs.sp_valid;
- bool realigned = m->fs.sp_realigned;
-
- if (src == hard_frame_pointer_rtx)
- {
- valid = m->fs.fp_valid;
- realigned = false;
- ooffset = m->fs.fp_offset;
+ else if (is_thiscall)
+ {
+ if (!DECL_STATIC_CHAIN (cfun->decl))
+ return DX_REG;
+ return AX_REG;
}
- else if (src == crtl->drap_reg)
+ else if (regparm < 3)
{
- valid = m->fs.drap_valid;
- realigned = false;
- ooffset = 0;
+ if (!DECL_STATIC_CHAIN (cfun->decl))
+ return CX_REG;
+ else
+ {
+ if (regparm >= 2)
+ {
+ sorry ("%<-fsplit-stack%> does not support 2 register "
+ "parameters for a nested function");
+ return INVALID_REGNUM;
+ }
+ return DX_REG;
+ }
}
else
{
- /* Else there are two possibilities: SP itself, which we set
- up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
- taken care of by hand along the eh_return path. */
- gcc_checking_assert (src == stack_pointer_rtx
- || offset == const0_rtx);
+ /* FIXME: We could make this work by pushing a register
+ around the addition and comparison. */
+ sorry ("%<-fsplit-stack%> does not support 3 register parameters");
+ return INVALID_REGNUM;
}
-
- m->fs.sp_offset = ooffset - INTVAL (offset);
- m->fs.sp_valid = valid;
- m->fs.sp_realigned = realigned;
}
- return insn;
}
-/* Find an available register to be used as dynamic realign argument
- pointer register. Such a register will be written in the prologue and
- used at the beginning of the body, so it must not be
- 1. parameter passing register.
- 2. GOT pointer.
- We reuse static-chain register if it is available. Otherwise, we
- use DI for i386 and R13 for x86-64. We chose R13 since it has
- shorter encoding.
+/* A SYMBOL_REF for the function which allocates new stackspace for
+ -fsplit-stack. */
- Return: the regno of chosen register. */
+static GTY(()) rtx split_stack_fn;
-static unsigned int
-find_drap_reg (void)
-{
- tree decl = cfun->decl;
+/* A SYMBOL_REF for the more stack function when using the large
+ model. */
- /* Always use callee-saved register if there are no caller-saved
- registers. */
- if (TARGET_64BIT)
- {
- /* Use R13 for nested function or function need static chain.
- Since function with tail call may use any caller-saved
- registers in epilogue, DRAP must not use caller-saved
- register in such case. */
- if (DECL_STATIC_CHAIN (decl)
- || cfun->machine->no_caller_saved_registers
- || crtl->tail_call_emit)
- return R13_REG;
+static GTY(()) rtx split_stack_fn_large;
- return R10_REG;
- }
- else
- {
- /* Use DI for nested function or function need static chain.
- Since function with tail call may use any caller-saved
- registers in epilogue, DRAP must not use caller-saved
- register in such case. */
- if (DECL_STATIC_CHAIN (decl)
- || cfun->machine->no_caller_saved_registers
- || crtl->tail_call_emit)
- return DI_REG;
+/* Return location of the stack guard value in the TLS block. */
- /* Reuse static chain register if it isn't used for parameter
- passing. */
- if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
- {
- unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
- if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
- return CX_REG;
- }
- return DI_REG;
- }
-}
+rtx
+ix86_split_stack_guard (void)
+{
+ int offset;
+ addr_space_t as = DEFAULT_TLS_SEG_REG;
+ rtx r;
-/* Handle a "force_align_arg_pointer" attribute. */
+ gcc_assert (flag_split_stack);
-static tree
-ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
- tree, int, bool *no_add_attrs)
-{
- if (TREE_CODE (*node) != FUNCTION_TYPE
- && TREE_CODE (*node) != METHOD_TYPE
- && TREE_CODE (*node) != FIELD_DECL
- && TREE_CODE (*node) != TYPE_DECL)
- {
- warning (OPT_Wattributes, "%qE attribute only applies to functions",
- name);
- *no_add_attrs = true;
- }
+#ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
+ offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
+#else
+ gcc_unreachable ();
+#endif
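+ /* For reference: DEFAULT_TLS_SEG_REG makes this a %fs-relative access in
+ 64-bit mode and a %gs-relative one in 32-bit mode on GNU/Linux, and the
+ offset is expected to name the TCB slot reserved for the split-stack
+ guard (see libgcc's morestack.S). */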
- return NULL_TREE;
+ r = GEN_INT (offset);
+ r = gen_const_mem (Pmode, r);
+ set_mem_addr_space (r, as);
+
+ return r;
}
-/* Return minimum incoming stack alignment. */
+/* Handle -fsplit-stack. These are the first instructions in the
+ function, even before the regular prologue. */
-static unsigned int
-ix86_minimum_incoming_stack_boundary (bool sibcall)
+void
+ix86_expand_split_stack_prologue (void)
{
- unsigned int incoming_stack_boundary;
+ HOST_WIDE_INT allocate;
+ unsigned HOST_WIDE_INT args_size;
+ rtx_code_label *label;
+ rtx limit, current, allocate_rtx, call_fusage;
+ rtx_insn *call_insn;
+ rtx scratch_reg = NULL_RTX;
+ rtx_code_label *varargs_label = NULL;
+ rtx fn;
- /* Stack of interrupt handler is aligned to 128 bits in 64bit mode. */
- if (cfun->machine->func_type != TYPE_NORMAL)
- incoming_stack_boundary = TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY;
- /* Prefer the one specified at command line. */
- else if (ix86_user_incoming_stack_boundary)
- incoming_stack_boundary = ix86_user_incoming_stack_boundary;
- /* In 32bit, use MIN_STACK_BOUNDARY for incoming stack boundary
- if -mstackrealign is used, it isn't used for sibcall check and
- estimated stack alignment is 128bit. */
- else if (!sibcall
- && ix86_force_align_arg_pointer
- && crtl->stack_alignment_estimated == 128)
- incoming_stack_boundary = MIN_STACK_BOUNDARY;
- else
- incoming_stack_boundary = ix86_default_incoming_stack_boundary;
-
- /* Incoming stack alignment can be changed on individual functions
- via force_align_arg_pointer attribute. We use the smallest
- incoming stack boundary. */
- if (incoming_stack_boundary > MIN_STACK_BOUNDARY
- && lookup_attribute (ix86_force_align_arg_pointer_string,
- TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
- incoming_stack_boundary = MIN_STACK_BOUNDARY;
-
- /* The incoming stack frame has to be aligned at least at
- parm_stack_boundary. */
- if (incoming_stack_boundary < crtl->parm_stack_boundary)
- incoming_stack_boundary = crtl->parm_stack_boundary;
+ gcc_assert (flag_split_stack && reload_completed);
- /* Stack at entrance of main is aligned by runtime. We use the
- smallest incoming stack boundary. */
- if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
- && DECL_NAME (current_function_decl)
- && MAIN_NAME_P (DECL_NAME (current_function_decl))
- && DECL_FILE_SCOPE_P (current_function_decl))
- incoming_stack_boundary = MAIN_STACK_BOUNDARY;
+ ix86_finalize_stack_frame_flags ();
+ struct ix86_frame &frame = cfun->machine->frame;
+ allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
- return incoming_stack_boundary;
-}
+ /* This is the label we will branch to if we have enough stack
+ space. We expect the basic block reordering pass to reverse this
+ branch if optimizing, so that we branch in the unlikely case. */
+ label = gen_label_rtx ();
-/* Update incoming stack boundary and estimated stack alignment. */
+ /* We need to compare the stack pointer minus the frame size with
+ the stack boundary in the TCB. The stack boundary always gives
+ us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
+ can compare directly. Otherwise we need to do an addition. */
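+ /* Concretely, the GEU branch emitted below is taken when CURRENT >= LIMIT
+ as an unsigned comparison, i.e. when enough stack remains and the call
+ to __morestack can be skipped. */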
-static void
-ix86_update_stack_boundary (void)
-{
- ix86_incoming_stack_boundary
- = ix86_minimum_incoming_stack_boundary (false);
+ limit = ix86_split_stack_guard ();
- /* x86_64 vararg needs 16byte stack alignment for register save area. */
- if (TARGET_64BIT
- && cfun->stdarg
- && crtl->stack_alignment_estimated < 128)
- crtl->stack_alignment_estimated = 128;
+ if (allocate < SPLIT_STACK_AVAILABLE)
+ current = stack_pointer_rtx;
+ else
+ {
+ unsigned int scratch_regno;
+ rtx offset;
- /* __tls_get_addr needs to be called with 16-byte aligned stack. */
- if (ix86_tls_descriptor_calls_expanded_in_cfun
- && crtl->preferred_stack_boundary < 128)
- crtl->preferred_stack_boundary = 128;
-}
+ /* We need a scratch register to hold the stack pointer minus
+ the required frame size. Since this is the very start of the
+ function, the scratch register can be any caller-saved
+ register which is not used for parameters. */
+ offset = GEN_INT (- allocate);
+ scratch_regno = split_stack_prologue_scratch_regno ();
+ if (scratch_regno == INVALID_REGNUM)
+ return;
+ scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
+ if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
+ {
+ /* We don't use ix86_gen_add3 in this case because it will
+ want to split to lea, but when not optimizing the insn
+ will not be split after this point. */
+ emit_insn (gen_rtx_SET (scratch_reg,
+ gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+ offset)));
+ }
+ else
+ {
+ emit_move_insn (scratch_reg, offset);
+ emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
+ stack_pointer_rtx));
+ }
+ current = scratch_reg;
+ }
-/* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
- needed or an rtx for DRAP otherwise. */
+ ix86_expand_branch (GEU, current, limit, label);
+ rtx_insn *jump_insn = get_last_insn ();
+ JUMP_LABEL (jump_insn) = label;
-static rtx
-ix86_get_drap_rtx (void)
-{
- /* We must use DRAP if there are outgoing arguments on stack and
- ACCUMULATE_OUTGOING_ARGS is false. */
- if (ix86_force_drap
- || (cfun->machine->outgoing_args_on_stack
- && !ACCUMULATE_OUTGOING_ARGS))
- crtl->need_drap = true;
+ /* Mark the jump as very likely to be taken. */
+ add_reg_br_prob_note (jump_insn, profile_probability::very_likely ());
- if (stack_realign_drap)
+ if (split_stack_fn == NULL_RTX)
{
- /* Assign DRAP to vDRAP and returns vDRAP */
- unsigned int regno = find_drap_reg ();
- rtx drap_vreg;
- rtx arg_ptr;
- rtx_insn *seq, *insn;
+ split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
+ SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
+ }
+ fn = split_stack_fn;
- arg_ptr = gen_rtx_REG (Pmode, regno);
- crtl->drap_reg = arg_ptr;
+ /* Get more stack space. We pass in the desired stack space and the
+ size of the arguments to copy to the new stack. In 32-bit mode
+ we push the parameters; __morestack will return on a new stack
+ anyhow. In 64-bit mode we pass the parameters in r10 and
+ r11. */
+ allocate_rtx = GEN_INT (allocate);
+ args_size = crtl->args.size >= 0 ? (HOST_WIDE_INT) crtl->args.size : 0;
+ call_fusage = NULL_RTX;
+ rtx pop = NULL_RTX;
+ if (TARGET_64BIT)
+ {
+ rtx reg10, reg11;
- start_sequence ();
- drap_vreg = copy_to_reg (arg_ptr);
- seq = get_insns ();
- end_sequence ();
+ reg10 = gen_rtx_REG (Pmode, R10_REG);
+ reg11 = gen_rtx_REG (Pmode, R11_REG);
- insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
- if (!optimize)
+ /* If this function uses a static chain, it will be in %r10.
+ Preserve it across the call to __morestack. */
+ if (DECL_STATIC_CHAIN (cfun->decl))
{
- add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
- RTX_FRAME_RELATED_P (insn) = 1;
+ rtx rax;
+
+ rax = gen_rtx_REG (word_mode, AX_REG);
+ emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
+ use_reg (&call_fusage, rax);
}
- return drap_vreg;
- }
- else
- return NULL;
-}
-/* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
+ if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
+ && !TARGET_PECOFF)
+ {
+ HOST_WIDE_INT argval;
-static rtx
-ix86_internal_arg_pointer (void)
-{
- return virtual_incoming_args_rtx;
-}
+ gcc_assert (Pmode == DImode);
+ /* When using the large model we need to load the address
+ into a register, and we've run out of registers. So we
+ switch to a different calling convention, and we call a
+ different function: __morestack_large. We pass the
+ argument size in the upper 32 bits of r10 and pass the
+ frame size in the lower 32 bits. */
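+ Worked example: args_size == 0x20 and allocate == 0x1000 give
+ argval == 0x0000002000001000, i.e. the argument size in bits 63:32
+ and the frame size in bits 31:0.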
+ gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
+ gcc_assert ((args_size & 0xffffffff) == args_size);
-struct scratch_reg {
- rtx reg;
- bool saved;
-};
+ if (split_stack_fn_large == NULL_RTX)
+ {
+ split_stack_fn_large
+ = gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
+ SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
+ }
+ if (ix86_cmodel == CM_LARGE_PIC)
+ {
+ rtx_code_label *label;
+ rtx x;
-/* Return a short-lived scratch register for use on function entry.
- In 32-bit mode, it is valid only after the registers are saved
- in the prologue. This register must be released by means of
- release_scratch_register_on_entry once it is dead. */
+ label = gen_label_rtx ();
+ emit_label (label);
+ LABEL_PRESERVE_P (label) = 1;
+ emit_insn (gen_set_rip_rex64 (reg10, label));
+ emit_insn (gen_set_got_offset_rex64 (reg11, label));
+ emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
+ x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
+ UNSPEC_GOT);
+ x = gen_rtx_CONST (Pmode, x);
+ emit_move_insn (reg11, x);
+ x = gen_rtx_PLUS (Pmode, reg10, reg11);
+ x = gen_const_mem (Pmode, x);
+ emit_move_insn (reg11, x);
+ }
+ else
+ emit_move_insn (reg11, split_stack_fn_large);
-static void
-get_scratch_register_on_entry (struct scratch_reg *sr)
-{
- int regno;
+ fn = reg11;
- sr->saved = false;
+ argval = ((args_size << 16) << 16) + allocate;
+ emit_move_insn (reg10, GEN_INT (argval));
+ }
+ else
+ {
+ emit_move_insn (reg10, allocate_rtx);
+ emit_move_insn (reg11, GEN_INT (args_size));
+ use_reg (&call_fusage, reg11);
+ }
- if (TARGET_64BIT)
- {
- /* We always use R11 in 64-bit mode. */
- regno = R11_REG;
+ use_reg (&call_fusage, reg10);
}
else
{
- tree decl = current_function_decl, fntype = TREE_TYPE (decl);
- bool fastcall_p
- = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
- bool thiscall_p
- = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
- bool static_chain_p = DECL_STATIC_CHAIN (decl);
- int regparm = ix86_function_regparm (fntype, decl);
- int drap_regno
- = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
+ rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size)));
+ add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD));
+ insn = emit_insn (gen_push (allocate_rtx));
+ add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD));
+ pop = GEN_INT (2 * UNITS_PER_WORD);
+ }
+ call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
+ GEN_INT (UNITS_PER_WORD), constm1_rtx,
+ pop, false);
+ add_function_usage_to (call_insn, call_fusage);
+ if (!TARGET_64BIT)
+ add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0));
+ /* Indicate that this function can't jump to non-local gotos. */
+ make_reg_eh_region_note_nothrow_nononlocal (call_insn);
- /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
- for the static chain register. */
- if ((regparm < 1 || (fastcall_p && !static_chain_p))
- && drap_regno != AX_REG)
- regno = AX_REG;
- /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
- for the static chain register. */
- else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
- regno = AX_REG;
- else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
- regno = DX_REG;
- /* ecx is the static chain register. */
- else if (regparm < 3 && !fastcall_p && !thiscall_p
- && !static_chain_p
- && drap_regno != CX_REG)
- regno = CX_REG;
- else if (ix86_save_reg (BX_REG, true, false))
- regno = BX_REG;
- /* esi is the static chain register. */
- else if (!(regparm == 3 && static_chain_p)
- && ix86_save_reg (SI_REG, true, false))
- regno = SI_REG;
- else if (ix86_save_reg (DI_REG, true, false))
- regno = DI_REG;
- else
- {
- regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
- sr->saved = true;
- }
- }
+ /* In order to make call/return prediction work right, we now need
+ to execute a return instruction. See
+ libgcc/config/i386/morestack.S for the details on how this works.
- sr->reg = gen_rtx_REG (Pmode, regno);
- if (sr->saved)
+ For flow purposes gcc must not see this as a return
+ instruction--we need control flow to continue at the subsequent
+ label. Therefore, we use an unspec. */
+ gcc_assert (crtl->args.pops_args < 65536);
+ rtx_insn *ret_insn
+ = emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
+
+ if ((flag_cf_protection & CF_BRANCH))
{
- rtx_insn *insn = emit_insn (gen_push (sr->reg));
- RTX_FRAME_RELATED_P (insn) = 1;
+ /* Insert ENDBR since __morestack will jump back here via indirect
+ call. */
+ rtx cet_eb = gen_nop_endbr ();
+ emit_insn_after (cet_eb, ret_insn);
}
-}
-/* Release a scratch register obtained from the preceding function.
+ /* If we are in 64-bit mode and this function uses a static chain,
+ we saved %r10 in %rax before calling __morestack. */
+ if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
+ emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
+ gen_rtx_REG (word_mode, AX_REG));
- If RELEASE_VIA_POP is true, we just pop the register off the stack
- to release it. This is what non-Linux systems use with -fstack-check.
+ /* If this function calls va_start, we need to store a pointer to
+ the arguments on the old stack, because they may not have been
+ all copied to the new stack. At this point the old stack can be
+ found at the frame pointer value used by __morestack, because
+ __morestack has set that up before calling back to us. Here we
+ store that pointer in a scratch register, and in
+ ix86_expand_prologue we store the scratch register in a stack
+ slot. */
+ if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
+ {
+ unsigned int scratch_regno;
+ rtx frame_reg;
+ int words;
- Otherwise we use OFFSET to locate the saved register and the
- allocated stack space becomes part of the local frame and is
- deallocated by the epilogue. */
+ scratch_regno = split_stack_prologue_scratch_regno ();
+ scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
+ frame_reg = gen_rtx_REG (Pmode, BP_REG);
-static void
-release_scratch_register_on_entry (struct scratch_reg *sr, HOST_WIDE_INT offset,
- bool release_via_pop)
-{
- if (sr->saved)
- {
- if (release_via_pop)
- {
- struct machine_function *m = cfun->machine;
- rtx x, insn = emit_insn (gen_pop (sr->reg));
+ /* 64-bit:
+ fp -> old fp value
+ return address within this function
+ return address of caller of this function
+ stack arguments
+ So we add three words to get to the stack arguments.
- /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
- RTX_FRAME_RELATED_P (insn) = 1;
- x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
- x = gen_rtx_SET (stack_pointer_rtx, x);
- add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
- m->fs.sp_offset -= UNITS_PER_WORD;
- }
- else
- {
- rtx x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (offset));
- x = gen_rtx_SET (sr->reg, gen_rtx_MEM (word_mode, x));
- emit_insn (x);
- }
+ 32-bit:
+ fp -> old fp value
+ return address within this function
+ first argument to __morestack
+ second argument to __morestack
+ return address of caller of this function
+ stack arguments
+ So we add five words to get to the stack arguments.
+ */
+ words = TARGET_64BIT ? 3 : 5;
+ emit_insn (gen_rtx_SET (scratch_reg,
+ gen_rtx_PLUS (Pmode, frame_reg,
+ GEN_INT (words * UNITS_PER_WORD))));
+
+ varargs_label = gen_label_rtx ();
+ emit_jump_insn (gen_jump (varargs_label));
+ JUMP_LABEL (get_last_insn ()) = varargs_label;
+
+ emit_barrier ();
}
-}
-/* Emit code to adjust the stack pointer by SIZE bytes while probing it.
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
- This differs from the next routine in that it tries hard to prevent
- attacks that jump the stack guard. Thus it is never allowed to allocate
- more than PROBE_INTERVAL bytes of stack space without a suitable
- probe.
+ /* If this function calls va_start, we now have to set the scratch
+ register for the case where we do not call __morestack. In this
+ case we need to set it based on the stack pointer. */
+ if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
+ {
+ emit_insn (gen_rtx_SET (scratch_reg,
+ gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+ GEN_INT (UNITS_PER_WORD))));
- INT_REGISTERS_SAVED is true if integer registers have already been
- pushed on the stack. */
+ emit_label (varargs_label);
+ LABEL_NUSES (varargs_label) = 1;
+ }
+}
+
+/* We may have to tell the dataflow pass that the split stack prologue
+ is initializing a scratch register. */
static void
-ix86_adjust_stack_and_probe_stack_clash (HOST_WIDE_INT size,
- const bool int_registers_saved)
+ix86_live_on_entry (bitmap regs)
{
- struct machine_function *m = cfun->machine;
-
- /* If this function does not statically allocate stack space, then
- no probes are needed. */
- if (!size)
+ if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
{
- /* However, the allocation of space via pushes for register
- saves could be viewed as allocating space, but without the
- need to probe. */
- if (m->frame.nregs || m->frame.nsseregs || frame_pointer_needed)
- dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
- else
- dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
- return;
+ gcc_assert (flag_split_stack);
+ bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
}
+}
+\f
+/* Extract the parts of an RTL expression that is a valid memory address
+ for an instruction. Return 0 if the structure of the address is
+ grossly off. Return -1 if the address contains ASHIFT, so it is not
+ strictly valid, but is still used for computing the length of lea
+ instructions. */
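+/* For example, (plus (plus (mult (reg B) (const_int 4)) (reg A))
+ (const_int 12)), i.e. A + B*4 + 12, decomposes into base = A,
+ index = B, scale = 4 and disp = 12. */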
- /* If we are a noreturn function, then we have to consider the
- possibility that we're called via a jump rather than a call.
-
- Thus we don't have the implicit probe generated by saving the
- return address into the stack at the call. Thus, the stack
- pointer could be anywhere in the guard page. The safe thing
- to do is emit a probe now.
-
- The probe can be avoided if we have already emitted any callee
- register saves into the stack or have a frame pointer (which will
- have been saved as well). Those saves will function as implicit
- probes.
+int
+ix86_decompose_address (rtx addr, struct ix86_address *out)
+{
+ rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
+ rtx base_reg, index_reg;
+ HOST_WIDE_INT scale = 1;
+ rtx scale_rtx = NULL_RTX;
+ rtx tmp;
+ int retval = 1;
+ addr_space_t seg = ADDR_SPACE_GENERIC;
- ?!? This should be revamped to work like aarch64 and s390 where
- we track the offset from the most recent probe. Normally that
- offset would be zero. For a noreturn function we would reset
- it to PROBE_INTERVAL - (STACK_BOUNDARY / BITS_PER_UNIT). Then
- we just probe when we cross PROBE_INTERVAL. */
- if (TREE_THIS_VOLATILE (cfun->decl)
- && !(m->frame.nregs || m->frame.nsseregs || frame_pointer_needed))
+ /* Allow zero-extended SImode addresses,
+ they will be emitted with addr32 prefix. */
+ if (TARGET_64BIT && GET_MODE (addr) == DImode)
{
- /* We can safely use any register here since we're just going to push
- its value and immediately pop it back. But we do try and avoid
- argument passing registers so as not to introduce dependencies in
- the pipeline. For 32 bit we use %esi and for 64 bit we use %rax. */
- rtx dummy_reg = gen_rtx_REG (word_mode, TARGET_64BIT ? AX_REG : SI_REG);
- rtx_insn *insn_push = emit_insn (gen_push (dummy_reg));
- rtx_insn *insn_pop = emit_insn (gen_pop (dummy_reg));
- m->fs.sp_offset -= UNITS_PER_WORD;
- if (m->fs.cfa_reg == stack_pointer_rtx)
+ if (GET_CODE (addr) == ZERO_EXTEND
+ && GET_MODE (XEXP (addr, 0)) == SImode)
{
- m->fs.cfa_offset -= UNITS_PER_WORD;
- rtx x = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
- x = gen_rtx_SET (stack_pointer_rtx, x);
- add_reg_note (insn_push, REG_CFA_ADJUST_CFA, x);
- RTX_FRAME_RELATED_P (insn_push) = 1;
- x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
- x = gen_rtx_SET (stack_pointer_rtx, x);
- add_reg_note (insn_pop, REG_CFA_ADJUST_CFA, x);
- RTX_FRAME_RELATED_P (insn_pop) = 1;
+ addr = XEXP (addr, 0);
+ if (CONST_INT_P (addr))
+ return 0;
+ }
+ else if (GET_CODE (addr) == AND
+ && const_32bit_mask (XEXP (addr, 1), DImode))
+ {
+ addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
+ if (addr == NULL_RTX)
+ return 0;
+
+ if (CONST_INT_P (addr))
+ return 0;
}
- emit_insn (gen_blockage ());
}
- /* If we allocate less than the size of the guard statically,
- then no probing is necessary, but we do need to allocate
- the stack. */
- if (size < (1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE)))
+ /* Allow SImode subregs of DImode addresses,
+ they will be emitted with addr32 prefix. */
+ if (TARGET_64BIT && GET_MODE (addr) == SImode)
{
- pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
- GEN_INT (-size), -1,
- m->fs.cfa_reg == stack_pointer_rtx);
- dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
- return;
+ if (SUBREG_P (addr)
+ && GET_MODE (SUBREG_REG (addr)) == DImode)
+ {
+ addr = SUBREG_REG (addr);
+ if (CONST_INT_P (addr))
+ return 0;
+ }
}
- /* We're allocating a large enough stack frame that we need to
- emit probes. Either emit them inline or in a loop depending
- on the size. */
- HOST_WIDE_INT probe_interval = get_probe_interval ();
- if (size <= 4 * probe_interval)
+ if (REG_P (addr))
+ base = addr;
+ else if (SUBREG_P (addr))
{
- HOST_WIDE_INT i;
- for (i = probe_interval; i <= size; i += probe_interval)
- {
- /* Allocate PROBE_INTERVAL bytes. */
- rtx insn
- = pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
- GEN_INT (-probe_interval), -1,
- m->fs.cfa_reg == stack_pointer_rtx);
- add_reg_note (insn, REG_STACK_CHECK, const0_rtx);
-
- /* And probe at *sp. */
- emit_stack_probe (stack_pointer_rtx);
- emit_insn (gen_blockage ());
- }
-
- /* We need to allocate space for the residual, but we do not need
- to probe the residual. */
- HOST_WIDE_INT residual = (i - probe_interval - size);
- if (residual)
- pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
- GEN_INT (residual), -1,
- m->fs.cfa_reg == stack_pointer_rtx);
- dump_stack_clash_frame_info (PROBE_INLINE, residual != 0);
+ if (REG_P (SUBREG_REG (addr)))
+ base = addr;
+ else
+ return 0;
}
- else
+ else if (GET_CODE (addr) == PLUS)
{
- /* We expect the GP registers to be saved when probes are used
- as the probing sequences might need a scratch register and
- the routine to allocate one assumes the integer registers
- have already been saved. */
- gcc_assert (int_registers_saved);
-
- struct scratch_reg sr;
- get_scratch_register_on_entry (&sr);
-
- /* If we needed to save a register, then account for any space
- that was pushed (we are not going to pop the register when
- we do the restore). */
- if (sr.saved)
- size -= UNITS_PER_WORD;
-
- /* Step 1: round SIZE down to a multiple of the interval. */
- HOST_WIDE_INT rounded_size = size & -probe_interval;
+ rtx addends[4], op;
+ int n = 0, i;
- /* Step 2: compute final value of the loop counter. Use lea if
- possible. */
- rtx addr = plus_constant (Pmode, stack_pointer_rtx, -rounded_size);
- rtx insn;
- if (address_no_seg_operand (addr, Pmode))
- insn = emit_insn (gen_rtx_SET (sr.reg, addr));
- else
- {
- emit_move_insn (sr.reg, GEN_INT (-rounded_size));
- insn = emit_insn (gen_rtx_SET (sr.reg,
- gen_rtx_PLUS (Pmode, sr.reg,
- stack_pointer_rtx)));
- }
- if (m->fs.cfa_reg == stack_pointer_rtx)
+ op = addr;
+ do
{
- add_reg_note (insn, REG_CFA_DEF_CFA,
- plus_constant (Pmode, sr.reg,
- m->fs.cfa_offset + rounded_size));
- RTX_FRAME_RELATED_P (insn) = 1;
+ if (n >= 4)
+ return 0;
+ addends[n++] = XEXP (op, 1);
+ op = XEXP (op, 0);
}
+ while (GET_CODE (op) == PLUS);
+ if (n >= 4)
+ return 0;
+ addends[n] = op;
- /* Step 3: the loop. */
- rtx size_rtx = GEN_INT (rounded_size);
- insn = emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg,
- size_rtx));
- if (m->fs.cfa_reg == stack_pointer_rtx)
+ for (i = n; i >= 0; --i)
{
- m->fs.cfa_offset += rounded_size;
- add_reg_note (insn, REG_CFA_DEF_CFA,
- plus_constant (Pmode, stack_pointer_rtx,
- m->fs.cfa_offset));
- RTX_FRAME_RELATED_P (insn) = 1;
- }
- m->fs.sp_offset += rounded_size;
- emit_insn (gen_blockage ());
-
- /* Step 4: adjust SP if we cannot assert at compile-time that SIZE
- is equal to ROUNDED_SIZE. */
-
- if (size != rounded_size)
- pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
- GEN_INT (rounded_size - size), -1,
- m->fs.cfa_reg == stack_pointer_rtx);
- dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
+ op = addends[i];
+ switch (GET_CODE (op))
+ {
+ case MULT:
+ if (index)
+ return 0;
+ index = XEXP (op, 0);
+ scale_rtx = XEXP (op, 1);
+ break;
- /* This does not deallocate the space reserved for the scratch
- register. That will be deallocated in the epilogue. */
- release_scratch_register_on_entry (&sr, size, false);
- }
+ case ASHIFT:
+ if (index)
+ return 0;
+ index = XEXP (op, 0);
+ tmp = XEXP (op, 1);
+ if (!CONST_INT_P (tmp))
+ return 0;
+ scale = INTVAL (tmp);
+ if ((unsigned HOST_WIDE_INT) scale > 3)
+ return 0;
+ scale = 1 << scale;
+ break;
- /* Make sure nothing is scheduled before we are done. */
- emit_insn (gen_blockage ());
-}
+ case ZERO_EXTEND:
+ op = XEXP (op, 0);
+ if (GET_CODE (op) != UNSPEC)
+ return 0;
+ /* FALLTHRU */
-/* Emit code to adjust the stack pointer by SIZE bytes while probing it.
+ case UNSPEC:
+ if (XINT (op, 1) == UNSPEC_TP
+ && TARGET_TLS_DIRECT_SEG_REFS
+ && seg == ADDR_SPACE_GENERIC)
+ seg = DEFAULT_TLS_SEG_REG;
+ else
+ return 0;
+ break;
- INT_REGISTERS_SAVED is true if integer registers have already been
- pushed on the stack. */
+ case SUBREG:
+ if (!REG_P (SUBREG_REG (op)))
+ return 0;
+ /* FALLTHRU */
-static void
-ix86_adjust_stack_and_probe (HOST_WIDE_INT size,
- const bool int_registers_saved)
-{
- /* We skip the probe for the first interval + a small dope of 4 words and
- probe that many bytes past the specified size to maintain a protection
- area at the bottom of the stack. */
- const int dope = 4 * UNITS_PER_WORD;
- rtx size_rtx = GEN_INT (size), last;
+ case REG:
+ if (!base)
+ base = op;
+ else if (!index)
+ index = op;
+ else
+ return 0;
+ break;
- /* See if we have a constant small number of probes to generate. If so,
- that's the easy case. The run-time loop is made up of 9 insns in the
- generic case while the compile-time loop is made up of 3+2*(n-1) insns
- for n # of intervals. */
- if (size <= 4 * get_probe_interval ())
- {
- HOST_WIDE_INT i, adjust;
- bool first_probe = true;
+ case CONST:
+ case CONST_INT:
+ case SYMBOL_REF:
+ case LABEL_REF:
+ if (disp)
+ return 0;
+ disp = op;
+ break;
- /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
- values of N from 1 until it exceeds SIZE. If only one probe is
- needed, this will not generate any code. Then adjust and probe
- to PROBE_INTERVAL + SIZE. */
- for (i = get_probe_interval (); i < size; i += get_probe_interval ())
- {
- if (first_probe)
- {
- adjust = 2 * get_probe_interval () + dope;
- first_probe = false;
+ default:
+ return 0;
}
- else
- adjust = get_probe_interval ();
-
- emit_insn (gen_rtx_SET (stack_pointer_rtx,
- plus_constant (Pmode, stack_pointer_rtx,
- -adjust)));
- emit_stack_probe (stack_pointer_rtx);
}
+ }
+ else if (GET_CODE (addr) == MULT)
+ {
+ index = XEXP (addr, 0); /* index*scale */
+ scale_rtx = XEXP (addr, 1);
+ }
+ else if (GET_CODE (addr) == ASHIFT)
+ {
+ /* We're called for lea too, which implements ashift on occasion. */
+ index = XEXP (addr, 0);
+ tmp = XEXP (addr, 1);
+ if (!CONST_INT_P (tmp))
+ return 0;
+ scale = INTVAL (tmp);
+ if ((unsigned HOST_WIDE_INT) scale > 3)
+ return 0;
+ scale = 1 << scale;
+ retval = -1;
+ }
+ else
+ disp = addr; /* displacement */
- if (first_probe)
- adjust = size + get_probe_interval () + dope;
+ if (index)
+ {
+ if (REG_P (index))
+ ;
+ else if (SUBREG_P (index)
+ && REG_P (SUBREG_REG (index)))
+ ;
else
- adjust = size + get_probe_interval () - i;
-
- emit_insn (gen_rtx_SET (stack_pointer_rtx,
- plus_constant (Pmode, stack_pointer_rtx,
- -adjust)));
- emit_stack_probe (stack_pointer_rtx);
-
- /* Adjust back to account for the additional first interval. */
- last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
- plus_constant (Pmode, stack_pointer_rtx,
- (get_probe_interval ()
- + dope))));
+ return 0;
}
- /* Otherwise, do the same as above, but in a loop. Note that we must be
- extra careful with variables wrapping around because we might be at
- the very top (or the very bottom) of the address space and we have
- to be able to handle this case properly; in particular, we use an
- equality test for the loop condition. */
- else
+ /* Extract the integral value of scale. */
+ if (scale_rtx)
{
- /* We expect the GP registers to be saved when probes are used
- as the probing sequences might need a scratch register and
- the routine to allocate one assumes the integer registers
- have already been saved. */
- gcc_assert (int_registers_saved);
-
- HOST_WIDE_INT rounded_size;
- struct scratch_reg sr;
+ if (!CONST_INT_P (scale_rtx))
+ return 0;
+ scale = INTVAL (scale_rtx);
+ }
- get_scratch_register_on_entry (&sr);
+ base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
+ index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
- /* If we needed to save a register, then account for any space
- that was pushed (we are not going to pop the register when
- we do the restore). */
- if (sr.saved)
- size -= UNITS_PER_WORD;
+ /* Avoid useless 0 displacement. */
+ if (disp == const0_rtx && (base || index))
+ disp = NULL_RTX;
- /* Step 1: round SIZE to the previous multiple of the interval. */
+ /* Allow the arg pointer and stack pointer as the index if there is
+ no scaling. */
+ if (base_reg && index_reg && scale == 1
+ && (REGNO (index_reg) == ARG_POINTER_REGNUM
+ || REGNO (index_reg) == FRAME_POINTER_REGNUM
+ || REGNO (index_reg) == SP_REG))
+ {
+ std::swap (base, index);
+ std::swap (base_reg, index_reg);
+ }
- rounded_size = ROUND_DOWN (size, get_probe_interval ());
+ /* Special case: %ebp cannot be encoded as a base without a displacement.
+ Similarly %r13. */
+ if (!disp && base_reg
+ && (REGNO (base_reg) == ARG_POINTER_REGNUM
+ || REGNO (base_reg) == FRAME_POINTER_REGNUM
+ || REGNO (base_reg) == BP_REG
+ || REGNO (base_reg) == R13_REG))
+ disp = const0_rtx;
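+ /* (With mod == 00, a ModR/M or SIB base field of 101b is decoded as
+ disp32 or RIP-relative addressing instead of %ebp/%r13, so these bases
+ always carry an explicit displacement, normally a zero disp8.) */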
+ /* Special case: on K6, [%esi] forces the instruction to be vector
+ decoded. Avoid this by transforming it to [%esi+0].
+ Reload calls address legitimization without cfun defined, so we need
+ to test cfun for being non-NULL. */
+ if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
+ && base_reg && !index_reg && !disp
+ && REGNO (base_reg) == SI_REG)
+ disp = const0_rtx;
- /* Step 2: compute initial and final value of the loop counter. */
+ /* Special case: encode reg+reg instead of reg*2. */
+ if (!base && index && scale == 2)
+ base = index, base_reg = index_reg, scale = 1;
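+ /* (Illustrative note, not in the original sources: a scaled index
+ with no base, e.g. "lea 0x0(,%eax,2), %edx", needs a four-byte zero
+ displacement in the SIB encoding, while the equivalent
+ "lea (%eax,%eax,1), %edx" needs none, hence the re-encoding of
+ reg*2 as reg+reg above.) */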
- /* SP = SP_0 + PROBE_INTERVAL. */
- emit_insn (gen_rtx_SET (stack_pointer_rtx,
- plus_constant (Pmode, stack_pointer_rtx,
- - (get_probe_interval () + dope))));
+ /* Special case: scaling cannot be encoded without base or displacement. */
+ if (!base && !disp && index && scale != 1)
+ disp = const0_rtx;
- /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
- if (rounded_size <= (HOST_WIDE_INT_1 << 31))
- emit_insn (gen_rtx_SET (sr.reg,
- plus_constant (Pmode, stack_pointer_rtx,
- -rounded_size)));
- else
- {
- emit_move_insn (sr.reg, GEN_INT (-rounded_size));
- emit_insn (gen_rtx_SET (sr.reg,
- gen_rtx_PLUS (Pmode, sr.reg,
- stack_pointer_rtx)));
- }
+ out->base = base;
+ out->index = index;
+ out->disp = disp;
+ out->scale = scale;
+ out->seg = seg;
+ return retval;
+}
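+
+/* A worked example (not from the original sources): for the 32-bit
+ address 12(%eax,%ebx,4), i.e. the RTL
+
+ (plus:SI (plus:SI (mult:SI (reg:SI bx) (const_int 4))
+ (reg:SI ax))
+ (const_int 12))
+
+ the routine above is expected to fill OUT with base = %eax,
+ index = %ebx, scale = 4, disp = (const_int 12) and
+ seg = ADDR_SPACE_GENERIC, returning 1; only the ASHIFT form used by
+ lea returns -1 instead. */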
+\f
+/* Return the cost of the memory address X.
+ For i386, it is better to use a complex address than let gcc copy
+ the address into a reg and make a new pseudo. But not if the address
+ requires two regs - that would mean more pseudos with longer
+ lifetimes. */
+static int
+ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
+{
+ struct ix86_address parts;
+ int cost = 1;
+ int ok = ix86_decompose_address (x, &parts);
- /* Step 3: the loop
+ gcc_assert (ok);
- do
- {
- SP = SP + PROBE_INTERVAL
- probe at SP
- }
- while (SP != LAST_ADDR)
+ if (parts.base && SUBREG_P (parts.base))
+ parts.base = SUBREG_REG (parts.base);
+ if (parts.index && SUBREG_P (parts.index))
+ parts.index = SUBREG_REG (parts.index);
- adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
- values of N from 1 until it is equal to ROUNDED_SIZE. */
+ /* Attempt to minimize the number of registers in the address by
+ increasing the address cost for each register used. We don't
+ increase the address cost for "pic_offset_table_rtx". When a memory
+ operand using "pic_offset_table_rtx" is not invariant itself, it most
+ likely means that the base or index is not invariant either. Therefore
+ only "pic_offset_table_rtx" could be hoisted out, which is not
+ profitable for x86. */
+ if (parts.base
+ && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
+ && (current_pass->type == GIMPLE_PASS
+ || !pic_offset_table_rtx
+ || !REG_P (parts.base)
+ || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
+ cost++;
- emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
+ if (parts.index
+ && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
+ && (current_pass->type == GIMPLE_PASS
+ || !pic_offset_table_rtx
+ || !REG_P (parts.index)
+ || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
+ cost++;
+ /* The AMD-K6 doesn't like addresses with the ModR/M byte set to
+ 00_xxx_100b, since its predecode logic can't detect the length of such
+ instructions and they degenerate to vector decoding. Increase the cost
+ of such addresses here. The penalty is at least 2 cycles. It may be
+ worthwhile to split such addresses or even refuse them altogether.
- /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
- assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
+ The following addressing modes are affected:
+ [base+scale*index]
+ [scale*index+disp]
+ [base+index]
- if (size != rounded_size)
- {
- emit_insn (gen_rtx_SET (stack_pointer_rtx,
- plus_constant (Pmode, stack_pointer_rtx,
- rounded_size - size)));
- emit_stack_probe (stack_pointer_rtx);
- }
+ The first and last cases may be avoidable by explicitly coding the zero
+ into the memory address, but I don't have an AMD-K6 machine handy to
+ check this theory. */
- /* Adjust back to account for the additional first interval. */
- last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
- plus_constant (Pmode, stack_pointer_rtx,
- (get_probe_interval ()
- + dope))));
+ if (TARGET_K6
+ && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
+ || (parts.disp && !parts.base && parts.index && parts.scale != 1)
+ || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
+ cost += 10;
- /* This does not deallocate the space reserved for the scratch
- register. That will be deallocated in the epilogue. */
- release_scratch_register_on_entry (&sr, size, false);
- }
+ return cost;
+}
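+
+/* For illustration (not in the original sources): with the rules above,
+ an address built only from hard registers such as 4(%esp) keeps the
+ base cost of 1, an address using one pseudo register costs 2, one
+ using two pseudos (base and index) costs 3, and on K6 the affected
+ forms additionally pay the +10 penalty. */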
+\f
+/* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O, as
+ this is used to form addresses to local data when -fPIC is in
+ use. */
- /* Even if the stack pointer isn't the CFA register, we need to correctly
- describe the adjustments made to it, in particular differentiate the
- frame-related ones from the frame-unrelated ones. */
- if (size > 0)
- {
- rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
- XVECEXP (expr, 0, 0)
- = gen_rtx_SET (stack_pointer_rtx,
- plus_constant (Pmode, stack_pointer_rtx, -size));
- XVECEXP (expr, 0, 1)
- = gen_rtx_SET (stack_pointer_rtx,
- plus_constant (Pmode, stack_pointer_rtx,
- get_probe_interval () + dope + size));
- add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
- RTX_FRAME_RELATED_P (last) = 1;
+static bool
+darwin_local_data_pic (rtx disp)
+{
+ return (GET_CODE (disp) == UNSPEC
+ && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
+}
- cfun->machine->fs.sp_offset += size;
- }
+/* Return true if operand X should be loaded from the GOT. */
- /* Make sure nothing is scheduled before we are done. */
- emit_insn (gen_blockage ());
+bool
+ix86_force_load_from_GOT_p (rtx x)
+{
+ return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
+ && !TARGET_PECOFF && !TARGET_MACHO
+ && !flag_pic
+ && ix86_cmodel != CM_LARGE
+ && GET_CODE (x) == SYMBOL_REF
+ && SYMBOL_REF_FUNCTION_P (x)
+ && (!flag_plt
+ || (SYMBOL_REF_DECL (x)
+ && lookup_attribute ("noplt",
+ DECL_ATTRIBUTES (SYMBOL_REF_DECL (x)))))
+ && !SYMBOL_REF_LOCAL_P (x));
}
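+
+/* For illustration (not in the original sources): when this predicate
+ holds, e.g. for an extern function compiled with -fno-plt or marked
+ __attribute__((noplt)), the call is expected to go through the GOT
+ slot, roughly "call *foo@GOTPCREL(%rip)" on x86-64, instead of
+ "call foo@PLT". */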
-/* Adjust the stack pointer up to REG while probing it. */
+/* Determine if a given RTX is a valid constant. We already know this
+ satisfies CONSTANT_P. */
-const char *
-output_adjust_stack_and_probe (rtx reg)
+static bool
+ix86_legitimate_constant_p (machine_mode mode, rtx x)
{
- static int labelno = 0;
- char loop_lab[32];
- rtx xops[2];
+ switch (GET_CODE (x))
+ {
+ case CONST:
+ x = XEXP (x, 0);
- ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
+ if (GET_CODE (x) == PLUS)
+ {
+ if (!CONST_INT_P (XEXP (x, 1)))
+ return false;
+ x = XEXP (x, 0);
+ }
- /* Loop. */
- ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
+ if (TARGET_MACHO && darwin_local_data_pic (x))
+ return true;
- /* SP = SP + PROBE_INTERVAL. */
- xops[0] = stack_pointer_rtx;
- xops[1] = GEN_INT (get_probe_interval ());
- output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
+ /* Only some unspecs are valid as "constants". */
+ if (GET_CODE (x) == UNSPEC)
+ switch (XINT (x, 1))
+ {
+ case UNSPEC_GOT:
+ case UNSPEC_GOTOFF:
+ case UNSPEC_PLTOFF:
+ return TARGET_64BIT;
+ case UNSPEC_TPOFF:
+ case UNSPEC_NTPOFF:
+ x = XVECEXP (x, 0, 0);
+ return (GET_CODE (x) == SYMBOL_REF
+ && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
+ case UNSPEC_DTPOFF:
+ x = XVECEXP (x, 0, 0);
+ return (GET_CODE (x) == SYMBOL_REF
+ && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
+ default:
+ return false;
+ }
- /* Probe at SP. */
- xops[1] = const0_rtx;
- output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
+ /* We must have drilled down to a symbol. */
+ if (GET_CODE (x) == LABEL_REF)
+ return true;
+ if (GET_CODE (x) != SYMBOL_REF)
+ return false;
+ /* FALLTHRU */
- /* Test if SP == LAST_ADDR. */
- xops[0] = stack_pointer_rtx;
- xops[1] = reg;
- output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
+ case SYMBOL_REF:
+ /* TLS symbols are never valid. */
+ if (SYMBOL_REF_TLS_MODEL (x))
+ return false;
- /* Branch. */
- fputs ("\tjne\t", asm_out_file);
- assemble_name_raw (asm_out_file, loop_lab);
- fputc ('\n', asm_out_file);
+ /* DLLIMPORT symbols are never valid. */
+ if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
+ && SYMBOL_REF_DLLIMPORT_P (x))
+ return false;
- return "";
-}
+#if TARGET_MACHO
+ /* mdynamic-no-pic */
+ if (MACHO_DYNAMIC_NO_PIC_P)
+ return machopic_symbol_defined_p (x);
+#endif
-/* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
- inclusive. These are offsets from the current stack pointer.
+ /* An external function's address should be loaded
+ via its GOT slot to avoid the PLT. */
+ if (ix86_force_load_from_GOT_p (x))
+ return false;
- INT_REGISTERS_SAVED is true if integer registers have already been
- pushed on the stack. */
+ break;
-static void
-ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size,
- const bool int_registers_saved)
-{
- /* See if we have a constant small number of probes to generate. If so,
- that's the easy case. The run-time loop is made up of 6 insns in the
- generic case while the compile-time loop is made up of n insns for n #
- of intervals. */
- if (size <= 6 * get_probe_interval ())
- {
- HOST_WIDE_INT i;
+ CASE_CONST_SCALAR_INT:
+ switch (mode)
+ {
+ case E_TImode:
+ if (TARGET_64BIT)
+ return true;
+ /* FALLTHRU */
+ case E_OImode:
+ case E_XImode:
+ if (!standard_sse_constant_p (x, mode))
+ return false;
+ default:
+ break;
+ }
+ break;
- /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
- it exceeds SIZE. If only one probe is needed, this will not
- generate any code. Then probe at FIRST + SIZE. */
- for (i = get_probe_interval (); i < size; i += get_probe_interval ())
- emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
- -(first + i)));
+ case CONST_VECTOR:
+ if (!standard_sse_constant_p (x, mode))
+ return false;
- emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
- -(first + size)));
+ default:
+ break;
}
- /* Otherwise, do the same as above, but in a loop. Note that we must be
- extra careful with variables wrapping around because we might be at
- the very top (or the very bottom) of the address space and we have
- to be able to handle this case properly; in particular, we use an
- equality test for the loop condition. */
- else
- {
- /* We expect the GP registers to be saved when probes are used
- as the probing sequences might need a scratch register and
- the routine to allocate one assumes the integer registers
- have already been saved. */
- gcc_assert (int_registers_saved);
-
- HOST_WIDE_INT rounded_size, last;
- struct scratch_reg sr;
-
- get_scratch_register_on_entry (&sr);
-
-
- /* Step 1: round SIZE to the previous multiple of the interval. */
+ /* Otherwise we handle everything else in the move patterns. */
+ return true;
+}
- rounded_size = ROUND_DOWN (size, get_probe_interval ());
+/* Determine if it's legal to put X into the constant pool. This
+ is not possible for the address of thread-local symbols, which
+ is checked above. */
+static bool
+ix86_cannot_force_const_mem (machine_mode mode, rtx x)
+{
+ /* We can put any immediate constant in memory. */
+ switch (GET_CODE (x))
+ {
+ CASE_CONST_ANY:
+ return false;
- /* Step 2: compute initial and final value of the loop counter. */
+ default:
+ break;
+ }
- /* TEST_OFFSET = FIRST. */
- emit_move_insn (sr.reg, GEN_INT (-first));
+ return !ix86_legitimate_constant_p (mode, x);
+}
- /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
- last = first + rounded_size;
+/* Return true if the symbol is marked as dllimport or as a stub
+ variable, false otherwise. */
+static bool
+is_imported_p (rtx x)
+{
+ if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
+ || GET_CODE (x) != SYMBOL_REF)
+ return false;
- /* Step 3: the loop
+ return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
+}
- do
- {
- TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
- probe at TEST_ADDR
- }
- while (TEST_ADDR != LAST_ADDR)
- probes at FIRST + N * PROBE_INTERVAL for values of N from 1
- until it is equal to ROUNDED_SIZE. */
+/* Nonzero if the constant value X is a legitimate general operand
+ when generating PIC code. It is given that flag_pic is on and
+ that X satisfies CONSTANT_P. */
- emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
+bool
+legitimate_pic_operand_p (rtx x)
+{
+ rtx inner;
+ switch (GET_CODE (x))
+ {
+ case CONST:
+ inner = XEXP (x, 0);
+ if (GET_CODE (inner) == PLUS
+ && CONST_INT_P (XEXP (inner, 1)))
+ inner = XEXP (inner, 0);
- /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
- that SIZE is equal to ROUNDED_SIZE. */
+ /* Only some unspecs are valid as "constants". */
+ if (GET_CODE (inner) == UNSPEC)
+ switch (XINT (inner, 1))
+ {
+ case UNSPEC_GOT:
+ case UNSPEC_GOTOFF:
+ case UNSPEC_PLTOFF:
+ return TARGET_64BIT;
+ case UNSPEC_TPOFF:
+ x = XVECEXP (inner, 0, 0);
+ return (GET_CODE (x) == SYMBOL_REF
+ && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
+ case UNSPEC_MACHOPIC_OFFSET:
+ return legitimate_pic_address_disp_p (x);
+ default:
+ return false;
+ }
+ /* FALLTHRU */
- if (size != rounded_size)
- emit_stack_probe (plus_constant (Pmode,
- gen_rtx_PLUS (Pmode,
- stack_pointer_rtx,
- sr.reg),
- rounded_size - size));
+ case SYMBOL_REF:
+ case LABEL_REF:
+ return legitimate_pic_address_disp_p (x);
- release_scratch_register_on_entry (&sr, size, true);
+ default:
+ return true;
}
-
- /* Make sure nothing is scheduled before we are done. */
- emit_insn (gen_blockage ());
}
-/* Probe a range of stack addresses from REG to END, inclusive. These are
- offsets from the current stack pointer. */
+/* Determine if a given CONST RTX is a valid memory displacement
+ in PIC mode. */
-const char *
-output_probe_stack_range (rtx reg, rtx end)
+bool
+legitimate_pic_address_disp_p (rtx disp)
{
- static int labelno = 0;
- char loop_lab[32];
- rtx xops[3];
-
- ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
+ bool saw_plus;
- /* Loop. */
- ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
+ /* In 64bit mode we can allow direct addresses of symbols and labels
+ when they are not dynamic symbols. */
+ if (TARGET_64BIT)
+ {
+ rtx op0 = disp, op1;
- /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
- xops[0] = reg;
- xops[1] = GEN_INT (get_probe_interval ());
- output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
+ switch (GET_CODE (disp))
+ {
+ case LABEL_REF:
+ return true;
- /* Probe at TEST_ADDR. */
- xops[0] = stack_pointer_rtx;
- xops[1] = reg;
- xops[2] = const0_rtx;
- output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
+ case CONST:
+ if (GET_CODE (XEXP (disp, 0)) != PLUS)
+ break;
+ op0 = XEXP (XEXP (disp, 0), 0);
+ op1 = XEXP (XEXP (disp, 0), 1);
+ if (!CONST_INT_P (op1))
+ break;
+ if (GET_CODE (op0) == UNSPEC
+ && (XINT (op0, 1) == UNSPEC_DTPOFF
+ || XINT (op0, 1) == UNSPEC_NTPOFF)
+ && trunc_int_for_mode (INTVAL (op1), SImode) == INTVAL (op1))
+ return true;
+ if (INTVAL (op1) >= 16*1024*1024
+ || INTVAL (op1) < -16*1024*1024)
+ break;
+ if (GET_CODE (op0) == LABEL_REF)
+ return true;
+ if (GET_CODE (op0) == CONST
+ && GET_CODE (XEXP (op0, 0)) == UNSPEC
+ && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
+ return true;
+ if (GET_CODE (op0) == UNSPEC
+ && XINT (op0, 1) == UNSPEC_PCREL)
+ return true;
+ if (GET_CODE (op0) != SYMBOL_REF)
+ break;
+ /* FALLTHRU */
- /* Test if TEST_ADDR == LAST_ADDR. */
- xops[0] = reg;
- xops[1] = end;
- output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
+ case SYMBOL_REF:
+ /* TLS references should always be enclosed in an UNSPEC.
+ A dllimported symbol always needs to be resolved. */
+ if (SYMBOL_REF_TLS_MODEL (op0)
+ || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
+ return false;
- /* Branch. */
- fputs ("\tjne\t", asm_out_file);
- assemble_name_raw (asm_out_file, loop_lab);
- fputc ('\n', asm_out_file);
+ if (TARGET_PECOFF)
+ {
+ if (is_imported_p (op0))
+ return true;
- return "";
+ if (SYMBOL_REF_FAR_ADDR_P (op0)
+ || !SYMBOL_REF_LOCAL_P (op0))
+ break;
+
+ /* Function symbols need to be resolved only for
+ the large model.
+ For the small model we don't need to resolve anything
+ here. */
+ if ((ix86_cmodel != CM_LARGE_PIC
+ && SYMBOL_REF_FUNCTION_P (op0))
+ || ix86_cmodel == CM_SMALL_PIC)
+ return true;
+ /* Non-external symbols don't need to be resolved for
+ the large and medium models. */
+ if ((ix86_cmodel == CM_LARGE_PIC
+ || ix86_cmodel == CM_MEDIUM_PIC)
+ && !SYMBOL_REF_EXTERNAL_P (op0))
+ return true;
+ }
+ else if (!SYMBOL_REF_FAR_ADDR_P (op0)
+ && (SYMBOL_REF_LOCAL_P (op0)
+ || (HAVE_LD_PIE_COPYRELOC
+ && flag_pie
+ && !SYMBOL_REF_WEAK (op0)
+ && !SYMBOL_REF_FUNCTION_P (op0)))
+ && ix86_cmodel != CM_LARGE_PIC)
+ return true;
+ break;
+
+ default:
+ break;
+ }
+ }
+ if (GET_CODE (disp) != CONST)
+ return false;
+ disp = XEXP (disp, 0);
+
+ if (TARGET_64BIT)
+ {
+ /* It is not safe to allow PLUS expressions here; this limits the
+ allowed distance of GOT table references. We should not need these
+ anyway. */
+ if (GET_CODE (disp) != UNSPEC
+ || (XINT (disp, 1) != UNSPEC_GOTPCREL
+ && XINT (disp, 1) != UNSPEC_GOTOFF
+ && XINT (disp, 1) != UNSPEC_PCREL
+ && XINT (disp, 1) != UNSPEC_PLTOFF))
+ return false;
+
+ if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
+ && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
+ return false;
+ return true;
+ }
+
+ saw_plus = false;
+ if (GET_CODE (disp) == PLUS)
+ {
+ if (!CONST_INT_P (XEXP (disp, 1)))
+ return false;
+ disp = XEXP (disp, 0);
+ saw_plus = true;
+ }
+
+ if (TARGET_MACHO && darwin_local_data_pic (disp))
+ return true;
+
+ if (GET_CODE (disp) != UNSPEC)
+ return false;
+
+ switch (XINT (disp, 1))
+ {
+ case UNSPEC_GOT:
+ if (saw_plus)
+ return false;
+ /* We need to check for both symbols and labels because VxWorks loads
+ text labels with @GOT rather than @GOTOFF. See gotoff_operand for
+ details. */
+ return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
+ || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
+ case UNSPEC_GOTOFF:
+ /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
+ While the ABI also specifies a 32bit relocation, we don't produce
+ it in the small PIC model at all. */
+ if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
+ || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
+ && !TARGET_64BIT)
+ return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
+ return false;
+ case UNSPEC_GOTTPOFF:
+ case UNSPEC_GOTNTPOFF:
+ case UNSPEC_INDNTPOFF:
+ if (saw_plus)
+ return false;
+ disp = XVECEXP (disp, 0, 0);
+ return (GET_CODE (disp) == SYMBOL_REF
+ && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
+ case UNSPEC_NTPOFF:
+ disp = XVECEXP (disp, 0, 0);
+ return (GET_CODE (disp) == SYMBOL_REF
+ && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
+ case UNSPEC_DTPOFF:
+ disp = XVECEXP (disp, 0, 0);
+ return (GET_CODE (disp) == SYMBOL_REF
+ && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
+ }
+
+ return false;
}
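+
+/* Illustrative examples (not in the original sources) of displacements
+ accepted above in 32-bit PIC code: a @GOTOFF reference such as
+ (const:SI (unspec:SI [(symbol_ref:SI ("x"))] UNSPEC_GOTOFF)),
+ optionally with an added CONST_INT, or a local-exec TLS reference
+ x@NTPOFF. A bare SYMBOL_REF is not a valid PIC displacement here. */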
-/* Return true if stack frame is required. Update STACK_ALIGNMENT
- to the largest alignment, in bits, of stack slot used if stack
- frame is required and CHECK_STACK_SLOT is true. */
+/* Determine if OP is a suitable RTX for an address register.
+ Return the naked register if a register or a register subreg is
+ found, otherwise return NULL_RTX. */
-static bool
-ix86_find_max_used_stack_alignment (unsigned int &stack_alignment,
- bool check_stack_slot)
+static rtx
+ix86_validate_address_register (rtx op)
{
- HARD_REG_SET set_up_by_prologue, prologue_used;
- basic_block bb;
+ machine_mode mode = GET_MODE (op);
- CLEAR_HARD_REG_SET (prologue_used);
- CLEAR_HARD_REG_SET (set_up_by_prologue);
- add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
- add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
- add_to_hard_reg_set (&set_up_by_prologue, Pmode,
- HARD_FRAME_POINTER_REGNUM);
+ /* Only SImode or DImode registers can form the address. */
+ if (mode != SImode && mode != DImode)
+ return NULL_RTX;
- /* The preferred stack alignment is the minimum stack alignment. */
- if (stack_alignment > crtl->preferred_stack_boundary)
- stack_alignment = crtl->preferred_stack_boundary;
+ if (REG_P (op))
+ return op;
+ else if (SUBREG_P (op))
+ {
+ rtx reg = SUBREG_REG (op);
- bool require_stack_frame = false;
+ if (!REG_P (reg))
+ return NULL_RTX;
- FOR_EACH_BB_FN (bb, cfun)
- {
- rtx_insn *insn;
- FOR_BB_INSNS (bb, insn)
- if (NONDEBUG_INSN_P (insn)
- && requires_stack_frame_p (insn, prologue_used,
- set_up_by_prologue))
- {
- require_stack_frame = true;
+ mode = GET_MODE (reg);
- if (check_stack_slot)
- {
- /* Find the maximum stack alignment. */
- subrtx_iterator::array_type array;
- FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL)
- if (MEM_P (*iter)
- && (reg_mentioned_p (stack_pointer_rtx,
- *iter)
- || reg_mentioned_p (frame_pointer_rtx,
- *iter)))
- {
- unsigned int alignment = MEM_ALIGN (*iter);
- if (alignment > stack_alignment)
- stack_alignment = alignment;
- }
- }
- }
+ /* Don't allow SUBREGs that span more than a word. They can
+ lead to spill failures when the register is one word out
+ of a two word structure. */
+ if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
+ return NULL_RTX;
+
+ /* Allow only SUBREGs of non-eliminable hard registers. */
+ if (register_no_elim_operand (reg, mode))
+ return reg;
}
- return require_stack_frame;
+ /* Op is not a register. */
+ return NULL_RTX;
}
-/* Finalize stack_realign_needed and frame_pointer_needed flags, which
- will guide prologue/epilogue to be generated in correct form. */
+/* Recognizes RTL expressions that are valid memory addresses for an
+ instruction. The MODE argument is the machine mode for the MEM
+ expression that wants to use this address.
-static void
-ix86_finalize_stack_frame_flags (void)
+ It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
+ convert common non-canonical forms to canonical form so that they will
+ be recognized. */
+
+static bool
+ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
{
- /* Check if stack realign is really needed after reload, and
- stores result in cfun */
- unsigned int incoming_stack_boundary
- = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
- ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
- unsigned int stack_alignment
- = (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
- ? crtl->max_used_stack_slot_alignment
- : crtl->stack_alignment_needed);
- unsigned int stack_realign
- = (incoming_stack_boundary < stack_alignment);
- bool recompute_frame_layout_p = false;
+ struct ix86_address parts;
+ rtx base, index, disp;
+ HOST_WIDE_INT scale;
+ addr_space_t seg;
- if (crtl->stack_realign_finalized)
+ if (ix86_decompose_address (addr, &parts) <= 0)
+ /* Decomposition failed. */
+ return false;
+
+ base = parts.base;
+ index = parts.index;
+ disp = parts.disp;
+ scale = parts.scale;
+ seg = parts.seg;
+
+ /* Validate base register. */
+ if (base)
{
- /* After stack_realign_needed is finalized, we can't no longer
- change it. */
- gcc_assert (crtl->stack_realign_needed == stack_realign);
- return;
+ rtx reg = ix86_validate_address_register (base);
+
+ if (reg == NULL_RTX)
+ return false;
+
+ if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
+ || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
+ /* Base is not valid. */
+ return false;
}
- /* If the only reason for frame_pointer_needed is that we conservatively
- assumed stack realignment might be needed or -fno-omit-frame-pointer
- is used, but in the end nothing that needed the stack alignment had
- been spilled nor stack access, clear frame_pointer_needed and say we
- don't need stack realignment. */
- if ((stack_realign || (!flag_omit_frame_pointer && optimize))
- && frame_pointer_needed
- && crtl->is_leaf
- && crtl->sp_is_unchanging
- && !ix86_current_function_calls_tls_descriptor
- && !crtl->accesses_prior_frames
- && !cfun->calls_alloca
- && !crtl->calls_eh_return
- /* See ira_setup_eliminable_regset for the rationale. */
- && !(STACK_CHECK_MOVING_SP
- && flag_stack_check
- && flag_exceptions
- && cfun->can_throw_non_call_exceptions)
- && !ix86_frame_pointer_required ()
- && get_frame_size () == 0
- && ix86_nsaved_sseregs () == 0
- && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
+ /* Validate index register. */
+ if (index)
{
- if (ix86_find_max_used_stack_alignment (stack_alignment,
- stack_realign))
- {
- /* Stack frame is required. If stack alignment needed is less
- than incoming stack boundary, don't realign stack. */
- stack_realign = incoming_stack_boundary < stack_alignment;
- if (!stack_realign)
- {
- crtl->max_used_stack_slot_alignment
- = incoming_stack_boundary;
- crtl->stack_alignment_needed
- = incoming_stack_boundary;
- /* Also update preferred_stack_boundary for leaf
- functions. */
- crtl->preferred_stack_boundary
- = incoming_stack_boundary;
- }
- }
- else
- {
- /* If drap has been set, but it actually isn't live at the
- start of the function, there is no reason to set it up. */
- if (crtl->drap_reg)
- {
- basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
- if (! REGNO_REG_SET_P (DF_LR_IN (bb),
- REGNO (crtl->drap_reg)))
- {
- crtl->drap_reg = NULL_RTX;
- crtl->need_drap = false;
- }
- }
- else
- cfun->machine->no_drap_save_restore = true;
-
- frame_pointer_needed = false;
- stack_realign = false;
- crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
- crtl->stack_alignment_needed = incoming_stack_boundary;
- crtl->stack_alignment_estimated = incoming_stack_boundary;
- if (crtl->preferred_stack_boundary > incoming_stack_boundary)
- crtl->preferred_stack_boundary = incoming_stack_boundary;
- df_finish_pass (true);
- df_scan_alloc (NULL);
- df_scan_blocks ();
- df_compute_regs_ever_live (true);
- df_analyze ();
-
- if (flag_var_tracking)
- {
- /* Since frame pointer is no longer available, replace it with
- stack pointer - UNITS_PER_WORD in debug insns. */
- df_ref ref, next;
- for (ref = DF_REG_USE_CHAIN (HARD_FRAME_POINTER_REGNUM);
- ref; ref = next)
- {
- next = DF_REF_NEXT_REG (ref);
- if (!DF_REF_INSN_INFO (ref))
- continue;
-
- /* Make sure the next ref is for a different instruction,
- so that we're not affected by the rescan. */
- rtx_insn *insn = DF_REF_INSN (ref);
- while (next && DF_REF_INSN (next) == insn)
- next = DF_REF_NEXT_REG (next);
+ rtx reg = ix86_validate_address_register (index);
- if (DEBUG_INSN_P (insn))
- {
- bool changed = false;
- for (; ref != next; ref = DF_REF_NEXT_REG (ref))
- {
- rtx *loc = DF_REF_LOC (ref);
- if (*loc == hard_frame_pointer_rtx)
- {
- *loc = plus_constant (Pmode,
- stack_pointer_rtx,
- -UNITS_PER_WORD);
- changed = true;
- }
- }
- if (changed)
- df_insn_rescan (insn);
- }
- }
- }
+ if (reg == NULL_RTX)
+ return false;
- recompute_frame_layout_p = true;
- }
- }
- else if (crtl->max_used_stack_slot_alignment >= 128)
- {
- /* We don't need to realign stack. max_used_stack_alignment is
- used to decide how stack frame should be aligned. This is
- independent of any psABIs nor 32-bit vs 64-bit. It is always
- safe to compute max_used_stack_alignment. We compute it only
- if 128-bit aligned load/store may be generated on misaligned
- stack slot which will lead to segfault. */
- if (ix86_find_max_used_stack_alignment (stack_alignment, true))
- cfun->machine->max_used_stack_alignment
- = stack_alignment / BITS_PER_UNIT;
+ if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
+ || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
+ /* Index is not valid. */
+ return false;
}
- if (crtl->stack_realign_needed != stack_realign)
- recompute_frame_layout_p = true;
- crtl->stack_realign_needed = stack_realign;
- crtl->stack_realign_finalized = true;
- if (recompute_frame_layout_p)
- ix86_compute_frame_layout ();
-}
+ /* Index and base should have the same mode. */
+ if (base && index
+ && GET_MODE (base) != GET_MODE (index))
+ return false;
-/* Delete SET_GOT right after entry block if it is allocated to reg. */
+ /* Address override works only on the (%reg) part of %fs:(%reg). */
+ if (seg != ADDR_SPACE_GENERIC
+ && ((base && GET_MODE (base) != word_mode)
+ || (index && GET_MODE (index) != word_mode)))
+ return false;
-static void
-ix86_elim_entry_set_got (rtx reg)
-{
- basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
- rtx_insn *c_insn = BB_HEAD (bb);
- if (!NONDEBUG_INSN_P (c_insn))
- c_insn = next_nonnote_nondebug_insn (c_insn);
- if (c_insn && NONJUMP_INSN_P (c_insn))
+ /* Validate scale factor. */
+ if (scale != 1)
{
- rtx pat = PATTERN (c_insn);
- if (GET_CODE (pat) == PARALLEL)
- {
- rtx vec = XVECEXP (pat, 0, 0);
- if (GET_CODE (vec) == SET
- && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
- && REGNO (XEXP (vec, 0)) == REGNO (reg))
- delete_insn (c_insn);
- }
+ if (!index)
+ /* Scale without index. */
+ return false;
+
+ if (scale != 2 && scale != 4 && scale != 8)
+ /* Scale is not a valid multiplier. */
+ return false;
}
-}
-static rtx
-gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store)
-{
- rtx addr, mem;
+ /* Validate displacement. */
+ if (disp)
+ {
+ if (GET_CODE (disp) == CONST
+ && GET_CODE (XEXP (disp, 0)) == UNSPEC
+ && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
+ switch (XINT (XEXP (disp, 0), 1))
+ {
+ /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit
+ when used. While the ABI also specifies 32bit relocations, we
+ don't produce them at all and use IP-relative addressing instead.
+ Allow GOT in 32bit mode for both PIC and non-PIC if the symbol
+ should be loaded via the GOT. */
+ case UNSPEC_GOT:
+ if (!TARGET_64BIT
+ && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
+ goto is_legitimate_pic;
+ /* FALLTHRU */
+ case UNSPEC_GOTOFF:
+ gcc_assert (flag_pic);
+ if (!TARGET_64BIT)
+ goto is_legitimate_pic;
- if (offset)
- addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset));
- mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg);
- return gen_rtx_SET (store ? mem : reg, store ? reg : mem);
-}
+ /* 64bit address unspec. */
+ return false;
-static inline rtx
-gen_frame_load (rtx reg, rtx frame_reg, int offset)
-{
- return gen_frame_set (reg, frame_reg, offset, false);
-}
+ case UNSPEC_GOTPCREL:
+ if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
+ goto is_legitimate_pic;
+ /* FALLTHRU */
+ case UNSPEC_PCREL:
+ gcc_assert (flag_pic);
+ goto is_legitimate_pic;
-static inline rtx
-gen_frame_store (rtx reg, rtx frame_reg, int offset)
-{
- return gen_frame_set (reg, frame_reg, offset, true);
-}
+ case UNSPEC_GOTTPOFF:
+ case UNSPEC_GOTNTPOFF:
+ case UNSPEC_INDNTPOFF:
+ case UNSPEC_NTPOFF:
+ case UNSPEC_DTPOFF:
+ break;
-static void
-ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame)
-{
- struct machine_function *m = cfun->machine;
- const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
- + m->call_ms2sysv_extra_regs;
- rtvec v = rtvec_alloc (ncregs + 1);
- unsigned int align, i, vi = 0;
- rtx_insn *insn;
- rtx sym, addr;
- rtx rax = gen_rtx_REG (word_mode, AX_REG);
- const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
+ default:
+ /* Invalid address unspec. */
+ return false;
+ }
- /* AL should only be live with sysv_abi. */
- gcc_assert (!ix86_eax_live_at_start_p ());
- gcc_assert (m->fs.sp_offset >= frame.sse_reg_save_offset);
+ else if (SYMBOLIC_CONST (disp)
+ && (flag_pic
+ || (TARGET_MACHO
+#if TARGET_MACHO
+ && MACHOPIC_INDIRECT
+ && !machopic_operand_p (disp)
+#endif
+ )))
+ {
- /* Setup RAX as the stub's base pointer. We use stack_realign_offset rather
- we've actually realigned the stack or not. */
- align = GET_MODE_ALIGNMENT (V4SFmode);
- addr = choose_baseaddr (frame.stack_realign_offset
- + xlogue.get_stub_ptr_offset (), &align, AX_REG);
- gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
+ is_legitimate_pic:
+ if (TARGET_64BIT && (index || base))
+ {
+ /* foo@dtpoff(%rX) is ok. */
+ if (GET_CODE (disp) != CONST
+ || GET_CODE (XEXP (disp, 0)) != PLUS
+ || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
+ || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
+ || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
+ && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
+ /* Non-constant pic memory reference. */
+ return false;
+ }
+ else if ((!TARGET_MACHO || flag_pic)
+ && ! legitimate_pic_address_disp_p (disp))
+ /* Displacement is an invalid pic construct. */
+ return false;
+#if TARGET_MACHO
+ else if (MACHO_DYNAMIC_NO_PIC_P
+ && !ix86_legitimate_constant_p (Pmode, disp))
+ /* The displacement must be referenced via a non_lazy_pointer. */
+ return false;
+#endif
- emit_insn (gen_rtx_SET (rax, addr));
+ /* This code used to verify that a symbolic pic displacement
+ includes the pic_offset_table_rtx register.
- /* Get the stub symbol. */
- sym = xlogue.get_stub_rtx (frame_pointer_needed ? XLOGUE_STUB_SAVE_HFP
- : XLOGUE_STUB_SAVE);
- RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
+ While this is a good idea, unfortunately these constructs may
+ be created by the "adds using lea" optimization for incorrect
+ code like:
- for (i = 0; i < ncregs; ++i)
- {
- const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
- rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? V4SFmode : word_mode),
- r.regno);
- RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset);
- }
+ int a;
+ int foo(int i)
+ {
+ return *(&a+i);
+ }
- gcc_assert (vi == (unsigned)GET_NUM_ELEM (v));
+ This code is nonsensical, but results in addressing the
+ GOT table with a pic_offset_table_rtx base. We can't
+ easily refuse it, since it gets matched by the
+ "addsi3" pattern, which is later split into lea when the
+ output register differs from the input. While this
+ could be handled by a separate addsi pattern for this case
+ that never results in lea, disabling this test seems to be
+ the easier and correct fix for the crash. */
+ }
+ else if (GET_CODE (disp) != LABEL_REF
+ && !CONST_INT_P (disp)
+ && (GET_CODE (disp) != CONST
+ || !ix86_legitimate_constant_p (Pmode, disp))
+ && (GET_CODE (disp) != SYMBOL_REF
+ || !ix86_legitimate_constant_p (Pmode, disp)))
+ /* Displacement is not constant. */
+ return false;
+ else if (TARGET_64BIT
+ && !x86_64_immediate_operand (disp, VOIDmode))
+ /* Displacement is out of range. */
+ return false;
+ /* In x32 mode, constant addresses are sign-extended to 64 bits, so
+ we have to reject addresses from 0x80000000 to 0xffffffff. */
+ else if (TARGET_X32 && !(index || base)
+ && CONST_INT_P (disp)
+ && val_signbit_known_set_p (SImode, INTVAL (disp)))
+ return false;
+ }
- insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v));
- RTX_FRAME_RELATED_P (insn) = true;
+ /* Everything looks valid. */
+ return true;
}
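+
+/* Worked example (not in the original sources) for the x32 check above:
+ with -mx32, a bare constant address (const_int 0x80000000) is rejected
+ because its SImode sign bit is set and it would be sign-extended to
+ 0xffffffff80000000, while (const_int 0x7ffffff0) is still accepted. */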
-/* Expand the prologue into a bunch of separate insns. */
+/* Determine if a given RTX is a valid constant address. */
-void
-ix86_expand_prologue (void)
+bool
+constant_address_p (rtx x)
{
- struct machine_function *m = cfun->machine;
- rtx insn, t;
- HOST_WIDE_INT allocate;
- bool int_registers_saved;
- bool sse_registers_saved;
- bool save_stub_call_needed;
- rtx static_chain = NULL_RTX;
+ return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
+}
+\f
+/* Return a unique alias set for the GOT. */
- if (ix86_function_naked (current_function_decl))
- return;
+alias_set_type
+ix86_GOT_alias_set (void)
+{
+ static alias_set_type set = -1;
+ if (set == -1)
+ set = new_alias_set ();
+ return set;
+}
- ix86_finalize_stack_frame_flags ();
+/* Return a legitimate reference for ORIG (an address) using the
+ register REG. If REG is 0, a new pseudo is generated.
- /* DRAP should not coexist with stack_realign_fp */
- gcc_assert (!(crtl->drap_reg && stack_realign_fp));
+ There are two types of references that must be handled:
- memset (&m->fs, 0, sizeof (m->fs));
+ 1. Global data references must load the address from the GOT, via
+ the PIC reg. An insn is emitted to do this load, and the reg is
+ returned.
- /* Initialize CFA state for before the prologue. */
- m->fs.cfa_reg = stack_pointer_rtx;
- m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
+ 2. Static data references, constant pool addresses, and code labels
+ compute the address as an offset from the GOT, whose base is in
+ the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
+ differentiate them from global data objects. The returned
+ address is the PIC reg + an unspec constant.
- /* Track SP offset to the CFA. We continue tracking this after we've
- swapped the CFA register away from SP. In the case of re-alignment
- this is fudged; we're interested to offsets within the local frame. */
- m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
- m->fs.sp_valid = true;
- m->fs.sp_realigned = false;
+ TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
+ reg also appears in the address. */
- const struct ix86_frame &frame = cfun->machine->frame;
+rtx
+legitimize_pic_address (rtx orig, rtx reg)
+{
+ rtx addr = orig;
+ rtx new_rtx = orig;
- if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
+#if TARGET_MACHO
+ if (TARGET_MACHO && !TARGET_64BIT)
{
- /* We should have already generated an error for any use of
- ms_hook on a nested function. */
- gcc_checking_assert (!ix86_static_chain_on_stack);
-
- /* Check if profiling is active and we shall use profiling before
- prologue variant. If so sorry. */
- if (crtl->profile && flag_fentry != 0)
- sorry ("ms_hook_prologue attribute isn%'t compatible "
- "with %<-mfentry%> for 32-bit");
-
- /* In ix86_asm_output_function_label we emitted:
- 8b ff movl.s %edi,%edi
- 55 push %ebp
- 8b ec movl.s %esp,%ebp
-
- This matches the hookable function prologue in Win32 API
- functions in Microsoft Windows XP Service Pack 2 and newer.
- Wine uses this to enable Windows apps to hook the Win32 API
- functions provided by Wine.
-
- What that means is that we've already set up the frame pointer. */
-
- if (frame_pointer_needed
- && !(crtl->drap_reg && crtl->stack_realign_needed))
- {
- rtx push, mov;
-
- /* We've decided to use the frame pointer already set up.
- Describe this to the unwinder by pretending that both
- push and mov insns happen right here.
+ if (reg == 0)
+ reg = gen_reg_rtx (Pmode);
+ /* Use the generic Mach-O PIC machinery. */
+ return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
+ }
+#endif
- Putting the unwind info here at the end of the ms_hook
- is done so that we can make absolutely certain we get
- the required byte sequence at the start of the function,
- rather than relying on an assembler that can produce
- the exact encoding required.
+ if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
+ {
+ rtx tmp = legitimize_pe_coff_symbol (addr, true);
+ if (tmp)
+ return tmp;
+ }
- However it does mean (in the unpatched case) that we have
- a 1 insn window where the asynchronous unwind info is
- incorrect. However, if we placed the unwind info at
- its correct location we would have incorrect unwind info
- in the patched case. Which is probably all moot since
- I don't expect Wine generates dwarf2 unwind info for the
- system libraries that use this feature. */
+ if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
+ new_rtx = addr;
+ else if ((!TARGET_64BIT
+ || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
+ && !TARGET_PECOFF
+ && gotoff_operand (addr, Pmode))
+ {
+ /* This symbol may be referenced via a displacement
+ from the PIC base address (@GOTOFF). */
+ if (GET_CODE (addr) == CONST)
+ addr = XEXP (addr, 0);
- insn = emit_insn (gen_blockage ());
+ if (GET_CODE (addr) == PLUS)
+ {
+ new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
+ UNSPEC_GOTOFF);
+ new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
+ }
+ else
+ new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
- push = gen_push (hard_frame_pointer_rtx);
- mov = gen_rtx_SET (hard_frame_pointer_rtx,
- stack_pointer_rtx);
- RTX_FRAME_RELATED_P (push) = 1;
- RTX_FRAME_RELATED_P (mov) = 1;
+ new_rtx = gen_rtx_CONST (Pmode, new_rtx);
- RTX_FRAME_RELATED_P (insn) = 1;
- add_reg_note (insn, REG_FRAME_RELATED_EXPR,
- gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
+ if (TARGET_64BIT)
+ new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
- /* Note that gen_push incremented m->fs.cfa_offset, even
- though we didn't emit the push insn here. */
- m->fs.cfa_reg = hard_frame_pointer_rtx;
- m->fs.fp_offset = m->fs.cfa_offset;
- m->fs.fp_valid = true;
- }
- else
+ if (reg != 0)
{
- /* The frame pointer is not needed so pop %ebp again.
- This leaves us with a pristine state. */
- emit_insn (gen_pop (hard_frame_pointer_rtx));
- }
+ gcc_assert (REG_P (reg));
+ new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
+ new_rtx, reg, 1, OPTAB_DIRECT);
+ }
+ else
+ new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
}
-
- /* The first insn of a function that accepts its static chain on the
- stack is to push the register that would be filled in by a direct
- call. This insn will be skipped by the trampoline. */
- else if (ix86_static_chain_on_stack)
+ else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
+ /* We can't use @GOTOFF for text labels
+ on VxWorks, see gotoff_operand. */
+ || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
{
- static_chain = ix86_static_chain (cfun->decl, false);
- insn = emit_insn (gen_push (static_chain));
- emit_insn (gen_blockage ());
+ rtx tmp = legitimize_pe_coff_symbol (addr, true);
+ if (tmp)
+ return tmp;
- /* We don't want to interpret this push insn as a register save,
- only as a stack adjustment. The real copy of the register as
- a save will be done later, if needed. */
- t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
- t = gen_rtx_SET (stack_pointer_rtx, t);
- add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
- RTX_FRAME_RELATED_P (insn) = 1;
- }
+ /* For x64 PE-COFF there is no GOT table,
+ so we use the address directly. */
+ if (TARGET_64BIT && TARGET_PECOFF)
+ {
+ new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
+ new_rtx = gen_rtx_CONST (Pmode, new_rtx);
+ }
+ else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
+ {
+ new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
+ UNSPEC_GOTPCREL);
+ new_rtx = gen_rtx_CONST (Pmode, new_rtx);
+ new_rtx = gen_const_mem (Pmode, new_rtx);
+ set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
+ }
+ else
+ {
+ /* This symbol must be referenced via a load
+ from the Global Offset Table (@GOT). */
+ new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
+ new_rtx = gen_rtx_CONST (Pmode, new_rtx);
+ if (TARGET_64BIT)
+ new_rtx = force_reg (Pmode, new_rtx);
+ new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
+ new_rtx = gen_const_mem (Pmode, new_rtx);
+ set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
+ }
- /* Emit prologue code to adjust stack alignment and setup DRAP, in case
- of DRAP is needed and stack realignment is really needed after reload */
- if (stack_realign_drap)
+ new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
+ }
+ else
{
- int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
+ if (CONST_INT_P (addr)
+ && !x86_64_immediate_operand (addr, VOIDmode))
+ new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
+ else if (GET_CODE (addr) == CONST)
+ {
+ addr = XEXP (addr, 0);
- /* Can't use DRAP in interrupt function. */
- if (cfun->machine->func_type != TYPE_NORMAL)
- sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
- "in interrupt service routine. This may be worked "
- "around by avoiding functions with aggregate return.");
+ /* We must match what we generated before. Assume the only
+ unspecs that can get here are ours; not that we could do
+ anything with them anyway. */
+ if (GET_CODE (addr) == UNSPEC
+ || (GET_CODE (addr) == PLUS
+ && GET_CODE (XEXP (addr, 0)) == UNSPEC))
+ return orig;
+ gcc_assert (GET_CODE (addr) == PLUS);
+ }
- /* Only need to push parameter pointer reg if it is caller saved. */
- if (!call_used_regs[REGNO (crtl->drap_reg)])
+ if (GET_CODE (addr) == PLUS)
{
- /* Push arg pointer reg */
- insn = emit_insn (gen_push (crtl->drap_reg));
- RTX_FRAME_RELATED_P (insn) = 1;
- }
+ rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
- /* Grab the argument pointer. */
- t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
- insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
- RTX_FRAME_RELATED_P (insn) = 1;
- m->fs.cfa_reg = crtl->drap_reg;
- m->fs.cfa_offset = 0;
+ /* Check first to see if this is a constant
+ offset from a @GOTOFF symbol reference. */
+ if (!TARGET_PECOFF
+ && gotoff_operand (op0, Pmode)
+ && CONST_INT_P (op1))
+ {
+ if (!TARGET_64BIT)
+ {
+ new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
+ UNSPEC_GOTOFF);
+ new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
+ new_rtx = gen_rtx_CONST (Pmode, new_rtx);
- /* Align the stack. */
- insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
- stack_pointer_rtx,
- GEN_INT (-align_bytes)));
- RTX_FRAME_RELATED_P (insn) = 1;
+ if (reg != 0)
+ {
+ gcc_assert (REG_P (reg));
+ new_rtx = expand_simple_binop (Pmode, PLUS,
+ pic_offset_table_rtx,
+ new_rtx, reg, 1,
+ OPTAB_DIRECT);
+ }
+ else
+ new_rtx
+ = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
+ }
+ else
+ {
+ if (INTVAL (op1) < -16*1024*1024
+ || INTVAL (op1) >= 16*1024*1024)
+ {
+ if (!x86_64_immediate_operand (op1, Pmode))
+ op1 = force_reg (Pmode, op1);
- /* Replicate the return address on the stack so that return
- address can be reached via (argp - 1) slot. This is needed
- to implement macro RETURN_ADDR_RTX and intrinsic function
- expand_builtin_return_addr etc. */
- t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
- t = gen_frame_mem (word_mode, t);
- insn = emit_insn (gen_push (t));
- RTX_FRAME_RELATED_P (insn) = 1;
+ new_rtx
+ = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
+ }
+ }
+ }
+ else
+ {
+ rtx base = legitimize_pic_address (op0, reg);
+ machine_mode mode = GET_MODE (base);
+ new_rtx
+ = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
- /* For the purposes of frame and register save area addressing,
- we've started over with a new frame. */
- m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
- m->fs.realigned = true;
+ if (CONST_INT_P (new_rtx))
+ {
+ if (INTVAL (new_rtx) < -16*1024*1024
+ || INTVAL (new_rtx) >= 16*1024*1024)
+ {
+ if (!x86_64_immediate_operand (new_rtx, mode))
+ new_rtx = force_reg (mode, new_rtx);
- if (static_chain)
- {
- /* Replicate static chain on the stack so that static chain
- can be reached via (argp - 2) slot. This is needed for
- nested function with stack realignment. */
- insn = emit_insn (gen_push (static_chain));
- RTX_FRAME_RELATED_P (insn) = 1;
+ new_rtx
+ = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
+ }
+ else
+ new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
+ }
+ else
+ {
+ /* For %rip addressing, we have to use just disp32,
+ with neither base nor index. */
+ if (TARGET_64BIT
+ && (GET_CODE (base) == SYMBOL_REF
+ || GET_CODE (base) == LABEL_REF))
+ base = force_reg (mode, base);
+ if (GET_CODE (new_rtx) == PLUS
+ && CONSTANT_P (XEXP (new_rtx, 1)))
+ {
+ base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
+ new_rtx = XEXP (new_rtx, 1);
+ }
+ new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
+ }
+ }
}
}
+ return new_rtx;
+}
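+
+/* For illustration (not in the original sources): in 32-bit PIC code the
+ function above turns a local symbol into
+ (plus:SI pic_offset_table_rtx
+ (const:SI (unspec:SI [(symbol_ref:SI ("x"))] UNSPEC_GOTOFF)))
+ i.e. x@GOTOFF(%ebx), while a preemptible global symbol becomes a load
+ from its GOT slot,
+ (mem:SI (plus:SI pic_offset_table_rtx
+ (const:SI (unspec:SI [(symbol_ref:SI ("x"))] UNSPEC_GOT)))). */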
+\f
+/* Load the thread pointer. If TO_REG is true, force it into a register. */
- int_registers_saved = (frame.nregs == 0);
- sse_registers_saved = (frame.nsseregs == 0);
- save_stub_call_needed = (m->call_ms2sysv);
- gcc_assert (sse_registers_saved || !save_stub_call_needed);
+static rtx
+get_thread_pointer (machine_mode tp_mode, bool to_reg)
+{
+ rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
- if (frame_pointer_needed && !m->fs.fp_valid)
+ if (GET_MODE (tp) != tp_mode)
{
- /* Note: AT&T enter does NOT have reversed args. Enter is probably
- slower on all targets. Also sdb didn't like it. */
- insn = emit_insn (gen_push (hard_frame_pointer_rtx));
- RTX_FRAME_RELATED_P (insn) = 1;
+ gcc_assert (GET_MODE (tp) == SImode);
+ gcc_assert (tp_mode == DImode);
- /* Push registers now, before setting the frame pointer
- on SEH target. */
- if (!int_registers_saved
- && TARGET_SEH
- && !frame.save_regs_using_mov)
- {
- ix86_emit_save_regs ();
- int_registers_saved = true;
- gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
- }
+ tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
+ }
- if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
- {
- insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
- RTX_FRAME_RELATED_P (insn) = 1;
+ if (to_reg)
+ tp = copy_to_mode_reg (tp_mode, tp);
- if (m->fs.cfa_reg == stack_pointer_rtx)
- m->fs.cfa_reg = hard_frame_pointer_rtx;
- m->fs.fp_offset = m->fs.sp_offset;
- m->fs.fp_valid = true;
- }
- }
+ return tp;
+}
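+
+/* Background note (not in the original sources): UNSPEC_TP denotes the
+ TLS thread pointer, which on GNU/Linux is the %fs segment base in
+ 64-bit mode and the %gs segment base in 32-bit mode, so the access
+ typically materializes as "mov %fs:0, %rax" (or "%gs:0"). */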
- if (!int_registers_saved)
+/* Construct the SYMBOL_REF for the tls_get_addr function. */
+
+static GTY(()) rtx ix86_tls_symbol;
+
+static rtx
+ix86_tls_get_addr (void)
+{
+ if (!ix86_tls_symbol)
{
- /* If saving registers via PUSH, do so now. */
- if (!frame.save_regs_using_mov)
- {
- ix86_emit_save_regs ();
- int_registers_saved = true;
- gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
- }
+ const char *sym
+ = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
+ ? "___tls_get_addr" : "__tls_get_addr");
- /* When using red zone we may start register saving before allocating
- the stack frame saving one cycle of the prologue. However, avoid
- doing this if we have to probe the stack; at least on x86_64 the
- stack probe can turn into a call that clobbers a red zone location. */
- else if (ix86_using_red_zone ()
- && (! TARGET_STACK_PROBE
- || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
- {
- ix86_emit_save_regs_using_mov (frame.reg_save_offset);
- int_registers_saved = true;
- }
+ ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
}
- if (stack_realign_fp)
+ if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
{
- int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
- gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
+ rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
+ UNSPEC_PLTOFF);
+ return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
+ gen_rtx_CONST (Pmode, unspec));
+ }
- /* Record last valid frame pointer offset. */
- m->fs.sp_realigned_fp_last = frame.reg_save_offset;
+ return ix86_tls_symbol;
+}
- /* The computation of the size of the re-aligned stack frame means
- that we must allocate the size of the register save area before
- performing the actual alignment. Otherwise we cannot guarantee
- that there's enough storage above the realignment point. */
- allocate = frame.reg_save_offset - m->fs.sp_offset
- + frame.stack_realign_allocate;
- if (allocate)
- pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
- GEN_INT (-allocate), -1, false);
+/* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
- /* Align the stack. */
- insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
- stack_pointer_rtx,
- GEN_INT (-align_bytes)));
- m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
- m->fs.sp_realigned_offset = m->fs.sp_offset
- - frame.stack_realign_allocate;
- /* The stack pointer may no longer be equal to CFA - m->fs.sp_offset.
- Beyond this point, stack access should be done via choose_baseaddr or
- by using sp_valid_at and fp_valid_at to determine the correct base
- register. Henceforth, any CFA offset should be thought of as logical
- and not physical. */
- gcc_assert (m->fs.sp_realigned_offset >= m->fs.sp_realigned_fp_last);
- gcc_assert (m->fs.sp_realigned_offset == frame.stack_realign_offset);
- m->fs.sp_realigned = true;
+static GTY(()) rtx ix86_tls_module_base_symbol;
- /* SEH unwind emit doesn't currently support REG_CFA_EXPRESSION, which
- is needed to describe where a register is saved using a realigned
- stack pointer, so we need to invalidate the stack pointer for that
- target. */
- if (TARGET_SEH)
- m->fs.sp_valid = false;
+rtx
+ix86_tls_module_base (void)
+{
+ if (!ix86_tls_module_base_symbol)
+ {
+ ix86_tls_module_base_symbol
+ = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
- /* If SP offset is non-immediate after allocation of the stack frame,
- then emit SSE saves or stub call prior to allocating the rest of the
- stack frame. This is less efficient for the out-of-line stub because
- we can't combine allocations across the call barrier, but it's better
- than using a scratch register. */
- else if (!x86_64_immediate_operand (GEN_INT (frame.stack_pointer_offset
- - m->fs.sp_realigned_offset),
- Pmode))
- {
- if (!sse_registers_saved)
- {
- ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
- sse_registers_saved = true;
- }
- else if (save_stub_call_needed)
- {
- ix86_emit_outlined_ms2sysv_save (frame);
- save_stub_call_needed = false;
- }
- }
+ SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
+ |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
}
- allocate = frame.stack_pointer_offset - m->fs.sp_offset;
+ return ix86_tls_module_base_symbol;
+}
- if (flag_stack_usage_info)
- {
- /* We start to count from ARG_POINTER. */
- HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
+/* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
+ false if we expect this to be used for a memory address and true if
+ we expect to load the address into a register. */
- /* If it was realigned, take into account the fake frame. */
- if (stack_realign_drap)
- {
- if (ix86_static_chain_on_stack)
- stack_size += UNITS_PER_WORD;
+rtx
+legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
+{
+ rtx dest, base, off;
+ rtx pic = NULL_RTX, tp = NULL_RTX;
+ machine_mode tp_mode = Pmode;
+ int type;
- if (!call_used_regs[REGNO (crtl->drap_reg)])
- stack_size += UNITS_PER_WORD;
+ /* Fall back to the global dynamic model if the tool chain cannot
+ support local dynamic. */
+ if (TARGET_SUN_TLS && !TARGET_64BIT
+ && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
+ && model == TLS_MODEL_LOCAL_DYNAMIC)
+ model = TLS_MODEL_GLOBAL_DYNAMIC;
- /* This over-estimates by 1 minimal-stack-alignment-unit but
- mitigates that by counting in the new return address slot. */
- current_function_dynamic_stack_size
- += crtl->stack_alignment_needed / BITS_PER_UNIT;
- }
+ switch (model)
+ {
+ case TLS_MODEL_GLOBAL_DYNAMIC:
+ dest = gen_reg_rtx (Pmode);
- current_function_static_stack_size = stack_size;
- }
+ if (!TARGET_64BIT)
+ {
+ if (flag_pic && !TARGET_PECOFF)
+ pic = pic_offset_table_rtx;
+ else
+ {
+ pic = gen_reg_rtx (Pmode);
+ emit_insn (gen_set_got (pic));
+ }
+ }
- /* On SEH target with very large frame size, allocate an area to save
- SSE registers (as the very large allocation won't be described). */
- if (TARGET_SEH
- && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
- && !sse_registers_saved)
- {
- HOST_WIDE_INT sse_size
- = frame.sse_reg_save_offset - frame.reg_save_offset;
+ if (TARGET_GNU2_TLS)
+ {
+ if (TARGET_64BIT)
+ emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
+ else
+ emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
- gcc_assert (int_registers_saved);
+ tp = get_thread_pointer (Pmode, true);
+ dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
- /* No need to do stack checking as the area will be immediately
- written. */
- pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
- GEN_INT (-sse_size), -1,
- m->fs.cfa_reg == stack_pointer_rtx);
- allocate -= sse_size;
- ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
- sse_registers_saved = true;
- }
+ if (GET_MODE (x) != Pmode)
+ x = gen_rtx_ZERO_EXTEND (Pmode, x);
- /* The stack has already been decremented by the instruction calling us
- so probe if the size is non-negative to preserve the protection area. */
- if (allocate >= 0
- && (flag_stack_check == STATIC_BUILTIN_STACK_CHECK
- || flag_stack_clash_protection))
- {
- if (flag_stack_clash_protection)
- {
- ix86_adjust_stack_and_probe_stack_clash (allocate,
- int_registers_saved);
- allocate = 0;
+ set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
}
- else if (STACK_CHECK_MOVING_SP)
+ else
{
- if (!(crtl->is_leaf && !cfun->calls_alloca
- && allocate <= get_probe_interval ()))
+ rtx caddr = ix86_tls_get_addr ();
+
+ if (TARGET_64BIT)
{
- ix86_adjust_stack_and_probe (allocate, int_registers_saved);
- allocate = 0;
+ rtx rax = gen_rtx_REG (Pmode, AX_REG);
+ rtx_insn *insns;
+
+ start_sequence ();
+ emit_call_insn
+ (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
+ insns = get_insns ();
+ end_sequence ();
+
+ if (GET_MODE (x) != Pmode)
+ x = gen_rtx_ZERO_EXTEND (Pmode, x);
+
+ RTL_CONST_CALL_P (insns) = 1;
+ emit_libcall_block (insns, dest, rax, x);
}
+ else
+ emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
}
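+ /* (Illustrative note, not in the original sources: on x86-64 the
+ global-dynamic sequence emitted here is, roughly,
+ lea x@tlsgd(%rip), %rdi
+ call __tls_get_addr
+ with the address of X returned in %rax, which the
+ emit_libcall_block above records as equal to X.) */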
- else
- {
- HOST_WIDE_INT size = allocate;
+ break;
- if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
- size = 0x80000000 - get_stack_check_protect () - 1;
+ case TLS_MODEL_LOCAL_DYNAMIC:
+ base = gen_reg_rtx (Pmode);
- if (TARGET_STACK_PROBE)
- {
- if (crtl->is_leaf && !cfun->calls_alloca)
- {
- if (size > get_probe_interval ())
- ix86_emit_probe_stack_range (0, size, int_registers_saved);
- }
- else
- ix86_emit_probe_stack_range (0,
- size + get_stack_check_protect (),
- int_registers_saved);
- }
+ if (!TARGET_64BIT)
+ {
+ if (flag_pic)
+ pic = pic_offset_table_rtx;
else
{
- if (crtl->is_leaf && !cfun->calls_alloca)
- {
- if (size > get_probe_interval ()
- && size > get_stack_check_protect ())
- ix86_emit_probe_stack_range (get_stack_check_protect (),
- (size
- - get_stack_check_protect ()),
- int_registers_saved);
- }
- else
- ix86_emit_probe_stack_range (get_stack_check_protect (), size,
- int_registers_saved);
+ pic = gen_reg_rtx (Pmode);
+ emit_insn (gen_set_got (pic));
}
}
- }
- if (allocate == 0)
- ;
- else if (!ix86_target_stack_probe ()
- || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
- {
- pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
- GEN_INT (-allocate), -1,
- m->fs.cfa_reg == stack_pointer_rtx);
- }
- else
- {
- rtx eax = gen_rtx_REG (Pmode, AX_REG);
- rtx r10 = NULL;
- rtx (*adjust_stack_insn)(rtx, rtx, rtx);
- const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
- bool eax_live = ix86_eax_live_at_start_p ();
- bool r10_live = false;
+ if (TARGET_GNU2_TLS)
+ {
+ rtx tmp = ix86_tls_module_base ();
- if (TARGET_64BIT)
- r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
+ if (TARGET_64BIT)
+ emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
+ else
+ emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
- if (eax_live)
- {
- insn = emit_insn (gen_push (eax));
- allocate -= UNITS_PER_WORD;
- /* Note that SEH directives need to continue tracking the stack
- pointer even after the frame pointer has been set up. */
- if (sp_is_cfa_reg || TARGET_SEH)
- {
- if (sp_is_cfa_reg)
- m->fs.cfa_offset += UNITS_PER_WORD;
- RTX_FRAME_RELATED_P (insn) = 1;
- add_reg_note (insn, REG_FRAME_RELATED_EXPR,
- gen_rtx_SET (stack_pointer_rtx,
- plus_constant (Pmode, stack_pointer_rtx,
- -UNITS_PER_WORD)));
- }
+ tp = get_thread_pointer (Pmode, true);
+ set_unique_reg_note (get_last_insn (), REG_EQUAL,
+ gen_rtx_MINUS (Pmode, tmp, tp));
}
-
- if (r10_live)
+ else
{
- r10 = gen_rtx_REG (Pmode, R10_REG);
- insn = emit_insn (gen_push (r10));
- allocate -= UNITS_PER_WORD;
- if (sp_is_cfa_reg || TARGET_SEH)
+ rtx caddr = ix86_tls_get_addr ();
+
+ if (TARGET_64BIT)
{
- if (sp_is_cfa_reg)
- m->fs.cfa_offset += UNITS_PER_WORD;
- RTX_FRAME_RELATED_P (insn) = 1;
- add_reg_note (insn, REG_FRAME_RELATED_EXPR,
- gen_rtx_SET (stack_pointer_rtx,
- plus_constant (Pmode, stack_pointer_rtx,
- -UNITS_PER_WORD)));
+ rtx rax = gen_rtx_REG (Pmode, AX_REG);
+ rtx_insn *insns;
+ rtx eqv;
+
+ start_sequence ();
+ emit_call_insn
+ (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
+ insns = get_insns ();
+ end_sequence ();
+
+ /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
+ share the LD_BASE result with other LD model accesses. */
+ eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
+ UNSPEC_TLS_LD_BASE);
+
+ RTL_CONST_CALL_P (insns) = 1;
+ emit_libcall_block (insns, base, rax, eqv);
}
+ else
+ emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
}
- emit_move_insn (eax, GEN_INT (allocate));
- emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
-
- /* Use the fact that AX still contains ALLOCATE. */
- adjust_stack_insn = (Pmode == DImode
- ? gen_pro_epilogue_adjust_stack_di_sub
- : gen_pro_epilogue_adjust_stack_si_sub);
+ off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
+ off = gen_rtx_CONST (Pmode, off);
- insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
- stack_pointer_rtx, eax));
+ dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
- if (sp_is_cfa_reg || TARGET_SEH)
+ if (TARGET_GNU2_TLS)
{
- if (sp_is_cfa_reg)
- m->fs.cfa_offset += allocate;
- RTX_FRAME_RELATED_P (insn) = 1;
- add_reg_note (insn, REG_FRAME_RELATED_EXPR,
- gen_rtx_SET (stack_pointer_rtx,
- plus_constant (Pmode, stack_pointer_rtx,
- -allocate)));
+ dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
+
+ if (GET_MODE (x) != Pmode)
+ x = gen_rtx_ZERO_EXTEND (Pmode, x);
+
+ set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
}
- m->fs.sp_offset += allocate;
+ break;
- /* Use stack_pointer_rtx for relative addressing so that code works for
- realigned stack. But this means that we need a blockage to prevent
- stores based on the frame pointer from being scheduled before. */
- if (r10_live && eax_live)
- {
- t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
- emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
- gen_frame_mem (word_mode, t));
- t = plus_constant (Pmode, t, UNITS_PER_WORD);
- emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
- gen_frame_mem (word_mode, t));
- emit_insn (gen_memory_blockage ());
+ case TLS_MODEL_INITIAL_EXEC:
+ if (TARGET_64BIT)
+ {
+ if (TARGET_SUN_TLS && !TARGET_X32)
+ {
+ /* The Sun linker took the AMD64 TLS spec literally
+ and can only handle %rax as destination of the
+ initial executable code sequence. */
+
+ dest = gen_reg_rtx (DImode);
+ emit_insn (gen_tls_initial_exec_64_sun (dest, x));
+ return dest;
+ }
+
+ /* Generate DImode references to avoid %fs:(%reg32)
+ problems and linker IE->LE relaxation bug. */
+ tp_mode = DImode;
+ pic = NULL;
+ type = UNSPEC_GOTNTPOFF;
}
- else if (eax_live || r10_live)
+ else if (flag_pic)
{
- t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
- emit_move_insn (gen_rtx_REG (word_mode,
- (eax_live ? AX_REG : R10_REG)),
- gen_frame_mem (word_mode, t));
- emit_insn (gen_memory_blockage ());
+ pic = pic_offset_table_rtx;
+ type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
+ }
+ else if (!TARGET_ANY_GNU_TLS)
+ {
+ pic = gen_reg_rtx (Pmode);
+ emit_insn (gen_set_got (pic));
+ type = UNSPEC_GOTTPOFF;
+ }
+ else
+ {
+ pic = NULL;
+ type = UNSPEC_INDNTPOFF;
}
- }
- gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
- /* If we havn't already set up the frame pointer, do so now. */
- if (frame_pointer_needed && !m->fs.fp_valid)
- {
- insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
- GEN_INT (frame.stack_pointer_offset
- - frame.hard_frame_pointer_offset));
- insn = emit_insn (insn);
- RTX_FRAME_RELATED_P (insn) = 1;
- add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
+ off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
+ off = gen_rtx_CONST (tp_mode, off);
+ if (pic)
+ off = gen_rtx_PLUS (tp_mode, pic, off);
+ off = gen_const_mem (tp_mode, off);
+ set_mem_alias_set (off, ix86_GOT_alias_set ());
- if (m->fs.cfa_reg == stack_pointer_rtx)
- m->fs.cfa_reg = hard_frame_pointer_rtx;
- m->fs.fp_offset = frame.hard_frame_pointer_offset;
- m->fs.fp_valid = true;
- }
+ if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
+ {
+ base = get_thread_pointer (tp_mode,
+ for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
+ off = force_reg (tp_mode, off);
+ dest = gen_rtx_PLUS (tp_mode, base, off);
+ if (tp_mode != Pmode)
+ dest = convert_to_mode (Pmode, dest, 1);
+ }
+ else
+ {
+ base = get_thread_pointer (Pmode, true);
+ dest = gen_reg_rtx (Pmode);
+ emit_insn (ix86_gen_sub3 (dest, base, off));
+ }
+ break;
- if (!int_registers_saved)
- ix86_emit_save_regs_using_mov (frame.reg_save_offset);
- if (!sse_registers_saved)
- ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
- else if (save_stub_call_needed)
- ix86_emit_outlined_ms2sysv_save (frame);
+ case TLS_MODEL_LOCAL_EXEC:
+ off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
+ (TARGET_64BIT || TARGET_ANY_GNU_TLS)
+ ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
+ off = gen_rtx_CONST (Pmode, off);
- /* For the mcount profiling on 32 bit PIC mode we need to emit SET_GOT
- in PROLOGUE. */
- if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
- {
- rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
- insn = emit_insn (gen_set_got (pic));
- RTX_FRAME_RELATED_P (insn) = 1;
- add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
- emit_insn (gen_prologue_use (pic));
- /* Deleting already emmitted SET_GOT if exist and allocated to
- REAL_PIC_OFFSET_TABLE_REGNUM. */
- ix86_elim_entry_set_got (pic);
- }
+ if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
+ {
+ base = get_thread_pointer (Pmode,
+ for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
+ return gen_rtx_PLUS (Pmode, base, off);
+ }
+ else
+ {
+ base = get_thread_pointer (Pmode, true);
+ dest = gen_reg_rtx (Pmode);
+ emit_insn (ix86_gen_sub3 (dest, base, off));
+ }
+ break;
- if (crtl->drap_reg && !crtl->stack_realign_needed)
- {
- /* vDRAP is setup but after reload it turns out stack realign
- isn't necessary, here we will emit prologue to setup DRAP
- without stack realign adjustment */
- t = choose_baseaddr (0, NULL);
- emit_insn (gen_rtx_SET (crtl->drap_reg, t));
+ default:
+ gcc_unreachable ();
}
- /* Prevent instructions from being scheduled into register save push
- sequence when access to the redzone area is done through frame pointer.
- The offset between the frame pointer and the stack pointer is calculated
- relative to the value of the stack pointer at the end of the function
- prologue, and moving instructions that access redzone area via frame
- pointer inside push sequence violates this assumption. */
- if (frame_pointer_needed && frame.red_zone_size)
- emit_insn (gen_memory_blockage ());
-
- /* SEH requires that the prologue end within 256 bytes of the start of
- the function. Prevent instruction schedules that would extend that.
- Further, prevent alloca modifications to the stack pointer from being
- combined with prologue modifications. */
- if (TARGET_SEH)
- emit_insn (gen_prologue_use (stack_pointer_rtx));
+ return dest;
}
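+
+/* Illustrative sketch (not a normative description of this routine): for a
+   thread-local C variable such as
+
+     __thread int counter;
+     int bump (void) { return ++counter; }
+
+   the model queried via SYMBOL_REF_TLS_MODEL selects one of the cases
+   above.  Roughly, -fPIC code destined for a shared object takes the
+   global- or local-dynamic paths (a call through ix86_tls_get_addr, or
+   the TLS-descriptor sequences when TARGET_GNU2_TLS), while accesses
+   known to resolve within the executable take the initial- or local-exec
+   paths and end up as fixed offsets from the thread pointer (%fs on
+   64-bit, %gs on 32-bit GNU/Linux targets).  */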
-/* Emit code to restore REG using a POP insn. */
-
-static void
-ix86_emit_restore_reg_using_pop (rtx reg)
+/* Return true if OP refers to a TLS address. */
+bool
+ix86_tls_address_pattern_p (rtx op)
{
- struct machine_function *m = cfun->machine;
- rtx_insn *insn = emit_insn (gen_pop (reg));
-
- ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
- m->fs.sp_offset -= UNITS_PER_WORD;
-
- if (m->fs.cfa_reg == crtl->drap_reg
- && REGNO (reg) == REGNO (crtl->drap_reg))
+ subrtx_var_iterator::array_type array;
+ FOR_EACH_SUBRTX_VAR (iter, array, op, ALL)
{
- /* Previously we'd represented the CFA as an expression
- like *(%ebp - 8). We've just popped that value from
- the stack, which means we need to reset the CFA to
- the drap register. This will remain until we restore
- the stack pointer. */
- add_reg_note (insn, REG_CFA_DEF_CFA, reg);
- RTX_FRAME_RELATED_P (insn) = 1;
+ rtx op = *iter;
+ if (MEM_P (op))
+ {
+ rtx *x = &XEXP (op, 0);
+ while (GET_CODE (*x) == PLUS)
+ {
+ int i;
+ for (i = 0; i < 2; i++)
+ {
+ rtx u = XEXP (*x, i);
+ if (GET_CODE (u) == ZERO_EXTEND)
+ u = XEXP (u, 0);
+ if (GET_CODE (u) == UNSPEC
+ && XINT (u, 1) == UNSPEC_TP)
+ return true;
+ }
+ x = &XEXP (*x, 0);
+ }
- /* This means that the DRAP register is valid for addressing too. */
- m->fs.drap_valid = true;
- return;
+ iter.skip_subrtxes ();
+ }
}
- if (m->fs.cfa_reg == stack_pointer_rtx)
- {
- rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
- x = gen_rtx_SET (stack_pointer_rtx, x);
- add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
- RTX_FRAME_RELATED_P (insn) = 1;
-
- m->fs.cfa_offset -= UNITS_PER_WORD;
- }
+ return false;
+}
- /* When the frame pointer is the CFA, and we pop it, we are
- swapping back to the stack pointer as the CFA. This happens
- for stack frames that don't allocate other data, so we assume
- the stack pointer is now pointing at the return address, i.e.
- the function entry state, which makes the offset be 1 word. */
- if (reg == hard_frame_pointer_rtx)
+/* Rewrite *LOC so that it refers to a default TLS address space. */
+void
+ix86_rewrite_tls_address_1 (rtx *loc)
+{
+ subrtx_ptr_iterator::array_type array;
+ FOR_EACH_SUBRTX_PTR (iter, array, loc, ALL)
{
- m->fs.fp_valid = false;
- if (m->fs.cfa_reg == hard_frame_pointer_rtx)
+ rtx *loc = *iter;
+ if (MEM_P (*loc))
{
- m->fs.cfa_reg = stack_pointer_rtx;
- m->fs.cfa_offset -= UNITS_PER_WORD;
+ rtx addr = XEXP (*loc, 0);
+ rtx *x = &addr;
+ while (GET_CODE (*x) == PLUS)
+ {
+ int i;
+ for (i = 0; i < 2; i++)
+ {
+ rtx u = XEXP (*x, i);
+ if (GET_CODE (u) == ZERO_EXTEND)
+ u = XEXP (u, 0);
+ if (GET_CODE (u) == UNSPEC
+ && XINT (u, 1) == UNSPEC_TP)
+ {
+ addr_space_t as = DEFAULT_TLS_SEG_REG;
- add_reg_note (insn, REG_CFA_DEF_CFA,
- gen_rtx_PLUS (Pmode, stack_pointer_rtx,
- GEN_INT (m->fs.cfa_offset)));
- RTX_FRAME_RELATED_P (insn) = 1;
+ *x = XEXP (*x, 1 - i);
+
+ *loc = replace_equiv_address_nv (*loc, addr, true);
+ set_mem_addr_space (*loc, as);
+ return;
+ }
+ }
+ x = &XEXP (*x, 0);
+ }
+
+ iter.skip_subrtxes ();
}
}
}
-/* Emit code to restore saved registers using POP insns. */
-
-static void
-ix86_emit_restore_regs_using_pop (void)
+/* Rewrite an instruction pattern involving a TLS address
+ so that it refers to a default TLS address space. */
+rtx
+ix86_rewrite_tls_address (rtx pattern)
{
- unsigned int regno;
-
- for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
- if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true))
- ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
+ pattern = copy_insn (pattern);
+ ix86_rewrite_tls_address_1 (&pattern);
+ return pattern;
}
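+
+/* Illustrative sketch: the two helpers above cooperate.
+   ix86_tls_address_pattern_p spots a MEM whose address still adds the
+   thread pointer explicitly, i.e. contains (unspec [...] UNSPEC_TP)
+   somewhere in a PLUS chain, and ix86_rewrite_tls_address drops that
+   term and marks the MEM with DEFAULT_TLS_SEG_REG instead, so that
+   roughly
+
+     (mem (plus (unspec [...] UNSPEC_TP) (reg R)))
+
+   becomes (mem (reg R)) in the %fs/%gs address space.  */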
-/* Emit code and notes for the LEAVE instruction. If insn is non-null,
- omits the emit and only attaches the notes. */
+/* Create or return the unique __imp_DECL dllimport symbol corresponding
+ to symbol DECL if BEIMPORT is true. Otherwise create or return the
+ unique refptr-DECL symbol corresponding to symbol DECL. */
-static void
-ix86_emit_leave (rtx_insn *insn)
+struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
{
- struct machine_function *m = cfun->machine;
- if (!insn)
- insn = emit_insn (ix86_gen_leave ());
+ static inline hashval_t hash (tree_map *m) { return m->hash; }
+ static inline bool
+ equal (tree_map *a, tree_map *b)
+ {
+ return a->base.from == b->base.from;
+ }
- ix86_add_queued_cfa_restore_notes (insn);
+ static int
+ keep_cache_entry (tree_map *&m)
+ {
+ return ggc_marked_p (m->base.from);
+ }
+};
- gcc_assert (m->fs.fp_valid);
- m->fs.sp_valid = true;
- m->fs.sp_realigned = false;
- m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
- m->fs.fp_valid = false;
+static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
- if (m->fs.cfa_reg == hard_frame_pointer_rtx)
- {
- m->fs.cfa_reg = stack_pointer_rtx;
- m->fs.cfa_offset = m->fs.sp_offset;
+static tree
+get_dllimport_decl (tree decl, bool beimport)
+{
+ struct tree_map *h, in;
+ const char *name;
+ const char *prefix;
+ size_t namelen, prefixlen;
+ char *imp_name;
+ tree to;
+ rtx rtl;
- add_reg_note (insn, REG_CFA_DEF_CFA,
- plus_constant (Pmode, stack_pointer_rtx,
- m->fs.sp_offset));
- RTX_FRAME_RELATED_P (insn) = 1;
- }
- ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
- m->fs.fp_offset);
-}
+ if (!dllimport_map)
+ dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
-/* Emit code to restore saved registers using MOV insns.
- First register is restored from CFA - CFA_OFFSET. */
-static void
-ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
- bool maybe_eh_return)
-{
- struct machine_function *m = cfun->machine;
- unsigned int regno;
+ in.hash = htab_hash_pointer (decl);
+ in.base.from = decl;
+ tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
+ h = *loc;
+ if (h)
+ return h->to;
- for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
- if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
- {
- rtx reg = gen_rtx_REG (word_mode, regno);
- rtx mem;
- rtx_insn *insn;
+ *loc = h = ggc_alloc<tree_map> ();
+ h->hash = in.hash;
+ h->base.from = decl;
+ h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
+ VAR_DECL, NULL, ptr_type_node);
+ DECL_ARTIFICIAL (to) = 1;
+ DECL_IGNORED_P (to) = 1;
+ DECL_EXTERNAL (to) = 1;
+ TREE_READONLY (to) = 1;
- mem = choose_baseaddr (cfa_offset, NULL);
- mem = gen_frame_mem (word_mode, mem);
- insn = emit_move_insn (reg, mem);
+ name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
+ name = targetm.strip_name_encoding (name);
+ if (beimport)
+ prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
+ ? "*__imp_" : "*__imp__";
+ else
+ prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
+ namelen = strlen (name);
+ prefixlen = strlen (prefix);
+ imp_name = (char *) alloca (namelen + prefixlen + 1);
+ memcpy (imp_name, prefix, prefixlen);
+ memcpy (imp_name + prefixlen, name, namelen + 1);
- if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
- {
- /* Previously we'd represented the CFA as an expression
- like *(%ebp - 8). We've just popped that value from
- the stack, which means we need to reset the CFA to
- the drap register. This will remain until we restore
- the stack pointer. */
- add_reg_note (insn, REG_CFA_DEF_CFA, reg);
- RTX_FRAME_RELATED_P (insn) = 1;
+ name = ggc_alloc_string (imp_name, namelen + prefixlen);
+ rtl = gen_rtx_SYMBOL_REF (Pmode, name);
+ SET_SYMBOL_REF_DECL (rtl, to);
+ SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
+ if (!beimport)
+ {
+ SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
+#ifdef SUB_TARGET_RECORD_STUB
+ SUB_TARGET_RECORD_STUB (name);
+#endif
+ }
- /* This means that the DRAP register is valid for addressing. */
- m->fs.drap_valid = true;
- }
- else
- ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
+ rtl = gen_const_mem (Pmode, rtl);
+ set_mem_alias_set (rtl, ix86_GOT_alias_set ());
- cfa_offset -= UNITS_PER_WORD;
- }
+ SET_DECL_RTL (to, rtl);
+ SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
+
+ return to;
}
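+
+/* Illustrative sketch: for a dllimport'ed declaration, e.g.
+
+     __declspec(dllimport) extern int foo;
+
+   the helper above manufactures an artificial read-only pointer VAR_DECL
+   whose assembler name carries the __imp_ prefix (the import-table slot
+   that the linker fills in), so references to foo are expanded as loads
+   through that pointer.  The refptr-DECL flavor plays the analogous role
+   for plain external symbols under the medium/large PIC code models
+   (see legitimize_pe_coff_symbol below).  */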
-/* Emit code to restore saved registers using MOV insns.
- First register is restored from CFA - CFA_OFFSET. */
-static void
-ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
- bool maybe_eh_return)
+/* Expand SYMBOL into its corresponding far-address symbol.
+ WANT_REG is true if we require the result be a register. */
+
+static rtx
+legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
{
- unsigned int regno;
+ tree imp_decl;
+ rtx x;
- for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
- if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
- {
- rtx reg = gen_rtx_REG (V4SFmode, regno);
- rtx mem;
- unsigned int align = GET_MODE_ALIGNMENT (V4SFmode);
+ gcc_assert (SYMBOL_REF_DECL (symbol));
+ imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
- mem = choose_baseaddr (cfa_offset, &align);
- mem = gen_rtx_MEM (V4SFmode, mem);
+ x = DECL_RTL (imp_decl);
+ if (want_reg)
+ x = force_reg (Pmode, x);
+ return x;
+}
- /* The location aligment depends upon the base register. */
- align = MIN (GET_MODE_ALIGNMENT (V4SFmode), align);
- gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
- set_mem_align (mem, align);
- emit_insn (gen_rtx_SET (reg, mem));
-
- ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
-
- cfa_offset -= GET_MODE_SIZE (V4SFmode);
- }
-}
+/* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
+ true if we require the result be a register. */
-static void
-ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame,
- bool use_call, int style)
+static rtx
+legitimize_dllimport_symbol (rtx symbol, bool want_reg)
{
- struct machine_function *m = cfun->machine;
- const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
- + m->call_ms2sysv_extra_regs;
- rtvec v;
- unsigned int elems_needed, align, i, vi = 0;
- rtx_insn *insn;
- rtx sym, tmp;
- rtx rsi = gen_rtx_REG (word_mode, SI_REG);
- rtx r10 = NULL_RTX;
- const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
- HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset ();
- HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset;
- rtx rsi_frame_load = NULL_RTX;
- HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1;
- enum xlogue_stub stub;
-
- gcc_assert (!m->fs.fp_valid || frame_pointer_needed);
-
- /* If using a realigned stack, we should never start with padding. */
- gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ());
+ tree imp_decl;
+ rtx x;
- /* Setup RSI as the stub's base pointer. */
- align = GET_MODE_ALIGNMENT (V4SFmode);
- tmp = choose_baseaddr (rsi_offset, &align, SI_REG);
- gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
+ gcc_assert (SYMBOL_REF_DECL (symbol));
+ imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
- emit_insn (gen_rtx_SET (rsi, tmp));
+ x = DECL_RTL (imp_decl);
+ if (want_reg)
+ x = force_reg (Pmode, x);
+ return x;
+}
- /* Get a symbol for the stub. */
- if (frame_pointer_needed)
- stub = use_call ? XLOGUE_STUB_RESTORE_HFP
- : XLOGUE_STUB_RESTORE_HFP_TAIL;
- else
- stub = use_call ? XLOGUE_STUB_RESTORE
- : XLOGUE_STUB_RESTORE_TAIL;
- sym = xlogue.get_stub_rtx (stub);
+/* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG
+ is true if we require the result be a register. */
- elems_needed = ncregs;
- if (use_call)
- elems_needed += 1;
- else
- elems_needed += frame_pointer_needed ? 5 : 3;
- v = rtvec_alloc (elems_needed);
+rtx
+legitimize_pe_coff_symbol (rtx addr, bool inreg)
+{
+ if (!TARGET_PECOFF)
+ return NULL_RTX;
- /* We call the epilogue stub when we need to pop incoming args or we are
- doing a sibling call as the tail. Otherwise, we will emit a jmp to the
- epilogue stub and it is the tail-call. */
- if (use_call)
- RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
- else
+ if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
{
- RTVEC_ELT (v, vi++) = ret_rtx;
- RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
- if (frame_pointer_needed)
+ if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
+ return legitimize_dllimport_symbol (addr, inreg);
+ if (GET_CODE (addr) == CONST
+ && GET_CODE (XEXP (addr, 0)) == PLUS
+ && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
+ && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
{
- rtx rbp = gen_rtx_REG (DImode, BP_REG);
- gcc_assert (m->fs.fp_valid);
- gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx);
-
- tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8));
- RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp);
- RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp));
- tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode));
- RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp);
+ rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
+ return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
}
- else
- {
- /* If no hard frame pointer, we set R10 to the SP restore value. */
- gcc_assert (!m->fs.fp_valid);
- gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
- gcc_assert (m->fs.sp_valid);
+ }
- r10 = gen_rtx_REG (DImode, R10_REG);
- tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset));
- emit_insn (gen_rtx_SET (r10, tmp));
+ if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
+ return NULL_RTX;
+ if (GET_CODE (addr) == SYMBOL_REF
+ && !is_imported_p (addr)
+ && SYMBOL_REF_EXTERNAL_P (addr)
+ && SYMBOL_REF_DECL (addr))
+ return legitimize_pe_coff_extern_decl (addr, inreg);
- RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10);
- }
+ if (GET_CODE (addr) == CONST
+ && GET_CODE (XEXP (addr, 0)) == PLUS
+ && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
+ && !is_imported_p (XEXP (XEXP (addr, 0), 0))
+ && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
+ && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
+ {
+ rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
+ return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
}
+ return NULL_RTX;
+}
- /* Generate frame load insns and restore notes. */
- for (i = 0; i < ncregs; ++i)
- {
- const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
- machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode;
- rtx reg, frame_load;
+/* Try machine-dependent ways of modifying an illegitimate address
+ to be legitimate. If we find one, return the new, valid address.
+ This macro is used in only one place: `memory_address' in explow.c.
- reg = gen_rtx_REG (mode, r.regno);
- frame_load = gen_frame_load (reg, rsi, r.offset);
+ OLDX is the address as it was before break_out_memory_refs was called.
+ In some cases it is useful to look at this to decide what needs to be done.
- /* Save RSI frame load insn & note to add last. */
- if (r.regno == SI_REG)
- {
- gcc_assert (!rsi_frame_load);
- rsi_frame_load = frame_load;
- rsi_restore_offset = r.offset;
- }
- else
- {
- RTVEC_ELT (v, vi++) = frame_load;
- ix86_add_cfa_restore_note (NULL, reg, r.offset);
- }
- }
+ It is always safe for this macro to do nothing. It exists to recognize
+ opportunities to optimize the output.
- /* Add RSI frame load & restore note at the end. */
- gcc_assert (rsi_frame_load);
- gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1);
- RTVEC_ELT (v, vi++) = rsi_frame_load;
- ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG),
- rsi_restore_offset);
+ For the 80386, we handle X+REG by loading X into a register R and
+ using R+REG. R will go in a general reg and indexing will be used.
+ However, if REG is a broken-out memory address or multiplication,
+ nothing needs to be done because REG can certainly go in a general reg.
- /* Finally, for tail-call w/o a hard frame pointer, set SP to R10. */
- if (!use_call && !frame_pointer_needed)
- {
- gcc_assert (m->fs.sp_valid);
- gcc_assert (!m->fs.sp_realigned);
+ When -fpic is used, special handling is needed for symbolic references.
+ See comments by legitimize_pic_address in i386.c for details. */
- /* At this point, R10 should point to frame.stack_realign_offset. */
- if (m->fs.cfa_reg == stack_pointer_rtx)
- m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset;
- m->fs.sp_offset = frame.stack_realign_offset;
- }
+static rtx
+ix86_legitimize_address (rtx x, rtx, machine_mode mode)
+{
+ bool changed = false;
+ unsigned log;
- gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v));
- tmp = gen_rtx_PARALLEL (VOIDmode, v);
- if (use_call)
- insn = emit_insn (tmp);
- else
+ log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
+ if (log)
+ return legitimize_tls_address (x, (enum tls_model) log, false);
+ if (GET_CODE (x) == CONST
+ && GET_CODE (XEXP (x, 0)) == PLUS
+ && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
+ && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
{
- insn = emit_jump_insn (tmp);
- JUMP_LABEL (insn) = ret_rtx;
-
- if (frame_pointer_needed)
- ix86_emit_leave (insn);
- else
- {
- /* Need CFA adjust note. */
- tmp = gen_rtx_SET (stack_pointer_rtx, r10);
- add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp);
- }
+ rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
+ (enum tls_model) log, false);
+ return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
}
- RTX_FRAME_RELATED_P (insn) = true;
- ix86_add_queued_cfa_restore_notes (insn);
-
- /* If we're not doing a tail-call, we need to adjust the stack. */
- if (use_call && m->fs.sp_valid)
+ if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
{
- HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset;
- pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
- GEN_INT (dealloc), style,
- m->fs.cfa_reg == stack_pointer_rtx);
+ rtx tmp = legitimize_pe_coff_symbol (x, true);
+ if (tmp)
+ return tmp;
}
-}
-/* Restore function stack, frame, and registers. */
+ if (flag_pic && SYMBOLIC_CONST (x))
+ return legitimize_pic_address (x, 0);
-void
-ix86_expand_epilogue (int style)
-{
- struct machine_function *m = cfun->machine;
- struct machine_frame_state frame_state_save = m->fs;
- bool restore_regs_via_mov;
- bool using_drap;
- bool restore_stub_is_tail = false;
+#if TARGET_MACHO
+ if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
+ return machopic_indirect_data_reference (x, 0);
+#endif
- if (ix86_function_naked (current_function_decl))
+ /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
+ if (GET_CODE (x) == ASHIFT
+ && CONST_INT_P (XEXP (x, 1))
+ && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
{
- /* The program should not reach this point. */
- emit_insn (gen_ud2 ());
- return;
+ changed = true;
+ log = INTVAL (XEXP (x, 1));
+ x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
+ GEN_INT (1 << log));
}
- ix86_finalize_stack_frame_flags ();
- const struct ix86_frame &frame = cfun->machine->frame;
-
- m->fs.sp_realigned = stack_realign_fp;
- m->fs.sp_valid = stack_realign_fp
- || !frame_pointer_needed
- || crtl->sp_is_unchanging;
- gcc_assert (!m->fs.sp_valid
- || m->fs.sp_offset == frame.stack_pointer_offset);
+ if (GET_CODE (x) == PLUS)
+ {
+ /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
- /* The FP must be valid if the frame pointer is present. */
- gcc_assert (frame_pointer_needed == m->fs.fp_valid);
- gcc_assert (!m->fs.fp_valid
- || m->fs.fp_offset == frame.hard_frame_pointer_offset);
+ if (GET_CODE (XEXP (x, 0)) == ASHIFT
+ && CONST_INT_P (XEXP (XEXP (x, 0), 1))
+ && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
+ {
+ changed = true;
+ log = INTVAL (XEXP (XEXP (x, 0), 1));
+ XEXP (x, 0) = gen_rtx_MULT (Pmode,
+ force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
+ GEN_INT (1 << log));
+ }
- /* We must have *some* valid pointer to the stack frame. */
- gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
+ if (GET_CODE (XEXP (x, 1)) == ASHIFT
+ && CONST_INT_P (XEXP (XEXP (x, 1), 1))
+ && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
+ {
+ changed = true;
+ log = INTVAL (XEXP (XEXP (x, 1), 1));
+ XEXP (x, 1) = gen_rtx_MULT (Pmode,
+ force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
+ GEN_INT (1 << log));
+ }
- /* The DRAP is never valid at this point. */
- gcc_assert (!m->fs.drap_valid);
+ /* Put multiply first if it isn't already. */
+ if (GET_CODE (XEXP (x, 1)) == MULT)
+ {
+ std::swap (XEXP (x, 0), XEXP (x, 1));
+ changed = true;
+ }
- /* See the comment about red zone and frame
- pointer usage in ix86_expand_prologue. */
- if (frame_pointer_needed && frame.red_zone_size)
- emit_insn (gen_memory_blockage ());
+ /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
+ into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
+ created by virtual register instantiation, register elimination, and
+ similar optimizations. */
+ if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
+ {
+ changed = true;
+ x = gen_rtx_PLUS (Pmode,
+ gen_rtx_PLUS (Pmode, XEXP (x, 0),
+ XEXP (XEXP (x, 1), 0)),
+ XEXP (XEXP (x, 1), 1));
+ }
- using_drap = crtl->drap_reg && crtl->stack_realign_needed;
- gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
+ /* Canonicalize
+ (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
+ into (plus (plus (mult (reg) (const)) (reg)) (const)). */
+ else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
+ && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
+ && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
+ && CONSTANT_P (XEXP (x, 1)))
+ {
+ rtx constant;
+ rtx other = NULL_RTX;
- /* Determine the CFA offset of the end of the red-zone. */
- m->fs.red_zone_offset = 0;
- if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
- {
- /* The red-zone begins below return address and error code in
- exception handler. */
- m->fs.red_zone_offset = RED_ZONE_SIZE + INCOMING_FRAME_SP_OFFSET;
+ if (CONST_INT_P (XEXP (x, 1)))
+ {
+ constant = XEXP (x, 1);
+ other = XEXP (XEXP (XEXP (x, 0), 1), 1);
+ }
+ else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
+ {
+ constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
+ other = XEXP (x, 1);
+ }
+ else
+ constant = 0;
- /* When the register save area is in the aligned portion of
- the stack, determine the maximum runtime displacement that
- matches up with the aligned frame. */
- if (stack_realign_drap)
- m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
- + UNITS_PER_WORD);
- }
+ if (constant)
+ {
+ changed = true;
+ x = gen_rtx_PLUS (Pmode,
+ gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
+ XEXP (XEXP (XEXP (x, 0), 1), 0)),
+ plus_constant (Pmode, other,
+ INTVAL (constant)));
+ }
+ }
- HOST_WIDE_INT reg_save_offset = frame.reg_save_offset;
+ if (changed && ix86_legitimate_address_p (mode, x, false))
+ return x;
- /* Special care must be taken for the normal return case of a function
- using eh_return: the eax and edx registers are marked as saved, but
- not restored along this path. Adjust the save location to match. */
- if (crtl->calls_eh_return && style != 2)
- reg_save_offset -= 2 * UNITS_PER_WORD;
+ if (GET_CODE (XEXP (x, 0)) == MULT)
+ {
+ changed = true;
+ XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
+ }
- /* EH_RETURN requires the use of moves to function properly. */
- if (crtl->calls_eh_return)
- restore_regs_via_mov = true;
- /* SEH requires the use of pops to identify the epilogue. */
- else if (TARGET_SEH)
- restore_regs_via_mov = false;
- /* If we're only restoring one register and sp cannot be used then
- using a move instruction to restore the register since it's
- less work than reloading sp and popping the register. */
- else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1)
- restore_regs_via_mov = true;
- else if (TARGET_EPILOGUE_USING_MOVE
- && cfun->machine->use_fast_prologue_epilogue
- && (frame.nregs > 1
- || m->fs.sp_offset != reg_save_offset))
- restore_regs_via_mov = true;
- else if (frame_pointer_needed
- && !frame.nregs
- && m->fs.sp_offset != reg_save_offset)
- restore_regs_via_mov = true;
- else if (frame_pointer_needed
- && TARGET_USE_LEAVE
- && cfun->machine->use_fast_prologue_epilogue
- && frame.nregs == 1)
- restore_regs_via_mov = true;
- else
- restore_regs_via_mov = false;
+ if (GET_CODE (XEXP (x, 1)) == MULT)
+ {
+ changed = true;
+ XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
+ }
- if (restore_regs_via_mov || frame.nsseregs)
- {
- /* Ensure that the entire register save area is addressable via
- the stack pointer, if we will restore SSE regs via sp. */
- if (TARGET_64BIT
- && m->fs.sp_offset > 0x7fffffff
- && sp_valid_at (frame.stack_realign_offset + 1)
- && (frame.nsseregs + frame.nregs) != 0)
+ if (changed
+ && REG_P (XEXP (x, 1))
+ && REG_P (XEXP (x, 0)))
+ return x;
+
+ if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
{
- pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
- GEN_INT (m->fs.sp_offset
- - frame.sse_reg_save_offset),
- style,
- m->fs.cfa_reg == stack_pointer_rtx);
+ changed = true;
+ x = legitimize_pic_address (x, 0);
}
- }
- /* If there are any SSE registers to restore, then we have to do it
- via moves, since there's obviously no pop for SSE regs. */
- if (frame.nsseregs)
- ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
- style == 2);
+ if (changed && ix86_legitimate_address_p (mode, x, false))
+ return x;
- if (m->call_ms2sysv)
- {
- int pop_incoming_args = crtl->args.pops_args && crtl->args.size;
+ if (REG_P (XEXP (x, 0)))
+ {
+ rtx temp = gen_reg_rtx (Pmode);
+ rtx val = force_operand (XEXP (x, 1), temp);
+ if (val != temp)
+ {
+ val = convert_to_mode (Pmode, val, 1);
+ emit_move_insn (temp, val);
+ }
- /* We cannot use a tail-call for the stub if:
- 1. We have to pop incoming args,
- 2. We have additional int regs to restore, or
- 3. A sibling call will be the tail-call, or
- 4. We are emitting an eh_return_internal epilogue.
+ XEXP (x, 1) = temp;
+ return x;
+ }
- TODO: Item 4 has not yet tested!
+ else if (REG_P (XEXP (x, 1)))
+ {
+ rtx temp = gen_reg_rtx (Pmode);
+ rtx val = force_operand (XEXP (x, 0), temp);
+ if (val != temp)
+ {
+ val = convert_to_mode (Pmode, val, 1);
+ emit_move_insn (temp, val);
+ }
- If any of the above are true, we will call the stub rather than
- jump to it. */
- restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1);
- ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style);
+ XEXP (x, 0) = temp;
+ return x;
+ }
}
- /* If using out-of-line stub that is a tail-call, then...*/
- if (m->call_ms2sysv && restore_stub_is_tail)
- {
- /* TODO: parinoid tests. (remove eventually) */
- gcc_assert (m->fs.sp_valid);
- gcc_assert (!m->fs.sp_realigned);
- gcc_assert (!m->fs.fp_valid);
- gcc_assert (!m->fs.realigned);
- gcc_assert (m->fs.sp_offset == UNITS_PER_WORD);
- gcc_assert (!crtl->drap_reg);
- gcc_assert (!frame.nregs);
- }
- else if (restore_regs_via_mov)
- {
- rtx t;
+ return x;
+}
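+
+/* Illustrative sketch: one effect of the canonicalization above is that
+   an address computed as base + (index << 2), e.g.
+
+     (plus (ashift (reg INDEX) (const_int 2)) (reg BASE))
+
+   is rewritten to (plus (mult (reg INDEX) (const_int 4)) (reg BASE)),
+   which matches the scaled-index form base(,index,4) that
+   ix86_legitimate_address_p accepts directly.  */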
+\f
+/* Print an integer constant expression in assembler syntax. Addition
+ and subtraction are the only arithmetic that may appear in these
+ expressions. FILE is the stdio stream to write to, X is the rtx, and
+ CODE is the operand print code from the output string. */
- if (frame.nregs)
- ix86_emit_restore_regs_using_mov (reg_save_offset, style == 2);
+static void
+output_pic_addr_const (FILE *file, rtx x, int code)
+{
+ char buf[256];
- /* eh_return epilogues need %ecx added to the stack pointer. */
- if (style == 2)
- {
- rtx sa = EH_RETURN_STACKADJ_RTX;
- rtx_insn *insn;
+ switch (GET_CODE (x))
+ {
+ case PC:
+ gcc_assert (flag_pic);
+ putc ('.', file);
+ break;
- /* %ecx can't be used for both DRAP register and eh_return. */
- if (crtl->drap_reg)
- gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
+ case SYMBOL_REF:
+ if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
+ output_addr_const (file, x);
+ else
+ {
+ const char *name = XSTR (x, 0);
- /* regparm nested functions don't work with eh_return. */
- gcc_assert (!ix86_static_chain_on_stack);
+ /* Mark the decl as referenced so that cgraph will
+ output the function. */
+ if (SYMBOL_REF_DECL (x))
+ mark_decl_referenced (SYMBOL_REF_DECL (x));
- if (frame_pointer_needed)
- {
- t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
- t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
- emit_insn (gen_rtx_SET (sa, t));
+#if TARGET_MACHO
+ if (MACHOPIC_INDIRECT
+ && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
+ name = machopic_indirection_name (x, /*stub_p=*/true);
+#endif
+ assemble_name (file, name);
+ }
+ if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
+ && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
+ fputs ("@PLT", file);
+ break;
- t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
- insn = emit_move_insn (hard_frame_pointer_rtx, t);
+ case LABEL_REF:
+ x = XEXP (x, 0);
+ /* FALLTHRU */
+ case CODE_LABEL:
+ ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
+ assemble_name (asm_out_file, buf);
+ break;
- /* Note that we use SA as a temporary CFA, as the return
- address is at the proper place relative to it. We
- pretend this happens at the FP restore insn because
- prior to this insn the FP would be stored at the wrong
- offset relative to SA, and after this insn we have no
- other reasonable register to use for the CFA. We don't
- bother resetting the CFA to the SP for the duration of
- the return insn, unless the control flow instrumentation
- is done. In this case the SP is used later and we have
- to reset CFA to SP. */
- add_reg_note (insn, REG_CFA_DEF_CFA,
- plus_constant (Pmode, sa, UNITS_PER_WORD));
- ix86_add_queued_cfa_restore_notes (insn);
- add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
- RTX_FRAME_RELATED_P (insn) = 1;
+ case CONST_INT:
+ fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
+ break;
- m->fs.cfa_reg = sa;
- m->fs.cfa_offset = UNITS_PER_WORD;
- m->fs.fp_valid = false;
+ case CONST:
+ /* This used to output parentheses around the expression,
+ but that does not work on the 386 (either ATT or BSD assembler). */
+ output_pic_addr_const (file, XEXP (x, 0), code);
+ break;
- pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
- const0_rtx, style,
- flag_cf_protection);
- }
- else
- {
- t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
- t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
- insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
- ix86_add_queued_cfa_restore_notes (insn);
+ case CONST_DOUBLE:
+ /* We can't handle floating point constants;
+ TARGET_PRINT_OPERAND must handle them. */
+ output_operand_lossage ("floating constant misused");
+ break;
- gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
- if (m->fs.cfa_offset != UNITS_PER_WORD)
- {
- m->fs.cfa_offset = UNITS_PER_WORD;
- add_reg_note (insn, REG_CFA_DEF_CFA,
- plus_constant (Pmode, stack_pointer_rtx,
- UNITS_PER_WORD));
- RTX_FRAME_RELATED_P (insn) = 1;
- }
- }
- m->fs.sp_offset = UNITS_PER_WORD;
- m->fs.sp_valid = true;
- m->fs.sp_realigned = false;
- }
- }
- else
- {
- /* SEH requires that the function end with (1) a stack adjustment
- if necessary, (2) a sequence of pops, and (3) a return or
- jump instruction. Prevent insns from the function body from
- being scheduled into this sequence. */
- if (TARGET_SEH)
+ case PLUS:
+ /* Some assemblers need integer constants to appear first. */
+ if (CONST_INT_P (XEXP (x, 0)))
{
- /* Prevent a catch region from being adjacent to the standard
- epilogue sequence. Unfortunately neither crtl->uses_eh_lsda
- nor several other flags that would be interesting to test are
- set up yet. */
- if (flag_non_call_exceptions)
- emit_insn (gen_nops (const1_rtx));
- else
- emit_insn (gen_blockage ());
+ output_pic_addr_const (file, XEXP (x, 0), code);
+ putc ('+', file);
+ output_pic_addr_const (file, XEXP (x, 1), code);
}
-
- /* First step is to deallocate the stack frame so that we can
- pop the registers. If the stack pointer was realigned, it needs
- to be restored now. Also do it on SEH target for very large
- frame as the emitted instructions aren't allowed by the ABI
- in epilogues. */
- if (!m->fs.sp_valid || m->fs.sp_realigned
- || (TARGET_SEH
- && (m->fs.sp_offset - reg_save_offset
- >= SEH_MAX_FRAME_SIZE)))
+ else
{
- pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
- GEN_INT (m->fs.fp_offset
- - reg_save_offset),
- style, false);
+ gcc_assert (CONST_INT_P (XEXP (x, 1)));
+ output_pic_addr_const (file, XEXP (x, 1), code);
+ putc ('+', file);
+ output_pic_addr_const (file, XEXP (x, 0), code);
}
- else if (m->fs.sp_offset != reg_save_offset)
+ break;
+
+ case MINUS:
+ if (!TARGET_MACHO)
+ putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
+ output_pic_addr_const (file, XEXP (x, 0), code);
+ putc ('-', file);
+ output_pic_addr_const (file, XEXP (x, 1), code);
+ if (!TARGET_MACHO)
+ putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
+ break;
+
+ case UNSPEC:
+ gcc_assert (XVECLEN (x, 0) == 1);
+ output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
+ switch (XINT (x, 1))
{
- pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
- GEN_INT (m->fs.sp_offset
- - reg_save_offset),
- style,
- m->fs.cfa_reg == stack_pointer_rtx);
+ case UNSPEC_GOT:
+ fputs ("@GOT", file);
+ break;
+ case UNSPEC_GOTOFF:
+ fputs ("@GOTOFF", file);
+ break;
+ case UNSPEC_PLTOFF:
+ fputs ("@PLTOFF", file);
+ break;
+ case UNSPEC_PCREL:
+ fputs (ASSEMBLER_DIALECT == ASM_ATT ?
+ "(%rip)" : "[rip]", file);
+ break;
+ case UNSPEC_GOTPCREL:
+ fputs (ASSEMBLER_DIALECT == ASM_ATT ?
+ "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
+ break;
+ case UNSPEC_GOTTPOFF:
+ /* FIXME: This might be @TPOFF in Sun ld too. */
+ fputs ("@gottpoff", file);
+ break;
+ case UNSPEC_TPOFF:
+ fputs ("@tpoff", file);
+ break;
+ case UNSPEC_NTPOFF:
+ if (TARGET_64BIT)
+ fputs ("@tpoff", file);
+ else
+ fputs ("@ntpoff", file);
+ break;
+ case UNSPEC_DTPOFF:
+ fputs ("@dtpoff", file);
+ break;
+ case UNSPEC_GOTNTPOFF:
+ if (TARGET_64BIT)
+ fputs (ASSEMBLER_DIALECT == ASM_ATT ?
+ "@gottpoff(%rip)": "@gottpoff[rip]", file);
+ else
+ fputs ("@gotntpoff", file);
+ break;
+ case UNSPEC_INDNTPOFF:
+ fputs ("@indntpoff", file);
+ break;
+#if TARGET_MACHO
+ case UNSPEC_MACHOPIC_OFFSET:
+ putc ('-', file);
+ machopic_output_function_base_name (file);
+ break;
+#endif
+ default:
+ output_operand_lossage ("invalid UNSPEC as operand");
+ break;
}
+ break;
- ix86_emit_restore_regs_using_pop ();
+ default:
+ output_operand_lossage ("invalid expression as operand");
}
+}
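+
+/* Illustrative sketch: this routine is what turns the PIC/TLS UNSPEC
+   wrappers back into relocation syntax for the assembler; e.g. an
+   UNSPEC_GOTOFF reference to foo prints as "foo@GOTOFF", an
+   UNSPEC_GOTPCREL one as "foo@GOTPCREL(%rip)" in AT&T syntax, and a
+   non-local SYMBOL_REF printed with operand code 'P' gets an "@PLT"
+   suffix.  */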
- /* If we used a stack pointer and haven't already got rid of it,
- then do so now. */
- if (m->fs.fp_valid)
- {
- /* If the stack pointer is valid and pointing at the frame
- pointer store address, then we only need a pop. */
- if (sp_valid_at (frame.hfp_save_offset)
- && m->fs.sp_offset == frame.hfp_save_offset)
- ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
- /* Leave results in shorter dependency chains on CPUs that are
- able to grok it fast. */
- else if (TARGET_USE_LEAVE
- || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
- || !cfun->machine->use_fast_prologue_epilogue)
- ix86_emit_leave (NULL);
- else
- {
- pro_epilogue_adjust_stack (stack_pointer_rtx,
- hard_frame_pointer_rtx,
- const0_rtx, style, !using_drap);
- ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
- }
- }
+/* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
+ We need to emit DTP-relative relocations. */
- if (using_drap)
+static void ATTRIBUTE_UNUSED
+i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
+{
+ fputs (ASM_LONG, file);
+ output_addr_const (file, x);
+ fputs ("@dtpoff", file);
+ switch (size)
{
- int param_ptr_offset = UNITS_PER_WORD;
- rtx_insn *insn;
-
- gcc_assert (stack_realign_drap);
-
- if (ix86_static_chain_on_stack)
- param_ptr_offset += UNITS_PER_WORD;
- if (!call_used_regs[REGNO (crtl->drap_reg)])
- param_ptr_offset += UNITS_PER_WORD;
-
- insn = emit_insn (gen_rtx_SET
- (stack_pointer_rtx,
- gen_rtx_PLUS (Pmode,
- crtl->drap_reg,
- GEN_INT (-param_ptr_offset))));
- m->fs.cfa_reg = stack_pointer_rtx;
- m->fs.cfa_offset = param_ptr_offset;
- m->fs.sp_offset = param_ptr_offset;
- m->fs.realigned = false;
-
- add_reg_note (insn, REG_CFA_DEF_CFA,
- gen_rtx_PLUS (Pmode, stack_pointer_rtx,
- GEN_INT (param_ptr_offset)));
- RTX_FRAME_RELATED_P (insn) = 1;
+ case 4:
+ break;
+ case 8:
+ fputs (", 0", file);
+ break;
+ default:
+ gcc_unreachable ();
+ }
+}
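+
+/* Illustrative sketch (assuming ASM_LONG expands to "\t.long\t"): for a
+   symbol x, a 4-byte request emits
+
+     .long x@dtpoff
+
+   and an 8-byte request emits ".long x@dtpoff, 0", i.e. the DTP-relative
+   offset followed by a zero word.  */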
- if (!call_used_regs[REGNO (crtl->drap_reg)])
- ix86_emit_restore_reg_using_pop (crtl->drap_reg);
- }
+/* Return true if X is a representation of the PIC register. This copes
+ with calls from ix86_find_base_term, where the register might have
+ been replaced by a cselib value. */
- /* At this point the stack pointer must be valid, and we must have
- restored all of the registers. We may not have deallocated the
- entire stack frame. We've delayed this until now because it may
- be possible to merge the local stack deallocation with the
- deallocation forced by ix86_static_chain_on_stack. */
- gcc_assert (m->fs.sp_valid);
- gcc_assert (!m->fs.sp_realigned);
- gcc_assert (!m->fs.fp_valid);
- gcc_assert (!m->fs.realigned);
- if (m->fs.sp_offset != UNITS_PER_WORD)
+static bool
+ix86_pic_register_p (rtx x)
+{
+ if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
+ return (pic_offset_table_rtx
+ && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
+ else if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SET_GOT)
+ return true;
+ else if (!REG_P (x))
+ return false;
+ else if (pic_offset_table_rtx)
{
- pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
- GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
- style, true);
+ if (REGNO (x) == REGNO (pic_offset_table_rtx))
+ return true;
+ if (HARD_REGISTER_P (x)
+ && !HARD_REGISTER_P (pic_offset_table_rtx)
+ && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
+ return true;
+ return false;
}
else
- ix86_add_queued_cfa_restore_notes (get_last_insn ());
+ return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
+}
- /* Sibcall epilogues don't want a return instruction. */
- if (style == 0)
+/* Helper function for ix86_delegitimize_address.
+ Attempt to delegitimize TLS local-exec accesses. */
+
+static rtx
+ix86_delegitimize_tls_address (rtx orig_x)
+{
+ rtx x = orig_x, unspec;
+ struct ix86_address addr;
+
+ if (!TARGET_TLS_DIRECT_SEG_REFS)
+ return orig_x;
+ if (MEM_P (x))
+ x = XEXP (x, 0);
+ if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
+ return orig_x;
+ if (ix86_decompose_address (x, &addr) == 0
+ || addr.seg != DEFAULT_TLS_SEG_REG
+ || addr.disp == NULL_RTX
+ || GET_CODE (addr.disp) != CONST)
+ return orig_x;
+ unspec = XEXP (addr.disp, 0);
+ if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
+ unspec = XEXP (unspec, 0);
+ if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
+ return orig_x;
+ x = XVECEXP (unspec, 0, 0);
+ gcc_assert (GET_CODE (x) == SYMBOL_REF);
+ if (unspec != XEXP (addr.disp, 0))
+ x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
+ if (addr.index)
{
- m->fs = frame_state_save;
- return;
+ rtx idx = addr.index;
+ if (addr.scale != 1)
+ idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
+ x = gen_rtx_PLUS (Pmode, idx, x);
}
+ if (addr.base)
+ x = gen_rtx_PLUS (Pmode, addr.base, x);
+ if (MEM_P (orig_x))
+ x = replace_equiv_address_nv (orig_x, x);
+ return x;
+}
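+
+/* Illustrative sketch: given a local-exec style access whose address
+   decomposes to the %fs/%gs segment with a displacement of
+
+     (const (unspec [(symbol_ref "x")] UNSPEC_NTPOFF))
+
+   plus an optional base and scaled index, the helper above rebuilds an
+   ordinary base + index*scale + x address, which is what the debug and
+   alias-analysis consumers of delegitimization want to see.  */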
- if (cfun->machine->func_type != TYPE_NORMAL)
- emit_jump_insn (gen_interrupt_return ());
- else if (crtl->args.pops_args && crtl->args.size)
- {
- rtx popc = GEN_INT (crtl->args.pops_args);
+/* In the name of slightly smaller debug output, and to cater to
+ general assembler lossage, recognize PIC+GOTOFF and turn it back
+ into a direct symbol reference.
- /* i386 can only pop 64K bytes. If asked to pop more, pop return
- address, do explicit add, and jump indirectly to the caller. */
+ On Darwin, this is necessary to avoid a crash, because Darwin
+ has a different PIC label for each routine but the DWARF debugging
+ information is not associated with any particular routine, so it's
+ necessary to remove references to the PIC label from RTL stored by
+ the DWARF output code.
- if (crtl->args.pops_args >= 65536)
- {
- rtx ecx = gen_rtx_REG (SImode, CX_REG);
- rtx_insn *insn;
+ This helper is used in the normal ix86_delegitimize_address
+ entrypoint (i.e. the target delegitimization hook) and in
+ ix86_find_base_term. As a compile-time memory optimization, we
+ avoid allocating rtxes that will not change anything in the outcome
+ of the callers (find_base_value and find_base_term). */
- /* There is no "pascal" calling convention in any 64bit ABI. */
- gcc_assert (!TARGET_64BIT);
+static inline rtx
+ix86_delegitimize_address_1 (rtx x, bool base_term_p)
+{
+ rtx orig_x = delegitimize_mem_from_attrs (x);
+ /* addend is NULL or some rtx if x is something+GOTOFF where
+ something doesn't include the PIC register. */
+ rtx addend = NULL_RTX;
+ /* reg_addend is NULL or a multiple of some register. */
+ rtx reg_addend = NULL_RTX;
+ /* const_addend is NULL or a const_int. */
+ rtx const_addend = NULL_RTX;
+ /* This is the result, or NULL. */
+ rtx result = NULL_RTX;
- insn = emit_insn (gen_pop (ecx));
- m->fs.cfa_offset -= UNITS_PER_WORD;
- m->fs.sp_offset -= UNITS_PER_WORD;
+ x = orig_x;
- rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
- x = gen_rtx_SET (stack_pointer_rtx, x);
- add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
- add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
- RTX_FRAME_RELATED_P (insn) = 1;
+ if (MEM_P (x))
+ x = XEXP (x, 0);
- pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
- popc, -1, true);
- emit_jump_insn (gen_simple_return_indirect_internal (ecx));
- }
- else
- emit_jump_insn (gen_simple_return_pop_internal (popc));
- }
- else if (!m->call_ms2sysv || !restore_stub_is_tail)
+ if (TARGET_64BIT)
{
- /* In case of return from EH a simple return cannot be used
- as a return address will be compared with a shadow stack
- return address. Use indirect jump instead. */
- if (style == 2 && flag_cf_protection)
+ if (GET_CODE (x) == CONST
+ && GET_CODE (XEXP (x, 0)) == PLUS
+ && GET_MODE (XEXP (x, 0)) == Pmode
+ && CONST_INT_P (XEXP (XEXP (x, 0), 1))
+ && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
+ && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
+ {
+ /* find_base_{value,term} only care about MEMs with arg_pointer_rtx
+ base. A CONST can't be arg_pointer_rtx based. */
+ if (base_term_p && MEM_P (orig_x))
+ return orig_x;
+ rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
+ x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
+ if (MEM_P (orig_x))
+ x = replace_equiv_address_nv (orig_x, x);
+ return x;
+ }
+
+ if (GET_CODE (x) == CONST
+ && GET_CODE (XEXP (x, 0)) == UNSPEC
+ && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
+ || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
+ && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
{
- /* Register used in indirect jump must be in word_mode. But
- Pmode may not be the same as word_mode for x32. */
- rtx ecx = gen_rtx_REG (word_mode, CX_REG);
- rtx_insn *insn;
+ x = XVECEXP (XEXP (x, 0), 0, 0);
+ if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
+ {
+ x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
+ if (x == NULL_RTX)
+ return orig_x;
+ }
+ return x;
+ }
- insn = emit_insn (gen_pop (ecx));
- m->fs.cfa_offset -= UNITS_PER_WORD;
- m->fs.sp_offset -= UNITS_PER_WORD;
+ if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
+ return ix86_delegitimize_tls_address (orig_x);
- rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
- x = gen_rtx_SET (stack_pointer_rtx, x);
- add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
- add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
- RTX_FRAME_RELATED_P (insn) = 1;
+ /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
+ and -mcmodel=medium -fpic. */
+ }
- emit_jump_insn (gen_simple_return_indirect_internal (ecx));
- }
+ if (GET_CODE (x) != PLUS
+ || GET_CODE (XEXP (x, 1)) != CONST)
+ return ix86_delegitimize_tls_address (orig_x);
+
+ if (ix86_pic_register_p (XEXP (x, 0)))
+ /* %ebx + GOT/GOTOFF */
+ ;
+ else if (GET_CODE (XEXP (x, 0)) == PLUS)
+ {
+ /* %ebx + %reg * scale + GOT/GOTOFF */
+ reg_addend = XEXP (x, 0);
+ if (ix86_pic_register_p (XEXP (reg_addend, 0)))
+ reg_addend = XEXP (reg_addend, 1);
+ else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
+ reg_addend = XEXP (reg_addend, 0);
else
- emit_jump_insn (gen_simple_return_internal ());
+ {
+ reg_addend = NULL_RTX;
+ addend = XEXP (x, 0);
+ }
}
+ else
+ addend = XEXP (x, 0);
- /* Restore the state back to the state from the prologue,
- so that it's correct for the next epilogue. */
- m->fs = frame_state_save;
-}
+ x = XEXP (XEXP (x, 1), 0);
+ if (GET_CODE (x) == PLUS
+ && CONST_INT_P (XEXP (x, 1)))
+ {
+ const_addend = XEXP (x, 1);
+ x = XEXP (x, 0);
+ }
-/* Reset from the function's potential modifications. */
-
-static void
-ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED)
-{
- if (pic_offset_table_rtx
- && !ix86_use_pseudo_pic_reg ())
- SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
-
- if (TARGET_MACHO)
- {
- rtx_insn *insn = get_last_insn ();
- rtx_insn *deleted_debug_label = NULL;
-
- /* Mach-O doesn't support labels at the end of objects, so if
- it looks like we might want one, take special action.
- First, collect any sequence of deleted debug labels. */
- while (insn
- && NOTE_P (insn)
- && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
- {
- /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
- notes only, instead set their CODE_LABEL_NUMBER to -1,
- otherwise there would be code generation differences
- in between -g and -g0. */
- if (NOTE_P (insn) && NOTE_KIND (insn)
- == NOTE_INSN_DELETED_DEBUG_LABEL)
- deleted_debug_label = insn;
- insn = PREV_INSN (insn);
- }
-
- /* If we have:
- label:
- barrier
- then this needs to be detected, so skip past the barrier. */
-
- if (insn && BARRIER_P (insn))
- insn = PREV_INSN (insn);
+ if (GET_CODE (x) == UNSPEC
+ && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
+ || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
+ || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
+ && !MEM_P (orig_x) && !addend)))
+ result = XVECEXP (x, 0, 0);
- /* Up to now we've only seen notes or barriers. */
- if (insn)
- {
- if (LABEL_P (insn)
- || (NOTE_P (insn)
- && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
- /* Trailing label. */
- fputs ("\tnop\n", file);
- else if (cfun && ! cfun->is_thunk)
- {
- /* See if we have a completely empty function body, skipping
- the special case of the picbase thunk emitted as asm. */
- while (insn && ! INSN_P (insn))
- insn = PREV_INSN (insn);
- /* If we don't find any insns, we've got an empty function body;
- I.e. completely empty - without a return or branch. This is
- taken as the case where a function body has been removed
- because it contains an inline __builtin_unreachable(). GCC
- declares that reaching __builtin_unreachable() means UB so
- we're not obliged to do anything special; however, we want
- non-zero-sized function bodies. To meet this, and help the
- user out, let's trap the case. */
- if (insn == NULL)
- fputs ("\tud2\n", file);
- }
- }
- else if (deleted_debug_label)
- for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
- if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
- CODE_LABEL_NUMBER (insn) = -1;
- }
-}
+ if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
+ && !MEM_P (orig_x))
+ result = XVECEXP (x, 0, 0);
-/* Return a scratch register to use in the split stack prologue. The
- split stack prologue is used for -fsplit-stack. It is the first
- instructions in the function, even before the regular prologue.
- The scratch register can be any caller-saved register which is not
- used for parameters or for the static chain. */
+ if (! result)
+ return ix86_delegitimize_tls_address (orig_x);
-static unsigned int
-split_stack_prologue_scratch_regno (void)
-{
- if (TARGET_64BIT)
- return R11_REG;
- else
+ /* For (PLUS something CONST_INT) both find_base_{value,term} just
+ recurse on the first operand. */
+ if (const_addend && !base_term_p)
+ result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
+ if (reg_addend)
+ result = gen_rtx_PLUS (Pmode, reg_addend, result);
+ if (addend)
{
- bool is_fastcall, is_thiscall;
- int regparm;
-
- is_fastcall = (lookup_attribute ("fastcall",
- TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
- != NULL);
- is_thiscall = (lookup_attribute ("thiscall",
- TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
- != NULL);
- regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
-
- if (is_fastcall)
- {
- if (DECL_STATIC_CHAIN (cfun->decl))
- {
- sorry ("%<-fsplit-stack%> does not support fastcall with "
- "nested function");
- return INVALID_REGNUM;
- }
- return AX_REG;
- }
- else if (is_thiscall)
- {
- if (!DECL_STATIC_CHAIN (cfun->decl))
- return DX_REG;
- return AX_REG;
- }
- else if (regparm < 3)
+ /* If the rest of the original X doesn't involve the PIC register, add
+ addend and subtract pic_offset_table_rtx. This can happen e.g.
+ for code like:
+ leal (%ebx, %ecx, 4), %ecx
+ ...
+ movl foo@GOTOFF(%ecx), %edx
+ in which case we return (%ecx - %ebx) + foo
+ or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
+ and reload has completed. Don't do the latter for debug,
+ as _GLOBAL_OFFSET_TABLE_ can't be expressed in the assembly. */
+ if (pic_offset_table_rtx
+ && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
+ result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
+ pic_offset_table_rtx),
+ result);
+ else if (base_term_p
+ && pic_offset_table_rtx
+ && !TARGET_MACHO
+ && !TARGET_VXWORKS_RTP)
{
- if (!DECL_STATIC_CHAIN (cfun->decl))
- return CX_REG;
- else
- {
- if (regparm >= 2)
- {
- sorry ("%<-fsplit-stack%> does not support 2 register "
- "parameters for a nested function");
- return INVALID_REGNUM;
- }
- return DX_REG;
- }
+ rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
+ tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
+ result = gen_rtx_PLUS (Pmode, tmp, result);
}
else
- {
- /* FIXME: We could make this work by pushing a register
- around the addition and comparison. */
- sorry ("%<-fsplit-stack%> does not support 3 register parameters");
- return INVALID_REGNUM;
- }
+ return orig_x;
}
+ if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
+ {
+ result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
+ if (result == NULL_RTX)
+ return orig_x;
+ }
+ return result;
}
-/* A SYMBOL_REF for the function which allocates new stackspace for
- -fsplit-stack. */
-
-static GTY(()) rtx split_stack_fn;
-
-/* A SYMBOL_REF for the more stack function when using the large
- model. */
+/* The normal instantiation of the above template. */
-static GTY(()) rtx split_stack_fn_large;
+static rtx
+ix86_delegitimize_address (rtx x)
+{
+ return ix86_delegitimize_address_1 (x, false);
+}
-/* Return location of the stack guard value in the TLS block. */
+/* If X is a machine specific address (i.e. a symbol or label being
+ referenced as a displacement from the GOT implemented using an
+ UNSPEC), then return the base term. Otherwise return X. */
rtx
-ix86_split_stack_guard (void)
+ix86_find_base_term (rtx x)
{
- int offset;
- addr_space_t as = DEFAULT_TLS_SEG_REG;
- rtx r;
-
- gcc_assert (flag_split_stack);
+ rtx term;
-#ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
- offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
-#else
- gcc_unreachable ();
-#endif
+ if (TARGET_64BIT)
+ {
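+ /* In 64-bit mode only @GOTPCREL and @PCREL UNSPECs wrap a base
+ symbol; strip an optional constant offset and the CONST wrapper
+ before looking for them. */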
+ if (GET_CODE (x) != CONST)
+ return x;
+ term = XEXP (x, 0);
+ if (GET_CODE (term) == PLUS
+ && CONST_INT_P (XEXP (term, 1)))
+ term = XEXP (term, 0);
+ if (GET_CODE (term) != UNSPEC
+ || (XINT (term, 1) != UNSPEC_GOTPCREL
+ && XINT (term, 1) != UNSPEC_PCREL))
+ return x;
- r = GEN_INT (offset);
- r = gen_const_mem (Pmode, r);
- set_mem_addr_space (r, as);
+ return XVECEXP (term, 0, 0);
+ }
- return r;
+ return ix86_delegitimize_address_1 (x, true);
}
-/* Handle -fsplit-stack. These are the first instructions in the
- function, even before the regular prologue. */
+/* Return true if X shouldn't be emitted into the debug info.
+ Disallow UNSPECs other than @gotoff - we can't emit the
+ _GLOBAL_OFFSET_TABLE_ symbol easily into the .debug_info section, so
+ instead of delegitimizing we assemble such references as @gotoff.
+ Disallow the _GLOBAL_OFFSET_TABLE_ SYMBOL_REF itself - the assembler
+ magically assembles that as a _GLOBAL_OFFSET_TABLE_-. expression. */
-void
-ix86_expand_split_stack_prologue (void)
+static bool
+ix86_const_not_ok_for_debug_p (rtx x)
{
- HOST_WIDE_INT allocate;
- unsigned HOST_WIDE_INT args_size;
- rtx_code_label *label;
- rtx limit, current, allocate_rtx, call_fusage;
- rtx_insn *call_insn;
- rtx scratch_reg = NULL_RTX;
- rtx_code_label *varargs_label = NULL;
- rtx fn;
-
- gcc_assert (flag_split_stack && reload_completed);
+ if (GET_CODE (x) == UNSPEC && XINT (x, 1) != UNSPEC_GOTOFF)
+ return true;
- ix86_finalize_stack_frame_flags ();
- struct ix86_frame &frame = cfun->machine->frame;
- allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
+ if (SYMBOL_REF_P (x) && strcmp (XSTR (x, 0), GOT_SYMBOL_NAME) == 0)
+ return true;
- /* This is the label we will branch to if we have enough stack
- space. We expect the basic block reordering pass to reverse this
- branch if optimizing, so that we branch in the unlikely case. */
- label = gen_label_rtx ();
-
- /* We need to compare the stack pointer minus the frame size with
- the stack boundary in the TCB. The stack boundary always gives
- us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
- can compare directly. Otherwise we need to do an addition. */
-
- limit = ix86_split_stack_guard ();
+ return false;
+}
+\f
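+/* Print to FILE the one- or two-letter condition-code suffix (e.g. "e",
+ "ne", "a", "ge") for comparison CODE evaluated in CC mode MODE, as used
+ by set/cmov insns. If REVERSE is true the condition is reversed first.
+ FP selects the spellings needed for fcmov, which names some conditions
+ differently. */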
+static void
+put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
+ bool fp, FILE *file)
+{
+ const char *suffix;
- if (allocate < SPLIT_STACK_AVAILABLE)
- current = stack_pointer_rtx;
- else
+ if (mode == CCFPmode)
{
- unsigned int scratch_regno;
- rtx offset;
+ code = ix86_fp_compare_code_to_integer (code);
+ mode = CCmode;
+ }
+ if (reverse)
+ code = reverse_condition (code);
- /* We need a scratch register to hold the stack pointer minus
- the required frame size. Since this is the very start of the
- function, the scratch register can be any caller-saved
- register which is not used for parameters. */
- offset = GEN_INT (- allocate);
- scratch_regno = split_stack_prologue_scratch_regno ();
- if (scratch_regno == INVALID_REGNUM)
- return;
- scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
- if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
+ switch (code)
+ {
+ case EQ:
+ gcc_assert (mode != CCGZmode);
+ switch (mode)
{
- /* We don't use ix86_gen_add3 in this case because it will
- want to split to lea, but when not optimizing the insn
- will not be split after this point. */
- emit_insn (gen_rtx_SET (scratch_reg,
- gen_rtx_PLUS (Pmode, stack_pointer_rtx,
- offset)));
+ case E_CCAmode:
+ suffix = "a";
+ break;
+ case E_CCCmode:
+ suffix = "c";
+ break;
+ case E_CCOmode:
+ suffix = "o";
+ break;
+ case E_CCPmode:
+ suffix = "p";
+ break;
+ case E_CCSmode:
+ suffix = "s";
+ break;
+ default:
+ suffix = "e";
+ break;
+ }
+ break;
+ case NE:
+ gcc_assert (mode != CCGZmode);
+ switch (mode)
+ {
+ case E_CCAmode:
+ suffix = "na";
+ break;
+ case E_CCCmode:
+ suffix = "nc";
+ break;
+ case E_CCOmode:
+ suffix = "no";
+ break;
+ case E_CCPmode:
+ suffix = "np";
+ break;
+ case E_CCSmode:
+ suffix = "ns";
+ break;
+ default:
+ suffix = "ne";
+ break;
}
+ break;
+ case GT:
+ gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
+ suffix = "g";
+ break;
+ case GTU:
+ /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
+ Those same assemblers have the same but opposite lossage on cmov. */
+ if (mode == CCmode)
+ suffix = fp ? "nbe" : "a";
else
+ gcc_unreachable ();
+ break;
+ case LT:
+ switch (mode)
{
- emit_move_insn (scratch_reg, offset);
- emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
- stack_pointer_rtx));
+ case E_CCNOmode:
+ case E_CCGOCmode:
+ suffix = "s";
+ break;
+
+ case E_CCmode:
+ case E_CCGCmode:
+ case E_CCGZmode:
+ suffix = "l";
+ break;
+
+ default:
+ gcc_unreachable ();
}
- current = scratch_reg;
+ break;
+ case LTU:
+ if (mode == CCmode || mode == CCGZmode)
+ suffix = "b";
+ else if (mode == CCCmode)
+ suffix = fp ? "b" : "c";
+ else
+ gcc_unreachable ();
+ break;
+ case GE:
+ switch (mode)
+ {
+ case E_CCNOmode:
+ case E_CCGOCmode:
+ suffix = "ns";
+ break;
+
+ case E_CCmode:
+ case E_CCGCmode:
+ case E_CCGZmode:
+ suffix = "ge";
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+ break;
+ case GEU:
+ if (mode == CCmode || mode == CCGZmode)
+ suffix = "nb";
+ else if (mode == CCCmode)
+ suffix = fp ? "nb" : "nc";
+ else
+ gcc_unreachable ();
+ break;
+ case LE:
+ gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
+ suffix = "le";
+ break;
+ case LEU:
+ if (mode == CCmode)
+ suffix = "be";
+ else
+ gcc_unreachable ();
+ break;
+ case UNORDERED:
+ suffix = fp ? "u" : "p";
+ break;
+ case ORDERED:
+ suffix = fp ? "nu" : "np";
+ break;
+ default:
+ gcc_unreachable ();
}
+ fputs (suffix, file);
+}
- ix86_expand_branch (GEU, current, limit, label);
- rtx_insn *jump_insn = get_last_insn ();
- JUMP_LABEL (jump_insn) = label;
+/* Print the name of register X to FILE based on its machine mode and number.
+ If CODE is 'w', pretend the mode is HImode.
+ If CODE is 'b', pretend the mode is QImode.
+ If CODE is 'k', pretend the mode is SImode.
+ If CODE is 'q', pretend the mode is DImode.
+ If CODE is 'x', pretend the mode is V4SFmode.
+ If CODE is 't', pretend the mode is V8SFmode.
+ If CODE is 'g', pretend the mode is V16SFmode.
+ If CODE is 'h', pretend the reg is the 'high' byte register.
+ If CODE is 'y', print "st(0)" instead of "st" if the reg is a stack op.
+ If CODE is 'd', duplicate the operand for an AVX instruction.
+ If CODE is 'V', print naked full integer register name without %.
+ */
- /* Mark the jump as very likely to be taken. */
- add_reg_br_prob_note (jump_insn, profile_probability::very_likely ());
+void
+print_reg (rtx x, int code, FILE *file)
+{
+ const char *reg;
+ int msize;
+ unsigned int regno;
+ bool duplicated;
- if (split_stack_fn == NULL_RTX)
+ if (ASSEMBLER_DIALECT == ASM_ATT && code != 'V')
+ putc ('%', file);
+
+ if (x == pc_rtx)
{
- split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
- SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
+ gcc_assert (TARGET_64BIT);
+ fputs ("rip", file);
+ return;
}
- fn = split_stack_fn;
- /* Get more stack space. We pass in the desired stack space and the
- size of the arguments to copy to the new stack. In 32-bit mode
- we push the parameters; __morestack will return on a new stack
- anyhow. In 64-bit mode we pass the parameters in r10 and
- r11. */
- allocate_rtx = GEN_INT (allocate);
- args_size = crtl->args.size >= 0 ? (HOST_WIDE_INT) crtl->args.size : 0;
- call_fusage = NULL_RTX;
- rtx pop = NULL_RTX;
- if (TARGET_64BIT)
+ if (code == 'y' && STACK_TOP_P (x))
{
- rtx reg10, reg11;
+ fputs ("st(0)", file);
+ return;
+ }
- reg10 = gen_rtx_REG (Pmode, R10_REG);
- reg11 = gen_rtx_REG (Pmode, R11_REG);
+ if (code == 'w')
+ msize = 2;
+ else if (code == 'b')
+ msize = 1;
+ else if (code == 'k')
+ msize = 4;
+ else if (code == 'q')
+ msize = 8;
+ else if (code == 'h')
+ msize = 0;
+ else if (code == 'x')
+ msize = 16;
+ else if (code == 't')
+ msize = 32;
+ else if (code == 'g')
+ msize = 64;
+ else
+ msize = GET_MODE_SIZE (GET_MODE (x));
- /* If this function uses a static chain, it will be in %r10.
- Preserve it across the call to __morestack. */
- if (DECL_STATIC_CHAIN (cfun->decl))
- {
- rtx rax;
+ regno = REGNO (x);
- rax = gen_rtx_REG (word_mode, AX_REG);
- emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
- use_reg (&call_fusage, rax);
- }
-
- if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
- && !TARGET_PECOFF)
- {
- HOST_WIDE_INT argval;
-
- gcc_assert (Pmode == DImode);
- /* When using the large model we need to load the address
- into a register, and we've run out of registers. So we
- switch to a different calling convention, and we call a
- different function: __morestack_large. We pass the
- argument size in the upper 32 bits of r10 and pass the
- frame size in the lower 32 bits. */
- gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
- gcc_assert ((args_size & 0xffffffff) == args_size);
-
- if (split_stack_fn_large == NULL_RTX)
- {
- split_stack_fn_large
- = gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
- SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
- }
- if (ix86_cmodel == CM_LARGE_PIC)
- {
- rtx_code_label *label;
- rtx x;
-
- label = gen_label_rtx ();
- emit_label (label);
- LABEL_PRESERVE_P (label) = 1;
- emit_insn (gen_set_rip_rex64 (reg10, label));
- emit_insn (gen_set_got_offset_rex64 (reg11, label));
- emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
- x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
- UNSPEC_GOT);
- x = gen_rtx_CONST (Pmode, x);
- emit_move_insn (reg11, x);
- x = gen_rtx_PLUS (Pmode, reg10, reg11);
- x = gen_const_mem (Pmode, x);
- emit_move_insn (reg11, x);
- }
- else
- emit_move_insn (reg11, split_stack_fn_large);
-
- fn = reg11;
-
- argval = ((args_size << 16) << 16) + allocate;
- emit_move_insn (reg10, GEN_INT (argval));
- }
- else
- {
- emit_move_insn (reg10, allocate_rtx);
- emit_move_insn (reg11, GEN_INT (args_size));
- use_reg (&call_fusage, reg11);
- }
-
- use_reg (&call_fusage, reg10);
+ if (regno == ARG_POINTER_REGNUM
+ || regno == FRAME_POINTER_REGNUM
+ || regno == FPSR_REG)
+ {
+ output_operand_lossage
+ ("invalid use of register '%s'", reg_names[regno]);
+ return;
}
- else
+ else if (regno == FLAGS_REG)
{
- rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size)));
- add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD));
- insn = emit_insn (gen_push (allocate_rtx));
- add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD));
- pop = GEN_INT (2 * UNITS_PER_WORD);
+ output_operand_lossage ("invalid use of asm flag output");
+ return;
}
- call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
- GEN_INT (UNITS_PER_WORD), constm1_rtx,
- pop, false);
- add_function_usage_to (call_insn, call_fusage);
- if (!TARGET_64BIT)
- add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0));
- /* Indicate that this function can't jump to non-local gotos. */
- make_reg_eh_region_note_nothrow_nononlocal (call_insn);
-
- /* In order to make call/return prediction work right, we now need
- to execute a return instruction. See
- libgcc/config/i386/morestack.S for the details on how this works.
-
- For flow purposes gcc must not see this as a return
- instruction--we need control flow to continue at the subsequent
- label. Therefore, we use an unspec. */
- gcc_assert (crtl->args.pops_args < 65536);
- rtx_insn *ret_insn
- = emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
- if ((flag_cf_protection & CF_BRANCH))
+ if (code == 'V')
{
- /* Insert ENDBR since __morestack will jump back here via indirect
- call. */
- rtx cet_eb = gen_nop_endbr ();
- emit_insn_after (cet_eb, ret_insn);
+ if (GENERAL_REGNO_P (regno))
+ msize = GET_MODE_SIZE (word_mode);
+ else
+ error ("%<V%> modifier on non-integer register");
}
- /* If we are in 64-bit mode and this function uses a static chain,
- we saved %r10 in %rax before calling _morestack. */
- if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
- emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
- gen_rtx_REG (word_mode, AX_REG));
+ duplicated = code == 'd' && TARGET_AVX;
- /* If this function calls va_start, we need to store a pointer to
- the arguments on the old stack, because they may not have been
- all copied to the new stack. At this point the old stack can be
- found at the frame pointer value used by __morestack, because
- __morestack has set that up before calling back to us. Here we
- store that pointer in a scratch register, and in
- ix86_expand_prologue we store the scratch register in a stack
- slot. */
- if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
+ switch (msize)
{
- unsigned int scratch_regno;
- rtx frame_reg;
- int words;
-
- scratch_regno = split_stack_prologue_scratch_regno ();
- scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
- frame_reg = gen_rtx_REG (Pmode, BP_REG);
-
- /* 64-bit:
- fp -> old fp value
- return address within this function
- return address of caller of this function
- stack arguments
- So we add three words to get to the stack arguments.
-
- 32-bit:
- fp -> old fp value
- return address within this function
- first argument to __morestack
- second argument to __morestack
- return address of caller of this function
- stack arguments
- So we add five words to get to the stack arguments.
- */
- words = TARGET_64BIT ? 3 : 5;
- emit_insn (gen_rtx_SET (scratch_reg,
- gen_rtx_PLUS (Pmode, frame_reg,
- GEN_INT (words * UNITS_PER_WORD))));
-
- varargs_label = gen_label_rtx ();
- emit_jump_insn (gen_jump (varargs_label));
- JUMP_LABEL (get_last_insn ()) = varargs_label;
-
- emit_barrier ();
+ case 16:
+ case 12:
+ case 8:
+ if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode))
+ warning (0, "unsupported size for integer register");
+ /* FALLTHRU */
+ case 4:
+ if (LEGACY_INT_REGNO_P (regno))
+ putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file);
+ /* FALLTHRU */
+ case 2:
+ normal:
+ reg = hi_reg_name[regno];
+ break;
+ case 1:
+ if (regno >= ARRAY_SIZE (qi_reg_name))
+ goto normal;
+ if (!ANY_QI_REGNO_P (regno))
+ error ("unsupported size for integer register");
+ reg = qi_reg_name[regno];
+ break;
+ case 0:
+ if (regno >= ARRAY_SIZE (qi_high_reg_name))
+ goto normal;
+ reg = qi_high_reg_name[regno];
+ break;
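+ /* 256-bit and 512-bit SSE registers: print "ymmN"/"zmmN" by
+ emitting 'y' or 'z' and skipping the leading 'x' of the xmm
+ register name. */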
+ case 32:
+ case 64:
+ if (SSE_REGNO_P (regno))
+ {
+ gcc_assert (!duplicated);
+ putc (msize == 32 ? 'y' : 'z', file);
+ reg = hi_reg_name[regno] + 1;
+ break;
+ }
+ goto normal;
+ default:
+ gcc_unreachable ();
}
- emit_label (label);
- LABEL_NUSES (label) = 1;
+ fputs (reg, file);
- /* If this function calls va_start, we now have to set the scratch
- register for the case where we do not call __morestack. In this
- case we need to set it based on the stack pointer. */
- if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
+ /* Irritatingly, AMD extended registers use
+ a different naming convention: "r%d[bwd]". */
+ if (REX_INT_REGNO_P (regno))
{
- emit_insn (gen_rtx_SET (scratch_reg,
- gen_rtx_PLUS (Pmode, stack_pointer_rtx,
- GEN_INT (UNITS_PER_WORD))));
-
- emit_label (varargs_label);
- LABEL_NUSES (varargs_label) = 1;
+ gcc_assert (TARGET_64BIT);
+ switch (msize)
+ {
+ case 0:
+ error ("extended registers have no high halves");
+ break;
+ case 1:
+ putc ('b', file);
+ break;
+ case 2:
+ putc ('w', file);
+ break;
+ case 4:
+ putc ('d', file);
+ break;
+ case 8:
+ /* no suffix */
+ break;
+ default:
+ error ("unsupported operand size for extended register");
+ break;
+ }
+ return;
}
-}
-/* We may have to tell the dataflow pass that the split stack prologue
- is initializing a scratch register. */
-
-static void
-ix86_live_on_entry (bitmap regs)
-{
- if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
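+ /* For the 'd' modifier (with AVX), print the register name a second
+ time so it appears as a duplicated operand. */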
+ if (duplicated)
{
- gcc_assert (flag_split_stack);
- bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
+ if (ASSEMBLER_DIALECT == ASM_ATT)
+ fprintf (file, ", %%%s", reg);
+ else
+ fprintf (file, ", %s", reg);
}
}
-\f
-/* Extract the parts of an RTL expression that is a valid memory address
- for an instruction. Return 0 if the structure of the address is
- grossly off. Return -1 if the address contains ASHIFT, so it is not
- strictly valid, but still used for computing length of lea instruction. */
-int
-ix86_decompose_address (rtx addr, struct ix86_address *out)
-{
- rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
- rtx base_reg, index_reg;
- HOST_WIDE_INT scale = 1;
- rtx scale_rtx = NULL_RTX;
- rtx tmp;
- int retval = 1;
- addr_space_t seg = ADDR_SPACE_GENERIC;
+/* Meaning of CODE:
+ L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
+ C -- print opcode suffix for set/cmov insn.
+ c -- like C, but print reversed condition
+ F,f -- likewise, but for floating-point.
+ O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
+ otherwise nothing
+ R -- print embedded rounding and sae.
+ r -- print only sae.
+ z -- print the opcode suffix for the size of the current operand.
+ Z -- likewise, with special suffixes for x87 instructions.
+ * -- print a star (in certain assembler syntax)
+ A -- print an absolute memory reference.
+ E -- print address with DImode register names if TARGET_64BIT.
+ w -- print the operand as if it's a "word" (HImode) even if it isn't.
+ s -- print a shift double count, followed by the assembler's argument
+ delimiter.
+ b -- print the QImode name of the register for the indicated operand.
+ %b0 would print %al if operands[0] is reg 0.
+ w -- likewise, print the HImode name of the register.
+ k -- likewise, print the SImode name of the register.
+ q -- likewise, print the DImode name of the register.
+ x -- likewise, print the V4SFmode name of the register.
+ t -- likewise, print the V8SFmode name of the register.
+ g -- likewise, print the V16SFmode name of the register.
+ h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
+ y -- print "st(0)" instead of "st" as a register.
+ d -- print duplicated register operand for AVX instruction.
+ D -- print condition for SSE cmp instruction.
+ P -- if PIC, print an @PLT suffix.
+ p -- print raw symbol name.
+ X -- don't print any sort of PIC '@' suffix for a symbol.
+ & -- print some in-use local-dynamic symbol name.
+ H -- print a memory address offset by 8; used for sse high-parts
+ Y -- print condition for XOP pcom* instruction.
+ V -- print naked full integer register name without %.
+ + -- print a branch hint as 'cs' or 'ds' prefix
+ ; -- print a semicolon (after prefixes due to bug in older gas).
+ ~ -- print "i" if TARGET_AVX2, "f" otherwise.
+ ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
+ M -- print addr32 prefix for TARGET_X32 with VSIB address.
+ ! -- print NOTRACK prefix for jxx/call/ret instructions if required.
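+ K -- print an HLE memory model prefix (xacquire/xrelease) based on the
+ operand.
+ N -- print "{z}" (zero masking) if the operand is a zero constant.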
+ */
- /* Allow zero-extended SImode addresses,
- they will be emitted with addr32 prefix. */
- if (TARGET_64BIT && GET_MODE (addr) == DImode)
+void
+ix86_print_operand (FILE *file, rtx x, int code)
+{
+ if (code)
{
- if (GET_CODE (addr) == ZERO_EXTEND
- && GET_MODE (XEXP (addr, 0)) == SImode)
- {
- addr = XEXP (addr, 0);
- if (CONST_INT_P (addr))
- return 0;
- }
- else if (GET_CODE (addr) == AND
- && const_32bit_mask (XEXP (addr, 1), DImode))
+ switch (code)
{
- addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
- if (addr == NULL_RTX)
- return 0;
+ case 'A':
+ switch (ASSEMBLER_DIALECT)
+ {
+ case ASM_ATT:
+ putc ('*', file);
+ break;
- if (CONST_INT_P (addr))
- return 0;
- }
- }
+ case ASM_INTEL:
+ /* Intel syntax. For absolute addresses, registers should not
+ be surrounded by brackets. */
+ if (!REG_P (x))
+ {
+ putc ('[', file);
+ ix86_print_operand (file, x, 0);
+ putc (']', file);
+ return;
+ }
+ break;
- /* Allow SImode subregs of DImode addresses,
- they will be emitted with addr32 prefix. */
- if (TARGET_64BIT && GET_MODE (addr) == SImode)
- {
- if (SUBREG_P (addr)
- && GET_MODE (SUBREG_REG (addr)) == DImode)
- {
- addr = SUBREG_REG (addr);
- if (CONST_INT_P (addr))
- return 0;
- }
- }
+ default:
+ gcc_unreachable ();
+ }
- if (REG_P (addr))
- base = addr;
- else if (SUBREG_P (addr))
- {
- if (REG_P (SUBREG_REG (addr)))
- base = addr;
- else
- return 0;
- }
- else if (GET_CODE (addr) == PLUS)
- {
- rtx addends[4], op;
- int n = 0, i;
+ ix86_print_operand (file, x, 0);
+ return;
- op = addr;
- do
- {
- if (n >= 4)
- return 0;
- addends[n++] = XEXP (op, 1);
- op = XEXP (op, 0);
- }
- while (GET_CODE (op) == PLUS);
- if (n >= 4)
- return 0;
- addends[n] = op;
+ case 'E':
+ /* Wrap address in an UNSPEC to declare special handling. */
+ if (TARGET_64BIT)
+ x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
- for (i = n; i >= 0; --i)
- {
- op = addends[i];
- switch (GET_CODE (op))
- {
- case MULT:
- if (index)
- return 0;
- index = XEXP (op, 0);
- scale_rtx = XEXP (op, 1);
- break;
+ output_address (VOIDmode, x);
+ return;
- case ASHIFT:
- if (index)
- return 0;
- index = XEXP (op, 0);
- tmp = XEXP (op, 1);
- if (!CONST_INT_P (tmp))
- return 0;
- scale = INTVAL (tmp);
- if ((unsigned HOST_WIDE_INT) scale > 3)
- return 0;
- scale = 1 << scale;
- break;
+ case 'L':
+ if (ASSEMBLER_DIALECT == ASM_ATT)
+ putc ('l', file);
+ return;
- case ZERO_EXTEND:
- op = XEXP (op, 0);
- if (GET_CODE (op) != UNSPEC)
- return 0;
- /* FALLTHRU */
+ case 'W':
+ if (ASSEMBLER_DIALECT == ASM_ATT)
+ putc ('w', file);
+ return;
- case UNSPEC:
- if (XINT (op, 1) == UNSPEC_TP
- && TARGET_TLS_DIRECT_SEG_REFS
- && seg == ADDR_SPACE_GENERIC)
- seg = DEFAULT_TLS_SEG_REG;
- else
- return 0;
- break;
+ case 'B':
+ if (ASSEMBLER_DIALECT == ASM_ATT)
+ putc ('b', file);
+ return;
- case SUBREG:
- if (!REG_P (SUBREG_REG (op)))
- return 0;
- /* FALLTHRU */
+ case 'Q':
+ if (ASSEMBLER_DIALECT == ASM_ATT)
+ putc ('l', file);
+ return;
- case REG:
- if (!base)
- base = op;
- else if (!index)
- index = op;
- else
- return 0;
+ case 'S':
+ if (ASSEMBLER_DIALECT == ASM_ATT)
+ putc ('s', file);
+ return;
+
+ case 'T':
+ if (ASSEMBLER_DIALECT == ASM_ATT)
+ putc ('t', file);
+ return;
+
+ case 'O':
+#ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
+ if (ASSEMBLER_DIALECT != ASM_ATT)
+ return;
+
+ switch (GET_MODE_SIZE (GET_MODE (x)))
+ {
+ case 2:
+ putc ('w', file);
+ break;
+
+ case 4:
+ putc ('l', file);
break;
- case CONST:
- case CONST_INT:
- case SYMBOL_REF:
- case LABEL_REF:
- if (disp)
- return 0;
- disp = op;
+ case 8:
+ putc ('q', file);
break;
default:
- return 0;
+ output_operand_lossage ("invalid operand size for operand "
+ "code 'O'");
+ return;
}
- }
- }
- else if (GET_CODE (addr) == MULT)
- {
- index = XEXP (addr, 0); /* index*scale */
- scale_rtx = XEXP (addr, 1);
- }
- else if (GET_CODE (addr) == ASHIFT)
- {
- /* We're called for lea too, which implements ashift on occasion. */
- index = XEXP (addr, 0);
- tmp = XEXP (addr, 1);
- if (!CONST_INT_P (tmp))
- return 0;
- scale = INTVAL (tmp);
- if ((unsigned HOST_WIDE_INT) scale > 3)
- return 0;
- scale = 1 << scale;
- retval = -1;
- }
- else
- disp = addr; /* displacement */
-
- if (index)
- {
- if (REG_P (index))
- ;
- else if (SUBREG_P (index)
- && REG_P (SUBREG_REG (index)))
- ;
- else
- return 0;
- }
-
- /* Extract the integral value of scale. */
- if (scale_rtx)
- {
- if (!CONST_INT_P (scale_rtx))
- return 0;
- scale = INTVAL (scale_rtx);
- }
- base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
- index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
+ putc ('.', file);
+#endif
+ return;
- /* Avoid useless 0 displacement. */
- if (disp == const0_rtx && (base || index))
- disp = NULL_RTX;
+ case 'z':
+ if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
+ {
+ /* Opcodes don't get size suffixes when using Intel syntax. */
+ if (ASSEMBLER_DIALECT == ASM_INTEL)
+ return;
- /* Allow arg pointer and stack pointer as index if there is not scaling. */
- if (base_reg && index_reg && scale == 1
- && (REGNO (index_reg) == ARG_POINTER_REGNUM
- || REGNO (index_reg) == FRAME_POINTER_REGNUM
- || REGNO (index_reg) == SP_REG))
- {
- std::swap (base, index);
- std::swap (base_reg, index_reg);
- }
+ switch (GET_MODE_SIZE (GET_MODE (x)))
+ {
+ case 1:
+ putc ('b', file);
+ return;
- /* Special case: %ebp cannot be encoded as a base without a displacement.
- Similarly %r13. */
- if (!disp && base_reg
- && (REGNO (base_reg) == ARG_POINTER_REGNUM
- || REGNO (base_reg) == FRAME_POINTER_REGNUM
- || REGNO (base_reg) == BP_REG
- || REGNO (base_reg) == R13_REG))
- disp = const0_rtx;
+ case 2:
+ putc ('w', file);
+ return;
- /* Special case: on K6, [%esi] makes the instruction vector decoded.
- Avoid this by transforming to [%esi+0].
- Reload calls address legitimization without cfun defined, so we need
- to test cfun for being non-NULL. */
- if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
- && base_reg && !index_reg && !disp
- && REGNO (base_reg) == SI_REG)
- disp = const0_rtx;
+ case 4:
+ putc ('l', file);
+ return;
- /* Special case: encode reg+reg instead of reg*2. */
- if (!base && index && scale == 2)
- base = index, base_reg = index_reg, scale = 1;
+ case 8:
+ putc ('q', file);
+ return;
- /* Special case: scaling cannot be encoded without base or displacement. */
- if (!base && !disp && index && scale != 1)
- disp = const0_rtx;
+ default:
+ output_operand_lossage ("invalid operand size for operand "
+ "code 'z'");
+ return;
+ }
+ }
- out->base = base;
- out->index = index;
- out->disp = disp;
- out->scale = scale;
- out->seg = seg;
+ if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
+ warning (0, "non-integer operand used with operand code %<z%>");
+ /* FALLTHRU */
- return retval;
-}
-\f
-/* Return cost of the memory address x.
- For i386, it is better to use a complex address than let gcc copy
- the address into a reg and make a new pseudo. But not if the address
- requires to two regs - that would mean more pseudos with longer
- lifetimes. */
-static int
-ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
-{
- struct ix86_address parts;
- int cost = 1;
- int ok = ix86_decompose_address (x, &parts);
+ case 'Z':
+ /* 387 opcodes don't get size suffixes when using Intel syntax. */
+ if (ASSEMBLER_DIALECT == ASM_INTEL)
+ return;
- gcc_assert (ok);
+ if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
+ {
+ switch (GET_MODE_SIZE (GET_MODE (x)))
+ {
+ case 2:
+#ifdef HAVE_AS_IX86_FILDS
+ putc ('s', file);
+#endif
+ return;
- if (parts.base && SUBREG_P (parts.base))
- parts.base = SUBREG_REG (parts.base);
- if (parts.index && SUBREG_P (parts.index))
- parts.index = SUBREG_REG (parts.index);
+ case 4:
+ putc ('l', file);
+ return;
- /* Attempt to minimize number of registers in the address by increasing
- address cost for each used register. We don't increase address cost
- for "pic_offset_table_rtx". When a memopt with "pic_offset_table_rtx"
- is not invariant itself it most likely means that base or index is not
- invariant. Therefore only "pic_offset_table_rtx" could be hoisted out,
- which is not profitable for x86. */
- if (parts.base
- && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
- && (current_pass->type == GIMPLE_PASS
- || !pic_offset_table_rtx
- || !REG_P (parts.base)
- || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
- cost++;
+ case 8:
+#ifdef HAVE_AS_IX86_FILDQ
+ putc ('q', file);
+#else
+ fputs ("ll", file);
+#endif
+ return;
- if (parts.index
- && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
- && (current_pass->type == GIMPLE_PASS
- || !pic_offset_table_rtx
- || !REG_P (parts.index)
- || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
- cost++;
+ default:
+ break;
+ }
+ }
+ else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
+ {
+ /* 387 opcodes don't get size suffixes
+ if the operands are registers. */
+ if (STACK_REG_P (x))
+ return;
- /* AMD-K6 don't like addresses with ModR/M set to 00_xxx_100b,
- since it's predecode logic can't detect the length of instructions
- and it degenerates to vector decoded. Increase cost of such
- addresses here. The penalty is minimally 2 cycles. It may be worthwhile
- to split such addresses or even refuse such addresses at all.
+ switch (GET_MODE_SIZE (GET_MODE (x)))
+ {
+ case 4:
+ putc ('s', file);
+ return;
- Following addressing modes are affected:
- [base+scale*index]
- [scale*index+disp]
- [base+index]
+ case 8:
+ putc ('l', file);
+ return;
- The first and last case may be avoidable by explicitly coding the zero in
- memory address, but I don't have AMD-K6 machine handy to check this
- theory. */
+ case 12:
+ case 16:
+ putc ('t', file);
+ return;
- if (TARGET_K6
- && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
- || (parts.disp && !parts.base && parts.index && parts.scale != 1)
- || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
- cost += 10;
+ default:
+ break;
+ }
+ }
+ else
+ {
+ output_operand_lossage ("invalid operand type used with "
+ "operand code 'Z'");
+ return;
+ }
- return cost;
-}
-\f
-/* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
- this is used for to form addresses to local data when -fPIC is in
- use. */
+ output_operand_lossage ("invalid operand size for operand code 'Z'");
+ return;
-static bool
-darwin_local_data_pic (rtx disp)
-{
- return (GET_CODE (disp) == UNSPEC
- && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
-}
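+ /* These codes only modify how the operand itself is printed; they
+ are handled below when the operand is output. */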
+ case 'd':
+ case 'b':
+ case 'w':
+ case 'k':
+ case 'q':
+ case 'h':
+ case 't':
+ case 'g':
+ case 'y':
+ case 'x':
+ case 'X':
+ case 'P':
+ case 'p':
+ case 'V':
+ break;
-/* True if operand X should be loaded from GOT. */
+ case 's':
+ if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
+ {
+ ix86_print_operand (file, x, 0);
+ fputs (", ", file);
+ }
+ return;
-bool
-ix86_force_load_from_GOT_p (rtx x)
-{
- return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
- && !TARGET_PECOFF && !TARGET_MACHO
- && !flag_pic
- && ix86_cmodel != CM_LARGE
- && GET_CODE (x) == SYMBOL_REF
- && SYMBOL_REF_FUNCTION_P (x)
- && (!flag_plt
- || (SYMBOL_REF_DECL (x)
- && lookup_attribute ("noplt",
- DECL_ATTRIBUTES (SYMBOL_REF_DECL (x)))))
- && !SYMBOL_REF_LOCAL_P (x));
-}
-
-/* Determine if a given RTX is a valid constant. We already know this
- satisfies CONSTANT_P. */
-
-static bool
-ix86_legitimate_constant_p (machine_mode mode, rtx x)
-{
- switch (GET_CODE (x))
- {
- case CONST:
- x = XEXP (x, 0);
-
- if (GET_CODE (x) == PLUS)
- {
- if (!CONST_INT_P (XEXP (x, 1)))
- return false;
- x = XEXP (x, 0);
- }
+ case 'Y':
+ switch (GET_CODE (x))
+ {
+ case NE:
+ fputs ("neq", file);
+ break;
+ case EQ:
+ fputs ("eq", file);
+ break;
+ case GE:
+ case GEU:
+ fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
+ break;
+ case GT:
+ case GTU:
+ fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
+ break;
+ case LE:
+ case LEU:
+ fputs ("le", file);
+ break;
+ case LT:
+ case LTU:
+ fputs ("lt", file);
+ break;
+ case UNORDERED:
+ fputs ("unord", file);
+ break;
+ case ORDERED:
+ fputs ("ord", file);
+ break;
+ case UNEQ:
+ fputs ("ueq", file);
+ break;
+ case UNGE:
+ fputs ("nlt", file);
+ break;
+ case UNGT:
+ fputs ("nle", file);
+ break;
+ case UNLE:
+ fputs ("ule", file);
+ break;
+ case UNLT:
+ fputs ("ult", file);
+ break;
+ case LTGT:
+ fputs ("une", file);
+ break;
+ default:
+ output_operand_lossage ("operand is not a condition code, "
+ "invalid operand code 'Y'");
+ return;
+ }
+ return;
- if (TARGET_MACHO && darwin_local_data_pic (x))
- return true;
+ case 'D':
+ /* Little bit of braindamage here. The SSE compare instructions
+ use completely different names for the comparisons than the
+ fp conditional moves do. */
+ switch (GET_CODE (x))
+ {
+ case UNEQ:
+ if (TARGET_AVX)
+ {
+ fputs ("eq_us", file);
+ break;
+ }
+ /* FALLTHRU */
+ case EQ:
+ fputs ("eq", file);
+ break;
+ case UNLT:
+ if (TARGET_AVX)
+ {
+ fputs ("nge", file);
+ break;
+ }
+ /* FALLTHRU */
+ case LT:
+ fputs ("lt", file);
+ break;
+ case UNLE:
+ if (TARGET_AVX)
+ {
+ fputs ("ngt", file);
+ break;
+ }
+ /* FALLTHRU */
+ case LE:
+ fputs ("le", file);
+ break;
+ case UNORDERED:
+ fputs ("unord", file);
+ break;
+ case LTGT:
+ if (TARGET_AVX)
+ {
+ fputs ("neq_oq", file);
+ break;
+ }
+ /* FALLTHRU */
+ case NE:
+ fputs ("neq", file);
+ break;
+ case GE:
+ if (TARGET_AVX)
+ {
+ fputs ("ge", file);
+ break;
+ }
+ /* FALLTHRU */
+ case UNGE:
+ fputs ("nlt", file);
+ break;
+ case GT:
+ if (TARGET_AVX)
+ {
+ fputs ("gt", file);
+ break;
+ }
+ /* FALLTHRU */
+ case UNGT:
+ fputs ("nle", file);
+ break;
+ case ORDERED:
+ fputs ("ord", file);
+ break;
+ default:
+ output_operand_lossage ("operand is not a condition code, "
+ "invalid operand code 'D'");
+ return;
+ }
+ return;
- /* Only some unspecs are valid as "constants". */
- if (GET_CODE (x) == UNSPEC)
- switch (XINT (x, 1))
- {
- case UNSPEC_GOT:
- case UNSPEC_GOTOFF:
- case UNSPEC_PLTOFF:
- return TARGET_64BIT;
- case UNSPEC_TPOFF:
- case UNSPEC_NTPOFF:
- x = XVECEXP (x, 0, 0);
- return (GET_CODE (x) == SYMBOL_REF
- && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
- case UNSPEC_DTPOFF:
- x = XVECEXP (x, 0, 0);
- return (GET_CODE (x) == SYMBOL_REF
- && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
- default:
- return false;
- }
+ case 'F':
+ case 'f':
+#ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
+ if (ASSEMBLER_DIALECT == ASM_ATT)
+ putc ('.', file);
+ gcc_fallthrough ();
+#endif
- /* We must have drilled down to a symbol. */
- if (GET_CODE (x) == LABEL_REF)
- return true;
- if (GET_CODE (x) != SYMBOL_REF)
- return false;
- /* FALLTHRU */
+ case 'C':
+ case 'c':
+ if (!COMPARISON_P (x))
+ {
+ output_operand_lossage ("operand is not a condition code, "
+ "invalid operand code '%c'", code);
+ return;
+ }
+ put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
+ code == 'c' || code == 'f',
+ code == 'F' || code == 'f',
+ file);
+ return;
- case SYMBOL_REF:
- /* TLS symbols are never valid. */
- if (SYMBOL_REF_TLS_MODEL (x))
- return false;
+ case 'H':
+ if (!offsettable_memref_p (x))
+ {
+ output_operand_lossage ("operand is not an offsettable memory "
+ "reference, invalid operand code 'H'");
+ return;
+ }
+ /* It doesn't actually matter what mode we use here, as we're
+ only going to use this for printing. */
+ x = adjust_address_nv (x, DImode, 8);
+ /* Output 'qword ptr' for intel assembler dialect. */
+ if (ASSEMBLER_DIALECT == ASM_INTEL)
+ code = 'q';
+ break;
- /* DLLIMPORT symbols are never valid. */
- if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
- && SYMBOL_REF_DLLIMPORT_P (x))
- return false;
+ case 'K':
+ if (!CONST_INT_P (x))
+ {
+ output_operand_lossage ("operand is not an integer, invalid "
+ "operand code 'K'");
+ return;
+ }
-#if TARGET_MACHO
- /* mdynamic-no-pic */
- if (MACHO_DYNAMIC_NO_PIC_P)
- return machopic_symbol_defined_p (x);
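+ /* Emit the HLE prefix mnemonic, or fall back to emitting the raw
+ prefix byte (0xf2/0xf3) if the assembler lacks HLE support. */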
+ if (INTVAL (x) & IX86_HLE_ACQUIRE)
+#ifdef HAVE_AS_IX86_HLE
+ fputs ("xacquire ", file);
+#else
+ fputs ("\n" ASM_BYTE "0xf2\n\t", file);
#endif
+ else if (INTVAL (x) & IX86_HLE_RELEASE)
+#ifdef HAVE_AS_IX86_HLE
+ fputs ("xrelease ", file);
+#else
+ fputs ("\n" ASM_BYTE "0xf3\n\t", file);
+#endif
+ /* We do not want to print the value of the operand. */
+ return;
- /* External function address should be loaded
- via the GOT slot to avoid PLT. */
- if (ix86_force_load_from_GOT_p (x))
- return false;
-
- break;
+ case 'N':
+ if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
+ fputs ("{z}", file);
+ return;
- CASE_CONST_SCALAR_INT:
- switch (mode)
- {
- case E_TImode:
- if (TARGET_64BIT)
- return true;
- /* FALLTHRU */
- case E_OImode:
- case E_XImode:
- if (!standard_sse_constant_p (x, mode))
- return false;
- default:
- break;
- }
- break;
+ case 'r':
+ if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE)
+ {
+ output_operand_lossage ("operand is not a specific integer, "
+ "invalid operand code 'r'");
+ return;
+ }
- case CONST_VECTOR:
- if (!standard_sse_constant_p (x, mode))
- return false;
+ if (ASSEMBLER_DIALECT == ASM_INTEL)
+ fputs (", ", file);
- default:
- break;
- }
+ fputs ("{sae}", file);
- /* Otherwise we handle everything else in the move patterns. */
- return true;
-}
+ if (ASSEMBLER_DIALECT == ASM_ATT)
+ fputs (", ", file);
-/* Determine if it's legal to put X into the constant pool. This
- is not possible for the address of thread-local symbols, which
- is checked above. */
+ return;
-static bool
-ix86_cannot_force_const_mem (machine_mode mode, rtx x)
-{
- /* We can put any immediate constant in memory. */
- switch (GET_CODE (x))
- {
- CASE_CONST_ANY:
- return false;
+ case 'R':
+ if (!CONST_INT_P (x))
+ {
+ output_operand_lossage ("operand is not an integer, invalid "
+ "operand code 'R'");
+ return;
+ }
- default:
- break;
- }
+ if (ASSEMBLER_DIALECT == ASM_INTEL)
+ fputs (", ", file);
- return !ix86_legitimate_constant_p (mode, x);
-}
+ switch (INTVAL (x))
+ {
+ case ROUND_NEAREST_INT | ROUND_SAE:
+ fputs ("{rn-sae}", file);
+ break;
+ case ROUND_NEG_INF | ROUND_SAE:
+ fputs ("{rd-sae}", file);
+ break;
+ case ROUND_POS_INF | ROUND_SAE:
+ fputs ("{ru-sae}", file);
+ break;
+ case ROUND_ZERO | ROUND_SAE:
+ fputs ("{rz-sae}", file);
+ break;
+ default:
+ output_operand_lossage ("operand is not a specific integer, "
+ "invalid operand code 'R'");
+ }
-/* Nonzero if the symbol is marked as dllimport, or as stub-variable,
- otherwise zero. */
+ if (ASSEMBLER_DIALECT == ASM_ATT)
+ fputs (", ", file);
-static bool
-is_imported_p (rtx x)
-{
- if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
- || GET_CODE (x) != SYMBOL_REF)
- return false;
+ return;
- return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
-}
+ case '*':
+ if (ASSEMBLER_DIALECT == ASM_ATT)
+ putc ('*', file);
+ return;
+ case '&':
+ {
+ const char *name = get_some_local_dynamic_name ();
+ if (name == NULL)
+ output_operand_lossage ("'%%&' used without any "
+ "local dynamic TLS references");
+ else
+ assemble_name (file, name);
+ return;
+ }
-/* Nonzero if the constant value X is a legitimate general operand
- when generating PIC code. It is given that flag_pic is on and
- that X satisfies CONSTANT_P. */
+ case '+':
+ {
+ rtx x;
-bool
-legitimate_pic_operand_p (rtx x)
-{
- rtx inner;
+ if (!optimize
+ || optimize_function_for_size_p (cfun)
+ || !TARGET_BRANCH_PREDICTION_HINTS)
+ return;
- switch (GET_CODE (x))
- {
- case CONST:
- inner = XEXP (x, 0);
- if (GET_CODE (inner) == PLUS
- && CONST_INT_P (XEXP (inner, 1)))
- inner = XEXP (inner, 0);
+ x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
+ if (x)
+ {
+ int pred_val = profile_probability::from_reg_br_prob_note
+ (XINT (x, 0)).to_reg_br_prob_base ();
- /* Only some unspecs are valid as "constants". */
- if (GET_CODE (inner) == UNSPEC)
- switch (XINT (inner, 1))
- {
- case UNSPEC_GOT:
- case UNSPEC_GOTOFF:
- case UNSPEC_PLTOFF:
- return TARGET_64BIT;
- case UNSPEC_TPOFF:
- x = XVECEXP (inner, 0, 0);
- return (GET_CODE (x) == SYMBOL_REF
- && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
- case UNSPEC_MACHOPIC_OFFSET:
- return legitimate_pic_address_disp_p (x);
- default:
- return false;
+ if (pred_val < REG_BR_PROB_BASE * 45 / 100
+ || pred_val > REG_BR_PROB_BASE * 55 / 100)
+ {
+ bool taken = pred_val > REG_BR_PROB_BASE / 2;
+ bool cputaken
+ = final_forward_branch_p (current_output_insn) == 0;
+
+ /* Emit hints only where the default branch-prediction
+ heuristics would predict wrongly. */
+ if (taken != cputaken)
+ {
+ /* We use 3e (DS) prefix for taken branches and
+ 2e (CS) prefix for not taken branches. */
+ if (taken)
+ fputs ("ds ; ", file);
+ else
+ fputs ("cs ; ", file);
+ }
+ }
+ }
+ return;
}
- /* FALLTHRU */
- case SYMBOL_REF:
- case LABEL_REF:
- return legitimate_pic_address_disp_p (x);
+ case ';':
+#ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
+ putc (';', file);
+#endif
+ return;
- default:
- return true;
- }
-}
+ case '~':
+ putc (TARGET_AVX2 ? 'i' : 'f', file);
+ return;
-/* Determine if a given CONST RTX is a valid memory displacement
- in PIC mode. */
+ case 'M':
+ if (TARGET_X32)
+ {
+ /* NB: 32-bit indices in a VSIB address are sign-extended
+ to 64 bits. In x32, a 32-bit address like 0xf7fa3010 is
+ sign-extended to 0xfffffffff7fa3010, which is an invalid
+ address. Add the addr32 prefix if there is neither a base
+ register nor a symbol. */
+ bool ok;
+ struct ix86_address parts;
+ ok = ix86_decompose_address (x, &parts);
+ gcc_assert (ok && parts.index == NULL_RTX);
+ if (parts.base == NULL_RTX
+ && (parts.disp == NULL_RTX
+ || !symbolic_operand (parts.disp,
+ GET_MODE (parts.disp))))
+ fputs ("addr32 ", file);
+ }
+ return;
-bool
-legitimate_pic_address_disp_p (rtx disp)
-{
- bool saw_plus;
+ case '^':
+ if (TARGET_64BIT && Pmode != word_mode)
+ fputs ("addr32 ", file);
+ return;
- /* In 64bit mode we can allow direct addresses of symbols and labels
- when they are not dynamic symbols. */
- if (TARGET_64BIT)
+ case '!':
+ if (ix86_notrack_prefixed_insn_p (current_output_insn))
+ fputs ("notrack ", file);
+ return;
+
+ default:
+ output_operand_lossage ("invalid operand code '%c'", code);
+ }
+ }
+
+ if (REG_P (x))
+ print_reg (x, code, file);
+
+ else if (MEM_P (x))
{
- rtx op0 = disp, op1;
+ rtx addr = XEXP (x, 0);
- switch (GET_CODE (disp))
+ /* No `byte ptr' prefix for call instructions ... */
+ if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
{
- case LABEL_REF:
- return true;
+ machine_mode mode = GET_MODE (x);
+ const char *size;
- case CONST:
- if (GET_CODE (XEXP (disp, 0)) != PLUS)
- break;
- op0 = XEXP (XEXP (disp, 0), 0);
- op1 = XEXP (XEXP (disp, 0), 1);
- if (!CONST_INT_P (op1))
- break;
- if (GET_CODE (op0) == UNSPEC
- && (XINT (op0, 1) == UNSPEC_DTPOFF
- || XINT (op0, 1) == UNSPEC_NTPOFF)
- && trunc_int_for_mode (INTVAL (op1), SImode) == INTVAL (op1))
- return true;
- if (INTVAL (op1) >= 16*1024*1024
- || INTVAL (op1) < -16*1024*1024)
- break;
- if (GET_CODE (op0) == LABEL_REF)
- return true;
- if (GET_CODE (op0) == CONST
- && GET_CODE (XEXP (op0, 0)) == UNSPEC
- && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
- return true;
- if (GET_CODE (op0) == UNSPEC
- && XINT (op0, 1) == UNSPEC_PCREL)
- return true;
- if (GET_CODE (op0) != SYMBOL_REF)
- break;
- /* FALLTHRU */
-
- case SYMBOL_REF:
- /* TLS references should always be enclosed in UNSPEC.
- The dllimported symbol needs always to be resolved. */
- if (SYMBOL_REF_TLS_MODEL (op0)
- || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
- return false;
-
- if (TARGET_PECOFF)
- {
- if (is_imported_p (op0))
- return true;
-
- if (SYMBOL_REF_FAR_ADDR_P (op0)
- || !SYMBOL_REF_LOCAL_P (op0))
+ /* Check for explicit size override codes. */
+ if (code == 'b')
+ size = "BYTE";
+ else if (code == 'w')
+ size = "WORD";
+ else if (code == 'k')
+ size = "DWORD";
+ else if (code == 'q')
+ size = "QWORD";
+ else if (code == 'x')
+ size = "XMMWORD";
+ else if (code == 't')
+ size = "YMMWORD";
+ else if (code == 'g')
+ size = "ZMMWORD";
+ else if (mode == BLKmode)
+ /* ... or BLKmode operands, when not overridden. */
+ size = NULL;
+ else
+ switch (GET_MODE_SIZE (mode))
+ {
+ case 1: size = "BYTE"; break;
+ case 2: size = "WORD"; break;
+ case 4: size = "DWORD"; break;
+ case 8: size = "QWORD"; break;
+ case 12: size = "TBYTE"; break;
+ case 16:
+ if (mode == XFmode)
+ size = "TBYTE";
+ else
+ size = "XMMWORD";
break;
-
- /* Function-symbols need to be resolved only for
- large-model.
- For the small-model we don't need to resolve anything
- here. */
- if ((ix86_cmodel != CM_LARGE_PIC
- && SYMBOL_REF_FUNCTION_P (op0))
- || ix86_cmodel == CM_SMALL_PIC)
- return true;
- /* Non-external symbols don't need to be resolved for
- large, and medium-model. */
- if ((ix86_cmodel == CM_LARGE_PIC
- || ix86_cmodel == CM_MEDIUM_PIC)
- && !SYMBOL_REF_EXTERNAL_P (op0))
- return true;
+ case 32: size = "YMMWORD"; break;
+ case 64: size = "ZMMWORD"; break;
+ default:
+ gcc_unreachable ();
+ }
+ if (size)
+ {
+ fputs (size, file);
+ fputs (" PTR ", file);
}
- else if (!SYMBOL_REF_FAR_ADDR_P (op0)
- && (SYMBOL_REF_LOCAL_P (op0)
- || (HAVE_LD_PIE_COPYRELOC
- && flag_pie
- && !SYMBOL_REF_WEAK (op0)
- && !SYMBOL_REF_FUNCTION_P (op0)))
- && ix86_cmodel != CM_LARGE_PIC)
- return true;
- break;
-
- default:
- break;
}
+
+ if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
+ output_operand_lossage ("invalid constraints for operand");
+ else
+ ix86_print_operand_address_as
+ (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
}
- if (GET_CODE (disp) != CONST)
- return false;
- disp = XEXP (disp, 0);
- if (TARGET_64BIT)
+ else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
{
- /* We are unsafe to allow PLUS expressions. This limit allowed distance
- of GOT tables. We should not need these anyway. */
- if (GET_CODE (disp) != UNSPEC
- || (XINT (disp, 1) != UNSPEC_GOTPCREL
- && XINT (disp, 1) != UNSPEC_GOTOFF
- && XINT (disp, 1) != UNSPEC_PCREL
- && XINT (disp, 1) != UNSPEC_PLTOFF))
- return false;
+ long l;
- if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
- && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
- return false;
- return true;
+ REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
+
+ if (ASSEMBLER_DIALECT == ASM_ATT)
+ putc ('$', file);
+ /* Sign extend 32bit SFmode immediate to 8 bytes. */
+ if (code == 'q')
+ fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
+ (unsigned long long) (int) l);
+ else
+ fprintf (file, "0x%08x", (unsigned int) l);
}
- saw_plus = false;
- if (GET_CODE (disp) == PLUS)
+ else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
{
- if (!CONST_INT_P (XEXP (disp, 1)))
- return false;
- disp = XEXP (disp, 0);
- saw_plus = true;
- }
+ long l[2];
- if (TARGET_MACHO && darwin_local_data_pic (disp))
- return true;
+ REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
- if (GET_CODE (disp) != UNSPEC)
- return false;
+ if (ASSEMBLER_DIALECT == ASM_ATT)
+ putc ('$', file);
+ fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
+ }
- switch (XINT (disp, 1))
+ /* These float cases don't actually occur as immediate operands. */
+ else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
{
- case UNSPEC_GOT:
- if (saw_plus)
- return false;
- /* We need to check for both symbols and labels because VxWorks loads
- text labels with @GOT rather than @GOTOFF. See gotoff_operand for
- details. */
- return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
- || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
- case UNSPEC_GOTOFF:
- /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
- While ABI specify also 32bit relocation but we don't produce it in
- small PIC model at all. */
- if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
- || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
- && !TARGET_64BIT)
- return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
- return false;
- case UNSPEC_GOTTPOFF:
- case UNSPEC_GOTNTPOFF:
- case UNSPEC_INDNTPOFF:
- if (saw_plus)
- return false;
- disp = XVECEXP (disp, 0, 0);
- return (GET_CODE (disp) == SYMBOL_REF
- && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
- case UNSPEC_NTPOFF:
- disp = XVECEXP (disp, 0, 0);
- return (GET_CODE (disp) == SYMBOL_REF
- && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
- case UNSPEC_DTPOFF:
- disp = XVECEXP (disp, 0, 0);
- return (GET_CODE (disp) == SYMBOL_REF
- && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
+ char dstr[30];
+
+ real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
+ fputs (dstr, file);
}
- return false;
-}
+ else
+ {
+ /* We have patterns that allow zero sets of memory, for instance.
+ In 64-bit mode, we should probably support all 8-byte vectors,
+ since we can in fact encode that into an immediate. */
+ if (GET_CODE (x) == CONST_VECTOR)
+ {
+ if (x != CONST0_RTX (GET_MODE (x)))
+ output_operand_lossage ("invalid vector immediate");
+ x = const0_rtx;
+ }
-/* Determine if op is suitable RTX for an address register.
- Return naked register if a register or a register subreg is
- found, otherwise return NULL_RTX. */
+ if (code != 'P' && code != 'p')
+ {
+ if (CONST_INT_P (x))
+ {
+ if (ASSEMBLER_DIALECT == ASM_ATT)
+ putc ('$', file);
+ }
+ else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
+ || GET_CODE (x) == LABEL_REF)
+ {
+ if (ASSEMBLER_DIALECT == ASM_ATT)
+ putc ('$', file);
+ else
+ fputs ("OFFSET FLAT:", file);
+ }
+ }
+ if (CONST_INT_P (x))
+ fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
+ else if (flag_pic || MACHOPIC_INDIRECT)
+ output_pic_addr_const (file, x, code);
+ else
+ output_addr_const (file, x);
+ }
+}
-static rtx
-ix86_validate_address_register (rtx op)
+static bool
+ix86_print_operand_punct_valid_p (unsigned char code)
{
- machine_mode mode = GET_MODE (op);
+ return (code == '*' || code == '+' || code == '&' || code == ';'
+ || code == '~' || code == '^' || code == '!');
+}
+\f
+/* Print a memory operand whose address is ADDR. */
- /* Only SImode or DImode registers can form the address. */
- if (mode != SImode && mode != DImode)
- return NULL_RTX;
+static void
+ix86_print_operand_address_as (FILE *file, rtx addr,
+ addr_space_t as, bool no_rip)
+{
+ struct ix86_address parts;
+ rtx base, index, disp;
+ int scale;
+ int ok;
+ bool vsib = false;
+ int code = 0;
- if (REG_P (op))
- return op;
- else if (SUBREG_P (op))
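+ /* VSIB addresses are wrapped in an UNSPEC carrying the base
+ address, the vector index register and the scale; unpack them
+ before printing. */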
+ if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
{
- rtx reg = SUBREG_REG (op);
-
- if (!REG_P (reg))
- return NULL_RTX;
-
- mode = GET_MODE (reg);
-
- /* Don't allow SUBREGs that span more than a word. It can
- lead to spill failures when the register is one word out
- of a two word structure. */
- if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
- return NULL_RTX;
-
- /* Allow only SUBREGs of non-eliminable hard registers. */
- if (register_no_elim_operand (reg, mode))
- return reg;
+ ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
+ gcc_assert (parts.index == NULL_RTX);
+ parts.index = XVECEXP (addr, 0, 1);
+ parts.scale = INTVAL (XVECEXP (addr, 0, 2));
+ addr = XVECEXP (addr, 0, 0);
+ vsib = true;
}
+ else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
+ {
+ gcc_assert (TARGET_64BIT);
+ ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
+ code = 'q';
+ }
+ else
+ ok = ix86_decompose_address (addr, &parts);
- /* Op is not a register. */
- return NULL_RTX;
-}
-
-/* Recognizes RTL expressions that are valid memory addresses for an
- instruction. The MODE argument is the machine mode for the MEM
- expression that wants to use this address.
-
- It only recognizes address in canonical form. LEGITIMIZE_ADDRESS should
- convert common non-canonical forms to canonical form so that they will
- be recognized. */
-
-static bool
-ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
-{
- struct ix86_address parts;
- rtx base, index, disp;
- HOST_WIDE_INT scale;
- addr_space_t seg;
-
- if (ix86_decompose_address (addr, &parts) <= 0)
- /* Decomposition failed. */
- return false;
+ gcc_assert (ok);
base = parts.base;
index = parts.index;
disp = parts.disp;
scale = parts.scale;
- seg = parts.seg;
- /* Validate base register. */
- if (base)
- {
- rtx reg = ix86_validate_address_register (base);
+ if (ADDR_SPACE_GENERIC_P (as))
+ as = parts.seg;
+ else
+ gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
- if (reg == NULL_RTX)
- return false;
+ if (!ADDR_SPACE_GENERIC_P (as))
+ {
+ if (ASSEMBLER_DIALECT == ASM_ATT)
+ putc ('%', file);
- if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
- || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
- /* Base is not valid. */
- return false;
+ switch (as)
+ {
+ case ADDR_SPACE_SEG_FS:
+ fputs ("fs:", file);
+ break;
+ case ADDR_SPACE_SEG_GS:
+ fputs ("gs:", file);
+ break;
+ default:
+ gcc_unreachable ();
+ }
}
- /* Validate index register. */
- if (index)
+ /* Use one byte shorter RIP relative addressing for 64bit mode. */
+ if (TARGET_64BIT && !base && !index && !no_rip)
{
- rtx reg = ix86_validate_address_register (index);
+ rtx symbol = disp;
- if (reg == NULL_RTX)
- return false;
+ if (GET_CODE (disp) == CONST
+ && GET_CODE (XEXP (disp, 0)) == PLUS
+ && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
+ symbol = XEXP (XEXP (disp, 0), 0);
- if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
- || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
- /* Index is not valid. */
- return false;
+ if (GET_CODE (symbol) == LABEL_REF
+ || (GET_CODE (symbol) == SYMBOL_REF
+ && SYMBOL_REF_TLS_MODEL (symbol) == 0))
+ base = pc_rtx;
}
- /* Index and base should have the same mode. */
- if (base && index
- && GET_MODE (base) != GET_MODE (index))
- return false;
-
- /* Address override works only on the (%reg) part of %fs:(%reg). */
- if (seg != ADDR_SPACE_GENERIC
- && ((base && GET_MODE (base) != word_mode)
- || (index && GET_MODE (index) != word_mode)))
- return false;
-
- /* Validate scale factor. */
- if (scale != 1)
+ if (!base && !index)
{
- if (!index)
- /* Scale without index. */
- return false;
-
- if (scale != 2 && scale != 4 && scale != 8)
- /* Scale is not a valid multiplier. */
- return false;
+ /* A displacement-only address requires special attention. */
+ if (CONST_INT_P (disp))
+ {
+ if (ASSEMBLER_DIALECT == ASM_INTEL && ADDR_SPACE_GENERIC_P (as))
+ fputs ("ds:", file);
+ fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
+ }
+ /* Load the external function address via the GOT slot to avoid PLT. */
+ else if (GET_CODE (disp) == CONST
+ && GET_CODE (XEXP (disp, 0)) == UNSPEC
+ && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
+ || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
+ && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
+ output_pic_addr_const (file, disp, 0);
+ else if (flag_pic)
+ output_pic_addr_const (file, disp, 0);
+ else
+ output_addr_const (file, disp);
}
-
- /* Validate displacement. */
- if (disp)
+ else
{
- if (GET_CODE (disp) == CONST
- && GET_CODE (XEXP (disp, 0)) == UNSPEC
- && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
- switch (XINT (XEXP (disp, 0), 1))
- {
- /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit
- when used. While ABI specify also 32bit relocations, we
- don't produce them at all and use IP relative instead.
- Allow GOT in 32bit mode for both PIC and non-PIC if symbol
- should be loaded via GOT. */
- case UNSPEC_GOT:
- if (!TARGET_64BIT
- && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
- goto is_legitimate_pic;
- /* FALLTHRU */
- case UNSPEC_GOTOFF:
- gcc_assert (flag_pic);
- if (!TARGET_64BIT)
- goto is_legitimate_pic;
-
- /* 64bit address unspec. */
- return false;
-
- case UNSPEC_GOTPCREL:
- if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
- goto is_legitimate_pic;
- /* FALLTHRU */
- case UNSPEC_PCREL:
- gcc_assert (flag_pic);
- goto is_legitimate_pic;
-
- case UNSPEC_GOTTPOFF:
- case UNSPEC_GOTNTPOFF:
- case UNSPEC_INDNTPOFF:
- case UNSPEC_NTPOFF:
- case UNSPEC_DTPOFF:
- break;
+ /* Print SImode register names to force addr32 prefix. */
+ if (SImode_address_operand (addr, VOIDmode))
+ {
+ if (flag_checking)
+ {
+ gcc_assert (TARGET_64BIT);
+ switch (GET_CODE (addr))
+ {
+ case SUBREG:
+ gcc_assert (GET_MODE (addr) == SImode);
+ gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
+ break;
+ case ZERO_EXTEND:
+ case AND:
+ gcc_assert (GET_MODE (addr) == DImode);
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ }
+ gcc_assert (!code);
+ code = 'k';
+ }
+ else if (code == 0
+ && TARGET_X32
+ && disp
+ && CONST_INT_P (disp)
+ && INTVAL (disp) < -16*1024*1024)
+ {
+ /* X32 runs in 64-bit mode, where displacement, DISP, in
+ address DISP(%r64), is encoded as 32-bit immediate sign-
+ extended from 32-bit to 64-bit. For -0x40000300(%r64),
+ address is %r64 + 0xffffffffbffffd00. When %r64 <
+ 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
+ which is invalid for x32. The correct address is %r64
+ - 0x40000300 == 0xf7ffdd64. To properly encode
+ -0x40000300(%r64) for x32, we zero-extend negative
+ displacement by forcing addr32 prefix which truncates
+ 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
+ zero-extend all negative displacements, including -1(%rsp).
+ However, for small negative displacements, sign-extension
+ won't cause overflow. We only zero-extend negative
+ displacements if they are < -16*1024*1024, which is also the
+ bound used to check legitimate address displacements for PIC. */
+ code = 'k';
+ }
- default:
- /* Invalid address unspec. */
- return false;
- }
+ /* Since the upper 32 bits of RSP are always zero for x32,
+ we can encode %esp as %rsp to avoid 0x67 prefix if
+ there is no index register. */
+ if (TARGET_X32 && Pmode == SImode
+ && !index && base && REG_P (base) && REGNO (base) == SP_REG)
+ code = 'q';
- else if (SYMBOLIC_CONST (disp)
- && (flag_pic
- || (TARGET_MACHO
-#if TARGET_MACHO
- && MACHOPIC_INDIRECT
- && !machopic_operand_p (disp)
-#endif
- )))
+ if (ASSEMBLER_DIALECT == ASM_ATT)
{
-
- is_legitimate_pic:
- if (TARGET_64BIT && (index || base))
+ if (disp)
{
- /* foo@dtpoff(%rX) is ok. */
- if (GET_CODE (disp) != CONST
- || GET_CODE (XEXP (disp, 0)) != PLUS
- || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
- || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
- || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
- && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
- /* Non-constant pic memory reference. */
- return false;
+ if (flag_pic)
+ output_pic_addr_const (file, disp, 0);
+ else if (GET_CODE (disp) == LABEL_REF)
+ output_asm_label (disp);
+ else
+ output_addr_const (file, disp);
}
- else if ((!TARGET_MACHO || flag_pic)
- && ! legitimate_pic_address_disp_p (disp))
- /* Displacement is an invalid pic construct. */
- return false;
-#if TARGET_MACHO
- else if (MACHO_DYNAMIC_NO_PIC_P
- && !ix86_legitimate_constant_p (Pmode, disp))
- /* displacment must be referenced via non_lazy_pointer */
- return false;
-#endif
- /* This code used to verify that a symbolic pic displacement
- includes the pic_offset_table_rtx register.
+ putc ('(', file);
+ if (base)
+ print_reg (base, code, file);
+ if (index)
+ {
+ putc (',', file);
+ print_reg (index, vsib ? 0 : code, file);
+ if (scale != 1 || vsib)
+ fprintf (file, ",%d", scale);
+ }
+ putc (')', file);
+ }
+ else
+ {
+ rtx offset = NULL_RTX;
- While this is good idea, unfortunately these constructs may
- be created by "adds using lea" optimization for incorrect
- code like:
+ if (disp)
+ {
+ /* Pull out the offset of a symbol; print any symbol itself. */
+ if (GET_CODE (disp) == CONST
+ && GET_CODE (XEXP (disp, 0)) == PLUS
+ && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
+ {
+ offset = XEXP (XEXP (disp, 0), 1);
+ disp = gen_rtx_CONST (VOIDmode,
+ XEXP (XEXP (disp, 0), 0));
+ }
- int a;
- int foo(int i)
- {
- return *(&a+i);
- }
+ if (flag_pic)
+ output_pic_addr_const (file, disp, 0);
+ else if (GET_CODE (disp) == LABEL_REF)
+ output_asm_label (disp);
+ else if (CONST_INT_P (disp))
+ offset = disp;
+ else
+ output_addr_const (file, disp);
+ }
- This code is nonsensical, but results in addressing
- GOT table with pic_offset_table_rtx base. We can't
- just refuse it easily, since it gets matched by
- "addsi3" pattern, that later gets split to lea in the
- case output register differs from input. While this
- can be handled by separate addsi pattern for this case
- that never results in lea, this seems to be easier and
- correct fix for crash to disable this test. */
+ putc ('[', file);
+ if (base)
+ {
+ print_reg (base, code, file);
+ if (offset)
+ {
+ if (INTVAL (offset) >= 0)
+ putc ('+', file);
+ fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
+ }
+ }
+ else if (offset)
+ fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
+ else
+ putc ('0', file);
+
+ if (index)
+ {
+ putc ('+', file);
+ print_reg (index, vsib ? 0 : code, file);
+ if (scale != 1 || vsib)
+ fprintf (file, "*%d", scale);
+ }
+ putc (']', file);
}
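+ /* As an illustration, an address with displacement -16, base %rbp,
+ index %rax and scale 4 comes out as "-16(%rbp,%rax,4)" from the
+ AT&T branch above and as "[rbp-16+rax*4]" from the Intel branch. */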
- else if (GET_CODE (disp) != LABEL_REF
- && !CONST_INT_P (disp)
- && (GET_CODE (disp) != CONST
- || !ix86_legitimate_constant_p (Pmode, disp))
- && (GET_CODE (disp) != SYMBOL_REF
- || !ix86_legitimate_constant_p (Pmode, disp)))
- /* Displacement is not constant. */
- return false;
- else if (TARGET_64BIT
- && !x86_64_immediate_operand (disp, VOIDmode))
- /* Displacement is out of range. */
- return false;
- /* In x32 mode, constant addresses are sign extended to 64bit, so
- we have to prevent addresses from 0x80000000 to 0xffffffff. */
- else if (TARGET_X32 && !(index || base)
- && CONST_INT_P (disp)
- && val_signbit_known_set_p (SImode, INTVAL (disp)))
- return false;
}
-
- /* Everything looks valid. */
- return true;
}
-/* Determine if a given RTX is a valid constant address. */
-
-bool
-constant_address_p (rtx x)
+static void
+ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
{
- return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
+ ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
}
-\f
-/* Return a unique alias set for the GOT. */
-static alias_set_type
-ix86_GOT_alias_set (void)
+/* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
+
+static bool
+i386_asm_output_addr_const_extra (FILE *file, rtx x)
{
- static alias_set_type set = -1;
- if (set == -1)
- set = new_alias_set ();
- return set;
-}
+ rtx op;
-/* Return a legitimate reference for ORIG (an address) using the
- register REG. If REG is 0, a new pseudo is generated.
+ if (GET_CODE (x) != UNSPEC)
+ return false;
- There are two types of references that must be handled:
+ op = XVECEXP (x, 0, 0);
+ switch (XINT (x, 1))
+ {
+ case UNSPEC_GOTOFF:
+ output_addr_const (file, op);
+ fputs ("@gotoff", file);
+ break;
+ case UNSPEC_GOTTPOFF:
+ output_addr_const (file, op);
+ /* FIXME: This might be @TPOFF in Sun ld. */
+ fputs ("@gottpoff", file);
+ break;
+ case UNSPEC_TPOFF:
+ output_addr_const (file, op);
+ fputs ("@tpoff", file);
+ break;
+ case UNSPEC_NTPOFF:
+ output_addr_const (file, op);
+ if (TARGET_64BIT)
+ fputs ("@tpoff", file);
+ else
+ fputs ("@ntpoff", file);
+ break;
+ case UNSPEC_DTPOFF:
+ output_addr_const (file, op);
+ fputs ("@dtpoff", file);
+ break;
+ case UNSPEC_GOTNTPOFF:
+ output_addr_const (file, op);
+ if (TARGET_64BIT)
+ fputs (ASSEMBLER_DIALECT == ASM_ATT ?
+ "@gottpoff(%rip)" : "@gottpoff[rip]", file);
+ else
+ fputs ("@gotntpoff", file);
+ break;
+ case UNSPEC_INDNTPOFF:
+ output_addr_const (file, op);
+ fputs ("@indntpoff", file);
+ break;
+#if TARGET_MACHO
+ case UNSPEC_MACHOPIC_OFFSET:
+ output_addr_const (file, op);
+ putc ('-', file);
+ machopic_output_function_base_name (file);
+ break;
+#endif
- 1. Global data references must load the address from the GOT, via
- the PIC reg. An insn is emitted to do this load, and the reg is
- returned.
+ default:
+ return false;
+ }
- 2. Static data references, constant pool addresses, and code labels
- compute the address as an offset from the GOT, whose base is in
- the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
- differentiate them from global data objects. The returned
- address is the PIC reg + an unspec constant.
+ return true;
+}
+\f
+\f
+/* Output code to perform a 387 binary operation in INSN, one of PLUS,
+ MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
+ is the expression of the binary operation. The output may either be
+ emitted here, or returned to the caller, like all output_* functions.
- TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
- reg also appears in the address. */
+ There is no guarantee that the operands are the same mode, as they
+ might be within FLOAT or FLOAT_EXTEND expressions. */
-static rtx
-legitimize_pic_address (rtx orig, rtx reg)
+#ifndef SYSV386_COMPAT
+/* Set to 1 for compatibility with brain-damaged assemblers. No-one
+ wants to fix the assemblers because that causes incompatibility
+ with gcc. No-one wants to fix gcc because that causes
+ incompatibility with assemblers... You can use the option of
+ -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
+#define SYSV386_COMPAT 1
+#endif
+
+const char *
+output_387_binary_op (rtx_insn *insn, rtx *operands)
{
- rtx addr = orig;
- rtx new_rtx = orig;
+ static char buf[40];
+ const char *p;
+ bool is_sse
+ = (SSE_REG_P (operands[0])
+ || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]));
-#if TARGET_MACHO
- if (TARGET_MACHO && !TARGET_64BIT)
- {
- if (reg == 0)
- reg = gen_reg_rtx (Pmode);
- /* Use the generic Mach-O PIC machinery. */
- return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
- }
-#endif
+ if (is_sse)
+ p = "%v";
+ else if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
+ || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
+ p = "fi";
+ else
+ p = "f";
- if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
- {
- rtx tmp = legitimize_pe_coff_symbol (addr, true);
- if (tmp)
- return tmp;
- }
+ strcpy (buf, p);
- if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
- new_rtx = addr;
- else if ((!TARGET_64BIT
- || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
- && !TARGET_PECOFF
- && gotoff_operand (addr, Pmode))
+ switch (GET_CODE (operands[3]))
{
- /* This symbol may be referenced via a displacement
- from the PIC base address (@GOTOFF). */
- if (GET_CODE (addr) == CONST)
- addr = XEXP (addr, 0);
+ case PLUS:
+ p = "add"; break;
+ case MINUS:
+ p = "sub"; break;
+ case MULT:
+ p = "mul"; break;
+ case DIV:
+ p = "div"; break;
+ default:
+ gcc_unreachable ();
+ }
- if (GET_CODE (addr) == PLUS)
- {
- new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
- UNSPEC_GOTOFF);
- new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
- }
- else
- new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
+ strcat (buf, p);
- new_rtx = gen_rtx_CONST (Pmode, new_rtx);
+ if (is_sse)
+ {
+ p = (GET_MODE (operands[0]) == SFmode) ? "ss" : "sd";
+ strcat (buf, p);
- if (TARGET_64BIT)
- new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
+ if (TARGET_AVX)
+ p = "\t{%2, %1, %0|%0, %1, %2}";
+ else
+ p = "\t{%2, %0|%0, %2}";
- if (reg != 0)
- {
- gcc_assert (REG_P (reg));
- new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
- new_rtx, reg, 1, OPTAB_DIRECT);
- }
+ strcat (buf, p);
+ return buf;
+ }
+
+ /* Even if we do not want to check the inputs, this documents the input
+ constraints, which helps in understanding the following code. */
+ if (flag_checking)
+ {
+ if (STACK_REG_P (operands[0])
+ && ((REG_P (operands[1])
+ && REGNO (operands[0]) == REGNO (operands[1])
+ && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
+ || (REG_P (operands[2])
+ && REGNO (operands[0]) == REGNO (operands[2])
+ && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
+ && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
+ ; /* ok */
else
- new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
+ gcc_unreachable ();
}
- else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
- /* We can't use @GOTOFF for text labels
- on VxWorks, see gotoff_operand. */
- || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
+
+ switch (GET_CODE (operands[3]))
{
- rtx tmp = legitimize_pe_coff_symbol (addr, true);
- if (tmp)
- return tmp;
+ case MULT:
+ case PLUS:
+ if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
+ std::swap (operands[1], operands[2]);
- /* For x64 PE-COFF there is no GOT table,
- so we use address directly. */
- if (TARGET_64BIT && TARGET_PECOFF)
+ /* Now we know operands[0] == operands[1]. */
+
+ if (MEM_P (operands[2]))
{
- new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
- new_rtx = gen_rtx_CONST (Pmode, new_rtx);
+ p = "%Z2\t%2";
+ break;
}
- else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
+
+ if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
{
- new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
- UNSPEC_GOTPCREL);
- new_rtx = gen_rtx_CONST (Pmode, new_rtx);
- new_rtx = gen_const_mem (Pmode, new_rtx);
- set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
+ if (STACK_TOP_P (operands[0]))
+ /* How is it that we are storing to a dead operand[2]?
+ Well, presumably operands[1] is dead too. We can't
+ store the result to st(0) as st(0) gets popped on this
+ instruction. Instead store to operands[2] (which I
+ think has to be st(1)). st(1) will be popped later.
+ gcc <= 2.8.1 didn't have this check and generated
+ assembly code that the Unixware assembler rejected. */
+ p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
+ else
+ p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
+ break;
}
+
+ if (STACK_TOP_P (operands[0]))
+ p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
else
+ p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
+ break;
+
+ case MINUS:
+ case DIV:
+ if (MEM_P (operands[1]))
{
- /* This symbol must be referenced via a load
- from the Global Offset Table (@GOT). */
- new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
- new_rtx = gen_rtx_CONST (Pmode, new_rtx);
- if (TARGET_64BIT)
- new_rtx = force_reg (Pmode, new_rtx);
- new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
- new_rtx = gen_const_mem (Pmode, new_rtx);
- set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
+ p = "r%Z1\t%1";
+ break;
}
- new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
- }
- else
- {
- if (CONST_INT_P (addr)
- && !x86_64_immediate_operand (addr, VOIDmode))
- new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
- else if (GET_CODE (addr) == CONST)
+ if (MEM_P (operands[2]))
{
- addr = XEXP (addr, 0);
-
- /* We must match stuff we generate before. Assume the only
- unspecs that can get here are ours. Not that we could do
- anything with them anyway.... */
- if (GET_CODE (addr) == UNSPEC
- || (GET_CODE (addr) == PLUS
- && GET_CODE (XEXP (addr, 0)) == UNSPEC))
- return orig;
- gcc_assert (GET_CODE (addr) == PLUS);
+ p = "%Z2\t%2";
+ break;
}
- if (GET_CODE (addr) == PLUS)
+ if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
{
- rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
-
- /* Check first to see if this is a constant
- offset from a @GOTOFF symbol reference. */
- if (!TARGET_PECOFF
- && gotoff_operand (op0, Pmode)
- && CONST_INT_P (op1))
- {
- if (!TARGET_64BIT)
- {
- new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
- UNSPEC_GOTOFF);
- new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
- new_rtx = gen_rtx_CONST (Pmode, new_rtx);
+#if SYSV386_COMPAT
+ /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
+ derived assemblers, confusingly reverse the direction of
+ the operation for fsub{r} and fdiv{r} when the
+ destination register is not st(0). The Intel assembler
+ doesn't have this brain damage. Read !SYSV386_COMPAT to
+ figure out what the hardware really does. */
+ if (STACK_TOP_P (operands[0]))
+ p = "{p\t%0, %2|rp\t%2, %0}";
+ else
+ p = "{rp\t%2, %0|p\t%0, %2}";
+#else
+ if (STACK_TOP_P (operands[0]))
+ /* As above for fmul/fadd, we can't store to st(0). */
+ p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
+ else
+ p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
+#endif
+ break;
+ }
- if (reg != 0)
- {
- gcc_assert (REG_P (reg));
- new_rtx = expand_simple_binop (Pmode, PLUS,
- pic_offset_table_rtx,
- new_rtx, reg, 1,
- OPTAB_DIRECT);
- }
- else
- new_rtx
- = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
- }
- else
- {
- if (INTVAL (op1) < -16*1024*1024
- || INTVAL (op1) >= 16*1024*1024)
- {
- if (!x86_64_immediate_operand (op1, Pmode))
- op1 = force_reg (Pmode, op1);
+ if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
+ {
+#if SYSV386_COMPAT
+ if (STACK_TOP_P (operands[0]))
+ p = "{rp\t%0, %1|p\t%1, %0}";
+ else
+ p = "{p\t%1, %0|rp\t%0, %1}";
+#else
+ if (STACK_TOP_P (operands[0]))
+ p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
+ else
+ p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
+#endif
+ break;
+ }
- new_rtx
- = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
- }
- }
- }
+ if (STACK_TOP_P (operands[0]))
+ {
+ if (STACK_TOP_P (operands[1]))
+ p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
else
- {
- rtx base = legitimize_pic_address (op0, reg);
- machine_mode mode = GET_MODE (base);
- new_rtx
- = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
+ p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
+ break;
+ }
+ else if (STACK_TOP_P (operands[1]))
+ {
+#if SYSV386_COMPAT
+ p = "{\t%1, %0|r\t%0, %1}";
+#else
+ p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
+#endif
+ }
+ else
+ {
+#if SYSV386_COMPAT
+ p = "{r\t%2, %0|\t%0, %2}";
+#else
+ p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
+#endif
+ }
+ break;
- if (CONST_INT_P (new_rtx))
- {
- if (INTVAL (new_rtx) < -16*1024*1024
- || INTVAL (new_rtx) >= 16*1024*1024)
- {
- if (!x86_64_immediate_operand (new_rtx, mode))
- new_rtx = force_reg (mode, new_rtx);
-
- new_rtx
- = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
- }
- else
- new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
- }
- else
- {
- /* For %rip addressing, we have to use
- just disp32, not base nor index. */
- if (TARGET_64BIT
- && (GET_CODE (base) == SYMBOL_REF
- || GET_CODE (base) == LABEL_REF))
- base = force_reg (mode, base);
- if (GET_CODE (new_rtx) == PLUS
- && CONSTANT_P (XEXP (new_rtx, 1)))
- {
- base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
- new_rtx = XEXP (new_rtx, 1);
- }
- new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
- }
- }
- }
- }
- return new_rtx;
-}
-\f
-/* Load the thread pointer. If TO_REG is true, force it into a register. */
-
-static rtx
-get_thread_pointer (machine_mode tp_mode, bool to_reg)
-{
- rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
-
- if (GET_MODE (tp) != tp_mode)
- {
- gcc_assert (GET_MODE (tp) == SImode);
- gcc_assert (tp_mode == DImode);
-
- tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
+ default:
+ gcc_unreachable ();
}
- if (to_reg)
- tp = copy_to_mode_reg (tp_mode, tp);
-
- return tp;
+ strcat (buf, p);
+ return buf;
}
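+/* As an illustration, for an SSE DFmode PLUS the routine above builds
+ the template "%vaddsd\t{%2, %1, %0|%0, %1, %2}" when AVX is enabled
+ (the %v operand modifier prints the "v" prefix), while an x87 PLUS
+ with a memory operands[2] yields "fadd%Z2\t%2". */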
-/* Construct the SYMBOL_REF for the tls_get_addr function. */
-
-static GTY(()) rtx ix86_tls_symbol;
+/* Return needed mode for entity in optimize_mode_switching pass. */
-static rtx
-ix86_tls_get_addr (void)
+static int
+ix86_dirflag_mode_needed (rtx_insn *insn)
{
- if (!ix86_tls_symbol)
+ if (CALL_P (insn))
{
- const char *sym
- = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
- ? "___tls_get_addr" : "__tls_get_addr");
-
- ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
+ if (cfun->machine->func_type == TYPE_NORMAL)
+ return X86_DIRFLAG_ANY;
+ else
+ /* No need to emit CLD in interrupt handler for TARGET_CLD. */
+ return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
}
- if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
+ if (recog_memoized (insn) < 0)
+ return X86_DIRFLAG_ANY;
+
+ if (get_attr_type (insn) == TYPE_STR)
{
- rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
- UNSPEC_PLTOFF);
- return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
- gen_rtx_CONST (Pmode, unspec));
+ /* Emit cld instruction if stringops are used in the function. */
+ if (cfun->machine->func_type == TYPE_NORMAL)
+ return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
+ else
+ return X86_DIRFLAG_RESET;
}
- return ix86_tls_symbol;
+ return X86_DIRFLAG_ANY;
}
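+/* For example, a memset expanded inline as "rep stos" has insn type STR.
+ In a normal function the ABI guarantees that DF is clear on entry, so
+ without -mcld nothing needs to be emitted; with -mcld, or inside an
+ interrupt handler where the incoming DF state is unknown, the mode
+ switching pass inserts a cld in front of the string instruction. */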
-/* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
-
-static GTY(()) rtx ix86_tls_module_base_symbol;
+/* Check if a 256bit or 512bit AVX register is referenced inside EXP. */
-rtx
-ix86_tls_module_base (void)
+static bool
+ix86_check_avx_upper_register (const_rtx exp)
{
- if (!ix86_tls_module_base_symbol)
- {
- ix86_tls_module_base_symbol
- = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
-
- SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
- |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
- }
-
- return ix86_tls_module_base_symbol;
+ return SSE_REG_P (exp) && GET_MODE_BITSIZE (GET_MODE (exp)) > 128;
}
-/* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
- false if we expect this to be used for a memory address and true if
- we expect to load the address into a register. */
+/* Return needed mode for entity in optimize_mode_switching pass. */
-static rtx
-legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
+static int
+ix86_avx_u128_mode_needed (rtx_insn *insn)
{
- rtx dest, base, off;
- rtx pic = NULL_RTX, tp = NULL_RTX;
- machine_mode tp_mode = Pmode;
- int type;
-
- /* Fall back to global dynamic model if tool chain cannot support local
- dynamic. */
- if (TARGET_SUN_TLS && !TARGET_64BIT
- && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
- && model == TLS_MODEL_LOCAL_DYNAMIC)
- model = TLS_MODEL_GLOBAL_DYNAMIC;
-
- switch (model)
+ if (CALL_P (insn))
{
- case TLS_MODEL_GLOBAL_DYNAMIC:
- dest = gen_reg_rtx (Pmode);
+ rtx link;
- if (!TARGET_64BIT)
+ /* Needed mode is set to AVX_U128_CLEAN if there are
+ no 256bit or 512bit modes used in function arguments. */
+ for (link = CALL_INSN_FUNCTION_USAGE (insn);
+ link;
+ link = XEXP (link, 1))
{
- if (flag_pic && !TARGET_PECOFF)
- pic = pic_offset_table_rtx;
- else
+ if (GET_CODE (XEXP (link, 0)) == USE)
{
- pic = gen_reg_rtx (Pmode);
- emit_insn (gen_set_got (pic));
+ rtx arg = XEXP (XEXP (link, 0), 0);
+
+ if (ix86_check_avx_upper_register (arg))
+ return AVX_U128_DIRTY;
}
}
- if (TARGET_GNU2_TLS)
- {
- if (TARGET_64BIT)
- emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
- else
- emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
+ return AVX_U128_CLEAN;
+ }
- tp = get_thread_pointer (Pmode, true);
- dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
+ /* Require DIRTY mode if a 256bit or 512bit AVX register is referenced.
+ Hardware changes state only when a 256bit register is written to,
+ but we need to prevent the compiler from moving the optimal insertion
+ point above an eventual read from a 256bit or 512bit register. */
+ subrtx_iterator::array_type array;
+ FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
+ if (ix86_check_avx_upper_register (*iter))
+ return AVX_U128_DIRTY;
- if (GET_MODE (x) != Pmode)
- x = gen_rtx_ZERO_EXTEND (Pmode, x);
+ return AVX_U128_ANY;
+}
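+/* For example, a call that passes a __m256 argument is given needed
+ mode AVX_U128_DIRTY, so no vzeroupper is placed in front of it, while
+ a call with only scalar arguments needs AVX_U128_CLEAN and the mode
+ switching pass inserts a vzeroupper before it whenever the upper
+ halves are still dirty at that point. */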
- set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
- }
- else
- {
- rtx caddr = ix86_tls_get_addr ();
+/* Return mode that i387 must be switched into
+ prior to the execution of insn. */
- if (TARGET_64BIT)
- {
- rtx rax = gen_rtx_REG (Pmode, AX_REG);
- rtx_insn *insns;
+static int
+ix86_i387_mode_needed (int entity, rtx_insn *insn)
+{
+ enum attr_i387_cw mode;
- start_sequence ();
- emit_call_insn
- (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
- insns = get_insns ();
- end_sequence ();
+ /* The mode UNINITIALIZED is used to store the control word after a
+ function call or ASM pattern. The mode ANY specifies that the
+ function has no requirements on the control word and makes no
+ changes in the bits we are interested in. */
- if (GET_MODE (x) != Pmode)
- x = gen_rtx_ZERO_EXTEND (Pmode, x);
+ if (CALL_P (insn)
+ || (NONJUMP_INSN_P (insn)
+ && (asm_noperands (PATTERN (insn)) >= 0
+ || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
+ return I387_CW_UNINITIALIZED;
- RTL_CONST_CALL_P (insns) = 1;
- emit_libcall_block (insns, dest, rax, x);
- }
- else
- emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
- }
+ if (recog_memoized (insn) < 0)
+ return I387_CW_ANY;
+
+ mode = get_attr_i387_cw (insn);
+
+ switch (entity)
+ {
+ case I387_TRUNC:
+ if (mode == I387_CW_TRUNC)
+ return mode;
break;
- case TLS_MODEL_LOCAL_DYNAMIC:
- base = gen_reg_rtx (Pmode);
+ case I387_FLOOR:
+ if (mode == I387_CW_FLOOR)
+ return mode;
+ break;
- if (!TARGET_64BIT)
- {
- if (flag_pic)
- pic = pic_offset_table_rtx;
- else
- {
- pic = gen_reg_rtx (Pmode);
- emit_insn (gen_set_got (pic));
- }
- }
+ case I387_CEIL:
+ if (mode == I387_CW_CEIL)
+ return mode;
+ break;
- if (TARGET_GNU2_TLS)
- {
- rtx tmp = ix86_tls_module_base ();
+ default:
+ gcc_unreachable ();
+ }
- if (TARGET_64BIT)
- emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
- else
- emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
+ return I387_CW_ANY;
+}
- tp = get_thread_pointer (Pmode, true);
- set_unique_reg_note (get_last_insn (), REG_EQUAL,
- gen_rtx_MINUS (Pmode, tmp, tp));
- }
- else
- {
- rtx caddr = ix86_tls_get_addr ();
-
- if (TARGET_64BIT)
- {
- rtx rax = gen_rtx_REG (Pmode, AX_REG);
- rtx_insn *insns;
- rtx eqv;
-
- start_sequence ();
- emit_call_insn
- (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
- insns = get_insns ();
- end_sequence ();
-
- /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
- share the LD_BASE result with other LD model accesses. */
- eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
- UNSPEC_TLS_LD_BASE);
-
- RTL_CONST_CALL_P (insns) = 1;
- emit_libcall_block (insns, base, rax, eqv);
- }
- else
- emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
- }
-
- off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
- off = gen_rtx_CONST (Pmode, off);
-
- dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
-
- if (TARGET_GNU2_TLS)
- {
- dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
+/* Return mode that entity must be switched into
+ prior to the execution of insn. */
- if (GET_MODE (x) != Pmode)
- x = gen_rtx_ZERO_EXTEND (Pmode, x);
+static int
+ix86_mode_needed (int entity, rtx_insn *insn)
+{
+ switch (entity)
+ {
+ case X86_DIRFLAG:
+ return ix86_dirflag_mode_needed (insn);
+ case AVX_U128:
+ return ix86_avx_u128_mode_needed (insn);
+ case I387_TRUNC:
+ case I387_FLOOR:
+ case I387_CEIL:
+ return ix86_i387_mode_needed (entity, insn);
+ default:
+ gcc_unreachable ();
+ }
+ return 0;
+}
- set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
- }
- break;
+/* Check if a 256bit or 512bit AVX register is referenced in stores. */
+
+static void
+ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data)
+ {
+ if (ix86_check_avx_upper_register (dest))
+ {
+ bool *used = (bool *) data;
+ *used = true;
+ }
+ }
- case TLS_MODEL_INITIAL_EXEC:
- if (TARGET_64BIT)
- {
- if (TARGET_SUN_TLS && !TARGET_X32)
- {
- /* The Sun linker took the AMD64 TLS spec literally
- and can only handle %rax as destination of the
- initial executable code sequence. */
+/* Calculate mode of upper 128bit AVX registers after the insn. */
- dest = gen_reg_rtx (DImode);
- emit_insn (gen_tls_initial_exec_64_sun (dest, x));
- return dest;
- }
+static int
+ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
+{
+ rtx pat = PATTERN (insn);
- /* Generate DImode references to avoid %fs:(%reg32)
- problems and linker IE->LE relaxation bug. */
- tp_mode = DImode;
- pic = NULL;
- type = UNSPEC_GOTNTPOFF;
- }
- else if (flag_pic)
- {
- pic = pic_offset_table_rtx;
- type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
- }
- else if (!TARGET_ANY_GNU_TLS)
- {
- pic = gen_reg_rtx (Pmode);
- emit_insn (gen_set_got (pic));
- type = UNSPEC_GOTTPOFF;
- }
- else
- {
- pic = NULL;
- type = UNSPEC_INDNTPOFF;
- }
+ if (vzeroupper_pattern (pat, VOIDmode)
+ || vzeroall_pattern (pat, VOIDmode))
+ return AVX_U128_CLEAN;
- off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
- off = gen_rtx_CONST (tp_mode, off);
- if (pic)
- off = gen_rtx_PLUS (tp_mode, pic, off);
- off = gen_const_mem (tp_mode, off);
- set_mem_alias_set (off, ix86_GOT_alias_set ());
+ /* We know that the state is clean after a CALL insn if no 256bit
+ or 512bit register is used for the function return value. */
+ if (CALL_P (insn))
+ {
+ bool avx_upper_reg_found = false;
+ note_stores (pat, ix86_check_avx_upper_stores, &avx_upper_reg_found);
- if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
- {
- base = get_thread_pointer (tp_mode,
- for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
- off = force_reg (tp_mode, off);
- dest = gen_rtx_PLUS (tp_mode, base, off);
- if (tp_mode != Pmode)
- dest = convert_to_mode (Pmode, dest, 1);
- }
- else
- {
- base = get_thread_pointer (Pmode, true);
- dest = gen_reg_rtx (Pmode);
- emit_insn (ix86_gen_sub3 (dest, base, off));
- }
- break;
+ return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
+ }
- case TLS_MODEL_LOCAL_EXEC:
- off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
- (TARGET_64BIT || TARGET_ANY_GNU_TLS)
- ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
- off = gen_rtx_CONST (Pmode, off);
+ /* Otherwise, return current mode. Remember that if insn
+ references AVX 256bit or 512bit registers, the mode was already
+ changed to DIRTY from MODE_NEEDED. */
+ return mode;
+}
- if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
- {
- base = get_thread_pointer (Pmode,
- for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
- return gen_rtx_PLUS (Pmode, base, off);
- }
- else
- {
- base = get_thread_pointer (Pmode, true);
- dest = gen_reg_rtx (Pmode);
- emit_insn (ix86_gen_sub3 (dest, base, off));
- }
- break;
+/* Return the mode that an insn results in. */
+static int
+ix86_mode_after (int entity, int mode, rtx_insn *insn)
+{
+ switch (entity)
+ {
+ case X86_DIRFLAG:
+ return mode;
+ case AVX_U128:
+ return ix86_avx_u128_mode_after (mode, insn);
+ case I387_TRUNC:
+ case I387_FLOOR:
+ case I387_CEIL:
+ return mode;
default:
gcc_unreachable ();
}
+}
- return dest;
+static int
+ix86_dirflag_mode_entry (void)
+{
+ /* For TARGET_CLD or in the interrupt handler we can't assume
+ direction flag state at function entry. */
+ if (TARGET_CLD
+ || cfun->machine->func_type != TYPE_NORMAL)
+ return X86_DIRFLAG_ANY;
+
+ return X86_DIRFLAG_RESET;
}
-/* Return true if OP refers to a TLS address. */
-bool
-ix86_tls_address_pattern_p (rtx op)
+static int
+ix86_avx_u128_mode_entry (void)
{
- subrtx_var_iterator::array_type array;
- FOR_EACH_SUBRTX_VAR (iter, array, op, ALL)
+ tree arg;
+
+ /* Entry mode is set to AVX_U128_DIRTY if there are
+ 256bit or 512bit modes used in function arguments. */
+ for (arg = DECL_ARGUMENTS (current_function_decl); arg;
+ arg = TREE_CHAIN (arg))
{
- rtx op = *iter;
- if (MEM_P (op))
- {
- rtx *x = &XEXP (op, 0);
- while (GET_CODE (*x) == PLUS)
- {
- int i;
- for (i = 0; i < 2; i++)
- {
- rtx u = XEXP (*x, i);
- if (GET_CODE (u) == ZERO_EXTEND)
- u = XEXP (u, 0);
- if (GET_CODE (u) == UNSPEC
- && XINT (u, 1) == UNSPEC_TP)
- return true;
- }
- x = &XEXP (*x, 0);
- }
+ rtx incoming = DECL_INCOMING_RTL (arg);
- iter.skip_subrtxes ();
- }
+ if (incoming && ix86_check_avx_upper_register (incoming))
+ return AVX_U128_DIRTY;
}
- return false;
+ return AVX_U128_CLEAN;
}
-/* Rewrite *LOC so that it refers to a default TLS address space. */
-void
-ix86_rewrite_tls_address_1 (rtx *loc)
+/* Return a mode that ENTITY is assumed to be
+ switched to at function entry. */
+
+static int
+ix86_mode_entry (int entity)
{
- subrtx_ptr_iterator::array_type array;
- FOR_EACH_SUBRTX_PTR (iter, array, loc, ALL)
+ switch (entity)
{
- rtx *loc = *iter;
- if (MEM_P (*loc))
- {
- rtx addr = XEXP (*loc, 0);
- rtx *x = &addr;
- while (GET_CODE (*x) == PLUS)
- {
- int i;
- for (i = 0; i < 2; i++)
- {
- rtx u = XEXP (*x, i);
- if (GET_CODE (u) == ZERO_EXTEND)
- u = XEXP (u, 0);
- if (GET_CODE (u) == UNSPEC
- && XINT (u, 1) == UNSPEC_TP)
- {
- addr_space_t as = DEFAULT_TLS_SEG_REG;
+ case X86_DIRFLAG:
+ return ix86_dirflag_mode_entry ();
+ case AVX_U128:
+ return ix86_avx_u128_mode_entry ();
+ case I387_TRUNC:
+ case I387_FLOOR:
+ case I387_CEIL:
+ return I387_CW_ANY;
+ default:
+ gcc_unreachable ();
+ }
+}
- *x = XEXP (*x, 1 - i);
+static int
+ix86_avx_u128_mode_exit (void)
+{
+ rtx reg = crtl->return_rtx;
- *loc = replace_equiv_address_nv (*loc, addr, true);
- set_mem_addr_space (*loc, as);
- return;
- }
- }
- x = &XEXP (*x, 0);
- }
+ /* Exit mode is set to AVX_U128_DIRTY if a 256bit or 512bit mode
+ is used in the function return register. */
+ if (reg && ix86_check_avx_upper_register (reg))
+ return AVX_U128_DIRTY;
- iter.skip_subrtxes ();
- }
+ /* Exit mode is set to AVX_U128_DIRTY if there are 256bit or 512bit
+ modes used in function arguments, otherwise return AVX_U128_CLEAN. */
+ return ix86_avx_u128_mode_entry ();
+}
+
+/* Return a mode that ENTITY is assumed to be
+ switched to at function exit. */
+
+static int
+ix86_mode_exit (int entity)
+{
+ switch (entity)
+ {
+ case X86_DIRFLAG:
+ return X86_DIRFLAG_ANY;
+ case AVX_U128:
+ return ix86_avx_u128_mode_exit ();
+ case I387_TRUNC:
+ case I387_FLOOR:
+ case I387_CEIL:
+ return I387_CW_ANY;
+ default:
+ gcc_unreachable ();
}
}
-/* Rewrite instruction pattern involvning TLS address
- so that it refers to a default TLS address space. */
-rtx
-ix86_rewrite_tls_address (rtx pattern)
+static int
+ix86_mode_priority (int, int n)
{
- pattern = copy_insn (pattern);
- ix86_rewrite_tls_address_1 (&pattern);
- return pattern;
+ return n;
}
-/* Create or return the unique __imp_DECL dllimport symbol corresponding
- to symbol DECL if BEIMPORT is true. Otherwise create or return the
- unique refptr-DECL symbol corresponding to symbol DECL. */
+/* Output code to initialize the control word copies used by the trunc?f?i
+ and rounding patterns. The current control word is saved to a stack
+ slot, and MODE selects which modified copy (trunc, floor or ceil) to
+ initialize. */
-struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
+static void
+emit_i387_cw_initialization (int mode)
{
- static inline hashval_t hash (tree_map *m) { return m->hash; }
- static inline bool
- equal (tree_map *a, tree_map *b)
- {
- return a->base.from == b->base.from;
- }
+ rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
+ rtx new_mode;
- static int
- keep_cache_entry (tree_map *&m)
- {
- return ggc_marked_p (m->base.from);
- }
-};
+ enum ix86_stack_slot slot;
-static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
+ rtx reg = gen_reg_rtx (HImode);
-static tree
-get_dllimport_decl (tree decl, bool beimport)
-{
- struct tree_map *h, in;
- const char *name;
- const char *prefix;
- size_t namelen, prefixlen;
- char *imp_name;
- tree to;
- rtx rtl;
+ emit_insn (gen_x86_fnstcw_1 (stored_mode));
+ emit_move_insn (reg, copy_rtx (stored_mode));
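+ /* Bits 10-11 of the x87 control word form the rounding-control
+ field: 00 = round to nearest, 01 = round down, 10 = round up,
+ 11 = truncate. The masks below clear and set that field. */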
- if (!dllimport_map)
- dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
+ switch (mode)
+ {
+ case I387_CW_TRUNC:
+ /* round toward zero (truncate) */
+ emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
+ slot = SLOT_CW_TRUNC;
+ break;
- in.hash = htab_hash_pointer (decl);
- in.base.from = decl;
- tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
- h = *loc;
- if (h)
- return h->to;
+ case I387_CW_FLOOR:
+ /* round down toward -oo */
+ emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
+ emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
+ slot = SLOT_CW_FLOOR;
+ break;
- *loc = h = ggc_alloc<tree_map> ();
- h->hash = in.hash;
- h->base.from = decl;
- h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
- VAR_DECL, NULL, ptr_type_node);
- DECL_ARTIFICIAL (to) = 1;
- DECL_IGNORED_P (to) = 1;
- DECL_EXTERNAL (to) = 1;
- TREE_READONLY (to) = 1;
+ case I387_CW_CEIL:
+ /* round up toward +oo */
+ emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
+ emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
+ slot = SLOT_CW_CEIL;
+ break;
- name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
- name = targetm.strip_name_encoding (name);
- if (beimport)
- prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
- ? "*__imp_" : "*__imp__";
- else
- prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
- namelen = strlen (name);
- prefixlen = strlen (prefix);
- imp_name = (char *) alloca (namelen + prefixlen + 1);
- memcpy (imp_name, prefix, prefixlen);
- memcpy (imp_name + prefixlen, name, namelen + 1);
+ default:
+ gcc_unreachable ();
+ }
- name = ggc_alloc_string (imp_name, namelen + prefixlen);
- rtl = gen_rtx_SYMBOL_REF (Pmode, name);
- SET_SYMBOL_REF_DECL (rtl, to);
- SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
- if (!beimport)
- {
- SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
-#ifdef SUB_TARGET_RECORD_STUB
- SUB_TARGET_RECORD_STUB (name);
-#endif
- }
+ gcc_assert (slot < MAX_386_STACK_LOCALS);
- rtl = gen_const_mem (Pmode, rtl);
- set_mem_alias_set (rtl, ix86_GOT_alias_set ());
+ new_mode = assign_386_stack_local (HImode, slot);
+ emit_move_insn (new_mode, reg);
+}
- SET_DECL_RTL (to, rtl);
- SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
+/* Generate one or more insns to set ENTITY to MODE. */
- return to;
+static void
+ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
+ HARD_REG_SET regs_live ATTRIBUTE_UNUSED)
+{
+ switch (entity)
+ {
+ case X86_DIRFLAG:
+ if (mode == X86_DIRFLAG_RESET)
+ emit_insn (gen_cld ());
+ break;
+ case AVX_U128:
+ if (mode == AVX_U128_CLEAN)
+ emit_insn (gen_avx_vzeroupper ());
+ break;
+ case I387_TRUNC:
+ case I387_FLOOR:
+ case I387_CEIL:
+ if (mode != I387_CW_ANY
+ && mode != I387_CW_UNINITIALIZED)
+ emit_i387_cw_initialization (mode);
+ break;
+ default:
+ gcc_unreachable ();
+ }
}
-/* Expand SYMBOL into its corresponding far-address symbol.
- WANT_REG is true if we require the result be a register. */
+/* Output code for INSN to convert a float to a signed int. OPERANDS
+ are the insn operands. The output may be [HSD]Imode and the input
+ operand may be [SDX]Fmode. */
-static rtx
-legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
+const char *
+output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
{
- tree imp_decl;
- rtx x;
+ bool stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
+ bool dimode_p = GET_MODE (operands[0]) == DImode;
+ int round_mode = get_attr_i387_cw (insn);
- gcc_assert (SYMBOL_REF_DECL (symbol));
- imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
+ static char buf[40];
+ const char *p;
- x = DECL_RTL (imp_decl);
- if (want_reg)
- x = force_reg (Pmode, x);
- return x;
-}
+ /* Jump through a hoop or two for DImode, since the hardware has no
+ non-popping instruction. We used to do this a different way, but
+ that was somewhat fragile and broke with post-reload splitters. */
+ if ((dimode_p || fisttp) && !stack_top_dies)
+ output_asm_insn ("fld\t%y1", operands);
-/* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
- true if we require the result be a register. */
+ gcc_assert (STACK_TOP_P (operands[1]));
+ gcc_assert (MEM_P (operands[0]));
+ gcc_assert (GET_MODE (operands[1]) != TFmode);
-static rtx
-legitimize_dllimport_symbol (rtx symbol, bool want_reg)
-{
- tree imp_decl;
- rtx x;
+ if (fisttp)
+ return "fisttp%Z0\t%0";
- gcc_assert (SYMBOL_REF_DECL (symbol));
- imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
+ strcpy (buf, "fist");
- x = DECL_RTL (imp_decl);
- if (want_reg)
- x = force_reg (Pmode, x);
- return x;
-}
+ if (round_mode != I387_CW_ANY)
+ output_asm_insn ("fldcw\t%3", operands);
-/* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG
- is true if we require the result be a register. */
+ p = "p%Z0\t%0";
+ strcat (buf, p + !(stack_top_dies || dimode_p));
-static rtx
-legitimize_pe_coff_symbol (rtx addr, bool inreg)
-{
- if (!TARGET_PECOFF)
- return NULL_RTX;
+ output_asm_insn (buf, operands);
- if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
- {
- if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
- return legitimize_dllimport_symbol (addr, inreg);
- if (GET_CODE (addr) == CONST
- && GET_CODE (XEXP (addr, 0)) == PLUS
- && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
- && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
- {
- rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
- return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
- }
- }
+ if (round_mode != I387_CW_ANY)
+ output_asm_insn ("fldcw\t%2", operands);
- if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
- return NULL_RTX;
- if (GET_CODE (addr) == SYMBOL_REF
- && !is_imported_p (addr)
- && SYMBOL_REF_EXTERNAL_P (addr)
- && SYMBOL_REF_DECL (addr))
- return legitimize_pe_coff_extern_decl (addr, inreg);
+ return "";
+}
- if (GET_CODE (addr) == CONST
- && GET_CODE (XEXP (addr, 0)) == PLUS
- && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
- && !is_imported_p (XEXP (XEXP (addr, 0), 0))
- && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
- && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
+/* Output code for x87 ffreep insn. The OPNO argument, which may only
+ have the values zero or one, indicates the ffreep insn's operand
+ from the OPERANDS array. */
+
+static const char *
+output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
+{
+ if (TARGET_USE_FFREEP)
+#ifdef HAVE_AS_IX86_FFREEP
+ return opno ? "ffreep\t%y1" : "ffreep\t%y0";
+#else
{
- rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
- return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
+ static char retval[32];
+ int regno = REGNO (operands[opno]);
+
+ gcc_assert (STACK_REGNO_P (regno));
+
+ regno -= FIRST_STACK_REG;
+
+ snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
+ return retval;
}
- return NULL_RTX;
+#endif
+
+ return opno ? "fstp\t%y1" : "fstp\t%y0";
}
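+/* As an illustration, for %st(1) the fallback above emits ASM_SHORT
+ "0xc1df"; stored little-endian this is the byte sequence 0xdf 0xc1,
+ the encoding of "ffreep %st(1)", for assemblers that do not know the
+ mnemonic. */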
-/* Try machine-dependent ways of modifying an illegitimate address
- to be legitimate. If we find one, return the new, valid address.
- This macro is used in only one place: `memory_address' in explow.c.
- OLDX is the address as it was before break_out_memory_refs was called.
- In some cases it is useful to look at this to decide what needs to be done.
+/* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
+ should be used. UNORDERED_P is true when fucom should be used. */
- It is always safe for this macro to do nothing. It exists to recognize
- opportunities to optimize the output.
+const char *
+output_fp_compare (rtx_insn *insn, rtx *operands,
+ bool eflags_p, bool unordered_p)
+{
+ rtx *xops = eflags_p ? &operands[0] : &operands[1];
+ bool stack_top_dies;
- For the 80386, we handle X+REG by loading X into a register R and
- using R+REG. R will go in a general reg and indexing will be used.
- However, if REG is a broken-out memory address or multiplication,
- nothing needs to be done because REG can certainly go in a general reg.
+ static char buf[40];
+ const char *p;
- When -fpic is used, special handling is needed for symbolic references.
- See comments by legitimize_pic_address in i386.c for details. */
+ gcc_assert (STACK_TOP_P (xops[0]));
-static rtx
-ix86_legitimize_address (rtx x, rtx, machine_mode mode)
-{
- bool changed = false;
- unsigned log;
+ stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
- log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
- if (log)
- return legitimize_tls_address (x, (enum tls_model) log, false);
- if (GET_CODE (x) == CONST
- && GET_CODE (XEXP (x, 0)) == PLUS
- && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
- && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
+ if (eflags_p)
{
- rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
- (enum tls_model) log, false);
- return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
- }
+ p = unordered_p ? "fucomi" : "fcomi";
+ strcpy (buf, p);
- if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
- {
- rtx tmp = legitimize_pe_coff_symbol (x, true);
- if (tmp)
- return tmp;
- }
+ p = "p\t{%y1, %0|%0, %y1}";
+ strcat (buf, p + !stack_top_dies);
- if (flag_pic && SYMBOLIC_CONST (x))
- return legitimize_pic_address (x, 0);
+ return buf;
+ }
-#if TARGET_MACHO
- if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
- return machopic_indirect_data_reference (x, 0);
-#endif
+ if (STACK_REG_P (xops[1])
+ && stack_top_dies
+ && find_regno_note (insn, REG_DEAD, FIRST_STACK_REG + 1))
+ {
+ gcc_assert (REGNO (xops[1]) == FIRST_STACK_REG + 1);
- /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
- if (GET_CODE (x) == ASHIFT
- && CONST_INT_P (XEXP (x, 1))
- && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
+ /* If the top of the 387 stack dies and the other operand is also a
+ stack register that dies, then this must be a `fcompp' float
+ compare. */
+ p = unordered_p ? "fucompp" : "fcompp";
+ strcpy (buf, p);
+ }
+ else if (const0_operand (xops[1], VOIDmode))
{
- changed = true;
- log = INTVAL (XEXP (x, 1));
- x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
- GEN_INT (1 << log));
+ gcc_assert (!unordered_p);
+ strcpy (buf, "ftst");
}
-
- if (GET_CODE (x) == PLUS)
+ else
{
- /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
-
- if (GET_CODE (XEXP (x, 0)) == ASHIFT
- && CONST_INT_P (XEXP (XEXP (x, 0), 1))
- && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
- {
- changed = true;
- log = INTVAL (XEXP (XEXP (x, 0), 1));
- XEXP (x, 0) = gen_rtx_MULT (Pmode,
- force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
- GEN_INT (1 << log));
- }
-
- if (GET_CODE (XEXP (x, 1)) == ASHIFT
- && CONST_INT_P (XEXP (XEXP (x, 1), 1))
- && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
+ if (GET_MODE_CLASS (GET_MODE (xops[1])) == MODE_INT)
{
- changed = true;
- log = INTVAL (XEXP (XEXP (x, 1), 1));
- XEXP (x, 1) = gen_rtx_MULT (Pmode,
- force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
- GEN_INT (1 << log));
+ gcc_assert (!unordered_p);
+ p = "ficom";
}
+ else
+ p = unordered_p ? "fucom" : "fcom";
- /* Put multiply first if it isn't already. */
- if (GET_CODE (XEXP (x, 1)) == MULT)
- {
- std::swap (XEXP (x, 0), XEXP (x, 1));
- changed = true;
- }
+ strcpy (buf, p);
- /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
- into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
- created by virtual register instantiation, register elimination, and
- similar optimizations. */
- if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
- {
- changed = true;
- x = gen_rtx_PLUS (Pmode,
- gen_rtx_PLUS (Pmode, XEXP (x, 0),
- XEXP (XEXP (x, 1), 0)),
- XEXP (XEXP (x, 1), 1));
- }
+ p = "p%Z2\t%y2";
+ strcat (buf, p + !stack_top_dies);
+ }
- /* Canonicalize
- (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
- into (plus (plus (mult (reg) (const)) (reg)) (const)). */
- else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
- && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
- && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
- && CONSTANT_P (XEXP (x, 1)))
- {
- rtx constant;
- rtx other = NULL_RTX;
+ output_asm_insn (buf, operands);
+ return "fnstsw\t%0";
+}
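+/* As an illustration, a fucomi compare where the top of the stack dies
+ returns the template "fucomip\t{%y1, %0|%0, %y1}", while the
+ non-eflags paths output a fucom/fcom/ftst form here and return
+ "fnstsw\t%0" so the caller can test the status word. */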
- if (CONST_INT_P (XEXP (x, 1)))
- {
- constant = XEXP (x, 1);
- other = XEXP (XEXP (XEXP (x, 0), 1), 1);
- }
- else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
- {
- constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
- other = XEXP (x, 1);
- }
- else
- constant = 0;
+void
+ix86_output_addr_vec_elt (FILE *file, int value)
+{
+ const char *directive = ASM_LONG;
- if (constant)
- {
- changed = true;
- x = gen_rtx_PLUS (Pmode,
- gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
- XEXP (XEXP (XEXP (x, 0), 1), 0)),
- plus_constant (Pmode, other,
- INTVAL (constant)));
- }
- }
+#ifdef ASM_QUAD
+ if (TARGET_LP64)
+ directive = ASM_QUAD;
+#else
+ gcc_assert (!TARGET_64BIT);
+#endif
- if (changed && ix86_legitimate_address_p (mode, x, false))
- return x;
+ fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
+}
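+/* For example, with VALUE 42 this emits ".quad .L42" for an LP64 ELF
+ target and ".long .L42" otherwise, assuming the usual ".L" local
+ label prefix. */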
- if (GET_CODE (XEXP (x, 0)) == MULT)
- {
- changed = true;
- XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
- }
+void
+ix86_output_addr_diff_elt (FILE *file, int value, int rel)
+{
+ const char *directive = ASM_LONG;
- if (GET_CODE (XEXP (x, 1)) == MULT)
- {
- changed = true;
- XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
- }
+#ifdef ASM_QUAD
+ if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
+ directive = ASM_QUAD;
+#else
+ gcc_assert (!TARGET_64BIT);
+#endif
+ /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
+ if (TARGET_64BIT || TARGET_VXWORKS_RTP)
+ fprintf (file, "%s%s%d-%s%d\n",
+ directive, LPREFIX, value, LPREFIX, rel);
+#if TARGET_MACHO
+ else if (TARGET_MACHO)
+ {
+ fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
+ machopic_output_function_base_name (file);
+ putc ('\n', file);
+ }
+#endif
+ else if (HAVE_AS_GOTOFF_IN_DATA)
+ fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
+ else
+ asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
+ GOT_SYMBOL_NAME, LPREFIX, value);
+}
+\f
+#define LEA_MAX_STALL (3)
+#define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
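+/* The distances below are measured in half-cycles, so a search
+ threshold of 6 half-cycles corresponds to the 3-cycle maximum AGU
+ stall modelled by LEA_MAX_STALL. */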
- if (changed
- && REG_P (XEXP (x, 1))
- && REG_P (XEXP (x, 0)))
- return x;
+/* Increase the given DISTANCE in half-cycles according to
+ dependencies between the PREV and NEXT instructions.
+ Add 1 half-cycle if there is no dependency and
+ go to the next cycle if there is some dependency. */
- if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
- {
- changed = true;
- x = legitimize_pic_address (x, 0);
- }
+static unsigned int
+increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
+{
+ df_ref def, use;
- if (changed && ix86_legitimate_address_p (mode, x, false))
- return x;
+ if (!prev || !next)
+ return distance + (distance & 1) + 2;
- if (REG_P (XEXP (x, 0)))
- {
- rtx temp = gen_reg_rtx (Pmode);
- rtx val = force_operand (XEXP (x, 1), temp);
- if (val != temp)
- {
- val = convert_to_mode (Pmode, val, 1);
- emit_move_insn (temp, val);
- }
+ if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
+ return distance + 1;
- XEXP (x, 1) = temp;
- return x;
- }
+ FOR_EACH_INSN_USE (use, next)
+ FOR_EACH_INSN_DEF (def, prev)
+ if (!DF_REF_IS_ARTIFICIAL (def)
+ && DF_REF_REGNO (use) == DF_REF_REGNO (def))
+ return distance + (distance & 1) + 2;
- else if (REG_P (XEXP (x, 1)))
+ return distance + 1;
+}
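+/* For example, with a dependency between PREV and NEXT an odd DISTANCE
+ of 3 becomes 3 + 1 + 2 = 6: it is rounded up to a full cycle and a
+ whole extra cycle is added, whereas without a dependency only one
+ half-cycle (DISTANCE + 1) is charged. */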
+
+/* Function checks if instruction INSN defines register number
+ REGNO1 or REGNO2. */
+
+bool
+insn_defines_reg (unsigned int regno1, unsigned int regno2,
+ rtx_insn *insn)
+{
+ df_ref def;
+
+ FOR_EACH_INSN_DEF (def, insn)
+ if (DF_REF_REG_DEF_P (def)
+ && !DF_REF_IS_ARTIFICIAL (def)
+ && (regno1 == DF_REF_REGNO (def)
+ || regno2 == DF_REF_REGNO (def)))
+ return true;
+
+ return false;
+}
+
+/* Function checks if instruction INSN uses register number
+ REGNO as a part of address expression. */
+
+static bool
+insn_uses_reg_mem (unsigned int regno, rtx insn)
+{
+ df_ref use;
+
+ FOR_EACH_INSN_USE (use, insn)
+ if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
+ return true;
+
+ return false;
+}
+
+/* Search backward for a non-agu definition of register number REGNO1
+ or register number REGNO2 in the basic block, starting from instruction
+ START up to the head of the basic block or instruction INSN.
+
+ The function puts true into *FOUND if a definition was found
+ and false otherwise.
+
+ The distance in half-cycles between START and the found instruction
+ or the head of the BB is added to DISTANCE and returned. */
+
+static int
+distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
+ rtx_insn *insn, int distance,
+ rtx_insn *start, bool *found)
+{
+ basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
+ rtx_insn *prev = start;
+ rtx_insn *next = NULL;
+
+ *found = false;
+
+ while (prev
+ && prev != insn
+ && distance < LEA_SEARCH_THRESHOLD)
+ {
+ if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
{
- rtx temp = gen_reg_rtx (Pmode);
- rtx val = force_operand (XEXP (x, 0), temp);
- if (val != temp)
+ distance = increase_distance (prev, next, distance);
+ if (insn_defines_reg (regno1, regno2, prev))
{
- val = convert_to_mode (Pmode, val, 1);
- emit_move_insn (temp, val);
+ if (recog_memoized (prev) < 0
+ || get_attr_type (prev) != TYPE_LEA)
+ {
+ *found = true;
+ return distance;
+ }
}
- XEXP (x, 0) = temp;
- return x;
+ next = prev;
}
+ if (prev == BB_HEAD (bb))
+ break;
+
+ prev = PREV_INSN (prev);
}
- return x;
+ return distance;
}
-\f
-/* Print an integer constant expression in assembler syntax. Addition
- and subtraction are the only arithmetic that may appear in these
- expressions. FILE is the stdio stream to write to, X is the rtx, and
- CODE is the operand print code from the output string. */
-static void
-output_pic_addr_const (FILE *file, rtx x, int code)
+/* Search backward for a non-agu definition of register number REGNO1
+ or register number REGNO2 in INSN's basic block until we
+ 1. pass LEA_SEARCH_THRESHOLD instructions, or
+ 2. reach a neighbor BB boundary, or
+ 3. reach an agu definition.
+ Returns the distance between the non-agu definition point and INSN.
+ If there is no definition point, returns -1. */
+
+static int
+distance_non_agu_define (unsigned int regno1, unsigned int regno2,
+ rtx_insn *insn)
{
- char buf[256];
+ basic_block bb = BLOCK_FOR_INSN (insn);
+ int distance = 0;
+ bool found = false;
- switch (GET_CODE (x))
+ if (insn != BB_HEAD (bb))
+ distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
+ distance, PREV_INSN (insn),
+ &found);
+
+ if (!found && distance < LEA_SEARCH_THRESHOLD)
{
- case PC:
- gcc_assert (flag_pic);
- putc ('.', file);
- break;
+ edge e;
+ edge_iterator ei;
+ bool simple_loop = false;
- case SYMBOL_REF:
- if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
- output_addr_const (file, x);
+ FOR_EACH_EDGE (e, ei, bb->preds)
+ if (e->src == bb)
+ {
+ simple_loop = true;
+ break;
+ }
+
+ if (simple_loop)
+ distance = distance_non_agu_define_in_bb (regno1, regno2,
+ insn, distance,
+ BB_END (bb), &found);
else
{
- const char *name = XSTR (x, 0);
+ int shortest_dist = -1;
+ bool found_in_bb = false;
- /* Mark the decl as referenced so that cgraph will
- output the function. */
- if (SYMBOL_REF_DECL (x))
- mark_decl_referenced (SYMBOL_REF_DECL (x));
+ FOR_EACH_EDGE (e, ei, bb->preds)
+ {
+ int bb_dist
+ = distance_non_agu_define_in_bb (regno1, regno2,
+ insn, distance,
+ BB_END (e->src),
+ &found_in_bb);
+ if (found_in_bb)
+ {
+ if (shortest_dist < 0)
+ shortest_dist = bb_dist;
+ else if (bb_dist > 0)
+ shortest_dist = MIN (bb_dist, shortest_dist);
-#if TARGET_MACHO
- if (MACHOPIC_INDIRECT
- && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
- name = machopic_indirection_name (x, /*stub_p=*/true);
-#endif
- assemble_name (file, name);
+ found = true;
+ }
+ }
+
+ distance = shortest_dist;
}
- if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
- && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
- fputs ("@PLT", file);
- break;
+ }
- case LABEL_REF:
- x = XEXP (x, 0);
- /* FALLTHRU */
- case CODE_LABEL:
- ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
- assemble_name (asm_out_file, buf);
- break;
+ /* get_attr_type may modify recog data. We want to make sure
+ that recog data is valid for instruction INSN, on which
+ distance_non_agu_define is called. INSN is unchanged here. */
+ extract_insn_cached (insn);
- case CONST_INT:
- fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
- break;
+ if (!found)
+ return -1;
- case CONST:
- /* This used to output parentheses around the expression,
- but that does not work on the 386 (either ATT or BSD assembler). */
- output_pic_addr_const (file, XEXP (x, 0), code);
- break;
+ return distance >> 1;
+}
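+/* The accumulated distance is kept in half-cycles, so the final ">> 1"
+ above converts it to whole cycles for the caller. */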
- case CONST_DOUBLE:
- /* We can't handle floating point constants;
- TARGET_PRINT_OPERAND must handle them. */
- output_operand_lossage ("floating constant misused");
- break;
+/* Return the distance in half-cycles between INSN and the next
+ insn that uses register number REGNO in a memory address, added
+ to DISTANCE. Return -1 if REGNO0 is set.
- case PLUS:
- /* Some assemblers need integer constants to appear first. */
- if (CONST_INT_P (XEXP (x, 0)))
- {
- output_pic_addr_const (file, XEXP (x, 0), code);
- putc ('+', file);
- output_pic_addr_const (file, XEXP (x, 1), code);
- }
- else
- {
- gcc_assert (CONST_INT_P (XEXP (x, 1)));
- output_pic_addr_const (file, XEXP (x, 1), code);
- putc ('+', file);
- output_pic_addr_const (file, XEXP (x, 0), code);
- }
- break;
+ Set *FOUND to true if a use of the register was found and to
+ false otherwise.
+ Set *REDEFINED to true if a redefinition of the register was
+ found and to false otherwise. */
- case MINUS:
- if (!TARGET_MACHO)
- putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
- output_pic_addr_const (file, XEXP (x, 0), code);
- putc ('-', file);
- output_pic_addr_const (file, XEXP (x, 1), code);
- if (!TARGET_MACHO)
- putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
- break;
+static int
+distance_agu_use_in_bb (unsigned int regno,
+ rtx_insn *insn, int distance, rtx_insn *start,
+ bool *found, bool *redefined)
+{
+ basic_block bb = NULL;
+ rtx_insn *next = start;
+ rtx_insn *prev = NULL;
- case UNSPEC:
- gcc_assert (XVECLEN (x, 0) == 1);
- output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
- switch (XINT (x, 1))
- {
- case UNSPEC_GOT:
- fputs ("@GOT", file);
- break;
- case UNSPEC_GOTOFF:
- fputs ("@GOTOFF", file);
- break;
- case UNSPEC_PLTOFF:
- fputs ("@PLTOFF", file);
- break;
- case UNSPEC_PCREL:
- fputs (ASSEMBLER_DIALECT == ASM_ATT ?
- "(%rip)" : "[rip]", file);
- break;
- case UNSPEC_GOTPCREL:
- fputs (ASSEMBLER_DIALECT == ASM_ATT ?
- "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
- break;
- case UNSPEC_GOTTPOFF:
- /* FIXME: This might be @TPOFF in Sun ld too. */
- fputs ("@gottpoff", file);
- break;
- case UNSPEC_TPOFF:
- fputs ("@tpoff", file);
- break;
- case UNSPEC_NTPOFF:
- if (TARGET_64BIT)
- fputs ("@tpoff", file);
- else
- fputs ("@ntpoff", file);
- break;
- case UNSPEC_DTPOFF:
- fputs ("@dtpoff", file);
- break;
- case UNSPEC_GOTNTPOFF:
- if (TARGET_64BIT)
- fputs (ASSEMBLER_DIALECT == ASM_ATT ?
- "@gottpoff(%rip)": "@gottpoff[rip]", file);
- else
- fputs ("@gotntpoff", file);
- break;
- case UNSPEC_INDNTPOFF:
- fputs ("@indntpoff", file);
- break;
-#if TARGET_MACHO
- case UNSPEC_MACHOPIC_OFFSET:
- putc ('-', file);
- machopic_output_function_base_name (file);
- break;
-#endif
- default:
- output_operand_lossage ("invalid UNSPEC as operand");
- break;
- }
- break;
+ *found = false;
+ *redefined = false;
- default:
- output_operand_lossage ("invalid expression as operand");
+ if (start != NULL_RTX)
+ {
+ bb = BLOCK_FOR_INSN (start);
+ if (start != BB_HEAD (bb))
+ /* If insn and start belong to the same bb, set prev to insn,
+ so the call to increase_distance will increase the distance
+ between insns by 1. */
+ prev = insn;
}
-}
-/* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
- We need to emit DTP-relative relocations. */
-
-static void ATTRIBUTE_UNUSED
-i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
-{
- fputs (ASM_LONG, file);
- output_addr_const (file, x);
- fputs ("@dtpoff", file);
- switch (size)
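+  /* Walk forward from START until we reach INSN, the end of the
+     block, or the search threshold.  */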
+ while (next
+ && next != insn
+ && distance < LEA_SEARCH_THRESHOLD)
{
- case 4:
- break;
- case 8:
- fputs (", 0", file);
- break;
- default:
- gcc_unreachable ();
- }
-}
+ if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
+ {
+ distance = increase_distance(prev, next, distance);
+ if (insn_uses_reg_mem (regno, next))
+ {
+ /* Return DISTANCE if REGNO is used in a memory
+ address in NEXT. */
+ *found = true;
+ return distance;
+ }
-/* Return true if X is a representation of the PIC register. This copes
- with calls from ix86_find_base_term, where the register might have
- been replaced by a cselib value. */
+ if (insn_defines_reg (regno, INVALID_REGNUM, next))
+ {
+ /* Return -1 if REGNO is set in NEXT. */
+ *redefined = true;
+ return -1;
+ }
-static bool
-ix86_pic_register_p (rtx x)
-{
- if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
- return (pic_offset_table_rtx
- && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
- else if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SET_GOT)
- return true;
- else if (!REG_P (x))
- return false;
- else if (pic_offset_table_rtx)
- {
- if (REGNO (x) == REGNO (pic_offset_table_rtx))
- return true;
- if (HARD_REGISTER_P (x)
- && !HARD_REGISTER_P (pic_offset_table_rtx)
- && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
- return true;
- return false;
+ prev = next;
+ }
+
+ if (next == BB_END (bb))
+ break;
+
+ next = NEXT_INSN (next);
}
- else
- return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
+
+ return distance;
}
-/* Helper function for ix86_delegitimize_address.
- Attempt to delegitimize TLS local-exec accesses. */
+/* Return the distance between INSN and the next insn that uses
+ register number REGNO0 in a memory address. Return -1 if no such
+ use is found within LEA_SEARCH_THRESHOLD or if REGNO0 is set. */
-static rtx
-ix86_delegitimize_tls_address (rtx orig_x)
+static int
+distance_agu_use (unsigned int regno0, rtx_insn *insn)
{
- rtx x = orig_x, unspec;
- struct ix86_address addr;
+ basic_block bb = BLOCK_FOR_INSN (insn);
+ int distance = 0;
+ bool found = false;
+ bool redefined = false;
- if (!TARGET_TLS_DIRECT_SEG_REFS)
- return orig_x;
- if (MEM_P (x))
- x = XEXP (x, 0);
- if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
- return orig_x;
- if (ix86_decompose_address (x, &addr) == 0
- || addr.seg != DEFAULT_TLS_SEG_REG
- || addr.disp == NULL_RTX
- || GET_CODE (addr.disp) != CONST)
- return orig_x;
- unspec = XEXP (addr.disp, 0);
- if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
- unspec = XEXP (unspec, 0);
- if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
- return orig_x;
- x = XVECEXP (unspec, 0, 0);
- gcc_assert (GET_CODE (x) == SYMBOL_REF);
- if (unspec != XEXP (addr.disp, 0))
- x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
- if (addr.index)
- {
- rtx idx = addr.index;
- if (addr.scale != 1)
- idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
- x = gen_rtx_PLUS (Pmode, idx, x);
- }
- if (addr.base)
- x = gen_rtx_PLUS (Pmode, addr.base, x);
- if (MEM_P (orig_x))
- x = replace_equiv_address_nv (orig_x, x);
- return x;
-}
+ if (insn != BB_END (bb))
+ distance = distance_agu_use_in_bb (regno0, insn, distance,
+ NEXT_INSN (insn),
+ &found, &redefined);
-/* In the name of slightly smaller debug output, and to cater to
- general assembler lossage, recognize PIC+GOTOFF and turn it back
- into a direct symbol reference.
+ if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
+ {
+ edge e;
+ edge_iterator ei;
+ bool simple_loop = false;
- On Darwin, this is necessary to avoid a crash, because Darwin
- has a different PIC label for each routine but the DWARF debugging
- information is not associated with any particular routine, so it's
- necessary to remove references to the PIC label from RTL stored by
- the DWARF output code.
-
- This helper is used in the normal ix86_delegitimize_address
- entrypoint (e.g. used in the target delegitimization hook) and
- in ix86_find_base_term. As compile time memory optimization, we
- avoid allocating rtxes that will not change anything on the outcome
- of the callers (find_base_value and find_base_term). */
+ FOR_EACH_EDGE (e, ei, bb->succs)
+ if (e->dest == bb)
+ {
+ simple_loop = true;
+ break;
+ }
-static inline rtx
-ix86_delegitimize_address_1 (rtx x, bool base_term_p)
-{
- rtx orig_x = delegitimize_mem_from_attrs (x);
- /* addend is NULL or some rtx if x is something+GOTOFF where
- something doesn't include the PIC register. */
- rtx addend = NULL_RTX;
- /* reg_addend is NULL or a multiple of some register. */
- rtx reg_addend = NULL_RTX;
- /* const_addend is NULL or a const_int. */
- rtx const_addend = NULL_RTX;
- /* This is the result, or NULL. */
- rtx result = NULL_RTX;
+ if (simple_loop)
+ distance = distance_agu_use_in_bb (regno0, insn,
+ distance, BB_HEAD (bb),
+ &found, &redefined);
+ else
+ {
+ int shortest_dist = -1;
+ bool found_in_bb = false;
+ bool redefined_in_bb = false;
- x = orig_x;
+ FOR_EACH_EDGE (e, ei, bb->succs)
+ {
+ int bb_dist
+ = distance_agu_use_in_bb (regno0, insn,
+ distance, BB_HEAD (e->dest),
+ &found_in_bb, &redefined_in_bb);
+ if (found_in_bb)
+ {
+ if (shortest_dist < 0)
+ shortest_dist = bb_dist;
+ else if (bb_dist > 0)
+ shortest_dist = MIN (bb_dist, shortest_dist);
- if (MEM_P (x))
- x = XEXP (x, 0);
+ found = true;
+ }
+ }
- if (TARGET_64BIT)
- {
- if (GET_CODE (x) == CONST
- && GET_CODE (XEXP (x, 0)) == PLUS
- && GET_MODE (XEXP (x, 0)) == Pmode
- && CONST_INT_P (XEXP (XEXP (x, 0), 1))
- && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
- && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
- {
- /* find_base_{value,term} only care about MEMs with arg_pointer_rtx
- base. A CONST can't be arg_pointer_rtx based. */
- if (base_term_p && MEM_P (orig_x))
- return orig_x;
- rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
- x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
- if (MEM_P (orig_x))
- x = replace_equiv_address_nv (orig_x, x);
- return x;
+ distance = shortest_dist;
}
+ }
- if (GET_CODE (x) == CONST
- && GET_CODE (XEXP (x, 0)) == UNSPEC
- && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
- || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
- && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
- {
- x = XVECEXP (XEXP (x, 0), 0, 0);
- if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
- {
- x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
- if (x == NULL_RTX)
- return orig_x;
- }
- return x;
- }
+ if (!found || redefined)
+ return -1;
- if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
- return ix86_delegitimize_tls_address (orig_x);
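+  /* As above, the accumulated distance is in half-cycles; return
+     whole cycles.  */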
+ return distance >> 1;
+}
- /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
- and -mcmodel=medium -fpic. */
- }
+/* Define this macro to tune LEA priority vs ADD; it takes effect when
+ there is a choice between LEA and ADD.
+ Negative value: ADD is preferred over LEA
+ Zero: neutral
+ Positive value: LEA is preferred over ADD. */
+#define IX86_LEA_PRIORITY 0
- if (GET_CODE (x) != PLUS
- || GET_CODE (XEXP (x, 1)) != CONST)
- return ix86_delegitimize_tls_address (orig_x);
+/* Return true if using LEA for INSN has a performance advantage over
+ a sequence of instructions. The instruction sequence has SPLIT_COST
+ cycles higher latency than the LEA itself. */
- if (ix86_pic_register_p (XEXP (x, 0)))
- /* %ebx + GOT/GOTOFF */
- ;
- else if (GET_CODE (XEXP (x, 0)) == PLUS)
- {
- /* %ebx + %reg * scale + GOT/GOTOFF */
- reg_addend = XEXP (x, 0);
- if (ix86_pic_register_p (XEXP (reg_addend, 0)))
- reg_addend = XEXP (reg_addend, 1);
- else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
- reg_addend = XEXP (reg_addend, 0);
- else
- {
- reg_addend = NULL_RTX;
- addend = XEXP (x, 0);
- }
- }
- else
- addend = XEXP (x, 0);
+static bool
+ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
+ unsigned int regno2, int split_cost, bool has_scale)
+{
+ int dist_define, dist_use;
- x = XEXP (XEXP (x, 1), 0);
- if (GET_CODE (x) == PLUS
- && CONST_INT_P (XEXP (x, 1)))
+ /* For Silvermont, if we use a 2-source or 3-source LEA for
+ non-destructive destination purposes, or because we want the
+ ability to use SCALE, the use of LEA is justified. */
+ if (TARGET_SILVERMONT || TARGET_GOLDMONT || TARGET_GOLDMONT_PLUS
+ || TARGET_TREMONT || TARGET_INTEL)
{
- const_addend = XEXP (x, 1);
- x = XEXP (x, 0);
+ if (has_scale)
+ return true;
+ if (split_cost < 1)
+ return false;
+ if (regno0 == regno1 || regno0 == regno2)
+ return false;
+ return true;
}
- if (GET_CODE (x) == UNSPEC
- && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
- || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
- || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
- && !MEM_P (orig_x) && !addend)))
- result = XVECEXP (x, 0, 0);
-
- if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
- && !MEM_P (orig_x))
- result = XVECEXP (x, 0, 0);
-
- if (! result)
- return ix86_delegitimize_tls_address (orig_x);
+ dist_define = distance_non_agu_define (regno1, regno2, insn);
+ dist_use = distance_agu_use (regno0, insn);
- /* For (PLUS something CONST_INT) both find_base_{value,term} just
- recurse on the first operand. */
- if (const_addend && !base_term_p)
- result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
- if (reg_addend)
- result = gen_rtx_PLUS (Pmode, reg_addend, result);
- if (addend)
+ if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
{
- /* If the rest of original X doesn't involve the PIC register, add
- addend and subtract pic_offset_table_rtx. This can happen e.g.
- for code like:
- leal (%ebx, %ecx, 4), %ecx
- ...
- movl foo@GOTOFF(%ecx), %edx
- in which case we return (%ecx - %ebx) + foo
- or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
- and reload has completed. Don't do the latter for debug,
- as _GLOBAL_OFFSET_TABLE_ can't be expressed in the assembly. */
- if (pic_offset_table_rtx
- && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
- result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
- pic_offset_table_rtx),
- result);
- else if (base_term_p
- && pic_offset_table_rtx
- && !TARGET_MACHO
- && !TARGET_VXWORKS_RTP)
- {
- rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
- tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
- result = gen_rtx_PLUS (Pmode, tmp, result);
- }
+ /* If there is no non-AGU operand definition, no AGU operand
+ usage and the split cost is 0, then both the LEA and non-LEA
+ variants have the same priority. Currently we prefer LEA for
+ 64-bit code and non-LEA for 32-bit code. */
+ if (dist_use < 0 && split_cost == 0)
+ return TARGET_64BIT || IX86_LEA_PRIORITY;
else
- return orig_x;
- }
- if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
- {
- result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
- if (result == NULL_RTX)
- return orig_x;
+ return true;
}
- return result;
-}
-/* The normal instantiation of the above template. */
+ /* The longer the definition distance, the more preferable LEA
+ becomes. Adjust it here to take the splitting cost and LEA
+ priority into account. */
+ dist_define += split_cost + IX86_LEA_PRIORITY;
-static rtx
-ix86_delegitimize_address (rtx x)
-{
- return ix86_delegitimize_address_1 (x, false);
+ /* If there is no use in a memory address then we just check
+ that the split cost exceeds the AGU stall. */
+ if (dist_use < 0)
+ return dist_define > LEA_MAX_STALL;
+
+ /* If this insn has both a backward non-AGU dependence and a forward
+ AGU dependence, the one with the shorter distance takes effect. */
+ return dist_define >= dist_use;
}
-/* If X is a machine specific address (i.e. a symbol or label being
- referenced as a displacement from the GOT implemented using an
- UNSPEC), then return the base term. Otherwise return X. */
+/* Return true if it is legal to clobber flags by INSN and
+ false otherwise. */
-rtx
-ix86_find_base_term (rtx x)
+static bool
+ix86_ok_to_clobber_flags (rtx_insn *insn)
{
- rtx term;
+ basic_block bb = BLOCK_FOR_INSN (insn);
+ df_ref use;
+ bitmap live;
- if (TARGET_64BIT)
+ while (insn)
{
- if (GET_CODE (x) != CONST)
- return x;
- term = XEXP (x, 0);
- if (GET_CODE (term) == PLUS
- && CONST_INT_P (XEXP (term, 1)))
- term = XEXP (term, 0);
- if (GET_CODE (term) != UNSPEC
- || (XINT (term, 1) != UNSPEC_GOTPCREL
- && XINT (term, 1) != UNSPEC_PCREL))
- return x;
+ if (NONDEBUG_INSN_P (insn))
+ {
+ FOR_EACH_INSN_USE (use, insn)
+ if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
+ return false;
- return XVECEXP (term, 0, 0);
+ if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
+ return true;
+ }
+
+ if (insn == BB_END (bb))
+ break;
+
+ insn = NEXT_INSN (insn);
}
- return ix86_delegitimize_address_1 (x, true);
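+  /* No use or clobber of the flags was seen before the end of the
+     block; clobbering is safe only if the flags are dead on exit
+     from the block.  */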
+ live = df_get_live_out (bb);
+ return !REGNO_REG_SET_P (live, FLAGS_REG);
}
-/* Return true if X shouldn't be emitted into the debug info.
- Disallow UNSPECs other than @gotoff - we can't emit _GLOBAL_OFFSET_TABLE_
- symbol easily into the .debug_info section, so we need not to
- delegitimize, but instead assemble as @gotoff.
- Disallow _GLOBAL_OFFSET_TABLE_ SYMBOL_REF - the assembler magically
- assembles that as _GLOBAL_OFFSET_TABLE_-. expression. */
+/* Return true if we need to split op0 = op1 + op2 into a sequence of
+ move and add to avoid AGU stalls. */
-static bool
-ix86_const_not_ok_for_debug_p (rtx x)
+bool
+ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
{
- if (GET_CODE (x) == UNSPEC && XINT (x, 1) != UNSPEC_GOTOFF)
- return true;
+ unsigned int regno0, regno1, regno2;
- if (SYMBOL_REF_P (x) && strcmp (XSTR (x, 0), GOT_SYMBOL_NAME) == 0)
- return true;
+ /* Check if we need to optimize. */
+ if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
+ return false;
- return false;
+ /* Check it is correct to split here. */
+ if (!ix86_ok_to_clobber_flags (insn))
+ return false;
+
+ regno0 = true_regnum (operands[0]);
+ regno1 = true_regnum (operands[1]);
+ regno2 = true_regnum (operands[2]);
+
+ /* We only need to split adds with a non-destructive
+ destination operand. */
+ if (regno0 == regno1 || regno0 == regno2)
+ return false;
+ else
+ return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
}
-\f
-static void
-put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
- bool fp, FILE *file)
-{
- const char *suffix;
- if (mode == CCFPmode)
- {
- code = ix86_fp_compare_code_to_integer (code);
- mode = CCmode;
- }
- if (reverse)
- code = reverse_condition (code);
+/* Return true if we should emit lea instruction instead of mov
+ instruction. */
- switch (code)
- {
- case EQ:
- gcc_assert (mode != CCGZmode);
- switch (mode)
- {
- case E_CCAmode:
- suffix = "a";
- break;
- case E_CCCmode:
- suffix = "c";
- break;
- case E_CCOmode:
- suffix = "o";
- break;
- case E_CCPmode:
- suffix = "p";
- break;
- case E_CCSmode:
- suffix = "s";
- break;
- default:
- suffix = "e";
- break;
- }
- break;
- case NE:
- gcc_assert (mode != CCGZmode);
- switch (mode)
- {
- case E_CCAmode:
- suffix = "na";
- break;
- case E_CCCmode:
- suffix = "nc";
- break;
- case E_CCOmode:
- suffix = "no";
- break;
- case E_CCPmode:
- suffix = "np";
- break;
- case E_CCSmode:
- suffix = "ns";
- break;
- default:
- suffix = "ne";
- break;
- }
- break;
- case GT:
- gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
- suffix = "g";
- break;
- case GTU:
- /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
- Those same assemblers have the same but opposite lossage on cmov. */
- if (mode == CCmode)
- suffix = fp ? "nbe" : "a";
- else
- gcc_unreachable ();
- break;
- case LT:
- switch (mode)
- {
- case E_CCNOmode:
- case E_CCGOCmode:
- suffix = "s";
- break;
+bool
+ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
+{
+ unsigned int regno0, regno1;
- case E_CCmode:
- case E_CCGCmode:
- case E_CCGZmode:
- suffix = "l";
- break;
+ /* Check if we need to optimize. */
+ if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
+ return false;
- default:
- gcc_unreachable ();
- }
- break;
- case LTU:
- if (mode == CCmode || mode == CCGZmode)
- suffix = "b";
- else if (mode == CCCmode)
- suffix = fp ? "b" : "c";
- else
- gcc_unreachable ();
- break;
- case GE:
- switch (mode)
- {
- case E_CCNOmode:
- case E_CCGOCmode:
- suffix = "ns";
- break;
+ /* Use lea for reg to reg moves only. */
+ if (!REG_P (operands[0]) || !REG_P (operands[1]))
+ return false;
- case E_CCmode:
- case E_CCGCmode:
- case E_CCGZmode:
- suffix = "ge";
- break;
+ regno0 = true_regnum (operands[0]);
+ regno1 = true_regnum (operands[1]);
- default:
- gcc_unreachable ();
- }
- break;
- case GEU:
- if (mode == CCmode || mode == CCGZmode)
- suffix = "nb";
- else if (mode == CCCmode)
- suffix = fp ? "nb" : "nc";
- else
- gcc_unreachable ();
- break;
- case LE:
- gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
- suffix = "le";
- break;
- case LEU:
- if (mode == CCmode)
- suffix = "be";
- else
- gcc_unreachable ();
- break;
- case UNORDERED:
- suffix = fp ? "u" : "p";
- break;
- case ORDERED:
- suffix = fp ? "nu" : "np";
- break;
- default:
- gcc_unreachable ();
- }
- fputs (suffix, file);
+ return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
}
-/* Print the name of register X to FILE based on its machine mode and number.
- If CODE is 'w', pretend the mode is HImode.
- If CODE is 'b', pretend the mode is QImode.
- If CODE is 'k', pretend the mode is SImode.
- If CODE is 'q', pretend the mode is DImode.
- If CODE is 'x', pretend the mode is V4SFmode.
- If CODE is 't', pretend the mode is V8SFmode.
- If CODE is 'g', pretend the mode is V16SFmode.
- If CODE is 'h', pretend the reg is the 'high' byte register.
- If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
- If CODE is 'd', duplicate the operand for AVX instruction.
- If CODE is 'V', print naked full integer register name without %.
- */
+/* Return true if we need to split lea into a sequence of
+ instructions to avoid AGU stalls. */
-void
-print_reg (rtx x, int code, FILE *file)
+bool
+ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
{
- const char *reg;
- int msize;
- unsigned int regno;
- bool duplicated;
-
- if (ASSEMBLER_DIALECT == ASM_ATT && code != 'V')
- putc ('%', file);
+ unsigned int regno0, regno1, regno2;
+ int split_cost;
+ struct ix86_address parts;
+ int ok;
- if (x == pc_rtx)
- {
- gcc_assert (TARGET_64BIT);
- fputs ("rip", file);
- return;
- }
+ /* Check if we need to optimize. */
+ if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
+ return false;
- if (code == 'y' && STACK_TOP_P (x))
- {
- fputs ("st(0)", file);
- return;
- }
+ /* The "at least two components" test below might not catch simple
+ move or zero extension insns if parts.base is non-NULL and parts.disp
+ is const0_rtx as the only components in the address, e.g. if the
+ register is %rbp or %r13. As this test is much cheaper and moves or
+ zero extensions are the common case, do this check first. */
+ if (REG_P (operands[1])
+ || (SImode_address_operand (operands[1], VOIDmode)
+ && REG_P (XEXP (operands[1], 0))))
+ return false;
- if (code == 'w')
- msize = 2;
- else if (code == 'b')
- msize = 1;
- else if (code == 'k')
- msize = 4;
- else if (code == 'q')
- msize = 8;
- else if (code == 'h')
- msize = 0;
- else if (code == 'x')
- msize = 16;
- else if (code == 't')
- msize = 32;
- else if (code == 'g')
- msize = 64;
- else
- msize = GET_MODE_SIZE (GET_MODE (x));
+ /* Check if it is OK to split here. */
+ if (!ix86_ok_to_clobber_flags (insn))
+ return false;
- regno = REGNO (x);
+ ok = ix86_decompose_address (operands[1], &parts);
+ gcc_assert (ok);
- if (regno == ARG_POINTER_REGNUM
- || regno == FRAME_POINTER_REGNUM
- || regno == FPSR_REG)
- {
- output_operand_lossage
- ("invalid use of register '%s'", reg_names[regno]);
- return;
- }
- else if (regno == FLAGS_REG)
- {
- output_operand_lossage ("invalid use of asm flag output");
- return;
- }
+ /* There should be at least two components in the address. */
+ if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
+ + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
+ return false;
- if (code == 'V')
+ /* We should not split into an add if a non-legitimate PIC
+ operand is used as the displacement. */
+ if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
+ return false;
+
+ regno0 = true_regnum (operands[0]);
+ regno1 = INVALID_REGNUM;
+ regno2 = INVALID_REGNUM;
+
+ if (parts.base)
+ regno1 = true_regnum (parts.base);
+ if (parts.index)
+ regno2 = true_regnum (parts.index);
+
+ split_cost = 0;
+
+ /* Compute how many cycles we will add to execution time
+ if we split the LEA into a sequence of instructions. */
+ if (parts.base || parts.index)
{
- if (GENERAL_REGNO_P (regno))
- msize = GET_MODE_SIZE (word_mode);
- else
- error ("%<V%> modifier on non-integer register");
+ /* Have to use a mov instruction if the non-destructive
+ destination form is used. */
+ if (regno1 != regno0 && regno2 != regno0)
+ split_cost += 1;
+
+ /* Have to add index to base if both exist. */
+ if (parts.base && parts.index)
+ split_cost += 1;
+
+ /* Have to use shift and adds if scale is 2 or greater. */
+ if (parts.scale > 1)
+ {
+ if (regno0 != regno1)
+ split_cost += 1;
+ else if (regno2 == regno0)
+ split_cost += 4;
+ else
+ split_cost += parts.scale;
+ }
+
+ /* Have to use an add instruction with an immediate if
+ disp is non-zero. */
+ if (parts.disp && parts.disp != const0_rtx)
+ split_cost += 1;
+
+ /* Subtract the price of lea. */
+ split_cost -= 1;
}
- duplicated = code == 'd' && TARGET_AVX;
+ return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
+ parts.scale > 1);
+}
- switch (msize)
+/* Return true if it is ok to optimize an ADD operation to a LEA
+ operation to avoid flag register consumption. For most processors,
+ ADD is faster than LEA. For processors like BONNELL, if the
+ destination register of the LEA holds an actual address which will
+ be used soon, LEA is better; otherwise ADD is better. */
+
+bool
+ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
+{
+ unsigned int regno0 = true_regnum (operands[0]);
+ unsigned int regno1 = true_regnum (operands[1]);
+ unsigned int regno2 = true_regnum (operands[2]);
+
+ /* If a = b + c, (a!=b && a!=c), must use lea form. */
+ if (regno0 != regno1 && regno0 != regno2)
+ return true;
+
+ if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
+ return false;
+
+ return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
+}
+
+/* Return true if destination reg of SET_BODY is shift count of
+ USE_BODY. */
+
+static bool
+ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
+{
+ rtx set_dest;
+ rtx shift_rtx;
+ int i;
+
+ /* Retrieve destination of SET_BODY. */
+ switch (GET_CODE (set_body))
{
- case 16:
- case 12:
- case 8:
- if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode))
- warning (0, "unsupported size for integer register");
- /* FALLTHRU */
- case 4:
- if (LEGACY_INT_REGNO_P (regno))
- putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file);
- /* FALLTHRU */
- case 2:
- normal:
- reg = hi_reg_name[regno];
- break;
- case 1:
- if (regno >= ARRAY_SIZE (qi_reg_name))
- goto normal;
- if (!ANY_QI_REGNO_P (regno))
- error ("unsupported size for integer register");
- reg = qi_reg_name[regno];
- break;
- case 0:
- if (regno >= ARRAY_SIZE (qi_high_reg_name))
- goto normal;
- reg = qi_high_reg_name[regno];
+ case SET:
+ set_dest = SET_DEST (set_body);
+ if (!set_dest || !REG_P (set_dest))
+ return false;
break;
- case 32:
- case 64:
- if (SSE_REGNO_P (regno))
- {
- gcc_assert (!duplicated);
- putc (msize == 32 ? 'y' : 'z', file);
- reg = hi_reg_name[regno] + 1;
- break;
- }
- goto normal;
+ case PARALLEL:
+ for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
+ if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
+ use_body))
+ return true;
+ /* FALLTHROUGH */
default:
- gcc_unreachable ();
+ return false;
}
- fputs (reg, file);
+ /* Retrieve shift count of USE_BODY. */
+ switch (GET_CODE (use_body))
+ {
+ case SET:
+ shift_rtx = XEXP (use_body, 1);
+ break;
+ case PARALLEL:
+ for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
+ if (ix86_dep_by_shift_count_body (set_body,
+ XVECEXP (use_body, 0, i)))
+ return true;
+ /* FALLTHROUGH */
+ default:
+ return false;
+ }
- /* Irritatingly, AMD extended registers use
- different naming convention: "r%d[bwd]" */
- if (REX_INT_REGNO_P (regno))
+ if (shift_rtx
+ && (GET_CODE (shift_rtx) == ASHIFT
+ || GET_CODE (shift_rtx) == LSHIFTRT
+ || GET_CODE (shift_rtx) == ASHIFTRT
+ || GET_CODE (shift_rtx) == ROTATE
+ || GET_CODE (shift_rtx) == ROTATERT))
{
- gcc_assert (TARGET_64BIT);
- switch (msize)
+ rtx shift_count = XEXP (shift_rtx, 1);
+
+ /* Return true if shift count is dest of SET_BODY. */
+ if (REG_P (shift_count))
{
- case 0:
- error ("extended registers have no high halves");
- break;
- case 1:
- putc ('b', file);
- break;
- case 2:
- putc ('w', file);
- break;
- case 4:
- putc ('d', file);
- break;
- case 8:
- /* no suffix */
- break;
- default:
- error ("unsupported operand size for extended register");
- break;
+ /* Add the reload_completed check since this can be invoked
+ before register allocation by the pre-reload scheduler. */
+ if (reload_completed
+ && true_regnum (set_dest) == true_regnum (shift_count))
+ return true;
+ else if (REGNO (set_dest) == REGNO (shift_count))
+ return true;
}
- return;
}
- if (duplicated)
- {
- if (ASSEMBLER_DIALECT == ASM_ATT)
- fprintf (file, ", %%%s", reg);
- else
- fprintf (file, ", %s", reg);
- }
+ return false;
}
-/* Meaning of CODE:
- L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
- C -- print opcode suffix for set/cmov insn.
- c -- like C, but print reversed condition
- F,f -- likewise, but for floating-point.
- O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
- otherwise nothing
- R -- print embedded rounding and sae.
- r -- print only sae.
- z -- print the opcode suffix for the size of the current operand.
- Z -- likewise, with special suffixes for x87 instructions.
- * -- print a star (in certain assembler syntax)
- A -- print an absolute memory reference.
- E -- print address with DImode register names if TARGET_64BIT.
- w -- print the operand as if it's a "word" (HImode) even if it isn't.
- s -- print a shift double count, followed by the assemblers argument
- delimiter.
- b -- print the QImode name of the register for the indicated operand.
- %b0 would print %al if operands[0] is reg 0.
- w -- likewise, print the HImode name of the register.
- k -- likewise, print the SImode name of the register.
- q -- likewise, print the DImode name of the register.
- x -- likewise, print the V4SFmode name of the register.
- t -- likewise, print the V8SFmode name of the register.
- g -- likewise, print the V16SFmode name of the register.
- h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
- y -- print "st(0)" instead of "st" as a register.
- d -- print duplicated register operand for AVX instruction.
- D -- print condition for SSE cmp instruction.
- P -- if PIC, print an @PLT suffix.
- p -- print raw symbol name.
- X -- don't print any sort of PIC '@' suffix for a symbol.
- & -- print some in-use local-dynamic symbol name.
- H -- print a memory address offset by 8; used for sse high-parts
- Y -- print condition for XOP pcom* instruction.
- V -- print naked full integer register name without %.
- + -- print a branch hint as 'cs' or 'ds' prefix
- ; -- print a semicolon (after prefixes due to bug in older gas).
- ~ -- print "i" if TARGET_AVX2, "f" otherwise.
- ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
- M -- print addr32 prefix for TARGET_X32 with VSIB address.
- ! -- print NOTRACK prefix for jxx/call/ret instructions if required.
- */
+/* Return true if destination reg of SET_INSN is shift count of
+ USE_INSN. */
-void
-ix86_print_operand (FILE *file, rtx x, int code)
+bool
+ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
{
- if (code)
- {
- switch (code)
- {
- case 'A':
- switch (ASSEMBLER_DIALECT)
- {
- case ASM_ATT:
- putc ('*', file);
- break;
-
- case ASM_INTEL:
- /* Intel syntax. For absolute addresses, registers should not
- be surrounded by braces. */
- if (!REG_P (x))
- {
- putc ('[', file);
- ix86_print_operand (file, x, 0);
- putc (']', file);
- return;
- }
- break;
+ return ix86_dep_by_shift_count_body (PATTERN (set_insn),
+ PATTERN (use_insn));
+}
- default:
- gcc_unreachable ();
- }
+/* Return TRUE or FALSE depending on whether the unary operator meets the
+ appropriate constraints. */
- ix86_print_operand (file, x, 0);
- return;
+bool
+ix86_unary_operator_ok (enum rtx_code,
+ machine_mode,
+ rtx operands[2])
+{
+ /* If one of operands is memory, source and destination must match. */
+ if ((MEM_P (operands[0])
+ || MEM_P (operands[1]))
+ && ! rtx_equal_p (operands[0], operands[1]))
+ return false;
+ return true;
+}
- case 'E':
- /* Wrap address in an UNSPEC to declare special handling. */
- if (TARGET_64BIT)
- x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
+/* Return TRUE if the operands to a vec_interleave_{high,low}v2df
+ are ok, keeping in mind the possible movddup alternative. */
- output_address (VOIDmode, x);
- return;
+bool
+ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
+{
+ if (MEM_P (operands[0]))
+ return rtx_equal_p (operands[0], operands[1 + high]);
+ if (MEM_P (operands[1]) && MEM_P (operands[2]))
+ return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
+ return true;
+}
- case 'L':
- if (ASSEMBLER_DIALECT == ASM_ATT)
- putc ('l', file);
- return;
+/* A subroutine of ix86_build_signbit_mask. If VECT is true,
+ then replicate the value for all elements of the vector
+ register. */
- case 'W':
- if (ASSEMBLER_DIALECT == ASM_ATT)
- putc ('w', file);
- return;
+rtx
+ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
+{
+ int i, n_elt;
+ rtvec v;
+ machine_mode scalar_mode;
- case 'B':
- if (ASSEMBLER_DIALECT == ASM_ATT)
- putc ('b', file);
- return;
+ switch (mode)
+ {
+ case E_V64QImode:
+ case E_V32QImode:
+ case E_V16QImode:
+ case E_V32HImode:
+ case E_V16HImode:
+ case E_V8HImode:
+ case E_V16SImode:
+ case E_V8SImode:
+ case E_V4SImode:
+ case E_V8DImode:
+ case E_V4DImode:
+ case E_V2DImode:
+ gcc_assert (vect);
+ /* FALLTHRU */
+ case E_V16SFmode:
+ case E_V8SFmode:
+ case E_V4SFmode:
+ case E_V8DFmode:
+ case E_V4DFmode:
+ case E_V2DFmode:
+ n_elt = GET_MODE_NUNITS (mode);
+ v = rtvec_alloc (n_elt);
+ scalar_mode = GET_MODE_INNER (mode);
- case 'Q':
- if (ASSEMBLER_DIALECT == ASM_ATT)
- putc ('l', file);
- return;
+ RTVEC_ELT (v, 0) = value;
- case 'S':
- if (ASSEMBLER_DIALECT == ASM_ATT)
- putc ('s', file);
- return;
+ for (i = 1; i < n_elt; ++i)
+ RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
- case 'T':
- if (ASSEMBLER_DIALECT == ASM_ATT)
- putc ('t', file);
- return;
+ return gen_rtx_CONST_VECTOR (mode, v);
- case 'O':
-#ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
- if (ASSEMBLER_DIALECT != ASM_ATT)
- return;
+ default:
+ gcc_unreachable ();
+ }
+}
- switch (GET_MODE_SIZE (GET_MODE (x)))
- {
- case 2:
- putc ('w', file);
- break;
-
- case 4:
- putc ('l', file);
- break;
+/* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
+ and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
+ for an SSE register. If VECT is true, then replicate the mask for
+ all elements of the vector register. If INVERT is true, then create
+ a mask excluding the sign bit. */
- case 8:
- putc ('q', file);
- break;
+rtx
+ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
+{
+ machine_mode vec_mode, imode;
+ wide_int w;
+ rtx mask, v;
- default:
- output_operand_lossage ("invalid operand size for operand "
- "code 'O'");
- return;
- }
+ switch (mode)
+ {
+ case E_V16SImode:
+ case E_V16SFmode:
+ case E_V8SImode:
+ case E_V4SImode:
+ case E_V8SFmode:
+ case E_V4SFmode:
+ vec_mode = mode;
+ imode = SImode;
+ break;
- putc ('.', file);
-#endif
- return;
+ case E_V8DImode:
+ case E_V4DImode:
+ case E_V2DImode:
+ case E_V8DFmode:
+ case E_V4DFmode:
+ case E_V2DFmode:
+ vec_mode = mode;
+ imode = DImode;
+ break;
- case 'z':
- if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
- {
- /* Opcodes don't get size suffixes if using Intel opcodes. */
- if (ASSEMBLER_DIALECT == ASM_INTEL)
- return;
+ case E_TImode:
+ case E_TFmode:
+ vec_mode = VOIDmode;
+ imode = TImode;
+ break;
- switch (GET_MODE_SIZE (GET_MODE (x)))
- {
- case 1:
- putc ('b', file);
- return;
+ default:
+ gcc_unreachable ();
+ }
- case 2:
- putc ('w', file);
- return;
+ machine_mode inner_mode = GET_MODE_INNER (mode);
+ w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
+ GET_MODE_BITSIZE (inner_mode));
+ if (invert)
+ w = wi::bit_not (w);
- case 4:
- putc ('l', file);
- return;
+ /* Force this value into the low part of a fp vector constant. */
+ mask = immed_wide_int_const (w, imode);
+ mask = gen_lowpart (inner_mode, mask);
- case 8:
- putc ('q', file);
- return;
+ if (vec_mode == VOIDmode)
+ return force_reg (inner_mode, mask);
- default:
- output_operand_lossage ("invalid operand size for operand "
- "code 'z'");
- return;
- }
- }
+ v = ix86_build_const_vector (vec_mode, vect, mask);
+ return force_reg (vec_mode, v);
+}
- if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
- warning (0, "non-integer operand used with operand code %<z%>");
- /* FALLTHRU */
+/* Return TRUE or FALSE depending on whether the first SET in INSN
+ has source and destination with matching CC modes, and that the
+ CC mode is at least as constrained as REQ_MODE. */
- case 'Z':
- /* 387 opcodes don't get size suffixes if using Intel opcodes. */
- if (ASSEMBLER_DIALECT == ASM_INTEL)
- return;
+bool
+ix86_match_ccmode (rtx insn, machine_mode req_mode)
+{
+ rtx set;
+ machine_mode set_mode;
- if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
- {
- switch (GET_MODE_SIZE (GET_MODE (x)))
- {
- case 2:
-#ifdef HAVE_AS_IX86_FILDS
- putc ('s', file);
-#endif
- return;
+ set = PATTERN (insn);
+ if (GET_CODE (set) == PARALLEL)
+ set = XVECEXP (set, 0, 0);
+ gcc_assert (GET_CODE (set) == SET);
+ gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
- case 4:
- putc ('l', file);
- return;
+ set_mode = GET_MODE (SET_DEST (set));
+ switch (set_mode)
+ {
+ case E_CCNOmode:
+ if (req_mode != CCNOmode
+ && (req_mode != CCmode
+ || XEXP (SET_SRC (set), 1) != const0_rtx))
+ return false;
+ break;
+ case E_CCmode:
+ if (req_mode == CCGCmode)
+ return false;
+ /* FALLTHRU */
+ case E_CCGCmode:
+ if (req_mode == CCGOCmode || req_mode == CCNOmode)
+ return false;
+ /* FALLTHRU */
+ case E_CCGOCmode:
+ if (req_mode == CCZmode)
+ return false;
+ /* FALLTHRU */
+ case E_CCZmode:
+ break;
- case 8:
-#ifdef HAVE_AS_IX86_FILDQ
- putc ('q', file);
-#else
- fputs ("ll", file);
-#endif
- return;
+ case E_CCGZmode:
- default:
- break;
- }
- }
- else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
- {
- /* 387 opcodes don't get size suffixes
- if the operands are registers. */
- if (STACK_REG_P (x))
- return;
+ case E_CCAmode:
+ case E_CCCmode:
+ case E_CCOmode:
+ case E_CCPmode:
+ case E_CCSmode:
+ if (set_mode != req_mode)
+ return false;
+ break;
- switch (GET_MODE_SIZE (GET_MODE (x)))
- {
- case 4:
- putc ('s', file);
- return;
+ default:
+ gcc_unreachable ();
+ }
- case 8:
- putc ('l', file);
- return;
+ return GET_MODE (SET_SRC (set)) == set_mode;
+}
- case 12:
- case 16:
- putc ('t', file);
- return;
+machine_mode
+ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
+{
+ machine_mode mode = GET_MODE (op0);
- default:
- break;
- }
- }
- else
- {
- output_operand_lossage ("invalid operand type used with "
- "operand code 'Z'");
- return;
- }
+ if (SCALAR_FLOAT_MODE_P (mode))
+ {
+ gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
+ return CCFPmode;
+ }
- output_operand_lossage ("invalid operand size for operand code 'Z'");
- return;
+ switch (code)
+ {
+ /* Only zero flag is needed. */
+ case EQ: /* ZF=0 */
+ case NE: /* ZF!=0 */
+ return CCZmode;
+ /* Codes needing carry flag. */
+ case GEU: /* CF=0 */
+ case LTU: /* CF=1 */
+ /* Detect overflow checks. They need just the carry flag. */
+ if (GET_CODE (op0) == PLUS
+ && (rtx_equal_p (op1, XEXP (op0, 0))
+ || rtx_equal_p (op1, XEXP (op0, 1))))
+ return CCCmode;
+ else
+ return CCmode;
+ case GTU: /* CF=0 & ZF=0 */
+ case LEU: /* CF=1 | ZF=1 */
+ return CCmode;
+ /* Codes possibly doable only with sign flag when
+ comparing against zero. */
+ case GE: /* SF=OF or SF=0 */
+ case LT: /* SF<>OF or SF=1 */
+ if (op1 == const0_rtx)
+ return CCGOCmode;
+ else
+ /* For other cases Carry flag is not required. */
+ return CCGCmode;
+ /* Codes doable only with the sign flag when comparing against
+ zero, but for which we lack a jump instruction, so we need to
+ use relational tests against overflow, which thus needs to be
+ zero. */
+ case GT: /* ZF=0 & SF=OF */
+ case LE: /* ZF=1 | SF<>OF */
+ if (op1 == const0_rtx)
+ return CCNOmode;
+ else
+ return CCGCmode;
+ /* The strcmp pattern does (use flags), and combine may ask us
+ for a proper mode. */
+ case USE:
+ return CCmode;
+ default:
+ gcc_unreachable ();
+ }
+}
- case 'd':
- case 'b':
- case 'w':
- case 'k':
- case 'q':
- case 'h':
- case 't':
- case 'g':
- case 'y':
- case 'x':
- case 'X':
- case 'P':
- case 'p':
- case 'V':
- break;
+/* Return the fixed registers used for condition codes. */
- case 's':
- if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
- {
- ix86_print_operand (file, x, 0);
- fputs (", ", file);
- }
- return;
+static bool
+ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
+{
+ *p1 = FLAGS_REG;
+ *p2 = INVALID_REGNUM;
+ return true;
+}
- case 'Y':
- switch (GET_CODE (x))
- {
- case NE:
- fputs ("neq", file);
- break;
- case EQ:
- fputs ("eq", file);
- break;
- case GE:
- case GEU:
- fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
- break;
- case GT:
- case GTU:
- fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
- break;
- case LE:
- case LEU:
- fputs ("le", file);
- break;
- case LT:
- case LTU:
- fputs ("lt", file);
- break;
- case UNORDERED:
- fputs ("unord", file);
- break;
- case ORDERED:
- fputs ("ord", file);
- break;
- case UNEQ:
- fputs ("ueq", file);
- break;
- case UNGE:
- fputs ("nlt", file);
- break;
- case UNGT:
- fputs ("nle", file);
- break;
- case UNLE:
- fputs ("ule", file);
- break;
- case UNLT:
- fputs ("ult", file);
- break;
- case LTGT:
- fputs ("une", file);
- break;
- default:
- output_operand_lossage ("operand is not a condition code, "
- "invalid operand code 'Y'");
- return;
- }
- return;
+/* If two condition code modes are compatible, return a condition code
+ mode which is compatible with both. Otherwise, return
+ VOIDmode. */
- case 'D':
- /* Little bit of braindamage here. The SSE compare instructions
- does use completely different names for the comparisons that the
- fp conditional moves. */
- switch (GET_CODE (x))
- {
- case UNEQ:
- if (TARGET_AVX)
- {
- fputs ("eq_us", file);
- break;
- }
- /* FALLTHRU */
- case EQ:
- fputs ("eq", file);
- break;
- case UNLT:
- if (TARGET_AVX)
- {
- fputs ("nge", file);
- break;
- }
- /* FALLTHRU */
- case LT:
- fputs ("lt", file);
- break;
- case UNLE:
- if (TARGET_AVX)
- {
- fputs ("ngt", file);
- break;
- }
- /* FALLTHRU */
- case LE:
- fputs ("le", file);
- break;
- case UNORDERED:
- fputs ("unord", file);
- break;
- case LTGT:
- if (TARGET_AVX)
- {
- fputs ("neq_oq", file);
- break;
- }
- /* FALLTHRU */
- case NE:
- fputs ("neq", file);
- break;
- case GE:
- if (TARGET_AVX)
- {
- fputs ("ge", file);
- break;
- }
- /* FALLTHRU */
- case UNGE:
- fputs ("nlt", file);
- break;
- case GT:
- if (TARGET_AVX)
- {
- fputs ("gt", file);
- break;
- }
- /* FALLTHRU */
- case UNGT:
- fputs ("nle", file);
- break;
- case ORDERED:
- fputs ("ord", file);
- break;
- default:
- output_operand_lossage ("operand is not a condition code, "
- "invalid operand code 'D'");
- return;
- }
- return;
+static machine_mode
+ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
+{
+ if (m1 == m2)
+ return m1;
- case 'F':
- case 'f':
-#ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
- if (ASSEMBLER_DIALECT == ASM_ATT)
- putc ('.', file);
- gcc_fallthrough ();
-#endif
+ if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
+ return VOIDmode;
- case 'C':
- case 'c':
- if (!COMPARISON_P (x))
- {
- output_operand_lossage ("operand is not a condition code, "
- "invalid operand code '%c'", code);
- return;
- }
- put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
- code == 'c' || code == 'f',
- code == 'F' || code == 'f',
- file);
- return;
+ if ((m1 == CCGCmode && m2 == CCGOCmode)
+ || (m1 == CCGOCmode && m2 == CCGCmode))
+ return CCGCmode;
- case 'H':
- if (!offsettable_memref_p (x))
- {
- output_operand_lossage ("operand is not an offsettable memory "
- "reference, invalid operand code 'H'");
- return;
- }
- /* It doesn't actually matter what mode we use here, as we're
- only going to use this for printing. */
- x = adjust_address_nv (x, DImode, 8);
- /* Output 'qword ptr' for intel assembler dialect. */
- if (ASSEMBLER_DIALECT == ASM_INTEL)
- code = 'q';
- break;
+ if ((m1 == CCNOmode && m2 == CCGOCmode)
+ || (m1 == CCGOCmode && m2 == CCNOmode))
+ return CCNOmode;
- case 'K':
- if (!CONST_INT_P (x))
- {
- output_operand_lossage ("operand is not an integer, invalid "
- "operand code 'K'");
- return;
- }
+ if (m1 == CCZmode
+ && (m2 == CCGCmode || m2 == CCGOCmode || m2 == CCNOmode))
+ return m2;
+ else if (m2 == CCZmode
+ && (m1 == CCGCmode || m1 == CCGOCmode || m1 == CCNOmode))
+ return m1;
- if (INTVAL (x) & IX86_HLE_ACQUIRE)
-#ifdef HAVE_AS_IX86_HLE
- fputs ("xacquire ", file);
-#else
- fputs ("\n" ASM_BYTE "0xf2\n\t", file);
-#endif
- else if (INTVAL (x) & IX86_HLE_RELEASE)
-#ifdef HAVE_AS_IX86_HLE
- fputs ("xrelease ", file);
-#else
- fputs ("\n" ASM_BYTE "0xf3\n\t", file);
-#endif
- /* We do not want to print value of the operand. */
- return;
+ switch (m1)
+ {
+ default:
+ gcc_unreachable ();
- case 'N':
- if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
- fputs ("{z}", file);
- return;
+ case E_CCmode:
+ case E_CCGCmode:
+ case E_CCGOCmode:
+ case E_CCNOmode:
+ case E_CCAmode:
+ case E_CCCmode:
+ case E_CCOmode:
+ case E_CCPmode:
+ case E_CCSmode:
+ case E_CCZmode:
+ switch (m2)
+ {
+ default:
+ return VOIDmode;
- case 'r':
- if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE)
- {
- output_operand_lossage ("operand is not a specific integer, "
- "invalid operand code 'r'");
- return;
- }
+ case E_CCmode:
+ case E_CCGCmode:
+ case E_CCGOCmode:
+ case E_CCNOmode:
+ case E_CCAmode:
+ case E_CCCmode:
+ case E_CCOmode:
+ case E_CCPmode:
+ case E_CCSmode:
+ case E_CCZmode:
+ return CCmode;
+ }
- if (ASSEMBLER_DIALECT == ASM_INTEL)
- fputs (", ", file);
+ case E_CCFPmode:
+ /* These are only compatible with themselves, which we already
+ checked above. */
+ return VOIDmode;
+ }
+}
- fputs ("{sae}", file);
+/* Return strategy to use for floating-point. We assume that fcomi is always
+ preferable where available, since that is also true when looking at size
+ (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
- if (ASSEMBLER_DIALECT == ASM_ATT)
- fputs (", ", file);
+enum ix86_fpcmp_strategy
+ix86_fp_comparison_strategy (enum rtx_code)
+{
+ /* Do fcomi/sahf based test when profitable. */
- return;
+ if (TARGET_CMOVE)
+ return IX86_FPCMP_COMI;
- case 'R':
- if (!CONST_INT_P (x))
- {
- output_operand_lossage ("operand is not an integer, invalid "
- "operand code 'R'");
- return;
- }
+ if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
+ return IX86_FPCMP_SAHF;
- if (ASSEMBLER_DIALECT == ASM_INTEL)
- fputs (", ", file);
+ return IX86_FPCMP_ARITH;
+}
- switch (INTVAL (x))
- {
- case ROUND_NEAREST_INT | ROUND_SAE:
- fputs ("{rn-sae}", file);
- break;
- case ROUND_NEG_INF | ROUND_SAE:
- fputs ("{rd-sae}", file);
- break;
- case ROUND_POS_INF | ROUND_SAE:
- fputs ("{ru-sae}", file);
- break;
- case ROUND_ZERO | ROUND_SAE:
- fputs ("{rz-sae}", file);
- break;
- default:
- output_operand_lossage ("operand is not a specific integer, "
- "invalid operand code 'R'");
- }
-
- if (ASSEMBLER_DIALECT == ASM_ATT)
- fputs (", ", file);
-
- return;
-
- case '*':
- if (ASSEMBLER_DIALECT == ASM_ATT)
- putc ('*', file);
- return;
-
- case '&':
- {
- const char *name = get_some_local_dynamic_name ();
- if (name == NULL)
- output_operand_lossage ("'%%&' used without any "
- "local dynamic TLS references");
- else
- assemble_name (file, name);
- return;
- }
+/* Convert comparison codes we use to represent FP comparison to integer
+ code that will result in proper branch. Return UNKNOWN if no such code
+ is available. */
- case '+':
- {
- rtx x;
+enum rtx_code
+ix86_fp_compare_code_to_integer (enum rtx_code code)
+{
+ switch (code)
+ {
+ case GT:
+ return GTU;
+ case GE:
+ return GEU;
+ case ORDERED:
+ case UNORDERED:
+ return code;
+ case UNEQ:
+ return EQ;
+ case UNLT:
+ return LTU;
+ case UNLE:
+ return LEU;
+ case LTGT:
+ return NE;
+ default:
+ return UNKNOWN;
+ }
+}
- if (!optimize
- || optimize_function_for_size_p (cfun)
- || !TARGET_BRANCH_PREDICTION_HINTS)
- return;
+/* Zero extend possibly SImode EXP to Pmode register. */
+rtx
+ix86_zero_extend_to_Pmode (rtx exp)
+{
+ return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
+}
- x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
- if (x)
- {
- int pred_val = profile_probability::from_reg_br_prob_note
- (XINT (x, 0)).to_reg_br_prob_base ();
+/* Return true if the function being called was marked with attribute
+ "noplt" or using -fno-plt and we are compiling for non-PIC. We need
+ to handle the non-PIC case in the backend because there is no easy
+ interface for the front-end to force non-PLT calls to use the GOT.
+ This is currently used only with 64-bit or 32-bit GOT32X ELF targets
+ to call the function marked "noplt" indirectly. */
- if (pred_val < REG_BR_PROB_BASE * 45 / 100
- || pred_val > REG_BR_PROB_BASE * 55 / 100)
- {
- bool taken = pred_val > REG_BR_PROB_BASE / 2;
- bool cputaken
- = final_forward_branch_p (current_output_insn) == 0;
+static bool
+ix86_nopic_noplt_attribute_p (rtx call_op)
+{
+ if (flag_pic || ix86_cmodel == CM_LARGE
+ || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
+ || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
+ || SYMBOL_REF_LOCAL_P (call_op))
+ return false;
- /* Emit hints only in the case default branch prediction
- heuristics would fail. */
- if (taken != cputaken)
- {
- /* We use 3e (DS) prefix for taken branches and
- 2e (CS) prefix for not taken branches. */
- if (taken)
- fputs ("ds ; ", file);
- else
- fputs ("cs ; ", file);
- }
- }
- }
- return;
- }
+ tree symbol_decl = SYMBOL_REF_DECL (call_op);
- case ';':
-#ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
- putc (';', file);
-#endif
- return;
+ if (!flag_plt
+ || (symbol_decl != NULL_TREE
+ && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
+ return true;
- case '~':
- putc (TARGET_AVX2 ? 'i' : 'f', file);
- return;
+ return false;
+}
- case 'M':
- if (TARGET_X32)
- {
- /* NB: 32-bit indices in VSIB address are sign-extended
- to 64 bits. In x32, if 32-bit address 0xf7fa3010 is
- sign-extended to 0xfffffffff7fa3010 which is invalid
- address. Add addr32 prefix if there is no base
- register nor symbol. */
- bool ok;
- struct ix86_address parts;
- ok = ix86_decompose_address (x, &parts);
- gcc_assert (ok && parts.index == NULL_RTX);
- if (parts.base == NULL_RTX
- && (parts.disp == NULL_RTX
- || !symbolic_operand (parts.disp,
- GET_MODE (parts.disp))))
- fputs ("addr32 ", file);
- }
- return;
+/* Output indirect branch via a call and return thunk. CALL_OP is a
+ register which contains the branch target. Branch is a tail call
+ if SIBCALL_P is true.
+ A normal call is converted to:
- case '^':
- if (TARGET_64BIT && Pmode != word_mode)
- fputs ("addr32 ", file);
- return;
+ call __x86_indirect_thunk_reg
- case '!':
- if (ix86_notrack_prefixed_insn_p (current_output_insn))
- fputs ("notrack ", file);
- return;
+ and a tail call is converted to:
- default:
- output_operand_lossage ("invalid operand code '%c'", code);
- }
- }
+ jmp __x86_indirect_thunk_reg
+ */
- if (REG_P (x))
- print_reg (x, code, file);
+static void
+ix86_output_indirect_branch_via_reg (rtx call_op, bool sibcall_p)
+{
+ char thunk_name_buf[32];
+ char *thunk_name;
+ enum indirect_thunk_prefix need_prefix
+ = indirect_thunk_need_prefix (current_output_insn);
+ int regno = REGNO (call_op);
- else if (MEM_P (x))
+ if (cfun->machine->indirect_branch_type
+ != indirect_branch_thunk_inline)
{
- rtx addr = XEXP (x, 0);
-
- /* No `byte ptr' prefix for call instructions ... */
- if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
+ if (cfun->machine->indirect_branch_type == indirect_branch_thunk)
{
- machine_mode mode = GET_MODE (x);
- const char *size;
-
- /* Check for explicit size override codes. */
- if (code == 'b')
- size = "BYTE";
- else if (code == 'w')
- size = "WORD";
- else if (code == 'k')
- size = "DWORD";
- else if (code == 'q')
- size = "QWORD";
- else if (code == 'x')
- size = "XMMWORD";
- else if (code == 't')
- size = "YMMWORD";
- else if (code == 'g')
- size = "ZMMWORD";
- else if (mode == BLKmode)
- /* ... or BLKmode operands, when not overridden. */
- size = NULL;
- else
- switch (GET_MODE_SIZE (mode))
- {
- case 1: size = "BYTE"; break;
- case 2: size = "WORD"; break;
- case 4: size = "DWORD"; break;
- case 8: size = "QWORD"; break;
- case 12: size = "TBYTE"; break;
- case 16:
- if (mode == XFmode)
- size = "TBYTE";
- else
- size = "XMMWORD";
- break;
- case 32: size = "YMMWORD"; break;
- case 64: size = "ZMMWORD"; break;
- default:
- gcc_unreachable ();
- }
- if (size)
- {
- fputs (size, file);
- fputs (" PTR ", file);
- }
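+	  /* Remap REX registers to a contiguous index so that every
+	     possible thunk register fits in the indirect_thunks_used
+	     bitmask.  */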
+ int i = regno;
+ if (i >= FIRST_REX_INT_REG)
+ i -= (FIRST_REX_INT_REG - LAST_INT_REG - 1);
+ indirect_thunks_used |= 1 << i;
}
+ indirect_thunk_name (thunk_name_buf, regno, need_prefix, false);
+ thunk_name = thunk_name_buf;
+ }
+ else
+ thunk_name = NULL;
- if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
- output_operand_lossage ("invalid constraints for operand");
+ if (sibcall_p)
+ {
+ if (thunk_name != NULL)
+ fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
else
- ix86_print_operand_address_as
- (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
+ output_indirect_thunk (regno);
}
-
- else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
+ else
{
- long l;
+ if (thunk_name != NULL)
+ {
+ fprintf (asm_out_file, "\tcall\t%s\n", thunk_name);
+ return;
+ }
- REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
+ char indirectlabel1[32];
+ char indirectlabel2[32];
- if (ASSEMBLER_DIALECT == ASM_ATT)
- putc ('$', file);
- /* Sign extend 32bit SFmode immediate to 8 bytes. */
- if (code == 'q')
- fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
- (unsigned long long) (int) l);
+ ASM_GENERATE_INTERNAL_LABEL (indirectlabel1,
+ INDIRECT_LABEL,
+ indirectlabelno++);
+ ASM_GENERATE_INTERNAL_LABEL (indirectlabel2,
+ INDIRECT_LABEL,
+ indirectlabelno++);
+
+ /* Jump. */
+ fputs ("\tjmp\t", asm_out_file);
+ assemble_name_raw (asm_out_file, indirectlabel2);
+ fputc ('\n', asm_out_file);
+
+ ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1);
+
+ if (thunk_name != NULL)
+ fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
else
- fprintf (file, "0x%08x", (unsigned int) l);
+ output_indirect_thunk (regno);
+
+ ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2);
+
+ /* Call. */
+ fputs ("\tcall\t", asm_out_file);
+ assemble_name_raw (asm_out_file, indirectlabel1);
+ fputc ('\n', asm_out_file);
}
+}
- else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
- {
- long l[2];
+/* Output indirect branch via a call and return thunk. CALL_OP is
+ the branch target. XASM is the assembly template for CALL_OP.
+ Branch is a tail call if SIBCALL_P is true. A normal call is
+ converted to:
- REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
+ jmp L2
+ L1:
+ push CALL_OP
+ jmp __x86_indirect_thunk
+ L2:
+ call L1
- if (ASSEMBLER_DIALECT == ASM_ATT)
- putc ('$', file);
- fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
- }
+ and a tail call is converted to:
- /* These float cases don't actually occur as immediate operands. */
- else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
- {
- char dstr[30];
+ push CALL_OP
+ jmp __x86_indirect_thunk
+ */
- real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
- fputs (dstr, file);
+static void
+ix86_output_indirect_branch_via_push (rtx call_op, const char *xasm,
+ bool sibcall_p)
+{
+ char thunk_name_buf[32];
+ char *thunk_name;
+ char push_buf[64];
+ enum indirect_thunk_prefix need_prefix
+ = indirect_thunk_need_prefix (current_output_insn);
+ int regno = -1;
+
+ if (cfun->machine->indirect_branch_type
+ != indirect_branch_thunk_inline)
+ {
+ if (cfun->machine->indirect_branch_type == indirect_branch_thunk)
+ indirect_thunk_needed = true;
+ indirect_thunk_name (thunk_name_buf, regno, need_prefix, false);
+ thunk_name = thunk_name_buf;
}
+ else
+ thunk_name = NULL;
+
+ snprintf (push_buf, sizeof (push_buf), "push{%c}\t%s",
+ TARGET_64BIT ? 'q' : 'l', xasm);
+ if (sibcall_p)
+ {
+ output_asm_insn (push_buf, &call_op);
+ if (thunk_name != NULL)
+ fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
+ else
+ output_indirect_thunk (regno);
+ }
else
{
- /* We have patterns that allow zero sets of memory, for instance.
- In 64-bit mode, we should probably support all 8-byte vectors,
- since we can in fact encode that into an immediate. */
- if (GET_CODE (x) == CONST_VECTOR)
- {
- if (x != CONST0_RTX (GET_MODE (x)))
- output_operand_lossage ("invalid vector immediate");
- x = const0_rtx;
- }
+ char indirectlabel1[32];
+ char indirectlabel2[32];
- if (code != 'P' && code != 'p')
+ ASM_GENERATE_INTERNAL_LABEL (indirectlabel1,
+ INDIRECT_LABEL,
+ indirectlabelno++);
+ ASM_GENERATE_INTERNAL_LABEL (indirectlabel2,
+ INDIRECT_LABEL,
+ indirectlabelno++);
+
+ /* Jump. */
+ fputs ("\tjmp\t", asm_out_file);
+ assemble_name_raw (asm_out_file, indirectlabel2);
+ fputc ('\n', asm_out_file);
+
+ ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1);
+
+ /* An external function may be called via the GOT instead of the PLT. */
+ if (MEM_P (call_op))
{
- if (CONST_INT_P (x))
- {
- if (ASSEMBLER_DIALECT == ASM_ATT)
- putc ('$', file);
- }
- else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
- || GET_CODE (x) == LABEL_REF)
+ struct ix86_address parts;
+ rtx addr = XEXP (call_op, 0);
+ if (ix86_decompose_address (addr, &parts)
+ && parts.base == stack_pointer_rtx)
{
- if (ASSEMBLER_DIALECT == ASM_ATT)
- putc ('$', file);
+ /* Since the call will adjust the stack by -UNITS_PER_WORD,
+ we must convert "disp(stack, index, scale)" to
+ "disp+UNITS_PER_WORD(stack, index, scale)". */
+ if (parts.index)
+ {
+ addr = gen_rtx_MULT (Pmode, parts.index,
+ GEN_INT (parts.scale));
+ addr = gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+ addr);
+ }
else
- fputs ("OFFSET FLAT:", file);
+ addr = stack_pointer_rtx;
+
+ rtx disp;
+ if (parts.disp != NULL_RTX)
+ disp = plus_constant (Pmode, parts.disp,
+ UNITS_PER_WORD);
+ else
+ disp = GEN_INT (UNITS_PER_WORD);
+
+ addr = gen_rtx_PLUS (Pmode, addr, disp);
+ call_op = gen_rtx_MEM (GET_MODE (call_op), addr);
}
}
- if (CONST_INT_P (x))
- fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
- else if (flag_pic || MACHOPIC_INDIRECT)
- output_pic_addr_const (file, x, code);
+
+ output_asm_insn (push_buf, &call_op);
+
+ if (thunk_name != NULL)
+ fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
else
- output_addr_const (file, x);
+ output_indirect_thunk (regno);
+
+ ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2);
+
+ /* Call. */
+ fputs ("\tcall\t", asm_out_file);
+ assemble_name_raw (asm_out_file, indirectlabel1);
+ fputc ('\n', asm_out_file);
}
}
-static bool
-ix86_print_operand_punct_valid_p (unsigned char code)
-{
- return (code == '*' || code == '+' || code == '&' || code == ';'
- || code == '~' || code == '^' || code == '!');
-}
-\f
-/* Print a memory operand whose address is ADDR. */
+/* Output indirect branch via a call and return thunk. CALL_OP is
+ the branch target. XASM is the assembly template for CALL_OP.
+ The branch is a tail call if SIBCALL_P is true. */
static void
-ix86_print_operand_address_as (FILE *file, rtx addr,
- addr_space_t as, bool no_rip)
+ix86_output_indirect_branch (rtx call_op, const char *xasm,
+ bool sibcall_p)
{
- struct ix86_address parts;
- rtx base, index, disp;
- int scale;
- int ok;
- bool vsib = false;
- int code = 0;
-
- if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
- {
- ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
- gcc_assert (parts.index == NULL_RTX);
- parts.index = XVECEXP (addr, 0, 1);
- parts.scale = INTVAL (XVECEXP (addr, 0, 2));
- addr = XVECEXP (addr, 0, 0);
- vsib = true;
- }
- else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
- {
- gcc_assert (TARGET_64BIT);
- ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
- code = 'q';
- }
+ if (REG_P (call_op))
+ ix86_output_indirect_branch_via_reg (call_op, sibcall_p);
else
- ok = ix86_decompose_address (addr, &parts);
+ ix86_output_indirect_branch_via_push (call_op, xasm, sibcall_p);
+}
- gcc_assert (ok);
+/* Output indirect jump. CALL_OP is the jump target. */
- base = parts.base;
- index = parts.index;
- disp = parts.disp;
- scale = parts.scale;
+const char *
+ix86_output_indirect_jmp (rtx call_op)
+{
+ if (cfun->machine->indirect_branch_type != indirect_branch_keep)
+ {
+ /* We can't have a red zone, since the "call" in the indirect thunk
+ pushes the return address onto the stack, destroying the red zone. */
+ if (ix86_red_zone_size != 0)
+ gcc_unreachable ();
- if (ADDR_SPACE_GENERIC_P (as))
- as = parts.seg;
+ ix86_output_indirect_branch (call_op, "%0", true);
+ return "";
+ }
else
- gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
+ return "%!jmp\t%A0";
+}
- if (!ADDR_SPACE_GENERIC_P (as))
- {
- if (ASSEMBLER_DIALECT == ASM_ATT)
- putc ('%', file);
+/* Output return instrumentation for the current function if needed. */
- switch (as)
+static void
+output_return_instrumentation (void)
+{
+ if (ix86_instrument_return != instrument_return_none
+ && flag_fentry
+ && !DECL_NO_INSTRUMENT_FUNCTION_ENTRY_EXIT (cfun->decl))
+ {
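+ /* If return locations are being recorded (ix86_flag_record_return),
+ emit a local label here; it is referenced from the __return_loc
+ section emitted below. */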
+ if (ix86_flag_record_return)
+ fprintf (asm_out_file, "1:\n");
+ switch (ix86_instrument_return)
{
- case ADDR_SPACE_SEG_FS:
- fputs ("fs:", file);
+ case instrument_return_call:
+ fprintf (asm_out_file, "\tcall\t__return__\n");
break;
- case ADDR_SPACE_SEG_GS:
- fputs ("gs:", file);
+ case instrument_return_nop5:
+ /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */
+ fprintf (asm_out_file, ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n");
+ break;
+ case instrument_return_none:
break;
- default:
- gcc_unreachable ();
+ }
+
+ if (ix86_flag_record_return)
+ {
+ fprintf (asm_out_file, "\t.section __return_loc, \"a\",@progbits\n");
+ fprintf (asm_out_file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
+ fprintf (asm_out_file, "\t.previous\n");
}
}
+}
- /* Use one byte shorter RIP relative addressing for 64bit mode. */
- if (TARGET_64BIT && !base && !index && !no_rip)
+/* Output a function return. Add a REP prefix to the RET if LONG_P is
+ true and the function return is kept. */
+
+const char *
+ix86_output_function_return (bool long_p)
+{
+ output_return_instrumentation ();
+
+ if (cfun->machine->function_return_type != indirect_branch_keep)
{
- rtx symbol = disp;
+ char thunk_name[32];
+ enum indirect_thunk_prefix need_prefix
+ = indirect_thunk_need_prefix (current_output_insn);
- if (GET_CODE (disp) == CONST
- && GET_CODE (XEXP (disp, 0)) == PLUS
- && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
- symbol = XEXP (XEXP (disp, 0), 0);
+ if (cfun->machine->function_return_type
+ != indirect_branch_thunk_inline)
+ {
+ bool need_thunk = (cfun->machine->function_return_type
+ == indirect_branch_thunk);
+ indirect_thunk_name (thunk_name, INVALID_REGNUM, need_prefix,
+ true);
+ indirect_return_needed |= need_thunk;
+ fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
+ }
+ else
+ output_indirect_thunk (INVALID_REGNUM);
- if (GET_CODE (symbol) == LABEL_REF
- || (GET_CODE (symbol) == SYMBOL_REF
- && SYMBOL_REF_TLS_MODEL (symbol) == 0))
- base = pc_rtx;
+ return "";
}
- if (!base && !index)
+ if (!long_p)
+ return "%!ret";
+
+ return "rep%; ret";
+}
+
+/* Output indirect function return. RET_OP is the function return
+ target. */
+
+const char *
+ix86_output_indirect_function_return (rtx ret_op)
+{
+ if (cfun->machine->function_return_type != indirect_branch_keep)
{
- /* Displacement only requires special attention. */
- if (CONST_INT_P (disp))
+ char thunk_name[32];
+ enum indirect_thunk_prefix need_prefix
+ = indirect_thunk_need_prefix (current_output_insn);
+ unsigned int regno = REGNO (ret_op);
+ gcc_assert (regno == CX_REG);
+
+ if (cfun->machine->function_return_type
+ != indirect_branch_thunk_inline)
{
- if (ASSEMBLER_DIALECT == ASM_INTEL && ADDR_SPACE_GENERIC_P (as))
- fputs ("ds:", file);
- fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
+ bool need_thunk = (cfun->machine->function_return_type
+ == indirect_branch_thunk);
+ indirect_thunk_name (thunk_name, regno, need_prefix, true);
+
+ if (need_thunk)
+ {
+ indirect_return_via_cx = true;
+ indirect_thunks_used |= 1 << CX_REG;
+ }
+ fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
}
- /* Load the external function address via the GOT slot to avoid PLT. */
- else if (GET_CODE (disp) == CONST
- && GET_CODE (XEXP (disp, 0)) == UNSPEC
- && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
- || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
- && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
- output_pic_addr_const (file, disp, 0);
- else if (flag_pic)
- output_pic_addr_const (file, disp, 0);
else
- output_addr_const (file, disp);
+ output_indirect_thunk (regno);
+
+ return "";
}
else
+ return "%!jmp\t%A0";
+}
+
+/* Output the assembly for a call instruction. */
+
+const char *
+ix86_output_call_insn (rtx_insn *insn, rtx call_op)
+{
+ bool direct_p = constant_call_address_operand (call_op, VOIDmode);
+ bool output_indirect_p
+ = (!TARGET_SEH
+ && cfun->machine->indirect_branch_type != indirect_branch_keep);
+ bool seh_nop_p = false;
+ const char *xasm;
+
+ if (SIBLING_CALL_P (insn))
{
- /* Print SImode register names to force addr32 prefix. */
- if (SImode_address_operand (addr, VOIDmode))
+ output_return_instrumentation ();
+ if (direct_p)
{
- if (flag_checking)
+ if (ix86_nopic_noplt_attribute_p (call_op))
{
- gcc_assert (TARGET_64BIT);
- switch (GET_CODE (addr))
+ direct_p = false;
+ if (TARGET_64BIT)
{
- case SUBREG:
- gcc_assert (GET_MODE (addr) == SImode);
- gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
- break;
- case ZERO_EXTEND:
- case AND:
- gcc_assert (GET_MODE (addr) == DImode);
- break;
- default:
- gcc_unreachable ();
+ if (output_indirect_p)
+ xasm = "{%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
+ else
+ xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
+ }
+ else
+ {
+ if (output_indirect_p)
+ xasm = "{%p0@GOT|[DWORD PTR %p0@GOT]}";
+ else
+ xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
}
}
- gcc_assert (!code);
- code = 'k';
+ else
+ xasm = "%!jmp\t%P0";
}
- else if (code == 0
- && TARGET_X32
- && disp
- && CONST_INT_P (disp)
- && INTVAL (disp) < -16*1024*1024)
+ /* SEH epilogue detection requires the indirect branch case
+ to include REX.W. */
+ else if (TARGET_SEH)
+ xasm = "%!rex.W jmp\t%A0";
+ else
{
- /* X32 runs in 64-bit mode, where displacement, DISP, in
- address DISP(%r64), is encoded as 32-bit immediate sign-
- extended from 32-bit to 64-bit. For -0x40000300(%r64),
- address is %r64 + 0xffffffffbffffd00. When %r64 <
- 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
- which is invalid for x32. The correct address is %r64
- - 0x40000300 == 0xf7ffdd64. To properly encode
- -0x40000300(%r64) for x32, we zero-extend negative
- displacement by forcing addr32 prefix which truncates
- 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
- zero-extend all negative displacements, including -1(%rsp).
- However, for small negative displacements, sign-extension
- won't cause overflow. We only zero-extend negative
- displacements if they < -16*1024*1024, which is also used
- to check legitimate address displacements for PIC. */
- code = 'k';
+ if (output_indirect_p)
+ xasm = "%0";
+ else
+ xasm = "%!jmp\t%A0";
}
- /* Since the upper 32 bits of RSP are always zero for x32,
- we can encode %esp as %rsp to avoid 0x67 prefix if
- there is no index register. */
- if (TARGET_X32 && Pmode == SImode
- && !index && base && REG_P (base) && REGNO (base) == SP_REG)
- code = 'q';
+ if (output_indirect_p && !direct_p)
+ ix86_output_indirect_branch (call_op, xasm, true);
+ else
+ output_asm_insn (xasm, &call_op);
+ return "";
+ }
- if (ASSEMBLER_DIALECT == ASM_ATT)
+ /* SEH unwinding can require an extra nop to be emitted in several
+ circumstances. Determine if we have one of those. */
+ if (TARGET_SEH)
+ {
+ rtx_insn *i;
+
+ for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
{
- if (disp)
+ /* Prevent a catch region from being adjacent to a jump that would
+ be interpreted as an epilogue sequence by the unwinder. */
+ if (JUMP_P(i) && CROSSING_JUMP_P (i))
{
- if (flag_pic)
- output_pic_addr_const (file, disp, 0);
- else if (GET_CODE (disp) == LABEL_REF)
- output_asm_label (disp);
- else
- output_addr_const (file, disp);
+ seh_nop_p = true;
+ break;
}
+
+ /* If we get to another real insn, we don't need the nop. */
+ if (INSN_P (i))
+ break;
- putc ('(', file);
- if (base)
- print_reg (base, code, file);
- if (index)
+ /* If we get to the epilogue note, prevent a catch region from
+ being adjacent to the standard epilogue sequence. With non-call
+ exceptions, we'll have done this during epilogue emission. */
+ if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
+ && !flag_non_call_exceptions
+ && !can_throw_internal (insn))
{
- putc (',', file);
- print_reg (index, vsib ? 0 : code, file);
- if (scale != 1 || vsib)
- fprintf (file, ",%d", scale);
+ seh_nop_p = true;
+ break;
}
- putc (')', file);
}
- else
- {
- rtx offset = NULL_RTX;
- if (disp)
- {
- /* Pull out the offset of a symbol; print any symbol itself. */
- if (GET_CODE (disp) == CONST
- && GET_CODE (XEXP (disp, 0)) == PLUS
- && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
- {
- offset = XEXP (XEXP (disp, 0), 1);
- disp = gen_rtx_CONST (VOIDmode,
- XEXP (XEXP (disp, 0), 0));
- }
+ /* If we didn't find a real insn following the call, prevent the
+ unwinder from looking into the next function. */
+ if (i == NULL)
+ seh_nop_p = true;
+ }
- if (flag_pic)
- output_pic_addr_const (file, disp, 0);
- else if (GET_CODE (disp) == LABEL_REF)
- output_asm_label (disp);
- else if (CONST_INT_P (disp))
- offset = disp;
- else
- output_addr_const (file, disp);
- }
-
- putc ('[', file);
- if (base)
+ if (direct_p)
+ {
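+ /* With the noplt attribute (and no PIC), the call must go through
+ the GOT entry rather than the PLT, so emit it as an indirect
+ call via the GOT slot. */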
+ if (ix86_nopic_noplt_attribute_p (call_op))
+ {
+ direct_p = false;
+ if (TARGET_64BIT)
{
- print_reg (base, code, file);
- if (offset)
- {
- if (INTVAL (offset) >= 0)
- putc ('+', file);
- fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
- }
+ if (output_indirect_p)
+ xasm = "{%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
+ else
+ xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
}
- else if (offset)
- fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
else
- putc ('0', file);
-
- if (index)
{
- putc ('+', file);
- print_reg (index, vsib ? 0 : code, file);
- if (scale != 1 || vsib)
- fprintf (file, "*%d", scale);
+ if (output_indirect_p)
+ xasm = "{%p0@GOT|[DWORD PTR %p0@GOT]}";
+ else
+ xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
}
- putc (']', file);
}
+ else
+ xasm = "%!call\t%P0";
+ }
+ else
+ {
+ if (output_indirect_p)
+ xasm = "%0";
+ else
+ xasm = "%!call\t%A0";
}
-}
-static void
-ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
-{
- ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
+ if (output_indirect_p && !direct_p)
+ ix86_output_indirect_branch (call_op, xasm, false);
+ else
+ output_asm_insn (xasm, &call_op);
+
+ if (seh_nop_p)
+ return "nop";
+
+ return "";
}
+\f
+/* Return a MEM corresponding to a stack slot with mode MODE.
+ Allocate a new slot if necessary.
-/* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
+ The RTL for a function can have several slots available: N is
+ which slot to use. */
-static bool
-i386_asm_output_addr_const_extra (FILE *file, rtx x)
+rtx
+assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
{
- rtx op;
+ struct stack_local_entry *s;
- if (GET_CODE (x) != UNSPEC)
- return false;
+ gcc_assert (n < MAX_386_STACK_LOCALS);
- op = XVECEXP (x, 0, 0);
- switch (XINT (x, 1))
- {
- case UNSPEC_GOTOFF:
- output_addr_const (file, op);
- fputs ("@gotoff", file);
- break;
- case UNSPEC_GOTTPOFF:
- output_addr_const (file, op);
- /* FIXME: This might be @TPOFF in Sun ld. */
- fputs ("@gottpoff", file);
- break;
- case UNSPEC_TPOFF:
- output_addr_const (file, op);
- fputs ("@tpoff", file);
- break;
- case UNSPEC_NTPOFF:
- output_addr_const (file, op);
- if (TARGET_64BIT)
- fputs ("@tpoff", file);
- else
- fputs ("@ntpoff", file);
- break;
- case UNSPEC_DTPOFF:
- output_addr_const (file, op);
- fputs ("@dtpoff", file);
- break;
- case UNSPEC_GOTNTPOFF:
- output_addr_const (file, op);
- if (TARGET_64BIT)
- fputs (ASSEMBLER_DIALECT == ASM_ATT ?
- "@gottpoff(%rip)" : "@gottpoff[rip]", file);
- else
- fputs ("@gotntpoff", file);
- break;
- case UNSPEC_INDNTPOFF:
- output_addr_const (file, op);
- fputs ("@indntpoff", file);
- break;
-#if TARGET_MACHO
- case UNSPEC_MACHOPIC_OFFSET:
- output_addr_const (file, op);
- putc ('-', file);
- machopic_output_function_base_name (file);
- break;
-#endif
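+ /* Reuse an existing slot if one with the same mode and slot number
+ has already been allocated. */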
+ for (s = ix86_stack_locals; s; s = s->next)
+ if (s->mode == mode && s->n == n)
+ return validize_mem (copy_rtx (s->rtl));
- default:
- return false;
- }
+ s = ggc_alloc<stack_local_entry> ();
+ s->n = n;
+ s->mode = mode;
+ s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
- return true;
+ s->next = ix86_stack_locals;
+ ix86_stack_locals = s;
+ return validize_mem (copy_rtx (s->rtl));
}
-\f
-/* Split one or more double-mode RTL references into pairs of half-mode
- references. The RTL can be REG, offsettable MEM, integer constant, or
- CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
- split and "num" is its length. lo_half and hi_half are output arrays
- that parallel "operands". */
-void
-split_double_mode (machine_mode mode, rtx operands[],
- int num, rtx lo_half[], rtx hi_half[])
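+/* Run instantiate_decl_rtl on each cached stack slot so that any
+ virtual registers remaining in their RTL are instantiated. */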
+static void
+ix86_instantiate_decls (void)
{
- machine_mode half_mode;
- unsigned int byte;
+ struct stack_local_entry *s;
- switch (mode)
- {
- case E_TImode:
- half_mode = DImode;
- break;
- case E_DImode:
- half_mode = SImode;
- break;
- default:
- gcc_unreachable ();
- }
+ for (s = ix86_stack_locals; s; s = s->next)
+ if (s->rtl != NULL_RTX)
+ instantiate_decl_rtl (s->rtl);
+}
+\f
+/* Check whether x86 address PARTS is a pc-relative address. */
- byte = GET_MODE_SIZE (half_mode);
+bool
+ix86_rip_relative_addr_p (struct ix86_address *parts)
+{
+ rtx base, index, disp;
- while (num--)
- {
- rtx op = operands[num];
+ base = parts->base;
+ index = parts->index;
+ disp = parts->disp;
- /* simplify_subreg refuse to split volatile memory addresses,
- but we still have to handle it. */
- if (MEM_P (op))
- {
- lo_half[num] = adjust_address (op, half_mode, 0);
- hi_half[num] = adjust_address (op, half_mode, byte);
- }
- else
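+ /* A RIP-relative address is a bare displacement with no base or
+ index register. */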
+ if (disp && !base && !index)
+ {
+ if (TARGET_64BIT)
{
- lo_half[num] = simplify_gen_subreg (half_mode, op,
- GET_MODE (op) == VOIDmode
- ? mode : GET_MODE (op), 0);
- hi_half[num] = simplify_gen_subreg (half_mode, op,
- GET_MODE (op) == VOIDmode
- ? mode : GET_MODE (op), byte);
+ rtx symbol = disp;
+
+ if (GET_CODE (disp) == CONST)
+ symbol = XEXP (disp, 0);
+ if (GET_CODE (symbol) == PLUS
+ && CONST_INT_P (XEXP (symbol, 1)))
+ symbol = XEXP (symbol, 0);
+
+ if (GET_CODE (symbol) == LABEL_REF
+ || (GET_CODE (symbol) == SYMBOL_REF
+ && SYMBOL_REF_TLS_MODEL (symbol) == 0)
+ || (GET_CODE (symbol) == UNSPEC
+ && (XINT (symbol, 1) == UNSPEC_GOTPCREL
+ || XINT (symbol, 1) == UNSPEC_PCREL
+ || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
+ return true;
}
}
+ return false;
}
-\f
-/* Output code to perform a 387 binary operation in INSN, one of PLUS,
- MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
- is the expression of the binary operation. The output may either be
- emitted here, or returned to the caller, like all output_* functions.
-
- There is no guarantee that the operands are the same mode, as they
- might be within FLOAT or FLOAT_EXTEND expressions. */
-#ifndef SYSV386_COMPAT
-/* Set to 1 for compatibility with brain-damaged assemblers. No-one
- wants to fix the assemblers because that causes incompatibility
- with gcc. No-one wants to fix gcc because that causes
- incompatibility with assemblers... You can use the option of
- -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
-#define SYSV386_COMPAT 1
-#endif
+/* Calculate the length of the memory address in the instruction encoding.
+ This includes the addr32 prefix, but does not include the one-byte modrm,
+ opcode, or other prefixes. We never generate an addr32 prefix for the LEA insn. */
-const char *
-output_387_binary_op (rtx_insn *insn, rtx *operands)
+int
+memory_address_length (rtx addr, bool lea)
{
- static char buf[40];
- const char *p;
- bool is_sse
- = (SSE_REG_P (operands[0])
- || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]));
-
- if (is_sse)
- p = "%v";
- else if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
- || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
- p = "fi";
- else
- p = "f";
+ struct ix86_address parts;
+ rtx base, index, disp;
+ int len;
+ int ok;
- strcpy (buf, p);
+ if (GET_CODE (addr) == PRE_DEC
+ || GET_CODE (addr) == POST_INC
+ || GET_CODE (addr) == PRE_MODIFY
+ || GET_CODE (addr) == POST_MODIFY)
+ return 0;
- switch (GET_CODE (operands[3]))
- {
- case PLUS:
- p = "add"; break;
- case MINUS:
- p = "sub"; break;
- case MULT:
- p = "mul"; break;
- case DIV:
- p = "div"; break;
- default:
- gcc_unreachable ();
- }
+ ok = ix86_decompose_address (addr, &parts);
+ gcc_assert (ok);
- strcat (buf, p);
+ len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
- if (is_sse)
- {
- p = (GET_MODE (operands[0]) == SFmode) ? "ss" : "sd";
- strcat (buf, p);
+ /* If this is not an LEA instruction, add the length of the addr32 prefix. */
+ if (TARGET_64BIT && !lea
+ && (SImode_address_operand (addr, VOIDmode)
+ || (parts.base && GET_MODE (parts.base) == SImode)
+ || (parts.index && GET_MODE (parts.index) == SImode)))
+ len++;
- if (TARGET_AVX)
- p = "\t{%2, %1, %0|%0, %1, %2}";
- else
- p = "\t{%2, %0|%0, %2}";
+ base = parts.base;
+ index = parts.index;
+ disp = parts.disp;
- strcat (buf, p);
- return buf;
- }
+ if (base && SUBREG_P (base))
+ base = SUBREG_REG (base);
+ if (index && SUBREG_P (index))
+ index = SUBREG_REG (index);
- /* Even if we do not want to check the inputs, this documents input
- constraints. Which helps in understanding the following code. */
- if (flag_checking)
+ gcc_assert (base == NULL_RTX || REG_P (base));
+ gcc_assert (index == NULL_RTX || REG_P (index));
+
+ /* Rule of thumb:
+ - esp as the base always wants an index,
+ - ebp as the base always wants a displacement,
+ - r12 as the base always wants an index,
+ - r13 as the base always wants a displacement. */
+
+ /* Register Indirect. */
+ if (base && !index && !disp)
{
- if (STACK_REG_P (operands[0])
- && ((REG_P (operands[1])
- && REGNO (operands[0]) == REGNO (operands[1])
- && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
- || (REG_P (operands[2])
- && REGNO (operands[0]) == REGNO (operands[2])
- && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
- && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
- ; /* ok */
- else
- gcc_unreachable ();
+ /* esp (for its index) and ebp (for its displacement) need
+ the two-byte modrm form. Similarly for r12 and r13 in 64-bit
+ code. */
+ if (base == arg_pointer_rtx
+ || base == frame_pointer_rtx
+ || REGNO (base) == SP_REG
+ || REGNO (base) == BP_REG
+ || REGNO (base) == R12_REG
+ || REGNO (base) == R13_REG)
+ len++;
}
- switch (GET_CODE (operands[3]))
+ /* Direct Addressing. In 64-bit mode, mod 00 r/m 5
+ is not disp32 but disp32(%rip), so for disp32 a
+ SIB byte is needed, unless print_operand_address
+ optimizes it into disp32(%rip) or (%rip) is implied
+ by an UNSPEC. */
+ else if (disp && !base && !index)
{
- case MULT:
- case PLUS:
- if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
- std::swap (operands[1], operands[2]);
-
- /* know operands[0] == operands[1]. */
-
- if (MEM_P (operands[2]))
- {
- p = "%Z2\t%2";
- break;
- }
-
- if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
+ len += 4;
+ if (!ix86_rip_relative_addr_p (&parts))
+ len++;
+ }
+ else
+ {
+ /* Find the length of the displacement constant. */
+ if (disp)
{
- if (STACK_TOP_P (operands[0]))
- /* How is it that we are storing to a dead operand[2]?
- Well, presumably operands[1] is dead too. We can't
- store the result to st(0) as st(0) gets popped on this
- instruction. Instead store to operands[2] (which I
- think has to be st(1)). st(1) will be popped later.
- gcc <= 2.8.1 didn't have this check and generated
- assembly code that the Unixware assembler rejected. */
- p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
+ if (base && satisfies_constraint_K (disp))
+ len += 1;
else
- p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
- break;
+ len += 4;
}
+ /* ebp always wants a displacement. Similarly r13. */
+ else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
+ len++;
- if (STACK_TOP_P (operands[0]))
- p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
- else
- p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
- break;
+ /* An index requires the two-byte modrm form.... */
+ if (index
+ /* ...like esp (or r12), which always wants an index. */
+ || base == arg_pointer_rtx
+ || base == frame_pointer_rtx
+ || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
+ len++;
+ }
- case MINUS:
- case DIV:
- if (MEM_P (operands[1]))
- {
- p = "r%Z1\t%1";
- break;
- }
+ return len;
+}
- if (MEM_P (operands[2]))
- {
- p = "%Z2\t%2";
- break;
- }
+/* Compute the default value for the "length_immediate" attribute. When SHORTFORM
+ is set, expect that the insn has an 8-bit immediate alternative. */
+int
+ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
+{
+ int len = 0;
+ int i;
+ extract_insn_cached (insn);
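+ /* Scan the operands for an immediate; at most one constant operand
+ is expected (the assert below checks this). */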
+ for (i = recog_data.n_operands - 1; i >= 0; --i)
+ if (CONSTANT_P (recog_data.operand[i]))
+ {
+ enum attr_mode mode = get_attr_mode (insn);
- if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
- {
-#if SYSV386_COMPAT
- /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
- derived assemblers, confusingly reverse the direction of
- the operation for fsub{r} and fdiv{r} when the
- destination register is not st(0). The Intel assembler
- doesn't have this brain damage. Read !SYSV386_COMPAT to
- figure out what the hardware really does. */
- if (STACK_TOP_P (operands[0]))
- p = "{p\t%0, %2|rp\t%2, %0}";
- else
- p = "{rp\t%2, %0|p\t%0, %2}";
-#else
- if (STACK_TOP_P (operands[0]))
- /* As above for fmul/fadd, we can't store to st(0). */
- p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
- else
- p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
-#endif
- break;
+ gcc_assert (!len);
+ if (shortform && CONST_INT_P (recog_data.operand[i]))
+ {
+ HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
+ switch (mode)
+ {
+ case MODE_QI:
+ len = 1;
+ continue;
+ case MODE_HI:
+ ival = trunc_int_for_mode (ival, HImode);
+ break;
+ case MODE_SI:
+ ival = trunc_int_for_mode (ival, SImode);
+ break;
+ default:
+ break;
+ }
+ if (IN_RANGE (ival, -128, 127))
+ {
+ len = 1;
+ continue;
+ }
+ }
+ switch (mode)
+ {
+ case MODE_QI:
+ len = 1;
+ break;
+ case MODE_HI:
+ len = 2;
+ break;
+ case MODE_SI:
+ len = 4;
+ break;
+ /* Immediates for DImode instructions are encoded
+ as 32-bit sign-extended values. */
+ case MODE_DI:
+ len = 4;
+ break;
+ default:
+ fatal_insn ("unknown insn mode", insn);
}
+ }
+ return len;
+}
- if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
- {
-#if SYSV386_COMPAT
- if (STACK_TOP_P (operands[0]))
- p = "{rp\t%0, %1|p\t%1, %0}";
- else
- p = "{p\t%1, %0|rp\t%0, %1}";
-#else
- if (STACK_TOP_P (operands[0]))
- p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
- else
- p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
-#endif
- break;
- }
+/* Compute the default value for the "length_address" attribute. */
+int
+ix86_attr_length_address_default (rtx_insn *insn)
+{
+ int i;
- if (STACK_TOP_P (operands[0]))
+ if (get_attr_type (insn) == TYPE_LEA)
+ {
+ rtx set = PATTERN (insn), addr;
+
+ if (GET_CODE (set) == PARALLEL)
+ set = XVECEXP (set, 0, 0);
+
+ gcc_assert (GET_CODE (set) == SET);
+
+ addr = SET_SRC (set);
+
+ return memory_address_length (addr, true);
+ }
+
+ extract_insn_cached (insn);
+ for (i = recog_data.n_operands - 1; i >= 0; --i)
+ {
+ rtx op = recog_data.operand[i];
+ if (MEM_P (op))
{
- if (STACK_TOP_P (operands[1]))
- p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
- else
- p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
- break;
- }
- else if (STACK_TOP_P (operands[1]))
- {
-#if SYSV386_COMPAT
- p = "{\t%1, %0|r\t%0, %1}";
-#else
- p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
-#endif
- }
- else
- {
-#if SYSV386_COMPAT
- p = "{r\t%2, %0|\t%0, %2}";
-#else
- p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
-#endif
- }
- break;
+ constrain_operands_cached (insn, reload_completed);
+ if (which_alternative != -1)
+ {
+ const char *constraints = recog_data.constraints[i];
+ int alt = which_alternative;
- default:
- gcc_unreachable ();
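+ /* Skip '=' and '+' modifiers, then advance to the constraint
+ string for the selected alternative. */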
+ while (*constraints == '=' || *constraints == '+')
+ constraints++;
+ while (alt-- > 0)
+ while (*constraints++ != ',')
+ ;
+ /* Skip ignored operands. */
+ if (*constraints == 'X')
+ continue;
+ }
+
+ int len = memory_address_length (XEXP (op, 0), false);
+
+ /* Account for segment prefix for non-default addr spaces. */
+ if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
+ len++;
+
+ return len;
+ }
}
+ return 0;
+}
- strcat (buf, p);
- return buf;
+/* Compute the default value for the "length_vex" attribute. It includes
+ the 2- or 3-byte VEX prefix and 1 opcode byte. */
+
+int
+ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
+ bool has_vex_w)
+{
+ int i;
+
+ /* Only the 0f opcode can use the 2-byte VEX prefix; the VEX W bit
+ requires the 3-byte VEX prefix. */
+ if (!has_0f_opcode || has_vex_w)
+ return 3 + 1;
+
+ /* We can always use the 2-byte VEX prefix in 32-bit mode. */
+ if (!TARGET_64BIT)
+ return 2 + 1;
+
+ extract_insn_cached (insn);
+
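+ /* Scan the operands: a DImode general register needs REX.W, and a
+ memory operand that mentions an extended register needs REX.X or
+ REX.B; either case forces the 3-byte VEX prefix. */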
+ for (i = recog_data.n_operands - 1; i >= 0; --i)
+ if (REG_P (recog_data.operand[i]))
+ {
+ /* The REX.W bit requires the 3-byte VEX prefix. */
+ if (GET_MODE (recog_data.operand[i]) == DImode
+ && GENERAL_REG_P (recog_data.operand[i]))
+ return 3 + 1;
+ }
+ else
+ {
+ /* The REX.X or REX.B bits require the 3-byte VEX prefix. */
+ if (MEM_P (recog_data.operand[i])
+ && x86_extended_reg_mentioned_p (recog_data.operand[i]))
+ return 3 + 1;
+ }
+
+ return 2 + 1;
}
+\f
-/* Return needed mode for entity in optimize_mode_switching pass. */
+static bool
+ix86_class_likely_spilled_p (reg_class_t);
-static int
-ix86_dirflag_mode_needed (rtx_insn *insn)
+/* Return true if the lhs of INSN is a HW function argument register; set
+ IS_SPILLED to true if it is a likely-spilled HW register. */
+static bool
+insn_is_function_arg (rtx insn, bool* is_spilled)
{
+ rtx dst;
+
+ if (!NONDEBUG_INSN_P (insn))
+ return false;
+ /* Call instructions are not movable; ignore them. */
if (CALL_P (insn))
+ return false;
+ insn = PATTERN (insn);
+ if (GET_CODE (insn) == PARALLEL)
+ insn = XVECEXP (insn, 0, 0);
+ if (GET_CODE (insn) != SET)
+ return false;
+ dst = SET_DEST (insn);
+ if (REG_P (dst) && HARD_REGISTER_P (dst)
+ && ix86_function_arg_regno_p (REGNO (dst)))
{
- if (cfun->machine->func_type == TYPE_NORMAL)
- return X86_DIRFLAG_ANY;
- else
- /* No need to emit CLD in interrupt handler for TARGET_CLD. */
- return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
+ /* Is it a likely-spilled HW register? */
+ if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
+ && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
+ *is_spilled = true;
+ return true;
}
+ return false;
+}
- if (recog_memoized (insn) < 0)
- return X86_DIRFLAG_ANY;
+/* Add output dependencies for a chain of adjacent function arguments, but only
+ if there is a move to a likely-spilled HW register. Return the first argument
+ if at least one dependence was added, or NULL otherwise. */
+static rtx_insn *
+add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
+{
+ rtx_insn *insn;
+ rtx_insn *last = call;
+ rtx_insn *first_arg = NULL;
+ bool is_spilled = false;
- if (get_attr_type (insn) == TYPE_STR)
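+ /* Step HEAD back by one insn; the backward walks below stop when they
+ reach it, so the original HEAD itself is still examined. */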
+ head = PREV_INSN (head);
+
+ /* Find the argument-passing instruction nearest to the call. */
+ while (true)
{
- /* Emit cld instruction if stringops are used in the function. */
- if (cfun->machine->func_type == TYPE_NORMAL)
- return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
- else
- return X86_DIRFLAG_RESET;
+ last = PREV_INSN (last);
+ if (last == head)
+ return NULL;
+ if (!NONDEBUG_INSN_P (last))
+ continue;
+ if (insn_is_function_arg (last, &is_spilled))
+ break;
+ return NULL;
}
- return X86_DIRFLAG_ANY;
+ first_arg = last;
+ while (true)
+ {
+ insn = PREV_INSN (last);
+ if (!INSN_P (insn))
+ break;
+ if (insn == head)
+ break;
+ if (!NONDEBUG_INSN_P (insn))
+ {
+ last = insn;
+ continue;
+ }
+ if (insn_is_function_arg (insn, &is_spilled))
+ {
+ /* Add an output dependence between two function arguments if the chain
+ of output arguments contains likely-spilled HW registers. */
+ if (is_spilled)
+ add_dependence (first_arg, insn, REG_DEP_OUTPUT);
+ first_arg = last = insn;
+ }
+ else
+ break;
+ }
+ if (!is_spilled)
+ return NULL;
+ return first_arg;
}
-/* Check if a 256bit or 512 bit AVX register is referenced inside of EXP. */
-
-static bool
-ix86_check_avx_upper_register (const_rtx exp)
+/* Add an output or anti dependency from INSN to FIRST_ARG to restrict its code
+ motion. */
+static void
+avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
{
- return SSE_REG_P (exp) && GET_MODE_BITSIZE (GET_MODE (exp)) > 128;
-}
+ rtx set;
+ rtx tmp;
-/* Return needed mode for entity in optimize_mode_switching pass. */
+ set = single_set (insn);
+ if (!set)
+ return;
+ tmp = SET_DEST (set);
+ if (REG_P (tmp))
+ {
+ /* Add output dependency to the first function argument. */
+ add_dependence (first_arg, insn, REG_DEP_OUTPUT);
+ return;
+ }
+ /* Add anti dependency. */
+ add_dependence (first_arg, insn, REG_DEP_ANTI);
+}
-static int
-ix86_avx_u128_mode_needed (rtx_insn *insn)
+/* Avoid cross-block motion of a function argument by adding a dependency
+ from the first non-jump instruction in BB. */
+static void
+add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
{
- if (CALL_P (insn))
- {
- rtx link;
+ rtx_insn *insn = BB_END (bb);
- /* Needed mode is set to AVX_U128_CLEAN if there are
- no 256bit or 512bit modes used in function arguments. */
- for (link = CALL_INSN_FUNCTION_USAGE (insn);
- link;
- link = XEXP (link, 1))
+ while (insn)
+ {
+ if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
{
- if (GET_CODE (XEXP (link, 0)) == USE)
+ rtx set = single_set (insn);
+ if (set)
{
- rtx arg = XEXP (XEXP (link, 0), 0);
-
- if (ix86_check_avx_upper_register (arg))
- return AVX_U128_DIRTY;
+ avoid_func_arg_motion (arg, insn);
+ return;
}
}
-
- return AVX_U128_CLEAN;
+ if (insn == BB_HEAD (bb))
+ return;
+ insn = PREV_INSN (insn);
}
-
- /* Require DIRTY mode if a 256bit or 512bit AVX register is referenced.
- Hardware changes state only when a 256bit register is written to,
- but we need to prevent the compiler from moving optimal insertion
- point above eventual read from 256bit or 512 bit register. */
- subrtx_iterator::array_type array;
- FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
- if (ix86_check_avx_upper_register (*iter))
- return AVX_U128_DIRTY;
-
- return AVX_U128_ANY;
}
-/* Return mode that i387 must be switched into
- prior to the execution of insn. */
-
-static int
-ix86_i387_mode_needed (int entity, rtx_insn *insn)
+/* Hook for pre-reload schedule - avoid motion of function arguments
+ passed in likely-spilled HW registers. */
+static void
+ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
{
- enum attr_i387_cw mode;
-
- /* The mode UNINITIALIZED is used to store control word after a
- function call or ASM pattern. The mode ANY specify that function
- has no requirements on the control word and make no changes in the
- bits we are interested in. */
-
- if (CALL_P (insn)
- || (NONJUMP_INSN_P (insn)
- && (asm_noperands (PATTERN (insn)) >= 0
- || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
- return I387_CW_UNINITIALIZED;
-
- if (recog_memoized (insn) < 0)
- return I387_CW_ANY;
+ rtx_insn *insn;
+ rtx_insn *first_arg = NULL;
+ if (reload_completed)
+ return;
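+ /* Skip over any leading debug insns. */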
+ while (head != tail && DEBUG_INSN_P (head))
+ head = NEXT_INSN (head);
+ for (insn = tail; insn != head; insn = PREV_INSN (insn))
+ if (INSN_P (insn) && CALL_P (insn))
+ {
+ first_arg = add_parameter_dependencies (insn, head);
+ if (first_arg)
+ {
+ /* Add a dependee for the first argument to predecessors, but only if
+ the region contains more than one block. */
+ basic_block bb = BLOCK_FOR_INSN (insn);
+ int rgn = CONTAINING_RGN (bb->index);
+ int nr_blks = RGN_NR_BLOCKS (rgn);
+ /* Skip trivial regions and region head blocks that can have
+ predecessors outside of the region. */
+ if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
+ {
+ edge e;
+ edge_iterator ei;
- mode = get_attr_i387_cw (insn);
+ /* Regions are SCCs with the exception of selective
+ scheduling with pipelining of outer blocks enabled.
+ So also check that immediate predecessors of a non-head
+ block are in the same region. */
+ FOR_EACH_EDGE (e, ei, bb->preds)
+ {
+ /* Avoid creating loop-carried dependencies by
+ using the topological ordering in the region. */
+ if (rgn == CONTAINING_RGN (e->src->index)
+ && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
+ add_dependee_for_func_arg (first_arg, e->src);
+ }
+ }
+ insn = first_arg;
+ if (insn == head)
+ break;
+ }
+ }
+ else if (first_arg)
+ avoid_func_arg_motion (first_arg, insn);
+}
- switch (entity)
- {
- case I387_TRUNC:
- if (mode == I387_CW_TRUNC)
- return mode;
- break;
+/* Hook for pre-reload schedule - set the priority of moves from likely-spilled
+ HW registers to the maximum, to schedule them as soon as possible. These are
+ moves from function argument registers at the top of the function entry
+ and moves from function return value registers after a call. */
+static int
+ix86_adjust_priority (rtx_insn *insn, int priority)
+{
+ rtx set;
- case I387_FLOOR:
- if (mode == I387_CW_FLOOR)
- return mode;
- break;
+ if (reload_completed)
+ return priority;
- case I387_CEIL:
- if (mode == I387_CW_CEIL)
- return mode;
- break;
+ if (!NONDEBUG_INSN_P (insn))
+ return priority;
- default:
- gcc_unreachable ();
+ set = single_set (insn);
+ if (set)
+ {
+ rtx tmp = SET_SRC (set);
+ if (REG_P (tmp)
+ && HARD_REGISTER_P (tmp)
+ && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
+ && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
+ return current_sched_info->sched_max_insns_priority;
}
- return I387_CW_ANY;
+ return priority;
}
-/* Return mode that entity must be switched into
- prior to the execution of insn. */
-
-static int
-ix86_mode_needed (int entity, rtx_insn *insn)
+/* Prepare for scheduling pass. */
+static void
+ix86_sched_init_global (FILE *, int, int)
{
- switch (entity)
+ /* Install scheduling hooks for current CPU. Some of these hooks are used
+ in time-critical parts of the scheduler, so we only set them up when
+ they are actually used. */
+ switch (ix86_tune)
{
- case X86_DIRFLAG:
- return ix86_dirflag_mode_needed (insn);
- case AVX_U128:
- return ix86_avx_u128_mode_needed (insn);
- case I387_TRUNC:
- case I387_FLOOR:
- case I387_CEIL:
- return ix86_i387_mode_needed (entity, insn);
+ case PROCESSOR_CORE2:
+ case PROCESSOR_NEHALEM:
+ case PROCESSOR_SANDYBRIDGE:
+ case PROCESSOR_HASWELL:
+ case PROCESSOR_GENERIC:
+ /* Do not perform multipass scheduling for pre-reload schedule
+ to save compile time. */
+ if (reload_completed)
+ {
+ ix86_core2i7_init_hooks ();
+ break;
+ }
+ /* Fall through. */
default:
- gcc_unreachable ();
+ targetm.sched.dfa_post_advance_cycle = NULL;
+ targetm.sched.first_cycle_multipass_init = NULL;
+ targetm.sched.first_cycle_multipass_begin = NULL;
+ targetm.sched.first_cycle_multipass_issue = NULL;
+ targetm.sched.first_cycle_multipass_backtrack = NULL;
+ targetm.sched.first_cycle_multipass_end = NULL;
+ targetm.sched.first_cycle_multipass_fini = NULL;
+ break;
}
- return 0;
}
-/* Check if a 256bit or 512bit AVX register is referenced in stores. */
-
-static void
-ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data)
- {
- if (ix86_check_avx_upper_register (dest))
- {
- bool *used = (bool *) data;
- *used = true;
- }
- }
-
-/* Calculate mode of upper 128bit AVX registers after the insn. */
+\f
+/* Implement TARGET_STATIC_RTX_ALIGNMENT. */
-static int
-ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
+static HOST_WIDE_INT
+ix86_static_rtx_alignment (machine_mode mode)
{
- rtx pat = PATTERN (insn);
+ if (mode == DFmode)
+ return 64;
+ if (ALIGN_MODE_128 (mode))
+ return MAX (128, GET_MODE_ALIGNMENT (mode));
+ return GET_MODE_ALIGNMENT (mode);
+}
- if (vzeroupper_pattern (pat, VOIDmode)
- || vzeroall_pattern (pat, VOIDmode))
- return AVX_U128_CLEAN;
+/* Implement TARGET_CONSTANT_ALIGNMENT. */
- /* We know that state is clean after CALL insn if there are no
- 256bit or 512bit registers used in the function return register. */
- if (CALL_P (insn))
+static HOST_WIDE_INT
+ix86_constant_alignment (const_tree exp, HOST_WIDE_INT align)
+{
+ if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
+ || TREE_CODE (exp) == INTEGER_CST)
{
- bool avx_upper_reg_found = false;
- note_stores (pat, ix86_check_avx_upper_stores, &avx_upper_reg_found);
-
- return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
+ machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
+ HOST_WIDE_INT mode_align = ix86_static_rtx_alignment (mode);
+ return MAX (mode_align, align);
}
+ else if (!optimize_size && TREE_CODE (exp) == STRING_CST
+ && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
+ return BITS_PER_WORD;
- /* Otherwise, return current mode. Remember that if insn
- references AVX 256bit or 512bit registers, the mode was already
- changed to DIRTY from MODE_NEEDED. */
- return mode;
+ return align;
}
-/* Return the mode that an insn results in. */
+/* Implement TARGET_EMPTY_RECORD_P. */
-static int
-ix86_mode_after (int entity, int mode, rtx_insn *insn)
+static bool
+ix86_is_empty_record (const_tree type)
{
- switch (entity)
- {
- case X86_DIRFLAG:
- return mode;
- case AVX_U128:
- return ix86_avx_u128_mode_after (mode, insn);
- case I387_TRUNC:
- case I387_FLOOR:
- case I387_CEIL:
- return mode;
- default:
- gcc_unreachable ();
- }
+ if (!TARGET_64BIT)
+ return false;
+ return default_is_empty_record (type);
}
-static int
-ix86_dirflag_mode_entry (void)
-{
- /* For TARGET_CLD or in the interrupt handler we can't assume
- direction flag state at function entry. */
- if (TARGET_CLD
- || cfun->machine->func_type != TYPE_NORMAL)
- return X86_DIRFLAG_ANY;
-
- return X86_DIRFLAG_RESET;
-}
+/* Implement TARGET_WARN_PARAMETER_PASSING_ABI. */
-static int
-ix86_avx_u128_mode_entry (void)
+static void
+ix86_warn_parameter_passing_abi (cumulative_args_t cum_v, tree type)
{
- tree arg;
-
- /* Entry mode is set to AVX_U128_DIRTY if there are
- 256bit or 512bit modes used in function arguments. */
- for (arg = DECL_ARGUMENTS (current_function_decl); arg;
- arg = TREE_CHAIN (arg))
- {
- rtx incoming = DECL_INCOMING_RTL (arg);
+ CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
- if (incoming && ix86_check_avx_upper_register (incoming))
- return AVX_U128_DIRTY;
- }
+ if (!cum->warn_empty)
+ return;
- return AVX_U128_CLEAN;
-}
+ if (!TYPE_EMPTY_P (type))
+ return;
-/* Return a mode that ENTITY is assumed to be
- switched to at function entry. */
+ /* Don't warn if the function isn't visible outside of the TU. */
+ if (cum->decl && !TREE_PUBLIC (cum->decl))
+ return;
-static int
-ix86_mode_entry (int entity)
-{
- switch (entity)
- {
- case X86_DIRFLAG:
- return ix86_dirflag_mode_entry ();
- case AVX_U128:
- return ix86_avx_u128_mode_entry ();
- case I387_TRUNC:
- case I387_FLOOR:
- case I387_CEIL:
- return I387_CW_ANY;
- default:
- gcc_unreachable ();
- }
-}
+ const_tree ctx = get_ultimate_context (cum->decl);
+ if (ctx != NULL_TREE
+ && !TRANSLATION_UNIT_WARN_EMPTY_P (ctx))
+ return;
-static int
-ix86_avx_u128_mode_exit (void)
-{
- rtx reg = crtl->return_rtx;
+ /* If the actual size of the type is zero, then there is no change
+ in how objects of this size are passed. */
+ if (int_size_in_bytes (type) == 0)
+ return;
- /* Exit mode is set to AVX_U128_DIRTY if there are 256bit
- or 512 bit modes used in the function return register. */
- if (reg && ix86_check_avx_upper_register (reg))
- return AVX_U128_DIRTY;
+ warning (OPT_Wabi, "empty class %qT parameter passing ABI "
+ "changes in %<-fabi-version=12%> (GCC 8)", type);
- /* Exit mode is set to AVX_U128_DIRTY if there are 256bit or 512bit
- modes used in function arguments, otherwise return AVX_U128_CLEAN.
- */
- return ix86_avx_u128_mode_entry ();
+ /* Only warn once. */
+ cum->warn_empty = false;
}
-/* Return a mode that ENTITY is assumed to be
- switched to at function exit. */
+/* This hook returns the name of the multilib ABI. */
-static int
-ix86_mode_exit (int entity)
+static const char *
+ix86_get_multilib_abi_name (void)
{
- switch (entity)
- {
- case X86_DIRFLAG:
- return X86_DIRFLAG_ANY;
- case AVX_U128:
- return ix86_avx_u128_mode_exit ();
- case I387_TRUNC:
- case I387_FLOOR:
- case I387_CEIL:
- return I387_CW_ANY;
- default:
- gcc_unreachable ();
- }
+ if (!(TARGET_64BIT_P (ix86_isa_flags)))
+ return "i386";
+ else if (TARGET_X32_P (ix86_isa_flags))
+ return "x32";
+ else
+ return "x86_64";
}
+/* Compute the alignment for a variable for Intel MCU psABI. TYPE is
+ the data type, and ALIGN is the alignment that the object would
+ ordinarily have. */
+
static int
-ix86_mode_priority (int, int n)
+iamcu_alignment (tree type, int align)
{
- return n;
+ machine_mode mode;
+
+ if (align < 32 || TYPE_USER_ALIGN (type))
+ return align;
+
+ /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4
+ bytes. */
+ mode = TYPE_MODE (strip_array_types (type));
+ switch (GET_MODE_CLASS (mode))
+ {
+ case MODE_INT:
+ case MODE_COMPLEX_INT:
+ case MODE_COMPLEX_FLOAT:
+ case MODE_FLOAT:
+ case MODE_DECIMAL_FLOAT:
+ return 32;
+ default:
+ return align;
+ }
}
-/* Output code to initialize control word copies used by trunc?f?i and
- rounding patterns. CURRENT_MODE is set to current control word,
- while NEW_MODE is set to new control word. */
+/* Compute the alignment for a static variable.
+ TYPE is the data type, and ALIGN is the alignment that
+ the object would ordinarily have. The value of this function is used
+ instead of that alignment to align the object. */
-static void
-emit_i387_cw_initialization (int mode)
+int
+ix86_data_alignment (tree type, unsigned int align, bool opt)
{
- rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
- rtx new_mode;
+ /* GCC 4.8 and earlier used to incorrectly assume this alignment even
+ for symbols from other compilation units or symbols that don't need
+ to bind locally. In order to preserve some ABI compatibility with
+ those compilers, ensure we don't decrease alignment from what we
+ used to assume. */
- enum ix86_stack_slot slot;
+ unsigned int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
- rtx reg = gen_reg_rtx (HImode);
+ /* A data structure equal to or greater than the size of a cache line
+ (64 bytes in the Pentium 4 and other recent Intel processors, including
+ processors based on the Intel Core microarchitecture) should be aligned
+ so that its base address is a multiple of the cache line size. */
- emit_insn (gen_x86_fnstcw_1 (stored_mode));
- emit_move_insn (reg, copy_rtx (stored_mode));
+ unsigned int max_align
+ = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
- switch (mode)
- {
- case I387_CW_TRUNC:
- /* round toward zero (truncate) */
- emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
- slot = SLOT_CW_TRUNC;
- break;
+ if (max_align < BITS_PER_WORD)
+ max_align = BITS_PER_WORD;
- case I387_CW_FLOOR:
- /* round down toward -oo */
- emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
- emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
- slot = SLOT_CW_FLOOR;
- break;
+ switch (ix86_align_data_type)
+ {
+ case ix86_align_data_type_abi: opt = false; break;
+ case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
+ case ix86_align_data_type_cacheline: break;
+ }
- case I387_CW_CEIL:
- /* round up toward +oo */
- emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
- emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
- slot = SLOT_CW_CEIL;
- break;
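+ /* For Intel MCU, cap the alignment of wide scalar types at 32 bits
+ (see iamcu_alignment above). */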
+ if (TARGET_IAMCU)
+ align = iamcu_alignment (type, align);
- default:
- gcc_unreachable ();
+ if (opt
+ && AGGREGATE_TYPE_P (type)
+ && TYPE_SIZE (type)
+ && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
+ {
+ if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align_compat)
+ && align < max_align_compat)
+ align = max_align_compat;
+ if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align)
+ && align < max_align)
+ align = max_align;
}
- gcc_assert (slot < MAX_386_STACK_LOCALS);
+ /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
+ to a 16-byte boundary. */
+ if (TARGET_64BIT)
+ {
+ if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
+ && TYPE_SIZE (type)
+ && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
+ && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
+ && align < 128)
+ return 128;
+ }
- new_mode = assign_386_stack_local (HImode, slot);
- emit_move_insn (new_mode, reg);
-}
+ if (!opt)
+ return align;
-/* Generate one or more insns to set ENTITY to MODE. */
+ if (TREE_CODE (type) == ARRAY_TYPE)
+ {
+ if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
+ return 64;
+ if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
+ return 128;
+ }
+ else if (TREE_CODE (type) == COMPLEX_TYPE)
+ {
-static void
-ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
- HARD_REG_SET regs_live ATTRIBUTE_UNUSED)
-{
- switch (entity)
+ if (TYPE_MODE (type) == DCmode && align < 64)
+ return 64;
+ if ((TYPE_MODE (type) == XCmode
+ || TYPE_MODE (type) == TCmode) && align < 128)
+ return 128;
+ }
+ else if ((TREE_CODE (type) == RECORD_TYPE
+ || TREE_CODE (type) == UNION_TYPE
+ || TREE_CODE (type) == QUAL_UNION_TYPE)
+ && TYPE_FIELDS (type))
{
- case X86_DIRFLAG:
- if (mode == X86_DIRFLAG_RESET)
- emit_insn (gen_cld ());
- break;
- case AVX_U128:
- if (mode == AVX_U128_CLEAN)
- emit_insn (gen_avx_vzeroupper ());
- break;
- case I387_TRUNC:
- case I387_FLOOR:
- case I387_CEIL:
- if (mode != I387_CW_ANY
- && mode != I387_CW_UNINITIALIZED)
- emit_i387_cw_initialization (mode);
- break;
- default:
- gcc_unreachable ();
+ if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
+ return 64;
+ if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
+ return 128;
+ }
+ else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
+ || TREE_CODE (type) == INTEGER_TYPE)
+ {
+ if (TYPE_MODE (type) == DFmode && align < 64)
+ return 64;
+ if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
+ return 128;
}
+
+ return align;
}
-/* Output code for INSN to convert a float to a signed int. OPERANDS
- are the insn operands. The output may be [HSD]Imode and the input
- operand may be [SDX]Fmode. */
+/* Compute the alignment for a local variable or a stack slot. EXP is
+ the data type or decl itself, MODE is the widest mode available and
+ ALIGN is the alignment that the object would ordinarily have. The
+ value of this macro is used instead of that alignment to align the
+ object. */
-const char *
-output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
+unsigned int
+ix86_local_alignment (tree exp, machine_mode mode,
+ unsigned int align)
{
- bool stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
- bool dimode_p = GET_MODE (operands[0]) == DImode;
- int round_mode = get_attr_i387_cw (insn);
-
- static char buf[40];
- const char *p;
+ tree type, decl;
- /* Jump through a hoop or two for DImode, since the hardware has no
- non-popping instruction. We used to do this a different way, but
- that was somewhat fragile and broke with post-reload splitters. */
- if ((dimode_p || fisttp) && !stack_top_dies)
- output_asm_insn ("fld\t%y1", operands);
-
- gcc_assert (STACK_TOP_P (operands[1]));
- gcc_assert (MEM_P (operands[0]));
- gcc_assert (GET_MODE (operands[1]) != TFmode);
-
- if (fisttp)
- return "fisttp%Z0\t%0";
-
- strcpy (buf, "fist");
-
- if (round_mode != I387_CW_ANY)
- output_asm_insn ("fldcw\t%3", operands);
-
- p = "p%Z0\t%0";
- strcat (buf, p + !(stack_top_dies || dimode_p));
-
- output_asm_insn (buf, operands);
-
- if (round_mode != I387_CW_ANY)
- output_asm_insn ("fldcw\t%2", operands);
-
- return "";
-}
-
-/* Output code for x87 ffreep insn. The OPNO argument, which may only
- have the values zero or one, indicates the ffreep insn's operand
- from the OPERANDS array. */
-
-static const char *
-output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
-{
- if (TARGET_USE_FFREEP)
-#ifdef HAVE_AS_IX86_FFREEP
- return opno ? "ffreep\t%y1" : "ffreep\t%y0";
-#else
+ if (exp && DECL_P (exp))
{
- static char retval[32];
- int regno = REGNO (operands[opno]);
-
- gcc_assert (STACK_REGNO_P (regno));
-
- regno -= FIRST_STACK_REG;
-
- snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
- return retval;
+ type = TREE_TYPE (exp);
+ decl = exp;
+ }
+ else
+ {
+ type = exp;
+ decl = NULL;
}
-#endif
- return opno ? "fstp\t%y1" : "fstp\t%y0";
-}
+ /* Don't do dynamic stack realignment for long long objects with
+ -mpreferred-stack-boundary=2. */
+ if (!TARGET_64BIT
+ && align == 64
+ && ix86_preferred_stack_boundary < 64
+ && (mode == DImode || (type && TYPE_MODE (type) == DImode))
+ && (!type || !TYPE_USER_ALIGN (type))
+ && (!decl || !DECL_USER_ALIGN (decl)))
+ align = 32;
+ /* If TYPE is NULL, we are allocating a stack slot for a caller-save
+ register in MODE. We will return the larger of the XF
+ and DF alignments. */
+ if (!type)
+ {
+ if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
+ align = GET_MODE_ALIGNMENT (DFmode);
+ return align;
+ }
-/* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
- should be used. UNORDERED_P is true when fucom should be used. */
+ /* Don't increase alignment for Intel MCU psABI. */
+ if (TARGET_IAMCU)
+ return align;
-const char *
-output_fp_compare (rtx_insn *insn, rtx *operands,
- bool eflags_p, bool unordered_p)
-{
- rtx *xops = eflags_p ? &operands[0] : &operands[1];
- bool stack_top_dies;
+ /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
+ to a 16-byte boundary. The exact wording is:
- static char buf[40];
- const char *p;
+ An array uses the same alignment as its elements, except that a local or
+ global array variable of length at least 16 bytes or
+ a C99 variable-length array variable always has alignment of at least 16 bytes.
- gcc_assert (STACK_TOP_P (xops[0]));
+ This was added to allow the use of aligned SSE instructions on arrays. This
+ rule is meant for static storage (where the compiler cannot do the analysis
+ by itself). We follow it for automatic variables only when convenient.
+ We fully control everything in the function being compiled, and functions from
+ other units cannot rely on the alignment.
- stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
+ Exclude the va_list type. It is the common case of a local array where
+ we cannot benefit from the alignment.
- if (eflags_p)
+ TODO: Probably we should optimize for size only when the variable does not escape. */
+ if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
+ && TARGET_SSE)
{
- p = unordered_p ? "fucomi" : "fcomi";
- strcpy (buf, p);
-
- p = "p\t{%y1, %0|%0, %y1}";
- strcat (buf, p + !stack_top_dies);
-
- return buf;
+ if (AGGREGATE_TYPE_P (type)
+ && (va_list_type_node == NULL_TREE
+ || (TYPE_MAIN_VARIANT (type)
+ != TYPE_MAIN_VARIANT (va_list_type_node)))
+ && TYPE_SIZE (type)
+ && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
+ && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
+ && align < 128)
+ return 128;
}
-
- if (STACK_REG_P (xops[1])
- && stack_top_dies
- && find_regno_note (insn, REG_DEAD, FIRST_STACK_REG + 1))
+ if (TREE_CODE (type) == ARRAY_TYPE)
{
- gcc_assert (REGNO (xops[1]) == FIRST_STACK_REG + 1);
-
- /* If both the top of the 387 stack die, and the other operand
- is also a stack register that dies, then this must be a
- `fcompp' float compare. */
- p = unordered_p ? "fucompp" : "fcompp";
- strcpy (buf, p);
+ if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
+ return 64;
+ if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
+ return 128;
}
- else if (const0_operand (xops[1], VOIDmode))
+ else if (TREE_CODE (type) == COMPLEX_TYPE)
{
- gcc_assert (!unordered_p);
- strcpy (buf, "ftst");
+ if (TYPE_MODE (type) == DCmode && align < 64)
+ return 64;
+ if ((TYPE_MODE (type) == XCmode
+ || TYPE_MODE (type) == TCmode) && align < 128)
+ return 128;
}
- else
+ else if ((TREE_CODE (type) == RECORD_TYPE
+ || TREE_CODE (type) == UNION_TYPE
+ || TREE_CODE (type) == QUAL_UNION_TYPE)
+ && TYPE_FIELDS (type))
{
- if (GET_MODE_CLASS (GET_MODE (xops[1])) == MODE_INT)
- {
- gcc_assert (!unordered_p);
- p = "ficom";
- }
- else
- p = unordered_p ? "fucom" : "fcom";
-
- strcpy (buf, p);
-
- p = "p%Z2\t%y2";
- strcat (buf, p + !stack_top_dies);
+ if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
+ return 64;
+ if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
+ return 128;
}
+ else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
+ || TREE_CODE (type) == INTEGER_TYPE)
+ {
- output_asm_insn (buf, operands);
- return "fnstsw\t%0";
+ if (TYPE_MODE (type) == DFmode && align < 64)
+ return 64;
+ if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
+ return 128;
+ }
+ return align;
}
-void
-ix86_output_addr_vec_elt (FILE *file, int value)
-{
- const char *directive = ASM_LONG;
-
-#ifdef ASM_QUAD
- if (TARGET_LP64)
- directive = ASM_QUAD;
-#else
- gcc_assert (!TARGET_64BIT);
-#endif
-
- fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
-}
+/* Compute the minimum required alignment for dynamic stack realignment
+ purposes for a local variable, parameter or a stack slot. EXP is
+ the data type or decl itself, MODE is its mode and ALIGN is the
+ alignment that the object would ordinarily have. */
-void
-ix86_output_addr_diff_elt (FILE *file, int value, int rel)
+unsigned int
+ix86_minimum_alignment (tree exp, machine_mode mode,
+ unsigned int align)
{
- const char *directive = ASM_LONG;
+ tree type, decl;
-#ifdef ASM_QUAD
- if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
- directive = ASM_QUAD;
-#else
- gcc_assert (!TARGET_64BIT);
-#endif
- /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
- if (TARGET_64BIT || TARGET_VXWORKS_RTP)
- fprintf (file, "%s%s%d-%s%d\n",
- directive, LPREFIX, value, LPREFIX, rel);
-#if TARGET_MACHO
- else if (TARGET_MACHO)
+ if (exp && DECL_P (exp))
{
- fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
- machopic_output_function_base_name (file);
- putc ('\n', file);
+ type = TREE_TYPE (exp);
+ decl = exp;
}
-#endif
- else if (HAVE_AS_GOTOFF_IN_DATA)
- fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
else
- asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
- GOT_SYMBOL_NAME, LPREFIX, value);
-}
-\f
-/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
- for the target. */
-
-void
-ix86_expand_clear (rtx dest)
-{
- rtx tmp;
-
- /* We play register width games, which are only valid after reload. */
- gcc_assert (reload_completed);
+ {
+ type = exp;
+ decl = NULL;
+ }
- /* Avoid HImode and its attendant prefix byte. */
- if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
- dest = gen_rtx_REG (SImode, REGNO (dest));
- tmp = gen_rtx_SET (dest, const0_rtx);
+ if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
+ return align;
- if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
+ /* Don't do dynamic stack realignment for long long objects with
+ -mpreferred-stack-boundary=2. */
+ if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
+ && (!type || !TYPE_USER_ALIGN (type))
+ && (!decl || !DECL_USER_ALIGN (decl)))
{
- rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
- tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
+ gcc_checking_assert (!TARGET_STV);
+ return 32;
}
- emit_insn (tmp);
+ return align;
}
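+
+/* For instance, in 32-bit code compiled with -mpreferred-stack-boundary=2,
+   a plain
+
+     long long counter;
+
+   local would normally ask for 64-bit alignment; returning 32 above lets it
+   keep the 4-byte stack alignment instead of forcing dynamic realignment of
+   the whole frame for that one object.  */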
+\f
+/* Find a location for the static chain incoming to a nested function.
+ This is a register, unless all free registers are used by arguments. */
-void
-ix86_expand_move (machine_mode mode, rtx operands[])
-{
- rtx op0, op1;
- rtx tmp, addend = NULL_RTX;
- enum tls_model model;
-
- op0 = operands[0];
- op1 = operands[1];
+static rtx
+ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
+{
+ unsigned regno;
- switch (GET_CODE (op1))
+ if (TARGET_64BIT)
{
- case CONST:
- tmp = XEXP (op1, 0);
-
- if (GET_CODE (tmp) != PLUS
- || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
- break;
-
- op1 = XEXP (tmp, 0);
- addend = XEXP (tmp, 1);
- /* FALLTHRU */
+ /* We always use R10 in 64-bit mode. */
+ regno = R10_REG;
+ }
+ else
+ {
+ const_tree fntype, fndecl;
+ unsigned int ccvt;
- case SYMBOL_REF:
- model = SYMBOL_REF_TLS_MODEL (op1);
+ /* By default in 32-bit mode we use ECX to pass the static chain. */
+ regno = CX_REG;
- if (model)
- op1 = legitimize_tls_address (op1, model, true);
- else if (ix86_force_load_from_GOT_p (op1))
+ if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
{
- /* Load the external function address via GOT slot to avoid PLT. */
- op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
- (TARGET_64BIT
- ? UNSPEC_GOTPCREL
- : UNSPEC_GOT));
- op1 = gen_rtx_CONST (Pmode, op1);
- op1 = gen_const_mem (Pmode, op1);
- set_mem_alias_set (op1, ix86_GOT_alias_set ());
+ fntype = TREE_TYPE (fndecl_or_type);
+ fndecl = fndecl_or_type;
}
else
{
- tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
- if (tmp)
- {
- op1 = tmp;
- if (!addend)
- break;
- }
- else
- {
- op1 = operands[1];
- break;
- }
+ fntype = fndecl_or_type;
+ fndecl = NULL;
}
- if (addend)
+ ccvt = ix86_get_callcvt (fntype);
+ if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
{
- op1 = force_operand (op1, NULL_RTX);
- op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
- op0, 1, OPTAB_DIRECT);
+ /* Fastcall functions use ecx/edx for arguments, which leaves
+ us with EAX for the static chain.
+ Thiscall functions use ecx for arguments, which also
+ leaves us with EAX for the static chain. */
+ regno = AX_REG;
}
- else
- op1 = force_operand (op1, op0);
-
- if (op1 == op0)
- return;
-
- op1 = convert_to_mode (mode, op1, 1);
-
- default:
- break;
- }
-
- if ((flag_pic || MACHOPIC_INDIRECT)
- && symbolic_operand (op1, mode))
- {
- if (TARGET_MACHO && !TARGET_64BIT)
+ else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
{
-#if TARGET_MACHO
- /* dynamic-no-pic */
- if (MACHOPIC_INDIRECT)
- {
- rtx temp = (op0 && REG_P (op0) && mode == Pmode)
- ? op0 : gen_reg_rtx (Pmode);
- op1 = machopic_indirect_data_reference (op1, temp);
- if (MACHOPIC_PURE)
- op1 = machopic_legitimize_pic_address (op1, mode,
- temp == op1 ? 0 : temp);
- }
- if (op0 != op1 && GET_CODE (op0) != MEM)
- {
- rtx insn = gen_rtx_SET (op0, op1);
- emit_insn (insn);
- return;
- }
- if (GET_CODE (op0) == MEM)
- op1 = force_reg (Pmode, op1);
- else
- {
- rtx temp = op0;
- if (GET_CODE (temp) != REG)
- temp = gen_reg_rtx (Pmode);
- temp = legitimize_pic_address (op1, temp);
- if (temp == op0)
- return;
- op1 = temp;
- }
- /* dynamic-no-pic */
-#endif
+ /* Thiscall functions use ecx for arguments, which leaves
+ us with EAX and EDX for the static chain.
+ For ABI compatibility we use EAX. */
+ regno = AX_REG;
}
- else
+ else if (ix86_function_regparm (fntype, fndecl) == 3)
{
- if (MEM_P (op0))
- op1 = force_reg (mode, op1);
- else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
- {
- rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
- op1 = legitimize_pic_address (op1, reg);
- if (op0 == op1)
- return;
- op1 = convert_to_mode (mode, op1, 1);
- }
- }
- }
- else
- {
- if (MEM_P (op0)
- && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
- || !push_operand (op0, mode))
- && MEM_P (op1))
- op1 = force_reg (mode, op1);
-
- if (push_operand (op0, mode)
- && ! general_no_elim_operand (op1, mode))
- op1 = copy_to_mode_reg (mode, op1);
-
- /* Force large constants in 64bit compilation into register
- to get them CSEed. */
- if (can_create_pseudo_p ()
- && (mode == DImode) && TARGET_64BIT
- && immediate_operand (op1, mode)
- && !x86_64_zext_immediate_operand (op1, VOIDmode)
- && !register_operand (op0, mode)
- && optimize)
- op1 = copy_to_mode_reg (mode, op1);
-
- if (can_create_pseudo_p ()
- && CONST_DOUBLE_P (op1))
- {
- /* If we are loading a floating point constant to a register,
- force the value to memory now, since we'll get better code
- out the back end. */
-
- op1 = validize_mem (force_const_mem (mode, op1));
- if (!register_operand (op0, mode))
+ /* For regparm 3, we have no free call-clobbered registers in
+ which to store the static chain. In order to implement this,
+ we have the trampoline push the static chain to the stack.
+ However, we can't push a value below the return address when
+ we call the nested function directly, so we have to use an
+ alternate entry point. For this we use ESI, and have the
+ alternate entry point push ESI, so that things appear the
+ same once we're executing the nested function. */
+ if (incoming_p)
{
- rtx temp = gen_reg_rtx (mode);
- emit_insn (gen_rtx_SET (temp, op1));
- emit_move_insn (op0, temp);
- return;
+ if (fndecl == current_function_decl
+ && !ix86_static_chain_on_stack)
+ {
+ gcc_assert (!reload_completed);
+ ix86_static_chain_on_stack = true;
+ }
+ return gen_frame_mem (SImode,
+ plus_constant (Pmode,
+ arg_pointer_rtx, -8));
}
+ regno = SI_REG;
}
}
- emit_insn (gen_rtx_SET (op0, op1));
-}
-
-void
-ix86_expand_vector_move (machine_mode mode, rtx operands[])
-{
- rtx op0 = operands[0], op1 = operands[1];
- /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
- psABI since the biggest alignment is 4 byte for IA MCU psABI. */
- unsigned int align = (TARGET_IAMCU
- ? GET_MODE_BITSIZE (mode)
- : GET_MODE_ALIGNMENT (mode));
-
- if (push_operand (op0, VOIDmode))
- op0 = emit_move_resolve_push (mode, op0);
-
- /* Force constants other than zero into memory. We do not know how
- the instructions used to build constants modify the upper 64 bits
- of the register, once we have that information we may be able
- to handle some of them more efficiently. */
- if (can_create_pseudo_p ()
- && (CONSTANT_P (op1)
- || (SUBREG_P (op1)
- && CONSTANT_P (SUBREG_REG (op1))))
- && ((register_operand (op0, mode)
- && !standard_sse_constant_p (op1, mode))
- /* ix86_expand_vector_move_misalign() does not like constants. */
- || (SSE_REG_MODE_P (mode)
- && MEM_P (op0)
- && MEM_ALIGN (op0) < align)))
- {
- if (SUBREG_P (op1))
- {
- machine_mode imode = GET_MODE (SUBREG_REG (op1));
- rtx r = force_const_mem (imode, SUBREG_REG (op1));
- if (r)
- r = validize_mem (r);
- else
- r = force_reg (imode, SUBREG_REG (op1));
- op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
- }
- else
- op1 = validize_mem (force_const_mem (mode, op1));
- }
-
- /* We need to check memory alignment for SSE mode since attribute
- can make operands unaligned. */
- if (can_create_pseudo_p ()
- && SSE_REG_MODE_P (mode)
- && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
- || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
- {
- rtx tmp[2];
-
- /* ix86_expand_vector_move_misalign() does not like both
- arguments in memory. */
- if (!register_operand (op0, mode)
- && !register_operand (op1, mode))
- op1 = force_reg (mode, op1);
-
- tmp[0] = op0; tmp[1] = op1;
- ix86_expand_vector_move_misalign (mode, tmp);
- return;
- }
-
- /* Make operand1 a register if it isn't already. */
- if (can_create_pseudo_p ()
- && !register_operand (op0, mode)
- && !register_operand (op1, mode))
- {
- emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
- return;
- }
-
- emit_insn (gen_rtx_SET (op0, op1));
+ return gen_rtx_REG (Pmode, regno);
}
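+
+/* A nested function that refers to a local of its parent needs the static
+   chain, e.g.
+
+     int
+     outer (int x)
+     {
+       int inner (int y) { return x + y; }
+       return inner (1);
+     }
+
+   With the choices above, the chain pointer reaches inner in R10 in 64-bit
+   code, normally in ECX in 32-bit code, in EAX for fastcall and thiscall
+   functions, and on the stack when regparm (3) leaves no free register.  */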
-/* Split 32-byte AVX unaligned load and store if needed. */
+/* Emit RTL insns to initialize the variable parts of a trampoline.
+ FNDECL is the decl of the target address; M_TRAMP is a MEM for
+ the trampoline, and CHAIN_VALUE is an RTX for the static chain
+ to be passed to the target function. */
static void
-ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
+ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
{
- rtx m;
- rtx (*extract) (rtx, rtx, rtx);
- machine_mode mode;
+ rtx mem, fnaddr;
+ int opcode;
+ int offset = 0;
+ bool need_endbr = (flag_cf_protection & CF_BRANCH);
- if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
- || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
- {
- emit_insn (gen_rtx_SET (op0, op1));
- return;
- }
+ fnaddr = XEXP (DECL_RTL (fndecl), 0);
- rtx orig_op0 = NULL_RTX;
- mode = GET_MODE (op0);
- switch (GET_MODE_CLASS (mode))
+ if (TARGET_64BIT)
{
- case MODE_VECTOR_INT:
- case MODE_INT:
- if (mode != V32QImode)
+ int size;
+
+ if (need_endbr)
{
- if (!MEM_P (op0))
- {
- orig_op0 = op0;
- op0 = gen_reg_rtx (V32QImode);
- }
- else
- op0 = gen_lowpart (V32QImode, op0);
- op1 = gen_lowpart (V32QImode, op1);
- mode = V32QImode;
+ /* Insert ENDBR64. */
+ mem = adjust_address (m_tramp, SImode, offset);
+ emit_move_insn (mem, gen_int_mode (0xfa1e0ff3, SImode));
+ offset += 4;
}
- break;
- case MODE_VECTOR_FLOAT:
- break;
- default:
- gcc_unreachable ();
- }
-
- switch (mode)
- {
- default:
- gcc_unreachable ();
- case E_V32QImode:
- extract = gen_avx_vextractf128v32qi;
- mode = V16QImode;
- break;
- case E_V8SFmode:
- extract = gen_avx_vextractf128v8sf;
- mode = V4SFmode;
- break;
- case E_V4DFmode:
- extract = gen_avx_vextractf128v4df;
- mode = V2DFmode;
- break;
- }
-
- if (MEM_P (op1))
- {
- rtx r = gen_reg_rtx (mode);
- m = adjust_address (op1, mode, 0);
- emit_move_insn (r, m);
- m = adjust_address (op1, mode, 16);
- r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
- emit_move_insn (op0, r);
- }
- else if (MEM_P (op0))
- {
- m = adjust_address (op0, mode, 0);
- emit_insn (extract (m, op1, const0_rtx));
- m = adjust_address (op0, mode, 16);
- emit_insn (extract (m, copy_rtx (op1), const1_rtx));
- }
- else
- gcc_unreachable ();
-
- if (orig_op0)
- emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
-}
-
-/* Implement the movmisalign patterns for SSE. Non-SSE modes go
- straight to ix86_expand_vector_move. */
-/* Code generation for scalar reg-reg moves of single and double precision data:
- if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
- movaps reg, reg
- else
- movss reg, reg
- if (x86_sse_partial_reg_dependency == true)
- movapd reg, reg
- else
- movsd reg, reg
-
- Code generation for scalar loads of double precision data:
- if (x86_sse_split_regs == true)
- movlpd mem, reg (gas syntax)
- else
- movsd mem, reg
-
- Code generation for unaligned packed loads of single precision data
- (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
- if (x86_sse_unaligned_move_optimal)
- movups mem, reg
-
- if (x86_sse_partial_reg_dependency == true)
- {
- xorps reg, reg
- movlps mem, reg
- movhps mem+8, reg
- }
- else
- {
- movlps mem, reg
- movhps mem+8, reg
- }
-
- Code generation for unaligned packed loads of double precision data
- (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
- if (x86_sse_unaligned_move_optimal)
- movupd mem, reg
-
- if (x86_sse_split_regs == true)
- {
- movlpd mem, reg
- movhpd mem+8, reg
- }
- else
- {
- movsd mem, reg
- movhpd mem+8, reg
- }
- */
-
-void
-ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
-{
- rtx op0, op1, m;
-
- op0 = operands[0];
- op1 = operands[1];
-
- /* Use unaligned load/store for AVX512 or when optimizing for size. */
- if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
- {
- emit_insn (gen_rtx_SET (op0, op1));
- return;
- }
-
- if (TARGET_AVX)
- {
- if (GET_MODE_SIZE (mode) == 32)
- ix86_avx256_split_vector_move_misalign (op0, op1);
- else
- /* Always use 128-bit mov<mode>_internal pattern for AVX. */
- emit_insn (gen_rtx_SET (op0, op1));
- return;
- }
-
- if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
- || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
- {
- emit_insn (gen_rtx_SET (op0, op1));
- return;
- }
-
- /* ??? If we have typed data, then it would appear that using
- movdqu is the only way to get unaligned data loaded with
- integer type. */
- if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
- {
- emit_insn (gen_rtx_SET (op0, op1));
- return;
- }
- if (MEM_P (op1))
- {
- if (TARGET_SSE2 && mode == V2DFmode)
- {
- rtx zero;
+ /* Load the function address into r11. Try to load the address using
+ the shorter movl instead of movabs. We may want to support
+ movq for kernel mode, but the kernel does not use trampolines at
+ the moment. FNADDR is a 32-bit address and may not be in
+ DImode when ptr_mode == SImode. Always use movl in this
+ case. */
+ if (ptr_mode == SImode
+ || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
+ {
+ fnaddr = copy_addr_to_reg (fnaddr);
- /* When SSE registers are split into halves, we can avoid
- writing to the top half twice. */
- if (TARGET_SSE_SPLIT_REGS)
- {
- emit_clobber (op0);
- zero = op0;
- }
- else
- {
- /* ??? Not sure about the best option for the Intel chips.
- The following would seem to satisfy; the register is
- entirely cleared, breaking the dependency chain. We
- then store to the upper half, with a dependency depth
- of one. A rumor has it that Intel recommends two movsd
- followed by an unpacklpd, but this is unconfirmed. And
- given that the dependency depth of the unpacklpd would
- still be one, I'm not sure why this would be better. */
- zero = CONST0_RTX (V2DFmode);
- }
+ mem = adjust_address (m_tramp, HImode, offset);
+ emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
- m = adjust_address (op1, DFmode, 0);
- emit_insn (gen_sse2_loadlpd (op0, zero, m));
- m = adjust_address (op1, DFmode, 8);
- emit_insn (gen_sse2_loadhpd (op0, op0, m));
+ mem = adjust_address (m_tramp, SImode, offset + 2);
+ emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
+ offset += 6;
}
else
- {
- rtx t;
-
- if (mode != V4SFmode)
- t = gen_reg_rtx (V4SFmode);
- else
- t = op0;
-
- if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
- emit_move_insn (t, CONST0_RTX (V4SFmode));
- else
- emit_clobber (t);
+ {
+ mem = adjust_address (m_tramp, HImode, offset);
+ emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
- m = adjust_address (op1, V2SFmode, 0);
- emit_insn (gen_sse_loadlps (t, t, m));
- m = adjust_address (op1, V2SFmode, 8);
- emit_insn (gen_sse_loadhps (t, t, m));
- if (mode != V4SFmode)
- emit_move_insn (op0, gen_lowpart (mode, t));
+ mem = adjust_address (m_tramp, DImode, offset + 2);
+ emit_move_insn (mem, fnaddr);
+ offset += 10;
}
- }
- else if (MEM_P (op0))
- {
- if (TARGET_SSE2 && mode == V2DFmode)
+
+ /* Load the static chain into r10 using movabs. Use the shorter movl
+ instead of movabs when ptr_mode == SImode. */
+ if (ptr_mode == SImode)
{
- m = adjust_address (op0, DFmode, 0);
- emit_insn (gen_sse2_storelpd (m, op1));
- m = adjust_address (op0, DFmode, 8);
- emit_insn (gen_sse2_storehpd (m, op1));
+ opcode = 0xba41;
+ size = 6;
}
else
{
- if (mode != V4SFmode)
- op1 = gen_lowpart (V4SFmode, op1);
-
- m = adjust_address (op0, V2SFmode, 0);
- emit_insn (gen_sse_storelps (m, op1));
- m = adjust_address (op0, V2SFmode, 8);
- emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
+ opcode = 0xba49;
+ size = 10;
}
- }
- else
- gcc_unreachable ();
-}
-/* Helper function of ix86_fixup_binary_operands to canonicalize
- operand order. Returns true if the operands should be swapped. */
-
-static bool
-ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
- rtx operands[])
-{
- rtx dst = operands[0];
- rtx src1 = operands[1];
- rtx src2 = operands[2];
+ mem = adjust_address (m_tramp, HImode, offset);
+ emit_move_insn (mem, gen_int_mode (opcode, HImode));
- /* If the operation is not commutative, we can't do anything. */
- if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
- && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
- return false;
+ mem = adjust_address (m_tramp, ptr_mode, offset + 2);
+ emit_move_insn (mem, chain_value);
+ offset += size;
- /* Highest priority is that src1 should match dst. */
- if (rtx_equal_p (dst, src1))
- return false;
- if (rtx_equal_p (dst, src2))
- return true;
-
- /* Next highest priority is that immediate constants come second. */
- if (immediate_operand (src2, mode))
- return false;
- if (immediate_operand (src1, mode))
- return true;
-
- /* Lowest priority is that memory references should come second. */
- if (MEM_P (src2))
- return false;
- if (MEM_P (src1))
- return true;
-
- return false;
-}
-
-
-/* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
- destination to use for the operation. If different from the true
- destination in operands[0], a copy operation will be required. */
-
-rtx
-ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
- rtx operands[])
-{
- rtx dst = operands[0];
- rtx src1 = operands[1];
- rtx src2 = operands[2];
-
- /* Canonicalize operand order. */
- if (ix86_swap_binary_operands_p (code, mode, operands))
- {
- /* It is invalid to swap operands of different modes. */
- gcc_assert (GET_MODE (src1) == GET_MODE (src2));
-
- std::swap (src1, src2);
+ /* Jump to r11; the last (unused) byte is a nop, only there to
+ pad the write out to a single 32-bit store. */
+ mem = adjust_address (m_tramp, SImode, offset);
+ emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
+ offset += 4;
}
-
- /* Both source operands cannot be in memory. */
- if (MEM_P (src1) && MEM_P (src2))
+ else
{
- /* Optimization: Only read from memory once. */
- if (rtx_equal_p (src1, src2))
+ rtx disp, chain;
+
+ /* Depending on the static chain location, either load a register
+ with a constant, or push the constant to the stack. All of the
+ instructions are the same size. */
+ chain = ix86_static_chain (fndecl, true);
+ if (REG_P (chain))
{
- src2 = force_reg (mode, src2);
- src1 = src2;
+ switch (REGNO (chain))
+ {
+ case AX_REG:
+ opcode = 0xb8; break;
+ case CX_REG:
+ opcode = 0xb9; break;
+ default:
+ gcc_unreachable ();
+ }
}
- else if (rtx_equal_p (dst, src1))
- src2 = force_reg (mode, src2);
else
- src1 = force_reg (mode, src1);
- }
-
- /* If the destination is memory, and we do not have matching source
- operands, do things in registers. */
- if (MEM_P (dst) && !rtx_equal_p (dst, src1))
- dst = gen_reg_rtx (mode);
-
- /* Source 1 cannot be a constant. */
- if (CONSTANT_P (src1))
- src1 = force_reg (mode, src1);
-
- /* Source 1 cannot be a non-matching memory. */
- if (MEM_P (src1) && !rtx_equal_p (dst, src1))
- src1 = force_reg (mode, src1);
-
- /* Improve address combine. */
- if (code == PLUS
- && GET_MODE_CLASS (mode) == MODE_INT
- && MEM_P (src2))
- src2 = force_reg (mode, src2);
-
- operands[1] = src1;
- operands[2] = src2;
- return dst;
-}
-
-/* Similarly, but assume that the destination has already been
- set up properly. */
-
-void
-ix86_fixup_binary_operands_no_copy (enum rtx_code code,
- machine_mode mode, rtx operands[])
-{
- rtx dst = ix86_fixup_binary_operands (code, mode, operands);
- gcc_assert (dst == operands[0]);
-}
+ opcode = 0x68;
-/* Attempt to expand a binary operator. Make the expansion closer to the
- actual machine, then just general_operand, which will allow 3 separate
- memory references (one output, two input) in a single insn. */
+ if (need_endbr)
+ {
+ /* Insert ENDBR32. */
+ mem = adjust_address (m_tramp, SImode, offset);
+ emit_move_insn (mem, gen_int_mode (0xfb1e0ff3, SImode));
+ offset += 4;
+ }
-void
-ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
- rtx operands[])
-{
- rtx src1, src2, dst, op, clob;
+ mem = adjust_address (m_tramp, QImode, offset);
+ emit_move_insn (mem, gen_int_mode (opcode, QImode));
- dst = ix86_fixup_binary_operands (code, mode, operands);
- src1 = operands[1];
- src2 = operands[2];
+ mem = adjust_address (m_tramp, SImode, offset + 1);
+ emit_move_insn (mem, chain_value);
+ offset += 5;
- /* Emit the instruction. */
+ mem = adjust_address (m_tramp, QImode, offset);
+ emit_move_insn (mem, gen_int_mode (0xe9, QImode));
- op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
+ mem = adjust_address (m_tramp, SImode, offset + 1);
- if (reload_completed
- && code == PLUS
- && !rtx_equal_p (dst, src1))
- {
- /* This is going to be an LEA; avoid splitting it later. */
- emit_insn (op);
- }
- else
- {
- clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
- emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
+ /* Compute offset from the end of the jmp to the target function.
+ In the case in which the trampoline stores the static chain on
+ the stack, we need to skip the first insn which pushes the
+ (call-saved) register static chain; this push is 1 byte. */
+ offset += 5;
+ disp = expand_binop (SImode, sub_optab, fnaddr,
+ plus_constant (Pmode, XEXP (m_tramp, 0),
+ offset - (MEM_P (chain) ? 1 : 0)),
+ NULL_RTX, 1, OPTAB_DIRECT);
+ emit_move_insn (mem, disp);
}
- /* Fix up the destination if needed. */
- if (dst != operands[0])
- emit_move_insn (operands[0], dst);
-}
-
-/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
- the given OPERANDS. */
+ gcc_assert (offset <= TRAMPOLINE_SIZE);
-void
-ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
- rtx operands[])
-{
- rtx op1 = NULL_RTX, op2 = NULL_RTX;
- if (SUBREG_P (operands[1]))
- {
- op1 = operands[1];
- op2 = operands[2];
- }
- else if (SUBREG_P (operands[2]))
- {
- op1 = operands[2];
- op2 = operands[1];
- }
- /* Optimize (__m128i) d | (__m128i) e and similar code
- when d and e are float vectors into float vector logical
- insn. In C/C++ without using intrinsics there is no other way
- to express vector logical operation on float vectors than
- to cast them temporarily to integer vectors. */
- if (op1
- && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
- && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
- && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
- && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
- && SUBREG_BYTE (op1) == 0
- && (GET_CODE (op2) == CONST_VECTOR
- || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
- && SUBREG_BYTE (op2) == 0))
- && can_create_pseudo_p ())
- {
- rtx dst;
- switch (GET_MODE (SUBREG_REG (op1)))
- {
- case E_V4SFmode:
- case E_V8SFmode:
- case E_V16SFmode:
- case E_V2DFmode:
- case E_V4DFmode:
- case E_V8DFmode:
- dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
- if (GET_CODE (op2) == CONST_VECTOR)
- {
- op2 = gen_lowpart (GET_MODE (dst), op2);
- op2 = force_reg (GET_MODE (dst), op2);
- }
- else
- {
- op1 = operands[1];
- op2 = SUBREG_REG (operands[2]);
- if (!vector_operand (op2, GET_MODE (dst)))
- op2 = force_reg (GET_MODE (dst), op2);
- }
- op1 = SUBREG_REG (op1);
- if (!vector_operand (op1, GET_MODE (dst)))
- op1 = force_reg (GET_MODE (dst), op1);
- emit_insn (gen_rtx_SET (dst,
- gen_rtx_fmt_ee (code, GET_MODE (dst),
- op1, op2)));
- emit_move_insn (operands[0], gen_lowpart (mode, dst));
- return;
- default:
- break;
- }
- }
- if (!vector_operand (operands[1], mode))
- operands[1] = force_reg (mode, operands[1]);
- if (!vector_operand (operands[2], mode))
- operands[2] = force_reg (mode, operands[2]);
- ix86_fixup_binary_operands_no_copy (code, mode, operands);
- emit_insn (gen_rtx_SET (operands[0],
- gen_rtx_fmt_ee (code, mode, operands[1],
- operands[2])));
+#ifdef HAVE_ENABLE_EXECUTE_STACK
+#ifdef CHECK_EXECUTE_STACK_ENABLED
+ if (CHECK_EXECUTE_STACK_ENABLED)
+#endif
+ emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
+ LCT_NORMAL, VOIDmode, XEXP (m_tramp, 0), Pmode);
+#endif
}
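+
+/* The 64-bit layout written above, in the common movabs variant without
+   ENDBR, is roughly:
+
+     49 bb <8-byte fnaddr>   movabs $fnaddr, %r11
+     49 ba <8-byte chain>    movabs $chain,  %r10
+     49 ff e3                jmp    *%r11
+     90                      nop  (pads the final 4-byte store)
+
+   The ptr_mode == SImode and zero-extendable-address cases use the shorter
+   41 bb / 41 ba (movl) encodings instead, and -fcf-protection=branch
+   prepends an ENDBR64.  */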
-/* Return TRUE or FALSE depending on whether the binary operator meets the
- appropriate constraints. */
-
-bool
-ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
- rtx operands[3])
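+
+/* Implement TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS.  */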
+static bool
+ix86_allocate_stack_slots_for_args (void)
{
- rtx dst = operands[0];
- rtx src1 = operands[1];
- rtx src2 = operands[2];
-
- /* Both source operands cannot be in memory. */
- if (MEM_P (src1) && MEM_P (src2))
- return false;
-
- /* Canonicalize operand order for commutative operators. */
- if (ix86_swap_binary_operands_p (code, mode, operands))
- std::swap (src1, src2);
-
- /* If the destination is memory, we must have a matching source operand. */
- if (MEM_P (dst) && !rtx_equal_p (dst, src1))
- return false;
-
- /* Source 1 cannot be a constant. */
- if (CONSTANT_P (src1))
- return false;
-
- /* Source 1 cannot be a non-matching memory. */
- if (MEM_P (src1) && !rtx_equal_p (dst, src1))
- /* Support "andhi/andsi/anddi" as a zero-extending move. */
- return (code == AND
- && (mode == HImode
- || mode == SImode
- || (TARGET_64BIT && mode == DImode))
- && satisfies_constraint_L (src2));
-
- return true;
+ /* Naked functions should not allocate stack slots for arguments. */
+ return !ix86_function_naked (current_function_decl);
}
-/* Attempt to expand a unary operator. Make the expansion closer to the
- actual machine, then just general_operand, which will allow 2 separate
- memory references (one output, one input) in a single insn. */
-
-void
-ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
- rtx operands[])
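+
+/* Implement TARGET_WARN_FUNC_RETURN.  */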
+static bool
+ix86_warn_func_return (tree decl)
{
- bool matching_memory = false;
- rtx src, dst, op, clob;
-
- dst = operands[0];
- src = operands[1];
-
- /* If the destination is memory, and we do not have matching source
- operands, do things in registers. */
- if (MEM_P (dst))
- {
- if (rtx_equal_p (dst, src))
- matching_memory = true;
- else
- dst = gen_reg_rtx (mode);
- }
-
- /* When source operand is memory, destination must match. */
- if (MEM_P (src) && !matching_memory)
- src = force_reg (mode, src);
-
- /* Emit the instruction. */
-
- op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
-
- if (code == NOT)
- emit_insn (op);
- else
- {
- clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
- emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
- }
-
- /* Fix up the destination if needed. */
- if (dst != operands[0])
- emit_move_insn (operands[0], dst);
+ /* Naked functions are implemented entirely in assembly, including the
+ return sequence, so suppress missing-return warnings for them. */
+ return !ix86_function_naked (decl);
}
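+
+/* Both hooks above exist for the benefit of functions carrying the naked
+   attribute, whose entire body is user-supplied assembly, e.g.
+
+     __attribute__ ((naked)) void
+     entry_stub (void)
+     {
+       asm ("ret");
+     }
+
+   For such a function GCC must neither spill incoming arguments to stack
+   slots nor warn about a missing return statement.  */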
-
-/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
- divisor are within the range [0-255]. */
-
-void
-ix86_split_idivmod (machine_mode mode, rtx operands[],
- bool signed_p)
+\f
+/* Return the shift count of a vector by scalar shift builtin second argument
+ ARG1. */
+static tree
+ix86_vector_shift_count (tree arg1)
{
- rtx_code_label *end_label, *qimode_label;
- rtx div, mod;
- rtx_insn *insn;
- rtx scratch, tmp0, tmp1, tmp2;
- rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
- rtx (*gen_zero_extend) (rtx, rtx);
- rtx (*gen_test_ccno_1) (rtx, rtx);
-
- switch (mode)
- {
- case E_SImode:
- if (GET_MODE (operands[0]) == SImode)
- {
- if (GET_MODE (operands[1]) == SImode)
- gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
- else
- gen_divmod4_1
- = signed_p ? gen_divmodsi4_zext_2 : gen_udivmodsi4_zext_2;
- gen_zero_extend = gen_zero_extendqisi2;
- }
- else
- {
- gen_divmod4_1
- = signed_p ? gen_divmodsi4_zext_1 : gen_udivmodsi4_zext_1;
- gen_zero_extend = gen_zero_extendqidi2;
- }
- gen_test_ccno_1 = gen_testsi_ccno_1;
- break;
- case E_DImode:
- gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
- gen_test_ccno_1 = gen_testdi_ccno_1;
- gen_zero_extend = gen_zero_extendqidi2;
- break;
- default:
- gcc_unreachable ();
- }
-
- end_label = gen_label_rtx ();
- qimode_label = gen_label_rtx ();
-
- scratch = gen_reg_rtx (mode);
-
- /* Use 8bit unsigned divimod if dividend and divisor are within
- the range [0-255]. */
- emit_move_insn (scratch, operands[2]);
- scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
- scratch, 1, OPTAB_DIRECT);
- emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
- tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
- tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
- tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
- gen_rtx_LABEL_REF (VOIDmode, qimode_label),
- pc_rtx);
- insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
- predict_jump (REG_BR_PROB_BASE * 50 / 100);
- JUMP_LABEL (insn) = qimode_label;
-
- /* Generate original signed/unsigned divimod. */
- div = gen_divmod4_1 (operands[0], operands[1],
- operands[2], operands[3]);
- emit_insn (div);
-
- /* Branch to the end. */
- emit_jump_insn (gen_jump (end_label));
- emit_barrier ();
-
- /* Generate 8bit unsigned divide. */
- emit_label (qimode_label);
- /* Don't use operands[0] for result of 8bit divide since not all
- registers support QImode ZERO_EXTRACT. */
- tmp0 = lowpart_subreg (HImode, scratch, mode);
- tmp1 = lowpart_subreg (HImode, operands[2], mode);
- tmp2 = lowpart_subreg (QImode, operands[3], mode);
- emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
-
- if (signed_p)
- {
- div = gen_rtx_DIV (mode, operands[2], operands[3]);
- mod = gen_rtx_MOD (mode, operands[2], operands[3]);
- }
- else
- {
- div = gen_rtx_UDIV (mode, operands[2], operands[3]);
- mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
- }
- if (mode == SImode)
- {
- if (GET_MODE (operands[0]) != SImode)
- div = gen_rtx_ZERO_EXTEND (DImode, div);
- if (GET_MODE (operands[1]) != SImode)
- mod = gen_rtx_ZERO_EXTEND (DImode, mod);
- }
-
- /* Extract remainder from AH. */
- tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]),
- tmp0, GEN_INT (8), GEN_INT (8));
- if (REG_P (operands[1]))
- insn = emit_move_insn (operands[1], tmp1);
- else
+ if (tree_fits_uhwi_p (arg1))
+ return arg1;
+ else if (TREE_CODE (arg1) == VECTOR_CST && CHAR_BIT == 8)
{
- /* Need a new scratch register since the old one has result
- of 8bit divide. */
- scratch = gen_reg_rtx (GET_MODE (operands[1]));
- emit_move_insn (scratch, tmp1);
- insn = emit_move_insn (operands[1], scratch);
+ /* The count argument is weird, passed in as various 128-bit
+ (or 64-bit) vectors; the low 64 bits of it are the count. */
+ unsigned char buf[16];
+ int len = native_encode_expr (arg1, buf, 16);
+ if (len == 0)
+ return NULL_TREE;
+ tree t = native_interpret_expr (uint64_type_node, buf, len);
+ if (t && tree_fits_uhwi_p (t))
+ return t;
}
- set_unique_reg_note (insn, REG_EQUAL, mod);
-
- /* Zero extend quotient from AL. */
- tmp1 = gen_lowpart (QImode, tmp0);
- insn = emit_insn (gen_zero_extend (operands[0], tmp1));
- set_unique_reg_note (insn, REG_EQUAL, div);
-
- emit_label (end_label);
+ return NULL_TREE;
}
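+
+/* This matches the intrinsic convention, e.g. for
+
+     __m128i _mm_sll_epi32 (__m128i a, __m128i count);
+
+   the shift amount is taken from the low 64 bits of COUNT, so a constant
+   count vector can be folded by decoding just those 64 bits.  */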
-#define LEA_MAX_STALL (3)
-#define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
-
-/* Increase given DISTANCE in half-cycles according to
- dependencies between PREV and NEXT instructions.
- Add 1 half-cycle if there is no dependency and
- go to next cycle if there is some dependecy. */
-
-static unsigned int
-increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
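+
+/* Implement TARGET_FOLD_BUILTIN: try to fold a call to a machine-specific
+   builtin into a constant or a simpler tree when its arguments allow it.  */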
+static tree
+ix86_fold_builtin (tree fndecl, int n_args,
+ tree *args, bool ignore ATTRIBUTE_UNUSED)
{
- df_ref def, use;
-
- if (!prev || !next)
- return distance + (distance & 1) + 2;
+ if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
+ {
+ enum ix86_builtins fn_code = (enum ix86_builtins)
+ DECL_FUNCTION_CODE (fndecl);
+ enum rtx_code rcode;
+ bool is_vshift;
+ unsigned HOST_WIDE_INT mask;
- if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
- return distance + 1;
+ switch (fn_code)
+ {
+ case IX86_BUILTIN_CPU_IS:
+ case IX86_BUILTIN_CPU_SUPPORTS:
+ gcc_assert (n_args == 1);
+ return fold_builtin_cpu (fndecl, args);
- FOR_EACH_INSN_USE (use, next)
- FOR_EACH_INSN_DEF (def, prev)
- if (!DF_REF_IS_ARTIFICIAL (def)
- && DF_REF_REGNO (use) == DF_REF_REGNO (def))
- return distance + (distance & 1) + 2;
+ case IX86_BUILTIN_NANQ:
+ case IX86_BUILTIN_NANSQ:
+ {
+ tree type = TREE_TYPE (TREE_TYPE (fndecl));
+ const char *str = c_getstr (*args);
+ int quiet = fn_code == IX86_BUILTIN_NANQ;
+ REAL_VALUE_TYPE real;
- return distance + 1;
-}
+ if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
+ return build_real (type, real);
+ return NULL_TREE;
+ }
-/* Function checks if instruction INSN defines register number
- REGNO1 or REGNO2. */
+ case IX86_BUILTIN_INFQ:
+ case IX86_BUILTIN_HUGE_VALQ:
+ {
+ tree type = TREE_TYPE (TREE_TYPE (fndecl));
+ REAL_VALUE_TYPE inf;
+ real_inf (&inf);
+ return build_real (type, inf);
+ }
-static bool
-insn_defines_reg (unsigned int regno1, unsigned int regno2,
- rtx_insn *insn)
-{
- df_ref def;
-
- FOR_EACH_INSN_DEF (def, insn)
- if (DF_REF_REG_DEF_P (def)
- && !DF_REF_IS_ARTIFICIAL (def)
- && (regno1 == DF_REF_REGNO (def)
- || regno2 == DF_REF_REGNO (def)))
- return true;
-
- return false;
-}
-
-/* Function checks if instruction INSN uses register number
- REGNO as a part of address expression. */
-
-static bool
-insn_uses_reg_mem (unsigned int regno, rtx insn)
-{
- df_ref use;
-
- FOR_EACH_INSN_USE (use, insn)
- if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
- return true;
-
- return false;
-}
+ case IX86_BUILTIN_TZCNT16:
+ case IX86_BUILTIN_CTZS:
+ case IX86_BUILTIN_TZCNT32:
+ case IX86_BUILTIN_TZCNT64:
+ gcc_assert (n_args == 1);
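+ /* Unlike plain CTZ, a zero operand is well defined here: the result
+ is the operand precision.  */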
+ if (TREE_CODE (args[0]) == INTEGER_CST)
+ {
+ tree type = TREE_TYPE (TREE_TYPE (fndecl));
+ tree arg = args[0];
+ if (fn_code == IX86_BUILTIN_TZCNT16
+ || fn_code == IX86_BUILTIN_CTZS)
+ arg = fold_convert (short_unsigned_type_node, arg);
+ if (integer_zerop (arg))
+ return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
+ else
+ return fold_const_call (CFN_CTZ, type, arg);
+ }
+ break;
-/* Search backward for non-agu definition of register number REGNO1
- or register number REGNO2 in basic block starting from instruction
- START up to head of basic block or instruction INSN.
+ case IX86_BUILTIN_LZCNT16:
+ case IX86_BUILTIN_CLZS:
+ case IX86_BUILTIN_LZCNT32:
+ case IX86_BUILTIN_LZCNT64:
+ gcc_assert (n_args == 1);
+ if (TREE_CODE (args[0]) == INTEGER_CST)
+ {
+ tree type = TREE_TYPE (TREE_TYPE (fndecl));
+ tree arg = args[0];
+ if (fn_code == IX86_BUILTIN_LZCNT16
+ || fn_code == IX86_BUILTIN_CLZS)
+ arg = fold_convert (short_unsigned_type_node, arg);
+ if (integer_zerop (arg))
+ return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
+ else
+ return fold_const_call (CFN_CLZ, type, arg);
+ }
+ break;
- Function puts true value into *FOUND var if definition was found
- and false otherwise.
+ case IX86_BUILTIN_BEXTR32:
+ case IX86_BUILTIN_BEXTR64:
+ case IX86_BUILTIN_BEXTRI32:
+ case IX86_BUILTIN_BEXTRI64:
+ gcc_assert (n_args == 2);
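+ /* The second operand packs the bit-field start in bits [7:0] and its
+ length in bits [15:8]; out-of-range selections fold to zero.  */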
+ if (tree_fits_uhwi_p (args[1]))
+ {
+ unsigned HOST_WIDE_INT res = 0;
+ unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
+ unsigned int start = tree_to_uhwi (args[1]);
+ unsigned int len = (start & 0xff00) >> 8;
+ start &= 0xff;
+ if (start >= prec || len == 0)
+ res = 0;
+ else if (!tree_fits_uhwi_p (args[0]))
+ break;
+ else
+ res = tree_to_uhwi (args[0]) >> start;
+ if (len > prec)
+ len = prec;
+ if (len < HOST_BITS_PER_WIDE_INT)
+ res &= (HOST_WIDE_INT_1U << len) - 1;
+ return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
+ }
+ break;
- Distance in half-cycles between START and found instruction or head
- of BB is added to DISTANCE and returned. */
+ case IX86_BUILTIN_BZHI32:
+ case IX86_BUILTIN_BZHI64:
+ gcc_assert (n_args == 2);
+ if (tree_fits_uhwi_p (args[1]))
+ {
+ unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
+ if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
+ return args[0];
+ if (idx == 0)
+ return build_int_cst (TREE_TYPE (TREE_TYPE (fndecl)), 0);
+ if (!tree_fits_uhwi_p (args[0]))
+ break;
+ unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
+ res &= ~(HOST_WIDE_INT_M1U << idx);
+ return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
+ }
+ break;
-static int
-distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
- rtx_insn *insn, int distance,
- rtx_insn *start, bool *found)
-{
- basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
- rtx_insn *prev = start;
- rtx_insn *next = NULL;
+ case IX86_BUILTIN_PDEP32:
+ case IX86_BUILTIN_PDEP64:
+ gcc_assert (n_args == 2);
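+ /* PDEP deposits the low bits of the source, in order, at the bit
+ positions set in the mask; all other result bits are zero.  */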
+ if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
+ {
+ unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
+ unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
+ unsigned HOST_WIDE_INT res = 0;
+ unsigned HOST_WIDE_INT m, k = 1;
+ for (m = 1; m; m <<= 1)
+ if ((mask & m) != 0)
+ {
+ if ((src & k) != 0)
+ res |= m;
+ k <<= 1;
+ }
+ return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
+ }
+ break;
- *found = false;
+ case IX86_BUILTIN_PEXT32:
+ case IX86_BUILTIN_PEXT64:
+ gcc_assert (n_args == 2);
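+ /* PEXT gathers the source bits selected by the mask into the
+ contiguous low bits of the result.  */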
+ if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
+ {
+ unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
+ unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
+ unsigned HOST_WIDE_INT res = 0;
+ unsigned HOST_WIDE_INT m, k = 1;
+ for (m = 1; m; m <<= 1)
+ if ((mask & m) != 0)
+ {
+ if ((src & m) != 0)
+ res |= k;
+ k <<= 1;
+ }
+ return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
+ }
+ break;
- while (prev
- && prev != insn
- && distance < LEA_SEARCH_THRESHOLD)
- {
- if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
- {
- distance = increase_distance (prev, next, distance);
- if (insn_defines_reg (regno1, regno2, prev))
+ case IX86_BUILTIN_MOVMSKPS:
+ case IX86_BUILTIN_PMOVMSKB:
+ case IX86_BUILTIN_MOVMSKPD:
+ case IX86_BUILTIN_PMOVMSKB128:
+ case IX86_BUILTIN_MOVMSKPD256:
+ case IX86_BUILTIN_MOVMSKPS256:
+ case IX86_BUILTIN_PMOVMSKB256:
+ gcc_assert (n_args == 1);
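+ /* movmsk collects the sign bit of each element into the low bits
+ of the scalar result.  */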
+ if (TREE_CODE (args[0]) == VECTOR_CST)
{
- if (recog_memoized (prev) < 0
- || get_attr_type (prev) != TYPE_LEA)
+ HOST_WIDE_INT res = 0;
+ for (unsigned i = 0; i < VECTOR_CST_NELTS (args[0]); ++i)
{
- *found = true;
- return distance;
+ tree e = VECTOR_CST_ELT (args[0], i);
+ if (TREE_CODE (e) == INTEGER_CST && !TREE_OVERFLOW (e))
+ {
+ if (wi::neg_p (wi::to_wide (e)))
+ res |= HOST_WIDE_INT_1 << i;
+ }
+ else if (TREE_CODE (e) == REAL_CST && !TREE_OVERFLOW (e))
+ {
+ if (TREE_REAL_CST (e).sign)
+ res |= HOST_WIDE_INT_1 << i;
+ }
+ else
+ return NULL_TREE;
}
+ return build_int_cst (TREE_TYPE (TREE_TYPE (fndecl)), res);
}
+ break;
- next = prev;
- }
- if (prev == BB_HEAD (bb))
- break;
-
- prev = PREV_INSN (prev);
- }
-
- return distance;
-}
-
-/* Search backward for non-agu definition of register number REGNO1
- or register number REGNO2 in INSN's basic block until
- 1. Pass LEA_SEARCH_THRESHOLD instructions, or
- 2. Reach neighbor BBs boundary, or
- 3. Reach agu definition.
- Returns the distance between the non-agu definition point and INSN.
- If no definition point, returns -1. */
-
-static int
-distance_non_agu_define (unsigned int regno1, unsigned int regno2,
- rtx_insn *insn)
-{
- basic_block bb = BLOCK_FOR_INSN (insn);
- int distance = 0;
- bool found = false;
-
- if (insn != BB_HEAD (bb))
- distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
- distance, PREV_INSN (insn),
- &found);
-
- if (!found && distance < LEA_SEARCH_THRESHOLD)
- {
- edge e;
- edge_iterator ei;
- bool simple_loop = false;
-
- FOR_EACH_EDGE (e, ei, bb->preds)
- if (e->src == bb)
- {
- simple_loop = true;
- break;
- }
-
- if (simple_loop)
- distance = distance_non_agu_define_in_bb (regno1, regno2,
- insn, distance,
- BB_END (bb), &found);
- else
- {
- int shortest_dist = -1;
- bool found_in_bb = false;
-
- FOR_EACH_EDGE (e, ei, bb->preds)
- {
- int bb_dist
- = distance_non_agu_define_in_bb (regno1, regno2,
- insn, distance,
- BB_END (e->src),
- &found_in_bb);
- if (found_in_bb)
- {
- if (shortest_dist < 0)
- shortest_dist = bb_dist;
- else if (bb_dist > 0)
- shortest_dist = MIN (bb_dist, shortest_dist);
-
- found = true;
- }
- }
-
- distance = shortest_dist;
- }
- }
-
- /* get_attr_type may modify recog data. We want to make sure
- that recog data is valid for instruction INSN, on which
- distance_non_agu_define is called. INSN is unchanged here. */
- extract_insn_cached (insn);
-
- if (!found)
- return -1;
-
- return distance >> 1;
-}
-
-/* Return the distance in half-cycles between INSN and the next
- insn that uses register number REGNO in memory address added
- to DISTANCE. Return -1 if REGNO0 is set.
-
- Put true value into *FOUND if register usage was found and
- false otherwise.
- Put true value into *REDEFINED if register redefinition was
- found and false otherwise. */
-
-static int
-distance_agu_use_in_bb (unsigned int regno,
- rtx_insn *insn, int distance, rtx_insn *start,
- bool *found, bool *redefined)
-{
- basic_block bb = NULL;
- rtx_insn *next = start;
- rtx_insn *prev = NULL;
-
- *found = false;
- *redefined = false;
-
- if (start != NULL_RTX)
- {
- bb = BLOCK_FOR_INSN (start);
- if (start != BB_HEAD (bb))
- /* If insn and start belong to the same bb, set prev to insn,
- so the call to increase_distance will increase the distance
- between insns by 1. */
- prev = insn;
- }
+ case IX86_BUILTIN_PSLLD:
+ case IX86_BUILTIN_PSLLD128:
+ case IX86_BUILTIN_PSLLD128_MASK:
+ case IX86_BUILTIN_PSLLD256:
+ case IX86_BUILTIN_PSLLD256_MASK:
+ case IX86_BUILTIN_PSLLD512:
+ case IX86_BUILTIN_PSLLDI:
+ case IX86_BUILTIN_PSLLDI128:
+ case IX86_BUILTIN_PSLLDI128_MASK:
+ case IX86_BUILTIN_PSLLDI256:
+ case IX86_BUILTIN_PSLLDI256_MASK:
+ case IX86_BUILTIN_PSLLDI512:
+ case IX86_BUILTIN_PSLLQ:
+ case IX86_BUILTIN_PSLLQ128:
+ case IX86_BUILTIN_PSLLQ128_MASK:
+ case IX86_BUILTIN_PSLLQ256:
+ case IX86_BUILTIN_PSLLQ256_MASK:
+ case IX86_BUILTIN_PSLLQ512:
+ case IX86_BUILTIN_PSLLQI:
+ case IX86_BUILTIN_PSLLQI128:
+ case IX86_BUILTIN_PSLLQI128_MASK:
+ case IX86_BUILTIN_PSLLQI256:
+ case IX86_BUILTIN_PSLLQI256_MASK:
+ case IX86_BUILTIN_PSLLQI512:
+ case IX86_BUILTIN_PSLLW:
+ case IX86_BUILTIN_PSLLW128:
+ case IX86_BUILTIN_PSLLW128_MASK:
+ case IX86_BUILTIN_PSLLW256:
+ case IX86_BUILTIN_PSLLW256_MASK:
+ case IX86_BUILTIN_PSLLW512_MASK:
+ case IX86_BUILTIN_PSLLWI:
+ case IX86_BUILTIN_PSLLWI128:
+ case IX86_BUILTIN_PSLLWI128_MASK:
+ case IX86_BUILTIN_PSLLWI256:
+ case IX86_BUILTIN_PSLLWI256_MASK:
+ case IX86_BUILTIN_PSLLWI512_MASK:
+ rcode = ASHIFT;
+ is_vshift = false;
+ goto do_shift;
+ case IX86_BUILTIN_PSRAD:
+ case IX86_BUILTIN_PSRAD128:
+ case IX86_BUILTIN_PSRAD128_MASK:
+ case IX86_BUILTIN_PSRAD256:
+ case IX86_BUILTIN_PSRAD256_MASK:
+ case IX86_BUILTIN_PSRAD512:
+ case IX86_BUILTIN_PSRADI:
+ case IX86_BUILTIN_PSRADI128:
+ case IX86_BUILTIN_PSRADI128_MASK:
+ case IX86_BUILTIN_PSRADI256:
+ case IX86_BUILTIN_PSRADI256_MASK:
+ case IX86_BUILTIN_PSRADI512:
+ case IX86_BUILTIN_PSRAQ128_MASK:
+ case IX86_BUILTIN_PSRAQ256_MASK:
+ case IX86_BUILTIN_PSRAQ512:
+ case IX86_BUILTIN_PSRAQI128_MASK:
+ case IX86_BUILTIN_PSRAQI256_MASK:
+ case IX86_BUILTIN_PSRAQI512:
+ case IX86_BUILTIN_PSRAW:
+ case IX86_BUILTIN_PSRAW128:
+ case IX86_BUILTIN_PSRAW128_MASK:
+ case IX86_BUILTIN_PSRAW256:
+ case IX86_BUILTIN_PSRAW256_MASK:
+ case IX86_BUILTIN_PSRAW512:
+ case IX86_BUILTIN_PSRAWI:
+ case IX86_BUILTIN_PSRAWI128:
+ case IX86_BUILTIN_PSRAWI128_MASK:
+ case IX86_BUILTIN_PSRAWI256:
+ case IX86_BUILTIN_PSRAWI256_MASK:
+ case IX86_BUILTIN_PSRAWI512:
+ rcode = ASHIFTRT;
+ is_vshift = false;
+ goto do_shift;
+ case IX86_BUILTIN_PSRLD:
+ case IX86_BUILTIN_PSRLD128:
+ case IX86_BUILTIN_PSRLD128_MASK:
+ case IX86_BUILTIN_PSRLD256:
+ case IX86_BUILTIN_PSRLD256_MASK:
+ case IX86_BUILTIN_PSRLD512:
+ case IX86_BUILTIN_PSRLDI:
+ case IX86_BUILTIN_PSRLDI128:
+ case IX86_BUILTIN_PSRLDI128_MASK:
+ case IX86_BUILTIN_PSRLDI256:
+ case IX86_BUILTIN_PSRLDI256_MASK:
+ case IX86_BUILTIN_PSRLDI512:
+ case IX86_BUILTIN_PSRLQ:
+ case IX86_BUILTIN_PSRLQ128:
+ case IX86_BUILTIN_PSRLQ128_MASK:
+ case IX86_BUILTIN_PSRLQ256:
+ case IX86_BUILTIN_PSRLQ256_MASK:
+ case IX86_BUILTIN_PSRLQ512:
+ case IX86_BUILTIN_PSRLQI:
+ case IX86_BUILTIN_PSRLQI128:
+ case IX86_BUILTIN_PSRLQI128_MASK:
+ case IX86_BUILTIN_PSRLQI256:
+ case IX86_BUILTIN_PSRLQI256_MASK:
+ case IX86_BUILTIN_PSRLQI512:
+ case IX86_BUILTIN_PSRLW:
+ case IX86_BUILTIN_PSRLW128:
+ case IX86_BUILTIN_PSRLW128_MASK:
+ case IX86_BUILTIN_PSRLW256:
+ case IX86_BUILTIN_PSRLW256_MASK:
+ case IX86_BUILTIN_PSRLW512:
+ case IX86_BUILTIN_PSRLWI:
+ case IX86_BUILTIN_PSRLWI128:
+ case IX86_BUILTIN_PSRLWI128_MASK:
+ case IX86_BUILTIN_PSRLWI256:
+ case IX86_BUILTIN_PSRLWI256_MASK:
+ case IX86_BUILTIN_PSRLWI512:
+ rcode = LSHIFTRT;
+ is_vshift = false;
+ goto do_shift;
+ case IX86_BUILTIN_PSLLVV16HI:
+ case IX86_BUILTIN_PSLLVV16SI:
+ case IX86_BUILTIN_PSLLVV2DI:
+ case IX86_BUILTIN_PSLLVV2DI_MASK:
+ case IX86_BUILTIN_PSLLVV32HI:
+ case IX86_BUILTIN_PSLLVV4DI:
+ case IX86_BUILTIN_PSLLVV4DI_MASK:
+ case IX86_BUILTIN_PSLLVV4SI:
+ case IX86_BUILTIN_PSLLVV4SI_MASK:
+ case IX86_BUILTIN_PSLLVV8DI:
+ case IX86_BUILTIN_PSLLVV8HI:
+ case IX86_BUILTIN_PSLLVV8SI:
+ case IX86_BUILTIN_PSLLVV8SI_MASK:
+ rcode = ASHIFT;
+ is_vshift = true;
+ goto do_shift;
+ case IX86_BUILTIN_PSRAVQ128:
+ case IX86_BUILTIN_PSRAVQ256:
+ case IX86_BUILTIN_PSRAVV16HI:
+ case IX86_BUILTIN_PSRAVV16SI:
+ case IX86_BUILTIN_PSRAVV32HI:
+ case IX86_BUILTIN_PSRAVV4SI:
+ case IX86_BUILTIN_PSRAVV4SI_MASK:
+ case IX86_BUILTIN_PSRAVV8DI:
+ case IX86_BUILTIN_PSRAVV8HI:
+ case IX86_BUILTIN_PSRAVV8SI:
+ case IX86_BUILTIN_PSRAVV8SI_MASK:
+ rcode = ASHIFTRT;
+ is_vshift = true;
+ goto do_shift;
+ case IX86_BUILTIN_PSRLVV16HI:
+ case IX86_BUILTIN_PSRLVV16SI:
+ case IX86_BUILTIN_PSRLVV2DI:
+ case IX86_BUILTIN_PSRLVV2DI_MASK:
+ case IX86_BUILTIN_PSRLVV32HI:
+ case IX86_BUILTIN_PSRLVV4DI:
+ case IX86_BUILTIN_PSRLVV4DI_MASK:
+ case IX86_BUILTIN_PSRLVV4SI:
+ case IX86_BUILTIN_PSRLVV4SI_MASK:
+ case IX86_BUILTIN_PSRLVV8DI:
+ case IX86_BUILTIN_PSRLVV8HI:
+ case IX86_BUILTIN_PSRLVV8SI:
+ case IX86_BUILTIN_PSRLVV8SI_MASK:
+ rcode = LSHIFTRT;
+ is_vshift = true;
+ goto do_shift;
- while (next
- && next != insn
- && distance < LEA_SEARCH_THRESHOLD)
- {
- if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
- {
- distance = increase_distance(prev, next, distance);
- if (insn_uses_reg_mem (regno, next))
+ do_shift:
+ gcc_assert (n_args >= 2);
+ if (TREE_CODE (args[0]) != VECTOR_CST)
+ break;
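+ /* MASK selects which result elements come from the shift; elements
+ whose mask bit is clear are taken from the merge operand instead.  */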
+ mask = HOST_WIDE_INT_M1U;
+ if (n_args > 2)
{
- /* Return DISTANCE if OP0 is used in memory
- address in NEXT. */
- *found = true;
- return distance;
+ /* This is masked shift. */
+ if (!tree_fits_uhwi_p (args[n_args - 1])
+ || TREE_SIDE_EFFECTS (args[n_args - 2]))
+ break;
+ mask = tree_to_uhwi (args[n_args - 1]);
+ unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0]));
+ mask |= HOST_WIDE_INT_M1U << elems;
+ if (mask != HOST_WIDE_INT_M1U
+ && TREE_CODE (args[n_args - 2]) != VECTOR_CST)
+ break;
+ if (mask == (HOST_WIDE_INT_M1U << elems))
+ return args[n_args - 2];
}
-
- if (insn_defines_reg (regno, INVALID_REGNUM, next))
+ if (is_vshift && TREE_CODE (args[1]) != VECTOR_CST)
+ break;
+ if (tree tem = (is_vshift ? integer_one_node
+ : ix86_vector_shift_count (args[1])))
{
- /* Return -1 if OP0 is set in NEXT. */
- *redefined = true;
- return -1;
- }
-
- prev = next;
- }
-
- if (next == BB_END (bb))
- break;
-
- next = NEXT_INSN (next);
- }
-
- return distance;
-}
-
-/* Return the distance between INSN and the next insn that uses
- register number REGNO0 in memory address. Return -1 if no such
- a use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
-
-static int
-distance_agu_use (unsigned int regno0, rtx_insn *insn)
-{
- basic_block bb = BLOCK_FOR_INSN (insn);
- int distance = 0;
- bool found = false;
- bool redefined = false;
-
- if (insn != BB_END (bb))
- distance = distance_agu_use_in_bb (regno0, insn, distance,
- NEXT_INSN (insn),
- &found, &redefined);
-
- if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
- {
- edge e;
- edge_iterator ei;
- bool simple_loop = false;
-
- FOR_EACH_EDGE (e, ei, bb->succs)
- if (e->dest == bb)
- {
- simple_loop = true;
- break;
- }
-
- if (simple_loop)
- distance = distance_agu_use_in_bb (regno0, insn,
- distance, BB_HEAD (bb),
- &found, &redefined);
- else
- {
- int shortest_dist = -1;
- bool found_in_bb = false;
- bool redefined_in_bb = false;
-
- FOR_EACH_EDGE (e, ei, bb->succs)
- {
- int bb_dist
- = distance_agu_use_in_bb (regno0, insn,
- distance, BB_HEAD (e->dest),
- &found_in_bb, &redefined_in_bb);
- if (found_in_bb)
+ unsigned HOST_WIDE_INT count = tree_to_uhwi (tem);
+ unsigned HOST_WIDE_INT prec
+ = TYPE_PRECISION (TREE_TYPE (TREE_TYPE (args[0])));
+ if (count == 0 && mask == HOST_WIDE_INT_M1U)
+ return args[0];
+ if (count >= prec)
{
- if (shortest_dist < 0)
- shortest_dist = bb_dist;
- else if (bb_dist > 0)
- shortest_dist = MIN (bb_dist, shortest_dist);
-
- found = true;
+ if (rcode == ASHIFTRT)
+ count = prec - 1;
+ else if (mask == HOST_WIDE_INT_M1U)
+ return build_zero_cst (TREE_TYPE (args[0]));
+ }
+ tree countt = NULL_TREE;
+ if (!is_vshift)
+ {
+ if (count >= prec)
+ countt = integer_zero_node;
+ else
+ countt = build_int_cst (integer_type_node, count);
+ }
+ tree_vector_builder builder;
+ builder.new_unary_operation (TREE_TYPE (args[0]), args[0],
+ false);
+ unsigned int cnt = builder.encoded_nelts ();
+ for (unsigned int i = 0; i < cnt; ++i)
+ {
+ tree elt = VECTOR_CST_ELT (args[0], i);
+ if (TREE_CODE (elt) != INTEGER_CST || TREE_OVERFLOW (elt))
+ return NULL_TREE;
+ tree type = TREE_TYPE (elt);
+ if (rcode == LSHIFTRT)
+ elt = fold_convert (unsigned_type_for (type), elt);
+ if (is_vshift)
+ {
+ countt = VECTOR_CST_ELT (args[1], i);
+ if (TREE_CODE (countt) != INTEGER_CST
+ || TREE_OVERFLOW (countt))
+ return NULL_TREE;
+ if (wi::neg_p (wi::to_wide (countt))
+ || wi::to_widest (countt) >= prec)
+ {
+ if (rcode == ASHIFTRT)
+ countt = build_int_cst (TREE_TYPE (countt),
+ prec - 1);
+ else
+ {
+ elt = build_zero_cst (TREE_TYPE (elt));
+ countt = build_zero_cst (TREE_TYPE (countt));
+ }
+ }
+ }
+ else if (count >= prec)
+ elt = build_zero_cst (TREE_TYPE (elt));
+ elt = const_binop (rcode == ASHIFT
+ ? LSHIFT_EXPR : RSHIFT_EXPR,
+ TREE_TYPE (elt), elt, countt);
+ if (!elt || TREE_CODE (elt) != INTEGER_CST)
+ return NULL_TREE;
+ if (rcode == LSHIFTRT)
+ elt = fold_convert (type, elt);
+ if ((mask & (HOST_WIDE_INT_1U << i)) == 0)
+ {
+ elt = VECTOR_CST_ELT (args[n_args - 2], i);
+ if (TREE_CODE (elt) != INTEGER_CST
+ || TREE_OVERFLOW (elt))
+ return NULL_TREE;
+ }
+ builder.quick_push (elt);
}
+ return builder.build ();
}
+ break;
- distance = shortest_dist;
- }
- }
-
- if (!found || redefined)
- return -1;
-
- return distance >> 1;
-}
-
-/* Define this macro to tune LEA priority vs ADD, it take effect when
- there is a dilemma of choicing LEA or ADD
- Negative value: ADD is more preferred than LEA
- Zero: Netrual
- Positive value: LEA is more preferred than ADD*/
-#define IX86_LEA_PRIORITY 0
-
-/* Return true if usage of lea INSN has performance advantage
- over a sequence of instructions. Instructions sequence has
- SPLIT_COST cycles higher latency than lea latency. */
-
-static bool
-ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
- unsigned int regno2, int split_cost, bool has_scale)
-{
- int dist_define, dist_use;
-
- /* For Silvermont if using a 2-source or 3-source LEA for
- non-destructive destination purposes, or due to wanting
- ability to use SCALE, the use of LEA is justified. */
- if (TARGET_SILVERMONT || TARGET_GOLDMONT || TARGET_GOLDMONT_PLUS
- || TARGET_TREMONT || TARGET_INTEL)
- {
- if (has_scale)
- return true;
- if (split_cost < 1)
- return false;
- if (regno0 == regno1 || regno0 == regno2)
- return false;
- return true;
- }
-
- dist_define = distance_non_agu_define (regno1, regno2, insn);
- dist_use = distance_agu_use (regno0, insn);
-
- if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
- {
- /* If there is no non AGU operand definition, no AGU
- operand usage and split cost is 0 then both lea
- and non lea variants have same priority. Currently
- we prefer lea for 64 bit code and non lea on 32 bit
- code. */
- if (dist_use < 0 && split_cost == 0)
- return TARGET_64BIT || IX86_LEA_PRIORITY;
- else
- return true;
- }
-
- /* With longer definitions distance lea is more preferable.
- Here we change it to take into account splitting cost and
- lea priority. */
- dist_define += split_cost + IX86_LEA_PRIORITY;
-
- /* If there is no use in memory addess then we just check
- that split cost exceeds AGU stall. */
- if (dist_use < 0)
- return dist_define > LEA_MAX_STALL;
-
- /* If this insn has both backward non-agu dependence and forward
- agu dependence, the one with short distance takes effect. */
- return dist_define >= dist_use;
-}
-
-/* Return true if it is legal to clobber flags by INSN and
- false otherwise. */
-
-static bool
-ix86_ok_to_clobber_flags (rtx_insn *insn)
-{
- basic_block bb = BLOCK_FOR_INSN (insn);
- df_ref use;
- bitmap live;
-
- while (insn)
- {
- if (NONDEBUG_INSN_P (insn))
- {
- FOR_EACH_INSN_USE (use, insn)
- if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
- return false;
-
- if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
- return true;
- }
-
- if (insn == BB_END (bb))
- break;
-
- insn = NEXT_INSN (insn);
- }
-
- live = df_get_live_out(bb);
- return !REGNO_REG_SET_P (live, FLAGS_REG);
-}
-
-/* Return true if we need to split op0 = op1 + op2 into a sequence of
- move and add to avoid AGU stalls. */
-
-bool
-ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
-{
- unsigned int regno0, regno1, regno2;
-
- /* Check if we need to optimize. */
- if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
- return false;
-
- /* Check it is correct to split here. */
- if (!ix86_ok_to_clobber_flags(insn))
- return false;
-
- regno0 = true_regnum (operands[0]);
- regno1 = true_regnum (operands[1]);
- regno2 = true_regnum (operands[2]);
-
- /* We need to split only adds with non destructive
- destination operand. */
- if (regno0 == regno1 || regno0 == regno2)
- return false;
- else
- return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
-}
-
-/* Return true if we should emit lea instruction instead of mov
- instruction. */
-
-bool
-ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
-{
- unsigned int regno0, regno1;
-
- /* Check if we need to optimize. */
- if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
- return false;
-
- /* Use lea for reg to reg moves only. */
- if (!REG_P (operands[0]) || !REG_P (operands[1]))
- return false;
-
- regno0 = true_regnum (operands[0]);
- regno1 = true_regnum (operands[1]);
-
- return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
-}
-
-/* Return true if we need to split lea into a sequence of
- instructions to avoid AGU stalls. */
-
-bool
-ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
-{
- unsigned int regno0, regno1, regno2;
- int split_cost;
- struct ix86_address parts;
- int ok;
-
- /* Check if we need to optimize. */
- if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
- return false;
-
- /* The "at least two components" test below might not catch simple
- move or zero extension insns if parts.base is non-NULL and parts.disp
- is const0_rtx as the only components in the address, e.g. if the
- register is %rbp or %r13. As this test is much cheaper and moves or
- zero extensions are the common case, do this check first. */
- if (REG_P (operands[1])
- || (SImode_address_operand (operands[1], VOIDmode)
- && REG_P (XEXP (operands[1], 0))))
- return false;
-
- /* Check if it is OK to split here. */
- if (!ix86_ok_to_clobber_flags (insn))
- return false;
-
- ok = ix86_decompose_address (operands[1], &parts);
- gcc_assert (ok);
-
- /* There should be at least two components in the address. */
- if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
- + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
- return false;
-
- /* We should not split into add if a non-legitimate PIC
- operand is used as the displacement. */
- if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
- return false;
-
- regno0 = true_regnum (operands[0]);
- regno1 = INVALID_REGNUM;
- regno2 = INVALID_REGNUM;
-
- if (parts.base)
- regno1 = true_regnum (parts.base);
- if (parts.index)
- regno2 = true_regnum (parts.index);
-
- split_cost = 0;
-
- /* Compute how many cycles we will add to the execution time
- if we split the lea into a sequence of instructions. */
- if (parts.base || parts.index)
- {
- /* Have to use a mov instruction if the non-destructive
- destination form is used. */
- if (regno1 != regno0 && regno2 != regno0)
- split_cost += 1;
-
- /* Have to add index to base if both exist. */
- if (parts.base && parts.index)
- split_cost += 1;
-
- /* Have to use shift and adds if scale is 2 or greater. */
- if (parts.scale > 1)
- {
- if (regno0 != regno1)
- split_cost += 1;
- else if (regno2 == regno0)
- split_cost += 4;
- else
- split_cost += parts.scale;
- }
-
- /* Have to use an add instruction with an immediate if
- disp is nonzero. */
- if (parts.disp && parts.disp != const0_rtx)
- split_cost += 1;
-
- /* Subtract the price of lea. */
- split_cost -= 1;
- }
-
- return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
- parts.scale > 1);
-}
-
- /* Emit the x86 binary operator CODE in mode MODE, where the first
- operand matches the destination. The emitted RTX includes a clobber
- of FLAGS_REG. */
-
-static void
-ix86_emit_binop (enum rtx_code code, machine_mode mode,
- rtx dst, rtx src)
-{
- rtx op, clob;
-
- op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
- clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
-
- emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
-}
-
- /* Return true if the definition of REGNO1 is nearer to INSN than the
- definition of REGNO2. */
-
-static bool
-find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
-{
- rtx_insn *prev = insn;
- rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
-
- if (insn == start)
- return false;
- while (prev && prev != start)
- {
- if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
- {
- prev = PREV_INSN (prev);
- continue;
- }
- if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
- return true;
- else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
- return false;
- prev = PREV_INSN (prev);
- }
-
- /* None of the regs is defined in the bb. */
- return false;
-}
-
- /* Split lea instructions into a sequence of instructions
- which are executed on the ALU to avoid AGU stalls.
- It is assumed that it is allowed to clobber the flags register
- at the lea position. */
-
-void
-ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
-{
- unsigned int regno0, regno1, regno2;
- struct ix86_address parts;
- rtx target, tmp;
- int ok, adds;
-
- ok = ix86_decompose_address (operands[1], &parts);
- gcc_assert (ok);
-
- target = gen_lowpart (mode, operands[0]);
-
- regno0 = true_regnum (target);
- regno1 = INVALID_REGNUM;
- regno2 = INVALID_REGNUM;
-
- if (parts.base)
- {
- parts.base = gen_lowpart (mode, parts.base);
- regno1 = true_regnum (parts.base);
- }
-
- if (parts.index)
- {
- parts.index = gen_lowpart (mode, parts.index);
- regno2 = true_regnum (parts.index);
- }
-
- if (parts.disp)
- parts.disp = gen_lowpart (mode, parts.disp);
-
- if (parts.scale > 1)
- {
- /* Case r1 = r1 + ... */
- if (regno1 == regno0)
- {
- /* If we have the case r1 = r1 + C * r2 then we
- would have to use multiplication, which is very
- expensive. Assume the cost model is wrong if we
- reach such a case here. */
- gcc_assert (regno2 != regno0);
-
- for (adds = parts.scale; adds > 0; adds--)
- ix86_emit_binop (PLUS, mode, target, parts.index);
- }
- else
- {
- /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
- if (regno0 != regno2)
- emit_insn (gen_rtx_SET (target, parts.index));
-
- /* Use shift for scaling. */
- ix86_emit_binop (ASHIFT, mode, target,
- GEN_INT (exact_log2 (parts.scale)));
-
- if (parts.base)
- ix86_emit_binop (PLUS, mode, target, parts.base);
-
- if (parts.disp && parts.disp != const0_rtx)
- ix86_emit_binop (PLUS, mode, target, parts.disp);
- }
- }
- else if (!parts.base && !parts.index)
- {
- gcc_assert (parts.disp);
- emit_insn (gen_rtx_SET (target, parts.disp));
- }
- else
- {
- if (!parts.base)
- {
- if (regno0 != regno2)
- emit_insn (gen_rtx_SET (target, parts.index));
- }
- else if (!parts.index)
- {
- if (regno0 != regno1)
- emit_insn (gen_rtx_SET (target, parts.base));
- }
- else
- {
- if (regno0 == regno1)
- tmp = parts.index;
- else if (regno0 == regno2)
- tmp = parts.base;
- else
- {
- rtx tmp1;
-
- /* Find better operand for SET instruction, depending
- on which definition is farther from the insn. */
- if (find_nearest_reg_def (insn, regno1, regno2))
- tmp = parts.index, tmp1 = parts.base;
- else
- tmp = parts.base, tmp1 = parts.index;
-
- emit_insn (gen_rtx_SET (target, tmp));
-
- if (parts.disp && parts.disp != const0_rtx)
- ix86_emit_binop (PLUS, mode, target, parts.disp);
-
- ix86_emit_binop (PLUS, mode, target, tmp1);
- return;
- }
-
- ix86_emit_binop (PLUS, mode, target, tmp);
- }
-
- if (parts.disp && parts.disp != const0_rtx)
- ix86_emit_binop (PLUS, mode, target, parts.disp);
- }
-}
-
- /* Return true if it is OK to optimize an ADD operation to an LEA
- operation to avoid flag register consumption. For most processors,
- ADD is faster than LEA. For processors like BONNELL, if the
- destination register of the LEA holds an actual address which will be
- used soon, LEA is better; otherwise ADD is better. */
-
-bool
-ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
-{
- unsigned int regno0 = true_regnum (operands[0]);
- unsigned int regno1 = true_regnum (operands[1]);
- unsigned int regno2 = true_regnum (operands[2]);
-
- /* If a = b + c with a != b and a != c, we must use the lea form. */
- if (regno0 != regno1 && regno0 != regno2)
- return true;
-
- if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
- return false;
-
- return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
-}
-
-/* Return true if destination reg of SET_BODY is shift count of
- USE_BODY. */
-
-static bool
-ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
-{
- rtx set_dest;
- rtx shift_rtx;
- int i;
-
- /* Retrieve destination of SET_BODY. */
- switch (GET_CODE (set_body))
- {
- case SET:
- set_dest = SET_DEST (set_body);
- if (!set_dest || !REG_P (set_dest))
- return false;
- break;
- case PARALLEL:
- for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
- if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
- use_body))
- return true;
- /* FALLTHROUGH */
- default:
- return false;
- }
-
- /* Retrieve shift count of USE_BODY. */
- switch (GET_CODE (use_body))
- {
- case SET:
- shift_rtx = XEXP (use_body, 1);
- break;
- case PARALLEL:
- for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
- if (ix86_dep_by_shift_count_body (set_body,
- XVECEXP (use_body, 0, i)))
- return true;
- /* FALLTHROUGH */
- default:
- return false;
- }
-
- if (shift_rtx
- && (GET_CODE (shift_rtx) == ASHIFT
- || GET_CODE (shift_rtx) == LSHIFTRT
- || GET_CODE (shift_rtx) == ASHIFTRT
- || GET_CODE (shift_rtx) == ROTATE
- || GET_CODE (shift_rtx) == ROTATERT))
- {
- rtx shift_count = XEXP (shift_rtx, 1);
-
- /* Return true if shift count is dest of SET_BODY. */
- if (REG_P (shift_count))
- {
- /* Check reload_completed because this can be invoked before
- register allocation by the pre-reload scheduler. */
- if (reload_completed
- && true_regnum (set_dest) == true_regnum (shift_count))
- return true;
- else if (REGNO (set_dest) == REGNO (shift_count))
- return true;
- }
- }
-
- return false;
-}
-
-/* Return true if destination reg of SET_INSN is shift count of
- USE_INSN. */
-
-bool
-ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
-{
- return ix86_dep_by_shift_count_body (PATTERN (set_insn),
- PATTERN (use_insn));
-}
-
-/* Return TRUE or FALSE depending on whether the unary operator meets the
- appropriate constraints. */
-
-bool
-ix86_unary_operator_ok (enum rtx_code,
- machine_mode,
- rtx operands[2])
-{
- /* If one of operands is memory, source and destination must match. */
- if ((MEM_P (operands[0])
- || MEM_P (operands[1]))
- && ! rtx_equal_p (operands[0], operands[1]))
- return false;
- return true;
-}
-
-/* Return TRUE if the operands to a vec_interleave_{high,low}v2df
- are ok, keeping in mind the possible movddup alternative. */
-
-bool
-ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
-{
- if (MEM_P (operands[0]))
- return rtx_equal_p (operands[0], operands[1 + high]);
- if (MEM_P (operands[1]) && MEM_P (operands[2]))
- return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
- return true;
-}
-
-/* Post-reload splitter for converting an SF or DFmode value in an
- SSE register into an unsigned SImode. */
-
-void
-ix86_split_convert_uns_si_sse (rtx operands[])
-{
- machine_mode vecmode;
- rtx value, large, zero_or_two31, input, two31, x;
-
- large = operands[1];
- zero_or_two31 = operands[2];
- input = operands[3];
- two31 = operands[4];
- vecmode = GET_MODE (large);
- value = gen_rtx_REG (vecmode, REGNO (operands[0]));
-
- /* Load up the value into the low element. We must ensure that the other
- elements are valid floats -- zero is the easiest such value. */
- if (MEM_P (input))
- {
- if (vecmode == V4SFmode)
- emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
- else
- emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
- }
- else
- {
- input = gen_rtx_REG (vecmode, REGNO (input));
- emit_move_insn (value, CONST0_RTX (vecmode));
- if (vecmode == V4SFmode)
- emit_insn (gen_sse_movss (value, value, input));
- else
- emit_insn (gen_sse2_movsd (value, value, input));
- }
-
- emit_move_insn (large, two31);
- emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
-
- x = gen_rtx_fmt_ee (LE, vecmode, large, value);
- emit_insn (gen_rtx_SET (large, x));
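- /* LARGE now holds an all-ones mask in each element where 2**31 <= VALUE;
- the AND below selects 2**31 in exactly those elements, and the final
- shift/XOR adds 2**31 back to the truncated result. */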
-
- x = gen_rtx_AND (vecmode, zero_or_two31, large);
- emit_insn (gen_rtx_SET (zero_or_two31, x));
-
- x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
- emit_insn (gen_rtx_SET (value, x));
-
- large = gen_rtx_REG (V4SImode, REGNO (large));
- emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
-
- x = gen_rtx_REG (V4SImode, REGNO (value));
- if (vecmode == V4SFmode)
- emit_insn (gen_fix_truncv4sfv4si2 (x, value));
- else
- emit_insn (gen_sse2_cvttpd2dq (x, value));
- value = x;
-
- emit_insn (gen_xorv4si3 (value, value, large));
-}
-
-/* Convert an unsigned DImode value into a DFmode, using only SSE.
- Expects the 64-bit DImode to be supplied in a pair of integral
- registers. Requires SSE2; will use SSE3 if available. For x86_32,
- -mfpmath=sse, !optimize_size only. */
-
-void
-ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
-{
- REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
- rtx int_xmm, fp_xmm;
- rtx biases, exponents;
- rtx x;
-
- int_xmm = gen_reg_rtx (V4SImode);
- if (TARGET_INTER_UNIT_MOVES_TO_VEC)
- emit_insn (gen_movdi_to_sse (int_xmm, input));
- else if (TARGET_SSE_SPLIT_REGS)
- {
- emit_clobber (int_xmm);
- emit_move_insn (gen_lowpart (DImode, int_xmm), input);
- }
- else
- {
- x = gen_reg_rtx (V2DImode);
- ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
- emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
- }
-
- x = gen_rtx_CONST_VECTOR (V4SImode,
- gen_rtvec (4, GEN_INT (0x43300000UL),
- GEN_INT (0x45300000UL),
- const0_rtx, const0_rtx));
- exponents = validize_mem (force_const_mem (V4SImode, x));
-
- /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
- emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
-
- /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
- yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
- Similarly (0x45300000UL ## fp_value_hi_xmm) yields
- (0x1.0p84 + double(fp_value_hi_xmm)).
- Note these exponents differ by 32. */
-
- fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
-
- /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
- in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
- real_ldexp (&bias_lo_rvt, &dconst1, 52);
- real_ldexp (&bias_hi_rvt, &dconst1, 84);
- biases = const_double_from_real_value (bias_lo_rvt, DFmode);
- x = const_double_from_real_value (bias_hi_rvt, DFmode);
- biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
- biases = validize_mem (force_const_mem (V2DFmode, biases));
- emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
-
- /* Add the upper and lower DFmode values together. */
- if (TARGET_SSE3)
- emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
- else
- {
- x = copy_to_mode_reg (V2DFmode, fp_xmm);
- emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
- emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
- }
-
- ix86_expand_vector_extract (false, target, fp_xmm, 0);
-}
-
-/* Not used, but eases macroization of patterns. */
-void
-ix86_expand_convert_uns_sixf_sse (rtx, rtx)
-{
- gcc_unreachable ();
-}
-
-/* Convert an unsigned SImode value into a DFmode. Only currently used
- for SSE, but applicable anywhere. */
-
-void
-ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
-{
- REAL_VALUE_TYPE TWO31r;
- rtx x, fp;
-
- x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
- NULL, 1, OPTAB_DIRECT);
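- /* Adding INT_MIN flips the sign bit, so X is (signed) (INPUT - 2**31);
- converting X to DFmode and adding 2**31.0 back recovers INPUT. */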
-
- fp = gen_reg_rtx (DFmode);
- emit_insn (gen_floatsidf2 (fp, x));
-
- real_ldexp (&TWO31r, &dconst1, 31);
- x = const_double_from_real_value (TWO31r, DFmode);
-
- x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
- if (x != target)
- emit_move_insn (target, x);
-}
-
-/* Convert a signed DImode value into a DFmode. Only used for SSE in
- 32-bit mode; otherwise we have a direct convert instruction. */
-
-void
-ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
-{
- REAL_VALUE_TYPE TWO32r;
- rtx fp_lo, fp_hi, x;
-
- fp_lo = gen_reg_rtx (DFmode);
- fp_hi = gen_reg_rtx (DFmode);
-
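- /* Compute (double) (signed) HI * 2**32 + (double) (unsigned) LO. */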
- emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
-
- real_ldexp (&TWO32r, &dconst1, 32);
- x = const_double_from_real_value (TWO32r, DFmode);
- fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
-
- ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
-
- x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
- 0, OPTAB_DIRECT);
- if (x != target)
- emit_move_insn (target, x);
-}
-
-/* Convert an unsigned SImode value into a SFmode, using only SSE.
- For x86_32, -mfpmath=sse, !optimize_size only. */
-void
-ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
-{
- REAL_VALUE_TYPE ONE16r;
- rtx fp_hi, fp_lo, int_hi, int_lo, x;
-
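- /* Split INPUT into two 16-bit halves; each half converts to SFmode
- exactly, and the result is reassembled as fp_hi * 2**16 + fp_lo. */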
- real_ldexp (&ONE16r, &dconst1, 16);
- x = const_double_from_real_value (ONE16r, SFmode);
- int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
- NULL, 0, OPTAB_DIRECT);
- int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
- NULL, 0, OPTAB_DIRECT);
- fp_hi = gen_reg_rtx (SFmode);
- fp_lo = gen_reg_rtx (SFmode);
- emit_insn (gen_floatsisf2 (fp_hi, int_hi));
- emit_insn (gen_floatsisf2 (fp_lo, int_lo));
- fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
- 0, OPTAB_DIRECT);
- fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
- 0, OPTAB_DIRECT);
- if (!rtx_equal_p (target, fp_hi))
- emit_move_insn (target, fp_hi);
-}
-
-/* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
- a vector of unsigned ints VAL to vector of floats TARGET. */
-
-void
-ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
-{
- rtx tmp[8];
- REAL_VALUE_TYPE TWO16r;
- machine_mode intmode = GET_MODE (val);
- machine_mode fltmode = GET_MODE (target);
- rtx (*cvt) (rtx, rtx);
-
- if (intmode == V4SImode)
- cvt = gen_floatv4siv4sf2;
- else
- cvt = gen_floatv8siv8sf2;
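- /* As in the scalar case, split VAL into 16-bit halves that convert
- exactly and recombine them as tmp_hi * 2**16 + tmp_lo. */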
- tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
- tmp[0] = force_reg (intmode, tmp[0]);
- tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
- OPTAB_DIRECT);
- tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
- NULL_RTX, 1, OPTAB_DIRECT);
- tmp[3] = gen_reg_rtx (fltmode);
- emit_insn (cvt (tmp[3], tmp[1]));
- tmp[4] = gen_reg_rtx (fltmode);
- emit_insn (cvt (tmp[4], tmp[2]));
- real_ldexp (&TWO16r, &dconst1, 16);
- tmp[5] = const_double_from_real_value (TWO16r, SFmode);
- tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
- tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
- OPTAB_DIRECT);
- tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
- OPTAB_DIRECT);
- if (tmp[7] != target)
- emit_move_insn (target, tmp[7]);
-}
-
-/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
- pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
- This is done by doing just signed conversion if < 0x1p31, and otherwise by
- subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
-
-rtx
-ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
-{
- REAL_VALUE_TYPE TWO31r;
- rtx two31r, tmp[4];
- machine_mode mode = GET_MODE (val);
- machine_mode scalarmode = GET_MODE_INNER (mode);
- machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
- rtx (*cmp) (rtx, rtx, rtx, rtx);
- int i;
-
- for (i = 0; i < 3; i++)
- tmp[i] = gen_reg_rtx (mode);
- real_ldexp (&TWO31r, &dconst1, 31);
- two31r = const_double_from_real_value (TWO31r, scalarmode);
- two31r = ix86_build_const_vector (mode, 1, two31r);
- two31r = force_reg (mode, two31r);
- switch (mode)
- {
- case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
- case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
- case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
- case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
- default: gcc_unreachable ();
- }
- tmp[3] = gen_rtx_LE (mode, two31r, val);
- emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
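- /* TMP[0] is an all-ones mask in the elements where 2**31 <= VAL, so
- TMP[1] below holds 2**31 exactly in those elements and zero elsewhere. */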
- tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
- 0, OPTAB_DIRECT);
- if (intmode == V4SImode || TARGET_AVX2)
- *xorp = expand_simple_binop (intmode, ASHIFT,
- gen_lowpart (intmode, tmp[0]),
- GEN_INT (31), NULL_RTX, 0,
- OPTAB_DIRECT);
- else
- {
- rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
- two31 = ix86_build_const_vector (intmode, 1, two31);
- *xorp = expand_simple_binop (intmode, AND,
- gen_lowpart (intmode, tmp[0]),
- two31, NULL_RTX, 0,
- OPTAB_DIRECT);
- }
- return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
- 0, OPTAB_DIRECT);
-}
-
-/* A subroutine of ix86_build_signbit_mask. If VECT is true,
- then replicate the value for all elements of the vector
- register. */
-
-rtx
-ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
-{
- int i, n_elt;
- rtvec v;
- machine_mode scalar_mode;
-
- switch (mode)
- {
- case E_V64QImode:
- case E_V32QImode:
- case E_V16QImode:
- case E_V32HImode:
- case E_V16HImode:
- case E_V8HImode:
- case E_V16SImode:
- case E_V8SImode:
- case E_V4SImode:
- case E_V8DImode:
- case E_V4DImode:
- case E_V2DImode:
- gcc_assert (vect);
- /* FALLTHRU */
- case E_V16SFmode:
- case E_V8SFmode:
- case E_V4SFmode:
- case E_V8DFmode:
- case E_V4DFmode:
- case E_V2DFmode:
- n_elt = GET_MODE_NUNITS (mode);
- v = rtvec_alloc (n_elt);
- scalar_mode = GET_MODE_INNER (mode);
-
- RTVEC_ELT (v, 0) = value;
-
- for (i = 1; i < n_elt; ++i)
- RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
-
- return gen_rtx_CONST_VECTOR (mode, v);
-
- default:
- gcc_unreachable ();
- }
-}
-
-/* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
- and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
- for an SSE register. If VECT is true, then replicate the mask for
- all elements of the vector register. If INVERT is true, then create
- a mask excluding the sign bit. */
-
-rtx
-ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
-{
- machine_mode vec_mode, imode;
- wide_int w;
- rtx mask, v;
-
- switch (mode)
- {
- case E_V16SImode:
- case E_V16SFmode:
- case E_V8SImode:
- case E_V4SImode:
- case E_V8SFmode:
- case E_V4SFmode:
- vec_mode = mode;
- imode = SImode;
- break;
-
- case E_V8DImode:
- case E_V4DImode:
- case E_V2DImode:
- case E_V8DFmode:
- case E_V4DFmode:
- case E_V2DFmode:
- vec_mode = mode;
- imode = DImode;
- break;
-
- case E_TImode:
- case E_TFmode:
- vec_mode = VOIDmode;
- imode = TImode;
- break;
-
- default:
- gcc_unreachable ();
- }
-
- machine_mode inner_mode = GET_MODE_INNER (mode);
- w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
- GET_MODE_BITSIZE (inner_mode));
- if (invert)
- w = wi::bit_not (w);
-
- /* Force this value into the low part of a fp vector constant. */
- mask = immed_wide_int_const (w, imode);
- mask = gen_lowpart (inner_mode, mask);
-
- if (vec_mode == VOIDmode)
- return force_reg (inner_mode, mask);
-
- v = ix86_build_const_vector (vec_mode, vect, mask);
- return force_reg (vec_mode, v);
-}
-
-/* Generate code for floating point ABS or NEG. */
-
-void
-ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
- rtx operands[])
-{
- rtx mask, set, dst, src;
- bool use_sse = false;
- bool vector_mode = VECTOR_MODE_P (mode);
- machine_mode vmode = mode;
-
- if (vector_mode)
- use_sse = true;
- else if (mode == TFmode)
- use_sse = true;
- else if (TARGET_SSE_MATH)
- {
- use_sse = SSE_FLOAT_MODE_P (mode);
- if (mode == SFmode)
- vmode = V4SFmode;
- else if (mode == DFmode)
- vmode = V2DFmode;
- }
-
- /* NEG and ABS performed with SSE use bitwise mask operations.
- Create the appropriate mask now. */
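- /* Since INVERT is (code == ABS), ABS gets a mask with everything but
- the sign bit set (suitable for masking the sign away with AND), while
- NEG gets just the sign bit (suitable for flipping it with XOR). */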
- if (use_sse)
- mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
- else
- mask = NULL_RTX;
-
- dst = operands[0];
- src = operands[1];
-
- set = gen_rtx_fmt_e (code, mode, src);
- set = gen_rtx_SET (dst, set);
-
- if (mask)
- {
- rtx use, clob;
- rtvec par;
-
- use = gen_rtx_USE (VOIDmode, mask);
- if (vector_mode)
- par = gen_rtvec (2, set, use);
- else
- {
- clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
- par = gen_rtvec (3, set, use, clob);
- }
- emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
- }
- else
- emit_insn (set);
-}
-
-/* Expand a copysign operation. Special case operand 0 being a constant. */
-
-void
-ix86_expand_copysign (rtx operands[])
-{
- machine_mode mode, vmode;
- rtx dest, op0, op1, mask, nmask;
-
- dest = operands[0];
- op0 = operands[1];
- op1 = operands[2];
-
- mode = GET_MODE (dest);
-
- if (mode == SFmode)
- vmode = V4SFmode;
- else if (mode == DFmode)
- vmode = V2DFmode;
- else
- vmode = mode;
-
- if (CONST_DOUBLE_P (op0))
- {
- rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
-
- if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
- op0 = simplify_unary_operation (ABS, mode, op0, mode);
-
- if (mode == SFmode || mode == DFmode)
- {
- if (op0 == CONST0_RTX (mode))
- op0 = CONST0_RTX (vmode);
- else
- {
- rtx v = ix86_build_const_vector (vmode, false, op0);
-
- op0 = force_reg (vmode, v);
- }
- }
- else if (op0 != CONST0_RTX (mode))
- op0 = force_reg (mode, op0);
-
- mask = ix86_build_signbit_mask (vmode, 0, 0);
-
- if (mode == SFmode)
- copysign_insn = gen_copysignsf3_const;
- else if (mode == DFmode)
- copysign_insn = gen_copysigndf3_const;
- else
- copysign_insn = gen_copysigntf3_const;
-
- emit_insn (copysign_insn (dest, op0, op1, mask));
- }
- else
- {
- rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
-
- nmask = ix86_build_signbit_mask (vmode, 0, 1);
- mask = ix86_build_signbit_mask (vmode, 0, 0);
-
- if (mode == SFmode)
- copysign_insn = gen_copysignsf3_var;
- else if (mode == DFmode)
- copysign_insn = gen_copysigndf3_var;
- else
- copysign_insn = gen_copysigntf3_var;
-
- emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
- }
-}
-
-/* Deconstruct a copysign operation into bit masks. Operand 0 is known to
- be a constant, and so has already been expanded into a vector constant. */
-
-void
-ix86_split_copysign_const (rtx operands[])
-{
- machine_mode mode, vmode;
- rtx dest, op0, mask, x;
-
- dest = operands[0];
- op0 = operands[1];
- mask = operands[3];
-
- mode = GET_MODE (dest);
- vmode = GET_MODE (mask);
-
- dest = lowpart_subreg (vmode, dest, mode);
- x = gen_rtx_AND (vmode, dest, mask);
- emit_insn (gen_rtx_SET (dest, x));
-
- if (op0 != CONST0_RTX (vmode))
- {
- x = gen_rtx_IOR (vmode, dest, op0);
- emit_insn (gen_rtx_SET (dest, x));
- }
-}
-
-/* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
- so we have to do two masks. */
-
-void
-ix86_split_copysign_var (rtx operands[])
-{
- machine_mode mode, vmode;
- rtx dest, scratch, op0, op1, mask, nmask, x;
-
- dest = operands[0];
- scratch = operands[1];
- op0 = operands[2];
- op1 = operands[3];
- nmask = operands[4];
- mask = operands[5];
-
- mode = GET_MODE (dest);
- vmode = GET_MODE (mask);
-
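- /* The result is (OP0 & NMASK) | (OP1 & MASK), i.e. the magnitude of OP0
- combined with the sign of OP1; the cases below differ only in which
- input register overlaps DEST or SCRATCH. */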
- if (rtx_equal_p (op0, op1))
- {
- /* Shouldn't happen often (it's useless, obviously), but when it does
- we'd generate incorrect code if we continue below. */
- emit_move_insn (dest, op0);
- return;
- }
-
- if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
- {
- gcc_assert (REGNO (op1) == REGNO (scratch));
-
- x = gen_rtx_AND (vmode, scratch, mask);
- emit_insn (gen_rtx_SET (scratch, x));
-
- dest = mask;
- op0 = lowpart_subreg (vmode, op0, mode);
- x = gen_rtx_NOT (vmode, dest);
- x = gen_rtx_AND (vmode, x, op0);
- emit_insn (gen_rtx_SET (dest, x));
- }
- else
- {
- if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
- {
- x = gen_rtx_AND (vmode, scratch, mask);
- }
- else /* alternative 2,4 */
- {
- gcc_assert (REGNO (mask) == REGNO (scratch));
- op1 = lowpart_subreg (vmode, op1, mode);
- x = gen_rtx_AND (vmode, scratch, op1);
- }
- emit_insn (gen_rtx_SET (scratch, x));
-
- if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
- {
- dest = lowpart_subreg (vmode, op0, mode);
- x = gen_rtx_AND (vmode, dest, nmask);
- }
- else /* alternative 3,4 */
- {
- gcc_assert (REGNO (nmask) == REGNO (dest));
- dest = nmask;
- op0 = lowpart_subreg (vmode, op0, mode);
- x = gen_rtx_AND (vmode, dest, op0);
- }
- emit_insn (gen_rtx_SET (dest, x));
- }
-
- x = gen_rtx_IOR (vmode, dest, scratch);
- emit_insn (gen_rtx_SET (dest, x));
-}
-
-/* Expand an xorsign operation. */
-
-void
-ix86_expand_xorsign (rtx operands[])
-{
- rtx (*xorsign_insn)(rtx, rtx, rtx, rtx);
- machine_mode mode, vmode;
- rtx dest, op0, op1, mask;
-
- dest = operands[0];
- op0 = operands[1];
- op1 = operands[2];
-
- mode = GET_MODE (dest);
-
- if (mode == SFmode)
- {
- xorsign_insn = gen_xorsignsf3_1;
- vmode = V4SFmode;
- }
- else if (mode == DFmode)
- {
- xorsign_insn = gen_xorsigndf3_1;
- vmode = V2DFmode;
- }
- else
- gcc_unreachable ();
-
- mask = ix86_build_signbit_mask (vmode, 0, 0);
-
- emit_insn (xorsign_insn (dest, op0, op1, mask));
-}
-
-/* Deconstruct an xorsign operation into bit masks. */
-
-void
-ix86_split_xorsign (rtx operands[])
-{
- machine_mode mode, vmode;
- rtx dest, op0, mask, x;
-
- dest = operands[0];
- op0 = operands[1];
- mask = operands[3];
-
- mode = GET_MODE (dest);
- vmode = GET_MODE (mask);
-
- dest = lowpart_subreg (vmode, dest, mode);
- x = gen_rtx_AND (vmode, dest, mask);
- emit_insn (gen_rtx_SET (dest, x));
-
- op0 = lowpart_subreg (vmode, op0, mode);
- x = gen_rtx_XOR (vmode, dest, op0);
- emit_insn (gen_rtx_SET (dest, x));
-}
-
-/* Return TRUE or FALSE depending on whether the first SET in INSN
- has source and destination with matching CC modes, and that the
- CC mode is at least as constrained as REQ_MODE. */
-
-bool
-ix86_match_ccmode (rtx insn, machine_mode req_mode)
-{
- rtx set;
- machine_mode set_mode;
-
- set = PATTERN (insn);
- if (GET_CODE (set) == PARALLEL)
- set = XVECEXP (set, 0, 0);
- gcc_assert (GET_CODE (set) == SET);
- gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
-
- set_mode = GET_MODE (SET_DEST (set));
- switch (set_mode)
- {
- case E_CCNOmode:
- if (req_mode != CCNOmode
- && (req_mode != CCmode
- || XEXP (SET_SRC (set), 1) != const0_rtx))
- return false;
- break;
- case E_CCmode:
- if (req_mode == CCGCmode)
- return false;
- /* FALLTHRU */
- case E_CCGCmode:
- if (req_mode == CCGOCmode || req_mode == CCNOmode)
- return false;
- /* FALLTHRU */
- case E_CCGOCmode:
- if (req_mode == CCZmode)
- return false;
- /* FALLTHRU */
- case E_CCZmode:
- break;
-
- case E_CCGZmode:
-
- case E_CCAmode:
- case E_CCCmode:
- case E_CCOmode:
- case E_CCPmode:
- case E_CCSmode:
- if (set_mode != req_mode)
- return false;
- break;
-
- default:
- gcc_unreachable ();
- }
-
- return GET_MODE (SET_SRC (set)) == set_mode;
-}
-
-/* Generate insn patterns to do an integer compare of OPERANDS. */
-
-static rtx
-ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
-{
- machine_mode cmpmode;
- rtx tmp, flags;
-
- cmpmode = SELECT_CC_MODE (code, op0, op1);
- flags = gen_rtx_REG (cmpmode, FLAGS_REG);
-
- /* This is very simple, but making the interface the same as in the
- FP case makes the rest of the code easier. */
- tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
- emit_insn (gen_rtx_SET (flags, tmp));
-
- /* Return the test that should be put into the flags user, i.e.
- the bcc, scc, or cmov instruction. */
- return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
-}
-
-/* Figure out whether to use unordered fp comparisons. */
-
-static bool
-ix86_unordered_fp_compare (enum rtx_code code)
-{
- if (!TARGET_IEEE_FP)
- return false;
-
- switch (code)
- {
- case GT:
- case GE:
- case LT:
- case LE:
- return false;
-
- case EQ:
- case NE:
-
- case LTGT:
- case UNORDERED:
- case ORDERED:
- case UNLT:
- case UNLE:
- case UNGT:
- case UNGE:
- case UNEQ:
- return true;
-
- default:
- gcc_unreachable ();
- }
-}
-
-machine_mode
-ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
-{
- machine_mode mode = GET_MODE (op0);
-
- if (SCALAR_FLOAT_MODE_P (mode))
- {
- gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
- return CCFPmode;
- }
-
- switch (code)
- {
- /* Only zero flag is needed. */
- case EQ: /* ZF=0 */
- case NE: /* ZF!=0 */
- return CCZmode;
- /* Codes needing carry flag. */
- case GEU: /* CF=0 */
- case LTU: /* CF=1 */
- /* Detect overflow checks. They need just the carry flag. */
- if (GET_CODE (op0) == PLUS
- && (rtx_equal_p (op1, XEXP (op0, 0))
- || rtx_equal_p (op1, XEXP (op0, 1))))
- return CCCmode;
- else
- return CCmode;
- case GTU: /* CF=0 & ZF=0 */
- case LEU: /* CF=1 | ZF=1 */
- return CCmode;
- /* Codes possibly doable only with sign flag when
- comparing against zero. */
- case GE: /* SF=OF or SF=0 */
- case LT: /* SF<>OF or SF=1 */
- if (op1 == const0_rtx)
- return CCGOCmode;
- else
- /* For other cases Carry flag is not required. */
- return CCGCmode;
- /* Codes doable only with the sign flag when comparing
- against zero, but we miss the jump instruction for it,
- so we need to use relational tests against overflow,
- which thus needs to be zero. */
- case GT: /* ZF=0 & SF=OF */
- case LE: /* ZF=1 | SF<>OF */
- if (op1 == const0_rtx)
- return CCNOmode;
- else
- return CCGCmode;
- /* The strcmp pattern does (use flags) and combine may ask us for the
- proper mode. */
- case USE:
- return CCmode;
- default:
- gcc_unreachable ();
- }
-}
-
-/* Return the fixed registers used for condition codes. */
-
-static bool
-ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
-{
- *p1 = FLAGS_REG;
- *p2 = INVALID_REGNUM;
- return true;
-}
-
-/* If two condition code modes are compatible, return a condition code
- mode which is compatible with both. Otherwise, return
- VOIDmode. */
-
-static machine_mode
-ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
-{
- if (m1 == m2)
- return m1;
-
- if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
- return VOIDmode;
-
- if ((m1 == CCGCmode && m2 == CCGOCmode)
- || (m1 == CCGOCmode && m2 == CCGCmode))
- return CCGCmode;
-
- if ((m1 == CCNOmode && m2 == CCGOCmode)
- || (m1 == CCGOCmode && m2 == CCNOmode))
- return CCNOmode;
-
- if (m1 == CCZmode
- && (m2 == CCGCmode || m2 == CCGOCmode || m2 == CCNOmode))
- return m2;
- else if (m2 == CCZmode
- && (m1 == CCGCmode || m1 == CCGOCmode || m1 == CCNOmode))
- return m1;
-
- switch (m1)
- {
- default:
- gcc_unreachable ();
-
- case E_CCmode:
- case E_CCGCmode:
- case E_CCGOCmode:
- case E_CCNOmode:
- case E_CCAmode:
- case E_CCCmode:
- case E_CCOmode:
- case E_CCPmode:
- case E_CCSmode:
- case E_CCZmode:
- switch (m2)
- {
- default:
- return VOIDmode;
-
- case E_CCmode:
- case E_CCGCmode:
- case E_CCGOCmode:
- case E_CCNOmode:
- case E_CCAmode:
- case E_CCCmode:
- case E_CCOmode:
- case E_CCPmode:
- case E_CCSmode:
- case E_CCZmode:
- return CCmode;
- }
-
- case E_CCFPmode:
- /* These are only compatible with themselves, which we already
- checked above. */
- return VOIDmode;
- }
-}
-
-
- /* Return a comparison we can do that is equivalent to
- swap_condition (code), apart possibly from orderedness.
- Never change orderedness if TARGET_IEEE_FP, returning
- UNKNOWN in that case if necessary. */
-
-static enum rtx_code
-ix86_fp_swap_condition (enum rtx_code code)
-{
- switch (code)
- {
- case GT: /* GTU - CF=0 & ZF=0 */
- return TARGET_IEEE_FP ? UNKNOWN : UNLT;
- case GE: /* GEU - CF=0 */
- return TARGET_IEEE_FP ? UNKNOWN : UNLE;
- case UNLT: /* LTU - CF=1 */
- return TARGET_IEEE_FP ? UNKNOWN : GT;
- case UNLE: /* LEU - CF=1 | ZF=1 */
- return TARGET_IEEE_FP ? UNKNOWN : GE;
- default:
- return swap_condition (code);
- }
-}
-
- /* Return the cost of comparison CODE using the best strategy for
- performance. All the following functions use the number of
- instructions as a cost metric. In the future this should be tweaked
- to compute bytes for optimize_size and to take into account the
- performance of various instructions on various CPUs. */
-
-static int
-ix86_fp_comparison_cost (enum rtx_code code)
-{
- int arith_cost;
-
- /* The cost of code using bit-twiddling on %ah. */
- switch (code)
- {
- case UNLE:
- case UNLT:
- case LTGT:
- case GT:
- case GE:
- case UNORDERED:
- case ORDERED:
- case UNEQ:
- arith_cost = 4;
- break;
- case LT:
- case NE:
- case EQ:
- case UNGE:
- arith_cost = TARGET_IEEE_FP ? 5 : 4;
- break;
- case LE:
- case UNGT:
- arith_cost = TARGET_IEEE_FP ? 6 : 4;
- break;
- default:
- gcc_unreachable ();
- }
-
- switch (ix86_fp_comparison_strategy (code))
- {
- case IX86_FPCMP_COMI:
- return arith_cost > 4 ? 3 : 2;
- case IX86_FPCMP_SAHF:
- return arith_cost > 4 ? 4 : 3;
- default:
- return arith_cost;
- }
-}
-
- /* Return the strategy to use for floating-point comparisons. We assume
- that fcomi is always preferable where available, since that is also
- true when looking at size (2 bytes, vs. 3 for fnstsw+sahf and at
- least 5 for fnstsw+test). */
-
-enum ix86_fpcmp_strategy
-ix86_fp_comparison_strategy (enum rtx_code)
-{
- /* Do fcomi/sahf based test when profitable. */
-
- if (TARGET_CMOVE)
- return IX86_FPCMP_COMI;
-
- if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
- return IX86_FPCMP_SAHF;
-
- return IX86_FPCMP_ARITH;
-}
-
-/* Swap, force into registers, or otherwise massage the two operands
- to a fp comparison. The operands are updated in place; the new
- comparison code is returned. */
-
-static enum rtx_code
-ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
-{
- bool unordered_compare = ix86_unordered_fp_compare (code);
- rtx op0 = *pop0, op1 = *pop1;
- machine_mode op_mode = GET_MODE (op0);
- bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
-
- /* All of the unordered compare instructions only work on registers.
- The same is true of the fcomi compare instructions. The XFmode
- compare instructions require registers except when comparing
- against zero or when converting operand 1 from fixed point to
- floating point. */
-
- if (!is_sse
- && (unordered_compare
- || (op_mode == XFmode
- && ! (standard_80387_constant_p (op0) == 1
- || standard_80387_constant_p (op1) == 1)
- && GET_CODE (op1) != FLOAT)
- || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
- {
- op0 = force_reg (op_mode, op0);
- op1 = force_reg (op_mode, op1);
- }
- else
- {
- /* %%% We only allow op1 in memory; op0 must be st(0). So swap
- things around if they appear profitable, otherwise force op0
- into a register. */
-
- if (standard_80387_constant_p (op0) == 0
- || (MEM_P (op0)
- && ! (standard_80387_constant_p (op1) == 0
- || MEM_P (op1))))
- {
- enum rtx_code new_code = ix86_fp_swap_condition (code);
- if (new_code != UNKNOWN)
- {
- std::swap (op0, op1);
- code = new_code;
- }
- }
-
- if (!REG_P (op0))
- op0 = force_reg (op_mode, op0);
-
- if (CONSTANT_P (op1))
- {
- int tmp = standard_80387_constant_p (op1);
- if (tmp == 0)
- op1 = validize_mem (force_const_mem (op_mode, op1));
- else if (tmp == 1)
- {
- if (TARGET_CMOVE)
- op1 = force_reg (op_mode, op1);
- }
- else
- op1 = force_reg (op_mode, op1);
- }
- }
-
- /* Try to rearrange the comparison to make it cheaper. */
- if (ix86_fp_comparison_cost (code)
- > ix86_fp_comparison_cost (swap_condition (code))
- && (REG_P (op1) || can_create_pseudo_p ()))
- {
- std::swap (op0, op1);
- code = swap_condition (code);
- if (!REG_P (op0))
- op0 = force_reg (op_mode, op0);
- }
-
- *pop0 = op0;
- *pop1 = op1;
- return code;
-}
-
- /* Convert the comparison codes we use to represent FP comparisons into
- the integer code that will result in a proper branch. Return UNKNOWN
- if no such code is available. */
-
-enum rtx_code
-ix86_fp_compare_code_to_integer (enum rtx_code code)
-{
- switch (code)
- {
- case GT:
- return GTU;
- case GE:
- return GEU;
- case ORDERED:
- case UNORDERED:
- return code;
- case UNEQ:
- return EQ;
- case UNLT:
- return LTU;
- case UNLE:
- return LEU;
- case LTGT:
- return NE;
- default:
- return UNKNOWN;
- }
-}
-
-/* Generate insn patterns to do a floating point compare of OPERANDS. */
-
-static rtx
-ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
-{
- bool unordered_compare = ix86_unordered_fp_compare (code);
- machine_mode cmp_mode;
- rtx tmp, scratch;
-
- code = ix86_prepare_fp_compare_args (code, &op0, &op1);
-
- tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
- if (unordered_compare)
- tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
-
- /* Do fcomi/sahf based test when profitable. */
- switch (ix86_fp_comparison_strategy (code))
- {
- case IX86_FPCMP_COMI:
- cmp_mode = CCFPmode;
- emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
- break;
-
- case IX86_FPCMP_SAHF:
- cmp_mode = CCFPmode;
- tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
- scratch = gen_reg_rtx (HImode);
- emit_insn (gen_rtx_SET (scratch, tmp));
- emit_insn (gen_x86_sahf_1 (scratch));
- break;
-
- case IX86_FPCMP_ARITH:
- cmp_mode = CCNOmode;
- tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
- scratch = gen_reg_rtx (HImode);
- emit_insn (gen_rtx_SET (scratch, tmp));
-
- /* In the unordered case, we have to check C2 for NaN's, which
- doesn't happen to work out to anything nice combination-wise.
- So do some bit twiddling on the value we've got in AH to come
- up with an appropriate set of condition codes. */
-
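- /* After fnstsw, AH bit 0 holds C0, bit 2 holds C2 and bit 6 holds C3.
- fcom sets C0 for "less", C3 for "equal" and all three for "unordered",
- which is what the mask constants below test. */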
- switch (code)
- {
- case GT:
- case UNGT:
- if (code == GT || !TARGET_IEEE_FP)
- {
- emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
- code = EQ;
- }
- else
- {
- emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
- emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
- emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
- cmp_mode = CCmode;
- code = GEU;
- }
- break;
- case LT:
- case UNLT:
- if (code == LT && TARGET_IEEE_FP)
- {
- emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
- emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
- cmp_mode = CCmode;
- code = EQ;
- }
- else
- {
- emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
- code = NE;
- }
- break;
- case GE:
- case UNGE:
- if (code == GE || !TARGET_IEEE_FP)
- {
- emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
- code = EQ;
- }
- else
- {
- emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
- emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
- code = NE;
- }
- break;
- case LE:
- case UNLE:
- if (code == LE && TARGET_IEEE_FP)
- {
- emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
- emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
- emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
- cmp_mode = CCmode;
- code = LTU;
- }
- else
- {
- emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
- code = NE;
- }
- break;
- case EQ:
- case UNEQ:
- if (code == EQ && TARGET_IEEE_FP)
- {
- emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
- emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
- cmp_mode = CCmode;
- code = EQ;
- }
- else
- {
- emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
- code = NE;
- }
- break;
- case NE:
- case LTGT:
- if (code == NE && TARGET_IEEE_FP)
- {
- emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
- emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
- GEN_INT (0x40)));
- code = NE;
- }
- else
- {
- emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
- code = EQ;
- }
- break;
-
- case UNORDERED:
- emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
- code = NE;
- break;
- case ORDERED:
- emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
- code = EQ;
- break;
-
- default:
- gcc_unreachable ();
- }
- break;
-
- default:
- gcc_unreachable ();
- }
-
- /* Return the test that should be put into the flags user, i.e.
- the bcc, scc, or cmov instruction. */
- return gen_rtx_fmt_ee (code, VOIDmode,
- gen_rtx_REG (cmp_mode, FLAGS_REG),
- const0_rtx);
-}
-
-static rtx
-ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
-{
- rtx ret;
-
- if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
- ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
-
- else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
- {
- gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
- ret = ix86_expand_fp_compare (code, op0, op1);
- }
- else
- ret = ix86_expand_int_compare (code, op0, op1);
-
- return ret;
-}
-
-void
-ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
-{
- machine_mode mode = GET_MODE (op0);
- rtx tmp;
-
- /* Handle the special case of a vector comparison with a boolean result;
- transform it using the ptest instruction. */
- if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
- {
- rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
- machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
-
- gcc_assert (code == EQ || code == NE);
- /* Generate XOR since we can't check that one operand is a zero vector. */
- tmp = gen_reg_rtx (mode);
- emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
- tmp = gen_lowpart (p_mode, tmp);
- emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
- gen_rtx_UNSPEC (CCmode,
- gen_rtvec (2, tmp, tmp),
- UNSPEC_PTEST)));
- tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
- tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
- gen_rtx_LABEL_REF (VOIDmode, label),
- pc_rtx);
- emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
- return;
- }
-
- switch (mode)
- {
- case E_SFmode:
- case E_DFmode:
- case E_XFmode:
- case E_QImode:
- case E_HImode:
- case E_SImode:
- simple:
- tmp = ix86_expand_compare (code, op0, op1);
- tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
- gen_rtx_LABEL_REF (VOIDmode, label),
- pc_rtx);
- emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
- return;
-
- case E_DImode:
- if (TARGET_64BIT)
- goto simple;
- /* For a 32-bit target, a DImode comparison may be performed in
- SSE registers. To allow this we should avoid splitting to
- SImode, which is achieved by doing the xor in DImode
- and then comparing with zero (which is recognized by
- the STV pass). We don't compare using xor when optimizing
- for size. */
- if (!optimize_insn_for_size_p ()
- && TARGET_STV
- && (code == EQ || code == NE))
- {
- op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
- op1 = const0_rtx;
- }
- /* FALLTHRU */
- case E_TImode:
- /* Expand a double-word branch into multiple compare+branch. */
- {
- rtx lo[2], hi[2];
- rtx_code_label *label2;
- enum rtx_code code1, code2, code3;
- machine_mode submode;
-
- if (CONSTANT_P (op0) && !CONSTANT_P (op1))
- {
- std::swap (op0, op1);
- code = swap_condition (code);
- }
-
- split_double_mode (mode, &op0, 1, lo+0, hi+0);
- split_double_mode (mode, &op1, 1, lo+1, hi+1);
-
- submode = mode == DImode ? SImode : DImode;
-
- /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
- avoid two branches. This costs one extra insn, so disable when
- optimizing for size. */
-
- if ((code == EQ || code == NE)
- && (!optimize_insn_for_size_p ()
- || hi[1] == const0_rtx || lo[1] == const0_rtx))
- {
- rtx xor0, xor1;
-
- xor1 = hi[0];
- if (hi[1] != const0_rtx)
- xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
- NULL_RTX, 0, OPTAB_WIDEN);
-
- xor0 = lo[0];
- if (lo[1] != const0_rtx)
- xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
- NULL_RTX, 0, OPTAB_WIDEN);
-
- tmp = expand_binop (submode, ior_optab, xor1, xor0,
- NULL_RTX, 0, OPTAB_WIDEN);
-
- ix86_expand_branch (code, tmp, const0_rtx, label);
- return;
- }
-
- /* Otherwise, if we are doing less-than or greater-or-equal-than,
- op1 is a constant and the low word is zero, then we can just
- examine the high word. Similarly for low word -1 and
- less-or-equal-than or greater-than. */
-
- if (CONST_INT_P (hi[1]))
- switch (code)
- {
- case LT: case LTU: case GE: case GEU:
- if (lo[1] == const0_rtx)
- {
- ix86_expand_branch (code, hi[0], hi[1], label);
- return;
- }
- break;
- case LE: case LEU: case GT: case GTU:
- if (lo[1] == constm1_rtx)
- {
- ix86_expand_branch (code, hi[0], hi[1], label);
- return;
- }
- break;
- default:
- break;
- }
-
- /* Emulate comparisons that do not depend on Zero flag with
- double-word subtraction. Note that only Overflow, Sign
- and Carry flags are valid, so swap arguments and condition
- of comparisons that would otherwise test Zero flag. */
-
- switch (code)
- {
- case LE: case LEU: case GT: case GTU:
- std::swap (lo[0], lo[1]);
- std::swap (hi[0], hi[1]);
- code = swap_condition (code);
- /* FALLTHRU */
-
- case LT: case LTU: case GE: case GEU:
- {
- rtx (*cmp_insn) (rtx, rtx);
- rtx (*sbb_insn) (rtx, rtx, rtx);
- bool uns = (code == LTU || code == GEU);
-
- if (TARGET_64BIT)
- {
- cmp_insn = gen_cmpdi_1;
- sbb_insn
- = uns ? gen_subdi3_carry_ccc : gen_subdi3_carry_ccgz;
- }
- else
- {
- cmp_insn = gen_cmpsi_1;
- sbb_insn
- = uns ? gen_subsi3_carry_ccc : gen_subsi3_carry_ccgz;
- }
-
- if (!nonimmediate_operand (lo[0], submode))
- lo[0] = force_reg (submode, lo[0]);
- if (!x86_64_general_operand (lo[1], submode))
- lo[1] = force_reg (submode, lo[1]);
-
- if (!register_operand (hi[0], submode))
- hi[0] = force_reg (submode, hi[0]);
- if ((uns && !nonimmediate_operand (hi[1], submode))
- || (!uns && !x86_64_general_operand (hi[1], submode)))
- hi[1] = force_reg (submode, hi[1]);
-
- emit_insn (cmp_insn (lo[0], lo[1]));
- emit_insn (sbb_insn (gen_rtx_SCRATCH (submode), hi[0], hi[1]));
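- /* Only the carry output of the double-word subtraction is needed for
- the unsigned codes (CCCmode); the signed codes use the sign and
- overflow outputs (CCGZmode). */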
-
- tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
-
- ix86_expand_branch (code, tmp, const0_rtx, label);
- return;
- }
-
- default:
- break;
- }
-
- /* Otherwise, we need two or three jumps. */
-
- label2 = gen_label_rtx ();
-
- code1 = code;
- code2 = swap_condition (code);
- code3 = unsigned_condition (code);
-
- switch (code)
- {
- case LT: case GT: case LTU: case GTU:
- break;
-
- case LE: code1 = LT; code2 = GT; break;
- case GE: code1 = GT; code2 = LT; break;
- case LEU: code1 = LTU; code2 = GTU; break;
- case GEU: code1 = GTU; code2 = LTU; break;
-
- case EQ: code1 = UNKNOWN; code2 = NE; break;
- case NE: code2 = UNKNOWN; break;
-
- default:
- gcc_unreachable ();
- }
-
- /*
- * a < b =>
- * if (hi(a) < hi(b)) goto true;
- * if (hi(a) > hi(b)) goto false;
- * if (lo(a) < lo(b)) goto true;
- * false:
- */
-
- if (code1 != UNKNOWN)
- ix86_expand_branch (code1, hi[0], hi[1], label);
- if (code2 != UNKNOWN)
- ix86_expand_branch (code2, hi[0], hi[1], label2);
-
- ix86_expand_branch (code3, lo[0], lo[1], label);
-
- if (code2 != UNKNOWN)
- emit_label (label2);
- return;
- }
-
- default:
- gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
- goto simple;
- }
-}
-
-void
-ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
-{
- rtx ret;
-
- gcc_assert (GET_MODE (dest) == QImode);
-
- ret = ix86_expand_compare (code, op0, op1);
- PUT_MODE (ret, QImode);
- emit_insn (gen_rtx_SET (dest, ret));
-}
-
-/* Expand comparison setting or clearing carry flag. Return true when
- successful and set pop for the operation. */
-static bool
-ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
-{
- machine_mode mode
- = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
-
- /* Do not handle double-word compares, which go through a special path. */
- if (mode == (TARGET_64BIT ? TImode : DImode))
- return false;
-
- if (SCALAR_FLOAT_MODE_P (mode))
- {
- rtx compare_op;
- rtx_insn *compare_seq;
-
- gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
-
- /* Shortcut: the following common codes never translate
- into carry flag compares. */
- if (code == EQ || code == NE || code == UNEQ || code == LTGT
- || code == ORDERED || code == UNORDERED)
- return false;
-
- /* These comparisons require the zero flag; swap operands so they won't. */
- if ((code == GT || code == UNLE || code == LE || code == UNGT)
- && !TARGET_IEEE_FP)
- {
- std::swap (op0, op1);
- code = swap_condition (code);
- }
-
- /* Try to expand the comparison and verify that we end up with a
- carry-flag-based comparison. This fails to be true only when
- we decide to expand the comparison using arithmetic, which is
- not a common scenario. */
- start_sequence ();
- compare_op = ix86_expand_fp_compare (code, op0, op1);
- compare_seq = get_insns ();
- end_sequence ();
-
- if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
- code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
- else
- code = GET_CODE (compare_op);
-
- if (code != LTU && code != GEU)
- return false;
-
- emit_insn (compare_seq);
- *pop = compare_op;
- return true;
- }
-
- if (!INTEGRAL_MODE_P (mode))
- return false;
-
- switch (code)
- {
- case LTU:
- case GEU:
- break;
-
- /* Convert a==0 into (unsigned)a<1. */
- case EQ:
- case NE:
- if (op1 != const0_rtx)
- return false;
- op1 = const1_rtx;
- code = (code == EQ ? LTU : GEU);
- break;
-
- /* Convert a>b into b<a or a>=b+1. */
- case GTU:
- case LEU:
- if (CONST_INT_P (op1))
- {
- op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
- /* Bail out on overflow. We could still swap the operands, but that
- would force loading the constant into a register. */
- if (op1 == const0_rtx
- || !x86_64_immediate_operand (op1, GET_MODE (op1)))
- return false;
- code = (code == GTU ? GEU : LTU);
- }
- else
- {
- std::swap (op0, op1);
- code = (code == GTU ? LTU : GEU);
- }
- break;
-
- /* Convert a>=0 into (unsigned)a<0x80000000. */
- case LT:
- case GE:
- if (mode == DImode || op1 != const0_rtx)
- return false;
- op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
- code = (code == LT ? GEU : LTU);
- break;
- case LE:
- case GT:
- if (mode == DImode || op1 != constm1_rtx)
- return false;
- op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
- code = (code == LE ? GEU : LTU);
- break;
-
- default:
- return false;
- }
- /* Swapping operands may cause a constant to appear as the first operand. */
- if (!nonimmediate_operand (op0, VOIDmode))
- {
- if (!can_create_pseudo_p ())
- return false;
- op0 = force_reg (mode, op0);
- }
- *pop = ix86_expand_compare (code, op0, op1);
- gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
- return true;
-}
-
-bool
-ix86_expand_int_movcc (rtx operands[])
-{
- enum rtx_code code = GET_CODE (operands[1]), compare_code;
- rtx_insn *compare_seq;
- rtx compare_op;
- machine_mode mode = GET_MODE (operands[0]);
- bool sign_bit_compare_p = false;
- rtx op0 = XEXP (operands[1], 0);
- rtx op1 = XEXP (operands[1], 1);
-
- if (GET_MODE (op0) == TImode
- || (GET_MODE (op0) == DImode
- && !TARGET_64BIT))
- return false;
-
- start_sequence ();
- compare_op = ix86_expand_compare (code, op0, op1);
- compare_seq = get_insns ();
- end_sequence ();
-
- compare_code = GET_CODE (compare_op);
-
- if ((op1 == const0_rtx && (code == GE || code == LT))
- || (op1 == constm1_rtx && (code == GT || code == LE)))
- sign_bit_compare_p = true;
-
- /* Don't attempt mode expansion here -- if we had to expand 5 or 6
- HImode insns, we'd be swallowed in word prefix ops. */
-
- if ((mode != HImode || TARGET_FAST_PREFIX)
- && (mode != (TARGET_64BIT ? TImode : DImode))
- && CONST_INT_P (operands[2])
- && CONST_INT_P (operands[3]))
- {
- rtx out = operands[0];
- HOST_WIDE_INT ct = INTVAL (operands[2]);
- HOST_WIDE_INT cf = INTVAL (operands[3]);
- HOST_WIDE_INT diff;
-
- diff = ct - cf;
- /* Sign-bit compares are better done using shifts than by using
- sbb. */
- if (sign_bit_compare_p
- || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
- {
- /* Detect overlap between destination and compare sources. */
- rtx tmp = out;
-
- if (!sign_bit_compare_p)
- {
- rtx flags;
- bool fpcmp = false;
-
- compare_code = GET_CODE (compare_op);
-
- flags = XEXP (compare_op, 0);
-
- if (GET_MODE (flags) == CCFPmode)
- {
- fpcmp = true;
- compare_code
- = ix86_fp_compare_code_to_integer (compare_code);
- }
-
- /* To simplify rest of code, restrict to the GEU case. */
- if (compare_code == LTU)
- {
- std::swap (ct, cf);
- compare_code = reverse_condition (compare_code);
- code = reverse_condition (code);
- }
- else
- {
- if (fpcmp)
- PUT_CODE (compare_op,
- reverse_condition_maybe_unordered
- (GET_CODE (compare_op)));
- else
- PUT_CODE (compare_op,
- reverse_condition (GET_CODE (compare_op)));
- }
- diff = ct - cf;
-
- if (reg_overlap_mentioned_p (out, op0)
- || reg_overlap_mentioned_p (out, op1))
- tmp = gen_reg_rtx (mode);
-
- if (mode == DImode)
- emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
- else
- emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
- flags, compare_op));
- }
- else
- {
- if (code == GT || code == GE)
- code = reverse_condition (code);
- else
- {
- std::swap (ct, cf);
- diff = ct - cf;
- }
- tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
- }
-
- if (diff == 1)
- {
- /*
- * cmpl op0,op1
- * sbbl dest,dest
- * [addl dest, ct]
- *
- * Size 5 - 8.
- */
- if (ct)
- tmp = expand_simple_binop (mode, PLUS,
- tmp, GEN_INT (ct),
- copy_rtx (tmp), 1, OPTAB_DIRECT);
- }
- else if (cf == -1)
- {
- /*
- * cmpl op0,op1
- * sbbl dest,dest
- * orl $ct, dest
- *
- * Size 8.
- */
- tmp = expand_simple_binop (mode, IOR,
- tmp, GEN_INT (ct),
- copy_rtx (tmp), 1, OPTAB_DIRECT);
- }
- else if (diff == -1 && ct)
- {
- /*
- * cmpl op0,op1
- * sbbl dest,dest
- * notl dest
- * [addl dest, cf]
- *
- * Size 8 - 11.
- */
- tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
- if (cf)
- tmp = expand_simple_binop (mode, PLUS,
- copy_rtx (tmp), GEN_INT (cf),
- copy_rtx (tmp), 1, OPTAB_DIRECT);
- }
- else
- {
- /*
- * cmpl op0,op1
- * sbbl dest,dest
- * [notl dest]
- * andl cf - ct, dest
- * [addl dest, ct]
- *
- * Size 8 - 11.
- */
-
- if (cf == 0)
- {
- cf = ct;
- ct = 0;
- tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
- }
-
- tmp = expand_simple_binop (mode, AND,
- copy_rtx (tmp),
- gen_int_mode (cf - ct, mode),
- copy_rtx (tmp), 1, OPTAB_DIRECT);
- if (ct)
- tmp = expand_simple_binop (mode, PLUS,
- copy_rtx (tmp), GEN_INT (ct),
- copy_rtx (tmp), 1, OPTAB_DIRECT);
- }
-
- if (!rtx_equal_p (tmp, out))
- emit_move_insn (copy_rtx (out), copy_rtx (tmp));
-
- return true;
- }
-
- if (diff < 0)
- {
- machine_mode cmp_mode = GET_MODE (op0);
- enum rtx_code new_code;
-
- if (SCALAR_FLOAT_MODE_P (cmp_mode))
- {
- gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
-
- /* We may be reversing an unordered compare to a normal compare, which
- is not valid in general (we may convert a non-trapping condition
- to a trapping one); however, on i386 we currently emit all
- comparisons unordered. */
- new_code = reverse_condition_maybe_unordered (code);
- }
- else
- new_code = ix86_reverse_condition (code, cmp_mode);
- if (new_code != UNKNOWN)
- {
- std::swap (ct, cf);
- diff = -diff;
- code = new_code;
- }
- }
-
- compare_code = UNKNOWN;
- if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
- && CONST_INT_P (op1))
- {
- if (op1 == const0_rtx
- && (code == LT || code == GE))
- compare_code = code;
- else if (op1 == constm1_rtx)
- {
- if (code == LE)
- compare_code = LT;
- else if (code == GT)
- compare_code = GE;
- }
- }
-
- /* Optimize dest = (op0 < 0) ? -1 : cf. */
- if (compare_code != UNKNOWN
- && GET_MODE (op0) == GET_MODE (out)
- && (cf == -1 || ct == -1))
- {
- /* If lea code below could be used, only optimize
- if it results in a 2 insn sequence. */
-
- if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
- || diff == 3 || diff == 5 || diff == 9)
- || (compare_code == LT && ct == -1)
- || (compare_code == GE && cf == -1))
- {
- /*
- * notl op1 (if necessary)
- * sarl $31, op1
- * orl cf, op1
- */
- if (ct != -1)
- {
- cf = ct;
- ct = -1;
- code = reverse_condition (code);
- }
-
- out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
-
- out = expand_simple_binop (mode, IOR,
- out, GEN_INT (cf),
- out, 1, OPTAB_DIRECT);
- if (out != operands[0])
- emit_move_insn (operands[0], out);
-
- return true;
- }
- }
-
-
- if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
- || diff == 3 || diff == 5 || diff == 9)
- && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
- && (mode != DImode
- || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
- {
- /*
- * xorl dest,dest
- * cmpl op1,op2
- * setcc dest
- * lea cf(dest*(ct-cf)),dest
- *
- * Size 14.
- *
- * This also catches the degenerate setcc-only case.
- */
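- /* For example, with ct = 14 and cf = 10 (diff == 4) the setcc
- leaves 0 or 1 in dest and the lea computes 10 + dest*4,
- i.e. cf or ct, without a branch. */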
-
- rtx tmp;
- int nops;
-
- out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
-
- nops = 0;
- /* On x86_64 the lea instruction operates on Pmode, so we need
- to get the arithmetic done in the proper mode to match. */
- if (diff == 1)
- tmp = copy_rtx (out);
- else
- {
- rtx out1;
- out1 = copy_rtx (out);
- tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
- nops++;
- if (diff & 1)
- {
- tmp = gen_rtx_PLUS (mode, tmp, out1);
- nops++;
- }
- }
- if (cf != 0)
- {
- tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
- nops++;
- }
- if (!rtx_equal_p (tmp, out))
- {
- if (nops == 1)
- out = force_operand (tmp, copy_rtx (out));
- else
- emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
- }
- if (!rtx_equal_p (out, operands[0]))
- emit_move_insn (operands[0], copy_rtx (out));
-
- return true;
- }
-
- /*
- * General case: Jumpful:
- * xorl dest,dest cmpl op1, op2
- * cmpl op1, op2 movl ct, dest
- * setcc dest jcc 1f
- * decl dest movl cf, dest
- * andl (cf-ct),dest 1:
- * addl ct,dest
- *
- * Size 20. Size 14.
- *
- * This is reasonably steep, but branch mispredict costs are
- * high on modern cpus, so consider failing only if optimizing
- * for space.
- */
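- /* Worked through, the branchless sequence computes
- dest = ((cond ? 0 : -1) & (cf - ct)) + ct, which is ct when the
- condition holds and cf otherwise. */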
-
- if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
- && BRANCH_COST (optimize_insn_for_speed_p (),
- false) >= 2)
- {
- if (cf == 0)
- {
- machine_mode cmp_mode = GET_MODE (op0);
- enum rtx_code new_code;
-
- if (SCALAR_FLOAT_MODE_P (cmp_mode))
- {
- gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
-
- /* We may be reversing an unordered compare to a normal compare,
- which is not valid in general (we may convert a non-trapping
- condition into a trapping one); however, on i386 we currently
- emit all comparisons unordered. */
- new_code = reverse_condition_maybe_unordered (code);
- }
- else
- {
- new_code = ix86_reverse_condition (code, cmp_mode);
- if (compare_code != UNKNOWN && new_code != UNKNOWN)
- compare_code = reverse_condition (compare_code);
- }
-
- if (new_code != UNKNOWN)
- {
- cf = ct;
- ct = 0;
- code = new_code;
- }
- }
-
- if (compare_code != UNKNOWN)
- {
- /* notl op1 (if needed)
- sarl $31, op1
- andl (cf-ct), op1
- addl ct, op1
-
- For x < 0 (resp. x <= -1) there will be no notl,
- so if possible swap the constants to get rid of the
- complement.
- True/false will be -1/0 while code below (store flag
- followed by decrement) is 0/-1, so the constants need
- to be exchanged once more. */
-
- if (compare_code == GE || !cf)
- {
- code = reverse_condition (code);
- compare_code = LT;
- }
- else
- std::swap (ct, cf);
-
- out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
- }
- else
- {
- out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
-
- out = expand_simple_binop (mode, PLUS, copy_rtx (out),
- constm1_rtx,
- copy_rtx (out), 1, OPTAB_DIRECT);
- }
-
- out = expand_simple_binop (mode, AND, copy_rtx (out),
- gen_int_mode (cf - ct, mode),
- copy_rtx (out), 1, OPTAB_DIRECT);
- if (ct)
- out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
- copy_rtx (out), 1, OPTAB_DIRECT);
- if (!rtx_equal_p (out, operands[0]))
- emit_move_insn (operands[0], copy_rtx (out));
-
- return true;
- }
- }
-
- if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
- {
- /* Try a few things more with specific constants and a variable. */
-
- optab op;
- rtx var, orig_out, out, tmp;
-
- if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
- return false;
-
- /* If one of the two operands is an interesting constant, load a
- constant with the above and mask it in with a logical operation. */
-
- if (CONST_INT_P (operands[2]))
- {
- var = operands[3];
- if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
- operands[3] = constm1_rtx, op = and_optab;
- else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
- operands[3] = const0_rtx, op = ior_optab;
- else
- return false;
- }
- else if (CONST_INT_P (operands[3]))
- {
- var = operands[2];
- if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
- operands[2] = constm1_rtx, op = and_optab;
- else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
- operands[2] = const0_rtx, op = ior_optab;
- else
- return false;
- }
- else
- return false;
-
- orig_out = operands[0];
- tmp = gen_reg_rtx (mode);
- operands[0] = tmp;
-
- /* Recurse to get the constant loaded. */
- if (!ix86_expand_int_movcc (operands))
- return false;
-
- /* Mask in the interesting variable. */
- out = expand_binop (mode, op, var, tmp, orig_out, 0,
- OPTAB_WIDEN);
- if (!rtx_equal_p (out, orig_out))
- emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
-
- return true;
- }
-
- /*
- * For comparison with above,
- *
- * movl cf,dest
- * movl ct,tmp
- * cmpl op1,op2
- * cmovcc tmp,dest
- *
- * Size 15.
- */
-
- if (! nonimmediate_operand (operands[2], mode))
- operands[2] = force_reg (mode, operands[2]);
- if (! nonimmediate_operand (operands[3], mode))
- operands[3] = force_reg (mode, operands[3]);
-
- if (! register_operand (operands[2], VOIDmode)
- && (mode == QImode
- || ! register_operand (operands[3], VOIDmode)))
- operands[2] = force_reg (mode, operands[2]);
-
- if (mode == QImode
- && ! register_operand (operands[3], VOIDmode))
- operands[3] = force_reg (mode, operands[3]);
-
- emit_insn (compare_seq);
- emit_insn (gen_rtx_SET (operands[0],
- gen_rtx_IF_THEN_ELSE (mode,
- compare_op, operands[2],
- operands[3])));
- return true;
-}
-
-/* Swap, force into registers, or otherwise massage the two operands
- to an sse comparison with a mask result. Thus we differ a bit from
- ix86_prepare_fp_compare_args which expects to produce a flags result.
-
- The DEST operand exists to help determine whether to commute commutative
- operators. The POP0/POP1 operands are updated in place. The new
- comparison code is returned, or UNKNOWN if not implementable. */
-
-static enum rtx_code
-ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
- rtx *pop0, rtx *pop1)
-{
- switch (code)
- {
- case LTGT:
- case UNEQ:
- /* AVX supports all the needed comparisons. */
- if (TARGET_AVX)
- break;
- /* We have no LTGT as an operator. We could implement it with
- NE & ORDERED, but this requires an extra temporary. It's
- not clear that it's worth it. */
- return UNKNOWN;
-
- case LT:
- case LE:
- case UNGT:
- case UNGE:
- /* These are supported directly. */
- break;
-
- case EQ:
- case NE:
- case UNORDERED:
- case ORDERED:
- /* AVX has 3 operand comparisons, no need to swap anything. */
- if (TARGET_AVX)
- break;
- /* For commutative operators, try to canonicalize the destination
- operand to be first in the comparison - this helps reload to
- avoid extra moves. */
- if (!dest || !rtx_equal_p (dest, *pop1))
- break;
- /* FALLTHRU */
-
- case GE:
- case GT:
- case UNLE:
- case UNLT:
- /* These are not supported directly before AVX, and furthermore
- ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
- comparison operands to transform into something that is
- supported. */
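- /* E.g. a GT b becomes b LT a, for which the pre-AVX
- cmplt{ss,ps,sd,pd} forms exist. */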
- std::swap (*pop0, *pop1);
- code = swap_condition (code);
- break;
-
- default:
- gcc_unreachable ();
- }
-
- return code;
-}
-
-/* Detect conditional moves that exactly match min/max operational
- semantics. Note that this is IEEE safe, as long as we don't
- interchange the operands.
-
- Returns FALSE if this conditional move doesn't match a MIN/MAX,
- and TRUE if the operation is successful and instructions are emitted. */
-
-static bool
-ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
- rtx cmp_op1, rtx if_true, rtx if_false)
-{
- machine_mode mode;
- bool is_min;
- rtx tmp;
-
- if (code == LT)
- ;
- else if (code == UNGE)
- std::swap (if_true, if_false);
- else
- return false;
-
- if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
- is_min = true;
- else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
- is_min = false;
- else
- return false;
-
- mode = GET_MODE (dest);
-
- /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
- but MODE may be a vector mode and thus not appropriate. */
- if (!flag_finite_math_only || flag_signed_zeros)
- {
- int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
- rtvec v;
-
- if_true = force_reg (mode, if_true);
- v = gen_rtvec (2, if_true, if_false);
- tmp = gen_rtx_UNSPEC (mode, v, u);
- }
- else
- {
- code = is_min ? SMIN : SMAX;
- if (MEM_P (if_true) && MEM_P (if_false))
- if_true = force_reg (mode, if_true);
- tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
- }
-
- emit_insn (gen_rtx_SET (dest, tmp));
- return true;
-}
-
-/* Expand an SSE comparison. Return the register with the result. */
-
-static rtx
-ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
- rtx op_true, rtx op_false)
-{
- machine_mode mode = GET_MODE (dest);
- machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
-
- /* In the general case the result of the comparison can differ from the
- operands' type. */
- machine_mode cmp_mode;
-
- /* In AVX512F the result of comparison is an integer mask. */
- bool maskcmp = false;
- rtx x;
-
- if (GET_MODE_SIZE (cmp_ops_mode) == 64)
- {
- unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
- cmp_mode = int_mode_for_size (nbits, 0).require ();
- maskcmp = true;
- }
- else
- cmp_mode = cmp_ops_mode;
-
- cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
-
- int (*op1_predicate)(rtx, machine_mode)
- = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
-
- if (!op1_predicate (cmp_op1, cmp_ops_mode))
- cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
-
- if (optimize
- || (maskcmp && cmp_mode != mode)
- || (op_true && reg_overlap_mentioned_p (dest, op_true))
- || (op_false && reg_overlap_mentioned_p (dest, op_false)))
- dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
-
- /* Compare patterns for int modes are unspec in AVX512F only. */
- if (maskcmp && (code == GT || code == EQ))
- {
- rtx (*gen)(rtx, rtx, rtx);
-
- switch (cmp_ops_mode)
- {
- case E_V64QImode:
- gcc_assert (TARGET_AVX512BW);
- gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
- break;
- case E_V32HImode:
- gcc_assert (TARGET_AVX512BW);
- gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
- break;
- case E_V16SImode:
- gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
- break;
- case E_V8DImode:
- gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
- break;
- default:
- gen = NULL;
- }
-
- if (gen)
- {
- emit_insn (gen (dest, cmp_op0, cmp_op1));
- return dest;
- }
- }
- x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
-
- if (cmp_mode != mode && !maskcmp)
- {
- x = force_reg (cmp_ops_mode, x);
- convert_move (dest, x, false);
- }
- else
- emit_insn (gen_rtx_SET (dest, x));
-
- return dest;
-}
-
-/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
- operations. This is used for both scalar and vector conditional moves. */
-
-void
-ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
-{
- machine_mode mode = GET_MODE (dest);
- machine_mode cmpmode = GET_MODE (cmp);
-
- /* In AVX512F the result of comparison is an integer mask. */
- bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
-
- rtx t2, t3, x;
-
- /* If we have an integer mask and FP value then we need
- to cast mask to FP mode. */
- if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
- {
- cmp = force_reg (cmpmode, cmp);
- cmp = gen_rtx_SUBREG (mode, cmp, 0);
- }
-
- if (maskcmp)
- {
- rtx (*gen) (rtx, rtx) = NULL;
- if ((op_true == CONST0_RTX (mode)
- && vector_all_ones_operand (op_false, mode))
- || (op_false == CONST0_RTX (mode)
- && vector_all_ones_operand (op_true, mode)))
- switch (mode)
- {
- case E_V64QImode:
- if (TARGET_AVX512BW)
- gen = gen_avx512bw_cvtmask2bv64qi;
- break;
- case E_V32QImode:
- if (TARGET_AVX512VL && TARGET_AVX512BW)
- gen = gen_avx512vl_cvtmask2bv32qi;
- break;
- case E_V16QImode:
- if (TARGET_AVX512VL && TARGET_AVX512BW)
- gen = gen_avx512vl_cvtmask2bv16qi;
- break;
- case E_V32HImode:
- if (TARGET_AVX512BW)
- gen = gen_avx512bw_cvtmask2wv32hi;
- break;
- case E_V16HImode:
- if (TARGET_AVX512VL && TARGET_AVX512BW)
- gen = gen_avx512vl_cvtmask2wv16hi;
- break;
- case E_V8HImode:
- if (TARGET_AVX512VL && TARGET_AVX512BW)
- gen = gen_avx512vl_cvtmask2wv8hi;
- break;
- case E_V16SImode:
- if (TARGET_AVX512DQ)
- gen = gen_avx512f_cvtmask2dv16si;
- break;
- case E_V8SImode:
- if (TARGET_AVX512VL && TARGET_AVX512DQ)
- gen = gen_avx512vl_cvtmask2dv8si;
- break;
- case E_V4SImode:
- if (TARGET_AVX512VL && TARGET_AVX512DQ)
- gen = gen_avx512vl_cvtmask2dv4si;
- break;
- case E_V8DImode:
- if (TARGET_AVX512DQ)
- gen = gen_avx512f_cvtmask2qv8di;
- break;
- case E_V4DImode:
- if (TARGET_AVX512VL && TARGET_AVX512DQ)
- gen = gen_avx512vl_cvtmask2qv4di;
- break;
- case E_V2DImode:
- if (TARGET_AVX512VL && TARGET_AVX512DQ)
- gen = gen_avx512vl_cvtmask2qv2di;
- break;
- default:
- break;
- }
- if (gen && SCALAR_INT_MODE_P (cmpmode))
- {
- cmp = force_reg (cmpmode, cmp);
- if (op_true == CONST0_RTX (mode))
- {
- rtx (*gen_not) (rtx, rtx);
- switch (cmpmode)
- {
- case E_QImode: gen_not = gen_knotqi; break;
- case E_HImode: gen_not = gen_knothi; break;
- case E_SImode: gen_not = gen_knotsi; break;
- case E_DImode: gen_not = gen_knotdi; break;
- default: gcc_unreachable ();
- }
- rtx n = gen_reg_rtx (cmpmode);
- emit_insn (gen_not (n, cmp));
- cmp = n;
- }
- emit_insn (gen (dest, cmp));
- return;
- }
- }
- else if (vector_all_ones_operand (op_true, mode)
- && op_false == CONST0_RTX (mode))
- {
- emit_insn (gen_rtx_SET (dest, cmp));
- return;
- }
- else if (op_false == CONST0_RTX (mode))
- {
- op_true = force_reg (mode, op_true);
- x = gen_rtx_AND (mode, cmp, op_true);
- emit_insn (gen_rtx_SET (dest, x));
- return;
- }
- else if (op_true == CONST0_RTX (mode))
- {
- op_false = force_reg (mode, op_false);
- x = gen_rtx_NOT (mode, cmp);
- x = gen_rtx_AND (mode, x, op_false);
- emit_insn (gen_rtx_SET (dest, x));
- return;
- }
- else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
- {
- op_false = force_reg (mode, op_false);
- x = gen_rtx_IOR (mode, cmp, op_false);
- emit_insn (gen_rtx_SET (dest, x));
- return;
- }
- else if (TARGET_XOP)
- {
- op_true = force_reg (mode, op_true);
-
- if (!nonimmediate_operand (op_false, mode))
- op_false = force_reg (mode, op_false);
-
- emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
- op_true,
- op_false)));
- return;
- }
-
- rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
- rtx d = dest;
-
- if (!vector_operand (op_true, mode))
- op_true = force_reg (mode, op_true);
-
- op_false = force_reg (mode, op_false);
-
- switch (mode)
- {
- case E_V4SFmode:
- if (TARGET_SSE4_1)
- gen = gen_sse4_1_blendvps;
- break;
- case E_V2DFmode:
- if (TARGET_SSE4_1)
- gen = gen_sse4_1_blendvpd;
- break;
- case E_SFmode:
- if (TARGET_SSE4_1)
- {
- gen = gen_sse4_1_blendvss;
- op_true = force_reg (mode, op_true);
- }
- break;
- case E_DFmode:
- if (TARGET_SSE4_1)
- {
- gen = gen_sse4_1_blendvsd;
- op_true = force_reg (mode, op_true);
- }
- break;
- case E_V16QImode:
- case E_V8HImode:
- case E_V4SImode:
- case E_V2DImode:
- if (TARGET_SSE4_1)
- {
- gen = gen_sse4_1_pblendvb;
- if (mode != V16QImode)
- d = gen_reg_rtx (V16QImode);
- op_false = gen_lowpart (V16QImode, op_false);
- op_true = gen_lowpart (V16QImode, op_true);
- cmp = gen_lowpart (V16QImode, cmp);
- }
- break;
- case E_V8SFmode:
- if (TARGET_AVX)
- gen = gen_avx_blendvps256;
- break;
- case E_V4DFmode:
- if (TARGET_AVX)
- gen = gen_avx_blendvpd256;
- break;
- case E_V32QImode:
- case E_V16HImode:
- case E_V8SImode:
- case E_V4DImode:
- if (TARGET_AVX2)
- {
- gen = gen_avx2_pblendvb;
- if (mode != V32QImode)
- d = gen_reg_rtx (V32QImode);
- op_false = gen_lowpart (V32QImode, op_false);
- op_true = gen_lowpart (V32QImode, op_true);
- cmp = gen_lowpart (V32QImode, cmp);
- }
- break;
-
- case E_V64QImode:
- gen = gen_avx512bw_blendmv64qi;
- break;
- case E_V32HImode:
- gen = gen_avx512bw_blendmv32hi;
- break;
- case E_V16SImode:
- gen = gen_avx512f_blendmv16si;
- break;
- case E_V8DImode:
- gen = gen_avx512f_blendmv8di;
- break;
- case E_V8DFmode:
- gen = gen_avx512f_blendmv8df;
- break;
- case E_V16SFmode:
- gen = gen_avx512f_blendmv16sf;
- break;
-
- default:
- break;
- }
-
- if (gen != NULL)
- {
- emit_insn (gen (d, op_false, op_true, cmp));
- if (d != dest)
- emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
- }
- else
- {
- op_true = force_reg (mode, op_true);
-
- t2 = gen_reg_rtx (mode);
- if (optimize)
- t3 = gen_reg_rtx (mode);
- else
- t3 = dest;
-
- x = gen_rtx_AND (mode, op_true, cmp);
- emit_insn (gen_rtx_SET (t2, x));
-
- x = gen_rtx_NOT (mode, cmp);
- x = gen_rtx_AND (mode, x, op_false);
- emit_insn (gen_rtx_SET (t3, x));
-
- x = gen_rtx_IOR (mode, t3, t2);
- emit_insn (gen_rtx_SET (dest, x));
- }
-}
-
-/* Expand a floating-point conditional move. Return true if successful. */
-
-bool
-ix86_expand_fp_movcc (rtx operands[])
-{
- machine_mode mode = GET_MODE (operands[0]);
- enum rtx_code code = GET_CODE (operands[1]);
- rtx tmp, compare_op;
- rtx op0 = XEXP (operands[1], 0);
- rtx op1 = XEXP (operands[1], 1);
-
- if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
- {
- machine_mode cmode;
-
- /* Since we have no cmove for SSE registers, don't force bad register
- allocation just to gain access to it. Deny movcc when the
- comparison mode doesn't match the move mode. */
- cmode = GET_MODE (op0);
- if (cmode == VOIDmode)
- cmode = GET_MODE (op1);
- if (cmode != mode)
- return false;
-
- code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
- if (code == UNKNOWN)
- return false;
-
- if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
- operands[2], operands[3]))
- return true;
-
- tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
- operands[2], operands[3]);
- ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
- return true;
- }
-
- if (GET_MODE (op0) == TImode
- || (GET_MODE (op0) == DImode
- && !TARGET_64BIT))
- return false;
-
- /* The floating point conditional move instructions don't directly
- support conditions resulting from a signed integer comparison. */
-
- compare_op = ix86_expand_compare (code, op0, op1);
- if (!fcmov_comparison_operator (compare_op, VOIDmode))
- {
- tmp = gen_reg_rtx (QImode);
- ix86_expand_setcc (tmp, code, op0, op1);
-
- compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
- }
-
- emit_insn (gen_rtx_SET (operands[0],
- gen_rtx_IF_THEN_ELSE (mode, compare_op,
- operands[2], operands[3])));
-
- return true;
-}
-
-/* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
-
-static int
-ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
-{
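- /* These values match the VPCMP{B,W,D,Q}[U] comparison-predicate
- immediates (_MM_CMPINT_EQ == 0, _MM_CMPINT_LT == 1,
- _MM_CMPINT_LE == 2, _MM_CMPINT_NE == 4, _MM_CMPINT_NLT == 5,
- _MM_CMPINT_NLE == 6). */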
- switch (code)
- {
- case EQ:
- return 0;
- case LT:
- case LTU:
- return 1;
- case LE:
- case LEU:
- return 2;
- case NE:
- return 4;
- case GE:
- case GEU:
- return 5;
- case GT:
- case GTU:
- return 6;
- default:
- gcc_unreachable ();
- }
-}
-
-/* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
-
-static int
-ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
-{
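- /* These are the VCMPP{S,D} predicate immediates, e.g. 0x01 is LT_OS,
- 0x0e is GT_OS, 0x03 is UNORD_Q and 0x18 is EQ_US. */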
- switch (code)
- {
- case EQ:
- return 0x00;
- case NE:
- return 0x04;
- case GT:
- return 0x0e;
- case LE:
- return 0x02;
- case GE:
- return 0x0d;
- case LT:
- return 0x01;
- case UNLE:
- return 0x0a;
- case UNLT:
- return 0x09;
- case UNGE:
- return 0x05;
- case UNGT:
- return 0x06;
- case UNEQ:
- return 0x18;
- case LTGT:
- return 0x0c;
- case ORDERED:
- return 0x07;
- case UNORDERED:
- return 0x03;
- default:
- gcc_unreachable ();
- }
-}
-
-/* Return immediate value to be used in UNSPEC_PCMP
- for comparison CODE in MODE. */
-
-static int
-ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
-{
- if (FLOAT_MODE_P (mode))
- return ix86_fp_cmp_code_to_pcmp_immediate (code);
- return ix86_int_cmp_code_to_pcmp_immediate (code);
-}
-
-/* Expand AVX-512 vector comparison. */
-
-bool
-ix86_expand_mask_vec_cmp (rtx operands[])
-{
- machine_mode mask_mode = GET_MODE (operands[0]);
- machine_mode cmp_mode = GET_MODE (operands[2]);
- enum rtx_code code = GET_CODE (operands[1]);
- rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
- int unspec_code;
- rtx unspec;
-
- switch (code)
- {
- case LEU:
- case GTU:
- case GEU:
- case LTU:
- unspec_code = UNSPEC_UNSIGNED_PCMP;
- break;
-
- default:
- unspec_code = UNSPEC_PCMP;
- }
-
- unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
- operands[3], imm),
- unspec_code);
- emit_insn (gen_rtx_SET (operands[0], unspec));
-
- return true;
-}
-
-/* Expand fp vector comparison. */
-
-bool
-ix86_expand_fp_vec_cmp (rtx operands[])
-{
- enum rtx_code code = GET_CODE (operands[1]);
- rtx cmp;
-
- code = ix86_prepare_sse_fp_compare_args (operands[0], code,
- &operands[2], &operands[3]);
- if (code == UNKNOWN)
- {
- rtx temp;
- switch (GET_CODE (operands[1]))
- {
- case LTGT:
- temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
- operands[3], NULL, NULL);
- cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
- operands[3], NULL, NULL);
- code = AND;
- break;
- case UNEQ:
- temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
- operands[3], NULL, NULL);
- cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
- operands[3], NULL, NULL);
- code = IOR;
- break;
- default:
- gcc_unreachable ();
- }
- cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
- OPTAB_DIRECT);
- }
- else
- cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
- operands[1], operands[2]);
-
- if (operands[0] != cmp)
- emit_move_insn (operands[0], cmp);
-
- return true;
-}
-
-static rtx
-ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
- rtx op_true, rtx op_false, bool *negate)
-{
- machine_mode data_mode = GET_MODE (dest);
- machine_mode mode = GET_MODE (cop0);
- rtx x;
-
- *negate = false;
-
- /* XOP supports all of the comparisons on all 128-bit vector int types. */
- if (TARGET_XOP
- && (mode == V16QImode || mode == V8HImode
- || mode == V4SImode || mode == V2DImode))
- ;
- else
- {
- /* Canonicalize the comparison to EQ, GT, GTU. */
- switch (code)
- {
- case EQ:
- case GT:
- case GTU:
- break;
-
- case NE:
- case LE:
- case LEU:
- code = reverse_condition (code);
- *negate = true;
- break;
-
- case GE:
- case GEU:
- code = reverse_condition (code);
- *negate = true;
- /* FALLTHRU */
-
- case LT:
- case LTU:
- std::swap (cop0, cop1);
- code = swap_condition (code);
- break;
-
- default:
- gcc_unreachable ();
- }
-
- /* Only SSE4.1/SSE4.2 supports V2DImode. */
- if (mode == V2DImode)
- {
- switch (code)
- {
- case EQ:
- /* SSE4.1 supports EQ. */
- if (!TARGET_SSE4_1)
- return NULL;
- break;
-
- case GT:
- case GTU:
- /* SSE4.2 supports GT/GTU. */
- if (!TARGET_SSE4_2)
- return NULL;
- break;
-
- default:
- gcc_unreachable ();
- }
- }
-
- rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
- rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
- if (*negate)
- std::swap (optrue, opfalse);
-
- /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
- not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
- min (x, y) == x). While we add one instruction (the minimum),
- we remove the need for two instructions in the negation, as the
- result is already produced in the desired form.
- When using masks, do it for SI/DImode element types, as it is shorter
- than the two subtractions. */
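- /* E.g. for unsigned V16QImode, x <= y is computed as
- umin (x, y) == x with pminub followed by pcmpeqb, instead of an
- unsigned compare plus a negation. */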
- if ((code != EQ
- && GET_MODE_SIZE (mode) != 64
- && vector_all_ones_operand (opfalse, data_mode)
- && optrue == CONST0_RTX (data_mode))
- || (code == GTU
- && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
- /* Don't do it if not using integer masks and we'd end up with
- the right values in the registers though. */
- && (GET_MODE_SIZE (mode) == 64
- || !vector_all_ones_operand (optrue, data_mode)
- || opfalse != CONST0_RTX (data_mode))))
- {
- rtx (*gen) (rtx, rtx, rtx) = NULL;
-
- switch (mode)
- {
- case E_V16SImode:
- gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
- break;
- case E_V8DImode:
- gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
- cop0 = force_reg (mode, cop0);
- cop1 = force_reg (mode, cop1);
- break;
- case E_V32QImode:
- if (TARGET_AVX2)
- gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
- break;
- case E_V16HImode:
- if (TARGET_AVX2)
- gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
- break;
- case E_V8SImode:
- if (TARGET_AVX2)
- gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
- break;
- case E_V4DImode:
- if (TARGET_AVX512VL)
- {
- gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
- cop0 = force_reg (mode, cop0);
- cop1 = force_reg (mode, cop1);
- }
- break;
- case E_V16QImode:
- if (code == GTU && TARGET_SSE2)
- gen = gen_uminv16qi3;
- else if (code == GT && TARGET_SSE4_1)
- gen = gen_sminv16qi3;
- break;
- case E_V8HImode:
- if (code == GTU && TARGET_SSE4_1)
- gen = gen_uminv8hi3;
- else if (code == GT && TARGET_SSE2)
- gen = gen_sminv8hi3;
- break;
- case E_V4SImode:
- if (TARGET_SSE4_1)
- gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
- break;
- case E_V2DImode:
- if (TARGET_AVX512VL)
- {
- gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
- cop0 = force_reg (mode, cop0);
- cop1 = force_reg (mode, cop1);
- }
- break;
- default:
- break;
- }
-
- if (gen)
- {
- rtx tem = gen_reg_rtx (mode);
- if (!vector_operand (cop0, mode))
- cop0 = force_reg (mode, cop0);
- if (!vector_operand (cop1, mode))
- cop1 = force_reg (mode, cop1);
- *negate = !*negate;
- emit_insn (gen (tem, cop0, cop1));
- cop1 = tem;
- code = EQ;
- }
- }
-
- /* Unsigned parallel compare is not supported by the hardware.
- Play some tricks to turn this into a signed comparison
- against 0. */
- if (code == GTU)
- {
- cop0 = force_reg (mode, cop0);
-
- switch (mode)
- {
- case E_V16SImode:
- case E_V8DImode:
- case E_V8SImode:
- case E_V4DImode:
- case E_V4SImode:
- case E_V2DImode:
- {
- rtx t1, t2, mask;
- rtx (*gen_sub3) (rtx, rtx, rtx);
-
- switch (mode)
- {
- case E_V16SImode: gen_sub3 = gen_subv16si3; break;
- case E_V8DImode: gen_sub3 = gen_subv8di3; break;
- case E_V8SImode: gen_sub3 = gen_subv8si3; break;
- case E_V4DImode: gen_sub3 = gen_subv4di3; break;
- case E_V4SImode: gen_sub3 = gen_subv4si3; break;
- case E_V2DImode: gen_sub3 = gen_subv2di3; break;
- default:
- gcc_unreachable ();
- }
- /* Subtract (-(INT MAX) - 1) from both operands to make
- them signed. */
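- /* Subtracting INT_MIN is equivalent to flipping the sign bit, and
- x >u y iff (x ^ INT_MIN) >s (y ^ INT_MIN); e.g. 0x80000000 >u
- 0x7fffffff becomes 0 >s -1 after the adjustment. */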
- mask = ix86_build_signbit_mask (mode, true, false);
- t1 = gen_reg_rtx (mode);
- emit_insn (gen_sub3 (t1, cop0, mask));
-
- t2 = gen_reg_rtx (mode);
- emit_insn (gen_sub3 (t2, cop1, mask));
-
- cop0 = t1;
- cop1 = t2;
- code = GT;
- }
- break;
-
- case E_V64QImode:
- case E_V32HImode:
- case E_V32QImode:
- case E_V16HImode:
- case E_V16QImode:
- case E_V8HImode:
- /* Perform a parallel unsigned saturating subtraction. */
- x = gen_reg_rtx (mode);
- emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
- cop1)));
-
- cop0 = x;
- cop1 = CONST0_RTX (mode);
- code = EQ;
- *negate = !*negate;
- break;
-
- default:
- gcc_unreachable ();
- }
- }
- }
-
- if (*negate)
- std::swap (op_true, op_false);
-
- /* Allow the comparison to be done in one mode, but the movcc to
- happen in another mode. */
- if (data_mode == mode)
- {
- x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
- op_true, op_false);
- }
- else
- {
- gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
- x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
- op_true, op_false);
- if (GET_MODE (x) == mode)
- x = gen_lowpart (data_mode, x);
- }
-
- return x;
-}
-
-/* Expand integer vector comparison. */
-
-bool
-ix86_expand_int_vec_cmp (rtx operands[])
-{
- rtx_code code = GET_CODE (operands[1]);
- bool negate = false;
- rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
- operands[3], NULL, NULL, &negate);
-
- if (!cmp)
- return false;
-
- if (negate)
- cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
- CONST0_RTX (GET_MODE (cmp)),
- NULL, NULL, &negate);
-
- gcc_assert (!negate);
-
- if (operands[0] != cmp)
- emit_move_insn (operands[0], cmp);
-
- return true;
-}
-
-/* Expand a floating-point vector conditional move; a vcond operation
- rather than a movcc operation. */
-
-bool
-ix86_expand_fp_vcond (rtx operands[])
-{
- enum rtx_code code = GET_CODE (operands[3]);
- rtx cmp;
-
- code = ix86_prepare_sse_fp_compare_args (operands[0], code,
- &operands[4], &operands[5]);
- if (code == UNKNOWN)
- {
- rtx temp;
- switch (GET_CODE (operands[3]))
- {
- case LTGT:
- temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
- operands[5], operands[0], operands[0]);
- cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
- operands[5], operands[1], operands[2]);
- code = AND;
- break;
- case UNEQ:
- temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
- operands[5], operands[0], operands[0]);
- cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
- operands[5], operands[1], operands[2]);
- code = IOR;
- break;
- default:
- gcc_unreachable ();
- }
- cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
- OPTAB_DIRECT);
- ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
- return true;
- }
-
- if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
- operands[5], operands[1], operands[2]))
- return true;
-
- cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
- operands[1], operands[2]);
- ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
- return true;
-}
-
-/* Expand a signed/unsigned integral vector conditional move. */
-
-bool
-ix86_expand_int_vcond (rtx operands[])
-{
- machine_mode data_mode = GET_MODE (operands[0]);
- machine_mode mode = GET_MODE (operands[4]);
- enum rtx_code code = GET_CODE (operands[3]);
- bool negate = false;
- rtx x, cop0, cop1;
-
- cop0 = operands[4];
- cop1 = operands[5];
-
- /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
- and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
- if ((code == LT || code == GE)
- && data_mode == mode
- && cop1 == CONST0_RTX (mode)
- && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
- && GET_MODE_UNIT_SIZE (data_mode) > 1
- && GET_MODE_UNIT_SIZE (data_mode) <= 8
- && (GET_MODE_SIZE (data_mode) == 16
- || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
- {
- rtx negop = operands[2 - (code == LT)];
- int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
- if (negop == CONST1_RTX (data_mode))
- {
- rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
- operands[0], 1, OPTAB_DIRECT);
- if (res != operands[0])
- emit_move_insn (operands[0], res);
- return true;
- }
- else if (GET_MODE_INNER (data_mode) != DImode
- && vector_all_ones_operand (negop, data_mode))
- {
- rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
- operands[0], 0, OPTAB_DIRECT);
- if (res != operands[0])
- emit_move_insn (operands[0], res);
- return true;
- }
- }
-
- if (!nonimmediate_operand (cop1, mode))
- cop1 = force_reg (mode, cop1);
- if (!general_operand (operands[1], data_mode))
- operands[1] = force_reg (data_mode, operands[1]);
- if (!general_operand (operands[2], data_mode))
- operands[2] = force_reg (data_mode, operands[2]);
-
- x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
- operands[1], operands[2], &negate);
-
- if (!x)
- return false;
-
- ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
- operands[2-negate]);
- return true;
-}
-
-/* AVX512F does support 64-byte integer vector operations,
- thus the longest vector we are faced with is V64QImode. */
-#define MAX_VECT_LEN 64
-
-struct expand_vec_perm_d
-{
- rtx target, op0, op1;
- unsigned char perm[MAX_VECT_LEN];
- machine_mode vmode;
- unsigned char nelt;
- bool one_operand_p;
- bool testing_p;
-};
-
-static bool
-ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
- struct expand_vec_perm_d *d)
-{
- /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
- expander, so args are either in d, or in op0, op1 etc. */
- machine_mode mode = GET_MODE (d ? d->op0 : op0);
- machine_mode maskmode = mode;
- rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
-
- switch (mode)
- {
- case E_V8HImode:
- if (TARGET_AVX512VL && TARGET_AVX512BW)
- gen = gen_avx512vl_vpermt2varv8hi3;
- break;
- case E_V16HImode:
- if (TARGET_AVX512VL && TARGET_AVX512BW)
- gen = gen_avx512vl_vpermt2varv16hi3;
- break;
- case E_V64QImode:
- if (TARGET_AVX512VBMI)
- gen = gen_avx512bw_vpermt2varv64qi3;
- break;
- case E_V32HImode:
- if (TARGET_AVX512BW)
- gen = gen_avx512bw_vpermt2varv32hi3;
- break;
- case E_V4SImode:
- if (TARGET_AVX512VL)
- gen = gen_avx512vl_vpermt2varv4si3;
- break;
- case E_V8SImode:
- if (TARGET_AVX512VL)
- gen = gen_avx512vl_vpermt2varv8si3;
- break;
- case E_V16SImode:
- if (TARGET_AVX512F)
- gen = gen_avx512f_vpermt2varv16si3;
- break;
- case E_V4SFmode:
- if (TARGET_AVX512VL)
- {
- gen = gen_avx512vl_vpermt2varv4sf3;
- maskmode = V4SImode;
- }
- break;
- case E_V8SFmode:
- if (TARGET_AVX512VL)
- {
- gen = gen_avx512vl_vpermt2varv8sf3;
- maskmode = V8SImode;
- }
- break;
- case E_V16SFmode:
- if (TARGET_AVX512F)
- {
- gen = gen_avx512f_vpermt2varv16sf3;
- maskmode = V16SImode;
- }
- break;
- case E_V2DImode:
- if (TARGET_AVX512VL)
- gen = gen_avx512vl_vpermt2varv2di3;
- break;
- case E_V4DImode:
- if (TARGET_AVX512VL)
- gen = gen_avx512vl_vpermt2varv4di3;
- break;
- case E_V8DImode:
- if (TARGET_AVX512F)
- gen = gen_avx512f_vpermt2varv8di3;
- break;
- case E_V2DFmode:
- if (TARGET_AVX512VL)
- {
- gen = gen_avx512vl_vpermt2varv2df3;
- maskmode = V2DImode;
- }
- break;
- case E_V4DFmode:
- if (TARGET_AVX512VL)
- {
- gen = gen_avx512vl_vpermt2varv4df3;
- maskmode = V4DImode;
- }
- break;
- case E_V8DFmode:
- if (TARGET_AVX512F)
- {
- gen = gen_avx512f_vpermt2varv8df3;
- maskmode = V8DImode;
- }
- break;
- default:
- break;
- }
-
- if (gen == NULL)
- return false;
-
- /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
- expander, so args are either in d, or in op0, op1 etc. */
- if (d)
- {
- rtx vec[64];
- target = d->target;
- op0 = d->op0;
- op1 = d->op1;
- for (int i = 0; i < d->nelt; ++i)
- vec[i] = GEN_INT (d->perm[i]);
- mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
- }
-
- emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
- return true;
-}
-
-/* Expand a variable vector permutation. */
-
-void
-ix86_expand_vec_perm (rtx operands[])
-{
- rtx target = operands[0];
- rtx op0 = operands[1];
- rtx op1 = operands[2];
- rtx mask = operands[3];
- rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
- machine_mode mode = GET_MODE (op0);
- machine_mode maskmode = GET_MODE (mask);
- int w, e, i;
- bool one_operand_shuffle = rtx_equal_p (op0, op1);
-
- /* Number of elements in the vector. */
- w = GET_MODE_NUNITS (mode);
- e = GET_MODE_UNIT_SIZE (mode);
- gcc_assert (w <= 64);
-
- if (TARGET_AVX512F && one_operand_shuffle)
- {
- rtx (*gen) (rtx, rtx, rtx) = NULL;
- switch (mode)
- {
- case E_V16SImode:
- gen = gen_avx512f_permvarv16si;
- break;
- case E_V16SFmode:
- gen = gen_avx512f_permvarv16sf;
- break;
- case E_V8DImode:
- gen = gen_avx512f_permvarv8di;
- break;
- case E_V8DFmode:
- gen = gen_avx512f_permvarv8df;
- break;
- default:
- break;
- }
- if (gen != NULL)
- {
- emit_insn (gen (target, op0, mask));
- return;
- }
- }
-
- if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
- return;
-
- if (TARGET_AVX2)
- {
- if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
- {
- /* Unfortunately, the VPERMQ and VPERMPD instructions only support
- a constant shuffle operand. With a tiny bit of effort we can
- use VPERMD instead. A re-interpretation stall for V4DFmode is
- unfortunate but there's no avoiding it.
- Similarly for V16HImode we don't have instructions for variable
- shuffling, while for V32QImode we can use vpshufb; vpshufb;
- vpermq; vpor after preparing suitable masks. */
-
- if (mode == V16HImode)
- {
- maskmode = mode = V32QImode;
- w = 32;
- e = 1;
- }
- else
- {
- maskmode = mode = V8SImode;
- w = 8;
- e = 4;
- }
- t1 = gen_reg_rtx (maskmode);
-
- /* Replicate the low bits of the V4DImode mask into V8SImode:
- mask = { A B C D }
- t1 = { A A B B C C D D }. */
- for (i = 0; i < w / 2; ++i)
- vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
- vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
- vt = force_reg (maskmode, vt);
- mask = gen_lowpart (maskmode, mask);
- if (maskmode == V8SImode)
- emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
- else
- emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
-
- /* Multiply the shuffle indices by two. */
- t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
- OPTAB_DIRECT);
-
- /* Add one to the odd shuffle indices:
- t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
- for (i = 0; i < w / 2; ++i)
- {
- vec[i * 2] = const0_rtx;
- vec[i * 2 + 1] = const1_rtx;
- }
- vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
- vt = validize_mem (force_const_mem (maskmode, vt));
- t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
- OPTAB_DIRECT);
-
- /* Continue as if V8SImode (resp. V32QImode) was used initially. */
- operands[3] = mask = t1;
- target = gen_reg_rtx (mode);
- op0 = gen_lowpart (mode, op0);
- op1 = gen_lowpart (mode, op1);
- }
-
- switch (mode)
- {
- case E_V8SImode:
- /* The VPERMD and VPERMPS instructions already properly ignore
- the high bits of the shuffle elements. No need for us to
- perform an AND ourselves. */
- if (one_operand_shuffle)
- {
- emit_insn (gen_avx2_permvarv8si (target, op0, mask));
- if (target != operands[0])
- emit_move_insn (operands[0],
- gen_lowpart (GET_MODE (operands[0]), target));
- }
- else
- {
- t1 = gen_reg_rtx (V8SImode);
- t2 = gen_reg_rtx (V8SImode);
- emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
- emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
- goto merge_two;
- }
- return;
-
- case E_V8SFmode:
- mask = gen_lowpart (V8SImode, mask);
- if (one_operand_shuffle)
- emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
- else
- {
- t1 = gen_reg_rtx (V8SFmode);
- t2 = gen_reg_rtx (V8SFmode);
- emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
- emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
- goto merge_two;
- }
- return;
-
- case E_V4SImode:
- /* By combining the two 128-bit input vectors into one 256-bit
- input vector, we can use VPERMD and VPERMPS for the full
- two-operand shuffle. */
- t1 = gen_reg_rtx (V8SImode);
- t2 = gen_reg_rtx (V8SImode);
- emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
- emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
- emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
- emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
- return;
-
- case E_V4SFmode:
- t1 = gen_reg_rtx (V8SFmode);
- t2 = gen_reg_rtx (V8SImode);
- mask = gen_lowpart (V4SImode, mask);
- emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
- emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
- emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
- emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
- return;
-
- case E_V32QImode:
- t1 = gen_reg_rtx (V32QImode);
- t2 = gen_reg_rtx (V32QImode);
- t3 = gen_reg_rtx (V32QImode);
- vt2 = GEN_INT (-128);
- vt = gen_const_vec_duplicate (V32QImode, vt2);
- vt = force_reg (V32QImode, vt);
- for (i = 0; i < 32; i++)
- vec[i] = i < 16 ? vt2 : const0_rtx;
- vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
- vt2 = force_reg (V32QImode, vt2);
- /* From mask create two adjusted masks, which contain the same
- bits as mask in the low 7 bits of each vector element.
- The first mask will have the most significant bit clear
- if it requests element from the same 128-bit lane
- and MSB set if it requests element from the other 128-bit lane.
- The second mask will have the opposite values of the MSB,
- and additionally will have its 128-bit lanes swapped.
- E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
- t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
- t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
- stands for the other 12 bytes. */
- /* The bit that tells whether an element is from the same lane or the
- other lane is bit 4, so shift it up by 3 to the MSB position. */
- t5 = gen_reg_rtx (V4DImode);
- emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
- GEN_INT (3)));
- /* Clear MSB bits from the mask just in case it had them set. */
- emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
- /* After this t1 will have MSB set for elements from other lane. */
- emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
- /* Clear bits other than MSB. */
- emit_insn (gen_andv32qi3 (t1, t1, vt));
- /* Or in the lower bits from mask into t3. */
- emit_insn (gen_iorv32qi3 (t3, t1, t2));
- /* And invert MSB bits in t1, so MSB is set for elements from the same
- lane. */
- emit_insn (gen_xorv32qi3 (t1, t1, vt));
- /* Swap 128-bit lanes in t3. */
- t6 = gen_reg_rtx (V4DImode);
- emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
- const2_rtx, GEN_INT (3),
- const0_rtx, const1_rtx));
- /* And or in the lower bits from mask into t1. */
- emit_insn (gen_iorv32qi3 (t1, t1, t2));
- if (one_operand_shuffle)
- {
- /* Each of these shuffles will put 0s in places where an
- element from the other 128-bit lane is needed; otherwise
- it will shuffle in the requested value. */
- emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
- gen_lowpart (V32QImode, t6)));
- emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
- /* For t3 the 128-bit lanes are swapped again. */
- t7 = gen_reg_rtx (V4DImode);
- emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
- const2_rtx, GEN_INT (3),
- const0_rtx, const1_rtx));
- /* And ORing both together leads to the result. */
- emit_insn (gen_iorv32qi3 (target, t1,
- gen_lowpart (V32QImode, t7)));
- if (target != operands[0])
- emit_move_insn (operands[0],
- gen_lowpart (GET_MODE (operands[0]), target));
- return;
- }
-
- t4 = gen_reg_rtx (V32QImode);
- /* Similar to the above one_operand_shuffle code, just
- repeated twice, once for each operand. The merge_two:
- code will merge the two results together. */
- emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
- gen_lowpart (V32QImode, t6)));
- emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
- gen_lowpart (V32QImode, t6)));
- emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
- emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
- t7 = gen_reg_rtx (V4DImode);
- emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
- const2_rtx, GEN_INT (3),
- const0_rtx, const1_rtx));
- t8 = gen_reg_rtx (V4DImode);
- emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
- const2_rtx, GEN_INT (3),
- const0_rtx, const1_rtx));
- emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
- emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
- t1 = t4;
- t2 = t3;
- goto merge_two;
-
- default:
- gcc_assert (GET_MODE_SIZE (mode) <= 16);
- break;
- }
- }
-
- if (TARGET_XOP)
- {
- /* The XOP VPPERM insn supports three inputs. By ignoring the
- one_operand_shuffle special case, we avoid creating another
- set of constant vectors in memory. */
- one_operand_shuffle = false;
-
- /* mask = mask & {2*w-1, ...} */
- vt = GEN_INT (2*w - 1);
- }
- else
- {
- /* mask = mask & {w-1, ...} */
- vt = GEN_INT (w - 1);
- }
-
- vt = gen_const_vec_duplicate (maskmode, vt);
- mask = expand_simple_binop (maskmode, AND, mask, vt,
- NULL_RTX, 0, OPTAB_DIRECT);
-
- /* For non-QImode operations, convert the word permutation control
- into a byte permutation control. */
- if (mode != V16QImode)
- {
- mask = expand_simple_binop (maskmode, ASHIFT, mask,
- GEN_INT (exact_log2 (e)),
- NULL_RTX, 0, OPTAB_DIRECT);
-
- /* Convert mask to vector of chars. */
- mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
-
- /* Replicate each of the input bytes into byte positions:
- (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
- (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
- (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
- for (i = 0; i < 16; ++i)
- vec[i] = GEN_INT (i/e * e);
- vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
- vt = validize_mem (force_const_mem (V16QImode, vt));
- if (TARGET_XOP)
- emit_insn (gen_xop_pperm (mask, mask, mask, vt));
- else
- emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
-
- /* Convert it into the byte positions by doing
- mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
- for (i = 0; i < 16; ++i)
- vec[i] = GEN_INT (i % e);
- vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
- vt = validize_mem (force_const_mem (V16QImode, vt));
- emit_insn (gen_addv16qi3 (mask, mask, vt));
- }
-
- /* The actual shuffle operations all operate on V16QImode. */
- op0 = gen_lowpart (V16QImode, op0);
- op1 = gen_lowpart (V16QImode, op1);
-
- if (TARGET_XOP)
- {
- if (GET_MODE (target) != V16QImode)
- target = gen_reg_rtx (V16QImode);
- emit_insn (gen_xop_pperm (target, op0, op1, mask));
- if (target != operands[0])
- emit_move_insn (operands[0],
- gen_lowpart (GET_MODE (operands[0]), target));
- }
- else if (one_operand_shuffle)
- {
- if (GET_MODE (target) != V16QImode)
- target = gen_reg_rtx (V16QImode);
- emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
- if (target != operands[0])
- emit_move_insn (operands[0],
- gen_lowpart (GET_MODE (operands[0]), target));
- }
- else
- {
- rtx xops[6];
- bool ok;
-
- /* Shuffle the two input vectors independently. */
- t1 = gen_reg_rtx (V16QImode);
- t2 = gen_reg_rtx (V16QImode);
- emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
- emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
-
- merge_two:
- /* Then merge them together. The key is whether any given control
- element contained a bit set that indicates the second word. */
- mask = operands[3];
- vt = GEN_INT (w);
- if (maskmode == V2DImode && !TARGET_SSE4_1)
- {
- /* Without SSE4.1, we don't have V2DImode EQ. Perform one
- more shuffle to convert the V2DI input mask into a V4SI
- input mask, at which point the masking done by
- ix86_expand_int_vcond will work as desired. */
- rtx t3 = gen_reg_rtx (V4SImode);
- emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
- const0_rtx, const0_rtx,
- const2_rtx, const2_rtx));
- mask = t3;
- maskmode = V4SImode;
- e = w = 4;
- }
-
- vt = gen_const_vec_duplicate (maskmode, vt);
- vt = force_reg (maskmode, vt);
- mask = expand_simple_binop (maskmode, AND, mask, vt,
- NULL_RTX, 0, OPTAB_DIRECT);
-
- if (GET_MODE (target) != mode)
- target = gen_reg_rtx (mode);
- xops[0] = target;
- xops[1] = gen_lowpart (mode, t2);
- xops[2] = gen_lowpart (mode, t1);
- xops[3] = gen_rtx_EQ (maskmode, mask, vt);
- xops[4] = mask;
- xops[5] = vt;
- ok = ix86_expand_int_vcond (xops);
- gcc_assert (ok);
- if (target != operands[0])
- emit_move_insn (operands[0],
- gen_lowpart (GET_MODE (operands[0]), target));
- }
-}
-
- /* Unpack SRC into the next wider integer vector type DEST. UNSIGNED_P is
- true if we should do zero extension, else sign extension. HIGH_P is
- true if we want the N/2 high elements, else the low elements. */
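- /* For example, a V8HImode SRC produces a V4SImode DEST containing
- either its low four or its high four elements, zero- or
- sign-extended. */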
-
-void
-ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
-{
- machine_mode imode = GET_MODE (src);
- rtx tmp;
-
- if (TARGET_SSE4_1)
- {
- rtx (*unpack)(rtx, rtx);
- rtx (*extract)(rtx, rtx) = NULL;
- machine_mode halfmode = BLKmode;
-
- switch (imode)
- {
- case E_V64QImode:
- if (unsigned_p)
- unpack = gen_avx512bw_zero_extendv32qiv32hi2;
- else
- unpack = gen_avx512bw_sign_extendv32qiv32hi2;
- halfmode = V32QImode;
- extract
- = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
- break;
- case E_V32QImode:
- if (unsigned_p)
- unpack = gen_avx2_zero_extendv16qiv16hi2;
- else
- unpack = gen_avx2_sign_extendv16qiv16hi2;
- halfmode = V16QImode;
- extract
- = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
- break;
- case E_V32HImode:
- if (unsigned_p)
- unpack = gen_avx512f_zero_extendv16hiv16si2;
- else
- unpack = gen_avx512f_sign_extendv16hiv16si2;
- halfmode = V16HImode;
- extract
- = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
- break;
- case E_V16HImode:
- if (unsigned_p)
- unpack = gen_avx2_zero_extendv8hiv8si2;
- else
- unpack = gen_avx2_sign_extendv8hiv8si2;
- halfmode = V8HImode;
- extract
- = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
- break;
- case E_V16SImode:
- if (unsigned_p)
- unpack = gen_avx512f_zero_extendv8siv8di2;
- else
- unpack = gen_avx512f_sign_extendv8siv8di2;
- halfmode = V8SImode;
- extract
- = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
- break;
- case E_V8SImode:
- if (unsigned_p)
- unpack = gen_avx2_zero_extendv4siv4di2;
- else
- unpack = gen_avx2_sign_extendv4siv4di2;
- halfmode = V4SImode;
- extract
- = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
- break;
- case E_V16QImode:
- if (unsigned_p)
- unpack = gen_sse4_1_zero_extendv8qiv8hi2;
- else
- unpack = gen_sse4_1_sign_extendv8qiv8hi2;
- break;
- case E_V8HImode:
- if (unsigned_p)
- unpack = gen_sse4_1_zero_extendv4hiv4si2;
- else
- unpack = gen_sse4_1_sign_extendv4hiv4si2;
- break;
- case E_V4SImode:
- if (unsigned_p)
- unpack = gen_sse4_1_zero_extendv2siv2di2;
- else
- unpack = gen_sse4_1_sign_extendv2siv2di2;
- break;
- default:
- gcc_unreachable ();
- }
-
- if (GET_MODE_SIZE (imode) >= 32)
- {
- tmp = gen_reg_rtx (halfmode);
- emit_insn (extract (tmp, src));
- }
- else if (high_p)
- {
- /* Shift higher 8 bytes to lower 8 bytes. */
- tmp = gen_reg_rtx (V1TImode);
- emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
- GEN_INT (64)));
- tmp = gen_lowpart (imode, tmp);
- }
- else
- tmp = src;
-
- emit_insn (unpack (dest, tmp));
- }
- else
- {
- rtx (*unpack)(rtx, rtx, rtx);
-
- switch (imode)
- {
- case E_V16QImode:
- if (high_p)
- unpack = gen_vec_interleave_highv16qi;
- else
- unpack = gen_vec_interleave_lowv16qi;
- break;
- case E_V8HImode:
- if (high_p)
- unpack = gen_vec_interleave_highv8hi;
- else
- unpack = gen_vec_interleave_lowv8hi;
- break;
- case E_V4SImode:
- if (high_p)
- unpack = gen_vec_interleave_highv4si;
- else
- unpack = gen_vec_interleave_lowv4si;
- break;
- default:
- gcc_unreachable ();
- }
-
- if (unsigned_p)
- tmp = force_reg (imode, CONST0_RTX (imode));
- else
- tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
- src, pc_rtx, pc_rtx);
-
- rtx tmp2 = gen_reg_rtx (imode);
- emit_insn (unpack (tmp2, src, tmp));
- emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
- }
-}
-
- /* Expand conditional increment or decrement using adc/sbb instructions.
- The default case using setcc followed by the conditional move can be
- done by generic code. */
-bool
-ix86_expand_int_addcc (rtx operands[])
-{
- enum rtx_code code = GET_CODE (operands[1]);
- rtx flags;
- rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
- rtx compare_op;
- rtx val = const0_rtx;
- bool fpcmp = false;
- machine_mode mode;
- rtx op0 = XEXP (operands[1], 0);
- rtx op1 = XEXP (operands[1], 1);
-
- if (operands[3] != const1_rtx
- && operands[3] != constm1_rtx)
- return false;
- if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
- return false;
- code = GET_CODE (compare_op);
-
- flags = XEXP (compare_op, 0);
-
- if (GET_MODE (flags) == CCFPmode)
- {
- fpcmp = true;
- code = ix86_fp_compare_code_to_integer (code);
- }
-
- if (code != LTU)
- {
- val = constm1_rtx;
- if (fpcmp)
- PUT_CODE (compare_op,
- reverse_condition_maybe_unordered
- (GET_CODE (compare_op)));
- else
- PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
- }
-
- mode = GET_MODE (operands[0]);
-
- /* Construct either adc or sbb insn. */
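- /* E.g. x = (a < b) ? x + 1 : x for unsigned a, b becomes
- cmp b, a followed by adc $0, x: the carry flag holds the
- comparison result and the adc folds in the increment. */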
- if ((code == LTU) == (operands[3] == constm1_rtx))
- {
- switch (mode)
- {
- case E_QImode:
- insn = gen_subqi3_carry;
- break;
- case E_HImode:
- insn = gen_subhi3_carry;
- break;
- case E_SImode:
- insn = gen_subsi3_carry;
- break;
- case E_DImode:
- insn = gen_subdi3_carry;
- break;
- default:
- gcc_unreachable ();
- }
- }
- else
- {
- switch (mode)
- {
- case E_QImode:
- insn = gen_addqi3_carry;
- break;
- case E_HImode:
- insn = gen_addhi3_carry;
- break;
- case E_SImode:
- insn = gen_addsi3_carry;
- break;
- case E_DImode:
- insn = gen_adddi3_carry;
- break;
- default:
- gcc_unreachable ();
- }
- }
- emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
-
- return true;
-}
-
-
- /* Split OPERAND into PARTS, returning the number of parts. Similar to
- split_double_mode, but works for floating point parameters and
- non-offsettable memories. For pushes, it returns just stack offsets;
- the values will be saved in the right order. At most four parts are
- generated. */
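- /* For example, a DFmode operand on a 32-bit target is split into two
- SImode parts, and an XFmode operand into three. */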
-
-static int
-ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
-{
- int size;
-
- if (!TARGET_64BIT)
- size = mode == XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
- else
- size = (GET_MODE_SIZE (mode) + 4) / 8;
-
- gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
- gcc_assert (size >= 2 && size <= 4);
-
- /* Optimize constant pool reference to immediates. This is used by fp
- moves, that force all constants to memory to allow combining. */
- if (MEM_P (operand) && MEM_READONLY_P (operand))
- operand = avoid_constant_pool_reference (operand);
-
- if (MEM_P (operand) && !offsettable_memref_p (operand))
- {
- /* The only non-offsettable memories we handle are pushes. */
- int ok = push_operand (operand, VOIDmode);
-
- gcc_assert (ok);
-
- operand = copy_rtx (operand);
- PUT_MODE (operand, word_mode);
- parts[0] = parts[1] = parts[2] = parts[3] = operand;
- return size;
- }
-
- if (GET_CODE (operand) == CONST_VECTOR)
- {
- scalar_int_mode imode = int_mode_for_mode (mode).require ();
- /* Caution: if we looked through a constant pool memory above,
- the operand may actually have a different mode now. That's
- ok, since we want to pun this all the way back to an integer. */
- operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
- gcc_assert (operand != NULL);
- mode = imode;
- }
-
- if (!TARGET_64BIT)
- {
- if (mode == DImode)
- split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
- else
- {
- int i;
-
- if (REG_P (operand))
- {
- gcc_assert (reload_completed);
- for (i = 0; i < size; i++)
- parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
- }
- else if (offsettable_memref_p (operand))
- {
- operand = adjust_address (operand, SImode, 0);
- parts[0] = operand;
- for (i = 1; i < size; i++)
- parts[i] = adjust_address (operand, SImode, 4 * i);
- }
- else if (CONST_DOUBLE_P (operand))
- {
- const REAL_VALUE_TYPE *r;
- long l[4];
-
- r = CONST_DOUBLE_REAL_VALUE (operand);
- switch (mode)
- {
- case E_TFmode:
- real_to_target (l, r, mode);
- parts[3] = gen_int_mode (l[3], SImode);
- parts[2] = gen_int_mode (l[2], SImode);
- break;
- case E_XFmode:
- /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
- long double may not be 80-bit. */
- real_to_target (l, r, mode);
- parts[2] = gen_int_mode (l[2], SImode);
- break;
- case E_DFmode:
- REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
- break;
- default:
- gcc_unreachable ();
- }
- parts[1] = gen_int_mode (l[1], SImode);
- parts[0] = gen_int_mode (l[0], SImode);
- }
- else
- gcc_unreachable ();
- }
- }
- else
- {
- if (mode == TImode)
- split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
- if (mode == XFmode || mode == TFmode)
- {
- machine_mode upper_mode = mode == XFmode ? SImode : DImode;
- if (REG_P (operand))
- {
- gcc_assert (reload_completed);
- parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
- parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
- }
- else if (offsettable_memref_p (operand))
- {
- operand = adjust_address (operand, DImode, 0);
- parts[0] = operand;
- parts[1] = adjust_address (operand, upper_mode, 8);
- }
- else if (CONST_DOUBLE_P (operand))
- {
- long l[4];
-
- real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
-
- /* real_to_target puts 32-bit pieces in each long. */
- parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
- | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
- << 32), DImode);
-
- if (upper_mode == SImode)
- parts[1] = gen_int_mode (l[2], SImode);
- else
- parts[1]
- = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
- | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
- << 32), DImode);
- }
- else
- gcc_unreachable ();
- }
- }
-
- return size;
-}
-
-/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
- The parts are moved in an order that avoids overwriting a source part
- before it is read; operands 2 onwards hold the destination parts and
- operands 6 onwards the source parts. */
-
-void
-ix86_split_long_move (rtx operands[])
-{
- rtx part[2][4];
- int nparts, i, j;
- int push = 0;
- int collisions = 0;
- machine_mode mode = GET_MODE (operands[0]);
- bool collisionparts[4];
-
- /* The DFmode expanders may ask us to move a double.
- For a 64-bit target this is a single move. By hiding that fact
- here we simplify the i386.md splitters. */
- if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
- {
- /* Optimize constant pool references to immediates. This is used by
- fp moves, which force all constants to memory to allow combining. */
-
- if (MEM_P (operands[1])
- && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
- && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
- operands[1] = get_pool_constant (XEXP (operands[1], 0));
- if (push_operand (operands[0], VOIDmode))
- {
- operands[0] = copy_rtx (operands[0]);
- PUT_MODE (operands[0], word_mode);
- }
- else
- operands[0] = gen_lowpart (DImode, operands[0]);
- operands[1] = gen_lowpart (DImode, operands[1]);
- emit_move_insn (operands[0], operands[1]);
- return;
- }
-
- /* The only non-offsettable memory we handle is a push. */
- if (push_operand (operands[0], VOIDmode))
- push = 1;
- else
- gcc_assert (!MEM_P (operands[0])
- || offsettable_memref_p (operands[0]));
-
- nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
- ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
-
- /* When emitting a push, take care of source operands on the stack. */
- if (push && MEM_P (operands[1])
- && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
- {
- rtx src_base = XEXP (part[1][nparts - 1], 0);
-
- /* Compensate for the stack decrement by 4. */
- if (!TARGET_64BIT && nparts == 3
- && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
- src_base = plus_constant (Pmode, src_base, 4);
-
- /* src_base refers to the stack pointer and is
- automatically decreased by the emitted pushes. */
- for (i = 0; i < nparts; i++)
- part[1][i] = change_address (part[1][i],
- GET_MODE (part[1][i]), src_base);
- }
-
- /* We need to do the copy in the right order in case an address register
- of the source overlaps the destination. */
- if (REG_P (part[0][0]) && MEM_P (part[1][0]))
- {
- rtx tmp;
-
- for (i = 0; i < nparts; i++)
- {
- collisionparts[i]
- = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
- if (collisionparts[i])
- collisions++;
- }
-
- /* Collision in the middle part can be handled by reordering. */
- if (collisions == 1 && nparts == 3 && collisionparts [1])
- {
- std::swap (part[0][1], part[0][2]);
- std::swap (part[1][1], part[1][2]);
- }
- else if (collisions == 1
- && nparts == 4
- && (collisionparts [1] || collisionparts [2]))
- {
- if (collisionparts [1])
- {
- std::swap (part[0][1], part[0][2]);
- std::swap (part[1][1], part[1][2]);
- }
- else
- {
- std::swap (part[0][2], part[0][3]);
- std::swap (part[1][2], part[1][3]);
- }
- }
-
- /* If there are more collisions, we can't handle them by reordering.
- Do an lea to the last part and use only one colliding move. */
- else if (collisions > 1)
- {
- rtx base, addr;
-
- collisions = 1;
-
- base = part[0][nparts - 1];
-
- /* Handle the case when the last part isn't valid for lea.
- This happens in 64-bit mode when storing the 12-byte XFmode. */
- if (GET_MODE (base) != Pmode)
- base = gen_rtx_REG (Pmode, REGNO (base));
-
- addr = XEXP (part[1][0], 0);
- if (TARGET_TLS_DIRECT_SEG_REFS)
- {
- struct ix86_address parts;
- int ok = ix86_decompose_address (addr, &parts);
- gcc_assert (ok);
- /* It is not valid to use %gs: or %fs: in lea. */
- gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
- }
- emit_insn (gen_rtx_SET (base, addr));
- part[1][0] = replace_equiv_address (part[1][0], base);
- for (i = 1; i < nparts; i++)
- {
- tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
- part[1][i] = replace_equiv_address (part[1][i], tmp);
- }
- }
- }
-
- if (push)
- {
- if (!TARGET_64BIT)
- {
- if (nparts == 3)
- {
- if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
- emit_insn (ix86_gen_add3 (stack_pointer_rtx,
- stack_pointer_rtx, GEN_INT (-4)));
- emit_move_insn (part[0][2], part[1][2]);
- }
- else if (nparts == 4)
- {
- emit_move_insn (part[0][3], part[1][3]);
- emit_move_insn (part[0][2], part[1][2]);
- }
- }
- else
- {
- /* In 64-bit mode we don't have a 32-bit push available. If the operand
- is a register, that is OK - we will just use the larger counterpart.
- We also retype memory - these cases come from an attempt to avoid a
- REX prefix on moving the second half of a TFmode value. */
- if (GET_MODE (part[1][1]) == SImode)
- {
- switch (GET_CODE (part[1][1]))
- {
- case MEM:
- part[1][1] = adjust_address (part[1][1], DImode, 0);
- break;
-
- case REG:
- part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
- break;
-
- default:
- gcc_unreachable ();
- }
-
- if (GET_MODE (part[1][0]) == SImode)
- part[1][0] = part[1][1];
- }
- }
- emit_move_insn (part[0][1], part[1][1]);
- emit_move_insn (part[0][0], part[1][0]);
- return;
- }
-
- /* Choose the correct order so we do not overwrite the source before it is copied. */
- if ((REG_P (part[0][0])
- && REG_P (part[1][1])
- && (REGNO (part[0][0]) == REGNO (part[1][1])
- || (nparts == 3
- && REGNO (part[0][0]) == REGNO (part[1][2]))
- || (nparts == 4
- && REGNO (part[0][0]) == REGNO (part[1][3]))))
- || (collisions > 0
- && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
- {
- for (i = 0, j = nparts - 1; i < nparts; i++, j--)
- {
- operands[2 + i] = part[0][j];
- operands[6 + i] = part[1][j];
- }
- }
- else
- {
- for (i = 0; i < nparts; i++)
- {
- operands[2 + i] = part[0][i];
- operands[6 + i] = part[1][i];
- }
- }
-
- /* If optimizing for size, attempt to locally unCSE nonzero constants. */
- if (optimize_insn_for_size_p ())
- {
- for (j = 0; j < nparts - 1; j++)
- if (CONST_INT_P (operands[6 + j])
- && operands[6 + j] != const0_rtx
- && REG_P (operands[2 + j]))
- for (i = j; i < nparts - 1; i++)
- if (CONST_INT_P (operands[7 + i])
- && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
- operands[7 + i] = operands[2 + j];
- }
-
- for (i = 0; i < nparts; i++)
- emit_move_insn (operands[2 + i], operands[6 + i]);
-
- return;
-}
-
-/* Helper function of ix86_split_ashl used to generate an SImode/DImode
- left shift by a constant, either using a single shift or
- a sequence of add instructions. */
-
-static void
-ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
-{
- rtx (*insn)(rtx, rtx, rtx);
-
- if (count == 1
- || (count * ix86_cost->add <= ix86_cost->shift_const
- && !optimize_insn_for_size_p ()))
- {
- insn = mode == DImode ? gen_addsi3 : gen_adddi3;
- while (count-- > 0)
- emit_insn (insn (operand, operand, operand));
- }
- else
- {
- insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
- emit_insn (insn (operand, operand, GEN_INT (count)));
- }
-}
-
-void
-ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
-{
- rtx (*gen_ashl3)(rtx, rtx, rtx);
- rtx (*gen_shld)(rtx, rtx, rtx);
- int half_width = GET_MODE_BITSIZE (mode) >> 1;
-
- rtx low[2], high[2];
- int count;
-
- if (CONST_INT_P (operands[2]))
- {
- split_double_mode (mode, operands, 2, low, high);
- count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
-
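- /* A shift by half the width or more moves the low half into the
- high half (shifted by the remainder) and clears the low half. */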
- if (count >= half_width)
- {
- emit_move_insn (high[0], low[1]);
- emit_move_insn (low[0], const0_rtx);
-
- if (count > half_width)
- ix86_expand_ashl_const (high[0], count - half_width, mode);
- }
- else
- {
- gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
-
- if (!rtx_equal_p (operands[0], operands[1]))
- emit_move_insn (operands[0], operands[1]);
-
- emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
- ix86_expand_ashl_const (low[0], count, mode);
- }
- return;
- }
-
- split_double_mode (mode, operands, 1, low, high);
-
- gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
-
- if (operands[1] == const1_rtx)
- {
- /* Assuming we've chosen QImode-capable registers, 1 << N
- can be done with two 32/64-bit shifts, no branches, no cmoves. */
- if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
- {
- rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
-
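- /* Test the half-width bit of the count: if it is clear the 1 lands
- in the low half, otherwise in the high half. A SETcc on the ZF
- flag writes the 0/1 into the proper half, and the shifts below
- then move it to its final bit position. */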
- ix86_expand_clear (low[0]);
- ix86_expand_clear (high[0]);
- emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
-
- d = gen_lowpart (QImode, low[0]);
- d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
- s = gen_rtx_EQ (QImode, flags, const0_rtx);
- emit_insn (gen_rtx_SET (d, s));
-
- d = gen_lowpart (QImode, high[0]);
- d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
- s = gen_rtx_NE (QImode, flags, const0_rtx);
- emit_insn (gen_rtx_SET (d, s));
- }
-
- /* Otherwise, we can get the same results by manually performing
- a bit extract operation on bit 5/6, and then performing the two
- shifts. The two methods of getting 0/1 into low/high are exactly
- the same size. Avoiding the shift in the bit extract case helps
- pentium4 a bit; no one else seems to care much either way. */
- else
- {
- machine_mode half_mode;
- rtx (*gen_lshr3)(rtx, rtx, rtx);
- rtx (*gen_and3)(rtx, rtx, rtx);
- rtx (*gen_xor3)(rtx, rtx, rtx);
- HOST_WIDE_INT bits;
- rtx x;
-
- if (mode == DImode)
- {
- half_mode = SImode;
- gen_lshr3 = gen_lshrsi3;
- gen_and3 = gen_andsi3;
- gen_xor3 = gen_xorsi3;
- bits = 5;
- }
- else
- {
- half_mode = DImode;
- gen_lshr3 = gen_lshrdi3;
- gen_and3 = gen_anddi3;
- gen_xor3 = gen_xordi3;
- bits = 6;
- }
-
- if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
- x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
- else
- x = gen_lowpart (half_mode, operands[2]);
- emit_insn (gen_rtx_SET (high[0], x));
-
- emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
- emit_insn (gen_and3 (high[0], high[0], const1_rtx));
- emit_move_insn (low[0], high[0]);
- emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
- }
-
- emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
- emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
- return;
- }
-
- if (operands[1] == constm1_rtx)
- {
- /* For -1 << N, we can avoid the shld instruction, because we
- know that we're shifting 0...31/63 ones into a -1. */
- emit_move_insn (low[0], constm1_rtx);
- if (optimize_insn_for_size_p ())
- emit_move_insn (high[0], low[0]);
- else
- emit_move_insn (high[0], constm1_rtx);
- }
- else
- {
- gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
-
- if (!rtx_equal_p (operands[0], operands[1]))
- emit_move_insn (operands[0], operands[1]);
-
- split_double_mode (mode, operands, 1, low, high);
- emit_insn (gen_shld (high[0], low[0], operands[2]));
- }
-
- emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
-
- if (TARGET_CMOVE && scratch)
- {
- rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
- = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
-
- ix86_expand_clear (scratch);
- emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
- }
- else
- {
- rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
- = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
-
- emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
- }
-}
-
-void
-ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
-{
- rtx (*gen_ashr3)(rtx, rtx, rtx)
- = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
- rtx (*gen_shrd)(rtx, rtx, rtx);
- int half_width = GET_MODE_BITSIZE (mode) >> 1;
-
- rtx low[2], high[2];
- int count;
-
- if (CONST_INT_P (operands[2]))
- {
- split_double_mode (mode, operands, 2, low, high);
- count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
-
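- /* An arithmetic right shift by the full width minus one leaves
- only the sign; both result halves become the sign mask. */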
- if (count == GET_MODE_BITSIZE (mode) - 1)
- {
- emit_move_insn (high[0], high[1]);
- emit_insn (gen_ashr3 (high[0], high[0],
- GEN_INT (half_width - 1)));
- emit_move_insn (low[0], high[0]);
-
- }
- else if (count >= half_width)
- {
- emit_move_insn (low[0], high[1]);
- emit_move_insn (high[0], low[0]);
- emit_insn (gen_ashr3 (high[0], high[0],
- GEN_INT (half_width - 1)));
-
- if (count > half_width)
- emit_insn (gen_ashr3 (low[0], low[0],
- GEN_INT (count - half_width)));
- }
- else
- {
- gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
-
- if (!rtx_equal_p (operands[0], operands[1]))
- emit_move_insn (operands[0], operands[1]);
-
- emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
- emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
- }
- }
- else
- {
- gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
-
- if (!rtx_equal_p (operands[0], operands[1]))
- emit_move_insn (operands[0], operands[1]);
-
- split_double_mode (mode, operands, 1, low, high);
-
- emit_insn (gen_shrd (low[0], high[0], operands[2]));
- emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
-
- if (TARGET_CMOVE && scratch)
- {
- rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
- = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
-
- emit_move_insn (scratch, high[0]);
- emit_insn (gen_ashr3 (scratch, scratch,
- GEN_INT (half_width - 1)));
- emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
- scratch));
- }
- else
- {
- rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
- = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
-
- emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
- }
- }
-}
-
-void
-ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
-{
- rtx (*gen_lshr3)(rtx, rtx, rtx)
- = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
- rtx (*gen_shrd)(rtx, rtx, rtx);
- int half_width = GET_MODE_BITSIZE (mode) >> 1;
-
- rtx low[2], high[2];
- int count;
-
- if (CONST_INT_P (operands[2]))
- {
- split_double_mode (mode, operands, 2, low, high);
- count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
-
- if (count >= half_width)
- {
- emit_move_insn (low[0], high[1]);
- ix86_expand_clear (high[0]);
-
- if (count > half_width)
- emit_insn (gen_lshr3 (low[0], low[0],
- GEN_INT (count - half_width)));
- }
- else
- {
- gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
-
- if (!rtx_equal_p (operands[0], operands[1]))
- emit_move_insn (operands[0], operands[1]);
-
- emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
- emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
- }
- }
- else
- {
- gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
-
- if (!rtx_equal_p (operands[0], operands[1]))
- emit_move_insn (operands[0], operands[1]);
-
- split_double_mode (mode, operands, 1, low, high);
-
- emit_insn (gen_shrd (low[0], high[0], operands[2]));
- emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
-
- if (TARGET_CMOVE && scratch)
- {
- rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
- = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
-
- ix86_expand_clear (scratch);
- emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
- scratch));
- }
- else
- {
- rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
- = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
-
- emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
- }
- }
-}
-
-/* Predict the just-emitted jump instruction to be taken with probability PROB. */
-static void
-predict_jump (int prob)
-{
- rtx_insn *insn = get_last_insn ();
- gcc_assert (JUMP_P (insn));
- add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
-}
-
-/* Helper function for the string operations below. Test whether VARIABLE
- is aligned to VALUE bytes; if so, jump to the returned label. */
-static rtx_code_label *
-ix86_expand_aligntest (rtx variable, int value, bool epilogue)
-{
- rtx_code_label *label = gen_label_rtx ();
- rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
- if (GET_MODE (variable) == DImode)
- emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
- else
- emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
- emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
- 1, label);
- if (epilogue)
- predict_jump (REG_BR_PROB_BASE * 50 / 100);
- else
- predict_jump (REG_BR_PROB_BASE * 90 / 100);
- return label;
-}
-
-/* Decrease COUNTREG by VALUE. */
-static void
-ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
-{
- rtx (*gen_add)(rtx, rtx, rtx)
- = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
-
- emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
-}
-
-/* Zero-extend the possibly SImode EXP to a Pmode register. */
-rtx
-ix86_zero_extend_to_Pmode (rtx exp)
-{
- return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
-}
-
-/* Divide COUNTREG by SCALE. */
-static rtx
-scale_counter (rtx countreg, int scale)
-{
- rtx sc;
-
- if (scale == 1)
- return countreg;
- if (CONST_INT_P (countreg))
- return GEN_INT (INTVAL (countreg) / scale);
- gcc_assert (REG_P (countreg));
-
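- /* A register count is scaled with a logical shift, so SCALE must be
- a power of two here. */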
- sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
- GEN_INT (exact_log2 (scale)),
- NULL, 1, OPTAB_DIRECT);
- return sc;
-}
-
-/* Return mode for the memcpy/memset loop counter. Prefer SImode over
- DImode for constant loop counts. */
-
-static machine_mode
-counter_mode (rtx count_exp)
-{
- if (GET_MODE (count_exp) != VOIDmode)
- return GET_MODE (count_exp);
- if (!CONST_INT_P (count_exp))
- return Pmode;
- if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
- return DImode;
- return SImode;
-}
-
-/* Copy the address to a Pmode register. This is used for x32 to
- truncate a DImode TLS address to an SImode register. */
-
-static rtx
-ix86_copy_addr_to_reg (rtx addr)
-{
- rtx reg;
- if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
- {
- reg = copy_addr_to_reg (addr);
- REG_POINTER (reg) = 1;
- return reg;
- }
- else
- {
- gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
- reg = copy_to_mode_reg (DImode, addr);
- REG_POINTER (reg) = 1;
- return gen_rtx_SUBREG (SImode, reg, 0);
- }
-}
-
-/* When ISSETMEM is FALSE, output a simple loop to move memory pointed to by
- SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times; the overall size
- is COUNT bytes. When ISSETMEM is TRUE, output the equivalent loop to set
- memory with VALUE (supposed to be in MODE).
-
- The size is rounded down to a whole number of chunks moved at once.
- SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info. */
-
-
-static void
-expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
- rtx destptr, rtx srcptr, rtx value,
- rtx count, machine_mode mode, int unroll,
- int expected_size, bool issetmem)
-{
- rtx_code_label *out_label, *top_label;
- rtx iter, tmp;
- machine_mode iter_mode = counter_mode (count);
- int piece_size_n = GET_MODE_SIZE (mode) * unroll;
- rtx piece_size = GEN_INT (piece_size_n);
- rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
- rtx size;
- int i;
-
- top_label = gen_label_rtx ();
- out_label = gen_label_rtx ();
- iter = gen_reg_rtx (iter_mode);
-
- size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
- NULL, 1, OPTAB_DIRECT);
- /* Those two should combine. */
- if (piece_size == const1_rtx)
- {
- emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
- true, out_label);
- predict_jump (REG_BR_PROB_BASE * 10 / 100);
- }
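- /* The generated loop is: iter = 0; do { copy or set one unrolled
- chunk at base + iter; iter += piece_size; } while (iter < size);
- afterwards DESTPTR (and SRCPTR) are advanced by ITER. */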
- emit_move_insn (iter, const0_rtx);
-
- emit_label (top_label);
-
- tmp = convert_modes (Pmode, iter_mode, iter, true);
-
- /* This assert could be relaxed - in that case we'd need to compute
- the smallest power of two containing PIECE_SIZE_N and pass it to
- offset_address. */
- gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
- destmem = offset_address (destmem, tmp, piece_size_n);
- destmem = adjust_address (destmem, mode, 0);
-
- if (!issetmem)
- {
- srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
- srcmem = adjust_address (srcmem, mode, 0);
-
- /* When unrolling for chips that reorder memory reads and writes,
- we can save registers by using a single temporary.
- Also, using 4 temporaries is overkill in 32-bit mode. */
- if (!TARGET_64BIT && 0)
- {
- for (i = 0; i < unroll; i++)
- {
- if (i)
- {
- destmem = adjust_address (copy_rtx (destmem), mode,
- GET_MODE_SIZE (mode));
- srcmem = adjust_address (copy_rtx (srcmem), mode,
- GET_MODE_SIZE (mode));
- }
- emit_move_insn (destmem, srcmem);
- }
- }
- else
- {
- rtx tmpreg[4];
- gcc_assert (unroll <= 4);
- for (i = 0; i < unroll; i++)
- {
- tmpreg[i] = gen_reg_rtx (mode);
- if (i)
- srcmem = adjust_address (copy_rtx (srcmem), mode,
- GET_MODE_SIZE (mode));
- emit_move_insn (tmpreg[i], srcmem);
- }
- for (i = 0; i < unroll; i++)
- {
- if (i)
- destmem = adjust_address (copy_rtx (destmem), mode,
- GET_MODE_SIZE (mode));
- emit_move_insn (destmem, tmpreg[i]);
- }
- }
- }
- else
- for (i = 0; i < unroll; i++)
- {
- if (i)
- destmem = adjust_address (copy_rtx (destmem), mode,
- GET_MODE_SIZE (mode));
- emit_move_insn (destmem, value);
- }
-
- tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
- true, OPTAB_LIB_WIDEN);
- if (tmp != iter)
- emit_move_insn (iter, tmp);
-
- emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
- true, top_label);
- if (expected_size != -1)
- {
- expected_size /= GET_MODE_SIZE (mode) * unroll;
- if (expected_size == 0)
- predict_jump (0);
- else if (expected_size > REG_BR_PROB_BASE)
- predict_jump (REG_BR_PROB_BASE - 1);
- else
- predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
- / expected_size);
- }
- else
- predict_jump (REG_BR_PROB_BASE * 80 / 100);
- iter = ix86_zero_extend_to_Pmode (iter);
- tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
- true, OPTAB_LIB_WIDEN);
- if (tmp != destptr)
- emit_move_insn (destptr, tmp);
- if (!issetmem)
- {
- tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
- true, OPTAB_LIB_WIDEN);
- if (tmp != srcptr)
- emit_move_insn (srcptr, tmp);
- }
- emit_label (out_label);
-}
-
-/* Output a "rep; mov" or "rep; stos" instruction depending on the ISSETMEM
- argument. When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
- When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
- For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
- ORIG_VALUE is the original value passed to memset to fill the memory with.
- Other arguments have the same meaning as for the previous function. */
-
-static void
-expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
- rtx destptr, rtx srcptr, rtx value, rtx orig_value,
- rtx count,
- machine_mode mode, bool issetmem)
-{
- rtx destexp;
- rtx srcexp;
- rtx countreg;
- HOST_WIDE_INT rounded_count;
-
- /* If possible, it is shorter to use rep movs.
- TODO: Maybe it is better to move this logic to decide_alg. */
- if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
- && (!issetmem || orig_value == const0_rtx))
- mode = SImode;
-
- if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
- destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
-
- countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
- GET_MODE_SIZE (mode)));
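- /* DESTEXP (and SRCEXP below) describe the final value of the pointer
- after the rep instruction, i.e. the pointer plus the number of bytes
- processed; the rep patterns use them to express the pointer update. */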
- if (mode != QImode)
- {
- destexp = gen_rtx_ASHIFT (Pmode, countreg,
- GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
- destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
- }
- else
- destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
- if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
- {
- rounded_count
- = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
- destmem = shallow_copy_rtx (destmem);
- set_mem_size (destmem, rounded_count);
- }
- else if (MEM_SIZE_KNOWN_P (destmem))
- clear_mem_size (destmem);
-
- if (issetmem)
- {
- value = force_reg (mode, gen_lowpart (mode, value));
- emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
- }
- else
- {
- if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
- srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
- if (mode != QImode)
- {
- srcexp = gen_rtx_ASHIFT (Pmode, countreg,
- GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
- srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
- }
- else
- srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
- if (CONST_INT_P (count))
- {
- rounded_count
- = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
- srcmem = shallow_copy_rtx (srcmem);
- set_mem_size (srcmem, rounded_count);
- }
- else
- {
- if (MEM_SIZE_KNOWN_P (srcmem))
- clear_mem_size (srcmem);
- }
- emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
- destexp, srcexp));
- }
-}
-
-/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
- DESTMEM.
- SRCMEM is passed by pointer so it can be updated on return.
- The return value is the updated DESTMEM. */
-static rtx
-emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
- HOST_WIDE_INT size_to_move)
-{
- rtx dst = destmem, src = *srcmem, adjust, tempreg;
- enum insn_code code;
- machine_mode move_mode;
- int piece_size, i;
-
- /* Find the widest mode in which we could perform moves.
- Start with the biggest power of 2 not exceeding SIZE_TO_MOVE and halve
- it until a move of that size is supported. */
- piece_size = 1 << floor_log2 (size_to_move);
- while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
- || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
- {
- gcc_assert (piece_size > 1);
- piece_size >>= 1;
- }
-
- /* Find the corresponding vector mode with the same size as MOVE_MODE.
- MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
- if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
- {
- int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
- if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
- || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
- {
- move_mode = word_mode;
- piece_size = GET_MODE_SIZE (move_mode);
- code = optab_handler (mov_optab, move_mode);
- }
- }
- gcc_assert (code != CODE_FOR_nothing);
-
- dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
- src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
-
- /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
- gcc_assert (size_to_move % piece_size == 0);
- adjust = GEN_INT (piece_size);
- for (i = 0; i < size_to_move; i += piece_size)
- {
- /* We move from memory to memory, so we'll need to do it via
- a temporary register. */
- tempreg = gen_reg_rtx (move_mode);
- emit_insn (GEN_FCN (code) (tempreg, src));
- emit_insn (GEN_FCN (code) (dst, tempreg));
-
- emit_move_insn (destptr,
- gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
- emit_move_insn (srcptr,
- gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
-
- dst = adjust_automodify_address_nv (dst, move_mode, destptr,
- piece_size);
- src = adjust_automodify_address_nv (src, move_mode, srcptr,
- piece_size);
- }
-
- /* Update DST and SRC rtx. */
- *srcmem = src;
- return dst;
-}
-
-/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
-static void
-expand_movmem_epilogue (rtx destmem, rtx srcmem,
- rtx destptr, rtx srcptr, rtx count, int max_size)
-{
- rtx src, dest;
- if (CONST_INT_P (count))
- {
- HOST_WIDE_INT countval = INTVAL (count);
- HOST_WIDE_INT epilogue_size = countval % max_size;
- int i;
-
- /* For now MAX_SIZE should be a power of 2. This assert could be
- relaxed, but that would require a somewhat more complicated epilogue
- expansion. */
- gcc_assert ((max_size & (max_size - 1)) == 0);
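- /* Emit one move for every set bit of the epilogue size, from the
- largest piece down to a single byte. */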
- for (i = max_size; i >= 1; i >>= 1)
- {
- if (epilogue_size & i)
- destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
- }
- return;
- }
- if (max_size > 8)
- {
- count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
- count, 1, OPTAB_DIRECT);
- expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
- count, QImode, 1, 4, false);
- return;
- }
-
- /* When single-instruction stringops are available, we can cheaply advance
- the dest and src pointers. Otherwise we save code size by maintaining an
- offset (zero is readily available from the preceding rep operation) and
- using x86 addressing modes. */
- if (TARGET_SINGLE_STRINGOP)
- {
- if (max_size > 4)
- {
- rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
- src = change_address (srcmem, SImode, srcptr);
- dest = change_address (destmem, SImode, destptr);
- emit_insn (gen_strmov (destptr, dest, srcptr, src));
- emit_label (label);
- LABEL_NUSES (label) = 1;
- }
- if (max_size > 2)
- {
- rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
- src = change_address (srcmem, HImode, srcptr);
- dest = change_address (destmem, HImode, destptr);
- emit_insn (gen_strmov (destptr, dest, srcptr, src));
- emit_label (label);
- LABEL_NUSES (label) = 1;
- }
- if (max_size > 1)
- {
- rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
- src = change_address (srcmem, QImode, srcptr);
- dest = change_address (destmem, QImode, destptr);
- emit_insn (gen_strmov (destptr, dest, srcptr, src));
- emit_label (label);
- LABEL_NUSES (label) = 1;
- }
- }
- else
- {
- rtx offset = force_reg (Pmode, const0_rtx);
- rtx tmp;
-
- if (max_size > 4)
- {
- rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
- src = change_address (srcmem, SImode, srcptr);
- dest = change_address (destmem, SImode, destptr);
- emit_move_insn (dest, src);
- tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
- true, OPTAB_LIB_WIDEN);
- if (tmp != offset)
- emit_move_insn (offset, tmp);
- emit_label (label);
- LABEL_NUSES (label) = 1;
- }
- if (max_size > 2)
- {
- rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
- tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
- src = change_address (srcmem, HImode, tmp);
- tmp = gen_rtx_PLUS (Pmode, destptr, offset);
- dest = change_address (destmem, HImode, tmp);
- emit_move_insn (dest, src);
- tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
- true, OPTAB_LIB_WIDEN);
- if (tmp != offset)
- emit_move_insn (offset, tmp);
- emit_label (label);
- LABEL_NUSES (label) = 1;
- }
- if (max_size > 1)
- {
- rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
- tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
- src = change_address (srcmem, QImode, tmp);
- tmp = gen_rtx_PLUS (Pmode, destptr, offset);
- dest = change_address (destmem, QImode, tmp);
- emit_move_insn (dest, src);
- emit_label (label);
- LABEL_NUSES (label) = 1;
- }
- }
-}
-
-/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
- with value PROMOTED_VAL.
- The return value is the updated DESTMEM. */
-static rtx
-emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
- HOST_WIDE_INT size_to_move)
-{
- rtx dst = destmem, adjust;
- enum insn_code code;
- machine_mode move_mode;
- int piece_size, i;
-
- /* Use the mode of PROMOTED_VAL for the stores, narrowing it when
- SIZE_TO_MOVE is smaller than that mode. */
- move_mode = GET_MODE (promoted_val);
- if (move_mode == VOIDmode)
- move_mode = QImode;
- if (size_to_move < GET_MODE_SIZE (move_mode))
- {
- unsigned int move_bits = size_to_move * BITS_PER_UNIT;
- move_mode = int_mode_for_size (move_bits, 0).require ();
- promoted_val = gen_lowpart (move_mode, promoted_val);
- }
- piece_size = GET_MODE_SIZE (move_mode);
- code = optab_handler (mov_optab, move_mode);
- gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
-
- dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
-
- /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
- gcc_assert (size_to_move % piece_size == 0);
- adjust = GEN_INT (piece_size);
- for (i = 0; i < size_to_move; i += piece_size)
- {
- if (piece_size <= GET_MODE_SIZE (word_mode))
- {
- emit_insn (gen_strset (destptr, dst, promoted_val));
- dst = adjust_automodify_address_nv (dst, move_mode, destptr,
- piece_size);
- continue;
- }
-
- emit_insn (GEN_FCN (code) (dst, promoted_val));
-
- emit_move_insn (destptr,
- gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
-
- dst = adjust_automodify_address_nv (dst, move_mode, destptr,
- piece_size);
- }
-
- /* Update DST rtx. */
- return dst;
-}
-
-/* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
-static void
-expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
- rtx count, int max_size)
-{
- count = expand_simple_binop (counter_mode (count), AND, count,
- GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
- expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
- gen_lowpart (QImode, value), count, QImode,
- 1, max_size / 2, true);
-}
-
-/* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
-static void
-expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
- rtx count, int max_size)
-{
- rtx dest;
-
- if (CONST_INT_P (count))
- {
- HOST_WIDE_INT countval = INTVAL (count);
- HOST_WIDE_INT epilogue_size = countval % max_size;
- int i;
-
- /* For now MAX_SIZE should be a power of 2. This assert could be
- relaxed, but that would require a somewhat more complicated epilogue
- expansion. */
- gcc_assert ((max_size & (max_size - 1)) == 0);
- for (i = max_size; i >= 1; i >>= 1)
- {
- if (epilogue_size & i)
- {
- if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
- destmem = emit_memset (destmem, destptr, vec_value, i);
- else
- destmem = emit_memset (destmem, destptr, value, i);
- }
- }
- return;
- }
- if (max_size > 32)
- {
- expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
- return;
- }
- if (max_size > 16)
- {
- rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
- if (TARGET_64BIT)
- {
- dest = change_address (destmem, DImode, destptr);
- emit_insn (gen_strset (destptr, dest, value));
- dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
- emit_insn (gen_strset (destptr, dest, value));
- }
- else
- {
- dest = change_address (destmem, SImode, destptr);
- emit_insn (gen_strset (destptr, dest, value));
- dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
- emit_insn (gen_strset (destptr, dest, value));
- dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
- emit_insn (gen_strset (destptr, dest, value));
- dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
- emit_insn (gen_strset (destptr, dest, value));
- }
- emit_label (label);
- LABEL_NUSES (label) = 1;
- }
- if (max_size > 8)
- {
- rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
- if (TARGET_64BIT)
- {
- dest = change_address (destmem, DImode, destptr);
- emit_insn (gen_strset (destptr, dest, value));
- }
- else
- {
- dest = change_address (destmem, SImode, destptr);
- emit_insn (gen_strset (destptr, dest, value));
- dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
- emit_insn (gen_strset (destptr, dest, value));
- }
- emit_label (label);
- LABEL_NUSES (label) = 1;
- }
- if (max_size > 4)
- {
- rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
- dest = change_address (destmem, SImode, destptr);
- emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
- emit_label (label);
- LABEL_NUSES (label) = 1;
- }
- if (max_size > 2)
- {
- rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
- dest = change_address (destmem, HImode, destptr);
- emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
- emit_label (label);
- LABEL_NUSES (label) = 1;
- }
- if (max_size > 1)
- {
- rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
- dest = change_address (destmem, QImode, destptr);
- emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
- emit_label (label);
- LABEL_NUSES (label) = 1;
- }
-}
-
-/* Depending on ISSETMEM, copy enough bytes from SRCMEM to DESTMEM or store
- enough bytes into DESTMEM to align it to DESIRED_ALIGNMENT. The original
- alignment is ALIGN.
- Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
- ignored.
- The return value is the updated DESTMEM. */
-static rtx
-expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
- rtx destptr, rtx srcptr, rtx value,
- rtx vec_value, rtx count, int align,
- int desired_alignment, bool issetmem)
-{
- int i;
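- /* For each power-of-two chunk below DESIRED_ALIGNMENT that ALIGN does
- not already guarantee, test the corresponding bit of DESTPTR and, if
- it is set, copy or set that many bytes and adjust COUNT. */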
- for (i = 1; i < desired_alignment; i <<= 1)
- {
- if (align <= i)
- {
- rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
- if (issetmem)
- {
- if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
- destmem = emit_memset (destmem, destptr, vec_value, i);
- else
- destmem = emit_memset (destmem, destptr, value, i);
- }
- else
- destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
- ix86_adjust_counter (count, i);
- emit_label (label);
- LABEL_NUSES (label) = 1;
- set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
- }
- }
- return destmem;
-}
-
-/* Test if COUNT & SIZE is nonzero and if so, expand a movmem
- or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
- and jump to DONE_LABEL. */
-static void
-expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
- rtx destptr, rtx srcptr,
- rtx value, rtx vec_value,
- rtx count, int size,
- rtx done_label, bool issetmem)
-{
- rtx_code_label *label = ix86_expand_aligntest (count, size, false);
- machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
- rtx modesize;
- int n;
-
- /* If we do not have a vector value to copy, we must reduce the size. */
- if (issetmem)
- {
- if (!vec_value)
- {
- if (GET_MODE (value) == VOIDmode && size > 8)
- mode = Pmode;
- else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
- mode = GET_MODE (value);
- }
- else
- mode = GET_MODE (vec_value), value = vec_value;
- }
- else
- {
- /* Choose appropriate vector mode. */
- if (size >= 32)
- mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
- else if (size >= 16)
- mode = TARGET_SSE ? V16QImode : DImode;
- srcmem = change_address (srcmem, mode, srcptr);
- }
- destmem = change_address (destmem, mode, destptr);
- modesize = GEN_INT (GET_MODE_SIZE (mode));
- gcc_assert (GET_MODE_SIZE (mode) <= size);
- for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
- {
- if (issetmem)
- emit_move_insn (destmem, gen_lowpart (mode, value));
- else
- {
- emit_move_insn (destmem, srcmem);
- srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
- }
- destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
- }
-
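- /* Also copy or set the last SIZE bytes, addressed relative to
- DESTPTR + COUNT; together with the first SIZE bytes handled above
- this covers any length in [SIZE, 2*SIZE), the two regions possibly
- overlapping. */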
- destmem = offset_address (destmem, count, 1);
- destmem = offset_address (destmem, GEN_INT (-2 * size),
- GET_MODE_SIZE (mode));
- if (!issetmem)
- {
- srcmem = offset_address (srcmem, count, 1);
- srcmem = offset_address (srcmem, GEN_INT (-2 * size),
- GET_MODE_SIZE (mode));
- }
- for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
- {
- if (issetmem)
- emit_move_insn (destmem, gen_lowpart (mode, value));
- else
- {
- emit_move_insn (destmem, srcmem);
- srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
- }
- destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
- }
- emit_jump_insn (gen_jump (done_label));
- emit_barrier ();
-
- emit_label (label);
- LABEL_NUSES (label) = 1;
-}
-
-/* Handle a small memcpy (up to SIZE bytes, where SIZE is supposed to be a
- small power of 2) and get ready for the main memcpy loop by copying the
- initial DESIRED_ALIGN-ALIGN bytes and the last SIZE bytes, adjusting
- DESTPTR/SRCPTR/COUNT in a way that lets us proceed with a loop copying SIZE
- bytes at once. Do moves in MODE.
- DONE_LABEL is a label after the whole copying sequence. The label is created
- on demand if *DONE_LABEL is NULL.
- MIN_SIZE is the minimal size of the block copied. This value gets adjusted
- for the new bounds after the initial copies.
-
- DESTMEM/SRCMEM are memory expressions pointing to the copied block,
- DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
- we will dispatch to a library call for large blocks.
-
- In pseudocode we do:
-
- if (COUNT < SIZE)
- {
- Assume that SIZE is 4. Bigger sizes are handled analogously
- if (COUNT & 4)
- {
- copy 4 bytes from SRCPTR to DESTPTR
- copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
- goto done_label
- }
- if (!COUNT)
- goto done_label;
- copy 1 byte from SRCPTR to DESTPTR
- if (COUNT & 2)
- {
- copy 2 bytes from SRCPTR to DESTPTR
- copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
- }
- }
- else
- {
- copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
- copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
-
- OLD_DESTPTR = DESTPTR;
- Align DESTPTR up to DESIRED_ALIGN
- SRCPTR += DESTPTR - OLD_DESTPTR
- COUNT -= DESTPTR - OLD_DESTPTR
- if (DYNAMIC_CHECK)
- Round COUNT down to multiple of SIZE
- << optional caller supplied zero size guard is here >>
- << optional caller supplied dynamic check is here >>
- << caller supplied main copy loop is here >>
- }
- done_label:
- */
-static void
-expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
- rtx *destptr, rtx *srcptr,
- machine_mode mode,
- rtx value, rtx vec_value,
- rtx *count,
- rtx_code_label **done_label,
- int size,
- int desired_align,
- int align,
- unsigned HOST_WIDE_INT *min_size,
- bool dynamic_check,
- bool issetmem)
-{
- rtx_code_label *loop_label = NULL, *label;
- int n;
- rtx modesize;
- int prolog_size = 0;
- rtx mode_value;
-
- /* Choose the proper value to copy. */
- if (issetmem && VECTOR_MODE_P (mode))
- mode_value = vec_value;
- else
- mode_value = value;
- gcc_assert (GET_MODE_SIZE (mode) <= size);
-
- /* See if block is big or small, handle small blocks. */
- if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
- {
- int size2 = size;
- loop_label = gen_label_rtx ();
-
- if (!*done_label)
- *done_label = gen_label_rtx ();
-
- emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
- 1, loop_label);
- size2 >>= 1;
-
- /* Handle sizes > 3. */
- for (;size2 > 2; size2 >>= 1)
- expand_small_movmem_or_setmem (destmem, srcmem,
- *destptr, *srcptr,
- value, vec_value,
- *count,
- size2, *done_label, issetmem);
- /* Nothing to copy? Jump to DONE_LABEL if so. */
- emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
- 1, *done_label);
-
- /* Do a byte copy. */
- destmem = change_address (destmem, QImode, *destptr);
- if (issetmem)
- emit_move_insn (destmem, gen_lowpart (QImode, value));
- else
- {
- srcmem = change_address (srcmem, QImode, *srcptr);
- emit_move_insn (destmem, srcmem);
- }
-
- /* Handle sizes 2 and 3. */
- label = ix86_expand_aligntest (*count, 2, false);
- destmem = change_address (destmem, HImode, *destptr);
- destmem = offset_address (destmem, *count, 1);
- destmem = offset_address (destmem, GEN_INT (-2), 2);
- if (issetmem)
- emit_move_insn (destmem, gen_lowpart (HImode, value));
- else
- {
- srcmem = change_address (srcmem, HImode, *srcptr);
- srcmem = offset_address (srcmem, *count, 1);
- srcmem = offset_address (srcmem, GEN_INT (-2), 2);
- emit_move_insn (destmem, srcmem);
- }
-
- emit_label (label);
- LABEL_NUSES (label) = 1;
- emit_jump_insn (gen_jump (*done_label));
- emit_barrier ();
- }
- else
- gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
- || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
-
- /* Start memcpy for COUNT >= SIZE. */
- if (loop_label)
- {
- emit_label (loop_label);
- LABEL_NUSES (loop_label) = 1;
- }
-
- /* Copy first desired_align bytes. */
- if (!issetmem)
- srcmem = change_address (srcmem, mode, *srcptr);
- destmem = change_address (destmem, mode, *destptr);
- modesize = GEN_INT (GET_MODE_SIZE (mode));
- for (n = 0; prolog_size < desired_align - align; n++)
- {
- if (issetmem)
- emit_move_insn (destmem, mode_value);
- else
- {
- emit_move_insn (destmem, srcmem);
- srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
- }
- destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
- prolog_size += GET_MODE_SIZE (mode);
- }
-
-
- /* Copy last SIZE bytes. */
- destmem = offset_address (destmem, *count, 1);
- destmem = offset_address (destmem,
- GEN_INT (-size - prolog_size),
- 1);
- if (issetmem)
- emit_move_insn (destmem, mode_value);
- else
- {
- srcmem = offset_address (srcmem, *count, 1);
- srcmem = offset_address (srcmem,
- GEN_INT (-size - prolog_size),
- 1);
- emit_move_insn (destmem, srcmem);
- }
- for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
- {
- destmem = offset_address (destmem, modesize, 1);
- if (issetmem)
- emit_move_insn (destmem, mode_value);
- else
- {
- srcmem = offset_address (srcmem, modesize, 1);
- emit_move_insn (destmem, srcmem);
- }
- }
-
- /* Align destination. */
- if (desired_align > 1 && desired_align > align)
- {
- rtx saveddest = *destptr;
-
- gcc_assert (desired_align <= size);
- /* Align destptr up and place the result in a new register. */
- *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
- GEN_INT (prolog_size),
- NULL_RTX, 1, OPTAB_DIRECT);
- if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
- REG_POINTER (*destptr) = 1;
- *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
- GEN_INT (-desired_align),
- *destptr, 1, OPTAB_DIRECT);
- /* See how many bytes we skipped. */
- saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
- *destptr,
- saveddest, 1, OPTAB_DIRECT);
- /* Adjust srcptr and count. */
- if (!issetmem)
- *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
- saveddest, *srcptr, 1, OPTAB_DIRECT);
- *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
- saveddest, *count, 1, OPTAB_DIRECT);
- /* We copied at most size + prolog_size. */
- if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
- *min_size
- = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
- else
- *min_size = 0;
-
- /* Our loops always round down the block size, but for the dispatch to
- the library call we need the precise value. */
- if (dynamic_check)
- *count = expand_simple_binop (GET_MODE (*count), AND, *count,
- GEN_INT (-size), *count, 1, OPTAB_DIRECT);
- }
- else
- {
- gcc_assert (prolog_size == 0);
- /* Decrease count, so we won't end up copying last word twice. */
- if (!CONST_INT_P (*count))
- *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
- constm1_rtx, *count, 1, OPTAB_DIRECT);
- else
- *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
- (unsigned HOST_WIDE_INT)size));
- if (*min_size)
- *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
- }
-}
-
-
-/* This function is like the previous one, except here we know how many bytes
- need to be copied. That allows us to update alignment not only of DST, which
- is returned, but also of SRC, which is passed as a pointer for that
- reason. */
-static rtx
-expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
- rtx srcreg, rtx value, rtx vec_value,
- int desired_align, int align_bytes,
- bool issetmem)
-{
- rtx src = NULL;
- rtx orig_dst = dst;
- rtx orig_src = NULL;
- int piece_size = 1;
- int copied_bytes = 0;
-
- if (!issetmem)
- {
- gcc_assert (srcp != NULL);
- src = *srcp;
- orig_src = src;
- }
-
- for (piece_size = 1;
- piece_size <= desired_align && copied_bytes < align_bytes;
- piece_size <<= 1)
- {
- if (align_bytes & piece_size)
- {
- if (issetmem)
- {
- if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
- dst = emit_memset (dst, destreg, vec_value, piece_size);
- else
- dst = emit_memset (dst, destreg, value, piece_size);
- }
- else
- dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
- copied_bytes += piece_size;
- }
- }
- if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
- set_mem_align (dst, desired_align * BITS_PER_UNIT);
- if (MEM_SIZE_KNOWN_P (orig_dst))
- set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
-
- if (!issetmem)
- {
- int src_align_bytes = get_mem_align_offset (src, desired_align
- * BITS_PER_UNIT);
- if (src_align_bytes >= 0)
- src_align_bytes = desired_align - src_align_bytes;
- if (src_align_bytes >= 0)
- {
- unsigned int src_align;
- for (src_align = desired_align; src_align >= 2; src_align >>= 1)
- {
- if ((src_align_bytes & (src_align - 1))
- == (align_bytes & (src_align - 1)))
- break;
- }
- if (src_align > (unsigned int) desired_align)
- src_align = desired_align;
- if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
- set_mem_align (src, src_align * BITS_PER_UNIT);
- }
- if (MEM_SIZE_KNOWN_P (orig_src))
- set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
- *srcp = src;
- }
-
- return dst;
-}
-
-/* Return true if ALG can be used in current context.
- Assume we expand memset if MEMSET is true. */
-static bool
-alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
-{
- if (alg == no_stringop)
- return false;
- if (alg == vector_loop)
- return TARGET_SSE || TARGET_AVX;
- /* Algorithms using the rep prefix want at least edi and ecx;
- additionally, memset wants eax and memcpy wants esi. Don't
- consider such algorithms if the user has appropriated those
- registers for their own purposes, or if we have a non-default
- address space, since some string insns cannot override the segment. */
- if (alg == rep_prefix_1_byte
- || alg == rep_prefix_4_byte
- || alg == rep_prefix_8_byte)
- {
- if (have_as)
- return false;
- if (fixed_regs[CX_REG]
- || fixed_regs[DI_REG]
- || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
- return false;
- }
- return true;
-}
-
-/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
-static enum stringop_alg
-decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
- unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
- bool memset, bool zero_memset, bool have_as,
- int *dynamic_check, bool *noalign, bool recur)
-{
- const struct stringop_algs *algs;
- bool optimize_for_speed;
- int max = 0;
- const struct processor_costs *cost;
- int i;
- bool any_alg_usable_p = false;
-
- *noalign = false;
- *dynamic_check = -1;
-
- /* Even if the string operation call is cold, we still might spend a lot
- of time processing large blocks. */
- if (optimize_function_for_size_p (cfun)
- || (optimize_insn_for_size_p ()
- && (max_size < 256
- || (expected_size != -1 && expected_size < 256))))
- optimize_for_speed = false;
- else
- optimize_for_speed = true;
-
- cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
- if (memset)
- algs = &cost->memset[TARGET_64BIT != 0];
- else
- algs = &cost->memcpy[TARGET_64BIT != 0];
-
- /* Find the maximal size covered by a usable non-libcall algorithm. */
- for (i = 0; i < MAX_STRINGOP_ALGS; i++)
- {
- enum stringop_alg candidate = algs->size[i].alg;
- bool usable = alg_usable_p (candidate, memset, have_as);
- any_alg_usable_p |= usable;
-
- if (candidate != libcall && candidate && usable)
- max = algs->size[i].max;
- }
-
- /* If the expected size is not known but the max size is small enough
- that the inline version is a win, set the expected size into
- the range. */
- if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
- && expected_size == -1)
- expected_size = min_size / 2 + max_size / 2;
-
- /* If user specified the algorithm, honor it if possible. */
- if (ix86_stringop_alg != no_stringop
- && alg_usable_p (ix86_stringop_alg, memset, have_as))
- return ix86_stringop_alg;
- /* rep; movq or rep; movl is the smallest variant. */
- else if (!optimize_for_speed)
- {
- *noalign = true;
- if (!count || (count & 3) || (memset && !zero_memset))
- return alg_usable_p (rep_prefix_1_byte, memset, have_as)
- ? rep_prefix_1_byte : loop_1_byte;
- else
- return alg_usable_p (rep_prefix_4_byte, memset, have_as)
- ? rep_prefix_4_byte : loop;
- }
- /* Very tiny blocks are best handled via the loop; REP is expensive to
- set up. */
- else if (expected_size != -1 && expected_size < 4)
- return loop_1_byte;
- else if (expected_size != -1)
- {
- enum stringop_alg alg = libcall;
- bool alg_noalign = false;
- for (i = 0; i < MAX_STRINGOP_ALGS; i++)
- {
- /* We get here if the algorithms that were not libcall-based
- were rep-prefix based and we are unable to use rep prefixes
- based on global register usage. Break out of the loop and
- use the heuristic below. */
- if (algs->size[i].max == 0)
- break;
- if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
- {
- enum stringop_alg candidate = algs->size[i].alg;
-
- if (candidate != libcall
- && alg_usable_p (candidate, memset, have_as))
- {
- alg = candidate;
- alg_noalign = algs->size[i].noalign;
- }
- /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
- last non-libcall inline algorithm. */
- if (TARGET_INLINE_ALL_STRINGOPS)
- {
- /* When the current size is best to be copied by a libcall,
- but we are still forced to inline, run the heuristic below
- that will pick code for medium sized blocks. */
- if (alg != libcall)
- {
- *noalign = alg_noalign;
- return alg;
- }
- else if (!any_alg_usable_p)
- break;
- }
- else if (alg_usable_p (candidate, memset, have_as))
- {
- *noalign = algs->size[i].noalign;
- return candidate;
- }
- }
- }
- }
- /* When asked to inline the call anyway, try to pick a meaningful choice.
- We look for the maximal size of a block that is faster to copy by hand
- and take blocks of at most that size, guessing that the average size
- will be roughly half of the block.
-
- If this turns out to be bad, we might simply specify the preferred
- choice in ix86_costs. */
- if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
- && (algs->unknown_size == libcall
- || !alg_usable_p (algs->unknown_size, memset, have_as)))
- {
- enum stringop_alg alg;
- HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
-
- /* If there aren't any usable algorithms or if recursing already,
- then recursing on smaller sizes or same size isn't going to
- find anything. Just return the simple byte-at-a-time copy loop. */
- if (!any_alg_usable_p || recur)
- {
- /* Pick something reasonable. */
- if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
- *dynamic_check = 128;
- return loop_1_byte;
- }
- alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
- zero_memset, have_as, dynamic_check, noalign, true);
- gcc_assert (*dynamic_check == -1);
- if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
- *dynamic_check = max;
- else
- gcc_assert (alg != libcall);
- return alg;
- }
- return (alg_usable_p (algs->unknown_size, memset, have_as)
- ? algs->unknown_size : libcall);
-}
-
-/* Decide on alignment. We know that the operand is already aligned to ALIGN
- (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
-static int
-decide_alignment (int align,
- enum stringop_alg alg,
- int expected_size,
- machine_mode move_mode)
-{
- int desired_align = 0;
-
- gcc_assert (alg != no_stringop);
-
- if (alg == libcall)
- return 0;
- if (move_mode == VOIDmode)
- return 0;
-
- desired_align = GET_MODE_SIZE (move_mode);
- /* PentiumPro has special logic that triggers for 8-byte aligned blocks,
- copying a whole cache line at once. */
- if (TARGET_PENTIUMPRO
- && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
- desired_align = 8;
-
- if (optimize_size)
- desired_align = 1;
- if (desired_align < align)
- desired_align = align;
- if (expected_size != -1 && expected_size < 4)
- desired_align = align;
-
- return desired_align;
-}
-
-
-/* Helper function for memset. For a QImode value 0xXY produce
- 0xXYXYXYXY of the width specified by MODE. This is essentially
- a * 0x01010101, but we can do slightly better than
- synth_mult by unwinding the sequence by hand on CPUs with
- slow multiply. */
-static rtx
-promote_duplicated_reg (machine_mode mode, rtx val)
-{
- machine_mode valmode = GET_MODE (val);
- rtx tmp;
- int nops = mode == DImode ? 3 : 2;
-
- gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
- if (val == const0_rtx)
- return copy_to_mode_reg (mode, CONST0_RTX (mode));
- if (CONST_INT_P (val))
- {
- HOST_WIDE_INT v = INTVAL (val) & 255;
-
- v |= v << 8;
- v |= v << 16;
- if (mode == DImode)
- v |= (v << 16) << 16;
- return copy_to_mode_reg (mode, gen_int_mode (v, mode));
- }
-
- if (valmode == VOIDmode)
- valmode = QImode;
- if (valmode != QImode)
- val = gen_lowpart (QImode, val);
- if (mode == QImode)
- return val;
- if (!TARGET_PARTIAL_REG_STALL)
- nops--;
- if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
- + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
- <= (ix86_cost->shift_const + ix86_cost->add) * nops
- + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
- {
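- /* Multiply path: replicate the byte by multiplying the
- zero-extended value by 0x01...01, itself built recursively. */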
- rtx reg = convert_modes (mode, QImode, val, true);
- tmp = promote_duplicated_reg (mode, const1_rtx);
- return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
- OPTAB_DIRECT);
- }
- else
- {
- rtx reg = convert_modes (mode, QImode, val, true);
-
- if (!TARGET_PARTIAL_REG_STALL)
- if (mode == SImode)
- emit_insn (gen_insvsi_1 (reg, reg));
- else
- emit_insn (gen_insvdi_1 (reg, reg));
- else
- {
- tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
- NULL, 1, OPTAB_DIRECT);
- reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
- OPTAB_DIRECT);
- }
- tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
- NULL, 1, OPTAB_DIRECT);
- reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
- if (mode == SImode)
- return reg;
- tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
- NULL, 1, OPTAB_DIRECT);
- reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
- return reg;
- }
-}
-
-/* Duplicate the value VAL using promote_duplicated_reg into the maximal size
- that will be needed by the main loop copying SIZE_NEEDED chunks and by the
- prologue getting the alignment from ALIGN to DESIRED_ALIGN. */
-static rtx
-promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
- int align)
-{
- rtx promoted_val;
-
- if (TARGET_64BIT
- && (size_needed > 4 || (desired_align > align && desired_align > 4)))
- promoted_val = promote_duplicated_reg (DImode, val);
- else if (size_needed > 2 || (desired_align > align && desired_align > 2))
- promoted_val = promote_duplicated_reg (SImode, val);
- else if (size_needed > 1 || (desired_align > align && desired_align > 1))
- promoted_val = promote_duplicated_reg (HImode, val);
- else
- promoted_val = val;
-
- return promoted_val;
-}
-
-/* Expand a string move (memcpy) or store (memset) operation. Use i386 string
- operations when profitable. The code depends upon architecture, block size
- and alignment, but always has one of the following overall structures:
-
- Aligned move sequence:
-
- 1) Prologue guard: Conditional that jumps up to epilogues for small
- blocks that can be handled by the epilogue alone. This is faster
- but also needed for correctness, since the prologue assumes the block
- is larger than the desired alignment.
-
- Optional dynamic check for size and libcall for large
- blocks is emitted here too, with -minline-stringops-dynamically.
-
- 2) Prologue: copy first few bytes in order to get destination
- aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
- than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
- copied. We emit either a jump tree on power of two sized
- blocks, or a byte loop.
-
- 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
- with specified algorithm.
-
- 4) Epilogue: code copying tail of the block that is too small to be
- handled by main body (or up to size guarded by prologue guard).
-
- Misaligned move sequence
-
- 1) Misaligned move prologue/epilogue containing:
- a) Prologue handling small memory blocks and jumping to done_label
- (skipped if blocks are known to be large enough)
- b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment
- is needed, performed by one possibly misaligned move
- (skipped if alignment is not needed)
- c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
-
- 2) Zero size guard dispatching to done_label, if needed
-
- 3) dispatch to library call, if needed,
-
- 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
- with specified algorithm. */
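-/* For concreteness: on a 64-bit target the unrolled_loop algorithm below
- uses word-sized (DImode) moves with an unroll factor of 4, so the main
- body copies SIZE_NEEDED == 32 bytes per iteration, the prologue copies
- at most DESIRED_ALIGN - ALIGN bytes to align the destination, and the
- epilogue handles the remaining tail of fewer than 32 bytes. */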
-bool
-ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
- rtx align_exp, rtx expected_align_exp,
- rtx expected_size_exp, rtx min_size_exp,
- rtx max_size_exp, rtx probable_max_size_exp,
- bool issetmem)
-{
- rtx destreg;
- rtx srcreg = NULL;
- rtx_code_label *label = NULL;
- rtx tmp;
- rtx_code_label *jump_around_label = NULL;
- HOST_WIDE_INT align = 1;
- unsigned HOST_WIDE_INT count = 0;
- HOST_WIDE_INT expected_size = -1;
- int size_needed = 0, epilogue_size_needed;
- int desired_align = 0, align_bytes = 0;
- enum stringop_alg alg;
- rtx promoted_val = NULL;
- rtx vec_promoted_val = NULL;
- bool force_loopy_epilogue = false;
- int dynamic_check;
- bool need_zero_guard = false;
- bool noalign;
- machine_mode move_mode = VOIDmode;
- machine_mode wider_mode;
- int unroll_factor = 1;
- /* TODO: Once value ranges are available, fill in proper data. */
- unsigned HOST_WIDE_INT min_size = 0;
- unsigned HOST_WIDE_INT max_size = -1;
- unsigned HOST_WIDE_INT probable_max_size = -1;
- bool misaligned_prologue_used = false;
- bool have_as;
-
- if (CONST_INT_P (align_exp))
- align = INTVAL (align_exp);
- /* i386 can do misaligned access at a reasonably increased cost. */
- if (CONST_INT_P (expected_align_exp)
- && INTVAL (expected_align_exp) > align)
- align = INTVAL (expected_align_exp);
- /* ALIGN is the minimum of destination and source alignment, but we care here
- just about destination alignment. */
- else if (!issetmem
- && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
- align = MEM_ALIGN (dst) / BITS_PER_UNIT;
-
- if (CONST_INT_P (count_exp))
- {
- min_size = max_size = probable_max_size = count = expected_size
- = INTVAL (count_exp);
- /* When COUNT is 0, there is nothing to do. */
- if (!count)
- return true;
- }
- else
- {
- if (min_size_exp)
- min_size = INTVAL (min_size_exp);
- if (max_size_exp)
- max_size = INTVAL (max_size_exp);
- if (probable_max_size_exp)
- probable_max_size = INTVAL (probable_max_size_exp);
- if (CONST_INT_P (expected_size_exp))
- expected_size = INTVAL (expected_size_exp);
- }
-
- /* Make sure we don't need to care about overflow later on. */
- if (count > (HOST_WIDE_INT_1U << 30))
- return false;
-
- have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
- if (!issetmem)
- have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
-
- /* Step 0: Decide on preferred algorithm, desired alignment and
- size of chunks to be copied by main loop. */
- alg = decide_alg (count, expected_size, min_size, probable_max_size,
- issetmem,
- issetmem && val_exp == const0_rtx, have_as,
- &dynamic_check, &noalign, false);
-
- if (dump_file)
- fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
- stringop_alg_names[alg]);
-
- if (alg == libcall)
- return false;
- gcc_assert (alg != no_stringop);
-
- /* For now the vector version of memset is generated only for memory zeroing,
- as creating a promoted vector value is very cheap in this case. */
- if (issetmem && alg == vector_loop && val_exp != const0_rtx)
- alg = unrolled_loop;
-
- if (!count)
- count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
- destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
- if (!issetmem)
- srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
-
- unroll_factor = 1;
- move_mode = word_mode;
- switch (alg)
- {
- case libcall:
- case no_stringop:
- case last_alg:
- gcc_unreachable ();
- case loop_1_byte:
- need_zero_guard = true;
- move_mode = QImode;
- break;
- case loop:
- need_zero_guard = true;
- break;
- case unrolled_loop:
- need_zero_guard = true;
- unroll_factor = (TARGET_64BIT ? 4 : 2);
- break;
- case vector_loop:
- need_zero_guard = true;
- unroll_factor = 4;
- /* Find the widest supported mode. */
- move_mode = word_mode;
- while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
- && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
- move_mode = wider_mode;
-
- if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128)
- move_mode = TImode;
-
- /* Find the corresponding vector mode with the same size as MOVE_MODE.
- MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
- if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
- {
- int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
- if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
- || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
- move_mode = word_mode;
- }
- gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
- break;
- case rep_prefix_8_byte:
- move_mode = DImode;
- break;
- case rep_prefix_4_byte:
- move_mode = SImode;
- break;
- case rep_prefix_1_byte:
- move_mode = QImode;
- break;
- }
- size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
- epilogue_size_needed = size_needed;
-
- /* If we are going to emit any library calls conditionally, make sure any
- pending stack adjustment happens before the first conditional branch,
- otherwise it will be emitted before the library call only and won't
- happen on the other branches. */
- if (dynamic_check != -1)
- do_pending_stack_adjust ();
-
- desired_align = decide_alignment (align, alg, expected_size, move_mode);
- if (!TARGET_ALIGN_STRINGOPS || noalign)
- align = desired_align;
-
- /* Step 1: Prologue guard. */
-
- /* Alignment code needs count to be in register. */
- if (CONST_INT_P (count_exp) && desired_align > align)
- {
- if (INTVAL (count_exp) > desired_align
- && INTVAL (count_exp) > size_needed)
- {
- align_bytes
- = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
- if (align_bytes <= 0)
- align_bytes = 0;
- else
- align_bytes = desired_align - align_bytes;
- }
- if (align_bytes == 0)
- count_exp = force_reg (counter_mode (count_exp), count_exp);
- }
- gcc_assert (desired_align >= 1 && align >= 1);
-
- /* Misaligned move sequences handle both prologue and epilogue at once.
- Default code generation results in smaller code for large alignments
- and also avoids redundant work when sizes are known precisely. */
- misaligned_prologue_used
- = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
- && MAX (desired_align, epilogue_size_needed) <= 32
- && desired_align <= epilogue_size_needed
- && ((desired_align > align && !align_bytes)
- || (!count && epilogue_size_needed > 1)));
-
- /* Do the cheap promotion to allow better CSE across the
- main loop and epilogue (i.e. one load of the big constant in
- front of all the code).
- For now the misaligned move sequences do not have a fast path
- without broadcasting. */
- if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
- {
- if (alg == vector_loop)
- {
- gcc_assert (val_exp == const0_rtx);
- vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
- promoted_val = promote_duplicated_reg_to_size (val_exp,
- GET_MODE_SIZE (word_mode),
- desired_align, align);
- }
- else
- {
- promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
- desired_align, align);
- }
- }
- /* Misaligned move sequences handle both prologues and epilogues at once.
- Default code generation results in smaller code for large alignments and
- also avoids redundant work when sizes are known precisely. */
- if (misaligned_prologue_used)
- {
- /* The misaligned move prologue handles small blocks by itself. */
- expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
- (dst, src, &destreg, &srcreg,
- move_mode, promoted_val, vec_promoted_val,
- &count_exp,
- &jump_around_label,
- desired_align < align
- ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
- desired_align, align, &min_size, dynamic_check, issetmem);
- if (!issetmem)
- src = change_address (src, BLKmode, srcreg);
- dst = change_address (dst, BLKmode, destreg);
- set_mem_align (dst, desired_align * BITS_PER_UNIT);
- epilogue_size_needed = 0;
- if (need_zero_guard
- && min_size < (unsigned HOST_WIDE_INT) size_needed)
- {
- /* It is possible that we copied enough so the main loop will not
- execute. */
- gcc_assert (size_needed > 1);
- if (jump_around_label == NULL_RTX)
- jump_around_label = gen_label_rtx ();
- emit_cmp_and_jump_insns (count_exp,
- GEN_INT (size_needed),
- LTU, 0, counter_mode (count_exp), 1, jump_around_label);
- if (expected_size == -1
- || expected_size < (desired_align - align) / 2 + size_needed)
- predict_jump (REG_BR_PROB_BASE * 20 / 100);
- else
- predict_jump (REG_BR_PROB_BASE * 60 / 100);
- }
- }
- /* Ensure that alignment prologue won't copy past end of block. */
- else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
- {
- epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
- /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
- Make sure it is power of 2. */
- epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
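- /* For example, SIZE_NEEDED == 24 and DESIRED_ALIGN - ALIGN == 7 give
- MAX (23, 7) == 23, so EPILOGUE_SIZE_NEEDED becomes
- 1 << (floor_log2 (23) + 1) == 32. */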
-
- /* To improve performance of small blocks, we jump around the VAL
- promoting code. This means that if the promoted VAL is not constant,
- we might not use it in the epilogue and have to fall back to the byte
- loop variant. */
- if (issetmem && epilogue_size_needed > 2 && !promoted_val)
- force_loopy_epilogue = true;
- if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
- || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
- {
- /* If main algorithm works on QImode, no epilogue is needed.
- For small sizes just don't align anything. */
- if (size_needed == 1)
- desired_align = align;
- else
- goto epilogue;
- }
- else if (!count
- && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
- {
- label = gen_label_rtx ();
- emit_cmp_and_jump_insns (count_exp,
- GEN_INT (epilogue_size_needed),
- LTU, 0, counter_mode (count_exp), 1, label);
- if (expected_size == -1 || expected_size < epilogue_size_needed)
- predict_jump (REG_BR_PROB_BASE * 60 / 100);
- else
- predict_jump (REG_BR_PROB_BASE * 20 / 100);
- }
- }
-
- /* Emit code to decide at runtime whether a library call or inline code
- should be used. */
- if (dynamic_check != -1)
- {
- if (!issetmem && CONST_INT_P (count_exp))
- {
- if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
- {
- emit_block_copy_via_libcall (dst, src, count_exp);
- count_exp = const0_rtx;
- goto epilogue;
- }
- }
- else
- {
- rtx_code_label *hot_label = gen_label_rtx ();
- if (jump_around_label == NULL_RTX)
- jump_around_label = gen_label_rtx ();
- emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
- LEU, 0, counter_mode (count_exp),
- 1, hot_label);
- predict_jump (REG_BR_PROB_BASE * 90 / 100);
- if (issetmem)
- set_storage_via_libcall (dst, count_exp, val_exp);
- else
- emit_block_copy_via_libcall (dst, src, count_exp);
- emit_jump (jump_around_label);
- emit_label (hot_label);
- }
- }
-
- /* Step 2: Alignment prologue. */
- /* Do the expensive promotion once we branched off the small blocks. */
- if (issetmem && !promoted_val)
- promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
- desired_align, align);
-
- if (desired_align > align && !misaligned_prologue_used)
- {
- if (align_bytes == 0)
- {
- /* Except for the first move in the prologue, we no longer know
- the constant offset in the aliasing info. It doesn't seem worth
- the pain to maintain it for the first move, so throw away
- the info early. */
- dst = change_address (dst, BLKmode, destreg);
- if (!issetmem)
- src = change_address (src, BLKmode, srcreg);
- dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
- promoted_val, vec_promoted_val,
- count_exp, align, desired_align,
- issetmem);
- /* At most desired_align - align bytes are copied. */
- if (min_size < (unsigned)(desired_align - align))
- min_size = 0;
- else
- min_size -= desired_align - align;
- }
- else
- {
- /* If we know how many bytes need to be stored before dst is
- sufficiently aligned, maintain aliasing info accurately. */
- dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
- srcreg,
- promoted_val,
- vec_promoted_val,
- desired_align,
- align_bytes,
- issetmem);
-
- count_exp = plus_constant (counter_mode (count_exp),
- count_exp, -align_bytes);
- count -= align_bytes;
- min_size -= align_bytes;
- max_size -= align_bytes;
- }
- if (need_zero_guard
- && min_size < (unsigned HOST_WIDE_INT) size_needed
- && (count < (unsigned HOST_WIDE_INT) size_needed
- || (align_bytes == 0
- && count < ((unsigned HOST_WIDE_INT) size_needed
- + desired_align - align))))
- {
- /* It is possible that we copied enough so the main loop will not
- execute. */
- gcc_assert (size_needed > 1);
- if (label == NULL_RTX)
- label = gen_label_rtx ();
- emit_cmp_and_jump_insns (count_exp,
- GEN_INT (size_needed),
- LTU, 0, counter_mode (count_exp), 1, label);
- if (expected_size == -1
- || expected_size < (desired_align - align) / 2 + size_needed)
- predict_jump (REG_BR_PROB_BASE * 20 / 100);
- else
- predict_jump (REG_BR_PROB_BASE * 60 / 100);
- }
- }
- if (label && size_needed == 1)
- {
- emit_label (label);
- LABEL_NUSES (label) = 1;
- label = NULL;
- epilogue_size_needed = 1;
- if (issetmem)
- promoted_val = val_exp;
- }
- else if (label == NULL_RTX && !misaligned_prologue_used)
- epilogue_size_needed = size_needed;
-
- /* Step 3: Main loop. */
-
- switch (alg)
- {
- case libcall:
- case no_stringop:
- case last_alg:
- gcc_unreachable ();
- case loop_1_byte:
- case loop:
- case unrolled_loop:
- expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
- count_exp, move_mode, unroll_factor,
- expected_size, issetmem);
- break;
- case vector_loop:
- expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
- vec_promoted_val, count_exp, move_mode,
- unroll_factor, expected_size, issetmem);
- break;
- case rep_prefix_8_byte:
- case rep_prefix_4_byte:
- case rep_prefix_1_byte:
- expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
- val_exp, count_exp, move_mode, issetmem);
- break;
- }
- /* Properly adjust the offsets of src and dest memory for aliasing. */
- if (CONST_INT_P (count_exp))
- {
- if (!issetmem)
- src = adjust_automodify_address_nv (src, BLKmode, srcreg,
- (count / size_needed) * size_needed);
- dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
- (count / size_needed) * size_needed);
- }
- else
- {
- if (!issetmem)
- src = change_address (src, BLKmode, srcreg);
- dst = change_address (dst, BLKmode, destreg);
- }
-
- /* Step 4: Epilogue to copy the remaining bytes. */
- epilogue:
- if (label)
- {
- /* When the main loop is done, COUNT_EXP might hold original count,
- while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
- Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
- bytes. Compensate if needed. */
-
- if (size_needed < epilogue_size_needed)
- {
- tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
- GEN_INT (size_needed - 1), count_exp, 1,
- OPTAB_DIRECT);
- if (tmp != count_exp)
- emit_move_insn (count_exp, tmp);
- }
- emit_label (label);
- LABEL_NUSES (label) = 1;
- }
-
- if (count_exp != const0_rtx && epilogue_size_needed > 1)
- {
- if (force_loopy_epilogue)
- expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
- epilogue_size_needed);
- else
- {
- if (issetmem)
- expand_setmem_epilogue (dst, destreg, promoted_val,
- vec_promoted_val, count_exp,
- epilogue_size_needed);
- else
- expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
- epilogue_size_needed);
- }
- }
- if (jump_around_label)
- emit_label (jump_around_label);
- return true;
-}
-
-
-/* Expand the appropriate insns for doing strlen if not just doing
- repnz; scasb
-
- out = result, initialized with the start address
- align_rtx = alignment of the address.
- scratch = scratch register, initialized with the start address when
- not aligned, otherwise undefined
-
- This is just the body. It needs the initializations mentioned above and
- some address computing at the end. These things are done in i386.md. */
-
-static void
-ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
-{
- int align;
- rtx tmp;
- rtx_code_label *align_2_label = NULL;
- rtx_code_label *align_3_label = NULL;
- rtx_code_label *align_4_label = gen_label_rtx ();
- rtx_code_label *end_0_label = gen_label_rtx ();
- rtx mem;
- rtx tmpreg = gen_reg_rtx (SImode);
- rtx scratch = gen_reg_rtx (SImode);
- rtx cmp;
-
- align = 0;
- if (CONST_INT_P (align_rtx))
- align = INTVAL (align_rtx);
-
- /* Loop to check 1..3 bytes for null to get an aligned pointer. */
-
- /* Is there a known alignment and is it less than 4? */
- if (align < 4)
- {
- rtx scratch1 = gen_reg_rtx (Pmode);
- emit_move_insn (scratch1, out);
- /* Is there a known alignment and is it not 2? */
- if (align != 2)
- {
- align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
- align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
-
- /* Leave just the 3 lower bits. */
- align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
- NULL_RTX, 0, OPTAB_WIDEN);
-
- emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
- Pmode, 1, align_4_label);
- emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
- Pmode, 1, align_2_label);
- emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
- Pmode, 1, align_3_label);
- }
- else
- {
- /* Since the alignment is 2, we have to check 2 or 0 bytes;
- check whether it is aligned to a 4-byte boundary. */
-
- align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
- NULL_RTX, 0, OPTAB_WIDEN);
-
- emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
- Pmode, 1, align_4_label);
- }
-
- mem = change_address (src, QImode, out);
-
- /* Now compare the bytes. */
-
- /* Compare the first n unaligned bytes on a byte-by-byte basis. */
- emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
- QImode, 1, end_0_label);
-
- /* Increment the address. */
- emit_insn (ix86_gen_add3 (out, out, const1_rtx));
-
- /* Not needed with an alignment of 2 */
- if (align != 2)
- {
- emit_label (align_2_label);
-
- emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
- end_0_label);
-
- emit_insn (ix86_gen_add3 (out, out, const1_rtx));
-
- emit_label (align_3_label);
- }
-
- emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
- end_0_label);
-
- emit_insn (ix86_gen_add3 (out, out, const1_rtx));
- }
-
- /* Generate a loop to check 4 bytes at a time. It is not a good idea to
- align this loop; it only enlarges the code and does not help to
- speed it up. */
- emit_label (align_4_label);
-
- mem = change_address (src, SImode, out);
- emit_move_insn (scratch, mem);
- emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
-
- /* This formula yields a nonzero result iff one of the bytes is zero.
- This saves three branches inside the loop and many cycles. */
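- /* The same test written as plain C (the helper name is only for
- illustration):
-
- int si_has_zero_byte (unsigned int x)
- {
- return ((x - 0x01010101U) & ~x & 0x80808080U) != 0;
- }
-
- which is nonzero exactly when some byte of X is zero, matching the
- add/not/and sequence emitted below. */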
-
- emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
- emit_insn (gen_one_cmplsi2 (scratch, scratch));
- emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
- emit_insn (gen_andsi3 (tmpreg, tmpreg,
- gen_int_mode (0x80808080, SImode)));
- emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
- align_4_label);
-
- if (TARGET_CMOVE)
- {
- rtx reg = gen_reg_rtx (SImode);
- rtx reg2 = gen_reg_rtx (Pmode);
- emit_move_insn (reg, tmpreg);
- emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
-
- /* If zero is not in the first two bytes, move two bytes forward. */
- emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
- tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
- tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
- emit_insn (gen_rtx_SET (tmpreg,
- gen_rtx_IF_THEN_ELSE (SImode, tmp,
- reg,
- tmpreg)));
- /* Emit lea manually to avoid clobbering of flags. */
- emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
-
- tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
- tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
- emit_insn (gen_rtx_SET (out,
- gen_rtx_IF_THEN_ELSE (Pmode, tmp,
- reg2,
- out)));
- }
- else
- {
- rtx_code_label *end_2_label = gen_label_rtx ();
- /* Is zero in the first two bytes? */
-
- emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
- tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
- tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
- tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
- gen_rtx_LABEL_REF (VOIDmode, end_2_label),
- pc_rtx);
- tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
- JUMP_LABEL (tmp) = end_2_label;
-
- /* Not in the first two. Move two bytes forward. */
- emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
- emit_insn (ix86_gen_add3 (out, out, const2_rtx));
-
- emit_label (end_2_label);
-
- }
-
- /* Avoid branch in fixing the byte. */
- tmpreg = gen_lowpart (QImode, tmpreg);
- emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
- tmp = gen_rtx_REG (CCmode, FLAGS_REG);
- cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
- emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
-
- emit_label (end_0_label);
-}
-
-/* Expand strlen. */
-
-bool
-ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
-{
- if (TARGET_UNROLL_STRLEN
- && TARGET_INLINE_ALL_STRINGOPS
- && eoschar == const0_rtx
- && optimize > 1)
- {
- /* The generic case of the strlen expander is long. Avoid
- expanding it unless TARGET_INLINE_ALL_STRINGOPS. */
- rtx addr = force_reg (Pmode, XEXP (src, 0));
- /* Well it seems that some optimizer does not combine a call like
- foo(strlen(bar), strlen(bar));
- when the move and the subtraction are done here. It does calculate
- the length just once when these instructions are done inside
- output_strlen_unroll(). But since &bar[strlen(bar)] is often used
- and this uses one fewer register for the lifetime of
- output_strlen_unroll(), this is better. */
-
- emit_move_insn (out, addr);
-
- ix86_expand_strlensi_unroll_1 (out, src, align);
-
- /* strlensi_unroll_1 returns the address of the zero at the end of
- the string, like memchr(), so compute the length by subtracting
- the start address. */
- emit_insn (ix86_gen_sub3 (out, out, addr));
- return true;
- }
- else
- return false;
-}
-
-/* For a given symbol (function), construct code to compute the address of its
- PLT entry in the large x86-64 PIC model. */
-static rtx
-construct_plt_address (rtx symbol)
-{
- rtx tmp, unspec;
-
- gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
- gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
- gcc_assert (Pmode == DImode);
-
- tmp = gen_reg_rtx (Pmode);
- unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
-
- emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
- emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
- return tmp;
-}
-
-rtx_insn *
-ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
- rtx callarg2,
- rtx pop, bool sibcall)
-{
- rtx vec[3];
- rtx use = NULL, call;
- unsigned int vec_len = 0;
- tree fndecl;
-
- if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
- {
- fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
- if (fndecl
- && (lookup_attribute ("interrupt",
- TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
- error ("interrupt service routine can%'t be called directly");
- }
- else
- fndecl = NULL_TREE;
-
- if (pop == const0_rtx)
- pop = NULL;
- gcc_assert (!TARGET_64BIT || !pop);
-
- if (TARGET_MACHO && !TARGET_64BIT)
- {
-#if TARGET_MACHO
- if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
- fnaddr = machopic_indirect_call_target (fnaddr);
-#endif
- }
- else
- {
- /* Static functions and indirect calls don't need the PIC register. Also,
- check if PLT was explicitly avoided via no-plt or the "noplt" attribute,
- making it an indirect call. */
- rtx addr = XEXP (fnaddr, 0);
- if (flag_pic
- && GET_CODE (addr) == SYMBOL_REF
- && !SYMBOL_REF_LOCAL_P (addr))
- {
- if (flag_plt
- && (SYMBOL_REF_DECL (addr) == NULL_TREE
- || !lookup_attribute ("noplt",
- DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
- {
- if (!TARGET_64BIT
- || (ix86_cmodel == CM_LARGE_PIC
- && DEFAULT_ABI != MS_ABI))
- {
- use_reg (&use, gen_rtx_REG (Pmode,
- REAL_PIC_OFFSET_TABLE_REGNUM));
- if (ix86_use_pseudo_pic_reg ())
- emit_move_insn (gen_rtx_REG (Pmode,
- REAL_PIC_OFFSET_TABLE_REGNUM),
- pic_offset_table_rtx);
- }
- }
- else if (!TARGET_PECOFF && !TARGET_MACHO)
- {
- if (TARGET_64BIT)
- {
- fnaddr = gen_rtx_UNSPEC (Pmode,
- gen_rtvec (1, addr),
- UNSPEC_GOTPCREL);
- fnaddr = gen_rtx_CONST (Pmode, fnaddr);
- }
- else
- {
- fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
- UNSPEC_GOT);
- fnaddr = gen_rtx_CONST (Pmode, fnaddr);
- fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
- fnaddr);
- }
- fnaddr = gen_const_mem (Pmode, fnaddr);
- /* Pmode may not be the same as word_mode for x32, which
- doesn't support indirect branch via 32-bit memory slot.
- Since x32 GOT slot is 64 bit with zero upper 32 bits,
- indirect branch via x32 GOT slot is OK. */
- if (GET_MODE (fnaddr) != word_mode)
- fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
- fnaddr = gen_rtx_MEM (QImode, fnaddr);
- }
- }
- }
-
- /* Skip setting up RAX register for -mskip-rax-setup when there are no
- parameters passed in vector registers. */
- if (TARGET_64BIT
- && (INTVAL (callarg2) > 0
- || (INTVAL (callarg2) == 0
- && (TARGET_SSE || !flag_skip_rax_setup))))
- {
- rtx al = gen_rtx_REG (QImode, AX_REG);
- emit_move_insn (al, callarg2);
- use_reg (&use, al);
- }
-
- if (ix86_cmodel == CM_LARGE_PIC
- && !TARGET_PECOFF
- && MEM_P (fnaddr)
- && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
- && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
- fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
- /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
- branch via x32 GOT slot is OK. */
- else if (!(TARGET_X32
- && MEM_P (fnaddr)
- && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
- && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
- && (sibcall
- ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
- : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
- {
- fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
- fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
- }
-
- call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
-
- if (retval)
- call = gen_rtx_SET (retval, call);
- vec[vec_len++] = call;
-
- if (pop)
- {
- pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
- pop = gen_rtx_SET (stack_pointer_rtx, pop);
- vec[vec_len++] = pop;
- }
-
- if (cfun->machine->no_caller_saved_registers
- && (!fndecl
- || (!TREE_THIS_VOLATILE (fndecl)
- && !lookup_attribute ("no_caller_saved_registers",
- TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
- {
- static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
- bool is_64bit_ms_abi = (TARGET_64BIT
- && ix86_function_abi (fndecl) == MS_ABI);
- char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
-
- /* If there are no caller-saved registers, add all registers
- that are clobbered by the call which returns. */
- for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
- if (!fixed_regs[i]
- && (ix86_call_used_regs[i] == 1
- || (ix86_call_used_regs[i] & c_mask))
- && !STACK_REGNO_P (i)
- && !MMX_REGNO_P (i))
- clobber_reg (&use,
- gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
- }
- else if (TARGET_64BIT_MS_ABI
- && (!callarg2 || INTVAL (callarg2) != -2))
- {
- unsigned i;
-
- for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
- {
- int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
- machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
-
- clobber_reg (&use, gen_rtx_REG (mode, regno));
- }
-
- /* Set here, but it may get cleared later. */
- if (TARGET_CALL_MS2SYSV_XLOGUES)
- {
- if (!TARGET_SSE)
- ;
-
- /* Don't break hot-patched functions. */
- else if (ix86_function_ms_hook_prologue (current_function_decl))
- ;
-
- /* TODO: Cases not yet examined. */
- else if (flag_split_stack)
- warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
-
- else
- {
- gcc_assert (!reload_completed);
- cfun->machine->call_ms2sysv = true;
- }
- }
- }
-
- if (vec_len > 1)
- call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
- rtx_insn *call_insn = emit_call_insn (call);
- if (use)
- CALL_INSN_FUNCTION_USAGE (call_insn) = use;
-
- return call_insn;
-}
-
-/* Return true if the function being called was marked with attribute
- "noplt" or using -fno-plt and we are compiling for non-PIC. We need
- to handle the non-PIC case in the backend because there is no easy
- interface for the front-end to force non-PLT calls to use the GOT.
- This is currently used only with 64-bit or 32-bit GOT32X ELF targets
- to call the function marked "noplt" indirectly. */
-
-static bool
-ix86_nopic_noplt_attribute_p (rtx call_op)
-{
- if (flag_pic || ix86_cmodel == CM_LARGE
- || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
- || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
- || SYMBOL_REF_LOCAL_P (call_op))
- return false;
-
- tree symbol_decl = SYMBOL_REF_DECL (call_op);
-
- if (!flag_plt
- || (symbol_decl != NULL_TREE
- && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
- return true;
-
- return false;
-}
-
-/* Output indirect branch via a call and return thunk. CALL_OP is a
- register which contains the branch target. Branch is a tail call if
- SIBCALL_P is true. A normal call is converted to:
-
- call __x86_indirect_thunk_reg
-
- and a tail call is converted to:
-
- jmp __x86_indirect_thunk_reg
- */
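-/* When the thunk body is emitted inline (indirect_branch_thunk_inline),
- the non-sibcall path below instead produces, schematically (the label
- names are internal and shown here only as placeholders):
-
- jmp .LIND2
- .LIND1:
- <inline thunk body for the register>
- .LIND2:
- call .LIND1
- */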
-
-static void
-ix86_output_indirect_branch_via_reg (rtx call_op, bool sibcall_p)
-{
- char thunk_name_buf[32];
- char *thunk_name;
- enum indirect_thunk_prefix need_prefix
- = indirect_thunk_need_prefix (current_output_insn);
- int regno = REGNO (call_op);
-
- if (cfun->machine->indirect_branch_type
- != indirect_branch_thunk_inline)
- {
- if (cfun->machine->indirect_branch_type == indirect_branch_thunk)
- {
- int i = regno;
- if (i >= FIRST_REX_INT_REG)
- i -= (FIRST_REX_INT_REG - LAST_INT_REG - 1);
- indirect_thunks_used |= 1 << i;
- }
- indirect_thunk_name (thunk_name_buf, regno, need_prefix, false);
- thunk_name = thunk_name_buf;
- }
- else
- thunk_name = NULL;
-
- if (sibcall_p)
- {
- if (thunk_name != NULL)
- fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
- else
- output_indirect_thunk (regno);
- }
- else
- {
- if (thunk_name != NULL)
- {
- fprintf (asm_out_file, "\tcall\t%s\n", thunk_name);
- return;
- }
-
- char indirectlabel1[32];
- char indirectlabel2[32];
-
- ASM_GENERATE_INTERNAL_LABEL (indirectlabel1,
- INDIRECT_LABEL,
- indirectlabelno++);
- ASM_GENERATE_INTERNAL_LABEL (indirectlabel2,
- INDIRECT_LABEL,
- indirectlabelno++);
-
- /* Jump. */
- fputs ("\tjmp\t", asm_out_file);
- assemble_name_raw (asm_out_file, indirectlabel2);
- fputc ('\n', asm_out_file);
-
- ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1);
-
- if (thunk_name != NULL)
- fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
- else
- output_indirect_thunk (regno);
-
- ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2);
-
- /* Call. */
- fputs ("\tcall\t", asm_out_file);
- assemble_name_raw (asm_out_file, indirectlabel1);
- fputc ('\n', asm_out_file);
- }
-}
-
-/* Output indirect branch via a call and return thunk. CALL_OP is
- the branch target. XASM is the assembly template for CALL_OP.
- Branch is a tail call if SIBCALL_P is true. A normal call is
- converted to:
-
- jmp L2
- L1:
- push CALL_OP
- jmp __x86_indirect_thunk
- L2:
- call L1
-
- and a tail call is converted to:
-
- push CALL_OP
- jmp __x86_indirect_thunk
- */
-
-static void
-ix86_output_indirect_branch_via_push (rtx call_op, const char *xasm,
- bool sibcall_p)
-{
- char thunk_name_buf[32];
- char *thunk_name;
- char push_buf[64];
- enum indirect_thunk_prefix need_prefix
- = indirect_thunk_need_prefix (current_output_insn);
- int regno = -1;
-
- if (cfun->machine->indirect_branch_type
- != indirect_branch_thunk_inline)
- {
- if (cfun->machine->indirect_branch_type == indirect_branch_thunk)
- indirect_thunk_needed = true;
- indirect_thunk_name (thunk_name_buf, regno, need_prefix, false);
- thunk_name = thunk_name_buf;
- }
- else
- thunk_name = NULL;
-
- snprintf (push_buf, sizeof (push_buf), "push{%c}\t%s",
- TARGET_64BIT ? 'q' : 'l', xasm);
-
- if (sibcall_p)
- {
- output_asm_insn (push_buf, &call_op);
- if (thunk_name != NULL)
- fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
- else
- output_indirect_thunk (regno);
- }
- else
- {
- char indirectlabel1[32];
- char indirectlabel2[32];
-
- ASM_GENERATE_INTERNAL_LABEL (indirectlabel1,
- INDIRECT_LABEL,
- indirectlabelno++);
- ASM_GENERATE_INTERNAL_LABEL (indirectlabel2,
- INDIRECT_LABEL,
- indirectlabelno++);
-
- /* Jump. */
- fputs ("\tjmp\t", asm_out_file);
- assemble_name_raw (asm_out_file, indirectlabel2);
- fputc ('\n', asm_out_file);
-
- ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1);
-
- /* An external function may be called via GOT, instead of PLT. */
- if (MEM_P (call_op))
- {
- struct ix86_address parts;
- rtx addr = XEXP (call_op, 0);
- if (ix86_decompose_address (addr, &parts)
- && parts.base == stack_pointer_rtx)
- {
- /* Since call will adjust stack by -UNITS_PER_WORD,
- we must convert "disp(stack, index, scale)" to
- "disp+UNITS_PER_WORD(stack, index, scale)". */
- if (parts.index)
- {
- addr = gen_rtx_MULT (Pmode, parts.index,
- GEN_INT (parts.scale));
- addr = gen_rtx_PLUS (Pmode, stack_pointer_rtx,
- addr);
- }
- else
- addr = stack_pointer_rtx;
-
- rtx disp;
- if (parts.disp != NULL_RTX)
- disp = plus_constant (Pmode, parts.disp,
- UNITS_PER_WORD);
- else
- disp = GEN_INT (UNITS_PER_WORD);
-
- addr = gen_rtx_PLUS (Pmode, addr, disp);
- call_op = gen_rtx_MEM (GET_MODE (call_op), addr);
- }
- }
-
- output_asm_insn (push_buf, &call_op);
-
- if (thunk_name != NULL)
- fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
- else
- output_indirect_thunk (regno);
-
- ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2);
-
- /* Call. */
- fputs ("\tcall\t", asm_out_file);
- assemble_name_raw (asm_out_file, indirectlabel1);
- fputc ('\n', asm_out_file);
- }
-}
-
-/* Output indirect branch via a call and return thunk. CALL_OP is
- the branch target. XASM is the assembly template for CALL_OP.
- Branch is a tail call if SIBCALL_P is true. */
-
-static void
-ix86_output_indirect_branch (rtx call_op, const char *xasm,
- bool sibcall_p)
-{
- if (REG_P (call_op))
- ix86_output_indirect_branch_via_reg (call_op, sibcall_p);
- else
- ix86_output_indirect_branch_via_push (call_op, xasm, sibcall_p);
-}
-
-/* Output indirect jump. CALL_OP is the jump target. */
-
-const char *
-ix86_output_indirect_jmp (rtx call_op)
-{
- if (cfun->machine->indirect_branch_type != indirect_branch_keep)
- {
- /* We can't have a red zone since "call" in the indirect thunk
- pushes the return address onto the stack, destroying the red zone. */
- if (ix86_red_zone_size != 0)
- gcc_unreachable ();
-
- ix86_output_indirect_branch (call_op, "%0", true);
- return "";
- }
- else
- return "%!jmp\t%A0";
-}
-
-/* Output return instrumentation for current function if needed. */
-
-static void
-output_return_instrumentation (void)
-{
- if (ix86_instrument_return != instrument_return_none
- && flag_fentry
- && !DECL_NO_INSTRUMENT_FUNCTION_ENTRY_EXIT (cfun->decl))
- {
- if (ix86_flag_record_return)
- fprintf (asm_out_file, "1:\n");
- switch (ix86_instrument_return)
- {
- case instrument_return_call:
- fprintf (asm_out_file, "\tcall\t__return__\n");
- break;
- case instrument_return_nop5:
- /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */
- fprintf (asm_out_file, ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n");
- break;
- case instrument_return_none:
- break;
- }
-
- if (ix86_flag_record_return)
- {
- fprintf (asm_out_file, "\t.section __return_loc, \"a\",@progbits\n");
- fprintf (asm_out_file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
- fprintf (asm_out_file, "\t.previous\n");
- }
- }
-}
-
-/* Output the function return. Add a REP prefix to RET if LONG_P is true
- and the function return is kept. */
-
-const char *
-ix86_output_function_return (bool long_p)
-{
- output_return_instrumentation ();
-
- if (cfun->machine->function_return_type != indirect_branch_keep)
- {
- char thunk_name[32];
- enum indirect_thunk_prefix need_prefix
- = indirect_thunk_need_prefix (current_output_insn);
-
- if (cfun->machine->function_return_type
- != indirect_branch_thunk_inline)
- {
- bool need_thunk = (cfun->machine->function_return_type
- == indirect_branch_thunk);
- indirect_thunk_name (thunk_name, INVALID_REGNUM, need_prefix,
- true);
- indirect_return_needed |= need_thunk;
- fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
- }
- else
- output_indirect_thunk (INVALID_REGNUM);
-
- return "";
- }
-
- if (!long_p)
- return "%!ret";
-
- return "rep%; ret";
-}
-
-/* Output indirect function return. RET_OP is the function return
- target. */
-
-const char *
-ix86_output_indirect_function_return (rtx ret_op)
-{
- if (cfun->machine->function_return_type != indirect_branch_keep)
- {
- char thunk_name[32];
- enum indirect_thunk_prefix need_prefix
- = indirect_thunk_need_prefix (current_output_insn);
- unsigned int regno = REGNO (ret_op);
- gcc_assert (regno == CX_REG);
-
- if (cfun->machine->function_return_type
- != indirect_branch_thunk_inline)
- {
- bool need_thunk = (cfun->machine->function_return_type
- == indirect_branch_thunk);
- indirect_thunk_name (thunk_name, regno, need_prefix, true);
-
- if (need_thunk)
- {
- indirect_return_via_cx = true;
- indirect_thunks_used |= 1 << CX_REG;
- }
- fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name);
- }
- else
- output_indirect_thunk (regno);
-
- return "";
- }
- else
- return "%!jmp\t%A0";
-}
-
-/* Split a simple return popping POPC bytes from the stack into an indirect
- branch with stack adjustment. */
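-/* Schematically, "ret $N" becomes (illustrative assembly):
-
- pop %ecx # return address -> ECX
- add $N, %esp # drop the N argument bytes
- jmp *%ecx
- */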
-
-void
-ix86_split_simple_return_pop_internal (rtx popc)
-{
- struct machine_function *m = cfun->machine;
- rtx ecx = gen_rtx_REG (SImode, CX_REG);
- rtx_insn *insn;
-
- /* There is no "pascal" calling convention in any 64bit ABI. */
- gcc_assert (!TARGET_64BIT);
-
- insn = emit_insn (gen_pop (ecx));
- m->fs.cfa_offset -= UNITS_PER_WORD;
- m->fs.sp_offset -= UNITS_PER_WORD;
-
- rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
- x = gen_rtx_SET (stack_pointer_rtx, x);
- add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
- add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
- RTX_FRAME_RELATED_P (insn) = 1;
-
- x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
- x = gen_rtx_SET (stack_pointer_rtx, x);
- insn = emit_insn (x);
- add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
- RTX_FRAME_RELATED_P (insn) = 1;
-
- /* Now return address is in ECX. */
- emit_jump_insn (gen_simple_return_indirect_internal (ecx));
-}
-
-/* Output the assembly for a call instruction. */
-
-const char *
-ix86_output_call_insn (rtx_insn *insn, rtx call_op)
-{
- bool direct_p = constant_call_address_operand (call_op, VOIDmode);
- bool output_indirect_p
- = (!TARGET_SEH
- && cfun->machine->indirect_branch_type != indirect_branch_keep);
- bool seh_nop_p = false;
- const char *xasm;
-
- if (SIBLING_CALL_P (insn))
- {
- output_return_instrumentation ();
- if (direct_p)
- {
- if (ix86_nopic_noplt_attribute_p (call_op))
- {
- direct_p = false;
- if (TARGET_64BIT)
- {
- if (output_indirect_p)
- xasm = "{%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
- else
- xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
- }
- else
- {
- if (output_indirect_p)
- xasm = "{%p0@GOT|[DWORD PTR %p0@GOT]}";
- else
- xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
- }
- }
- else
- xasm = "%!jmp\t%P0";
- }
- /* SEH epilogue detection requires the indirect branch case
- to include REX.W. */
- else if (TARGET_SEH)
- xasm = "%!rex.W jmp\t%A0";
- else
- {
- if (output_indirect_p)
- xasm = "%0";
- else
- xasm = "%!jmp\t%A0";
- }
-
- if (output_indirect_p && !direct_p)
- ix86_output_indirect_branch (call_op, xasm, true);
- else
- output_asm_insn (xasm, &call_op);
- return "";
- }
-
- /* SEH unwinding can require an extra nop to be emitted in several
- circumstances. Determine if we have one of those. */
- if (TARGET_SEH)
- {
- rtx_insn *i;
-
- for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
- {
- /* Prevent a catch region from being adjacent to a jump that would
- be interpreted as an epilogue sequence by the unwinder. */
- if (JUMP_P(i) && CROSSING_JUMP_P (i))
- {
- seh_nop_p = true;
- break;
- }
-
- /* If we get to another real insn, we don't need the nop. */
- if (INSN_P (i))
- break;
-
- /* If we get to the epilogue note, prevent a catch region from
- being adjacent to the standard epilogue sequence. With non-call
- exceptions, we'll have done this during epilogue emission. */
- if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
- && !flag_non_call_exceptions
- && !can_throw_internal (insn))
- {
- seh_nop_p = true;
- break;
- }
- }
-
- /* If we didn't find a real insn following the call, prevent the
- unwinder from looking into the next function. */
- if (i == NULL)
- seh_nop_p = true;
- }
-
- if (direct_p)
- {
- if (ix86_nopic_noplt_attribute_p (call_op))
- {
- direct_p = false;
- if (TARGET_64BIT)
- {
- if (output_indirect_p)
- xasm = "{%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
- else
- xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
- }
- else
- {
- if (output_indirect_p)
- xasm = "{%p0@GOT|[DWORD PTR %p0@GOT]}";
- else
- xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
- }
- }
- else
- xasm = "%!call\t%P0";
- }
- else
- {
- if (output_indirect_p)
- xasm = "%0";
- else
- xasm = "%!call\t%A0";
- }
-
- if (output_indirect_p && !direct_p)
- ix86_output_indirect_branch (call_op, xasm, false);
- else
- output_asm_insn (xasm, &call_op);
-
- if (seh_nop_p)
- return "nop";
-
- return "";
-}
-\f
-/* Clear stack slot assignments remembered from previous functions.
- This is called from INIT_EXPANDERS once before RTL is emitted for each
- function. */
-
-static struct machine_function *
-ix86_init_machine_status (void)
-{
- struct machine_function *f;
-
- f = ggc_cleared_alloc<machine_function> ();
- f->call_abi = ix86_abi;
-
- return f;
-}
-
-/* Return a MEM corresponding to a stack slot with mode MODE.
- Allocate a new slot if necessary.
-
- The RTL for a function can have several slots available: N is
- which slot to use. */
-
-rtx
-assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
-{
- struct stack_local_entry *s;
-
- gcc_assert (n < MAX_386_STACK_LOCALS);
-
- for (s = ix86_stack_locals; s; s = s->next)
- if (s->mode == mode && s->n == n)
- return validize_mem (copy_rtx (s->rtl));
-
- s = ggc_alloc<stack_local_entry> ();
- s->n = n;
- s->mode = mode;
- s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
-
- s->next = ix86_stack_locals;
- ix86_stack_locals = s;
- return validize_mem (copy_rtx (s->rtl));
-}
-
-static void
-ix86_instantiate_decls (void)
-{
- struct stack_local_entry *s;
-
- for (s = ix86_stack_locals; s; s = s->next)
- if (s->rtl != NULL_RTX)
- instantiate_decl_rtl (s->rtl);
-}
-\f
-/* Check whether x86 address PARTS is a pc-relative address. */
-
-bool
-ix86_rip_relative_addr_p (struct ix86_address *parts)
-{
- rtx base, index, disp;
-
- base = parts->base;
- index = parts->index;
- disp = parts->disp;
-
- if (disp && !base && !index)
- {
- if (TARGET_64BIT)
- {
- rtx symbol = disp;
-
- if (GET_CODE (disp) == CONST)
- symbol = XEXP (disp, 0);
- if (GET_CODE (symbol) == PLUS
- && CONST_INT_P (XEXP (symbol, 1)))
- symbol = XEXP (symbol, 0);
-
- if (GET_CODE (symbol) == LABEL_REF
- || (GET_CODE (symbol) == SYMBOL_REF
- && SYMBOL_REF_TLS_MODEL (symbol) == 0)
- || (GET_CODE (symbol) == UNSPEC
- && (XINT (symbol, 1) == UNSPEC_GOTPCREL
- || XINT (symbol, 1) == UNSPEC_PCREL
- || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
- return true;
- }
- }
- return false;
-}
-
-/* Calculate the length of the memory address in the instruction encoding.
- Includes addr32 prefix, does not include the one-byte modrm, opcode,
- or other prefixes. We never generate addr32 prefix for LEA insn. */
-
-int
-memory_address_length (rtx addr, bool lea)
-{
- struct ix86_address parts;
- rtx base, index, disp;
- int len;
- int ok;
-
- if (GET_CODE (addr) == PRE_DEC
- || GET_CODE (addr) == POST_INC
- || GET_CODE (addr) == PRE_MODIFY
- || GET_CODE (addr) == POST_MODIFY)
- return 0;
-
- ok = ix86_decompose_address (addr, &parts);
- gcc_assert (ok);
-
- len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
-
- /* If this is not an LEA instruction, add the length of the addr32 prefix. */
- if (TARGET_64BIT && !lea
- && (SImode_address_operand (addr, VOIDmode)
- || (parts.base && GET_MODE (parts.base) == SImode)
- || (parts.index && GET_MODE (parts.index) == SImode)))
- len++;
-
- base = parts.base;
- index = parts.index;
- disp = parts.disp;
-
- if (base && SUBREG_P (base))
- base = SUBREG_REG (base);
- if (index && SUBREG_P (index))
- index = SUBREG_REG (index);
-
- gcc_assert (base == NULL_RTX || REG_P (base));
- gcc_assert (index == NULL_RTX || REG_P (index));
-
- /* Rule of thumb:
- - esp as the base always wants an index,
- - ebp as the base always wants a displacement,
- - r12 as the base always wants an index,
- - r13 as the base always wants a displacement. */
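- /* Worked example: for "4(%rsp,%rbx,4)" this routine returns 2: one
- byte for the SIB forced by the index plus one byte for the 8-bit
- displacement; the modrm and opcode bytes are counted elsewhere. */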
-
- /* Register Indirect. */
- if (base && !index && !disp)
- {
- /* esp (for its index) and ebp (for its displacement) need
- the two-byte modrm form. Similarly for r12 and r13 in 64-bit
- code. */
- if (base == arg_pointer_rtx
- || base == frame_pointer_rtx
- || REGNO (base) == SP_REG
- || REGNO (base) == BP_REG
- || REGNO (base) == R12_REG
- || REGNO (base) == R13_REG)
- len++;
- }
-
- /* Direct Addressing. In 64-bit mode mod 00 r/m 5
- is not disp32, but disp32(%rip), so for disp32
- SIB byte is needed, unless print_operand_address
- optimizes it into disp32(%rip) or (%rip) is implied
- by UNSPEC. */
- else if (disp && !base && !index)
- {
- len += 4;
- if (!ix86_rip_relative_addr_p (&parts))
- len++;
- }
- else
- {
- /* Find the length of the displacement constant. */
- if (disp)
- {
- if (base && satisfies_constraint_K (disp))
- len += 1;
- else
- len += 4;
- }
- /* ebp always wants a displacement. Similarly r13. */
- else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
- len++;
-
- /* An index requires the two-byte modrm form.... */
- if (index
- /* ...like esp (or r12), which always wants an index. */
- || base == arg_pointer_rtx
- || base == frame_pointer_rtx
- || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
- len++;
- }
-
- return len;
-}
-
-/* Compute default value for "length_immediate" attribute. When SHORTFORM
- is set, expect that insn have 8bit immediate alternative. */
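-/* For instance, assuming the insn's mode attribute is SI: "add $12, %eax"
- with a short-form alternative fits the 8-bit immediate and is counted as 1,
- while "add $1000, %eax" needs the full 32-bit immediate and is counted as 4;
- DImode immediates also count as 4 since they are encoded as 32-bit
- sign-extended values. */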
-int
-ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
-{
- int len = 0;
- int i;
- extract_insn_cached (insn);
- for (i = recog_data.n_operands - 1; i >= 0; --i)
- if (CONSTANT_P (recog_data.operand[i]))
- {
- enum attr_mode mode = get_attr_mode (insn);
-
- gcc_assert (!len);
- if (shortform && CONST_INT_P (recog_data.operand[i]))
- {
- HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
- switch (mode)
- {
- case MODE_QI:
- len = 1;
- continue;
- case MODE_HI:
- ival = trunc_int_for_mode (ival, HImode);
- break;
- case MODE_SI:
- ival = trunc_int_for_mode (ival, SImode);
- break;
- default:
- break;
- }
- if (IN_RANGE (ival, -128, 127))
- {
- len = 1;
- continue;
- }
- }
- switch (mode)
- {
- case MODE_QI:
- len = 1;
- break;
- case MODE_HI:
- len = 2;
- break;
- case MODE_SI:
- len = 4;
- break;
- /* Immediates for DImode instructions are encoded
- as 32bit sign extended values. */
- case MODE_DI:
- len = 4;
- break;
- default:
- fatal_insn ("unknown insn mode", insn);
- }
- }
- return len;
-}
-
-/* Compute default value for "length_address" attribute. */
-int
-ix86_attr_length_address_default (rtx_insn *insn)
-{
- int i;
-
- if (get_attr_type (insn) == TYPE_LEA)
- {
- rtx set = PATTERN (insn), addr;
-
- if (GET_CODE (set) == PARALLEL)
- set = XVECEXP (set, 0, 0);
-
- gcc_assert (GET_CODE (set) == SET);
-
- addr = SET_SRC (set);
-
- return memory_address_length (addr, true);
- }
-
- extract_insn_cached (insn);
- for (i = recog_data.n_operands - 1; i >= 0; --i)
- {
- rtx op = recog_data.operand[i];
- if (MEM_P (op))
- {
- constrain_operands_cached (insn, reload_completed);
- if (which_alternative != -1)
- {
- const char *constraints = recog_data.constraints[i];
- int alt = which_alternative;
-
- while (*constraints == '=' || *constraints == '+')
- constraints++;
- while (alt-- > 0)
- while (*constraints++ != ',')
- ;
- /* Skip ignored operands. */
- if (*constraints == 'X')
- continue;
- }
-
- int len = memory_address_length (XEXP (op, 0), false);
-
- /* Account for segment prefix for non-default addr spaces. */
- if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
- len++;
-
- return len;
- }
- }
- return 0;
-}
-
-/* Compute default value for "length_vex" attribute. It includes
- 2 or 3 byte VEX prefix and 1 opcode byte. */
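-/* For instance: "vaddps %xmm1, %xmm2, %xmm3" (a 0f opcode, no VEX.W, no
- extended register mentioned in a memory operand) is counted as 2 + 1 == 3,
- while a DImode general-register operand (REX.W) or a memory operand that
- mentions an extended register (REX.X or REX.B) yields 3 + 1 == 4. */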
-
-int
-ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
- bool has_vex_w)
-{
- int i;
-
- /* Only the 0f opcode can use the 2 byte VEX prefix, and the VEX W bit
- requires the 3 byte VEX prefix. */
- if (!has_0f_opcode || has_vex_w)
- return 3 + 1;
-
- /* We can always use 2 byte VEX prefix in 32bit. */
- if (!TARGET_64BIT)
- return 2 + 1;
-
- extract_insn_cached (insn);
-
- for (i = recog_data.n_operands - 1; i >= 0; --i)
- if (REG_P (recog_data.operand[i]))
- {
- /* REX.W bit uses 3 byte VEX prefix. */
- if (GET_MODE (recog_data.operand[i]) == DImode
- && GENERAL_REG_P (recog_data.operand[i]))
- return 3 + 1;
- }
- else
- {
- /* REX.X or REX.B bits use 3 byte VEX prefix. */
- if (MEM_P (recog_data.operand[i])
- && x86_extended_reg_mentioned_p (recog_data.operand[i]))
- return 3 + 1;
- }
-
- return 2 + 1;
-}
-\f
-
-static bool
-ix86_class_likely_spilled_p (reg_class_t);
-
-/* Return true if the LHS of INSN is a HW function argument register; set
- IS_SPILLED to true if it is a likely spilled HW register. */
-static bool
-insn_is_function_arg (rtx insn, bool* is_spilled)
-{
- rtx dst;
-
- if (!NONDEBUG_INSN_P (insn))
- return false;
- /* Call instructions are not movable; ignore them. */
- if (CALL_P (insn))
- return false;
- insn = PATTERN (insn);
- if (GET_CODE (insn) == PARALLEL)
- insn = XVECEXP (insn, 0, 0);
- if (GET_CODE (insn) != SET)
- return false;
- dst = SET_DEST (insn);
- if (REG_P (dst) && HARD_REGISTER_P (dst)
- && ix86_function_arg_regno_p (REGNO (dst)))
- {
- /* Is it likely spilled HW register? */
- if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
- && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
- *is_spilled = true;
- return true;
- }
- return false;
-}
-
-/* Add output dependencies for a chain of adjacent function arguments if
- there is a move to a likely spilled HW register. Return the first argument
- if at least one dependence was added, or NULL otherwise. */
-static rtx_insn *
-add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
-{
- rtx_insn *insn;
- rtx_insn *last = call;
- rtx_insn *first_arg = NULL;
- bool is_spilled = false;
-
- head = PREV_INSN (head);
-
- /* Find the argument-passing instruction nearest to the call. */
- while (true)
- {
- last = PREV_INSN (last);
- if (last == head)
- return NULL;
- if (!NONDEBUG_INSN_P (last))
- continue;
- if (insn_is_function_arg (last, &is_spilled))
- break;
- return NULL;
- }
-
- first_arg = last;
- while (true)
- {
- insn = PREV_INSN (last);
- if (!INSN_P (insn))
- break;
- if (insn == head)
- break;
- if (!NONDEBUG_INSN_P (insn))
- {
- last = insn;
- continue;
- }
- if (insn_is_function_arg (insn, &is_spilled))
- {
- /* Add an output dependence between two function arguments if the chain
- of output arguments contains likely spilled HW registers. */
- if (is_spilled)
- add_dependence (first_arg, insn, REG_DEP_OUTPUT);
- first_arg = last = insn;
- }
- else
- break;
- }
- if (!is_spilled)
- return NULL;
- return first_arg;
-}
-
-/* Add an output or anti dependency from INSN to FIRST_ARG to restrict its
- code motion. */
-static void
-avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
-{
- rtx set;
- rtx tmp;
-
- set = single_set (insn);
- if (!set)
- return;
- tmp = SET_DEST (set);
- if (REG_P (tmp))
- {
- /* Add output dependency to the first function argument. */
- add_dependence (first_arg, insn, REG_DEP_OUTPUT);
- return;
- }
- /* Add anti dependency. */
- add_dependence (first_arg, insn, REG_DEP_ANTI);
-}
-
-/* Avoid cross-block motion of a function argument by adding a dependency
- from the first non-jump instruction in BB. */
-static void
-add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
-{
- rtx_insn *insn = BB_END (bb);
-
- while (insn)
- {
- if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
- {
- rtx set = single_set (insn);
- if (set)
- {
- avoid_func_arg_motion (arg, insn);
- return;
- }
- }
- if (insn == BB_HEAD (bb))
- return;
- insn = PREV_INSN (insn);
- }
-}
-
-/* Hook for the pre-reload scheduler - avoid motion of function arguments
- passed in likely spilled HW registers. */
-static void
-ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
-{
- rtx_insn *insn;
- rtx_insn *first_arg = NULL;
- if (reload_completed)
- return;
- while (head != tail && DEBUG_INSN_P (head))
- head = NEXT_INSN (head);
- for (insn = tail; insn != head; insn = PREV_INSN (insn))
- if (INSN_P (insn) && CALL_P (insn))
- {
- first_arg = add_parameter_dependencies (insn, head);
- if (first_arg)
- {
- /* Add a dependee for the first argument to predecessors if the
- region contains more than one block. */
- basic_block bb = BLOCK_FOR_INSN (insn);
- int rgn = CONTAINING_RGN (bb->index);
- int nr_blks = RGN_NR_BLOCKS (rgn);
- /* Skip trivial regions and region head blocks that can have
- predecessors outside of region. */
- if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
- {
- edge e;
- edge_iterator ei;
-
- /* Regions are SCCs with the exception of selective
- scheduling with pipelining of outer blocks enabled.
- So also check that immediate predecessors of a non-head
- block are in the same region. */
- FOR_EACH_EDGE (e, ei, bb->preds)
- {
- /* Avoid creating loop-carried dependencies by using the
- topological ordering in the region. */
- if (rgn == CONTAINING_RGN (e->src->index)
- && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
- add_dependee_for_func_arg (first_arg, e->src);
- }
- }
- insn = first_arg;
- if (insn == head)
- break;
- }
- }
- else if (first_arg)
- avoid_func_arg_motion (first_arg, insn);
-}
-
-/* Hook for pre-reload schedule - set the priority of moves from likely
- spilled HW registers to maximum, to schedule them as soon as possible.
- These are moves from function argument registers at the top of the function
- entry and moves from function return value registers after a call. */
-static int
-ix86_adjust_priority (rtx_insn *insn, int priority)
-{
- rtx set;
-
- if (reload_completed)
- return priority;
-
- if (!NONDEBUG_INSN_P (insn))
- return priority;
-
- set = single_set (insn);
- if (set)
- {
- rtx tmp = SET_SRC (set);
- if (REG_P (tmp)
- && HARD_REGISTER_P (tmp)
- && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
- && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
- return current_sched_info->sched_max_insns_priority;
- }
-
- return priority;
-}
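-
-/* For instance, at function entry a move of incoming %edi (hard register
- DI_REG, whose DIREG class is likely spilled) into a pseudo satisfies the
- test above and is given the maximum scheduling priority. */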
-
-/* Prepare for scheduling pass. */
-static void
-ix86_sched_init_global (FILE *, int, int)
-{
- /* Install scheduling hooks for current CPU. Some of these hooks are used
- in time-critical parts of the scheduler, so we only set them up when
- they are actually used. */
- switch (ix86_tune)
- {
- case PROCESSOR_CORE2:
- case PROCESSOR_NEHALEM:
- case PROCESSOR_SANDYBRIDGE:
- case PROCESSOR_HASWELL:
- case PROCESSOR_GENERIC:
- /* Do not perform multipass scheduling for pre-reload schedule
- to save compile time. */
- if (reload_completed)
- {
- ix86_core2i7_init_hooks ();
- break;
- }
- /* Fall through. */
- default:
- targetm.sched.dfa_post_advance_cycle = NULL;
- targetm.sched.first_cycle_multipass_init = NULL;
- targetm.sched.first_cycle_multipass_begin = NULL;
- targetm.sched.first_cycle_multipass_issue = NULL;
- targetm.sched.first_cycle_multipass_backtrack = NULL;
- targetm.sched.first_cycle_multipass_end = NULL;
- targetm.sched.first_cycle_multipass_fini = NULL;
- break;
- }
-}
-
-\f
-/* Implement TARGET_STATIC_RTX_ALIGNMENT. */
-
-static HOST_WIDE_INT
-ix86_static_rtx_alignment (machine_mode mode)
-{
- if (mode == DFmode)
- return 64;
- if (ALIGN_MODE_128 (mode))
- return MAX (128, GET_MODE_ALIGNMENT (mode));
- return GET_MODE_ALIGNMENT (mode);
-}
-
-/* Implement TARGET_CONSTANT_ALIGNMENT. */
-
-static HOST_WIDE_INT
-ix86_constant_alignment (const_tree exp, HOST_WIDE_INT align)
-{
- if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
- || TREE_CODE (exp) == INTEGER_CST)
- {
- machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
- HOST_WIDE_INT mode_align = ix86_static_rtx_alignment (mode);
- return MAX (mode_align, align);
- }
- else if (!optimize_size && TREE_CODE (exp) == STRING_CST
- && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
- return BITS_PER_WORD;
-
- return align;
-}
-
-/* Implement TARGET_EMPTY_RECORD_P. */
-
-static bool
-ix86_is_empty_record (const_tree type)
-{
- if (!TARGET_64BIT)
- return false;
- return default_is_empty_record (type);
-}
-
-/* Implement TARGET_WARN_PARAMETER_PASSING_ABI. */
-
-static void
-ix86_warn_parameter_passing_abi (cumulative_args_t cum_v, tree type)
-{
- CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
-
- if (!cum->warn_empty)
- return;
-
- if (!TYPE_EMPTY_P (type))
- return;
-
- /* Don't warn if the function isn't visible outside of the TU. */
- if (cum->decl && !TREE_PUBLIC (cum->decl))
- return;
-
- const_tree ctx = get_ultimate_context (cum->decl);
- if (ctx != NULL_TREE
- && !TRANSLATION_UNIT_WARN_EMPTY_P (ctx))
- return;
-
- /* If the actual size of the type is zero, then there is no change
- in how objects of this size are passed. */
- if (int_size_in_bytes (type) == 0)
- return;
-
- warning (OPT_Wabi, "empty class %qT parameter passing ABI "
- "changes in %<-fabi-version=12%> (GCC 8)", type);
-
- /* Only warn once. */
- cum->warn_empty = false;
-}
-
-/* This hook returns the name of the multilib ABI. */
-
-static const char *
-ix86_get_multilib_abi_name (void)
-{
- if (!(TARGET_64BIT_P (ix86_isa_flags)))
- return "i386";
- else if (TARGET_X32_P (ix86_isa_flags))
- return "x32";
- else
- return "x86_64";
-}
-
-/* Compute the alignment for a variable for Intel MCU psABI. TYPE is
- the data type, and ALIGN is the alignment that the object would
- ordinarily have. */
-
-static int
-iamcu_alignment (tree type, int align)
-{
- machine_mode mode;
-
- if (align < 32 || TYPE_USER_ALIGN (type))
- return align;
-
- /* The Intel MCU psABI specifies that scalar types larger than 4 bytes
- are aligned to 4 bytes. */
- mode = TYPE_MODE (strip_array_types (type));
- switch (GET_MODE_CLASS (mode))
- {
- case MODE_INT:
- case MODE_COMPLEX_INT:
- case MODE_COMPLEX_FLOAT:
- case MODE_FLOAT:
- case MODE_DECIMAL_FLOAT:
- return 32;
- default:
- return align;
- }
-}
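-
-/* For illustration (derived from the cases above): under -miamcu a global
- 'double' or 'long long', which would ordinarily get 8-byte (64-bit)
- alignment, is capped at 4-byte (32-bit) alignment, as are arrays of such
- scalars; a type with user-specified alignment is left untouched. */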
-
-/* Compute the alignment for a static variable.
- TYPE is the data type, and ALIGN is the alignment that
- the object would ordinarily have. The value of this function is used
- instead of that alignment to align the object. */
-
-int
-ix86_data_alignment (tree type, unsigned int align, bool opt)
-{
- /* GCC 4.8 and earlier used to incorrectly assume this alignment even
- for symbols from other compilation units or symbols that don't need
- to bind locally. In order to preserve some ABI compatibility with
- those compilers, ensure we don't decrease alignment from what we
- used to assume. */
-
- unsigned int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
-
- /* A data structure equal to or larger than a cache line (64 bytes on
- the Pentium 4 and other recent Intel processors, including those based
- on the Intel Core microarchitecture) should be aligned so that its base
- address is a multiple of the cache line size. */
-
- unsigned int max_align
- = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
-
- if (max_align < BITS_PER_WORD)
- max_align = BITS_PER_WORD;
-
- switch (ix86_align_data_type)
- {
- case ix86_align_data_type_abi: opt = false; break;
- case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
- case ix86_align_data_type_cacheline: break;
- }
-
- if (TARGET_IAMCU)
- align = iamcu_alignment (type, align);
-
- if (opt
- && AGGREGATE_TYPE_P (type)
- && TYPE_SIZE (type)
- && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
- {
- if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align_compat)
- && align < max_align_compat)
- align = max_align_compat;
- if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align)
- && align < max_align)
- align = max_align;
- }
-
- /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
- to a 16-byte boundary. */
- if (TARGET_64BIT)
- {
- if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
- && TYPE_SIZE (type)
- && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
- && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
- && align < 128)
- return 128;
- }
-
- if (!opt)
- return align;
-
- if (TREE_CODE (type) == ARRAY_TYPE)
- {
- if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
- return 64;
- if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
- return 128;
- }
- else if (TREE_CODE (type) == COMPLEX_TYPE)
- {
- if (TYPE_MODE (type) == DCmode && align < 64)
- return 64;
- if ((TYPE_MODE (type) == XCmode
- || TYPE_MODE (type) == TCmode) && align < 128)
- return 128;
- }
- else if ((TREE_CODE (type) == RECORD_TYPE
- || TREE_CODE (type) == UNION_TYPE
- || TREE_CODE (type) == QUAL_UNION_TYPE)
- && TYPE_FIELDS (type))
- {
- if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
- return 64;
- if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
- return 128;
- }
- else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
- || TREE_CODE (type) == INTEGER_TYPE)
- {
- if (TYPE_MODE (type) == DFmode && align < 64)
- return 64;
- if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
- return 128;
- }
-
- return align;
-}
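-
-/* For illustration (derived from the rules above, OPT set): with the
- default -malign-data=compat a static aggregate of 32 bytes or more keeps
- at least 32-byte alignment (the GCC 4.8 compatibility rule); with
- -malign-data=cacheline an aggregate at least as large as the prefetch
- block is aligned to it (commonly 64 bytes); and on x86-64 an array of
- 16 bytes or more (of constant size) gets at least 16-byte alignment. */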
-
-/* Compute the alignment for a local variable or a stack slot. EXP is
- the data type or decl itself, MODE is the widest mode available and
- ALIGN is the alignment that the object would ordinarily have. The
- value of this macro is used instead of that alignment to align the
- object. */
-
-unsigned int
-ix86_local_alignment (tree exp, machine_mode mode,
- unsigned int align)
-{
- tree type, decl;
-
- if (exp && DECL_P (exp))
- {
- type = TREE_TYPE (exp);
- decl = exp;
- }
- else
- {
- type = exp;
- decl = NULL;
- }
-
- /* Don't do dynamic stack realignment for long long objects with
- -mpreferred-stack-boundary=2. */
- if (!TARGET_64BIT
- && align == 64
- && ix86_preferred_stack_boundary < 64
- && (mode == DImode || (type && TYPE_MODE (type) == DImode))
- && (!type || !TYPE_USER_ALIGN (type))
- && (!decl || !DECL_USER_ALIGN (decl)))
- align = 32;
-
- /* If TYPE is NULL, we are allocating a stack slot for a caller-save
- register in MODE. For an XFmode slot, return the larger of the
- XFmode and DFmode alignments. */
- if (!type)
- {
- if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
- align = GET_MODE_ALIGNMENT (DFmode);
- return align;
- }
-
- /* Don't increase alignment for Intel MCU psABI. */
- if (TARGET_IAMCU)
- return align;
-
- /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
- to a 16-byte boundary. The exact wording is:
-
- An array uses the same alignment as its elements, except that a local or
- global array variable of length at least 16 bytes or
- a C99 variable-length array variable always has alignment of at least 16 bytes.
-
- This was added to allow use of aligned SSE instructions on arrays. The
- rule is meant for static storage (where the compiler cannot do the
- analysis by itself). We follow it for automatic variables only when
- convenient. We fully control everything in the function being compiled,
- and functions from other units cannot rely on the alignment.
-
- Exclude the va_list type. It is the common case of a local array where
- we cannot benefit from the alignment.
-
- TODO: Probably one should optimize for size only when var is not escaping. */
- if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
- && TARGET_SSE)
- {
- if (AGGREGATE_TYPE_P (type)
- && (va_list_type_node == NULL_TREE
- || (TYPE_MAIN_VARIANT (type)
- != TYPE_MAIN_VARIANT (va_list_type_node)))
- && TYPE_SIZE (type)
- && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
- && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
- && align < 128)
- return 128;
- }
- if (TREE_CODE (type) == ARRAY_TYPE)
- {
- if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
- return 64;
- if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
- return 128;
- }
- else if (TREE_CODE (type) == COMPLEX_TYPE)
- {
- if (TYPE_MODE (type) == DCmode && align < 64)
- return 64;
- if ((TYPE_MODE (type) == XCmode
- || TYPE_MODE (type) == TCmode) && align < 128)
- return 128;
- }
- else if ((TREE_CODE (type) == RECORD_TYPE
- || TREE_CODE (type) == UNION_TYPE
- || TREE_CODE (type) == QUAL_UNION_TYPE)
- && TYPE_FIELDS (type))
- {
- if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
- return 64;
- if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
- return 128;
- }
- else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
- || TREE_CODE (type) == INTEGER_TYPE)
- {
- if (TYPE_MODE (type) == DFmode && align < 64)
- return 64;
- if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
- return 128;
- }
- return align;
-}
-
-/* Compute the minimum required alignment for dynamic stack realignment
- purposes for a local variable, parameter or a stack slot. EXP is
- the data type or decl itself, MODE is its mode and ALIGN is the
- alignment that the object would ordinarily have. */
-
-unsigned int
-ix86_minimum_alignment (tree exp, machine_mode mode,
- unsigned int align)
-{
- tree type, decl;
-
- if (exp && DECL_P (exp))
- {
- type = TREE_TYPE (exp);
- decl = exp;
- }
- else
- {
- type = exp;
- decl = NULL;
- }
-
- if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
- return align;
-
- /* Don't do dynamic stack realignment for long long objects with
- -mpreferred-stack-boundary=2. */
- if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
- && (!type || !TYPE_USER_ALIGN (type))
- && (!decl || !DECL_USER_ALIGN (decl)))
- {
- gcc_checking_assert (!TARGET_STV);
- return 32;
- }
-
- return align;
-}
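-
-/* For example, with -m32 -mpreferred-stack-boundary=2 a local 'long long'
- (DImode, no user-specified alignment) reports a minimum alignment of
- 32 bits here, so on its own it does not force dynamic stack realignment. */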
-\f
-/* Find a location for the static chain incoming to a nested function.
- This is a register, unless all free registers are used by arguments. */
-
-static rtx
-ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
-{
- unsigned regno;
-
- if (TARGET_64BIT)
- {
- /* We always use R10 in 64-bit mode. */
- regno = R10_REG;
- }
- else
- {
- const_tree fntype, fndecl;
- unsigned int ccvt;
-
- /* By default in 32-bit mode we use ECX to pass the static chain. */
- regno = CX_REG;
-
- if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
- {
- fntype = TREE_TYPE (fndecl_or_type);
- fndecl = fndecl_or_type;
- }
- else
- {
- fntype = fndecl_or_type;
- fndecl = NULL;
- }
-
- ccvt = ix86_get_callcvt (fntype);
- if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
- {
- /* Fastcall functions use ecx/edx for arguments, which leaves
- us with EAX for the static chain.
- Thiscall functions use ecx for arguments, which also
- leaves us with EAX for the static chain. */
- regno = AX_REG;
- }
- else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
- {
- /* Thiscall functions use ecx for arguments, which leaves
- us with EAX and EDX for the static chain.
- For ABI compatibility we use EAX. */
- regno = AX_REG;
- }
- else if (ix86_function_regparm (fntype, fndecl) == 3)
- {
- /* For regparm 3, we have no free call-clobbered registers in
- which to store the static chain. In order to implement this,
- we have the trampoline push the static chain to the stack.
- However, we can't push a value below the return address when
- we call the nested function directly, so we have to use an
- alternate entry point. For this we use ESI, and have the
- alternate entry point push ESI, so that things appear the
- same once we're executing the nested function. */
- if (incoming_p)
- {
- if (fndecl == current_function_decl
- && !ix86_static_chain_on_stack)
- {
- gcc_assert (!reload_completed);
- ix86_static_chain_on_stack = true;
- }
- return gen_frame_mem (SImode,
- plus_constant (Pmode,
- arg_pointer_rtx, -8));
- }
- regno = SI_REG;
- }
- }
-
- return gen_rtx_REG (Pmode, regno);
-}
-
-/* Emit RTL insns to initialize the variable parts of a trampoline.
- FNDECL is the decl of the target address; M_TRAMP is a MEM for
- the trampoline, and CHAIN_VALUE is an RTX for the static chain
- to be passed to the target function. */
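-/* For reference (derived from the opcode constants stored below), the
- emitted trampoline is roughly:
-
- 64-bit: [endbr64] movl/movabs $FNADDR, %r11; movl/movabs $CHAIN, %r10;
- jmp *%r11; nop
- 32-bit: [endbr32] movl $CHAIN, %eax/%ecx or pushl $CHAIN; jmp FNADDR
-
- The ENDBR is only present when -fcf-protection enables branch tracking. */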
-
-static void
-ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
-{
- rtx mem, fnaddr;
- int opcode;
- int offset = 0;
- bool need_endbr = (flag_cf_protection & CF_BRANCH);
-
- fnaddr = XEXP (DECL_RTL (fndecl), 0);
-
- if (TARGET_64BIT)
- {
- int size;
-
- if (need_endbr)
- {
- /* Insert ENDBR64. */
- mem = adjust_address (m_tramp, SImode, offset);
- emit_move_insn (mem, gen_int_mode (0xfa1e0ff3, SImode));
- offset += 4;
- }
-
- /* Load the function address into r11. Try to load the address using
- the shorter movl instead of movabs. We may want to support movq for
- kernel mode, but the kernel does not use trampolines at the moment.
- FNADDR is a 32-bit address and may not be in DImode when
- ptr_mode == SImode. Always use movl in this case. */
- if (ptr_mode == SImode
- || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
- {
- fnaddr = copy_addr_to_reg (fnaddr);
-
- mem = adjust_address (m_tramp, HImode, offset);
- emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
-
- mem = adjust_address (m_tramp, SImode, offset + 2);
- emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
- offset += 6;
- }
- else
- {
- mem = adjust_address (m_tramp, HImode, offset);
- emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
-
- mem = adjust_address (m_tramp, DImode, offset + 2);
- emit_move_insn (mem, fnaddr);
- offset += 10;
- }
-
- /* Load the static chain into r10 using movabs. Use the shorter movl
- instead of movabs when ptr_mode == SImode. */
- if (ptr_mode == SImode)
- {
- opcode = 0xba41;
- size = 6;
- }
- else
- {
- opcode = 0xba49;
- size = 10;
- }
-
- mem = adjust_address (m_tramp, HImode, offset);
- emit_move_insn (mem, gen_int_mode (opcode, HImode));
-
- mem = adjust_address (m_tramp, ptr_mode, offset + 2);
- emit_move_insn (mem, chain_value);
- offset += size;
-
- /* Jump to r11; the last (unused) byte is a nop, only there to
- pad the write out to a single 32-bit store. */
- mem = adjust_address (m_tramp, SImode, offset);
- emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
- offset += 4;
- }
- else
- {
- rtx disp, chain;
-
- /* Depending on the static chain location, either load a register
- with a constant, or push the constant to the stack. All of the
- instructions are the same size. */
- chain = ix86_static_chain (fndecl, true);
- if (REG_P (chain))
- {
- switch (REGNO (chain))
- {
- case AX_REG:
- opcode = 0xb8; break;
- case CX_REG:
- opcode = 0xb9; break;
- default:
- gcc_unreachable ();
- }
- }
- else
- opcode = 0x68;
-
- if (need_endbr)
- {
- /* Insert ENDBR32. */
- mem = adjust_address (m_tramp, SImode, offset);
- emit_move_insn (mem, gen_int_mode (0xfb1e0ff3, SImode));
- offset += 4;
- }
-
- mem = adjust_address (m_tramp, QImode, offset);
- emit_move_insn (mem, gen_int_mode (opcode, QImode));
-
- mem = adjust_address (m_tramp, SImode, offset + 1);
- emit_move_insn (mem, chain_value);
- offset += 5;
-
- mem = adjust_address (m_tramp, QImode, offset);
- emit_move_insn (mem, gen_int_mode (0xe9, QImode));
-
- mem = adjust_address (m_tramp, SImode, offset + 1);
-
- /* Compute the offset from the end of the jmp to the target function.
- In the case in which the trampoline stores the static chain on
- the stack, we need to skip the first insn, which pushes the
- (call-saved) register holding the static chain; this push is 1 byte. */
- offset += 5;
- disp = expand_binop (SImode, sub_optab, fnaddr,
- plus_constant (Pmode, XEXP (m_tramp, 0),
- offset - (MEM_P (chain) ? 1 : 0)),
- NULL_RTX, 1, OPTAB_DIRECT);
- emit_move_insn (mem, disp);
- }
-
- gcc_assert (offset <= TRAMPOLINE_SIZE);
-
-#ifdef HAVE_ENABLE_EXECUTE_STACK
-#ifdef CHECK_EXECUTE_STACK_ENABLED
- if (CHECK_EXECUTE_STACK_ENABLED)
-#endif
- emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
- LCT_NORMAL, VOIDmode, XEXP (m_tramp, 0), Pmode);
-#endif
-}
-
-static bool
-ix86_allocate_stack_slots_for_args (void)
-{
- /* Naked functions should not allocate stack slots for arguments. */
- return !ix86_function_naked (current_function_decl);
-}
-
-static bool
-ix86_warn_func_return (tree decl)
-{
- /* Naked functions are implemented entirely in assembly, including the
- return sequence, so suppress warnings about this. */
- return !ix86_function_naked (decl);
-}
-\f
-/* The following file contains several enumerations and data structures
- built from the definitions in i386-builtin-types.def. */
-
-#include "i386-builtin-types.inc"
-
-/* Table for the ix86 builtin non-function types. */
-static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
-
-/* Retrieve an element from the above table, building some of
- the types lazily. */
-
-static tree
-ix86_get_builtin_type (enum ix86_builtin_type tcode)
-{
- unsigned int index;
- tree type, itype;
-
- gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
-
- type = ix86_builtin_type_tab[(int) tcode];
- if (type != NULL)
- return type;
-
- gcc_assert (tcode > IX86_BT_LAST_PRIM);
- if (tcode <= IX86_BT_LAST_VECT)
- {
- machine_mode mode;
-
- index = tcode - IX86_BT_LAST_PRIM - 1;
- itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
- mode = ix86_builtin_type_vect_mode[index];
-
- type = build_vector_type_for_mode (itype, mode);
- }
- else
- {
- int quals;
-
- index = tcode - IX86_BT_LAST_VECT - 1;
- if (tcode <= IX86_BT_LAST_PTR)
- quals = TYPE_UNQUALIFIED;
- else
- quals = TYPE_QUAL_CONST;
-
- itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
- if (quals != TYPE_UNQUALIFIED)
- itype = build_qualified_type (itype, quals);
-
- type = build_pointer_type (itype);
- }
-
- ix86_builtin_type_tab[(int) tcode] = type;
- return type;
-}
-
-/* Table for the ix86 builtin function types. */
-static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
-
-/* Retrieve an element from the above table, building some of
- the types lazily. */
-
-static tree
-ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
-{
- tree type;
-
- gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
-
- type = ix86_builtin_func_type_tab[(int) tcode];
- if (type != NULL)
- return type;
-
- if (tcode <= IX86_BT_LAST_FUNC)
- {
- unsigned start = ix86_builtin_func_start[(int) tcode];
- unsigned after = ix86_builtin_func_start[(int) tcode + 1];
- tree rtype, atype, args = void_list_node;
- unsigned i;
-
- rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
- for (i = after - 1; i > start; --i)
- {
- atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
- args = tree_cons (NULL, atype, args);
- }
-
- type = build_function_type (rtype, args);
- }
- else
- {
- unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
- enum ix86_builtin_func_type icode;
-
- icode = ix86_builtin_func_alias_base[index];
- type = ix86_get_builtin_func_type (icode);
- }
-
- ix86_builtin_func_type_tab[(int) tcode] = type;
- return type;
-}
-
-
-/* Codes for all the SSE/MMX builtins. Builtins not mentioned in any
- bdesc_* arrays below should come first, then builtins for each bdesc_*
- array in ascending order, so that we can use direct array accesses. */
-enum ix86_builtins
-{
- IX86_BUILTIN_MASKMOVQ,
- IX86_BUILTIN_LDMXCSR,
- IX86_BUILTIN_STMXCSR,
- IX86_BUILTIN_MASKMOVDQU,
- IX86_BUILTIN_PSLLDQ128,
- IX86_BUILTIN_CLFLUSH,
- IX86_BUILTIN_MONITOR,
- IX86_BUILTIN_MWAIT,
- IX86_BUILTIN_UMONITOR,
- IX86_BUILTIN_UMWAIT,
- IX86_BUILTIN_TPAUSE,
- IX86_BUILTIN_CLZERO,
- IX86_BUILTIN_CLDEMOTE,
- IX86_BUILTIN_VEC_INIT_V2SI,
- IX86_BUILTIN_VEC_INIT_V4HI,
- IX86_BUILTIN_VEC_INIT_V8QI,
- IX86_BUILTIN_VEC_EXT_V2DF,
- IX86_BUILTIN_VEC_EXT_V2DI,
- IX86_BUILTIN_VEC_EXT_V4SF,
- IX86_BUILTIN_VEC_EXT_V4SI,
- IX86_BUILTIN_VEC_EXT_V8HI,
- IX86_BUILTIN_VEC_EXT_V2SI,
- IX86_BUILTIN_VEC_EXT_V4HI,
- IX86_BUILTIN_VEC_EXT_V16QI,
- IX86_BUILTIN_VEC_SET_V2DI,
- IX86_BUILTIN_VEC_SET_V4SF,
- IX86_BUILTIN_VEC_SET_V4SI,
- IX86_BUILTIN_VEC_SET_V8HI,
- IX86_BUILTIN_VEC_SET_V4HI,
- IX86_BUILTIN_VEC_SET_V16QI,
- IX86_BUILTIN_GATHERSIV2DF,
- IX86_BUILTIN_GATHERSIV4DF,
- IX86_BUILTIN_GATHERDIV2DF,
- IX86_BUILTIN_GATHERDIV4DF,
- IX86_BUILTIN_GATHERSIV4SF,
- IX86_BUILTIN_GATHERSIV8SF,
- IX86_BUILTIN_GATHERDIV4SF,
- IX86_BUILTIN_GATHERDIV8SF,
- IX86_BUILTIN_GATHERSIV2DI,
- IX86_BUILTIN_GATHERSIV4DI,
- IX86_BUILTIN_GATHERDIV2DI,
- IX86_BUILTIN_GATHERDIV4DI,
- IX86_BUILTIN_GATHERSIV4SI,
- IX86_BUILTIN_GATHERSIV8SI,
- IX86_BUILTIN_GATHERDIV4SI,
- IX86_BUILTIN_GATHERDIV8SI,
- IX86_BUILTIN_GATHER3SIV8SF,
- IX86_BUILTIN_GATHER3SIV4SF,
- IX86_BUILTIN_GATHER3SIV4DF,
- IX86_BUILTIN_GATHER3SIV2DF,
- IX86_BUILTIN_GATHER3DIV8SF,
- IX86_BUILTIN_GATHER3DIV4SF,
- IX86_BUILTIN_GATHER3DIV4DF,
- IX86_BUILTIN_GATHER3DIV2DF,
- IX86_BUILTIN_GATHER3SIV8SI,
- IX86_BUILTIN_GATHER3SIV4SI,
- IX86_BUILTIN_GATHER3SIV4DI,
- IX86_BUILTIN_GATHER3SIV2DI,
- IX86_BUILTIN_GATHER3DIV8SI,
- IX86_BUILTIN_GATHER3DIV4SI,
- IX86_BUILTIN_GATHER3DIV4DI,
- IX86_BUILTIN_GATHER3DIV2DI,
- IX86_BUILTIN_SCATTERSIV8SF,
- IX86_BUILTIN_SCATTERSIV4SF,
- IX86_BUILTIN_SCATTERSIV4DF,
- IX86_BUILTIN_SCATTERSIV2DF,
- IX86_BUILTIN_SCATTERDIV8SF,
- IX86_BUILTIN_SCATTERDIV4SF,
- IX86_BUILTIN_SCATTERDIV4DF,
- IX86_BUILTIN_SCATTERDIV2DF,
- IX86_BUILTIN_SCATTERSIV8SI,
- IX86_BUILTIN_SCATTERSIV4SI,
- IX86_BUILTIN_SCATTERSIV4DI,
- IX86_BUILTIN_SCATTERSIV2DI,
- IX86_BUILTIN_SCATTERDIV8SI,
- IX86_BUILTIN_SCATTERDIV4SI,
- IX86_BUILTIN_SCATTERDIV4DI,
- IX86_BUILTIN_SCATTERDIV2DI,
- /* Alternate 4 and 8 element gather/scatter for the vectorizer
- where all operands are 32-byte or 64-byte wide respectively. */
- IX86_BUILTIN_GATHERALTSIV4DF,
- IX86_BUILTIN_GATHERALTDIV8SF,
- IX86_BUILTIN_GATHERALTSIV4DI,
- IX86_BUILTIN_GATHERALTDIV8SI,
- IX86_BUILTIN_GATHER3ALTDIV16SF,
- IX86_BUILTIN_GATHER3ALTDIV16SI,
- IX86_BUILTIN_GATHER3ALTSIV4DF,
- IX86_BUILTIN_GATHER3ALTDIV8SF,
- IX86_BUILTIN_GATHER3ALTSIV4DI,
- IX86_BUILTIN_GATHER3ALTDIV8SI,
- IX86_BUILTIN_GATHER3ALTSIV8DF,
- IX86_BUILTIN_GATHER3ALTSIV8DI,
- IX86_BUILTIN_GATHER3DIV16SF,
- IX86_BUILTIN_GATHER3DIV16SI,
- IX86_BUILTIN_GATHER3DIV8DF,
- IX86_BUILTIN_GATHER3DIV8DI,
- IX86_BUILTIN_GATHER3SIV16SF,
- IX86_BUILTIN_GATHER3SIV16SI,
- IX86_BUILTIN_GATHER3SIV8DF,
- IX86_BUILTIN_GATHER3SIV8DI,
- IX86_BUILTIN_SCATTERALTSIV8DF,
- IX86_BUILTIN_SCATTERALTDIV16SF,
- IX86_BUILTIN_SCATTERALTSIV8DI,
- IX86_BUILTIN_SCATTERALTDIV16SI,
- IX86_BUILTIN_SCATTERALTSIV4DF,
- IX86_BUILTIN_SCATTERALTDIV8SF,
- IX86_BUILTIN_SCATTERALTSIV4DI,
- IX86_BUILTIN_SCATTERALTDIV8SI,
- IX86_BUILTIN_SCATTERALTSIV2DF,
- IX86_BUILTIN_SCATTERALTDIV4SF,
- IX86_BUILTIN_SCATTERALTSIV2DI,
- IX86_BUILTIN_SCATTERALTDIV4SI,
- IX86_BUILTIN_SCATTERDIV16SF,
- IX86_BUILTIN_SCATTERDIV16SI,
- IX86_BUILTIN_SCATTERDIV8DF,
- IX86_BUILTIN_SCATTERDIV8DI,
- IX86_BUILTIN_SCATTERSIV16SF,
- IX86_BUILTIN_SCATTERSIV16SI,
- IX86_BUILTIN_SCATTERSIV8DF,
- IX86_BUILTIN_SCATTERSIV8DI,
- IX86_BUILTIN_GATHERPFQPD,
- IX86_BUILTIN_GATHERPFDPS,
- IX86_BUILTIN_GATHERPFDPD,
- IX86_BUILTIN_GATHERPFQPS,
- IX86_BUILTIN_SCATTERPFDPD,
- IX86_BUILTIN_SCATTERPFDPS,
- IX86_BUILTIN_SCATTERPFQPD,
- IX86_BUILTIN_SCATTERPFQPS,
- IX86_BUILTIN_CLWB,
- IX86_BUILTIN_CLFLUSHOPT,
- IX86_BUILTIN_INFQ,
- IX86_BUILTIN_HUGE_VALQ,
- IX86_BUILTIN_NANQ,
- IX86_BUILTIN_NANSQ,
- IX86_BUILTIN_XABORT,
- IX86_BUILTIN_ADDCARRYX32,
- IX86_BUILTIN_ADDCARRYX64,
- IX86_BUILTIN_SBB32,
- IX86_BUILTIN_SBB64,
- IX86_BUILTIN_RDRAND16_STEP,
- IX86_BUILTIN_RDRAND32_STEP,
- IX86_BUILTIN_RDRAND64_STEP,
- IX86_BUILTIN_RDSEED16_STEP,
- IX86_BUILTIN_RDSEED32_STEP,
- IX86_BUILTIN_RDSEED64_STEP,
- IX86_BUILTIN_MONITORX,
- IX86_BUILTIN_MWAITX,
- IX86_BUILTIN_CFSTRING,
- IX86_BUILTIN_CPU_INIT,
- IX86_BUILTIN_CPU_IS,
- IX86_BUILTIN_CPU_SUPPORTS,
- IX86_BUILTIN_READ_FLAGS,
- IX86_BUILTIN_WRITE_FLAGS,
-
- /* All the remaining builtins are tracked in bdesc_* arrays in
- i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after
- this point. */
-#define BDESC(mask, mask2, icode, name, code, comparison, flag) \
- code,
-#define BDESC_FIRST(kind, kindu, mask, mask2, icode, name, code, comparison, flag) \
- code, \
- IX86_BUILTIN__BDESC_##kindu##_FIRST = code,
-#define BDESC_END(kind, next_kind)
-
-#include "i386-builtin.def"
-
-#undef BDESC
-#undef BDESC_FIRST
-#undef BDESC_END
-
- IX86_BUILTIN_MAX,
-
- IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX,
-
- /* Now just the aliases for bdesc_* start/end. */
-#define BDESC(mask, mask2, icode, name, code, comparison, flag)
-#define BDESC_FIRST(kind, kindu, mask, mask2, icode, name, code, comparison, flag)
-#define BDESC_END(kind, next_kind) \
- IX86_BUILTIN__BDESC_##kind##_LAST \
- = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1,
-
-#include "i386-builtin.def"
-
-#undef BDESC
-#undef BDESC_FIRST
-#undef BDESC_END
-
- /* Just to make sure there is no comma after the last enumerator. */
- IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST
-};
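-
-/* As an illustration (IX86_BUILTIN_FOO and "foo" are hypothetical), an entry
- BDESC_FIRST (args, ARGS, mask, mask2, icode, "foo", IX86_BUILTIN_FOO, cmp, flag)
- in i386-builtin.def expands inside the enum above to
- IX86_BUILTIN_FOO, IX86_BUILTIN__BDESC_ARGS_FIRST = IX86_BUILTIN_FOO,
- while the second inclusion (still inside the enum) only defines the
- matching IX86_BUILTIN__BDESC_ARGS_LAST alias via BDESC_END. */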
-
-/* Table for the ix86 builtin decls. */
-static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
-
-/* Table of all the builtin functions that are possible with different ISAs
- but are waiting to be built until a function is declared to use that
- ISA. */
-struct builtin_isa {
- HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
- HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */
- const char *name; /* function name */
- enum ix86_builtin_func_type tcode; /* type to use in the declaration */
- unsigned char const_p:1; /* true if the declaration is constant */
- unsigned char pure_p:1; /* true if the declaration has pure attribute */
- bool set_and_not_built_p;
-};
-
-static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
-
-/* Bits that can still enable the inclusion of a deferred builtin. */
-static HOST_WIDE_INT deferred_isa_values = 0;
-static HOST_WIDE_INT deferred_isa_values2 = 0;
-
-/* Add an ix86 target builtin function with CODE, NAME and TYPE. Save
- MASK and MASK2, the isa_flags and ix86_isa_flags2 bits it requires, in
- the ix86_builtins_isa array. Store the function decl in the ix86_builtins
- array. Return the function decl, or NULL_TREE if the builtin was not
- added.
-
- If the front end has a special hook for builtin functions, delay adding
- builtin functions that aren't in the current ISA until the ISA is changed
- with function specific optimization. Doing so can save about 300K for the
- default compiler. When the builtin is expanded, check at that time whether
- it is valid.
-
- If the front end doesn't have a special hook, record all builtins, even
- those that aren't in the current ISA, in case the user uses function
- specific options for a different ISA, so that we don't get scope errors
- if a builtin is added in the middle of a function scope. */
-
-static inline tree
-def_builtin (HOST_WIDE_INT mask, HOST_WIDE_INT mask2,
- const char *name,
- enum ix86_builtin_func_type tcode,
- enum ix86_builtins code)
-{
- tree decl = NULL_TREE;
-
- /* An instruction may be 64-bit only, regardless of ISAs. */
- if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
- {
- ix86_builtins_isa[(int) code].isa = mask;
- ix86_builtins_isa[(int) code].isa2 = mask2;
-
- mask &= ~OPTION_MASK_ISA_64BIT;
-
- /* Filter out the masks most often ORed together with others. */
- if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
- && mask != OPTION_MASK_ISA_AVX512VL)
- mask &= ~OPTION_MASK_ISA_AVX512VL;
- if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512BW)
- && mask != OPTION_MASK_ISA_AVX512BW)
- mask &= ~OPTION_MASK_ISA_AVX512BW;
-
- if (((mask2 == 0 || (mask2 & ix86_isa_flags2) != 0)
- && (mask == 0 || (mask & ix86_isa_flags) != 0))
- || (lang_hooks.builtin_function
- == lang_hooks.builtin_function_ext_scope))
- {
- tree type = ix86_get_builtin_func_type (tcode);
- decl = add_builtin_function (name, type, code, BUILT_IN_MD,
- NULL, NULL_TREE);
- ix86_builtins[(int) code] = decl;
- ix86_builtins_isa[(int) code].set_and_not_built_p = false;
- }
- else
- {
- /* Only the MASK and MASK2 bits of builtins with
- set_and_not_built_p == true can potentially enable a builtin later. */
- deferred_isa_values |= mask;
- deferred_isa_values2 |= mask2;
- ix86_builtins[(int) code] = NULL_TREE;
- ix86_builtins_isa[(int) code].tcode = tcode;
- ix86_builtins_isa[(int) code].name = name;
- ix86_builtins_isa[(int) code].const_p = false;
- ix86_builtins_isa[(int) code].pure_p = false;
- ix86_builtins_isa[(int) code].set_and_not_built_p = true;
- }
- }
-
- return decl;
-}
-
-/* Like def_builtin, but also marks the function decl "const". */
-
-static inline tree
-def_builtin_const (HOST_WIDE_INT mask, HOST_WIDE_INT mask2, const char *name,
- enum ix86_builtin_func_type tcode, enum ix86_builtins code)
-{
- tree decl = def_builtin (mask, mask2, name, tcode, code);
- if (decl)
- TREE_READONLY (decl) = 1;
- else
- ix86_builtins_isa[(int) code].const_p = true;
-
- return decl;
-}
-
-/* Like def_builtin, but also marks the function decl "pure". */
-
-static inline tree
-def_builtin_pure (HOST_WIDE_INT mask, HOST_WIDE_INT mask2, const char *name,
- enum ix86_builtin_func_type tcode, enum ix86_builtins code)
-{
- tree decl = def_builtin (mask, mask2, name, tcode, code);
- if (decl)
- DECL_PURE_P (decl) = 1;
- else
- ix86_builtins_isa[(int) code].pure_p = true;
-
- return decl;
-}
-
-/* Add any new builtin functions for a given ISA that may not have been
- declared. This saves a bit of space compared to adding all of the
- declarations to the tree, even if we didn't use them. */
-
-static void
-ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2)
-{
- isa &= ~OPTION_MASK_ISA_64BIT;
-
- if ((isa & deferred_isa_values) == 0
- && (isa2 & deferred_isa_values2) == 0)
- return;
-
- /* The bits in ISA and ISA2 are handled now, so remove them from the
- deferred isa values. */
- deferred_isa_values &= ~isa;
- deferred_isa_values2 &= ~isa2;
-
- int i;
- tree saved_current_target_pragma = current_target_pragma;
- current_target_pragma = NULL_TREE;
-
- for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
- {
- if (((ix86_builtins_isa[i].isa & isa) != 0
- || (ix86_builtins_isa[i].isa2 & isa2) != 0)
- && ix86_builtins_isa[i].set_and_not_built_p)
- {
- tree decl, type;
-
- /* Don't define the builtin again. */
- ix86_builtins_isa[i].set_and_not_built_p = false;
-
- type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
- decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
- type, i, BUILT_IN_MD, NULL,
- NULL_TREE);
-
- ix86_builtins[i] = decl;
- if (ix86_builtins_isa[i].const_p)
- TREE_READONLY (decl) = 1;
- }
- }
-
- current_target_pragma = saved_current_target_pragma;
-}
-
-/* Bits for builtin_description.flag. */
-
-/* Set when we don't support the comparison natively, and should
- swap_comparison in order to support it. */
-#define BUILTIN_DESC_SWAP_OPERANDS 1
-
-struct builtin_description
-{
- const HOST_WIDE_INT mask;
- const HOST_WIDE_INT mask2;
- const enum insn_code icode;
- const char *const name;
- const enum ix86_builtins code;
- const enum rtx_code comparison;
- const int flag;
-};
-
-#define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
-#define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
-#define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
-#define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
-#define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
-#define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
-#define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
-#define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
-#define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
-#define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
-#define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
-#define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
-#define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
-#define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
-#define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
-#define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
-#define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
-#define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
-#define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
-#define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
-#define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
-#define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
-#define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
-#define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
-#define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
-#define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
-#define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
-#define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
-#define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
-#define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
-#define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
-#define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
-#define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
-#define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
-#define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
-#define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
-#define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
-#define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
-#define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
-#define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
-#define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
-#define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
-#define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
-#define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
-#define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
-#define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
-#define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
-#define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
-#define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
-#define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
-#define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
-#define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
-
-#define BDESC(mask, mask2, icode, name, code, comparison, flag) \
- { mask, mask2, icode, name, code, comparison, flag },
-#define BDESC_FIRST(kind, kindu, mask, mask2, icode, name, code, comparison, flag) \
-static const struct builtin_description bdesc_##kind[] = \
-{ \
- BDESC (mask, mask2, icode, name, code, comparison, flag)
-#define BDESC_END(kind, next_kind) \
-};
-
-#include "i386-builtin.def"
-
-#undef BDESC
-#undef BDESC_FIRST
-#undef BDESC_END
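-
-/* For the same hypothetical entry, the inclusion of i386-builtin.def just
- above expands to
- static const struct builtin_description bdesc_args[] =
- { { mask, mask2, icode, "foo", IX86_BUILTIN_FOO, cmp, flag }, ... };
- i.e. one initialized bdesc_<kind> array per BDESC_FIRST/BDESC_END pair. */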
-
-\f
-/* TM vector builtins. */
-
-/* Reuse the existing x86-specific `struct builtin_description' because
- we're lazy. Add casts to make them fit. */
-static const struct builtin_description bdesc_tm[] =
-{
- { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
- { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
- { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
- { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
- { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
- { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
- { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
-
- { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
- { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
- { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
- { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
- { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
- { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
- { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
-
- { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
- { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
- { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
- { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
- { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
- { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
- { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
-
- { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
- { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
- { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
-};
-
-/* Initialize the transactional memory vector load/store builtins. */
-
-static void
-ix86_init_tm_builtins (void)
-{
- enum ix86_builtin_func_type ftype;
- const struct builtin_description *d;
- size_t i;
- tree decl;
- tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
- tree attrs_log, attrs_type_log;
-
- if (!flag_tm)
- return;
-
- /* If there are no builtins defined, we must be compiling in a
- language without trans-mem support. */
- if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
- return;
-
- /* Use whatever attributes a normal TM load has. */
- decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
- attrs_load = DECL_ATTRIBUTES (decl);
- attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
- /* Use whatever attributes a normal TM store has. */
- decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
- attrs_store = DECL_ATTRIBUTES (decl);
- attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
- /* Use whatever attributes a normal TM log has. */
- decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
- attrs_log = DECL_ATTRIBUTES (decl);
- attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
-
- for (i = 0, d = bdesc_tm;
- i < ARRAY_SIZE (bdesc_tm);
- i++, d++)
- {
- if ((d->mask & ix86_isa_flags) != 0
- || (lang_hooks.builtin_function
- == lang_hooks.builtin_function_ext_scope))
- {
- tree type, attrs, attrs_type;
- enum built_in_function code = (enum built_in_function) d->code;
-
- ftype = (enum ix86_builtin_func_type) d->flag;
- type = ix86_get_builtin_func_type (ftype);
-
- if (BUILTIN_TM_LOAD_P (code))
- {
- attrs = attrs_load;
- attrs_type = attrs_type_load;
- }
- else if (BUILTIN_TM_STORE_P (code))
- {
- attrs = attrs_store;
- attrs_type = attrs_type_store;
- }
- else
- {
- attrs = attrs_log;
- attrs_type = attrs_type_log;
- }
- decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
- /* The builtin without the prefix for
- calling it directly. */
- d->name + strlen ("__builtin_"),
- attrs);
- /* add_builtin_function() will set the DECL_ATTRIBUTES, now
- set the TYPE_ATTRIBUTES. */
- decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
-
- set_builtin_decl (code, decl, false);
- }
- }
-}
-
-/* Macros for verification of enum ix86_builtins order. */
-#define BDESC_VERIFY(x, y, z) \
- gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z)))
-#define BDESC_VERIFYS(x, y, z) \
- STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z)))
-
-BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
- IX86_BUILTIN__BDESC_COMI_LAST, 1);
-BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
- IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1);
-BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
- IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1);
-BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST,
- IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
-BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
- IX86_BUILTIN__BDESC_ARGS_LAST, 1);
-BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
- IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
-BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_FIRST,
- IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
-BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
- IX86_BUILTIN__BDESC_CET_LAST, 1);
-BDESC_VERIFYS (IX86_BUILTIN_MAX,
- IX86_BUILTIN__BDESC_CET_NORMAL_LAST, 1);
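-
-/* The static asserts above pin down the layout that the direct array
- accesses rely on: for example, the first PCMPESTR builtin code must come
- immediately after the last COMI builtin code, and IX86_BUILTIN_MAX must
- immediately follow the last CET_NORMAL builtin. */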
-
-/* Set up all the MMX/SSE builtins, even builtins for instructions that are
- not in the current target ISA, to allow the user to compile particular
- modules with target specific options that differ from the command line
- options. */
-static void
-ix86_init_mmx_sse_builtins (void)
-{
- const struct builtin_description * d;
- enum ix86_builtin_func_type ftype;
- size_t i;
-
- /* Add all special builtins with variable number of operands. */
- for (i = 0, d = bdesc_special_args;
- i < ARRAY_SIZE (bdesc_special_args);
- i++, d++)
- {
- BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i);
- if (d->name == 0)
- continue;
-
- ftype = (enum ix86_builtin_func_type) d->flag;
- def_builtin (d->mask, d->mask2, d->name, ftype, d->code);
- }
- BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST,
- IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
- ARRAY_SIZE (bdesc_special_args) - 1);
-
- /* Add all builtins with variable number of operands. */
- for (i = 0, d = bdesc_args;
- i < ARRAY_SIZE (bdesc_args);
- i++, d++)
- {
- BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i);
- if (d->name == 0)
- continue;
-
- ftype = (enum ix86_builtin_func_type) d->flag;
- def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code);
- }
- BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST,
- IX86_BUILTIN__BDESC_ARGS_FIRST,
- ARRAY_SIZE (bdesc_args) - 1);
-
- /* Add all builtins with rounding. */
- for (i = 0, d = bdesc_round_args;
- i < ARRAY_SIZE (bdesc_round_args);
- i++, d++)
- {
- BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i);
- if (d->name == 0)
- continue;
-
- ftype = (enum ix86_builtin_func_type) d->flag;
- def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code);
- }
- BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST,
- IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
- ARRAY_SIZE (bdesc_round_args) - 1);
-
- /* pcmpestr[im] insns. */
- for (i = 0, d = bdesc_pcmpestr;
- i < ARRAY_SIZE (bdesc_pcmpestr);
- i++, d++)
- {
- BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i);
- if (d->code == IX86_BUILTIN_PCMPESTRM128)
- ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
- else
- ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
- def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code);
- }
- BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST,
- IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
- ARRAY_SIZE (bdesc_pcmpestr) - 1);
-
- /* pcmpistr[im] insns. */
- for (i = 0, d = bdesc_pcmpistr;
- i < ARRAY_SIZE (bdesc_pcmpistr);
- i++, d++)
- {
- BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i);
- if (d->code == IX86_BUILTIN_PCMPISTRM128)
- ftype = V16QI_FTYPE_V16QI_V16QI_INT;
- else
- ftype = INT_FTYPE_V16QI_V16QI_INT;
- def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code);
- }
- BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST,
- IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
- ARRAY_SIZE (bdesc_pcmpistr) - 1);
-
- /* comi/ucomi insns. */
- for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
- {
- BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i);
- if (d->mask == OPTION_MASK_ISA_SSE2)
- ftype = INT_FTYPE_V2DF_V2DF;
- else
- ftype = INT_FTYPE_V4SF_V4SF;
- def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code);
- }
- BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST,
- IX86_BUILTIN__BDESC_COMI_FIRST,
- ARRAY_SIZE (bdesc_comi) - 1);
-
- /* SSE */
- def_builtin (OPTION_MASK_ISA_SSE, 0, "__builtin_ia32_ldmxcsr",
- VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
- def_builtin_pure (OPTION_MASK_ISA_SSE, 0, "__builtin_ia32_stmxcsr",
- UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
-
- /* SSE or 3DNow!A */
- def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
- /* As it uses V4HImode, we have to require -mmmx too. */
- | OPTION_MASK_ISA_MMX, 0,
- "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
- IX86_BUILTIN_MASKMOVQ);
-
- /* SSE2 */
- def_builtin (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_maskmovdqu",
- VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
-
- def_builtin (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_clflush",
- VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
- x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_mfence",
- VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
-
- /* SSE3. */
- def_builtin (OPTION_MASK_ISA_SSE3, 0, "__builtin_ia32_monitor",
- VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
- def_builtin (OPTION_MASK_ISA_SSE3, 0, "__builtin_ia32_mwait",
- VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
-
- /* AES */
- def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0,
- "__builtin_ia32_aesenc128",
- V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
- def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0,
- "__builtin_ia32_aesenclast128",
- V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
- def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0,
- "__builtin_ia32_aesdec128",
- V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
- def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0,
- "__builtin_ia32_aesdeclast128",
- V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
- def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0,
- "__builtin_ia32_aesimc128",
- V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
- def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0,
- "__builtin_ia32_aeskeygenassist128",
- V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
-
- /* PCLMUL */
- def_builtin_const (OPTION_MASK_ISA_PCLMUL | OPTION_MASK_ISA_SSE2, 0,
- "__builtin_ia32_pclmulqdq128",
- V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
-
- /* RDRND */
- def_builtin (OPTION_MASK_ISA_RDRND, 0, "__builtin_ia32_rdrand16_step",
- INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
- def_builtin (OPTION_MASK_ISA_RDRND, 0, "__builtin_ia32_rdrand32_step",
- INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
- def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT, 0,
- "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
- IX86_BUILTIN_RDRAND64_STEP);
-
- /* AVX2 */
- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv2df",
- V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
- IX86_BUILTIN_GATHERSIV2DF);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4df",
- V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
- IX86_BUILTIN_GATHERSIV4DF);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv2df",
- V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
- IX86_BUILTIN_GATHERDIV2DF);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4df",
- V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
- IX86_BUILTIN_GATHERDIV4DF);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4sf",
- V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
- IX86_BUILTIN_GATHERSIV4SF);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv8sf",
- V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
- IX86_BUILTIN_GATHERSIV8SF);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4sf",
- V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
- IX86_BUILTIN_GATHERDIV4SF);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4sf256",
- V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
- IX86_BUILTIN_GATHERDIV8SF);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv2di",
- V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
- IX86_BUILTIN_GATHERSIV2DI);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4di",
- V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
- IX86_BUILTIN_GATHERSIV4DI);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv2di",
- V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
- IX86_BUILTIN_GATHERDIV2DI);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4di",
- V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
- IX86_BUILTIN_GATHERDIV4DI);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4si",
- V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
- IX86_BUILTIN_GATHERSIV4SI);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv8si",
- V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
- IX86_BUILTIN_GATHERSIV8SI);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4si",
- V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
- IX86_BUILTIN_GATHERDIV4SI);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4si256",
- V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
- IX86_BUILTIN_GATHERDIV8SI);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltsiv4df ",
- V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
- IX86_BUILTIN_GATHERALTSIV4DF);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltdiv8sf ",
- V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
- IX86_BUILTIN_GATHERALTDIV8SF);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltsiv4di ",
- V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
- IX86_BUILTIN_GATHERALTSIV4DI);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltdiv8si ",
- V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
- IX86_BUILTIN_GATHERALTDIV8SI);
-
- /* AVX512F */
- def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv16sf",
- V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT,
- IX86_BUILTIN_GATHER3SIV16SF);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv8df",
- V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT,
- IX86_BUILTIN_GATHER3SIV8DF);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv16sf",
- V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT,
- IX86_BUILTIN_GATHER3DIV16SF);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv8df",
- V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT,
- IX86_BUILTIN_GATHER3DIV8DF);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv16si",
- V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT,
- IX86_BUILTIN_GATHER3SIV16SI);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv8di",
- V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT,
- IX86_BUILTIN_GATHER3SIV8DI);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv16si",
- V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT,
- IX86_BUILTIN_GATHER3DIV16SI);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv8di",
- V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT,
- IX86_BUILTIN_GATHER3DIV8DI);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altsiv8df ",
- V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
- IX86_BUILTIN_GATHER3ALTSIV8DF);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altdiv16sf ",
- V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
- IX86_BUILTIN_GATHER3ALTDIV16SF);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altsiv8di ",
- V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
- IX86_BUILTIN_GATHER3ALTSIV8DI);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altdiv16si ",
- V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
- IX86_BUILTIN_GATHER3ALTDIV16SI);
-
- def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv16sf",
- VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT,
- IX86_BUILTIN_SCATTERSIV16SF);
-
- def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv8df",
- VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT,
- IX86_BUILTIN_SCATTERSIV8DF);
-
- def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv16sf",
- VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT,
- IX86_BUILTIN_SCATTERDIV16SF);
-
- def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv8df",
- VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT,
- IX86_BUILTIN_SCATTERDIV8DF);
-
- def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv16si",
- VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT,
- IX86_BUILTIN_SCATTERSIV16SI);
-
- def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv8di",
- VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT,
- IX86_BUILTIN_SCATTERSIV8DI);
-
- def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv16si",
- VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT,
- IX86_BUILTIN_SCATTERDIV16SI);
-
- def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv8di",
- VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT,
- IX86_BUILTIN_SCATTERDIV8DI);
-
- /* AVX512VL */
- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv2df",
- V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT,
- IX86_BUILTIN_GATHER3SIV2DF);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4df",
- V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT,
- IX86_BUILTIN_GATHER3SIV4DF);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div2df",
- V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT,
- IX86_BUILTIN_GATHER3DIV2DF);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4df",
- V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT,
- IX86_BUILTIN_GATHER3DIV4DF);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4sf",
- V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT,
- IX86_BUILTIN_GATHER3SIV4SF);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv8sf",
- V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT,
- IX86_BUILTIN_GATHER3SIV8SF);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4sf",
- V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT,
- IX86_BUILTIN_GATHER3DIV4SF);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div8sf",
- V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT,
- IX86_BUILTIN_GATHER3DIV8SF);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv2di",
- V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT,
- IX86_BUILTIN_GATHER3SIV2DI);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4di",
- V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT,
- IX86_BUILTIN_GATHER3SIV4DI);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div2di",
- V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT,
- IX86_BUILTIN_GATHER3DIV2DI);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4di",
- V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT,
- IX86_BUILTIN_GATHER3DIV4DI);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4si",
- V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT,
- IX86_BUILTIN_GATHER3SIV4SI);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv8si",
- V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT,
- IX86_BUILTIN_GATHER3SIV8SI);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4si",
- V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT,
- IX86_BUILTIN_GATHER3DIV4SI);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div8si",
- V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT,
- IX86_BUILTIN_GATHER3DIV8SI);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altsiv4df ",
- V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT,
- IX86_BUILTIN_GATHER3ALTSIV4DF);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altdiv8sf ",
- V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT,
- IX86_BUILTIN_GATHER3ALTDIV8SF);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altsiv4di ",
- V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT,
- IX86_BUILTIN_GATHER3ALTSIV4DI);
-
- def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altdiv8si ",
- V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT,
- IX86_BUILTIN_GATHER3ALTDIV8SI);
-
- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv8sf",
- VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT,
- IX86_BUILTIN_SCATTERSIV8SF);
-
- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4sf",
- VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT,
- IX86_BUILTIN_SCATTERSIV4SF);
-
- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4df",
- VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT,
- IX86_BUILTIN_SCATTERSIV4DF);
-
- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv2df",
- VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT,
- IX86_BUILTIN_SCATTERSIV2DF);
-
- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv8sf",
- VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT,
- IX86_BUILTIN_SCATTERDIV8SF);
-
- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4sf",
- VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT,
- IX86_BUILTIN_SCATTERDIV4SF);
-
- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4df",
- VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT,
- IX86_BUILTIN_SCATTERDIV4DF);
-
- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv2df",
- VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT,
- IX86_BUILTIN_SCATTERDIV2DF);
-
- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv8si",
- VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT,
- IX86_BUILTIN_SCATTERSIV8SI);
-
- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4si",
- VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT,
- IX86_BUILTIN_SCATTERSIV4SI);
-
- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4di",
- VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT,
- IX86_BUILTIN_SCATTERSIV4DI);
-
- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv2di",
- VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT,
- IX86_BUILTIN_SCATTERSIV2DI);
-
- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv8si",
- VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT,
- IX86_BUILTIN_SCATTERDIV8SI);
-
- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4si",
- VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT,
- IX86_BUILTIN_SCATTERDIV4SI);
-
- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4di",
- VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT,
- IX86_BUILTIN_SCATTERDIV4DI);
-
- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv2di",
- VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT,
- IX86_BUILTIN_SCATTERDIV2DI);
-
- def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltsiv8df ",
- VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
- IX86_BUILTIN_SCATTERALTSIV8DF);
-
- def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltdiv16sf ",
- VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
- IX86_BUILTIN_SCATTERALTDIV16SF);
-
- def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltsiv8di ",
- VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
- IX86_BUILTIN_SCATTERALTSIV8DI);
-
- def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltdiv16si ",
- VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
- IX86_BUILTIN_SCATTERALTDIV16SI);
-
- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv4df ",
- VOID_FTYPE_PDOUBLE_QI_V8SI_V4DF_INT,
- IX86_BUILTIN_SCATTERALTSIV4DF);
-
- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv8sf ",
- VOID_FTYPE_PFLOAT_QI_V4DI_V8SF_INT,
- IX86_BUILTIN_SCATTERALTDIV8SF);
-
- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv4di ",
- VOID_FTYPE_PLONGLONG_QI_V8SI_V4DI_INT,
- IX86_BUILTIN_SCATTERALTSIV4DI);
-
- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv8si ",
- VOID_FTYPE_PINT_QI_V4DI_V8SI_INT,
- IX86_BUILTIN_SCATTERALTDIV8SI);
-
- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv2df ",
- VOID_FTYPE_PDOUBLE_QI_V4SI_V2DF_INT,
- IX86_BUILTIN_SCATTERALTSIV2DF);
-
- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv4sf ",
- VOID_FTYPE_PFLOAT_QI_V2DI_V4SF_INT,
- IX86_BUILTIN_SCATTERALTDIV4SF);
-
- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv2di ",
- VOID_FTYPE_PLONGLONG_QI_V4SI_V2DI_INT,
- IX86_BUILTIN_SCATTERALTSIV2DI);
-
- def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv4si ",
- VOID_FTYPE_PINT_QI_V2DI_V4SI_INT,
- IX86_BUILTIN_SCATTERALTDIV4SI);
-
- /* AVX512PF */
- def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfdpd",
- VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
- IX86_BUILTIN_GATHERPFDPD);
- def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfdps",
- VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
- IX86_BUILTIN_GATHERPFDPS);
- def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfqpd",
- VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
- IX86_BUILTIN_GATHERPFQPD);
- def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfqps",
- VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
- IX86_BUILTIN_GATHERPFQPS);
- def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfdpd",
- VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
- IX86_BUILTIN_SCATTERPFDPD);
- def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfdps",
- VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
- IX86_BUILTIN_SCATTERPFDPS);
- def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfqpd",
- VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
- IX86_BUILTIN_SCATTERPFQPD);
- def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfqps",
- VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
- IX86_BUILTIN_SCATTERPFQPS);
-
- /* SHA */
- def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1msg1",
- V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
- def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1msg2",
- V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
- def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1nexte",
- V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
- def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1rnds4",
- V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
- def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha256msg1",
- V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
- def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha256msg2",
- V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
- def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha256rnds2",
- V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
-
- /* RTM. */
- def_builtin (OPTION_MASK_ISA_RTM, 0, "__builtin_ia32_xabort",
- VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
-
- /* MMX access to the vec_init patterns. */
- def_builtin_const (OPTION_MASK_ISA_MMX, 0, "__builtin_ia32_vec_init_v2si",
- V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
-
- def_builtin_const (OPTION_MASK_ISA_MMX, 0, "__builtin_ia32_vec_init_v4hi",
- V4HI_FTYPE_HI_HI_HI_HI,
- IX86_BUILTIN_VEC_INIT_V4HI);
-
- def_builtin_const (OPTION_MASK_ISA_MMX, 0, "__builtin_ia32_vec_init_v8qi",
- V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
- IX86_BUILTIN_VEC_INIT_V8QI);
-
- /* Access to the vec_extract patterns. */
- def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v2df",
- DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
- def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v2di",
- DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
- def_builtin_const (OPTION_MASK_ISA_SSE, 0, "__builtin_ia32_vec_ext_v4sf",
- FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
- def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v4si",
- SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
- def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v8hi",
- HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
-
- def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
- /* As it uses V4HImode, we have to require -mmmx too. */
- | OPTION_MASK_ISA_MMX, 0,
- "__builtin_ia32_vec_ext_v4hi",
- HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
-
- def_builtin_const (OPTION_MASK_ISA_MMX, 0, "__builtin_ia32_vec_ext_v2si",
- SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
-
- def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v16qi",
- QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
-
- /* Access to the vec_set patterns. */
- def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT, 0,
- "__builtin_ia32_vec_set_v2di",
- V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
-
- def_builtin_const (OPTION_MASK_ISA_SSE4_1, 0, "__builtin_ia32_vec_set_v4sf",
- V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
-
- def_builtin_const (OPTION_MASK_ISA_SSE4_1, 0, "__builtin_ia32_vec_set_v4si",
- V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
-
- def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_set_v8hi",
- V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
-
- def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
- /* As it uses V4HImode, we have to require -mmmx too. */
- | OPTION_MASK_ISA_MMX, 0,
- "__builtin_ia32_vec_set_v4hi",
- V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
-
- def_builtin_const (OPTION_MASK_ISA_SSE4_1, 0, "__builtin_ia32_vec_set_v16qi",
- V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
-
- /* RDSEED */
- def_builtin (OPTION_MASK_ISA_RDSEED, 0, "__builtin_ia32_rdseed_hi_step",
- INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
- def_builtin (OPTION_MASK_ISA_RDSEED, 0, "__builtin_ia32_rdseed_si_step",
- INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
- def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT, 0,
- "__builtin_ia32_rdseed_di_step",
- INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
-
- /* ADCX */
- def_builtin (0, 0, "__builtin_ia32_addcarryx_u32",
- UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
- def_builtin (OPTION_MASK_ISA_64BIT, 0,
- "__builtin_ia32_addcarryx_u64",
- UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
- IX86_BUILTIN_ADDCARRYX64);
-
- /* SBB */
- def_builtin (0, 0, "__builtin_ia32_sbb_u32",
- UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32);
- def_builtin (OPTION_MASK_ISA_64BIT, 0,
- "__builtin_ia32_sbb_u64",
- UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
- IX86_BUILTIN_SBB64);
-
- /* Read/write FLAGS. */
- if (TARGET_64BIT)
- {
- def_builtin (OPTION_MASK_ISA_64BIT, 0, "__builtin_ia32_readeflags_u64",
- UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
- def_builtin (OPTION_MASK_ISA_64BIT, 0, "__builtin_ia32_writeeflags_u64",
- VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
- }
- else
- {
- def_builtin (0, 0, "__builtin_ia32_readeflags_u32",
- UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
- def_builtin (0, 0, "__builtin_ia32_writeeflags_u32",
- VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
- }
-
- /* CLFLUSHOPT. */
- def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, 0, "__builtin_ia32_clflushopt",
- VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT);
-
- /* CLWB. */
- def_builtin (OPTION_MASK_ISA_CLWB, 0, "__builtin_ia32_clwb",
- VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB);
-
- /* MONITORX and MWAITX. */
- def_builtin (0, OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx",
- VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX);
- def_builtin (0, OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx",
- VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX);
-
- /* CLZERO. */
- def_builtin (0, OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero",
- VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO);
-
- /* WAITPKG. */
- def_builtin (0, OPTION_MASK_ISA_WAITPKG, "__builtin_ia32_umonitor",
- VOID_FTYPE_PVOID, IX86_BUILTIN_UMONITOR);
- def_builtin (0, OPTION_MASK_ISA_WAITPKG, "__builtin_ia32_umwait",
- UINT8_FTYPE_UNSIGNED_UINT64, IX86_BUILTIN_UMWAIT);
- def_builtin (0, OPTION_MASK_ISA_WAITPKG, "__builtin_ia32_tpause",
- UINT8_FTYPE_UNSIGNED_UINT64, IX86_BUILTIN_TPAUSE);
-
- /* CLDEMOTE. */
- def_builtin (0, OPTION_MASK_ISA_CLDEMOTE, "__builtin_ia32_cldemote",
- VOID_FTYPE_PCVOID, IX86_BUILTIN_CLDEMOTE);
-
- /* Add FMA4 multi-arg instructions. */
- for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
- {
- BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i);
- if (d->name == 0)
- continue;
-
- ftype = (enum ix86_builtin_func_type) d->flag;
- def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code);
- }
- BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
- IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
- ARRAY_SIZE (bdesc_multi_arg) - 1);
-
- /* Add CET intrinsics. */
- for (i = 0, d = bdesc_cet; i < ARRAY_SIZE (bdesc_cet); i++, d++)
- {
- BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_FIRST, i);
- if (d->name == 0)
- continue;
-
- ftype = (enum ix86_builtin_func_type) d->flag;
- def_builtin (d->mask, d->mask2, d->name, ftype, d->code);
- }
- BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_LAST,
- IX86_BUILTIN__BDESC_CET_FIRST,
- ARRAY_SIZE (bdesc_cet) - 1);
-
- for (i = 0, d = bdesc_cet_rdssp;
- i < ARRAY_SIZE (bdesc_cet_rdssp);
- i++, d++)
- {
- BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, i);
- if (d->name == 0)
- continue;
-
- ftype = (enum ix86_builtin_func_type) d->flag;
- def_builtin (d->mask, d->mask2, d->name, ftype, d->code);
- }
- BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_LAST,
- IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
- ARRAY_SIZE (bdesc_cet_rdssp) - 1);
-}
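
/* A user-level sketch of how one of the builtins registered above is
   reached.  __builtin_ia32_rdseed_si_step is defined with the
   INT_FTYPE_PUNSIGNED signature: roughly, it stores a random value through
   the pointer and returns nonzero on success.  The wrapper name below is
   illustrative and needs -mrdseed.  */

static int
try_rdseed (unsigned int *value)
{
  return __builtin_ia32_rdseed_si_step (value);
}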
-
-#undef BDESC_VERIFY
-#undef BDESC_VERIFYS
-
-/* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
- to return a pointer to VERSION_DECL if the outcome of the expression
- formed by PREDICATE_CHAIN is true. This function will be called during
- version dispatch to decide which function version to execute. It returns
- the basic block at the end, to which more conditions can be added. */
-
-static basic_block
-add_condition_to_bb (tree function_decl, tree version_decl,
- tree predicate_chain, basic_block new_bb)
-{
- gimple *return_stmt;
- tree convert_expr, result_var;
- gimple *convert_stmt;
- gimple *call_cond_stmt;
- gimple *if_else_stmt;
-
- basic_block bb1, bb2, bb3;
- edge e12, e23;
-
- tree cond_var, and_expr_var = NULL_TREE;
- gimple_seq gseq;
-
- tree predicate_decl, predicate_arg;
-
- push_cfun (DECL_STRUCT_FUNCTION (function_decl));
-
- gcc_assert (new_bb != NULL);
- gseq = bb_seq (new_bb);
-
-
- convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
- build_fold_addr_expr (version_decl));
- result_var = create_tmp_var (ptr_type_node);
- convert_stmt = gimple_build_assign (result_var, convert_expr);
- return_stmt = gimple_build_return (result_var);
-
- if (predicate_chain == NULL_TREE)
- {
- gimple_seq_add_stmt (&gseq, convert_stmt);
- gimple_seq_add_stmt (&gseq, return_stmt);
- set_bb_seq (new_bb, gseq);
- gimple_set_bb (convert_stmt, new_bb);
- gimple_set_bb (return_stmt, new_bb);
- pop_cfun ();
- return new_bb;
- }
-
- while (predicate_chain != NULL)
- {
- cond_var = create_tmp_var (integer_type_node);
- predicate_decl = TREE_PURPOSE (predicate_chain);
- predicate_arg = TREE_VALUE (predicate_chain);
- call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
- gimple_call_set_lhs (call_cond_stmt, cond_var);
-
- gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
- gimple_set_bb (call_cond_stmt, new_bb);
- gimple_seq_add_stmt (&gseq, call_cond_stmt);
-
- predicate_chain = TREE_CHAIN (predicate_chain);
-
- if (and_expr_var == NULL)
- and_expr_var = cond_var;
- else
- {
- gimple *assign_stmt;
- /* Use MIN_EXPR as a logical AND: the result is zero if any of the
- predicate results is zero.
- and_expr_var = min_expr <cond_var, and_expr_var> */
- assign_stmt = gimple_build_assign (and_expr_var,
- build2 (MIN_EXPR, integer_type_node,
- cond_var, and_expr_var));
-
- gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
- gimple_set_bb (assign_stmt, new_bb);
- gimple_seq_add_stmt (&gseq, assign_stmt);
- }
- }
-
- if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
- integer_zero_node,
- NULL_TREE, NULL_TREE);
- gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
- gimple_set_bb (if_else_stmt, new_bb);
- gimple_seq_add_stmt (&gseq, if_else_stmt);
-
- gimple_seq_add_stmt (&gseq, convert_stmt);
- gimple_seq_add_stmt (&gseq, return_stmt);
- set_bb_seq (new_bb, gseq);
-
- bb1 = new_bb;
- e12 = split_block (bb1, if_else_stmt);
- bb2 = e12->dest;
- e12->flags &= ~EDGE_FALLTHRU;
- e12->flags |= EDGE_TRUE_VALUE;
-
- e23 = split_block (bb2, return_stmt);
-
- gimple_set_bb (convert_stmt, bb2);
- gimple_set_bb (return_stmt, bb2);
-
- bb3 = e23->dest;
- make_edge (bb1, bb3, EDGE_FALSE_VALUE);
-
- remove_edge (e23);
- make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
-
- pop_cfun ();
-
- return bb3;
-}
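
/* A C-level sketch (names invented) of the shape add_condition_to_bb gives
   one dispatch condition: every predicate in PREDICATE_CHAIN becomes a call,
   the results are combined with MIN_EXPR as a logical AND, and the version
   address is returned when the combined value is greater than zero.  */

static void *
one_condition_sketch (void *version_addr)
{
  int c1 = __builtin_cpu_is ("haswell");
  int c2 = __builtin_cpu_supports ("avx2");
  int all = c1 < c2 ? c1 : c2;   /* MIN_EXPR acting as AND.  */
  if (all > 0)
    return version_addr;
  return 0;   /* Control continues in the next condition block.  */
}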
-
-/* Priority of i386 features, greater value is higher priority. This is
- used to decide the order in which function dispatch must happen. For
- instance, a version specialized for SSE4.2 should be checked for dispatch
- before a version for SSE3, as SSE4.2 implies SSE3. */
-enum feature_priority
-{
- P_ZERO = 0,
- P_MMX,
- P_SSE,
- P_SSE2,
- P_SSE3,
- P_SSSE3,
- P_PROC_SSSE3,
- P_SSE4_A,
- P_PROC_SSE4_A,
- P_SSE4_1,
- P_SSE4_2,
- P_PROC_SSE4_2,
- P_POPCNT,
- P_AES,
- P_PCLMUL,
- P_AVX,
- P_PROC_AVX,
- P_BMI,
- P_PROC_BMI,
- P_FMA4,
- P_XOP,
- P_PROC_XOP,
- P_FMA,
- P_PROC_FMA,
- P_BMI2,
- P_AVX2,
- P_PROC_AVX2,
- P_AVX512F,
- P_PROC_AVX512F
-};
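
/* A sketch of the kind of source these priorities order (GNU C++ function
   multiversioning; the function name and bodies are illustrative).  Since
   P_SSE4_2 > P_SSE3, the "sse4.2" version below is tested for dispatch
   before the "sse3" one, and the "default" version is dispatched last.  */

__attribute__ ((target ("sse4.2")))
int mix_step (int x) { return x * 3; }

__attribute__ ((target ("sse3")))
int mix_step (int x) { return x * 2; }

__attribute__ ((target ("default")))
int mix_step (int x) { return x; }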
-
- /* This is the order of bit-fields in __processor_features in cpuinfo.c. */
-enum processor_features
-{
- F_CMOV = 0,
- F_MMX,
- F_POPCNT,
- F_SSE,
- F_SSE2,
- F_SSE3,
- F_SSSE3,
- F_SSE4_1,
- F_SSE4_2,
- F_AVX,
- F_AVX2,
- F_SSE4_A,
- F_FMA4,
- F_XOP,
- F_FMA,
- F_AVX512F,
- F_BMI,
- F_BMI2,
- F_AES,
- F_PCLMUL,
- F_AVX512VL,
- F_AVX512BW,
- F_AVX512DQ,
- F_AVX512CD,
- F_AVX512ER,
- F_AVX512PF,
- F_AVX512VBMI,
- F_AVX512IFMA,
- F_AVX5124VNNIW,
- F_AVX5124FMAPS,
- F_AVX512VPOPCNTDQ,
- F_AVX512VBMI2,
- F_GFNI,
- F_VPCLMULQDQ,
- F_AVX512VNNI,
- F_AVX512BITALG,
- F_MAX
-};
-
- /* These are the values for vendor types and cpu types and subtypes
- in cpuinfo.c. Cpu type and subtype values must have the corresponding
- start value subtracted before they are used. */
-enum processor_model
-{
- M_INTEL = 1,
- M_AMD,
- M_CPU_TYPE_START,
- M_INTEL_BONNELL,
- M_INTEL_CORE2,
- M_INTEL_COREI7,
- M_AMDFAM10H,
- M_AMDFAM15H,
- M_INTEL_SILVERMONT,
- M_INTEL_KNL,
- M_AMD_BTVER1,
- M_AMD_BTVER2,
- M_AMDFAM17H,
- M_INTEL_KNM,
- M_INTEL_GOLDMONT,
- M_INTEL_GOLDMONT_PLUS,
- M_INTEL_TREMONT,
- M_CPU_SUBTYPE_START,
- M_INTEL_COREI7_NEHALEM,
- M_INTEL_COREI7_WESTMERE,
- M_INTEL_COREI7_SANDYBRIDGE,
- M_AMDFAM10H_BARCELONA,
- M_AMDFAM10H_SHANGHAI,
- M_AMDFAM10H_ISTANBUL,
- M_AMDFAM15H_BDVER1,
- M_AMDFAM15H_BDVER2,
- M_AMDFAM15H_BDVER3,
- M_AMDFAM15H_BDVER4,
- M_AMDFAM17H_ZNVER1,
- M_INTEL_COREI7_IVYBRIDGE,
- M_INTEL_COREI7_HASWELL,
- M_INTEL_COREI7_BROADWELL,
- M_INTEL_COREI7_SKYLAKE,
- M_INTEL_COREI7_SKYLAKE_AVX512,
- M_INTEL_COREI7_CANNONLAKE,
- M_INTEL_COREI7_ICELAKE_CLIENT,
- M_INTEL_COREI7_ICELAKE_SERVER,
- M_AMDFAM17H_ZNVER2,
- M_INTEL_COREI7_CASCADELAKE
-};
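
/* A small sketch of the start-value adjustment described above, mirroring
   what fold_builtin_cpu does below: vendors are compared as-is against
   __cpu_vendor, while cpu types and subtypes are rebased to the start of
   their range before being compared against __cpu_type or __cpu_subtype.  */

static unsigned int
model_field_value_sketch (enum processor_model model)
{
  if (model > M_CPU_SUBTYPE_START)
    return model - M_CPU_SUBTYPE_START;
  if (model > M_CPU_TYPE_START)
    return model - M_CPU_TYPE_START;
  return model;
}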
-
-struct _arch_names_table
-{
- const char *const name;
- const enum processor_model model;
-};
-
-static const _arch_names_table arch_names_table[] =
-{
- {"amd", M_AMD},
- {"intel", M_INTEL},
- {"atom", M_INTEL_BONNELL},
- {"slm", M_INTEL_SILVERMONT},
- {"core2", M_INTEL_CORE2},
- {"corei7", M_INTEL_COREI7},
- {"nehalem", M_INTEL_COREI7_NEHALEM},
- {"westmere", M_INTEL_COREI7_WESTMERE},
- {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
- {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
- {"haswell", M_INTEL_COREI7_HASWELL},
- {"broadwell", M_INTEL_COREI7_BROADWELL},
- {"skylake", M_INTEL_COREI7_SKYLAKE},
- {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512},
- {"cannonlake", M_INTEL_COREI7_CANNONLAKE},
- {"icelake-client", M_INTEL_COREI7_ICELAKE_CLIENT},
- {"icelake-server", M_INTEL_COREI7_ICELAKE_SERVER},
- {"cascadelake", M_INTEL_COREI7_CASCADELAKE},
- {"bonnell", M_INTEL_BONNELL},
- {"silvermont", M_INTEL_SILVERMONT},
- {"goldmont", M_INTEL_GOLDMONT},
- {"goldmont-plus", M_INTEL_GOLDMONT_PLUS},
- {"tremont", M_INTEL_TREMONT},
- {"knl", M_INTEL_KNL},
- {"knm", M_INTEL_KNM},
- {"amdfam10h", M_AMDFAM10H},
- {"barcelona", M_AMDFAM10H_BARCELONA},
- {"shanghai", M_AMDFAM10H_SHANGHAI},
- {"istanbul", M_AMDFAM10H_ISTANBUL},
- {"btver1", M_AMD_BTVER1},
- {"amdfam15h", M_AMDFAM15H},
- {"bdver1", M_AMDFAM15H_BDVER1},
- {"bdver2", M_AMDFAM15H_BDVER2},
- {"bdver3", M_AMDFAM15H_BDVER3},
- {"bdver4", M_AMDFAM15H_BDVER4},
- {"btver2", M_AMD_BTVER2},
- {"amdfam17h", M_AMDFAM17H},
- {"znver1", M_AMDFAM17H_ZNVER1},
- {"znver2", M_AMDFAM17H_ZNVER2},
-};
-
-/* These are the target attribute strings for which a dispatcher is
- available, from fold_builtin_cpu. */
-struct _isa_names_table
-{
- const char *const name;
- const enum processor_features feature;
- const enum feature_priority priority;
-};
-
-static const _isa_names_table isa_names_table[] =
-{
- {"cmov", F_CMOV, P_ZERO},
- {"mmx", F_MMX, P_MMX},
- {"popcnt", F_POPCNT, P_POPCNT},
- {"sse", F_SSE, P_SSE},
- {"sse2", F_SSE2, P_SSE2},
- {"sse3", F_SSE3, P_SSE3},
- {"ssse3", F_SSSE3, P_SSSE3},
- {"sse4a", F_SSE4_A, P_SSE4_A},
- {"sse4.1", F_SSE4_1, P_SSE4_1},
- {"sse4.2", F_SSE4_2, P_SSE4_2},
- {"avx", F_AVX, P_AVX},
- {"fma4", F_FMA4, P_FMA4},
- {"xop", F_XOP, P_XOP},
- {"fma", F_FMA, P_FMA},
- {"avx2", F_AVX2, P_AVX2},
- {"avx512f", F_AVX512F, P_AVX512F},
- {"bmi", F_BMI, P_BMI},
- {"bmi2", F_BMI2, P_BMI2},
- {"aes", F_AES, P_AES},
- {"pclmul", F_PCLMUL, P_PCLMUL},
- {"avx512vl",F_AVX512VL, P_ZERO},
- {"avx512bw",F_AVX512BW, P_ZERO},
- {"avx512dq",F_AVX512DQ, P_ZERO},
- {"avx512cd",F_AVX512CD, P_ZERO},
- {"avx512er",F_AVX512ER, P_ZERO},
- {"avx512pf",F_AVX512PF, P_ZERO},
- {"avx512vbmi",F_AVX512VBMI, P_ZERO},
- {"avx512ifma",F_AVX512IFMA, P_ZERO},
- {"avx5124vnniw",F_AVX5124VNNIW, P_ZERO},
- {"avx5124fmaps",F_AVX5124FMAPS, P_ZERO},
- {"avx512vpopcntdq",F_AVX512VPOPCNTDQ, P_ZERO},
- {"avx512vbmi2", F_AVX512VBMI2, P_ZERO},
- {"gfni", F_GFNI, P_ZERO},
- {"vpclmulqdq", F_VPCLMULQDQ, P_ZERO},
- {"avx512vnni", F_AVX512VNNI, P_ZERO},
- {"avx512bitalg", F_AVX512BITALG, P_ZERO}
-};
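
/* Every name in this table is a valid argument to __builtin_cpu_supports,
   which fold_builtin_cpu (below) turns into a feature-bit test.  A minimal
   user-level sketch, with an illustrative function name:  */

int
have_vector_isa (void)
{
  return __builtin_cpu_supports ("avx2") && __builtin_cpu_supports ("bmi2");
}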
-
-/* This parses the attribute arguments to target in DECL and determines
- the right builtin to use to match the platform specification.
- It returns the priority value for this version decl. If PREDICATE_LIST
- is not NULL, it stores the list of cpu features that need to be checked
- before dispatching this function. */
-
-static unsigned int
-get_builtin_code_for_version (tree decl, tree *predicate_list)
-{
- tree attrs;
- struct cl_target_option cur_target;
- tree target_node;
- struct cl_target_option *new_target;
- const char *arg_str = NULL;
- const char *attrs_str = NULL;
- char *tok_str = NULL;
- char *token;
-
- enum feature_priority priority = P_ZERO;
-
- static unsigned int NUM_FEATURES
- = sizeof (isa_names_table) / sizeof (_isa_names_table);
-
- unsigned int i;
-
- tree predicate_chain = NULL_TREE;
- tree predicate_decl, predicate_arg;
-
- attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
- gcc_assert (attrs != NULL);
-
- attrs = TREE_VALUE (TREE_VALUE (attrs));
-
- gcc_assert (TREE_CODE (attrs) == STRING_CST);
- attrs_str = TREE_STRING_POINTER (attrs);
-
- /* Return priority zero for default function. */
- if (strcmp (attrs_str, "default") == 0)
- return 0;
-
- /* Handle arch= if specified. For priority, set it to be 1 more than
- the best instruction set the processor can handle. For instance, if
- there is a version for atom and a version for ssse3 (the highest ISA
- priority for atom), the atom version must be checked for dispatch
- before the ssse3 version. */
- if (strstr (attrs_str, "arch=") != NULL)
- {
- cl_target_option_save (&cur_target, &global_options);
- target_node
- = ix86_valid_target_attribute_tree (decl, attrs, &global_options,
- &global_options_set, 0);
-
- gcc_assert (target_node);
- if (target_node == error_mark_node)
- return 0;
- new_target = TREE_TARGET_OPTION (target_node);
- gcc_assert (new_target);
-
- if (new_target->arch_specified && new_target->arch > 0)
- {
- switch (new_target->arch)
- {
- case PROCESSOR_CORE2:
- arg_str = "core2";
- priority = P_PROC_SSSE3;
- break;
- case PROCESSOR_NEHALEM:
- if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_PCLMUL)
- {
- arg_str = "westmere";
- priority = P_PCLMUL;
- }
- else
- {
- /* We translate "arch=corei7" and "arch=nehalem" to
- "corei7" so that it will be mapped to M_INTEL_COREI7
- as cpu type to cover all M_INTEL_COREI7_XXXs. */
- arg_str = "corei7";
- priority = P_PROC_SSE4_2;
- }
- break;
- case PROCESSOR_SANDYBRIDGE:
- if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
- arg_str = "ivybridge";
- else
- arg_str = "sandybridge";
- priority = P_PROC_AVX;
- break;
- case PROCESSOR_HASWELL:
- if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
- arg_str = "broadwell";
- else
- arg_str = "haswell";
- priority = P_PROC_AVX2;
- break;
- case PROCESSOR_SKYLAKE:
- arg_str = "skylake";
- priority = P_PROC_AVX2;
- break;
- case PROCESSOR_SKYLAKE_AVX512:
- arg_str = "skylake-avx512";
- priority = P_PROC_AVX512F;
- break;
- case PROCESSOR_CANNONLAKE:
- arg_str = "cannonlake";
- priority = P_PROC_AVX512F;
- break;
- case PROCESSOR_ICELAKE_CLIENT:
- arg_str = "icelake-client";
- priority = P_PROC_AVX512F;
- break;
- case PROCESSOR_ICELAKE_SERVER:
- arg_str = "icelake-server";
- priority = P_PROC_AVX512F;
- break;
- case PROCESSOR_CASCADELAKE:
- arg_str = "cascadelake";
- priority = P_PROC_AVX512F;
- break;
- case PROCESSOR_BONNELL:
- arg_str = "bonnell";
- priority = P_PROC_SSSE3;
- break;
- case PROCESSOR_KNL:
- arg_str = "knl";
- priority = P_PROC_AVX512F;
- break;
- case PROCESSOR_KNM:
- arg_str = "knm";
- priority = P_PROC_AVX512F;
- break;
- case PROCESSOR_SILVERMONT:
- arg_str = "silvermont";
- priority = P_PROC_SSE4_2;
- break;
- case PROCESSOR_GOLDMONT:
- arg_str = "goldmont";
- priority = P_PROC_SSE4_2;
- break;
- case PROCESSOR_GOLDMONT_PLUS:
- arg_str = "goldmont-plus";
- priority = P_PROC_SSE4_2;
- break;
- case PROCESSOR_TREMONT:
- arg_str = "tremont";
- priority = P_PROC_SSE4_2;
- break;
- case PROCESSOR_AMDFAM10:
- arg_str = "amdfam10h";
- priority = P_PROC_SSE4_A;
- break;
- case PROCESSOR_BTVER1:
- arg_str = "btver1";
- priority = P_PROC_SSE4_A;
- break;
- case PROCESSOR_BTVER2:
- arg_str = "btver2";
- priority = P_PROC_BMI;
- break;
- case PROCESSOR_BDVER1:
- arg_str = "bdver1";
- priority = P_PROC_XOP;
- break;
- case PROCESSOR_BDVER2:
- arg_str = "bdver2";
- priority = P_PROC_FMA;
- break;
- case PROCESSOR_BDVER3:
- arg_str = "bdver3";
- priority = P_PROC_FMA;
- break;
- case PROCESSOR_BDVER4:
- arg_str = "bdver4";
- priority = P_PROC_AVX2;
- break;
- case PROCESSOR_ZNVER1:
- arg_str = "znver1";
- priority = P_PROC_AVX2;
- break;
- case PROCESSOR_ZNVER2:
- arg_str = "znver2";
- priority = P_PROC_AVX2;
- break;
- }
- }
-
- cl_target_option_restore (&global_options, &cur_target);
-
- if (predicate_list && arg_str == NULL)
- {
- error_at (DECL_SOURCE_LOCATION (decl),
- "no dispatcher found for the versioning attributes");
- return 0;
- }
-
- if (predicate_list)
- {
- predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
- /* For a C string literal the length includes the trailing NULL. */
- predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
- predicate_chain = tree_cons (predicate_decl, predicate_arg,
- predicate_chain);
- }
- }
-
- /* Process feature name. */
- tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
- strcpy (tok_str, attrs_str);
- token = strtok (tok_str, ",");
- predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
-
- while (token != NULL)
- {
- /* Do not process "arch=" */
- if (strncmp (token, "arch=", 5) == 0)
- {
- token = strtok (NULL, ",");
- continue;
- }
- for (i = 0; i < NUM_FEATURES; ++i)
- {
- if (strcmp (token, isa_names_table[i].name) == 0)
- {
- if (predicate_list)
- {
- predicate_arg = build_string_literal (
- strlen (isa_names_table[i].name) + 1,
- isa_names_table[i].name);
- predicate_chain = tree_cons (predicate_decl, predicate_arg,
- predicate_chain);
- }
- /* Find the maximum priority feature. */
- if (isa_names_table[i].priority > priority)
- priority = isa_names_table[i].priority;
-
- break;
- }
- }
- if (predicate_list && priority == P_ZERO)
- {
- error_at (DECL_SOURCE_LOCATION (decl),
- "ISA %qs is not supported in %<target%> attribute, "
- "use %<arch=%> syntax", token);
- return 0;
- }
- token = strtok (NULL, ",");
- }
- free (tok_str);
-
- if (predicate_list && predicate_chain == NULL_TREE)
- {
- error_at (DECL_SOURCE_LOCATION (decl),
- "no dispatcher found for the versioning attributes: %s",
- attrs_str);
- return 0;
- }
- else if (predicate_list)
- {
- predicate_chain = nreverse (predicate_chain);
- *predicate_list = predicate_chain;
- }
-
- return priority;
-}
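
/* A sketch of an attribute string this parser handles (GNU C++ function
   multiversioning; the names are illustrative).  For the first version below
   the predicate list holds __builtin_cpu_is ("haswell") and
   __builtin_cpu_supports ("fma"), and the returned priority is the larger of
   P_PROC_AVX2 (from arch=haswell) and P_FMA.  */

__attribute__ ((target ("arch=haswell,fma")))
int blend_step (int x) { return x + 2; }

__attribute__ ((target ("default")))
int blend_step (int x) { return x; }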
-
-/* This compares the priority of target features in function DECL1
- and DECL2. It returns positive value if DECL1 is higher priority,
- negative value if DECL2 is higher priority and 0 if they are the
- same. */
-
-static int
-ix86_compare_version_priority (tree decl1, tree decl2)
-{
- unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
- unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
-
- return (int)priority1 - (int)priority2;
-}
-
-/* V1 and V2 point to function versions with different priorities
- based on the target ISA. This function compares their priorities. */
-
-static int
-feature_compare (const void *v1, const void *v2)
-{
- typedef struct _function_version_info
- {
- tree version_decl;
- tree predicate_chain;
- unsigned int dispatch_priority;
- } function_version_info;
-
- const function_version_info c1 = *(const function_version_info *)v1;
- const function_version_info c2 = *(const function_version_info *)v2;
- return (c2.dispatch_priority - c1.dispatch_priority);
-}
-
-/* This function generates the dispatch function for
- multi-versioned functions. DISPATCH_DECL is the function which will
- contain the dispatch logic. FNDECLS is the vector of function choices
- for dispatch. EMPTY_BB is the basic block pointer
- in DISPATCH_DECL in which the dispatch code is generated. */
-
-static int
-dispatch_function_versions (tree dispatch_decl,
- void *fndecls_p,
- basic_block *empty_bb)
-{
- tree default_decl;
- gimple *ifunc_cpu_init_stmt;
- gimple_seq gseq;
- int ix;
- tree ele;
- vec<tree> *fndecls;
- unsigned int num_versions = 0;
- unsigned int actual_versions = 0;
- unsigned int i;
-
- struct _function_version_info
- {
- tree version_decl;
- tree predicate_chain;
- unsigned int dispatch_priority;
- } *function_version_info;
-
- gcc_assert (dispatch_decl != NULL
- && fndecls_p != NULL
- && empty_bb != NULL);
-
- /* fndecls_p is actually a vector. */
- fndecls = static_cast<vec<tree> *> (fndecls_p);
-
- /* At least one more version other than the default. */
- num_versions = fndecls->length ();
- gcc_assert (num_versions >= 2);
-
- function_version_info = (struct _function_version_info *)
- XNEWVEC (struct _function_version_info, (num_versions - 1));
-
- /* The first version in the vector is the default decl. */
- default_decl = (*fndecls)[0];
-
- push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
-
- gseq = bb_seq (*empty_bb);
- /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
- constructors, so explicitly call __builtin_cpu_init here. */
- ifunc_cpu_init_stmt = gimple_build_call_vec (
- ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
- gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
- gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
- set_bb_seq (*empty_bb, gseq);
-
- pop_cfun ();
-
-
- for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
- {
- tree version_decl = ele;
- tree predicate_chain = NULL_TREE;
- unsigned int priority;
- /* Get attribute string, parse it and find the right predicate decl.
- The predicate function could be a lengthy combination of many
- features, like arch-type and various isa-variants. */
- priority = get_builtin_code_for_version (version_decl,
- &predicate_chain);
-
- if (predicate_chain == NULL_TREE)
- continue;
-
- function_version_info [actual_versions].version_decl = version_decl;
- function_version_info [actual_versions].predicate_chain
- = predicate_chain;
- function_version_info [actual_versions].dispatch_priority = priority;
- actual_versions++;
- }
-
- /* Sort the versions in descending order of dispatch priority. The
- priority is based on the ISA. This is not a perfect solution: if more
- than one function version is suitable to execute, there is still
- ambiguity over which one should be dispatched. In the future, allow
- the user to specify a dispatch priority next to the version. */
- qsort (function_version_info, actual_versions,
- sizeof (struct _function_version_info), feature_compare);
-
- for (i = 0; i < actual_versions; ++i)
- *empty_bb = add_condition_to_bb (dispatch_decl,
- function_version_info[i].version_decl,
- function_version_info[i].predicate_chain,
- *empty_bb);
-
- /* Dispatch the default version at the end. */
- *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
- NULL, *empty_bb);
-
- free (function_version_info);
- return 0;
-}
-
-/* This function changes the assembler name for functions that are
- versions. If DECL is a function version and has a "target"
- attribute, it appends the attribute string to its assembler name. */
-
-static tree
-ix86_mangle_function_version_assembler_name (tree decl, tree id)
-{
- tree version_attr;
- const char *orig_name, *version_string;
- char *attr_str, *assembler_name;
-
- if (DECL_DECLARED_INLINE_P (decl)
- && lookup_attribute ("gnu_inline",
- DECL_ATTRIBUTES (decl)))
- error_at (DECL_SOURCE_LOCATION (decl),
- "function versions cannot be marked as gnu_inline,"
- " bodies have to be generated");
-
- if (DECL_VIRTUAL_P (decl)
- || DECL_VINDEX (decl))
- sorry ("virtual function multiversioning not supported");
-
- version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
-
- /* The target attribute string cannot be NULL. */
- gcc_assert (version_attr != NULL_TREE);
-
- orig_name = IDENTIFIER_POINTER (id);
- version_string
- = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
-
- if (strcmp (version_string, "default") == 0)
- return id;
-
- attr_str = sorted_attr_string (TREE_VALUE (version_attr));
- assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
-
- sprintf (assembler_name, "%s.%s", orig_name, attr_str);
-
- /* Allow assembler name to be modified if already set. */
- if (DECL_ASSEMBLER_NAME_SET_P (decl))
- SET_DECL_RTL (decl, NULL);
-
- tree ret = get_identifier (assembler_name);
- XDELETEVEC (attr_str);
- XDELETEVEC (assembler_name);
- return ret;
-}
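
/* Name-mangling sketch (GNU C++ function multiversioning; illustrative
   declarations): the "default" definition below keeps its plain assembler
   name, while the versioned one gets the sorted attribute string appended
   after a dot, e.g. an ".avx2" suffix.  */

__attribute__ ((target ("avx2")))
int scale_q15 (int x) { return 2 * x; }

__attribute__ ((target ("default")))
int scale_q15 (int x) { return x; }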
-
-
-static tree
-ix86_mangle_decl_assembler_name (tree decl, tree id)
-{
- /* For function version, add the target suffix to the assembler name. */
- if (TREE_CODE (decl) == FUNCTION_DECL
- && DECL_FUNCTION_VERSIONED (decl))
- id = ix86_mangle_function_version_assembler_name (decl, id);
-#ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
- id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
-#endif
-
- return id;
-}
-
-/* Make a dispatcher declaration for the multi-versioned function DECL.
- Calls to DECL function will be replaced with calls to the dispatcher
- by the front-end. Returns the decl of the dispatcher function. */
-
-static tree
-ix86_get_function_versions_dispatcher (void *decl)
-{
- tree fn = (tree) decl;
- struct cgraph_node *node = NULL;
- struct cgraph_node *default_node = NULL;
- struct cgraph_function_version_info *node_v = NULL;
- struct cgraph_function_version_info *first_v = NULL;
-
- tree dispatch_decl = NULL;
-
- struct cgraph_function_version_info *default_version_info = NULL;
-
- gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
-
- node = cgraph_node::get (fn);
- gcc_assert (node != NULL);
-
- node_v = node->function_version ();
- gcc_assert (node_v != NULL);
-
- if (node_v->dispatcher_resolver != NULL)
- return node_v->dispatcher_resolver;
-
- /* Find the default version and make it the first node. */
- first_v = node_v;
- /* Go to the beginning of the chain. */
- while (first_v->prev != NULL)
- first_v = first_v->prev;
- default_version_info = first_v;
- while (default_version_info != NULL)
- {
- if (is_function_default_version
- (default_version_info->this_node->decl))
- break;
- default_version_info = default_version_info->next;
- }
-
- /* If there is no default node, just return NULL. */
- if (default_version_info == NULL)
- return NULL;
-
- /* Make default info the first node. */
- if (first_v != default_version_info)
- {
- default_version_info->prev->next = default_version_info->next;
- if (default_version_info->next)
- default_version_info->next->prev = default_version_info->prev;
- first_v->prev = default_version_info;
- default_version_info->next = first_v;
- default_version_info->prev = NULL;
- }
-
- default_node = default_version_info->this_node;
-
-#if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
- if (targetm.has_ifunc_p ())
- {
- struct cgraph_function_version_info *it_v = NULL;
- struct cgraph_node *dispatcher_node = NULL;
- struct cgraph_function_version_info *dispatcher_version_info = NULL;
-
- /* Right now, the dispatching is done via ifunc. */
- dispatch_decl = make_dispatcher_decl (default_node->decl);
-
- dispatcher_node = cgraph_node::get_create (dispatch_decl);
- gcc_assert (dispatcher_node != NULL);
- dispatcher_node->dispatcher_function = 1;
- dispatcher_version_info
- = dispatcher_node->insert_new_function_version ();
- dispatcher_version_info->next = default_version_info;
- dispatcher_node->definition = 1;
-
- /* Set the dispatcher for all the versions. */
- it_v = default_version_info;
- while (it_v != NULL)
- {
- it_v->dispatcher_resolver = dispatch_decl;
- it_v = it_v->next;
- }
- }
- else
-#endif
- {
- error_at (DECL_SOURCE_LOCATION (default_node->decl),
- "multiversioning needs ifunc which is not supported "
- "on this target");
- }
-
- return dispatch_decl;
-}
-
-/* Make the resolver function decl to dispatch the versions of
- a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
- ifunc alias that will point to the created resolver. Create an
- empty basic block in the resolver and store the pointer in
- EMPTY_BB. Return the decl of the resolver function. */
-
-static tree
-make_resolver_func (const tree default_decl,
- const tree ifunc_alias_decl,
- basic_block *empty_bb)
-{
- char *resolver_name;
- tree decl, type, decl_name, t;
-
- /* IFUNCs have to be globally visible. So, if the default_decl is
- not, then the name of the IFUNC should be made unique. */
- if (TREE_PUBLIC (default_decl) == 0)
- {
- char *ifunc_name = make_unique_name (default_decl, "ifunc", true);
- symtab->change_decl_assembler_name (ifunc_alias_decl,
- get_identifier (ifunc_name));
- XDELETEVEC (ifunc_name);
- }
-
- resolver_name = make_unique_name (default_decl, "resolver", false);
-
- /* The resolver function should return a (void *). */
- type = build_function_type_list (ptr_type_node, NULL_TREE);
-
- decl = build_fn_decl (resolver_name, type);
- decl_name = get_identifier (resolver_name);
- SET_DECL_ASSEMBLER_NAME (decl, decl_name);
-
- DECL_NAME (decl) = decl_name;
- TREE_USED (decl) = 1;
- DECL_ARTIFICIAL (decl) = 1;
- DECL_IGNORED_P (decl) = 1;
- TREE_PUBLIC (decl) = 0;
- DECL_UNINLINABLE (decl) = 1;
-
- /* Resolver is not external, body is generated. */
- DECL_EXTERNAL (decl) = 0;
- DECL_EXTERNAL (ifunc_alias_decl) = 0;
-
- DECL_CONTEXT (decl) = NULL_TREE;
- DECL_INITIAL (decl) = make_node (BLOCK);
- DECL_STATIC_CONSTRUCTOR (decl) = 0;
-
- if (DECL_COMDAT_GROUP (default_decl)
- || TREE_PUBLIC (default_decl))
- {
- /* In this case, each translation unit with a call to this
- versioned function will put out a resolver. Ensure it
- is comdat to keep just one copy. */
- DECL_COMDAT (decl) = 1;
- make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
- }
- /* Build result decl and add to function_decl. */
- t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
- DECL_CONTEXT (t) = decl;
- DECL_ARTIFICIAL (t) = 1;
- DECL_IGNORED_P (t) = 1;
- DECL_RESULT (decl) = t;
-
- gimplify_function_tree (decl);
- push_cfun (DECL_STRUCT_FUNCTION (decl));
- *empty_bb = init_lowered_empty_function (decl, false,
- profile_count::uninitialized ());
-
- cgraph_node::add_new_function (decl, true);
- symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
-
- pop_cfun ();
-
- gcc_assert (ifunc_alias_decl != NULL);
- /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
- DECL_ATTRIBUTES (ifunc_alias_decl)
- = make_attribute ("ifunc", resolver_name,
- DECL_ATTRIBUTES (ifunc_alias_decl));
-
- /* Create the alias for dispatch to resolver here. */
- cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
- XDELETEVEC (resolver_name);
- return decl;
-}
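
/* A plain-C sketch (all names invented) of the arrangement make_resolver_func
   sets up through the "ifunc" attribute: the dispatched symbol becomes an
   IFUNC whose resolver picks an implementation at load time, calling
   __builtin_cpu_init first just as dispatch_function_versions does.  */

typedef int (*impl_fn) (int);

static int impl_fast (int x) { return x + 1; }
static int impl_plain (int x) { return x; }

static impl_fn
blend_resolver (void)
{
  __builtin_cpu_init ();
  return __builtin_cpu_supports ("avx2") ? impl_fast : impl_plain;
}

int blend (int) __attribute__ ((ifunc ("blend_resolver")));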
-
-/* Generate the dispatching code body to dispatch multi-versioned function
- DECL. The target hook is called to process the "target" attributes and
- provide the code to dispatch the right function at run-time. NODE points
- to the dispatcher decl whose body will be created. */
-
-static tree
-ix86_generate_version_dispatcher_body (void *node_p)
-{
- tree resolver_decl;
- basic_block empty_bb;
- tree default_ver_decl;
- struct cgraph_node *versn;
- struct cgraph_node *node;
-
- struct cgraph_function_version_info *node_version_info = NULL;
- struct cgraph_function_version_info *versn_info = NULL;
-
- node = (cgraph_node *)node_p;
-
- node_version_info = node->function_version ();
- gcc_assert (node->dispatcher_function
- && node_version_info != NULL);
-
- if (node_version_info->dispatcher_resolver)
- return node_version_info->dispatcher_resolver;
-
- /* The first version in the chain corresponds to the default version. */
- default_ver_decl = node_version_info->next->this_node->decl;
-
- /* node is going to be an alias, so remove the finalized bit. */
- node->definition = false;
-
- resolver_decl = make_resolver_func (default_ver_decl,
- node->decl, &empty_bb);
-
- node_version_info->dispatcher_resolver = resolver_decl;
-
- push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
-
- auto_vec<tree, 2> fn_ver_vec;
-
- for (versn_info = node_version_info->next; versn_info;
- versn_info = versn_info->next)
- {
- versn = versn_info->this_node;
- /* Check for virtual functions here again, as by this time it should
- have been determined if this function needs a vtable index or
- not. This happens for methods in derived classes that override
- virtual methods in base classes but are not explicitly marked as
- virtual. */
- if (DECL_VINDEX (versn->decl))
- sorry ("virtual function multiversioning not supported");
-
- fn_ver_vec.safe_push (versn->decl);
- }
-
- dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
- cgraph_edge::rebuild_edges ();
- pop_cfun ();
- return resolver_decl;
-}
-/* This builds the processor_model struct type defined in
- libgcc/config/i386/cpuinfo.c. */
-
-static tree
-build_processor_model_struct (void)
-{
- const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
- "__cpu_features"};
- tree field = NULL_TREE, field_chain = NULL_TREE;
- int i;
- tree type = make_node (RECORD_TYPE);
-
- /* The first 3 fields are unsigned int. */
- for (i = 0; i < 3; ++i)
- {
- field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
- get_identifier (field_name[i]), unsigned_type_node);
- if (field_chain != NULL_TREE)
- DECL_CHAIN (field) = field_chain;
- field_chain = field;
- }
-
- /* The last field is an array of unsigned integers of size one. */
- field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
- get_identifier (field_name[3]),
- build_array_type (unsigned_type_node,
- build_index_type (size_one_node)));
- if (field_chain != NULL_TREE)
- DECL_CHAIN (field) = field_chain;
- field_chain = field;
-
- finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
- return type;
-}
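
/* For reference, a C-level sketch of the layout being rebuilt here; it
   mirrors the __processor_model structure in libgcc/config/i386/cpuinfo.c
   (the struct tag below is illustrative).  */

struct processor_model_sketch
{
  unsigned int __cpu_vendor;
  unsigned int __cpu_type;
  unsigned int __cpu_subtype;
  unsigned int __cpu_features[1];
};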
-
- /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
-
-static tree
-make_var_decl (tree type, const char *name)
-{
- tree new_decl;
-
- new_decl = build_decl (UNKNOWN_LOCATION,
- VAR_DECL,
- get_identifier(name),
- type);
-
- DECL_EXTERNAL (new_decl) = 1;
- TREE_STATIC (new_decl) = 1;
- TREE_PUBLIC (new_decl) = 1;
- DECL_INITIAL (new_decl) = 0;
- DECL_ARTIFICIAL (new_decl) = 0;
- DECL_PRESERVE_P (new_decl) = 1;
-
- make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
- assemble_variable (new_decl, 0, 0, 0);
-
- return new_decl;
-}
-
-/* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
- into an integer defined in libgcc/config/i386/cpuinfo.c. */
-
-static tree
-fold_builtin_cpu (tree fndecl, tree *args)
-{
- unsigned int i;
- enum ix86_builtins fn_code = (enum ix86_builtins)
- DECL_FUNCTION_CODE (fndecl);
- tree param_string_cst = NULL;
-
- tree __processor_model_type = build_processor_model_struct ();
- tree __cpu_model_var = make_var_decl (__processor_model_type,
- "__cpu_model");
-
-
- varpool_node::add (__cpu_model_var);
-
- gcc_assert ((args != NULL) && (*args != NULL));
-
- param_string_cst = *args;
- while (param_string_cst
- && TREE_CODE (param_string_cst) != STRING_CST)
- {
- /* *args must be an expr that can contain other EXPRs leading to a
- STRING_CST. */
- if (!EXPR_P (param_string_cst))
- {
- error ("parameter to builtin must be a string constant or literal");
- return integer_zero_node;
- }
- param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
- }
-
- gcc_assert (param_string_cst);
-
- if (fn_code == IX86_BUILTIN_CPU_IS)
- {
- tree ref;
- tree field;
- tree final;
-
- unsigned int field_val = 0;
- unsigned int NUM_ARCH_NAMES
- = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
-
- for (i = 0; i < NUM_ARCH_NAMES; i++)
- if (strcmp (arch_names_table[i].name,
- TREE_STRING_POINTER (param_string_cst)) == 0)
- break;
-
- if (i == NUM_ARCH_NAMES)
- {
- error ("parameter to builtin not valid: %s",
- TREE_STRING_POINTER (param_string_cst));
- return integer_zero_node;
- }
-
- field = TYPE_FIELDS (__processor_model_type);
- field_val = arch_names_table[i].model;
-
- /* CPU types are stored in the next field. */
- if (field_val > M_CPU_TYPE_START
- && field_val < M_CPU_SUBTYPE_START)
- {
- field = DECL_CHAIN (field);
- field_val -= M_CPU_TYPE_START;
- }
-
- /* CPU subtypes are stored in the next field. */
- if (field_val > M_CPU_SUBTYPE_START)
- {
- field = DECL_CHAIN (DECL_CHAIN (field));
- field_val -= M_CPU_SUBTYPE_START;
- }
-
- /* Get the appropriate field in __cpu_model. */
- ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
- field, NULL_TREE);
-
- /* Check the value. */
- final = build2 (EQ_EXPR, unsigned_type_node, ref,
- build_int_cstu (unsigned_type_node, field_val));
- return build1 (CONVERT_EXPR, integer_type_node, final);
- }
- else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
- {
- tree ref;
- tree array_elt;
- tree field;
- tree final;
-
- unsigned int field_val = 0;
- unsigned int NUM_ISA_NAMES
- = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
-
- for (i = 0; i < NUM_ISA_NAMES; i++)
- if (strcmp (isa_names_table[i].name,
- TREE_STRING_POINTER (param_string_cst)) == 0)
- break;
-
- if (i == NUM_ISA_NAMES)
- {
- error ("parameter to builtin not valid: %s",
- TREE_STRING_POINTER (param_string_cst));
- return integer_zero_node;
- }
-
- if (isa_names_table[i].feature >= 32)
- {
- tree __cpu_features2_var = make_var_decl (unsigned_type_node,
- "__cpu_features2");
-
- varpool_node::add (__cpu_features2_var);
- field_val = (1U << (isa_names_table[i].feature - 32));
- /* Return __cpu_features2 & field_val */
- final = build2 (BIT_AND_EXPR, unsigned_type_node,
- __cpu_features2_var,
- build_int_cstu (unsigned_type_node, field_val));
- return build1 (CONVERT_EXPR, integer_type_node, final);
- }
-
- field = TYPE_FIELDS (__processor_model_type);
- /* Get the last field, which is __cpu_features. */
- while (DECL_CHAIN (field))
- field = DECL_CHAIN (field);
-
- /* Get the appropriate field: __cpu_model.__cpu_features */
- ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
- field, NULL_TREE);
-
- /* Access the 0th element of __cpu_features array. */
- array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
- integer_zero_node, NULL_TREE, NULL_TREE);
-
- field_val = (1U << isa_names_table[i].feature);
- /* Return __cpu_model.__cpu_features[0] & field_val */
- final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
- build_int_cstu (unsigned_type_node, field_val));
- return build1 (CONVERT_EXPR, integer_type_node, final);
- }
- gcc_unreachable ();
-}
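
/* A compact sketch of the two folds above (parameter names invented):
   features below 32 test a bit of __cpu_model.__cpu_features[0], while
   features 32 and up test the separate __cpu_features2 word.  */

static int
cpu_supports_sketch (enum processor_features f,
                     const unsigned int *cpu_features,
                     unsigned int cpu_features2)
{
  if (f >= 32)
    return (cpu_features2 & (1U << (f - 32))) != 0;
  return (cpu_features[0] & (1U << f)) != 0;
}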
-
-/* Return the shift count of a vector by scalar shift builtin second argument
- ARG1. */
-static tree
-ix86_vector_shift_count (tree arg1)
-{
- if (tree_fits_uhwi_p (arg1))
- return arg1;
- else if (TREE_CODE (arg1) == VECTOR_CST && CHAR_BIT == 8)
- {
- /* The count argument is weird: it is passed in as various 128-bit
- (or 64-bit) vectors, and its low 64 bits are the count. */
- unsigned char buf[16];
- int len = native_encode_expr (arg1, buf, 16);
- if (len == 0)
- return NULL_TREE;
- tree t = native_interpret_expr (uint64_type_node, buf, len);
- if (t && tree_fits_uhwi_p (t))
- return t;
- }
- return NULL_TREE;
-}
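
/* Intrinsic-level sketch of the count operand handled above: the
   vector-by-scalar shift intrinsics pass the count as the low 64 bits of
   another vector, so a constant count reaches this folder as a VECTOR_CST.
   Needs SSE2; the wrapper name is illustrative.  */

#include <emmintrin.h>

static __m128i
shift_left_words_by_3 (__m128i v)
{
  return _mm_sll_epi32 (v, _mm_cvtsi32_si128 (3));
}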
-
-static tree
-ix86_fold_builtin (tree fndecl, int n_args,
- tree *args, bool ignore ATTRIBUTE_UNUSED)
-{
- if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
- {
- enum ix86_builtins fn_code = (enum ix86_builtins)
- DECL_FUNCTION_CODE (fndecl);
- enum rtx_code rcode;
- bool is_vshift;
- unsigned HOST_WIDE_INT mask;
-
- switch (fn_code)
- {
- case IX86_BUILTIN_CPU_IS:
- case IX86_BUILTIN_CPU_SUPPORTS:
- gcc_assert (n_args == 1);
- return fold_builtin_cpu (fndecl, args);
-
- case IX86_BUILTIN_NANQ:
- case IX86_BUILTIN_NANSQ:
- {
- tree type = TREE_TYPE (TREE_TYPE (fndecl));
- const char *str = c_getstr (*args);
- int quiet = fn_code == IX86_BUILTIN_NANQ;
- REAL_VALUE_TYPE real;
-
- if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
- return build_real (type, real);
- return NULL_TREE;
- }
-
- case IX86_BUILTIN_INFQ:
- case IX86_BUILTIN_HUGE_VALQ:
- {
- tree type = TREE_TYPE (TREE_TYPE (fndecl));
- REAL_VALUE_TYPE inf;
- real_inf (&inf);
- return build_real (type, inf);
- }
-
- case IX86_BUILTIN_TZCNT16:
- case IX86_BUILTIN_CTZS:
- case IX86_BUILTIN_TZCNT32:
- case IX86_BUILTIN_TZCNT64:
- gcc_assert (n_args == 1);
- if (TREE_CODE (args[0]) == INTEGER_CST)
- {
- tree type = TREE_TYPE (TREE_TYPE (fndecl));
- tree arg = args[0];
- if (fn_code == IX86_BUILTIN_TZCNT16
- || fn_code == IX86_BUILTIN_CTZS)
- arg = fold_convert (short_unsigned_type_node, arg);
- if (integer_zerop (arg))
- return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
- else
- return fold_const_call (CFN_CTZ, type, arg);
- }
- break;
-
- case IX86_BUILTIN_LZCNT16:
- case IX86_BUILTIN_CLZS:
- case IX86_BUILTIN_LZCNT32:
- case IX86_BUILTIN_LZCNT64:
- gcc_assert (n_args == 1);
- if (TREE_CODE (args[0]) == INTEGER_CST)
- {
- tree type = TREE_TYPE (TREE_TYPE (fndecl));
- tree arg = args[0];
- if (fn_code == IX86_BUILTIN_LZCNT16
- || fn_code == IX86_BUILTIN_CLZS)
- arg = fold_convert (short_unsigned_type_node, arg);
- if (integer_zerop (arg))
- return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
- else
- return fold_const_call (CFN_CLZ, type, arg);
- }
- break;
-
- case IX86_BUILTIN_BEXTR32:
- case IX86_BUILTIN_BEXTR64:
- case IX86_BUILTIN_BEXTRI32:
- case IX86_BUILTIN_BEXTRI64:
- gcc_assert (n_args == 2);
- if (tree_fits_uhwi_p (args[1]))
- {
- unsigned HOST_WIDE_INT res = 0;
- unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
- unsigned int start = tree_to_uhwi (args[1]);
- unsigned int len = (start & 0xff00) >> 8;
- start &= 0xff;
- if (start >= prec || len == 0)
- res = 0;
- else if (!tree_fits_uhwi_p (args[0]))
- break;
- else
- res = tree_to_uhwi (args[0]) >> start;
- if (len > prec)
- len = prec;
- if (len < HOST_BITS_PER_WIDE_INT)
- res &= (HOST_WIDE_INT_1U << len) - 1;
- return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
- }
- break;
-
- case IX86_BUILTIN_BZHI32:
- case IX86_BUILTIN_BZHI64:
- gcc_assert (n_args == 2);
- if (tree_fits_uhwi_p (args[1]))
- {
- unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
- if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
- return args[0];
- if (idx == 0)
- return build_int_cst (TREE_TYPE (TREE_TYPE (fndecl)), 0);
- if (!tree_fits_uhwi_p (args[0]))
- break;
- unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
- res &= ~(HOST_WIDE_INT_M1U << idx);
- return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
- }
- break;
-
- case IX86_BUILTIN_PDEP32:
- case IX86_BUILTIN_PDEP64:
- gcc_assert (n_args == 2);
- if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
- {
- unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
- unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
- unsigned HOST_WIDE_INT res = 0;
- unsigned HOST_WIDE_INT m, k = 1;
- for (m = 1; m; m <<= 1)
- if ((mask & m) != 0)
- {
- if ((src & k) != 0)
- res |= m;
- k <<= 1;
- }
- return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
- }
- break;
-
- case IX86_BUILTIN_PEXT32:
- case IX86_BUILTIN_PEXT64:
- gcc_assert (n_args == 2);
- if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
- {
- unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
- unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
- unsigned HOST_WIDE_INT res = 0;
- unsigned HOST_WIDE_INT m, k = 1;
- for (m = 1; m; m <<= 1)
- if ((mask & m) != 0)
- {
- if ((src & m) != 0)
- res |= k;
- k <<= 1;
- }
- return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
- }
- break;
-
- case IX86_BUILTIN_MOVMSKPS:
- case IX86_BUILTIN_PMOVMSKB:
- case IX86_BUILTIN_MOVMSKPD:
- case IX86_BUILTIN_PMOVMSKB128:
- case IX86_BUILTIN_MOVMSKPD256:
- case IX86_BUILTIN_MOVMSKPS256:
- case IX86_BUILTIN_PMOVMSKB256:
- gcc_assert (n_args == 1);
- if (TREE_CODE (args[0]) == VECTOR_CST)
- {
- HOST_WIDE_INT res = 0;
- for (unsigned i = 0; i < VECTOR_CST_NELTS (args[0]); ++i)
- {
- tree e = VECTOR_CST_ELT (args[0], i);
- if (TREE_CODE (e) == INTEGER_CST && !TREE_OVERFLOW (e))
- {
- if (wi::neg_p (wi::to_wide (e)))
- res |= HOST_WIDE_INT_1 << i;
- }
- else if (TREE_CODE (e) == REAL_CST && !TREE_OVERFLOW (e))
- {
- if (TREE_REAL_CST (e).sign)
- res |= HOST_WIDE_INT_1 << i;
- }
- else
- return NULL_TREE;
- }
- return build_int_cst (TREE_TYPE (TREE_TYPE (fndecl)), res);
- }
- break;
-
- case IX86_BUILTIN_PSLLD:
- case IX86_BUILTIN_PSLLD128:
- case IX86_BUILTIN_PSLLD128_MASK:
- case IX86_BUILTIN_PSLLD256:
- case IX86_BUILTIN_PSLLD256_MASK:
- case IX86_BUILTIN_PSLLD512:
- case IX86_BUILTIN_PSLLDI:
- case IX86_BUILTIN_PSLLDI128:
- case IX86_BUILTIN_PSLLDI128_MASK:
- case IX86_BUILTIN_PSLLDI256:
- case IX86_BUILTIN_PSLLDI256_MASK:
- case IX86_BUILTIN_PSLLDI512:
- case IX86_BUILTIN_PSLLQ:
- case IX86_BUILTIN_PSLLQ128:
- case IX86_BUILTIN_PSLLQ128_MASK:
- case IX86_BUILTIN_PSLLQ256:
- case IX86_BUILTIN_PSLLQ256_MASK:
- case IX86_BUILTIN_PSLLQ512:
- case IX86_BUILTIN_PSLLQI:
- case IX86_BUILTIN_PSLLQI128:
- case IX86_BUILTIN_PSLLQI128_MASK:
- case IX86_BUILTIN_PSLLQI256:
- case IX86_BUILTIN_PSLLQI256_MASK:
- case IX86_BUILTIN_PSLLQI512:
- case IX86_BUILTIN_PSLLW:
- case IX86_BUILTIN_PSLLW128:
- case IX86_BUILTIN_PSLLW128_MASK:
- case IX86_BUILTIN_PSLLW256:
- case IX86_BUILTIN_PSLLW256_MASK:
- case IX86_BUILTIN_PSLLW512_MASK:
- case IX86_BUILTIN_PSLLWI:
- case IX86_BUILTIN_PSLLWI128:
- case IX86_BUILTIN_PSLLWI128_MASK:
- case IX86_BUILTIN_PSLLWI256:
- case IX86_BUILTIN_PSLLWI256_MASK:
- case IX86_BUILTIN_PSLLWI512_MASK:
- rcode = ASHIFT;
- is_vshift = false;
- goto do_shift;
- case IX86_BUILTIN_PSRAD:
- case IX86_BUILTIN_PSRAD128:
- case IX86_BUILTIN_PSRAD128_MASK:
- case IX86_BUILTIN_PSRAD256:
- case IX86_BUILTIN_PSRAD256_MASK:
- case IX86_BUILTIN_PSRAD512:
- case IX86_BUILTIN_PSRADI:
- case IX86_BUILTIN_PSRADI128:
- case IX86_BUILTIN_PSRADI128_MASK:
- case IX86_BUILTIN_PSRADI256:
- case IX86_BUILTIN_PSRADI256_MASK:
- case IX86_BUILTIN_PSRADI512:
- case IX86_BUILTIN_PSRAQ128_MASK:
- case IX86_BUILTIN_PSRAQ256_MASK:
- case IX86_BUILTIN_PSRAQ512:
- case IX86_BUILTIN_PSRAQI128_MASK:
- case IX86_BUILTIN_PSRAQI256_MASK:
- case IX86_BUILTIN_PSRAQI512:
- case IX86_BUILTIN_PSRAW:
- case IX86_BUILTIN_PSRAW128:
- case IX86_BUILTIN_PSRAW128_MASK:
- case IX86_BUILTIN_PSRAW256:
- case IX86_BUILTIN_PSRAW256_MASK:
- case IX86_BUILTIN_PSRAW512:
- case IX86_BUILTIN_PSRAWI:
- case IX86_BUILTIN_PSRAWI128:
- case IX86_BUILTIN_PSRAWI128_MASK:
- case IX86_BUILTIN_PSRAWI256:
- case IX86_BUILTIN_PSRAWI256_MASK:
- case IX86_BUILTIN_PSRAWI512:
- rcode = ASHIFTRT;
- is_vshift = false;
- goto do_shift;
- case IX86_BUILTIN_PSRLD:
- case IX86_BUILTIN_PSRLD128:
- case IX86_BUILTIN_PSRLD128_MASK:
- case IX86_BUILTIN_PSRLD256:
- case IX86_BUILTIN_PSRLD256_MASK:
- case IX86_BUILTIN_PSRLD512:
- case IX86_BUILTIN_PSRLDI:
- case IX86_BUILTIN_PSRLDI128:
- case IX86_BUILTIN_PSRLDI128_MASK:
- case IX86_BUILTIN_PSRLDI256:
- case IX86_BUILTIN_PSRLDI256_MASK:
- case IX86_BUILTIN_PSRLDI512:
- case IX86_BUILTIN_PSRLQ:
- case IX86_BUILTIN_PSRLQ128:
- case IX86_BUILTIN_PSRLQ128_MASK:
- case IX86_BUILTIN_PSRLQ256:
- case IX86_BUILTIN_PSRLQ256_MASK:
- case IX86_BUILTIN_PSRLQ512:
- case IX86_BUILTIN_PSRLQI:
- case IX86_BUILTIN_PSRLQI128:
- case IX86_BUILTIN_PSRLQI128_MASK:
- case IX86_BUILTIN_PSRLQI256:
- case IX86_BUILTIN_PSRLQI256_MASK:
- case IX86_BUILTIN_PSRLQI512:
- case IX86_BUILTIN_PSRLW:
- case IX86_BUILTIN_PSRLW128:
- case IX86_BUILTIN_PSRLW128_MASK:
- case IX86_BUILTIN_PSRLW256:
- case IX86_BUILTIN_PSRLW256_MASK:
- case IX86_BUILTIN_PSRLW512:
- case IX86_BUILTIN_PSRLWI:
- case IX86_BUILTIN_PSRLWI128:
- case IX86_BUILTIN_PSRLWI128_MASK:
- case IX86_BUILTIN_PSRLWI256:
- case IX86_BUILTIN_PSRLWI256_MASK:
- case IX86_BUILTIN_PSRLWI512:
- rcode = LSHIFTRT;
- is_vshift = false;
- goto do_shift;
- case IX86_BUILTIN_PSLLVV16HI:
- case IX86_BUILTIN_PSLLVV16SI:
- case IX86_BUILTIN_PSLLVV2DI:
- case IX86_BUILTIN_PSLLVV2DI_MASK:
- case IX86_BUILTIN_PSLLVV32HI:
- case IX86_BUILTIN_PSLLVV4DI:
- case IX86_BUILTIN_PSLLVV4DI_MASK:
- case IX86_BUILTIN_PSLLVV4SI:
- case IX86_BUILTIN_PSLLVV4SI_MASK:
- case IX86_BUILTIN_PSLLVV8DI:
- case IX86_BUILTIN_PSLLVV8HI:
- case IX86_BUILTIN_PSLLVV8SI:
- case IX86_BUILTIN_PSLLVV8SI_MASK:
- rcode = ASHIFT;
- is_vshift = true;
- goto do_shift;
- case IX86_BUILTIN_PSRAVQ128:
- case IX86_BUILTIN_PSRAVQ256:
- case IX86_BUILTIN_PSRAVV16HI:
- case IX86_BUILTIN_PSRAVV16SI:
- case IX86_BUILTIN_PSRAVV32HI:
- case IX86_BUILTIN_PSRAVV4SI:
- case IX86_BUILTIN_PSRAVV4SI_MASK:
- case IX86_BUILTIN_PSRAVV8DI:
- case IX86_BUILTIN_PSRAVV8HI:
- case IX86_BUILTIN_PSRAVV8SI:
- case IX86_BUILTIN_PSRAVV8SI_MASK:
- rcode = ASHIFTRT;
- is_vshift = true;
- goto do_shift;
- case IX86_BUILTIN_PSRLVV16HI:
- case IX86_BUILTIN_PSRLVV16SI:
- case IX86_BUILTIN_PSRLVV2DI:
- case IX86_BUILTIN_PSRLVV2DI_MASK:
- case IX86_BUILTIN_PSRLVV32HI:
- case IX86_BUILTIN_PSRLVV4DI:
- case IX86_BUILTIN_PSRLVV4DI_MASK:
- case IX86_BUILTIN_PSRLVV4SI:
- case IX86_BUILTIN_PSRLVV4SI_MASK:
- case IX86_BUILTIN_PSRLVV8DI:
- case IX86_BUILTIN_PSRLVV8HI:
- case IX86_BUILTIN_PSRLVV8SI:
- case IX86_BUILTIN_PSRLVV8SI_MASK:
- rcode = LSHIFTRT;
- is_vshift = true;
- goto do_shift;
-
- do_shift:
- gcc_assert (n_args >= 2);
- if (TREE_CODE (args[0]) != VECTOR_CST)
- break;
- mask = HOST_WIDE_INT_M1U;
- if (n_args > 2)
- {
- /* This is a masked shift. */
- if (!tree_fits_uhwi_p (args[n_args - 1])
- || TREE_SIDE_EFFECTS (args[n_args - 2]))
- break;
- mask = tree_to_uhwi (args[n_args - 1]);
- unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0]));
- mask |= HOST_WIDE_INT_M1U << elems;
- if (mask != HOST_WIDE_INT_M1U
- && TREE_CODE (args[n_args - 2]) != VECTOR_CST)
- break;
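- /* No element mask bits set: the call reduces to the pass-through
- operand. */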
- if (mask == (HOST_WIDE_INT_M1U << elems))
- return args[n_args - 2];
- }
- if (is_vshift && TREE_CODE (args[1]) != VECTOR_CST)
- break;
- if (tree tem = (is_vshift ? integer_one_node
- : ix86_vector_shift_count (args[1])))
- {
- unsigned HOST_WIDE_INT count = tree_to_uhwi (tem);
- unsigned HOST_WIDE_INT prec
- = TYPE_PRECISION (TREE_TYPE (TREE_TYPE (args[0])));
- if (count == 0 && mask == HOST_WIDE_INT_M1U)
- return args[0];
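- /* Shift counts of at least the element precision saturate:
- arithmetic right shifts behave as shifts by precision - 1, the
- other shifts yield zero (unless masked). */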
- if (count >= prec)
- {
- if (rcode == ASHIFTRT)
- count = prec - 1;
- else if (mask == HOST_WIDE_INT_M1U)
- return build_zero_cst (TREE_TYPE (args[0]));
- }
- tree countt = NULL_TREE;
- if (!is_vshift)
- {
- if (count >= prec)
- countt = integer_zero_node;
- else
- countt = build_int_cst (integer_type_node, count);
- }
- tree_vector_builder builder;
- builder.new_unary_operation (TREE_TYPE (args[0]), args[0],
- false);
- unsigned int cnt = builder.encoded_nelts ();
- for (unsigned int i = 0; i < cnt; ++i)
- {
- tree elt = VECTOR_CST_ELT (args[0], i);
- if (TREE_CODE (elt) != INTEGER_CST || TREE_OVERFLOW (elt))
- return NULL_TREE;
- tree type = TREE_TYPE (elt);
- if (rcode == LSHIFTRT)
- elt = fold_convert (unsigned_type_for (type), elt);
- if (is_vshift)
- {
- countt = VECTOR_CST_ELT (args[1], i);
- if (TREE_CODE (countt) != INTEGER_CST
- || TREE_OVERFLOW (countt))
- return NULL_TREE;
- if (wi::neg_p (wi::to_wide (countt))
- || wi::to_widest (countt) >= prec)
- {
- if (rcode == ASHIFTRT)
- countt = build_int_cst (TREE_TYPE (countt),
- prec - 1);
- else
- {
- elt = build_zero_cst (TREE_TYPE (elt));
- countt = build_zero_cst (TREE_TYPE (countt));
- }
- }
- }
- else if (count >= prec)
- elt = build_zero_cst (TREE_TYPE (elt));
- elt = const_binop (rcode == ASHIFT
- ? LSHIFT_EXPR : RSHIFT_EXPR,
- TREE_TYPE (elt), elt, countt);
- if (!elt || TREE_CODE (elt) != INTEGER_CST)
- return NULL_TREE;
- if (rcode == LSHIFTRT)
- elt = fold_convert (type, elt);
- if ((mask & (HOST_WIDE_INT_1U << i)) == 0)
- {
- elt = VECTOR_CST_ELT (args[n_args - 2], i);
- if (TREE_CODE (elt) != INTEGER_CST
- || TREE_OVERFLOW (elt))
- return NULL_TREE;
- }
- builder.quick_push (elt);
- }
- return builder.build ();
- }
- break;
-
- default:
- break;
- }
- }
-
-#ifdef SUBTARGET_FOLD_BUILTIN
- return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
-#endif
-
- return NULL_TREE;
-}
-
- /* Fold an MD builtin (use ix86_fold_builtin for folding into a
- constant) in GIMPLE. */
-
-bool
-ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
-{
- gimple *stmt = gsi_stmt (*gsi);
- tree fndecl = gimple_call_fndecl (stmt);
- gcc_checking_assert (fndecl && fndecl_built_in_p (fndecl, BUILT_IN_MD));
- int n_args = gimple_call_num_args (stmt);
- enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
- tree decl = NULL_TREE;
- tree arg0, arg1;
- enum rtx_code rcode;
- unsigned HOST_WIDE_INT count;
- bool is_vshift;
-
- switch (fn_code)
- {
- case IX86_BUILTIN_TZCNT32:
- decl = builtin_decl_implicit (BUILT_IN_CTZ);
- goto fold_tzcnt_lzcnt;
-
- case IX86_BUILTIN_TZCNT64:
- decl = builtin_decl_implicit (BUILT_IN_CTZLL);
- goto fold_tzcnt_lzcnt;
-
- case IX86_BUILTIN_LZCNT32:
- decl = builtin_decl_implicit (BUILT_IN_CLZ);
- goto fold_tzcnt_lzcnt;
-
- case IX86_BUILTIN_LZCNT64:
- decl = builtin_decl_implicit (BUILT_IN_CLZLL);
- goto fold_tzcnt_lzcnt;
-
- fold_tzcnt_lzcnt:
- gcc_assert (n_args == 1);
- arg0 = gimple_call_arg (stmt, 0);
- if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt))
- {
- int prec = TYPE_PRECISION (TREE_TYPE (arg0));
- /* If arg0 is provably non-zero, optimize into the generic
- __builtin_c[tl]z{,ll} functions, which the middle-end handles
- better. */
- if (!expr_not_equal_to (arg0, wi::zero (prec)))
- return false;
-
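- /* Replace the call with __builtin_c[tl]z{,ll} followed by a cast
- to the original result type. */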
- location_t loc = gimple_location (stmt);
- gimple *g = gimple_build_call (decl, 1, arg0);
- gimple_set_location (g, loc);
- tree lhs = make_ssa_name (integer_type_node);
- gimple_call_set_lhs (g, lhs);
- gsi_insert_before (gsi, g, GSI_SAME_STMT);
- g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs);
- gimple_set_location (g, loc);
- gsi_replace (gsi, g, false);
- return true;
- }
- break;
-
- case IX86_BUILTIN_BZHI32:
- case IX86_BUILTIN_BZHI64:
- gcc_assert (n_args == 2);
- arg1 = gimple_call_arg (stmt, 1);
- if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
- {
- unsigned int idx = tree_to_uhwi (arg1) & 0xff;
- arg0 = gimple_call_arg (stmt, 0);
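- /* Only the trivial case is handled here: a count of at least the
- operand precision leaves the operand unchanged. */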
- if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
- break;
- location_t loc = gimple_location (stmt);
- gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
- gimple_set_location (g, loc);
- gsi_replace (gsi, g, false);
- return true;
- }
- break;
-
- case IX86_BUILTIN_PDEP32:
- case IX86_BUILTIN_PDEP64:
- case IX86_BUILTIN_PEXT32:
- case IX86_BUILTIN_PEXT64:
- gcc_assert (n_args == 2);
- arg1 = gimple_call_arg (stmt, 1);
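- /* With an all-ones mask both pdep and pext copy SRC unchanged, so
- the call reduces to its first argument. */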
- if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
- {
- location_t loc = gimple_location (stmt);
- arg0 = gimple_call_arg (stmt, 0);
- gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
- gimple_set_location (g, loc);
- gsi_replace (gsi, g, false);
- return true;
- }
- break;
-
- case IX86_BUILTIN_PSLLD:
- case IX86_BUILTIN_PSLLD128:
- case IX86_BUILTIN_PSLLD128_MASK:
- case IX86_BUILTIN_PSLLD256:
- case IX86_BUILTIN_PSLLD256_MASK:
- case IX86_BUILTIN_PSLLD512:
- case IX86_BUILTIN_PSLLDI:
- case IX86_BUILTIN_PSLLDI128:
- case IX86_BUILTIN_PSLLDI128_MASK:
- case IX86_BUILTIN_PSLLDI256:
- case IX86_BUILTIN_PSLLDI256_MASK:
- case IX86_BUILTIN_PSLLDI512:
- case IX86_BUILTIN_PSLLQ:
- case IX86_BUILTIN_PSLLQ128:
- case IX86_BUILTIN_PSLLQ128_MASK:
- case IX86_BUILTIN_PSLLQ256:
- case IX86_BUILTIN_PSLLQ256_MASK:
- case IX86_BUILTIN_PSLLQ512:
- case IX86_BUILTIN_PSLLQI:
- case IX86_BUILTIN_PSLLQI128:
- case IX86_BUILTIN_PSLLQI128_MASK:
- case IX86_BUILTIN_PSLLQI256:
- case IX86_BUILTIN_PSLLQI256_MASK:
- case IX86_BUILTIN_PSLLQI512:
- case IX86_BUILTIN_PSLLW:
- case IX86_BUILTIN_PSLLW128:
- case IX86_BUILTIN_PSLLW128_MASK:
- case IX86_BUILTIN_PSLLW256:
- case IX86_BUILTIN_PSLLW256_MASK:
- case IX86_BUILTIN_PSLLW512_MASK:
- case IX86_BUILTIN_PSLLWI:
- case IX86_BUILTIN_PSLLWI128:
- case IX86_BUILTIN_PSLLWI128_MASK:
- case IX86_BUILTIN_PSLLWI256:
- case IX86_BUILTIN_PSLLWI256_MASK:
- case IX86_BUILTIN_PSLLWI512_MASK:
- rcode = ASHIFT;
- is_vshift = false;
- goto do_shift;
- case IX86_BUILTIN_PSRAD:
- case IX86_BUILTIN_PSRAD128:
- case IX86_BUILTIN_PSRAD128_MASK:
- case IX86_BUILTIN_PSRAD256:
- case IX86_BUILTIN_PSRAD256_MASK:
- case IX86_BUILTIN_PSRAD512:
- case IX86_BUILTIN_PSRADI:
- case IX86_BUILTIN_PSRADI128:
- case IX86_BUILTIN_PSRADI128_MASK:
- case IX86_BUILTIN_PSRADI256:
- case IX86_BUILTIN_PSRADI256_MASK:
- case IX86_BUILTIN_PSRADI512:
- case IX86_BUILTIN_PSRAQ128_MASK:
- case IX86_BUILTIN_PSRAQ256_MASK:
- case IX86_BUILTIN_PSRAQ512:
- case IX86_BUILTIN_PSRAQI128_MASK:
- case IX86_BUILTIN_PSRAQI256_MASK:
- case IX86_BUILTIN_PSRAQI512:
- case IX86_BUILTIN_PSRAW:
- case IX86_BUILTIN_PSRAW128:
- case IX86_BUILTIN_PSRAW128_MASK:
- case IX86_BUILTIN_PSRAW256:
- case IX86_BUILTIN_PSRAW256_MASK:
- case IX86_BUILTIN_PSRAW512:
- case IX86_BUILTIN_PSRAWI:
- case IX86_BUILTIN_PSRAWI128:
- case IX86_BUILTIN_PSRAWI128_MASK:
- case IX86_BUILTIN_PSRAWI256:
- case IX86_BUILTIN_PSRAWI256_MASK:
- case IX86_BUILTIN_PSRAWI512:
- rcode = ASHIFTRT;
- is_vshift = false;
- goto do_shift;
- case IX86_BUILTIN_PSRLD:
- case IX86_BUILTIN_PSRLD128:
- case IX86_BUILTIN_PSRLD128_MASK:
- case IX86_BUILTIN_PSRLD256:
- case IX86_BUILTIN_PSRLD256_MASK:
- case IX86_BUILTIN_PSRLD512:
- case IX86_BUILTIN_PSRLDI:
- case IX86_BUILTIN_PSRLDI128:
- case IX86_BUILTIN_PSRLDI128_MASK:
- case IX86_BUILTIN_PSRLDI256:
- case IX86_BUILTIN_PSRLDI256_MASK:
- case IX86_BUILTIN_PSRLDI512:
- case IX86_BUILTIN_PSRLQ:
- case IX86_BUILTIN_PSRLQ128:
- case IX86_BUILTIN_PSRLQ128_MASK:
- case IX86_BUILTIN_PSRLQ256:
- case IX86_BUILTIN_PSRLQ256_MASK:
- case IX86_BUILTIN_PSRLQ512:
- case IX86_BUILTIN_PSRLQI:
- case IX86_BUILTIN_PSRLQI128:
- case IX86_BUILTIN_PSRLQI128_MASK:
- case IX86_BUILTIN_PSRLQI256:
- case IX86_BUILTIN_PSRLQI256_MASK:
- case IX86_BUILTIN_PSRLQI512:
- case IX86_BUILTIN_PSRLW:
- case IX86_BUILTIN_PSRLW128:
- case IX86_BUILTIN_PSRLW128_MASK:
- case IX86_BUILTIN_PSRLW256:
- case IX86_BUILTIN_PSRLW256_MASK:
- case IX86_BUILTIN_PSRLW512:
- case IX86_BUILTIN_PSRLWI:
- case IX86_BUILTIN_PSRLWI128:
- case IX86_BUILTIN_PSRLWI128_MASK:
- case IX86_BUILTIN_PSRLWI256:
- case IX86_BUILTIN_PSRLWI256_MASK:
- case IX86_BUILTIN_PSRLWI512:
- rcode = LSHIFTRT;
- is_vshift = false;
- goto do_shift;
- case IX86_BUILTIN_PSLLVV16HI:
- case IX86_BUILTIN_PSLLVV16SI:
- case IX86_BUILTIN_PSLLVV2DI:
- case IX86_BUILTIN_PSLLVV2DI_MASK:
- case IX86_BUILTIN_PSLLVV32HI:
- case IX86_BUILTIN_PSLLVV4DI:
- case IX86_BUILTIN_PSLLVV4DI_MASK:
- case IX86_BUILTIN_PSLLVV4SI:
- case IX86_BUILTIN_PSLLVV4SI_MASK:
- case IX86_BUILTIN_PSLLVV8DI:
- case IX86_BUILTIN_PSLLVV8HI:
- case IX86_BUILTIN_PSLLVV8SI:
- case IX86_BUILTIN_PSLLVV8SI_MASK:
- rcode = ASHIFT;
- is_vshift = true;
- goto do_shift;
- case IX86_BUILTIN_PSRAVQ128:
- case IX86_BUILTIN_PSRAVQ256:
- case IX86_BUILTIN_PSRAVV16HI:
- case IX86_BUILTIN_PSRAVV16SI:
- case IX86_BUILTIN_PSRAVV32HI:
- case IX86_BUILTIN_PSRAVV4SI:
- case IX86_BUILTIN_PSRAVV4SI_MASK:
- case IX86_BUILTIN_PSRAVV8DI:
- case IX86_BUILTIN_PSRAVV8HI:
- case IX86_BUILTIN_PSRAVV8SI:
- case IX86_BUILTIN_PSRAVV8SI_MASK:
- rcode = ASHIFTRT;
- is_vshift = true;
- goto do_shift;
- case IX86_BUILTIN_PSRLVV16HI:
- case IX86_BUILTIN_PSRLVV16SI:
- case IX86_BUILTIN_PSRLVV2DI:
- case IX86_BUILTIN_PSRLVV2DI_MASK:
- case IX86_BUILTIN_PSRLVV32HI:
- case IX86_BUILTIN_PSRLVV4DI:
- case IX86_BUILTIN_PSRLVV4DI_MASK:
- case IX86_BUILTIN_PSRLVV4SI:
- case IX86_BUILTIN_PSRLVV4SI_MASK:
- case IX86_BUILTIN_PSRLVV8DI:
- case IX86_BUILTIN_PSRLVV8HI:
- case IX86_BUILTIN_PSRLVV8SI:
- case IX86_BUILTIN_PSRLVV8SI_MASK:
- rcode = LSHIFTRT;
- is_vshift = true;
- goto do_shift;
-
- do_shift:
- gcc_assert (n_args >= 2);
- arg0 = gimple_call_arg (stmt, 0);
- arg1 = gimple_call_arg (stmt, 1);
- if (n_args > 2)
- {
- /* This is a masked shift. Only optimize if the mask is all ones. */
- tree argl = gimple_call_arg (stmt, n_args - 1);
- if (!tree_fits_uhwi_p (argl))
- break;
- unsigned HOST_WIDE_INT mask = tree_to_uhwi (argl);
- unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
- if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U)
- break;
- }
- if (is_vshift)
- {
- if (TREE_CODE (arg1) != VECTOR_CST)
- break;
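- /* Only the degenerate variable-count cases are handled here: an
- all-zero count vector, or every count out of range (which zeroes
- a logical shift). */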
- count = TYPE_PRECISION (TREE_TYPE (TREE_TYPE (arg0)));
- if (integer_zerop (arg1))
- count = 0;
- else if (rcode == ASHIFTRT)
- break;
- else
- for (unsigned int i = 0; i < VECTOR_CST_NELTS (arg1); ++i)
- {
- tree elt = VECTOR_CST_ELT (arg1, i);
- if (!wi::neg_p (wi::to_wide (elt))
- && wi::to_widest (elt) < count)
- return false;
- }
- }
- else
- {
- arg1 = ix86_vector_shift_count (arg1);
- if (!arg1)
- break;
- count = tree_to_uhwi (arg1);
- }
- if (count == 0)
- {
- /* Just return the first argument for shift by 0. */
- location_t loc = gimple_location (stmt);
- gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
- gimple_set_location (g, loc);
- gsi_replace (gsi, g, false);
- return true;
- }
- if (rcode != ASHIFTRT
- && count >= TYPE_PRECISION (TREE_TYPE (TREE_TYPE (arg0))))
- {
- /* For shift counts equal to or greater than the precision, the
- result is zero except for arithmetic right shifts. */
- location_t loc = gimple_location (stmt);
- gimple *g = gimple_build_assign (gimple_call_lhs (stmt),
- build_zero_cst (TREE_TYPE (arg0)));
- gimple_set_location (g, loc);
- gsi_replace (gsi, g, false);
- return true;
- }
- break;
-
- default:
- break;
- }
-
- return false;
-}
-
- /* Make builtins to detect cpu type and features supported. NAME is
- the builtin name, CODE is the builtin code, FTYPE is the function
- type of the builtin, and IS_CONST says whether the resulting decl
- should be marked TREE_READONLY. */
-
-static void
-make_cpu_type_builtin (const char* name, int code,
- enum ix86_builtin_func_type ftype, bool is_const)
-{
- tree decl;
- tree type;
-
- type = ix86_get_builtin_func_type (ftype);
- decl = add_builtin_function (name, type, code, BUILT_IN_MD,
- NULL, NULL_TREE);
- gcc_assert (decl != NULL_TREE);
- ix86_builtins[(int) code] = decl;
- TREE_READONLY (decl) = is_const;
-}
-
-/* Make builtins to get CPU type and features supported. The created
- builtins are:
-
- __builtin_cpu_init (), to detect cpu type and features,
- __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
- __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
- */
-
-static void
-ix86_init_platform_type_builtins (void)
-{
- make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
- INT_FTYPE_VOID, false);
- make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
- INT_FTYPE_PCCHAR, true);
- make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
- INT_FTYPE_PCCHAR, true);
-}
-
-/* Internal method for ix86_init_builtins. */
-
-static void
-ix86_init_builtins_va_builtins_abi (void)
-{
- tree ms_va_ref, sysv_va_ref;
- tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
- tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
- tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
- tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
-
- if (!TARGET_64BIT)
- return;
- fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
- fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
- ms_va_ref = build_reference_type (ms_va_list_type_node);
- sysv_va_ref = build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
-
- fnvoid_va_end_ms = build_function_type_list (void_type_node, ms_va_ref,
- NULL_TREE);
- fnvoid_va_start_ms
- = build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
- fnvoid_va_end_sysv
- = build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
- fnvoid_va_start_sysv
- = build_varargs_function_type_list (void_type_node, sysv_va_ref,
- NULL_TREE);
- fnvoid_va_copy_ms
- = build_function_type_list (void_type_node, ms_va_ref,
- ms_va_list_type_node, NULL_TREE);
- fnvoid_va_copy_sysv
- = build_function_type_list (void_type_node, sysv_va_ref,
- sysv_va_ref, NULL_TREE);
-
- add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
- BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
- add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
- BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
- add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
- BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
- add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
- BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
- add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
- BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
- add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
- BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
-}
-
-static void
-ix86_init_builtin_types (void)
-{
- tree float80_type_node, const_string_type_node;
-
- /* The __float80 type. */
- float80_type_node = long_double_type_node;
- if (TYPE_MODE (float80_type_node) != XFmode)
- {
- if (float64x_type_node != NULL_TREE
- && TYPE_MODE (float64x_type_node) == XFmode)
- float80_type_node = float64x_type_node;
- else
- {
- /* Otherwise build __float80 as a distinct 80-bit REAL_TYPE. */
- float80_type_node = make_node (REAL_TYPE);
-
- TYPE_PRECISION (float80_type_node) = 80;
- layout_type (float80_type_node);
- }
- }
- lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
-
- /* The __float128 type. The node has already been created as
- _Float128, so we only need to register the __float128 name for
- it. */
- lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
-
- const_string_type_node
- = build_pointer_type (build_qualified_type
- (char_type_node, TYPE_QUAL_CONST));
-
- /* This macro is built by i386-builtin-types.awk. */
- DEFINE_BUILTIN_PRIMITIVE_TYPES;
-}
-
-static void
-ix86_init_builtins (void)
-{
- tree ftype, decl;
-
- ix86_init_builtin_types ();
-
- /* Builtins to get CPU type and features. */
- ix86_init_platform_type_builtins ();
-
- /* TFmode support builtins. */
- def_builtin_const (0, 0, "__builtin_infq",
- FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
- def_builtin_const (0, 0, "__builtin_huge_valq",
- FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
-
- ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING);
- decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ,
- BUILT_IN_MD, "nanq", NULL_TREE);
- TREE_READONLY (decl) = 1;
- ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl;
-
- decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ,
- BUILT_IN_MD, "nansq", NULL_TREE);
- TREE_READONLY (decl) = 1;
- ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl;
-
- /* We will expand them to a normal call if SSE isn't available, since
- they are used by libgcc. */
- ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
- decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ,
- BUILT_IN_MD, "__fabstf2", NULL_TREE);
- TREE_READONLY (decl) = 1;
- ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl;
-
- ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
- decl = add_builtin_function ("__builtin_copysignq", ftype,
- IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
- "__copysigntf3", NULL_TREE);
- TREE_READONLY (decl) = 1;
- ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl;
-
- ix86_init_tm_builtins ();
- ix86_init_mmx_sse_builtins ();
-
- if (TARGET_LP64)
- ix86_init_builtins_va_builtins_abi ();
-
-#ifdef SUBTARGET_INIT_BUILTINS
- SUBTARGET_INIT_BUILTINS;
-#endif
-}
-
-/* Return the ix86 builtin for CODE. */
-
-static tree
-ix86_builtin_decl (unsigned code, bool)
-{
- if (code >= IX86_BUILTIN_MAX)
- return error_mark_node;
-
- return ix86_builtins[code];
-}
-
-/* Errors in the source file can cause expand_expr to return const0_rtx
- where we expect a vector. To avoid crashing, use one of the vector
- clear instructions. */
-static rtx
-safe_vector_operand (rtx x, machine_mode mode)
-{
- if (x == const0_rtx)
- x = CONST0_RTX (mode);
- return x;
-}
-
- /* Fix up modeless constants to fit the required mode. */
-static rtx
-fixup_modeless_constant (rtx x, machine_mode mode)
-{
- if (GET_MODE (x) == VOIDmode)
- x = convert_to_mode (mode, x, 1);
- return x;
-}
-
-/* Subroutine of ix86_expand_builtin to take care of binop insns. */
-
-static rtx
-ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
-{
- rtx pat;
- tree arg0 = CALL_EXPR_ARG (exp, 0);
- tree arg1 = CALL_EXPR_ARG (exp, 1);
- rtx op0 = expand_normal (arg0);
- rtx op1 = expand_normal (arg1);
- machine_mode tmode = insn_data[icode].operand[0].mode;
- machine_mode mode0 = insn_data[icode].operand[1].mode;
- machine_mode mode1 = insn_data[icode].operand[2].mode;
-
- if (VECTOR_MODE_P (mode0))
- op0 = safe_vector_operand (op0, mode0);
- if (VECTOR_MODE_P (mode1))
- op1 = safe_vector_operand (op1, mode1);
-
- if (optimize || !target
- || GET_MODE (target) != tmode
- || !insn_data[icode].operand[0].predicate (target, tmode))
- target = gen_reg_rtx (tmode);
-
- if (GET_MODE (op1) == SImode && mode1 == TImode)
- {
- rtx x = gen_reg_rtx (V4SImode);
- emit_insn (gen_sse2_loadd (x, op1));
- op1 = gen_lowpart (TImode, x);
- }
-
- if (!insn_data[icode].operand[1].predicate (op0, mode0))
- op0 = copy_to_mode_reg (mode0, op0);
- if (!insn_data[icode].operand[2].predicate (op1, mode1))
- op1 = copy_to_mode_reg (mode1, op1);
-
- pat = GEN_FCN (icode) (target, op0, op1);
- if (! pat)
- return 0;
-
- emit_insn (pat);
-
- return target;
-}
-
-/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
-
-static rtx
-ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
- enum ix86_builtin_func_type m_type,
- enum rtx_code sub_code)
-{
- rtx pat;
- int i;
- int nargs;
- bool comparison_p = false;
- bool tf_p = false;
- bool last_arg_constant = false;
- int num_memory = 0;
- struct {
- rtx op;
- machine_mode mode;
- } args[4];
-
- machine_mode tmode = insn_data[icode].operand[0].mode;
-
- switch (m_type)
- {
- case MULTI_ARG_4_DF2_DI_I:
- case MULTI_ARG_4_DF2_DI_I1:
- case MULTI_ARG_4_SF2_SI_I:
- case MULTI_ARG_4_SF2_SI_I1:
- nargs = 4;
- last_arg_constant = true;
- break;
-
- case MULTI_ARG_3_SF:
- case MULTI_ARG_3_DF:
- case MULTI_ARG_3_SF2:
- case MULTI_ARG_3_DF2:
- case MULTI_ARG_3_DI:
- case MULTI_ARG_3_SI:
- case MULTI_ARG_3_SI_DI:
- case MULTI_ARG_3_HI:
- case MULTI_ARG_3_HI_SI:
- case MULTI_ARG_3_QI:
- case MULTI_ARG_3_DI2:
- case MULTI_ARG_3_SI2:
- case MULTI_ARG_3_HI2:
- case MULTI_ARG_3_QI2:
- nargs = 3;
- break;
-
- case MULTI_ARG_2_SF:
- case MULTI_ARG_2_DF:
- case MULTI_ARG_2_DI:
- case MULTI_ARG_2_SI:
- case MULTI_ARG_2_HI:
- case MULTI_ARG_2_QI:
- nargs = 2;
- break;
-
- case MULTI_ARG_2_DI_IMM:
- case MULTI_ARG_2_SI_IMM:
- case MULTI_ARG_2_HI_IMM:
- case MULTI_ARG_2_QI_IMM:
- nargs = 2;
- last_arg_constant = true;
- break;
-
- case MULTI_ARG_1_SF:
- case MULTI_ARG_1_DF:
- case MULTI_ARG_1_SF2:
- case MULTI_ARG_1_DF2:
- case MULTI_ARG_1_DI:
- case MULTI_ARG_1_SI:
- case MULTI_ARG_1_HI:
- case MULTI_ARG_1_QI:
- case MULTI_ARG_1_SI_DI:
- case MULTI_ARG_1_HI_DI:
- case MULTI_ARG_1_HI_SI:
- case MULTI_ARG_1_QI_DI:
- case MULTI_ARG_1_QI_SI:
- case MULTI_ARG_1_QI_HI:
- nargs = 1;
- break;
-
- case MULTI_ARG_2_DI_CMP:
- case MULTI_ARG_2_SI_CMP:
- case MULTI_ARG_2_HI_CMP:
- case MULTI_ARG_2_QI_CMP:
- nargs = 2;
- comparison_p = true;
- break;
-
- case MULTI_ARG_2_SF_TF:
- case MULTI_ARG_2_DF_TF:
- case MULTI_ARG_2_DI_TF:
- case MULTI_ARG_2_SI_TF:
- case MULTI_ARG_2_HI_TF:
- case MULTI_ARG_2_QI_TF:
- nargs = 2;
- tf_p = true;
- break;
-
- default:
- gcc_unreachable ();
- }
-
- if (optimize || !target
- || GET_MODE (target) != tmode
- || !insn_data[icode].operand[0].predicate (target, tmode))
- target = gen_reg_rtx (tmode);
- else if (memory_operand (target, tmode))
- num_memory++;
-
- gcc_assert (nargs <= 4);
-
- for (i = 0; i < nargs; i++)
- {
- tree arg = CALL_EXPR_ARG (exp, i);
- rtx op = expand_normal (arg);
- int adjust = (comparison_p) ? 1 : 0;
- machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
-
- if (last_arg_constant && i == nargs - 1)
- {
- if (!insn_data[icode].operand[i + 1].predicate (op, mode))
- {
- enum insn_code new_icode = icode;
- switch (icode)
- {
- case CODE_FOR_xop_vpermil2v2df3:
- case CODE_FOR_xop_vpermil2v4sf3:
- case CODE_FOR_xop_vpermil2v4df3:
- case CODE_FOR_xop_vpermil2v8sf3:
- error ("the last argument must be a 2-bit immediate");
- return gen_reg_rtx (tmode);
- case CODE_FOR_xop_rotlv2di3:
- new_icode = CODE_FOR_rotlv2di3;
- goto xop_rotl;
- case CODE_FOR_xop_rotlv4si3:
- new_icode = CODE_FOR_rotlv4si3;
- goto xop_rotl;
- case CODE_FOR_xop_rotlv8hi3:
- new_icode = CODE_FOR_rotlv8hi3;
- goto xop_rotl;
- case CODE_FOR_xop_rotlv16qi3:
- new_icode = CODE_FOR_rotlv16qi3;
- xop_rotl:
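- /* XOP rotates accept either form of count: reduce an immediate
- modulo the element width, otherwise fall back to the generic
- rotate pattern, which takes a variable count. */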
- if (CONST_INT_P (op))
- {
- int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
- op = GEN_INT (INTVAL (op) & mask);
- gcc_checking_assert
- (insn_data[icode].operand[i + 1].predicate (op, mode));
- }
- else
- {
- gcc_checking_assert
- (nargs == 2
- && insn_data[new_icode].operand[0].mode == tmode
- && insn_data[new_icode].operand[1].mode == tmode
- && insn_data[new_icode].operand[2].mode == mode
- && insn_data[new_icode].operand[0].predicate
- == insn_data[icode].operand[0].predicate
- && insn_data[new_icode].operand[1].predicate
- == insn_data[icode].operand[1].predicate);
- icode = new_icode;
- goto non_constant;
- }
- break;
- default:
- gcc_unreachable ();
- }
- }
- }
- else
- {
- non_constant:
- if (VECTOR_MODE_P (mode))
- op = safe_vector_operand (op, mode);
-
- /* If we aren't optimizing, only allow one memory operand to be
- generated. */
- if (memory_operand (op, mode))
- num_memory++;
-
- gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
-
- if (optimize
- || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
- || num_memory > 1)
- op = force_reg (mode, op);
- }
-
- args[i].op = op;
- args[i].mode = mode;
- }
-
- switch (nargs)
- {
- case 1:
- pat = GEN_FCN (icode) (target, args[0].op);
- break;
-
- case 2:
- if (tf_p)
- pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
- GEN_INT ((int)sub_code));
- else if (! comparison_p)
- pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
- else
- {
- rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
- args[0].op,
- args[1].op);
-
- pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
- }
- break;
-
- case 3:
- pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
- break;
-
- case 4:
- pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
- break;
-
- default:
- gcc_unreachable ();
- }
-
- if (! pat)
- return 0;
-
- emit_insn (pat);
- return target;
-}
-
-/* Subroutine of ix86_expand_args_builtin to take care of scalar unop
- insns with vec_merge. */
-
-static rtx
-ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
- rtx target)
-{
- rtx pat;
- tree arg0 = CALL_EXPR_ARG (exp, 0);
- rtx op1, op0 = expand_normal (arg0);
- machine_mode tmode = insn_data[icode].operand[0].mode;
- machine_mode mode0 = insn_data[icode].operand[1].mode;
-
- if (optimize || !target
- || GET_MODE (target) != tmode
- || !insn_data[icode].operand[0].predicate (target, tmode))
- target = gen_reg_rtx (tmode);
-
- if (VECTOR_MODE_P (mode0))
- op0 = safe_vector_operand (op0, mode0);
-
- if ((optimize && !register_operand (op0, mode0))
- || !insn_data[icode].operand[1].predicate (op0, mode0))
- op0 = copy_to_mode_reg (mode0, op0);
-
- op1 = op0;
- if (!insn_data[icode].operand[2].predicate (op1, mode0))
- op1 = copy_to_mode_reg (mode0, op1);
-
- pat = GEN_FCN (icode) (target, op0, op1);
- if (! pat)
- return 0;
- emit_insn (pat);
- return target;
-}
-
-/* Subroutine of ix86_expand_builtin to take care of comparison insns. */
-
-static rtx
-ix86_expand_sse_compare (const struct builtin_description *d,
- tree exp, rtx target, bool swap)
-{
- rtx pat;
- tree arg0 = CALL_EXPR_ARG (exp, 0);
- tree arg1 = CALL_EXPR_ARG (exp, 1);
- rtx op0 = expand_normal (arg0);
- rtx op1 = expand_normal (arg1);
- rtx op2;
- machine_mode tmode = insn_data[d->icode].operand[0].mode;
- machine_mode mode0 = insn_data[d->icode].operand[1].mode;
- machine_mode mode1 = insn_data[d->icode].operand[2].mode;
- enum rtx_code comparison = d->comparison;
-
- if (VECTOR_MODE_P (mode0))
- op0 = safe_vector_operand (op0, mode0);
- if (VECTOR_MODE_P (mode1))
- op1 = safe_vector_operand (op1, mode1);
-
- /* Swap operands if we have a comparison that isn't available in
- hardware. */
- if (swap)
- std::swap (op0, op1);
-
- if (optimize || !target
- || GET_MODE (target) != tmode
- || !insn_data[d->icode].operand[0].predicate (target, tmode))
- target = gen_reg_rtx (tmode);
-
- if ((optimize && !register_operand (op0, mode0))
- || !insn_data[d->icode].operand[1].predicate (op0, mode0))
- op0 = copy_to_mode_reg (mode0, op0);
- if ((optimize && !register_operand (op1, mode1))
- || !insn_data[d->icode].operand[2].predicate (op1, mode1))
- op1 = copy_to_mode_reg (mode1, op1);
-
- op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
- pat = GEN_FCN (d->icode) (target, op0, op1, op2);
- if (! pat)
- return 0;
- emit_insn (pat);
- return target;
-}
-
-/* Subroutine of ix86_expand_builtin to take care of comi insns. */
-
-static rtx
-ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
- rtx target)
-{
- rtx pat;
- tree arg0 = CALL_EXPR_ARG (exp, 0);
- tree arg1 = CALL_EXPR_ARG (exp, 1);
- rtx op0 = expand_normal (arg0);
- rtx op1 = expand_normal (arg1);
- machine_mode mode0 = insn_data[d->icode].operand[0].mode;
- machine_mode mode1 = insn_data[d->icode].operand[1].mode;
- enum rtx_code comparison = d->comparison;
-
- if (VECTOR_MODE_P (mode0))
- op0 = safe_vector_operand (op0, mode0);
- if (VECTOR_MODE_P (mode1))
- op1 = safe_vector_operand (op1, mode1);
-
- /* Swap operands if we have a comparison that isn't available in
- hardware. */
- if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
- std::swap (op0, op1);
-
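- /* Materialize the result by zeroing an SImode pseudo and then
- setting its low QImode part from the comparison of the flags, so
- the full register holds a zero-extended 0/1 value. */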
- target = gen_reg_rtx (SImode);
- emit_move_insn (target, const0_rtx);
- target = gen_rtx_SUBREG (QImode, target, 0);
-
- if ((optimize && !register_operand (op0, mode0))
- || !insn_data[d->icode].operand[0].predicate (op0, mode0))
- op0 = copy_to_mode_reg (mode0, op0);
- if ((optimize && !register_operand (op1, mode1))
- || !insn_data[d->icode].operand[1].predicate (op1, mode1))
- op1 = copy_to_mode_reg (mode1, op1);
-
- pat = GEN_FCN (d->icode) (op0, op1);
- if (! pat)
- return 0;
- emit_insn (pat);
- emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
- gen_rtx_fmt_ee (comparison, QImode,
- SET_DEST (pat),
- const0_rtx)));
-
- return SUBREG_REG (target);
-}
-
-/* Subroutines of ix86_expand_args_builtin to take care of round insns. */
-
-static rtx
-ix86_expand_sse_round (const struct builtin_description *d, tree exp,
- rtx target)
-{
- rtx pat;
- tree arg0 = CALL_EXPR_ARG (exp, 0);
- rtx op1, op0 = expand_normal (arg0);
- machine_mode tmode = insn_data[d->icode].operand[0].mode;
- machine_mode mode0 = insn_data[d->icode].operand[1].mode;
-
- if (optimize || target == 0
- || GET_MODE (target) != tmode
- || !insn_data[d->icode].operand[0].predicate (target, tmode))
- target = gen_reg_rtx (tmode);
-
- if (VECTOR_MODE_P (mode0))
- op0 = safe_vector_operand (op0, mode0);
-
- if ((optimize && !register_operand (op0, mode0))
- || !insn_data[d->icode].operand[0].predicate (op0, mode0))
- op0 = copy_to_mode_reg (mode0, op0);
-
- op1 = GEN_INT (d->comparison);
-
- pat = GEN_FCN (d->icode) (target, op0, op1);
- if (! pat)
- return 0;
- emit_insn (pat);
- return target;
-}
-
-static rtx
-ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
- tree exp, rtx target)
-{
- rtx pat;
- tree arg0 = CALL_EXPR_ARG (exp, 0);
- tree arg1 = CALL_EXPR_ARG (exp, 1);
- rtx op0 = expand_normal (arg0);
- rtx op1 = expand_normal (arg1);
- rtx op2;
- machine_mode tmode = insn_data[d->icode].operand[0].mode;
- machine_mode mode0 = insn_data[d->icode].operand[1].mode;
- machine_mode mode1 = insn_data[d->icode].operand[2].mode;
-
- if (optimize || target == 0
- || GET_MODE (target) != tmode
- || !insn_data[d->icode].operand[0].predicate (target, tmode))
- target = gen_reg_rtx (tmode);
-
- op0 = safe_vector_operand (op0, mode0);
- op1 = safe_vector_operand (op1, mode1);
-
- if ((optimize && !register_operand (op0, mode0))
- || !insn_data[d->icode].operand[0].predicate (op0, mode0))
- op0 = copy_to_mode_reg (mode0, op0);
- if ((optimize && !register_operand (op1, mode1))
- || !insn_data[d->icode].operand[1].predicate (op1, mode1))
- op1 = copy_to_mode_reg (mode1, op1);
-
- op2 = GEN_INT (d->comparison);
-
- pat = GEN_FCN (d->icode) (target, op0, op1, op2);
- if (! pat)
- return 0;
- emit_insn (pat);
- return target;
-}
-
-/* Subroutine of ix86_expand_builtin to take care of ptest insns. */
-
-static rtx
-ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
- rtx target)
-{
- rtx pat;
- tree arg0 = CALL_EXPR_ARG (exp, 0);
- tree arg1 = CALL_EXPR_ARG (exp, 1);
- rtx op0 = expand_normal (arg0);
- rtx op1 = expand_normal (arg1);
- machine_mode mode0 = insn_data[d->icode].operand[0].mode;
- machine_mode mode1 = insn_data[d->icode].operand[1].mode;
- enum rtx_code comparison = d->comparison;
-
- if (VECTOR_MODE_P (mode0))
- op0 = safe_vector_operand (op0, mode0);
- if (VECTOR_MODE_P (mode1))
- op1 = safe_vector_operand (op1, mode1);
-
- target = gen_reg_rtx (SImode);
- emit_move_insn (target, const0_rtx);
- target = gen_rtx_SUBREG (QImode, target, 0);
-
- if ((optimize && !register_operand (op0, mode0))
- || !insn_data[d->icode].operand[0].predicate (op0, mode0))
- op0 = copy_to_mode_reg (mode0, op0);
- if ((optimize && !register_operand (op1, mode1))
- || !insn_data[d->icode].operand[1].predicate (op1, mode1))
- op1 = copy_to_mode_reg (mode1, op1);
-
- pat = GEN_FCN (d->icode) (op0, op1);
- if (! pat)
- return 0;
- emit_insn (pat);
- emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
- gen_rtx_fmt_ee (comparison, QImode,
- SET_DEST (pat),
- const0_rtx)));
-
- return SUBREG_REG (target);
-}
-
-/* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
-
-static rtx
-ix86_expand_sse_pcmpestr (const struct builtin_description *d,
- tree exp, rtx target)
-{
- rtx pat;
- tree arg0 = CALL_EXPR_ARG (exp, 0);
- tree arg1 = CALL_EXPR_ARG (exp, 1);
- tree arg2 = CALL_EXPR_ARG (exp, 2);
- tree arg3 = CALL_EXPR_ARG (exp, 3);
- tree arg4 = CALL_EXPR_ARG (exp, 4);
- rtx scratch0, scratch1;
- rtx op0 = expand_normal (arg0);
- rtx op1 = expand_normal (arg1);
- rtx op2 = expand_normal (arg2);
- rtx op3 = expand_normal (arg3);
- rtx op4 = expand_normal (arg4);
- machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
-
- tmode0 = insn_data[d->icode].operand[0].mode;
- tmode1 = insn_data[d->icode].operand[1].mode;
- modev2 = insn_data[d->icode].operand[2].mode;
- modei3 = insn_data[d->icode].operand[3].mode;
- modev4 = insn_data[d->icode].operand[4].mode;
- modei5 = insn_data[d->icode].operand[5].mode;
- modeimm = insn_data[d->icode].operand[6].mode;
-
- if (VECTOR_MODE_P (modev2))
- op0 = safe_vector_operand (op0, modev2);
- if (VECTOR_MODE_P (modev4))
- op2 = safe_vector_operand (op2, modev4);
-
- if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
- op0 = copy_to_mode_reg (modev2, op0);
- if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
- op1 = copy_to_mode_reg (modei3, op1);
- if ((optimize && !register_operand (op2, modev4))
- || !insn_data[d->icode].operand[4].predicate (op2, modev4))
- op2 = copy_to_mode_reg (modev4, op2);
- if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
- op3 = copy_to_mode_reg (modei5, op3);
-
- if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
- {
- error ("the fifth argument must be an 8-bit immediate");
- return const0_rtx;
- }
-
- if (d->code == IX86_BUILTIN_PCMPESTRI128)
- {
- if (optimize || !target
- || GET_MODE (target) != tmode0
- || !insn_data[d->icode].operand[0].predicate (target, tmode0))
- target = gen_reg_rtx (tmode0);
-
- scratch1 = gen_reg_rtx (tmode1);
-
- pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
- }
- else if (d->code == IX86_BUILTIN_PCMPESTRM128)
- {
- if (optimize || !target
- || GET_MODE (target) != tmode1
- || !insn_data[d->icode].operand[1].predicate (target, tmode1))
- target = gen_reg_rtx (tmode1);
-
- scratch0 = gen_reg_rtx (tmode0);
-
- pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
- }
- else
- {
- gcc_assert (d->flag);
-
- scratch0 = gen_reg_rtx (tmode0);
- scratch1 = gen_reg_rtx (tmode1);
-
- pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
- }
-
- if (! pat)
- return 0;
-
- emit_insn (pat);
-
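- /* For the flag-testing variants the result is the requested
- condition read out of the flags register; otherwise the index or
- mask output of the insn is returned directly. */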
- if (d->flag)
- {
- target = gen_reg_rtx (SImode);
- emit_move_insn (target, const0_rtx);
- target = gen_rtx_SUBREG (QImode, target, 0);
-
- emit_insn
- (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
- gen_rtx_fmt_ee (EQ, QImode,
- gen_rtx_REG ((machine_mode) d->flag,
- FLAGS_REG),
- const0_rtx)));
- return SUBREG_REG (target);
- }
- else
- return target;
-}
-
-
-/* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
-
-static rtx
-ix86_expand_sse_pcmpistr (const struct builtin_description *d,
- tree exp, rtx target)
-{
- rtx pat;
- tree arg0 = CALL_EXPR_ARG (exp, 0);
- tree arg1 = CALL_EXPR_ARG (exp, 1);
- tree arg2 = CALL_EXPR_ARG (exp, 2);
- rtx scratch0, scratch1;
- rtx op0 = expand_normal (arg0);
- rtx op1 = expand_normal (arg1);
- rtx op2 = expand_normal (arg2);
- machine_mode tmode0, tmode1, modev2, modev3, modeimm;
-
- tmode0 = insn_data[d->icode].operand[0].mode;
- tmode1 = insn_data[d->icode].operand[1].mode;
- modev2 = insn_data[d->icode].operand[2].mode;
- modev3 = insn_data[d->icode].operand[3].mode;
- modeimm = insn_data[d->icode].operand[4].mode;
-
- if (VECTOR_MODE_P (modev2))
- op0 = safe_vector_operand (op0, modev2);
- if (VECTOR_MODE_P (modev3))
- op1 = safe_vector_operand (op1, modev3);
-
- if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
- op0 = copy_to_mode_reg (modev2, op0);
- if ((optimize && !register_operand (op1, modev3))
- || !insn_data[d->icode].operand[3].predicate (op1, modev3))
- op1 = copy_to_mode_reg (modev3, op1);
-
- if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
- {
- error ("the third argument must be an 8-bit immediate");
- return const0_rtx;
- }
-
- if (d->code == IX86_BUILTIN_PCMPISTRI128)
- {
- if (optimize || !target
- || GET_MODE (target) != tmode0
- || !insn_data[d->icode].operand[0].predicate (target, tmode0))
- target = gen_reg_rtx (tmode0);
-
- scratch1 = gen_reg_rtx (tmode1);
-
- pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
- }
- else if (d->code == IX86_BUILTIN_PCMPISTRM128)
- {
- if (optimize || !target
- || GET_MODE (target) != tmode1
- || !insn_data[d->icode].operand[1].predicate (target, tmode1))
- target = gen_reg_rtx (tmode1);
-
- scratch0 = gen_reg_rtx (tmode0);
-
- pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
- }
- else
- {
- gcc_assert (d->flag);
-
- scratch0 = gen_reg_rtx (tmode0);
- scratch1 = gen_reg_rtx (tmode1);
-
- pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
- }
-
- if (! pat)
- return 0;
-
- emit_insn (pat);
-
- if (d->flag)
- {
- target = gen_reg_rtx (SImode);
- emit_move_insn (target, const0_rtx);
- target = gen_rtx_SUBREG (QImode, target, 0);
-
- emit_insn
- (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
- gen_rtx_fmt_ee (EQ, QImode,
- gen_rtx_REG ((machine_mode) d->flag,
- FLAGS_REG),
- const0_rtx)));
- return SUBREG_REG (target);
- }
- else
- return target;
-}
-
- /* Subroutine of ix86_expand_builtin to take care of insns with a
- variable number of operands. */
-
-static rtx
-ix86_expand_args_builtin (const struct builtin_description *d,
- tree exp, rtx target)
-{
- rtx pat, real_target;
- unsigned int i, nargs;
- unsigned int nargs_constant = 0;
- unsigned int mask_pos = 0;
- int num_memory = 0;
- struct
- {
- rtx op;
- machine_mode mode;
- } args[6];
- bool second_arg_count = false;
- enum insn_code icode = d->icode;
- const struct insn_data_d *insn_p = &insn_data[icode];
- machine_mode tmode = insn_p->operand[0].mode;
- machine_mode rmode = VOIDmode;
- bool swap = false;
- enum rtx_code comparison = d->comparison;
-
- switch ((enum ix86_builtin_func_type) d->flag)
- {
- case V2DF_FTYPE_V2DF_ROUND:
- case V4DF_FTYPE_V4DF_ROUND:
- case V8DF_FTYPE_V8DF_ROUND:
- case V4SF_FTYPE_V4SF_ROUND:
- case V8SF_FTYPE_V8SF_ROUND:
- case V16SF_FTYPE_V16SF_ROUND:
- case V4SI_FTYPE_V4SF_ROUND:
- case V8SI_FTYPE_V8SF_ROUND:
- case V16SI_FTYPE_V16SF_ROUND:
- return ix86_expand_sse_round (d, exp, target);
- case V4SI_FTYPE_V2DF_V2DF_ROUND:
- case V8SI_FTYPE_V4DF_V4DF_ROUND:
- case V16SI_FTYPE_V8DF_V8DF_ROUND:
- return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
- case INT_FTYPE_V8SF_V8SF_PTEST:
- case INT_FTYPE_V4DI_V4DI_PTEST:
- case INT_FTYPE_V4DF_V4DF_PTEST:
- case INT_FTYPE_V4SF_V4SF_PTEST:
- case INT_FTYPE_V2DI_V2DI_PTEST:
- case INT_FTYPE_V2DF_V2DF_PTEST:
- return ix86_expand_sse_ptest (d, exp, target);
- case FLOAT128_FTYPE_FLOAT128:
- case FLOAT_FTYPE_FLOAT:
- case INT_FTYPE_INT:
- case UINT_FTYPE_UINT:
- case UINT16_FTYPE_UINT16:
- case UINT64_FTYPE_INT:
- case UINT64_FTYPE_UINT64:
- case INT64_FTYPE_INT64:
- case INT64_FTYPE_V4SF:
- case INT64_FTYPE_V2DF:
- case INT_FTYPE_V16QI:
- case INT_FTYPE_V8QI:
- case INT_FTYPE_V8SF:
- case INT_FTYPE_V4DF:
- case INT_FTYPE_V4SF:
- case INT_FTYPE_V2DF:
- case INT_FTYPE_V32QI:
- case V16QI_FTYPE_V16QI:
- case V8SI_FTYPE_V8SF:
- case V8SI_FTYPE_V4SI:
- case V8HI_FTYPE_V8HI:
- case V8HI_FTYPE_V16QI:
- case V8QI_FTYPE_V8QI:
- case V8SF_FTYPE_V8SF:
- case V8SF_FTYPE_V8SI:
- case V8SF_FTYPE_V4SF:
- case V8SF_FTYPE_V8HI:
- case V4SI_FTYPE_V4SI:
- case V4SI_FTYPE_V16QI:
- case V4SI_FTYPE_V4SF:
- case V4SI_FTYPE_V8SI:
- case V4SI_FTYPE_V8HI:
- case V4SI_FTYPE_V4DF:
- case V4SI_FTYPE_V2DF:
- case V4HI_FTYPE_V4HI:
- case V4DF_FTYPE_V4DF:
- case V4DF_FTYPE_V4SI:
- case V4DF_FTYPE_V4SF:
- case V4DF_FTYPE_V2DF:
- case V4SF_FTYPE_V4SF:
- case V4SF_FTYPE_V4SI:
- case V4SF_FTYPE_V8SF:
- case V4SF_FTYPE_V4DF:
- case V4SF_FTYPE_V8HI:
- case V4SF_FTYPE_V2DF:
- case V2DI_FTYPE_V2DI:
- case V2DI_FTYPE_V16QI:
- case V2DI_FTYPE_V8HI:
- case V2DI_FTYPE_V4SI:
- case V2DF_FTYPE_V2DF:
- case V2DF_FTYPE_V4SI:
- case V2DF_FTYPE_V4DF:
- case V2DF_FTYPE_V4SF:
- case V2DF_FTYPE_V2SI:
- case V2SI_FTYPE_V2SI:
- case V2SI_FTYPE_V4SF:
- case V2SI_FTYPE_V2SF:
- case V2SI_FTYPE_V2DF:
- case V2SF_FTYPE_V2SF:
- case V2SF_FTYPE_V2SI:
- case V32QI_FTYPE_V32QI:
- case V32QI_FTYPE_V16QI:
- case V16HI_FTYPE_V16HI:
- case V16HI_FTYPE_V8HI:
- case V8SI_FTYPE_V8SI:
- case V16HI_FTYPE_V16QI:
- case V8SI_FTYPE_V16QI:
- case V4DI_FTYPE_V16QI:
- case V8SI_FTYPE_V8HI:
- case V4DI_FTYPE_V8HI:
- case V4DI_FTYPE_V4SI:
- case V4DI_FTYPE_V2DI:
- case UQI_FTYPE_UQI:
- case UHI_FTYPE_UHI:
- case USI_FTYPE_USI:
- case USI_FTYPE_UQI:
- case USI_FTYPE_UHI:
- case UDI_FTYPE_UDI:
- case UHI_FTYPE_V16QI:
- case USI_FTYPE_V32QI:
- case UDI_FTYPE_V64QI:
- case V16QI_FTYPE_UHI:
- case V32QI_FTYPE_USI:
- case V64QI_FTYPE_UDI:
- case V8HI_FTYPE_UQI:
- case V16HI_FTYPE_UHI:
- case V32HI_FTYPE_USI:
- case V4SI_FTYPE_UQI:
- case V8SI_FTYPE_UQI:
- case V4SI_FTYPE_UHI:
- case V8SI_FTYPE_UHI:
- case UQI_FTYPE_V8HI:
- case UHI_FTYPE_V16HI:
- case USI_FTYPE_V32HI:
- case UQI_FTYPE_V4SI:
- case UQI_FTYPE_V8SI:
- case UHI_FTYPE_V16SI:
- case UQI_FTYPE_V2DI:
- case UQI_FTYPE_V4DI:
- case UQI_FTYPE_V8DI:
- case V16SI_FTYPE_UHI:
- case V2DI_FTYPE_UQI:
- case V4DI_FTYPE_UQI:
- case V16SI_FTYPE_INT:
- case V16SF_FTYPE_V8SF:
- case V16SI_FTYPE_V8SI:
- case V16SF_FTYPE_V4SF:
- case V16SI_FTYPE_V4SI:
- case V16SI_FTYPE_V16SF:
- case V16SI_FTYPE_V16SI:
- case V64QI_FTYPE_V64QI:
- case V32HI_FTYPE_V32HI:
- case V16SF_FTYPE_V16SF:
- case V8DI_FTYPE_UQI:
- case V8DI_FTYPE_V8DI:
- case V8DF_FTYPE_V4DF:
- case V8DF_FTYPE_V2DF:
- case V8DF_FTYPE_V8DF:
- case V4DI_FTYPE_V4DI:
- nargs = 1;
- break;
- case V4SF_FTYPE_V4SF_VEC_MERGE:
- case V2DF_FTYPE_V2DF_VEC_MERGE:
- return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
- case FLOAT128_FTYPE_FLOAT128_FLOAT128:
- case V16QI_FTYPE_V16QI_V16QI:
- case V16QI_FTYPE_V8HI_V8HI:
- case V16SF_FTYPE_V16SF_V16SF:
- case V8QI_FTYPE_V8QI_V8QI:
- case V8QI_FTYPE_V4HI_V4HI:
- case V8HI_FTYPE_V8HI_V8HI:
- case V8HI_FTYPE_V16QI_V16QI:
- case V8HI_FTYPE_V4SI_V4SI:
- case V8SF_FTYPE_V8SF_V8SF:
- case V8SF_FTYPE_V8SF_V8SI:
- case V8DF_FTYPE_V8DF_V8DF:
- case V4SI_FTYPE_V4SI_V4SI:
- case V4SI_FTYPE_V8HI_V8HI:
- case V4SI_FTYPE_V2DF_V2DF:
- case V4HI_FTYPE_V4HI_V4HI:
- case V4HI_FTYPE_V8QI_V8QI:
- case V4HI_FTYPE_V2SI_V2SI:
- case V4DF_FTYPE_V4DF_V4DF:
- case V4DF_FTYPE_V4DF_V4DI:
- case V4SF_FTYPE_V4SF_V4SF:
- case V4SF_FTYPE_V4SF_V4SI:
- case V4SF_FTYPE_V4SF_V2SI:
- case V4SF_FTYPE_V4SF_V2DF:
- case V4SF_FTYPE_V4SF_UINT:
- case V4SF_FTYPE_V4SF_DI:
- case V4SF_FTYPE_V4SF_SI:
- case V2DI_FTYPE_V2DI_V2DI:
- case V2DI_FTYPE_V16QI_V16QI:
- case V2DI_FTYPE_V4SI_V4SI:
- case V2DI_FTYPE_V2DI_V16QI:
- case V2SI_FTYPE_V2SI_V2SI:
- case V2SI_FTYPE_V4HI_V4HI:
- case V2SI_FTYPE_V2SF_V2SF:
- case V2DF_FTYPE_V2DF_V2DF:
- case V2DF_FTYPE_V2DF_V4SF:
- case V2DF_FTYPE_V2DF_V2DI:
- case V2DF_FTYPE_V2DF_DI:
- case V2DF_FTYPE_V2DF_SI:
- case V2DF_FTYPE_V2DF_UINT:
- case V2SF_FTYPE_V2SF_V2SF:
- case V1DI_FTYPE_V1DI_V1DI:
- case V1DI_FTYPE_V8QI_V8QI:
- case V1DI_FTYPE_V2SI_V2SI:
- case V32QI_FTYPE_V16HI_V16HI:
- case V16HI_FTYPE_V8SI_V8SI:
- case V64QI_FTYPE_V64QI_V64QI:
- case V32QI_FTYPE_V32QI_V32QI:
- case V16HI_FTYPE_V32QI_V32QI:
- case V16HI_FTYPE_V16HI_V16HI:
- case V8SI_FTYPE_V4DF_V4DF:
- case V8SI_FTYPE_V8SI_V8SI:
- case V8SI_FTYPE_V16HI_V16HI:
- case V4DI_FTYPE_V4DI_V4DI:
- case V4DI_FTYPE_V8SI_V8SI:
- case V8DI_FTYPE_V64QI_V64QI:
- if (comparison == UNKNOWN)
- return ix86_expand_binop_builtin (icode, exp, target);
- nargs = 2;
- break;
- case V4SF_FTYPE_V4SF_V4SF_SWAP:
- case V2DF_FTYPE_V2DF_V2DF_SWAP:
- gcc_assert (comparison != UNKNOWN);
- nargs = 2;
- swap = true;
- break;
- case V16HI_FTYPE_V16HI_V8HI_COUNT:
- case V16HI_FTYPE_V16HI_SI_COUNT:
- case V8SI_FTYPE_V8SI_V4SI_COUNT:
- case V8SI_FTYPE_V8SI_SI_COUNT:
- case V4DI_FTYPE_V4DI_V2DI_COUNT:
- case V4DI_FTYPE_V4DI_INT_COUNT:
- case V8HI_FTYPE_V8HI_V8HI_COUNT:
- case V8HI_FTYPE_V8HI_SI_COUNT:
- case V4SI_FTYPE_V4SI_V4SI_COUNT:
- case V4SI_FTYPE_V4SI_SI_COUNT:
- case V4HI_FTYPE_V4HI_V4HI_COUNT:
- case V4HI_FTYPE_V4HI_SI_COUNT:
- case V2DI_FTYPE_V2DI_V2DI_COUNT:
- case V2DI_FTYPE_V2DI_SI_COUNT:
- case V2SI_FTYPE_V2SI_V2SI_COUNT:
- case V2SI_FTYPE_V2SI_SI_COUNT:
- case V1DI_FTYPE_V1DI_V1DI_COUNT:
- case V1DI_FTYPE_V1DI_SI_COUNT:
- nargs = 2;
- second_arg_count = true;
- break;
- case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
- case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
- case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
- case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
- case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
- case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
- case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
- case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
- case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
- case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
- case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
- case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
- case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
- case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
- case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
- case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
- case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
- case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
- nargs = 4;
- second_arg_count = true;
- break;
- case UINT64_FTYPE_UINT64_UINT64:
- case UINT_FTYPE_UINT_UINT:
- case UINT_FTYPE_UINT_USHORT:
- case UINT_FTYPE_UINT_UCHAR:
- case UINT16_FTYPE_UINT16_INT:
- case UINT8_FTYPE_UINT8_INT:
- case UQI_FTYPE_UQI_UQI:
- case UHI_FTYPE_UHI_UHI:
- case USI_FTYPE_USI_USI:
- case UDI_FTYPE_UDI_UDI:
- case V16SI_FTYPE_V8DF_V8DF:
- nargs = 2;
- break;
- case V2DI_FTYPE_V2DI_INT_CONVERT:
- nargs = 2;
- rmode = V1TImode;
- nargs_constant = 1;
- break;
- case V4DI_FTYPE_V4DI_INT_CONVERT:
- nargs = 2;
- rmode = V2TImode;
- nargs_constant = 1;
- break;
- case V8DI_FTYPE_V8DI_INT_CONVERT:
- nargs = 2;
- rmode = V4TImode;
- nargs_constant = 1;
- break;
- case V8HI_FTYPE_V8HI_INT:
- case V8HI_FTYPE_V8SF_INT:
- case V16HI_FTYPE_V16SF_INT:
- case V8HI_FTYPE_V4SF_INT:
- case V8SF_FTYPE_V8SF_INT:
- case V4SF_FTYPE_V16SF_INT:
- case V16SF_FTYPE_V16SF_INT:
- case V4SI_FTYPE_V4SI_INT:
- case V4SI_FTYPE_V8SI_INT:
- case V4HI_FTYPE_V4HI_INT:
- case V4DF_FTYPE_V4DF_INT:
- case V4DF_FTYPE_V8DF_INT:
- case V4SF_FTYPE_V4SF_INT:
- case V4SF_FTYPE_V8SF_INT:
- case V2DI_FTYPE_V2DI_INT:
- case V2DF_FTYPE_V2DF_INT:
- case V2DF_FTYPE_V4DF_INT:
- case V16HI_FTYPE_V16HI_INT:
- case V8SI_FTYPE_V8SI_INT:
- case V16SI_FTYPE_V16SI_INT:
- case V4SI_FTYPE_V16SI_INT:
- case V4DI_FTYPE_V4DI_INT:
- case V2DI_FTYPE_V4DI_INT:
- case V4DI_FTYPE_V8DI_INT:
- case QI_FTYPE_V4SF_INT:
- case QI_FTYPE_V2DF_INT:
- case UQI_FTYPE_UQI_UQI_CONST:
- case UHI_FTYPE_UHI_UQI:
- case USI_FTYPE_USI_UQI:
- case UDI_FTYPE_UDI_UQI:
- nargs = 2;
- nargs_constant = 1;
- break;
- case V16QI_FTYPE_V16QI_V16QI_V16QI:
- case V8SF_FTYPE_V8SF_V8SF_V8SF:
- case V4DF_FTYPE_V4DF_V4DF_V4DF:
- case V4SF_FTYPE_V4SF_V4SF_V4SF:
- case V2DF_FTYPE_V2DF_V2DF_V2DF:
- case V32QI_FTYPE_V32QI_V32QI_V32QI:
- case UHI_FTYPE_V16SI_V16SI_UHI:
- case UQI_FTYPE_V8DI_V8DI_UQI:
- case V16HI_FTYPE_V16SI_V16HI_UHI:
- case V16QI_FTYPE_V16SI_V16QI_UHI:
- case V16QI_FTYPE_V8DI_V16QI_UQI:
- case V16SF_FTYPE_V16SF_V16SF_UHI:
- case V16SF_FTYPE_V4SF_V16SF_UHI:
- case V16SI_FTYPE_SI_V16SI_UHI:
- case V16SI_FTYPE_V16HI_V16SI_UHI:
- case V16SI_FTYPE_V16QI_V16SI_UHI:
- case V8SF_FTYPE_V4SF_V8SF_UQI:
- case V4DF_FTYPE_V2DF_V4DF_UQI:
- case V8SI_FTYPE_V4SI_V8SI_UQI:
- case V8SI_FTYPE_SI_V8SI_UQI:
- case V4SI_FTYPE_V4SI_V4SI_UQI:
- case V4SI_FTYPE_SI_V4SI_UQI:
- case V4DI_FTYPE_V2DI_V4DI_UQI:
- case V4DI_FTYPE_DI_V4DI_UQI:
- case V2DI_FTYPE_V2DI_V2DI_UQI:
- case V2DI_FTYPE_DI_V2DI_UQI:
- case V64QI_FTYPE_V64QI_V64QI_UDI:
- case V64QI_FTYPE_V16QI_V64QI_UDI:
- case V64QI_FTYPE_QI_V64QI_UDI:
- case V32QI_FTYPE_V32QI_V32QI_USI:
- case V32QI_FTYPE_V16QI_V32QI_USI:
- case V32QI_FTYPE_QI_V32QI_USI:
- case V16QI_FTYPE_V16QI_V16QI_UHI:
- case V16QI_FTYPE_QI_V16QI_UHI:
- case V32HI_FTYPE_V8HI_V32HI_USI:
- case V32HI_FTYPE_HI_V32HI_USI:
- case V16HI_FTYPE_V8HI_V16HI_UHI:
- case V16HI_FTYPE_HI_V16HI_UHI:
- case V8HI_FTYPE_V8HI_V8HI_UQI:
- case V8HI_FTYPE_HI_V8HI_UQI:
- case V8SF_FTYPE_V8HI_V8SF_UQI:
- case V4SF_FTYPE_V8HI_V4SF_UQI:
- case V8SI_FTYPE_V8SF_V8SI_UQI:
- case V4SI_FTYPE_V4SF_V4SI_UQI:
- case V4DI_FTYPE_V4SF_V4DI_UQI:
- case V2DI_FTYPE_V4SF_V2DI_UQI:
- case V4SF_FTYPE_V4DI_V4SF_UQI:
- case V4SF_FTYPE_V2DI_V4SF_UQI:
- case V4DF_FTYPE_V4DI_V4DF_UQI:
- case V2DF_FTYPE_V2DI_V2DF_UQI:
- case V16QI_FTYPE_V8HI_V16QI_UQI:
- case V16QI_FTYPE_V16HI_V16QI_UHI:
- case V16QI_FTYPE_V4SI_V16QI_UQI:
- case V16QI_FTYPE_V8SI_V16QI_UQI:
- case V8HI_FTYPE_V4SI_V8HI_UQI:
- case V8HI_FTYPE_V8SI_V8HI_UQI:
- case V16QI_FTYPE_V2DI_V16QI_UQI:
- case V16QI_FTYPE_V4DI_V16QI_UQI:
- case V8HI_FTYPE_V2DI_V8HI_UQI:
- case V8HI_FTYPE_V4DI_V8HI_UQI:
- case V4SI_FTYPE_V2DI_V4SI_UQI:
- case V4SI_FTYPE_V4DI_V4SI_UQI:
- case V32QI_FTYPE_V32HI_V32QI_USI:
- case UHI_FTYPE_V16QI_V16QI_UHI:
- case USI_FTYPE_V32QI_V32QI_USI:
- case UDI_FTYPE_V64QI_V64QI_UDI:
- case UQI_FTYPE_V8HI_V8HI_UQI:
- case UHI_FTYPE_V16HI_V16HI_UHI:
- case USI_FTYPE_V32HI_V32HI_USI:
- case UQI_FTYPE_V4SI_V4SI_UQI:
- case UQI_FTYPE_V8SI_V8SI_UQI:
- case UQI_FTYPE_V2DI_V2DI_UQI:
- case UQI_FTYPE_V4DI_V4DI_UQI:
- case V4SF_FTYPE_V2DF_V4SF_UQI:
- case V4SF_FTYPE_V4DF_V4SF_UQI:
- case V16SI_FTYPE_V16SI_V16SI_UHI:
- case V16SI_FTYPE_V4SI_V16SI_UHI:
- case V2DI_FTYPE_V4SI_V2DI_UQI:
- case V2DI_FTYPE_V8HI_V2DI_UQI:
- case V2DI_FTYPE_V16QI_V2DI_UQI:
- case V4DI_FTYPE_V4DI_V4DI_UQI:
- case V4DI_FTYPE_V4SI_V4DI_UQI:
- case V4DI_FTYPE_V8HI_V4DI_UQI:
- case V4DI_FTYPE_V16QI_V4DI_UQI:
- case V4DI_FTYPE_V4DF_V4DI_UQI:
- case V2DI_FTYPE_V2DF_V2DI_UQI:
- case V4SI_FTYPE_V4DF_V4SI_UQI:
- case V4SI_FTYPE_V2DF_V4SI_UQI:
- case V4SI_FTYPE_V8HI_V4SI_UQI:
- case V4SI_FTYPE_V16QI_V4SI_UQI:
- case V4DI_FTYPE_V4DI_V4DI_V4DI:
- case V8DF_FTYPE_V2DF_V8DF_UQI:
- case V8DF_FTYPE_V4DF_V8DF_UQI:
- case V8DF_FTYPE_V8DF_V8DF_UQI:
- case V8SF_FTYPE_V8SF_V8SF_UQI:
- case V8SF_FTYPE_V8SI_V8SF_UQI:
- case V4DF_FTYPE_V4DF_V4DF_UQI:
- case V4SF_FTYPE_V4SF_V4SF_UQI:
- case V2DF_FTYPE_V2DF_V2DF_UQI:
- case V2DF_FTYPE_V4SF_V2DF_UQI:
- case V2DF_FTYPE_V4SI_V2DF_UQI:
- case V4SF_FTYPE_V4SI_V4SF_UQI:
- case V4DF_FTYPE_V4SF_V4DF_UQI:
- case V4DF_FTYPE_V4SI_V4DF_UQI:
- case V8SI_FTYPE_V8SI_V8SI_UQI:
- case V8SI_FTYPE_V8HI_V8SI_UQI:
- case V8SI_FTYPE_V16QI_V8SI_UQI:
- case V8DF_FTYPE_V8SI_V8DF_UQI:
- case V8DI_FTYPE_DI_V8DI_UQI:
- case V16SF_FTYPE_V8SF_V16SF_UHI:
- case V16SI_FTYPE_V8SI_V16SI_UHI:
- case V16HI_FTYPE_V16HI_V16HI_UHI:
- case V8HI_FTYPE_V16QI_V8HI_UQI:
- case V16HI_FTYPE_V16QI_V16HI_UHI:
- case V32HI_FTYPE_V32HI_V32HI_USI:
- case V32HI_FTYPE_V32QI_V32HI_USI:
- case V8DI_FTYPE_V16QI_V8DI_UQI:
- case V8DI_FTYPE_V2DI_V8DI_UQI:
- case V8DI_FTYPE_V4DI_V8DI_UQI:
- case V8DI_FTYPE_V8DI_V8DI_UQI:
- case V8DI_FTYPE_V8HI_V8DI_UQI:
- case V8DI_FTYPE_V8SI_V8DI_UQI:
- case V8HI_FTYPE_V8DI_V8HI_UQI:
- case V8SI_FTYPE_V8DI_V8SI_UQI:
- case V4SI_FTYPE_V4SI_V4SI_V4SI:
- case V16SI_FTYPE_V16SI_V16SI_V16SI:
- case V8DI_FTYPE_V8DI_V8DI_V8DI:
- case V32HI_FTYPE_V32HI_V32HI_V32HI:
- case V2DI_FTYPE_V2DI_V2DI_V2DI:
- case V16HI_FTYPE_V16HI_V16HI_V16HI:
- case V8SI_FTYPE_V8SI_V8SI_V8SI:
- case V8HI_FTYPE_V8HI_V8HI_V8HI:
- nargs = 3;
- break;
- case V32QI_FTYPE_V32QI_V32QI_INT:
- case V16HI_FTYPE_V16HI_V16HI_INT:
- case V16QI_FTYPE_V16QI_V16QI_INT:
- case V4DI_FTYPE_V4DI_V4DI_INT:
- case V8HI_FTYPE_V8HI_V8HI_INT:
- case V8SI_FTYPE_V8SI_V8SI_INT:
- case V8SI_FTYPE_V8SI_V4SI_INT:
- case V8SF_FTYPE_V8SF_V8SF_INT:
- case V8SF_FTYPE_V8SF_V4SF_INT:
- case V4SI_FTYPE_V4SI_V4SI_INT:
- case V4DF_FTYPE_V4DF_V4DF_INT:
- case V16SF_FTYPE_V16SF_V16SF_INT:
- case V16SF_FTYPE_V16SF_V4SF_INT:
- case V16SI_FTYPE_V16SI_V4SI_INT:
- case V4DF_FTYPE_V4DF_V2DF_INT:
- case V4SF_FTYPE_V4SF_V4SF_INT:
- case V2DI_FTYPE_V2DI_V2DI_INT:
- case V4DI_FTYPE_V4DI_V2DI_INT:
- case V2DF_FTYPE_V2DF_V2DF_INT:
- case UQI_FTYPE_V8DI_V8UDI_INT:
- case UQI_FTYPE_V8DF_V8DF_INT:
- case UQI_FTYPE_V2DF_V2DF_INT:
- case UQI_FTYPE_V4SF_V4SF_INT:
- case UHI_FTYPE_V16SI_V16SI_INT:
- case UHI_FTYPE_V16SF_V16SF_INT:
- case V64QI_FTYPE_V64QI_V64QI_INT:
- case V32HI_FTYPE_V32HI_V32HI_INT:
- case V16SI_FTYPE_V16SI_V16SI_INT:
- case V8DI_FTYPE_V8DI_V8DI_INT:
- nargs = 3;
- nargs_constant = 1;
- break;
- case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
- nargs = 3;
- rmode = V4DImode;
- nargs_constant = 1;
- break;
- case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
- nargs = 3;
- rmode = V2DImode;
- nargs_constant = 1;
- break;
- case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
- nargs = 3;
- rmode = DImode;
- nargs_constant = 1;
- break;
- case V2DI_FTYPE_V2DI_UINT_UINT:
- nargs = 3;
- nargs_constant = 2;
- break;
- case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
- nargs = 3;
- rmode = V8DImode;
- nargs_constant = 1;
- break;
- case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
- nargs = 5;
- rmode = V8DImode;
- mask_pos = 2;
- nargs_constant = 1;
- break;
- case QI_FTYPE_V8DF_INT_UQI:
- case QI_FTYPE_V4DF_INT_UQI:
- case QI_FTYPE_V2DF_INT_UQI:
- case HI_FTYPE_V16SF_INT_UHI:
- case QI_FTYPE_V8SF_INT_UQI:
- case QI_FTYPE_V4SF_INT_UQI:
- case V4SI_FTYPE_V4SI_V4SI_UHI:
- case V8SI_FTYPE_V8SI_V8SI_UHI:
- nargs = 3;
- mask_pos = 1;
- nargs_constant = 1;
- break;
- case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
- nargs = 5;
- rmode = V4DImode;
- mask_pos = 2;
- nargs_constant = 1;
- break;
- case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
- nargs = 5;
- rmode = V2DImode;
- mask_pos = 2;
- nargs_constant = 1;
- break;
- case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
- case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
- case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
- case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
- case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
- case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
- case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
- case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
- case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
- case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
- case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
- case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
- case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
- case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
- case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
- case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
- case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
- case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
- case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
- case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
- case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
- case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
- case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
- case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
- case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
- case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
- case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
- case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
- case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
- case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
- case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
- case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
- case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
- case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
- case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
- case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
- case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
- case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
- case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
- case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
- case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
- case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
- case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
- case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
- case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
- case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
- case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
- case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
- case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
- case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
- case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
- nargs = 4;
- break;
- case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
- case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
- case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
- case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
- case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
- nargs = 4;
- nargs_constant = 1;
- break;
- case UQI_FTYPE_V4DI_V4DI_INT_UQI:
- case UQI_FTYPE_V8SI_V8SI_INT_UQI:
- case QI_FTYPE_V4DF_V4DF_INT_UQI:
- case QI_FTYPE_V8SF_V8SF_INT_UQI:
- case UQI_FTYPE_V2DI_V2DI_INT_UQI:
- case UQI_FTYPE_V4SI_V4SI_INT_UQI:
- case UQI_FTYPE_V2DF_V2DF_INT_UQI:
- case UQI_FTYPE_V4SF_V4SF_INT_UQI:
- case UDI_FTYPE_V64QI_V64QI_INT_UDI:
- case USI_FTYPE_V32QI_V32QI_INT_USI:
- case UHI_FTYPE_V16QI_V16QI_INT_UHI:
- case USI_FTYPE_V32HI_V32HI_INT_USI:
- case UHI_FTYPE_V16HI_V16HI_INT_UHI:
- case UQI_FTYPE_V8HI_V8HI_INT_UQI:
- case V32HI_FTYPE_V32HI_V32HI_V32HI_INT:
- case V16HI_FTYPE_V16HI_V16HI_V16HI_INT:
- case V8HI_FTYPE_V8HI_V8HI_V8HI_INT:
- case V8SI_FTYPE_V8SI_V8SI_V8SI_INT:
- case V4DI_FTYPE_V4DI_V4DI_V4DI_INT:
- case V8DI_FTYPE_V8DI_V8DI_V8DI_INT:
- case V16SI_FTYPE_V16SI_V16SI_V16SI_INT:
- case V2DI_FTYPE_V2DI_V2DI_V2DI_INT:
- case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
- nargs = 4;
- mask_pos = 1;
- nargs_constant = 1;
- break;
- case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
- nargs = 4;
- nargs_constant = 2;
- break;
- case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
- case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
- nargs = 4;
- break;
- case UQI_FTYPE_V8DI_V8DI_INT_UQI:
- case UHI_FTYPE_V16SI_V16SI_INT_UHI:
- mask_pos = 1;
- nargs = 4;
- nargs_constant = 1;
- break;
- case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
- case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
- case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
- case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
- case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
- case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
- case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
- case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
- case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
- case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
- case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
- case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
- case V32HI_FTYPE_V32HI_INT_V32HI_USI:
- case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
- case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
- case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
- case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
- case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
- case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
- case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
- case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
- case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
- case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
- case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
- case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
- case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
- case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
- case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
- case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
- case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
- nargs = 4;
- mask_pos = 2;
- nargs_constant = 1;
- break;
- case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
- case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
- case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
- case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
- case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
- case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
- case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
- case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
- case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
- case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
- case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
- case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
- case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
- case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
- case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
- case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
- case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
- case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
- case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
- case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
- case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
- case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
- case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
- case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
- case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
- case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
- case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
- nargs = 5;
- mask_pos = 2;
- nargs_constant = 1;
- break;
- case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
- case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
- case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
- case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
- case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
- case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
- case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
- case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
- case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
- case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
- nargs = 5;
- mask_pos = 1;
- nargs_constant = 1;
- break;
- case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
- case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
- case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
- case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
- case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
- case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
- case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
- case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
- case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
- case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
- case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
- case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
- nargs = 5;
- mask_pos = 1;
- nargs_constant = 2;
- break;
-
- default:
- gcc_unreachable ();
- }
-
- gcc_assert (nargs <= ARRAY_SIZE (args));
-
- if (comparison != UNKNOWN)
- {
- gcc_assert (nargs == 2);
- return ix86_expand_sse_compare (d, exp, target, swap);
- }
-
- if (rmode == VOIDmode || rmode == tmode)
- {
- if (optimize
- || target == 0
- || GET_MODE (target) != tmode
- || !insn_p->operand[0].predicate (target, tmode))
- target = gen_reg_rtx (tmode);
- else if (memory_operand (target, tmode))
- num_memory++;
- real_target = target;
- }
- else
- {
- real_target = gen_reg_rtx (tmode);
- target = lowpart_subreg (rmode, real_target, tmode);
- }
-
- for (i = 0; i < nargs; i++)
- {
- tree arg = CALL_EXPR_ARG (exp, i);
- rtx op = expand_normal (arg);
- machine_mode mode = insn_p->operand[i + 1].mode;
- bool match = insn_p->operand[i + 1].predicate (op, mode);
-
- if (second_arg_count && i == 1)
- {
- /* SIMD shift insns take either an 8-bit immediate or a
- register as the shift count. But the builtin functions take
- an int as the count. If the count doesn't match, put it in a
- register. The instructions use a 64-bit count; if op is only
- 32-bit, zero-extend it, since negative shift counts are
- undefined behavior and zero-extension is more efficient. */
- if (!match)
- {
- if (SCALAR_INT_MODE_P (GET_MODE (op)))
- op = convert_modes (mode, GET_MODE (op), op, 1);
- else
- op = lowpart_subreg (mode, op, GET_MODE (op));
- if (!insn_p->operand[i + 1].predicate (op, mode))
- op = copy_to_reg (op);
- }
- }
- else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
- (!mask_pos && (nargs - i) <= nargs_constant))
- {
- if (!match)
- switch (icode)
- {
- case CODE_FOR_avx_vinsertf128v4di:
- case CODE_FOR_avx_vextractf128v4di:
- error ("the last argument must be an 1-bit immediate");
- return const0_rtx;
-
- case CODE_FOR_avx512f_cmpv8di3_mask:
- case CODE_FOR_avx512f_cmpv16si3_mask:
- case CODE_FOR_avx512f_ucmpv8di3_mask:
- case CODE_FOR_avx512f_ucmpv16si3_mask:
- case CODE_FOR_avx512vl_cmpv4di3_mask:
- case CODE_FOR_avx512vl_cmpv8si3_mask:
- case CODE_FOR_avx512vl_ucmpv4di3_mask:
- case CODE_FOR_avx512vl_ucmpv8si3_mask:
- case CODE_FOR_avx512vl_cmpv2di3_mask:
- case CODE_FOR_avx512vl_cmpv4si3_mask:
- case CODE_FOR_avx512vl_ucmpv2di3_mask:
- case CODE_FOR_avx512vl_ucmpv4si3_mask:
- error ("the last argument must be a 3-bit immediate");
- return const0_rtx;
-
- case CODE_FOR_sse4_1_roundsd:
- case CODE_FOR_sse4_1_roundss:
-
- case CODE_FOR_sse4_1_roundpd:
- case CODE_FOR_sse4_1_roundps:
- case CODE_FOR_avx_roundpd256:
- case CODE_FOR_avx_roundps256:
-
- case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
- case CODE_FOR_sse4_1_roundps_sfix:
- case CODE_FOR_avx_roundpd_vec_pack_sfix256:
- case CODE_FOR_avx_roundps_sfix256:
-
- case CODE_FOR_sse4_1_blendps:
- case CODE_FOR_avx_blendpd256:
- case CODE_FOR_avx_vpermilv4df:
- case CODE_FOR_avx_vpermilv4df_mask:
- case CODE_FOR_avx512f_getmantv8df_mask:
- case CODE_FOR_avx512f_getmantv16sf_mask:
- case CODE_FOR_avx512vl_getmantv8sf_mask:
- case CODE_FOR_avx512vl_getmantv4df_mask:
- case CODE_FOR_avx512vl_getmantv4sf_mask:
- case CODE_FOR_avx512vl_getmantv2df_mask:
- case CODE_FOR_avx512dq_rangepv8df_mask_round:
- case CODE_FOR_avx512dq_rangepv16sf_mask_round:
- case CODE_FOR_avx512dq_rangepv4df_mask:
- case CODE_FOR_avx512dq_rangepv8sf_mask:
- case CODE_FOR_avx512dq_rangepv2df_mask:
- case CODE_FOR_avx512dq_rangepv4sf_mask:
- case CODE_FOR_avx_shufpd256_mask:
- error ("the last argument must be a 4-bit immediate");
- return const0_rtx;
-
- case CODE_FOR_sha1rnds4:
- case CODE_FOR_sse4_1_blendpd:
- case CODE_FOR_avx_vpermilv2df:
- case CODE_FOR_avx_vpermilv2df_mask:
- case CODE_FOR_xop_vpermil2v2df3:
- case CODE_FOR_xop_vpermil2v4sf3:
- case CODE_FOR_xop_vpermil2v4df3:
- case CODE_FOR_xop_vpermil2v8sf3:
- case CODE_FOR_avx512f_vinsertf32x4_mask:
- case CODE_FOR_avx512f_vinserti32x4_mask:
- case CODE_FOR_avx512f_vextractf32x4_mask:
- case CODE_FOR_avx512f_vextracti32x4_mask:
- case CODE_FOR_sse2_shufpd:
- case CODE_FOR_sse2_shufpd_mask:
- case CODE_FOR_avx512dq_shuf_f64x2_mask:
- case CODE_FOR_avx512dq_shuf_i64x2_mask:
- case CODE_FOR_avx512vl_shuf_i32x4_mask:
- case CODE_FOR_avx512vl_shuf_f32x4_mask:
- error ("the last argument must be a 2-bit immediate");
- return const0_rtx;
-
- case CODE_FOR_avx_vextractf128v4df:
- case CODE_FOR_avx_vextractf128v8sf:
- case CODE_FOR_avx_vextractf128v8si:
- case CODE_FOR_avx_vinsertf128v4df:
- case CODE_FOR_avx_vinsertf128v8sf:
- case CODE_FOR_avx_vinsertf128v8si:
- case CODE_FOR_avx512f_vinsertf64x4_mask:
- case CODE_FOR_avx512f_vinserti64x4_mask:
- case CODE_FOR_avx512f_vextractf64x4_mask:
- case CODE_FOR_avx512f_vextracti64x4_mask:
- case CODE_FOR_avx512dq_vinsertf32x8_mask:
- case CODE_FOR_avx512dq_vinserti32x8_mask:
- case CODE_FOR_avx512vl_vinsertv4df:
- case CODE_FOR_avx512vl_vinsertv4di:
- case CODE_FOR_avx512vl_vinsertv8sf:
- case CODE_FOR_avx512vl_vinsertv8si:
- error ("the last argument must be a 1-bit immediate");
- return const0_rtx;
-
- case CODE_FOR_avx_vmcmpv2df3:
- case CODE_FOR_avx_vmcmpv4sf3:
- case CODE_FOR_avx_cmpv2df3:
- case CODE_FOR_avx_cmpv4sf3:
- case CODE_FOR_avx_cmpv4df3:
- case CODE_FOR_avx_cmpv8sf3:
- case CODE_FOR_avx512f_cmpv8df3_mask:
- case CODE_FOR_avx512f_cmpv16sf3_mask:
- case CODE_FOR_avx512f_vmcmpv2df3_mask:
- case CODE_FOR_avx512f_vmcmpv4sf3_mask:
- error ("the last argument must be a 5-bit immediate");
- return const0_rtx;
-
- default:
- switch (nargs_constant)
- {
- case 2:
- if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
- (!mask_pos && (nargs - i) == nargs_constant))
- {
- error ("the next to last argument must be an 8-bit immediate");
- break;
- }
- /* FALLTHRU */
- case 1:
- error ("the last argument must be an 8-bit immediate");
- break;
- default:
- gcc_unreachable ();
- }
- return const0_rtx;
- }
- }
- else
- {
- if (VECTOR_MODE_P (mode))
- op = safe_vector_operand (op, mode);
-
- /* If we aren't optimizing, only allow one memory operand to
- be generated. */
- if (memory_operand (op, mode))
- num_memory++;
-
- op = fixup_modeless_constant (op, mode);
-
- if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
- {
- if (optimize || !match || num_memory > 1)
- op = copy_to_mode_reg (mode, op);
- }
- else
- {
- op = copy_to_reg (op);
- op = lowpart_subreg (mode, op, GET_MODE (op));
- }
- }
-
- args[i].op = op;
- args[i].mode = mode;
- }
-
- switch (nargs)
- {
- case 1:
- pat = GEN_FCN (icode) (real_target, args[0].op);
- break;
- case 2:
- pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
- break;
- case 3:
- pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
- args[2].op);
- break;
- case 4:
- pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
- args[2].op, args[3].op);
- break;
- case 5:
- pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
- args[2].op, args[3].op, args[4].op);
- break;
- case 6:
- pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
- args[2].op, args[3].op, args[4].op,
- args[5].op);
- break;
- default:
- gcc_unreachable ();
- }
-
- if (! pat)
- return 0;
-
- emit_insn (pat);
- return target;
-}
-
- /* Transform a pattern of the following layout:
- (set A
- (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
- into:
- (set A B) */
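- /* This is only applied when the rounding operand turns out to be
- NO_ROUND, so that the plain (non-embedded-rounding) form of the
- pattern is used instead. */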
-
-static rtx
-ix86_erase_embedded_rounding (rtx pat)
-{
- if (GET_CODE (pat) == INSN)
- pat = PATTERN (pat);
-
- gcc_assert (GET_CODE (pat) == SET);
- rtx src = SET_SRC (pat);
- gcc_assert (XVECLEN (src, 0) == 2);
- rtx p0 = XVECEXP (src, 0, 0);
- gcc_assert (GET_CODE (src) == UNSPEC
- && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
- rtx res = gen_rtx_SET (SET_DEST (pat), p0);
- return res;
-}
-
-/* Subroutine of ix86_expand_round_builtin to take care of comi insns
- with rounding. */
-static rtx
-ix86_expand_sse_comi_round (const struct builtin_description *d,
- tree exp, rtx target)
-{
- rtx pat, set_dst;
- tree arg0 = CALL_EXPR_ARG (exp, 0);
- tree arg1 = CALL_EXPR_ARG (exp, 1);
- tree arg2 = CALL_EXPR_ARG (exp, 2);
- tree arg3 = CALL_EXPR_ARG (exp, 3);
- rtx op0 = expand_normal (arg0);
- rtx op1 = expand_normal (arg1);
- rtx op2 = expand_normal (arg2);
- rtx op3 = expand_normal (arg3);
- enum insn_code icode = d->icode;
- const struct insn_data_d *insn_p = &insn_data[icode];
- machine_mode mode0 = insn_p->operand[0].mode;
- machine_mode mode1 = insn_p->operand[1].mode;
- enum rtx_code comparison = UNEQ;
- bool need_ucomi = false;
-
- /* See avxintrin.h for values. */
- enum rtx_code comi_comparisons[32] =
- {
- UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
- UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
- UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
- };
- bool need_ucomi_values[32] =
- {
- true, false, false, true, true, false, false, true,
- true, false, false, true, true, false, false, true,
- false, true, true, false, false, true, true, false,
- false, true, true, false, false, true, true, false
- };
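- /* For example, predicate value 0 maps to rtx code UNEQ with need_ucomi
- set, i.e. the ucomi form of the instruction is selected below. */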
-
- if (!CONST_INT_P (op2))
- {
- error ("the third argument must be comparison constant");
- return const0_rtx;
- }
- if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
- {
- error ("incorrect comparison mode");
- return const0_rtx;
- }
-
- if (!insn_p->operand[2].predicate (op3, SImode))
- {
- error ("incorrect rounding operand");
- return const0_rtx;
- }
-
- comparison = comi_comparisons[INTVAL (op2)];
- need_ucomi = need_ucomi_values[INTVAL (op2)];
-
- if (VECTOR_MODE_P (mode0))
- op0 = safe_vector_operand (op0, mode0);
- if (VECTOR_MODE_P (mode1))
- op1 = safe_vector_operand (op1, mode1);
-
- target = gen_reg_rtx (SImode);
- emit_move_insn (target, const0_rtx);
- target = gen_rtx_SUBREG (QImode, target, 0);
-
- if ((optimize && !register_operand (op0, mode0))
- || !insn_p->operand[0].predicate (op0, mode0))
- op0 = copy_to_mode_reg (mode0, op0);
- if ((optimize && !register_operand (op1, mode1))
- || !insn_p->operand[1].predicate (op1, mode1))
- op1 = copy_to_mode_reg (mode1, op1);
-
- if (need_ucomi)
- icode = icode == CODE_FOR_sse_comi_round
- ? CODE_FOR_sse_ucomi_round
- : CODE_FOR_sse2_ucomi_round;
-
- pat = GEN_FCN (icode) (op0, op1, op3);
- if (! pat)
- return 0;
-
- /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
- if (INTVAL (op3) == NO_ROUND)
- {
- pat = ix86_erase_embedded_rounding (pat);
- if (! pat)
- return 0;
-
- set_dst = SET_DEST (pat);
- }
- else
- {
- gcc_assert (GET_CODE (pat) == SET);
- set_dst = SET_DEST (pat);
- }
-
- emit_insn (pat);
- emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
- gen_rtx_fmt_ee (comparison, QImode,
- set_dst,
- const0_rtx)));
-
- return SUBREG_REG (target);
-}
-
-static rtx
-ix86_expand_round_builtin (const struct builtin_description *d,
- tree exp, rtx target)
-{
- rtx pat;
- unsigned int i, nargs;
- struct
- {
- rtx op;
- machine_mode mode;
- } args[6];
- enum insn_code icode = d->icode;
- const struct insn_data_d *insn_p = &insn_data[icode];
- machine_mode tmode = insn_p->operand[0].mode;
- unsigned int nargs_constant = 0;
- unsigned int redundant_embed_rnd = 0;
-
- switch ((enum ix86_builtin_func_type) d->flag)
- {
- case UINT64_FTYPE_V2DF_INT:
- case UINT64_FTYPE_V4SF_INT:
- case UINT_FTYPE_V2DF_INT:
- case UINT_FTYPE_V4SF_INT:
- case INT64_FTYPE_V2DF_INT:
- case INT64_FTYPE_V4SF_INT:
- case INT_FTYPE_V2DF_INT:
- case INT_FTYPE_V4SF_INT:
- nargs = 2;
- break;
- case V4SF_FTYPE_V4SF_UINT_INT:
- case V4SF_FTYPE_V4SF_UINT64_INT:
- case V2DF_FTYPE_V2DF_UINT64_INT:
- case V4SF_FTYPE_V4SF_INT_INT:
- case V4SF_FTYPE_V4SF_INT64_INT:
- case V2DF_FTYPE_V2DF_INT64_INT:
- case V4SF_FTYPE_V4SF_V4SF_INT:
- case V2DF_FTYPE_V2DF_V2DF_INT:
- case V4SF_FTYPE_V4SF_V2DF_INT:
- case V2DF_FTYPE_V2DF_V4SF_INT:
- nargs = 3;
- break;
- case V8SF_FTYPE_V8DF_V8SF_QI_INT:
- case V8DF_FTYPE_V8DF_V8DF_QI_INT:
- case V8SI_FTYPE_V8DF_V8SI_QI_INT:
- case V8DI_FTYPE_V8DF_V8DI_QI_INT:
- case V8SF_FTYPE_V8DI_V8SF_QI_INT:
- case V8DF_FTYPE_V8DI_V8DF_QI_INT:
- case V16SF_FTYPE_V16SF_V16SF_HI_INT:
- case V8DI_FTYPE_V8SF_V8DI_QI_INT:
- case V16SF_FTYPE_V16SI_V16SF_HI_INT:
- case V16SI_FTYPE_V16SF_V16SI_HI_INT:
- case V8DF_FTYPE_V8SF_V8DF_QI_INT:
- case V16SF_FTYPE_V16HI_V16SF_HI_INT:
- case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
- case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
- nargs = 4;
- break;
- case V4SF_FTYPE_V4SF_V4SF_INT_INT:
- case V2DF_FTYPE_V2DF_V2DF_INT_INT:
- nargs_constant = 2;
- nargs = 4;
- break;
- case INT_FTYPE_V4SF_V4SF_INT_INT:
- case INT_FTYPE_V2DF_V2DF_INT_INT:
- return ix86_expand_sse_comi_round (d, exp, target);
- case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
- case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
- case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
- case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
- case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
- case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
- case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
- case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
- nargs = 5;
- break;
- case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
- case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
- nargs_constant = 4;
- nargs = 5;
- break;
- case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
- case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
- case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
- case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
- nargs_constant = 3;
- nargs = 5;
- break;
- case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
- case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
- case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
- case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
- case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
- case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
- nargs = 6;
- nargs_constant = 4;
- break;
- case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
- case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
- case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
- case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
- nargs = 6;
- nargs_constant = 3;
- break;
- default:
- gcc_unreachable ();
- }
- gcc_assert (nargs <= ARRAY_SIZE (args));
-
- if (optimize
- || target == 0
- || GET_MODE (target) != tmode
- || !insn_p->operand[0].predicate (target, tmode))
- target = gen_reg_rtx (tmode);
-
- for (i = 0; i < nargs; i++)
- {
- tree arg = CALL_EXPR_ARG (exp, i);
- rtx op = expand_normal (arg);
- machine_mode mode = insn_p->operand[i + 1].mode;
- bool match = insn_p->operand[i + 1].predicate (op, mode);
-
- if (i == nargs - nargs_constant)
- {
- if (!match)
- {
- switch (icode)
- {
- case CODE_FOR_avx512f_getmantv8df_mask_round:
- case CODE_FOR_avx512f_getmantv16sf_mask_round:
- case CODE_FOR_avx512f_vgetmantv2df_round:
- case CODE_FOR_avx512f_vgetmantv2df_mask_round:
- case CODE_FOR_avx512f_vgetmantv4sf_round:
- case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
- error ("the immediate argument must be a 4-bit immediate");
- return const0_rtx;
- case CODE_FOR_avx512f_cmpv8df3_mask_round:
- case CODE_FOR_avx512f_cmpv16sf3_mask_round:
- case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
- case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
- error ("the immediate argument must be a 5-bit immediate");
- return const0_rtx;
- default:
- error ("the immediate argument must be an 8-bit immediate");
- return const0_rtx;
- }
- }
- }
- else if (i == nargs - 1)
- {
- if (!insn_p->operand[nargs].predicate (op, SImode))
- {
- error ("incorrect rounding operand");
- return const0_rtx;
- }
-
- /* If there is no rounding, use the normal version of the pattern. */
- if (INTVAL (op) == NO_ROUND)
- redundant_embed_rnd = 1;
- }
- else
- {
- if (VECTOR_MODE_P (mode))
- op = safe_vector_operand (op, mode);
-
- op = fixup_modeless_constant (op, mode);
-
- if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
- {
- if (optimize || !match)
- op = copy_to_mode_reg (mode, op);
- }
- else
- {
- op = copy_to_reg (op);
- op = lowpart_subreg (mode, op, GET_MODE (op));
- }
- }
-
- args[i].op = op;
- args[i].mode = mode;
- }
-
- switch (nargs)
- {
- case 1:
- pat = GEN_FCN (icode) (target, args[0].op);
- break;
- case 2:
- pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
- break;
- case 3:
- pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
- args[2].op);
- break;
- case 4:
- pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
- args[2].op, args[3].op);
- break;
- case 5:
- pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
- args[2].op, args[3].op, args[4].op);
- break;
- case 6:
- pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
- args[2].op, args[3].op, args[4].op,
- args[5].op);
- break;
- default:
- gcc_unreachable ();
- }
-
- if (!pat)
- return 0;
-
- if (redundant_embed_rnd)
- pat = ix86_erase_embedded_rounding (pat);
-
- emit_insn (pat);
- return target;
-}
-
-/* Subroutine of ix86_expand_builtin to take care of special insns
- with a variable number of operands. */
-
-static rtx
-ix86_expand_special_args_builtin (const struct builtin_description *d,
- tree exp, rtx target)
-{
- tree arg;
- rtx pat, op;
- unsigned int i, nargs, arg_adjust, memory;
- bool aligned_mem = false;
- struct
- {
- rtx op;
- machine_mode mode;
- } args[3];
- enum insn_code icode = d->icode;
- bool last_arg_constant = false;
- const struct insn_data_d *insn_p = &insn_data[icode];
- machine_mode tmode = insn_p->operand[0].mode;
- enum { load, store } klass;
-
- switch ((enum ix86_builtin_func_type) d->flag)
- {
- case VOID_FTYPE_VOID:
- emit_insn (GEN_FCN (icode) (target));
- return 0;
- case VOID_FTYPE_UINT64:
- case VOID_FTYPE_UNSIGNED:
- nargs = 0;
- klass = store;
- memory = 0;
- break;
-
- case INT_FTYPE_VOID:
- case USHORT_FTYPE_VOID:
- case UINT64_FTYPE_VOID:
- case UINT_FTYPE_VOID:
- case UNSIGNED_FTYPE_VOID:
- nargs = 0;
- klass = load;
- memory = 0;
- break;
- case UINT64_FTYPE_PUNSIGNED:
- case V2DI_FTYPE_PV2DI:
- case V4DI_FTYPE_PV4DI:
- case V32QI_FTYPE_PCCHAR:
- case V16QI_FTYPE_PCCHAR:
- case V8SF_FTYPE_PCV4SF:
- case V8SF_FTYPE_PCFLOAT:
- case V4SF_FTYPE_PCFLOAT:
- case V4DF_FTYPE_PCV2DF:
- case V4DF_FTYPE_PCDOUBLE:
- case V2DF_FTYPE_PCDOUBLE:
- case VOID_FTYPE_PVOID:
- case V8DI_FTYPE_PV8DI:
- nargs = 1;
- klass = load;
- memory = 0;
- switch (icode)
- {
- case CODE_FOR_sse4_1_movntdqa:
- case CODE_FOR_avx2_movntdqa:
- case CODE_FOR_avx512f_movntdqa:
- aligned_mem = true;
- break;
- default:
- break;
- }
- break;
- case VOID_FTYPE_PV2SF_V4SF:
- case VOID_FTYPE_PV8DI_V8DI:
- case VOID_FTYPE_PV4DI_V4DI:
- case VOID_FTYPE_PV2DI_V2DI:
- case VOID_FTYPE_PCHAR_V32QI:
- case VOID_FTYPE_PCHAR_V16QI:
- case VOID_FTYPE_PFLOAT_V16SF:
- case VOID_FTYPE_PFLOAT_V8SF:
- case VOID_FTYPE_PFLOAT_V4SF:
- case VOID_FTYPE_PDOUBLE_V8DF:
- case VOID_FTYPE_PDOUBLE_V4DF:
- case VOID_FTYPE_PDOUBLE_V2DF:
- case VOID_FTYPE_PLONGLONG_LONGLONG:
- case VOID_FTYPE_PULONGLONG_ULONGLONG:
- case VOID_FTYPE_PUNSIGNED_UNSIGNED:
- case VOID_FTYPE_PINT_INT:
- nargs = 1;
- klass = store;
- /* Reserve memory operand for target. */
- memory = ARRAY_SIZE (args);
- switch (icode)
- {
- /* These builtins and instructions require the memory
- to be properly aligned. */
- case CODE_FOR_avx_movntv4di:
- case CODE_FOR_sse2_movntv2di:
- case CODE_FOR_avx_movntv8sf:
- case CODE_FOR_sse_movntv4sf:
- case CODE_FOR_sse4a_vmmovntv4sf:
- case CODE_FOR_avx_movntv4df:
- case CODE_FOR_sse2_movntv2df:
- case CODE_FOR_sse4a_vmmovntv2df:
- case CODE_FOR_sse2_movntidi:
- case CODE_FOR_sse_movntq:
- case CODE_FOR_sse2_movntisi:
- case CODE_FOR_avx512f_movntv16sf:
- case CODE_FOR_avx512f_movntv8df:
- case CODE_FOR_avx512f_movntv8di:
- aligned_mem = true;
- break;
- default:
- break;
- }
- break;
- case VOID_FTYPE_PVOID_PCVOID:
- nargs = 1;
- klass = store;
- memory = 0;
- break;
- case V4SF_FTYPE_V4SF_PCV2SF:
- case V2DF_FTYPE_V2DF_PCDOUBLE:
- nargs = 2;
- klass = load;
- memory = 1;
- break;
- case V8SF_FTYPE_PCV8SF_V8SI:
- case V4DF_FTYPE_PCV4DF_V4DI:
- case V4SF_FTYPE_PCV4SF_V4SI:
- case V2DF_FTYPE_PCV2DF_V2DI:
- case V8SI_FTYPE_PCV8SI_V8SI:
- case V4DI_FTYPE_PCV4DI_V4DI:
- case V4SI_FTYPE_PCV4SI_V4SI:
- case V2DI_FTYPE_PCV2DI_V2DI:
- case VOID_FTYPE_INT_INT64:
- nargs = 2;
- klass = load;
- memory = 0;
- break;
- case VOID_FTYPE_PV8DF_V8DF_UQI:
- case VOID_FTYPE_PV4DF_V4DF_UQI:
- case VOID_FTYPE_PV2DF_V2DF_UQI:
- case VOID_FTYPE_PV16SF_V16SF_UHI:
- case VOID_FTYPE_PV8SF_V8SF_UQI:
- case VOID_FTYPE_PV4SF_V4SF_UQI:
- case VOID_FTYPE_PV8DI_V8DI_UQI:
- case VOID_FTYPE_PV4DI_V4DI_UQI:
- case VOID_FTYPE_PV2DI_V2DI_UQI:
- case VOID_FTYPE_PV16SI_V16SI_UHI:
- case VOID_FTYPE_PV8SI_V8SI_UQI:
- case VOID_FTYPE_PV4SI_V4SI_UQI:
- case VOID_FTYPE_PV64QI_V64QI_UDI:
- case VOID_FTYPE_PV32HI_V32HI_USI:
- case VOID_FTYPE_PV32QI_V32QI_USI:
- case VOID_FTYPE_PV16QI_V16QI_UHI:
- case VOID_FTYPE_PV16HI_V16HI_UHI:
- case VOID_FTYPE_PV8HI_V8HI_UQI:
- switch (icode)
- {
- /* These builtins and instructions require the memory
- to be properly aligned. */
- case CODE_FOR_avx512f_storev16sf_mask:
- case CODE_FOR_avx512f_storev16si_mask:
- case CODE_FOR_avx512f_storev8df_mask:
- case CODE_FOR_avx512f_storev8di_mask:
- case CODE_FOR_avx512vl_storev8sf_mask:
- case CODE_FOR_avx512vl_storev8si_mask:
- case CODE_FOR_avx512vl_storev4df_mask:
- case CODE_FOR_avx512vl_storev4di_mask:
- case CODE_FOR_avx512vl_storev4sf_mask:
- case CODE_FOR_avx512vl_storev4si_mask:
- case CODE_FOR_avx512vl_storev2df_mask:
- case CODE_FOR_avx512vl_storev2di_mask:
- aligned_mem = true;
- break;
- default:
- break;
- }
- /* FALLTHRU */
- case VOID_FTYPE_PV8SF_V8SI_V8SF:
- case VOID_FTYPE_PV4DF_V4DI_V4DF:
- case VOID_FTYPE_PV4SF_V4SI_V4SF:
- case VOID_FTYPE_PV2DF_V2DI_V2DF:
- case VOID_FTYPE_PV8SI_V8SI_V8SI:
- case VOID_FTYPE_PV4DI_V4DI_V4DI:
- case VOID_FTYPE_PV4SI_V4SI_V4SI:
- case VOID_FTYPE_PV2DI_V2DI_V2DI:
- case VOID_FTYPE_PV8SI_V8DI_UQI:
- case VOID_FTYPE_PV8HI_V8DI_UQI:
- case VOID_FTYPE_PV16HI_V16SI_UHI:
- case VOID_FTYPE_PV16QI_V8DI_UQI:
- case VOID_FTYPE_PV16QI_V16SI_UHI:
- case VOID_FTYPE_PV4SI_V4DI_UQI:
- case VOID_FTYPE_PV4SI_V2DI_UQI:
- case VOID_FTYPE_PV8HI_V4DI_UQI:
- case VOID_FTYPE_PV8HI_V2DI_UQI:
- case VOID_FTYPE_PV8HI_V8SI_UQI:
- case VOID_FTYPE_PV8HI_V4SI_UQI:
- case VOID_FTYPE_PV16QI_V4DI_UQI:
- case VOID_FTYPE_PV16QI_V2DI_UQI:
- case VOID_FTYPE_PV16QI_V8SI_UQI:
- case VOID_FTYPE_PV16QI_V4SI_UQI:
- case VOID_FTYPE_PCHAR_V64QI_UDI:
- case VOID_FTYPE_PCHAR_V32QI_USI:
- case VOID_FTYPE_PCHAR_V16QI_UHI:
- case VOID_FTYPE_PSHORT_V32HI_USI:
- case VOID_FTYPE_PSHORT_V16HI_UHI:
- case VOID_FTYPE_PSHORT_V8HI_UQI:
- case VOID_FTYPE_PINT_V16SI_UHI:
- case VOID_FTYPE_PINT_V8SI_UQI:
- case VOID_FTYPE_PINT_V4SI_UQI:
- case VOID_FTYPE_PINT64_V8DI_UQI:
- case VOID_FTYPE_PINT64_V4DI_UQI:
- case VOID_FTYPE_PINT64_V2DI_UQI:
- case VOID_FTYPE_PDOUBLE_V8DF_UQI:
- case VOID_FTYPE_PDOUBLE_V4DF_UQI:
- case VOID_FTYPE_PDOUBLE_V2DF_UQI:
- case VOID_FTYPE_PFLOAT_V16SF_UHI:
- case VOID_FTYPE_PFLOAT_V8SF_UQI:
- case VOID_FTYPE_PFLOAT_V4SF_UQI:
- case VOID_FTYPE_PV32QI_V32HI_USI:
- case VOID_FTYPE_PV16QI_V16HI_UHI:
- case VOID_FTYPE_PV8QI_V8HI_UQI:
- nargs = 2;
- klass = store;
- /* Reserve memory operand for target. */
- memory = ARRAY_SIZE (args);
- break;
- case V4SF_FTYPE_PCV4SF_V4SF_UQI:
- case V8SF_FTYPE_PCV8SF_V8SF_UQI:
- case V16SF_FTYPE_PCV16SF_V16SF_UHI:
- case V4SI_FTYPE_PCV4SI_V4SI_UQI:
- case V8SI_FTYPE_PCV8SI_V8SI_UQI:
- case V16SI_FTYPE_PCV16SI_V16SI_UHI:
- case V2DF_FTYPE_PCV2DF_V2DF_UQI:
- case V4DF_FTYPE_PCV4DF_V4DF_UQI:
- case V8DF_FTYPE_PCV8DF_V8DF_UQI:
- case V2DI_FTYPE_PCV2DI_V2DI_UQI:
- case V4DI_FTYPE_PCV4DI_V4DI_UQI:
- case V8DI_FTYPE_PCV8DI_V8DI_UQI:
- case V64QI_FTYPE_PCV64QI_V64QI_UDI:
- case V32HI_FTYPE_PCV32HI_V32HI_USI:
- case V32QI_FTYPE_PCV32QI_V32QI_USI:
- case V16QI_FTYPE_PCV16QI_V16QI_UHI:
- case V16HI_FTYPE_PCV16HI_V16HI_UHI:
- case V8HI_FTYPE_PCV8HI_V8HI_UQI:
- switch (icode)
- {
- /* These builtins and instructions require the memory
- to be properly aligned. */
- case CODE_FOR_avx512f_loadv16sf_mask:
- case CODE_FOR_avx512f_loadv16si_mask:
- case CODE_FOR_avx512f_loadv8df_mask:
- case CODE_FOR_avx512f_loadv8di_mask:
- case CODE_FOR_avx512vl_loadv8sf_mask:
- case CODE_FOR_avx512vl_loadv8si_mask:
- case CODE_FOR_avx512vl_loadv4df_mask:
- case CODE_FOR_avx512vl_loadv4di_mask:
- case CODE_FOR_avx512vl_loadv4sf_mask:
- case CODE_FOR_avx512vl_loadv4si_mask:
- case CODE_FOR_avx512vl_loadv2df_mask:
- case CODE_FOR_avx512vl_loadv2di_mask:
- case CODE_FOR_avx512bw_loadv64qi_mask:
- case CODE_FOR_avx512vl_loadv32qi_mask:
- case CODE_FOR_avx512vl_loadv16qi_mask:
- case CODE_FOR_avx512bw_loadv32hi_mask:
- case CODE_FOR_avx512vl_loadv16hi_mask:
- case CODE_FOR_avx512vl_loadv8hi_mask:
- aligned_mem = true;
- break;
- default:
- break;
- }
- /* FALLTHRU */
- case V64QI_FTYPE_PCCHAR_V64QI_UDI:
- case V32QI_FTYPE_PCCHAR_V32QI_USI:
- case V16QI_FTYPE_PCCHAR_V16QI_UHI:
- case V32HI_FTYPE_PCSHORT_V32HI_USI:
- case V16HI_FTYPE_PCSHORT_V16HI_UHI:
- case V8HI_FTYPE_PCSHORT_V8HI_UQI:
- case V16SI_FTYPE_PCINT_V16SI_UHI:
- case V8SI_FTYPE_PCINT_V8SI_UQI:
- case V4SI_FTYPE_PCINT_V4SI_UQI:
- case V8DI_FTYPE_PCINT64_V8DI_UQI:
- case V4DI_FTYPE_PCINT64_V4DI_UQI:
- case V2DI_FTYPE_PCINT64_V2DI_UQI:
- case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
- case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
- case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
- case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
- case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
- case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
- nargs = 3;
- klass = load;
- memory = 0;
- break;
- case VOID_FTYPE_UINT_UINT_UINT:
- case VOID_FTYPE_UINT64_UINT_UINT:
- case UCHAR_FTYPE_UINT_UINT_UINT:
- case UCHAR_FTYPE_UINT64_UINT_UINT:
- nargs = 3;
- klass = load;
- memory = ARRAY_SIZE (args);
- last_arg_constant = true;
- break;
- default:
- gcc_unreachable ();
- }
-
- gcc_assert (nargs <= ARRAY_SIZE (args));
-
- if (klass == store)
- {
- arg = CALL_EXPR_ARG (exp, 0);
- op = expand_normal (arg);
- gcc_assert (target == 0);
- if (memory)
- {
- op = ix86_zero_extend_to_Pmode (op);
- target = gen_rtx_MEM (tmode, op);
- /* target at this point has just BITS_PER_UNIT MEM_ALIGN
- on it. Try to improve it using get_pointer_alignment,
- and if the special builtin is one that requires strict
- mode alignment, also from its GET_MODE_ALIGNMENT.
- Failure to do so could lead to ix86_legitimate_combined_insn
- rejecting all changes to such insns. */
- unsigned int align = get_pointer_alignment (arg);
- if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
- align = GET_MODE_ALIGNMENT (tmode);
- if (MEM_ALIGN (target) < align)
- set_mem_align (target, align);
- }
- else
- target = force_reg (tmode, op);
- arg_adjust = 1;
- }
- else
- {
- arg_adjust = 0;
- if (optimize
- || target == 0
- || !register_operand (target, tmode)
- || GET_MODE (target) != tmode)
- target = gen_reg_rtx (tmode);
- }
-
- for (i = 0; i < nargs; i++)
- {
- machine_mode mode = insn_p->operand[i + 1].mode;
- bool match;
-
- arg = CALL_EXPR_ARG (exp, i + arg_adjust);
- op = expand_normal (arg);
- match = insn_p->operand[i + 1].predicate (op, mode);
-
- if (last_arg_constant && (i + 1) == nargs)
- {
- if (!match)
- {
- if (icode == CODE_FOR_lwp_lwpvalsi3
- || icode == CODE_FOR_lwp_lwpinssi3
- || icode == CODE_FOR_lwp_lwpvaldi3
- || icode == CODE_FOR_lwp_lwpinsdi3)
- error ("the last argument must be a 32-bit immediate");
- else
- error ("the last argument must be an 8-bit immediate");
- return const0_rtx;
- }
- }
- else
- {
- if (i == memory)
- {
- /* This must be the memory operand. */
- op = ix86_zero_extend_to_Pmode (op);
- op = gen_rtx_MEM (mode, op);
- /* op at this point has just BITS_PER_UNIT MEM_ALIGN
- on it. Try to improve it using get_pointer_alignment,
- and if the special builtin is one that requires strict
- mode alignment, also from its GET_MODE_ALIGNMENT.
- Failure to do so could lead to ix86_legitimate_combined_insn
- rejecting all changes to such insns. */
- unsigned int align = get_pointer_alignment (arg);
- if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
- align = GET_MODE_ALIGNMENT (mode);
- if (MEM_ALIGN (op) < align)
- set_mem_align (op, align);
- }
- else
- {
- /* This must be a register. */
- if (VECTOR_MODE_P (mode))
- op = safe_vector_operand (op, mode);
-
- op = fixup_modeless_constant (op, mode);
-
- if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
- op = copy_to_mode_reg (mode, op);
- else
- {
- op = copy_to_reg (op);
- op = lowpart_subreg (mode, op, GET_MODE (op));
- }
- }
- }
-
- args[i].op = op;
- args[i].mode = mode;
- }
-
- switch (nargs)
- {
- case 0:
- pat = GEN_FCN (icode) (target);
- break;
- case 1:
- pat = GEN_FCN (icode) (target, args[0].op);
- break;
- case 2:
- pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
- break;
- case 3:
- pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
- break;
- default:
- gcc_unreachable ();
- }
-
- if (! pat)
- return 0;
- emit_insn (pat);
- return klass == store ? 0 : target;
-}
-
-/* Return the integer constant in ARG. Constrain it to be in the range
- of the subparts of VEC_TYPE; issue an error if not. */
-
-static int
-get_element_number (tree vec_type, tree arg)
-{
- unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
-
- if (!tree_fits_uhwi_p (arg)
- || (elt = tree_to_uhwi (arg), elt > max))
- {
- error ("selector must be an integer constant in the range 0..%wi", max);
- return 0;
- }
-
- return elt;
-}
-
-/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
- ix86_expand_vector_init. We DO have language-level syntax for this, in
- the form of (type){ init-list }. Except that since we can't place emms
- instructions from inside the compiler, we can't allow the use of MMX
- registers unless the user explicitly asks for it. So we do *not* define
- vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
- we have builtins invoked by mmintrin.h that give us license to emit
- these sorts of instructions. */
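- /* For instance, _mm_set_pi32 in mmintrin.h is implemented with
- __builtin_ia32_vec_init_v2si. */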
-
-static rtx
-ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
-{
- machine_mode tmode = TYPE_MODE (type);
- machine_mode inner_mode = GET_MODE_INNER (tmode);
- int i, n_elt = GET_MODE_NUNITS (tmode);
- rtvec v = rtvec_alloc (n_elt);
-
- gcc_assert (VECTOR_MODE_P (tmode));
- gcc_assert (call_expr_nargs (exp) == n_elt);
-
- for (i = 0; i < n_elt; ++i)
- {
- rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
- RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
- }
-
- if (!target || !register_operand (target, tmode))
- target = gen_reg_rtx (tmode);
-
- ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
- return target;
-}
-
-/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
- ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
- had a language-level syntax for referencing vector elements. */
-
-static rtx
-ix86_expand_vec_ext_builtin (tree exp, rtx target)
-{
- machine_mode tmode, mode0;
- tree arg0, arg1;
- int elt;
- rtx op0;
-
- arg0 = CALL_EXPR_ARG (exp, 0);
- arg1 = CALL_EXPR_ARG (exp, 1);
-
- op0 = expand_normal (arg0);
- elt = get_element_number (TREE_TYPE (arg0), arg1);
-
- tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
- mode0 = TYPE_MODE (TREE_TYPE (arg0));
- gcc_assert (VECTOR_MODE_P (mode0));
-
- op0 = force_reg (mode0, op0);
-
- if (optimize || !target || !register_operand (target, tmode))
- target = gen_reg_rtx (tmode);
-
- ix86_expand_vector_extract (true, target, op0, elt);
-
- return target;
-}
-
-/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
- ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
- a language-level syntax for referencing vector elements. */
-
-static rtx
-ix86_expand_vec_set_builtin (tree exp)
-{
- machine_mode tmode, mode1;
- tree arg0, arg1, arg2;
- int elt;
- rtx op0, op1, target;
-
- arg0 = CALL_EXPR_ARG (exp, 0);
- arg1 = CALL_EXPR_ARG (exp, 1);
- arg2 = CALL_EXPR_ARG (exp, 2);
-
- tmode = TYPE_MODE (TREE_TYPE (arg0));
- mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
- gcc_assert (VECTOR_MODE_P (tmode));
-
- op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
- op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
- elt = get_element_number (TREE_TYPE (arg0), arg2);
-
- if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
- op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
-
- op0 = force_reg (tmode, op0);
- op1 = force_reg (mode1, op1);
-
- /* OP0 is the source of these builtin functions and shouldn't be
- modified. Create a copy, use it, and return it as the target. */
- target = gen_reg_rtx (tmode);
- emit_move_insn (target, op0);
- ix86_expand_vector_set (true, target, op1, elt);
-
- return target;
-}
-
-/* Expand an expression EXP that calls a built-in function,
- with result going to TARGET if that's convenient
- (and in mode MODE if that's convenient).
- SUBTARGET may be used as the target for computing one of EXP's operands.
- IGNORE is nonzero if the value is to be ignored. */
-
-static rtx
-ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
- machine_mode mode, int ignore)
-{
- size_t i;
- enum insn_code icode, icode2;
- tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
- tree arg0, arg1, arg2, arg3, arg4;
- rtx op0, op1, op2, op3, op4, pat, pat2, insn;
- machine_mode mode0, mode1, mode2, mode3, mode4;
- unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
-
- /* For CPU builtins that can be folded, fold first and expand the fold. */
- switch (fcode)
- {
- case IX86_BUILTIN_CPU_INIT:
- {
- /* Make it call __cpu_indicator_init in libgcc. */
- tree call_expr, fndecl, type;
- type = build_function_type_list (integer_type_node, NULL_TREE);
- fndecl = build_fn_decl ("__cpu_indicator_init", type);
- call_expr = build_call_expr (fndecl, 0);
- return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
- }
- case IX86_BUILTIN_CPU_IS:
- case IX86_BUILTIN_CPU_SUPPORTS:
- {
- tree arg0 = CALL_EXPR_ARG (exp, 0);
- tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
- gcc_assert (fold_expr != NULL_TREE);
- return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
- }
- }
-
- HOST_WIDE_INT isa = ix86_isa_flags;
- HOST_WIDE_INT isa2 = ix86_isa_flags2;
- HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
- HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
- /* The general case is that we require all the ISAs specified in
- bisa{,2} to be enabled.
- The exceptions are:
- OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
- OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
- OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
- where for each such pair it is sufficient if either of the ISAs is
- enabled, and if the pair is ORed with other options, those others
- must be enabled as well. */
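- /* For example, a builtin whose ISA mask is
- OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A is accepted as soon as
- either of those two ISAs is enabled. */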
- if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
- == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
- && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
- isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
- if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
- == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
- && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
- isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
- if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
- == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
- && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
- isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
- if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
- {
- bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
- if (TARGET_ABI_X32)
- bisa |= OPTION_MASK_ABI_X32;
- else
- bisa |= OPTION_MASK_ABI_64;
- char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
- (enum fpmath_unit) 0, false, add_abi_p);
- if (!opts)
- error ("%qE needs unknown isa option", fndecl);
- else
- {
- gcc_assert (opts != NULL);
- error ("%qE needs isa option %s", fndecl, opts);
- free (opts);
- }
- return expand_call (exp, target, ignore);
- }
-
- switch (fcode)
- {
- case IX86_BUILTIN_MASKMOVQ:
- case IX86_BUILTIN_MASKMOVDQU:
- icode = (fcode == IX86_BUILTIN_MASKMOVQ
- ? CODE_FOR_mmx_maskmovq
- : CODE_FOR_sse2_maskmovdqu);
- /* Note the arg order is different from the operand order. */
- arg1 = CALL_EXPR_ARG (exp, 0);
- arg2 = CALL_EXPR_ARG (exp, 1);
- arg0 = CALL_EXPR_ARG (exp, 2);
- op0 = expand_normal (arg0);
- op1 = expand_normal (arg1);
- op2 = expand_normal (arg2);
- mode0 = insn_data[icode].operand[0].mode;
- mode1 = insn_data[icode].operand[1].mode;
- mode2 = insn_data[icode].operand[2].mode;
-
- op0 = ix86_zero_extend_to_Pmode (op0);
- op0 = gen_rtx_MEM (mode1, op0);
-
- if (!insn_data[icode].operand[0].predicate (op0, mode0))
- op0 = copy_to_mode_reg (mode0, op0);
- if (!insn_data[icode].operand[1].predicate (op1, mode1))
- op1 = copy_to_mode_reg (mode1, op1);
- if (!insn_data[icode].operand[2].predicate (op2, mode2))
- op2 = copy_to_mode_reg (mode2, op2);
- pat = GEN_FCN (icode) (op0, op1, op2);
- if (! pat)
- return 0;
- emit_insn (pat);
- return 0;
-
- case IX86_BUILTIN_LDMXCSR:
- op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
- target = assign_386_stack_local (SImode, SLOT_TEMP);
- emit_move_insn (target, op0);
- emit_insn (gen_sse_ldmxcsr (target));
- return 0;
-
- case IX86_BUILTIN_STMXCSR:
- target = assign_386_stack_local (SImode, SLOT_TEMP);
- emit_insn (gen_sse_stmxcsr (target));
- return copy_to_mode_reg (SImode, target);
-
- case IX86_BUILTIN_CLFLUSH:
- arg0 = CALL_EXPR_ARG (exp, 0);
- op0 = expand_normal (arg0);
- icode = CODE_FOR_sse2_clflush;
- if (!insn_data[icode].operand[0].predicate (op0, Pmode))
- op0 = ix86_zero_extend_to_Pmode (op0);
-
- emit_insn (gen_sse2_clflush (op0));
- return 0;
-
- case IX86_BUILTIN_CLWB:
- arg0 = CALL_EXPR_ARG (exp, 0);
- op0 = expand_normal (arg0);
- icode = CODE_FOR_clwb;
- if (!insn_data[icode].operand[0].predicate (op0, Pmode))
- op0 = ix86_zero_extend_to_Pmode (op0);
-
- emit_insn (gen_clwb (op0));
- return 0;
-
- case IX86_BUILTIN_CLFLUSHOPT:
- arg0 = CALL_EXPR_ARG (exp, 0);
- op0 = expand_normal (arg0);
- icode = CODE_FOR_clflushopt;
- if (!insn_data[icode].operand[0].predicate (op0, Pmode))
- op0 = ix86_zero_extend_to_Pmode (op0);
-
- emit_insn (gen_clflushopt (op0));
- return 0;
-
- case IX86_BUILTIN_MONITOR:
- case IX86_BUILTIN_MONITORX:
- arg0 = CALL_EXPR_ARG (exp, 0);
- arg1 = CALL_EXPR_ARG (exp, 1);
- arg2 = CALL_EXPR_ARG (exp, 2);
- op0 = expand_normal (arg0);
- op1 = expand_normal (arg1);
- op2 = expand_normal (arg2);
- if (!REG_P (op0))
- op0 = ix86_zero_extend_to_Pmode (op0);
- if (!REG_P (op1))
- op1 = copy_to_mode_reg (SImode, op1);
- if (!REG_P (op2))
- op2 = copy_to_mode_reg (SImode, op2);
-
- emit_insn (fcode == IX86_BUILTIN_MONITOR
- ? ix86_gen_monitor (op0, op1, op2)
- : ix86_gen_monitorx (op0, op1, op2));
- return 0;
-
- case IX86_BUILTIN_MWAIT:
- arg0 = CALL_EXPR_ARG (exp, 0);
- arg1 = CALL_EXPR_ARG (exp, 1);
- op0 = expand_normal (arg0);
- op1 = expand_normal (arg1);
- if (!REG_P (op0))
- op0 = copy_to_mode_reg (SImode, op0);
- if (!REG_P (op1))
- op1 = copy_to_mode_reg (SImode, op1);
- emit_insn (gen_sse3_mwait (op0, op1));
- return 0;
-
- case IX86_BUILTIN_MWAITX:
- arg0 = CALL_EXPR_ARG (exp, 0);
- arg1 = CALL_EXPR_ARG (exp, 1);
- arg2 = CALL_EXPR_ARG (exp, 2);
- op0 = expand_normal (arg0);
- op1 = expand_normal (arg1);
- op2 = expand_normal (arg2);
- if (!REG_P (op0))
- op0 = copy_to_mode_reg (SImode, op0);
- if (!REG_P (op1))
- op1 = copy_to_mode_reg (SImode, op1);
- if (!REG_P (op2))
- op2 = copy_to_mode_reg (SImode, op2);
- emit_insn (gen_mwaitx (op0, op1, op2));
- return 0;
-
- case IX86_BUILTIN_UMONITOR:
- arg0 = CALL_EXPR_ARG (exp, 0);
- op0 = expand_normal (arg0);
-
- op0 = ix86_zero_extend_to_Pmode (op0);
-
- insn = (TARGET_64BIT
- ? gen_umonitor_di (op0)
- : gen_umonitor_si (op0));
-
- emit_insn (insn);
- return 0;
-
- case IX86_BUILTIN_UMWAIT:
- case IX86_BUILTIN_TPAUSE:
- arg0 = CALL_EXPR_ARG (exp, 0);
- arg1 = CALL_EXPR_ARG (exp, 1);
- op0 = expand_normal (arg0);
- op1 = expand_normal (arg1);
-
- if (!REG_P (op0))
- op0 = copy_to_mode_reg (SImode, op0);
-
- op1 = force_reg (DImode, op1);
-
- if (TARGET_64BIT)
- {
- op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
- NULL, 1, OPTAB_DIRECT);
- switch (fcode)
- {
- case IX86_BUILTIN_UMWAIT:
- icode = CODE_FOR_umwait_rex64;
- break;
- case IX86_BUILTIN_TPAUSE:
- icode = CODE_FOR_tpause_rex64;
- break;
- default:
- gcc_unreachable ();
- }
-
- op2 = gen_lowpart (SImode, op2);
- op1 = gen_lowpart (SImode, op1);
- pat = GEN_FCN (icode) (op0, op1, op2);
- }
- else
- {
- switch (fcode)
- {
- case IX86_BUILTIN_UMWAIT:
- icode = CODE_FOR_umwait;
- break;
- case IX86_BUILTIN_TPAUSE:
- icode = CODE_FOR_tpause;
- break;
- default:
- gcc_unreachable ();
- }
- pat = GEN_FCN (icode) (op0, op1);
- }
-
- if (!pat)
- return 0;
-
- emit_insn (pat);
-
- if (target == 0
- || !register_operand (target, QImode))
- target = gen_reg_rtx (QImode);
-
- pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
- const0_rtx);
- emit_insn (gen_rtx_SET (target, pat));
-
- return target;
-
- case IX86_BUILTIN_CLZERO:
- arg0 = CALL_EXPR_ARG (exp, 0);
- op0 = expand_normal (arg0);
- if (!REG_P (op0))
- op0 = ix86_zero_extend_to_Pmode (op0);
- emit_insn (ix86_gen_clzero (op0));
- return 0;
-
- case IX86_BUILTIN_CLDEMOTE:
- arg0 = CALL_EXPR_ARG (exp, 0);
- op0 = expand_normal (arg0);
- icode = CODE_FOR_cldemote;
- if (!insn_data[icode].operand[0].predicate (op0, Pmode))
- op0 = ix86_zero_extend_to_Pmode (op0);
-
- emit_insn (gen_cldemote (op0));
- return 0;
-
- case IX86_BUILTIN_VEC_INIT_V2SI:
- case IX86_BUILTIN_VEC_INIT_V4HI:
- case IX86_BUILTIN_VEC_INIT_V8QI:
- return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
-
- case IX86_BUILTIN_VEC_EXT_V2DF:
- case IX86_BUILTIN_VEC_EXT_V2DI:
- case IX86_BUILTIN_VEC_EXT_V4SF:
- case IX86_BUILTIN_VEC_EXT_V4SI:
- case IX86_BUILTIN_VEC_EXT_V8HI:
- case IX86_BUILTIN_VEC_EXT_V2SI:
- case IX86_BUILTIN_VEC_EXT_V4HI:
- case IX86_BUILTIN_VEC_EXT_V16QI:
- return ix86_expand_vec_ext_builtin (exp, target);
-
- case IX86_BUILTIN_VEC_SET_V2DI:
- case IX86_BUILTIN_VEC_SET_V4SF:
- case IX86_BUILTIN_VEC_SET_V4SI:
- case IX86_BUILTIN_VEC_SET_V8HI:
- case IX86_BUILTIN_VEC_SET_V4HI:
- case IX86_BUILTIN_VEC_SET_V16QI:
- return ix86_expand_vec_set_builtin (exp);
-
- case IX86_BUILTIN_NANQ:
- case IX86_BUILTIN_NANSQ:
- return expand_call (exp, target, ignore);
-
- case IX86_BUILTIN_RDPID:
-
- op0 = gen_reg_rtx (word_mode);
-
- if (TARGET_64BIT)
- {
- insn = gen_rdpid_rex64 (op0);
- op0 = convert_to_mode (SImode, op0, 1);
- }
- else
- insn = gen_rdpid (op0);
-
- emit_insn (insn);
-
- if (target == 0
- || !register_operand (target, SImode))
- target = gen_reg_rtx (SImode);
-
- emit_move_insn (target, op0);
- return target;
-
- case IX86_BUILTIN_RDPMC:
- case IX86_BUILTIN_RDTSC:
- case IX86_BUILTIN_RDTSCP:
- case IX86_BUILTIN_XGETBV:
-
- op0 = gen_reg_rtx (DImode);
- op1 = gen_reg_rtx (DImode);
-
- if (fcode == IX86_BUILTIN_RDPMC)
- {
- arg0 = CALL_EXPR_ARG (exp, 0);
- op2 = expand_normal (arg0);
- if (!register_operand (op2, SImode))
- op2 = copy_to_mode_reg (SImode, op2);
-
- insn = (TARGET_64BIT
- ? gen_rdpmc_rex64 (op0, op1, op2)
- : gen_rdpmc (op0, op2));
- emit_insn (insn);
- }
- else if (fcode == IX86_BUILTIN_XGETBV)
- {
- arg0 = CALL_EXPR_ARG (exp, 0);
- op2 = expand_normal (arg0);
- if (!register_operand (op2, SImode))
- op2 = copy_to_mode_reg (SImode, op2);
-
- insn = (TARGET_64BIT
- ? gen_xgetbv_rex64 (op0, op1, op2)
- : gen_xgetbv (op0, op2));
- emit_insn (insn);
- }
- else if (fcode == IX86_BUILTIN_RDTSC)
- {
- insn = (TARGET_64BIT
- ? gen_rdtsc_rex64 (op0, op1)
- : gen_rdtsc (op0));
- emit_insn (insn);
- }
- else
- {
- op2 = gen_reg_rtx (SImode);
-
- insn = (TARGET_64BIT
- ? gen_rdtscp_rex64 (op0, op1, op2)
- : gen_rdtscp (op0, op2));
- emit_insn (insn);
-
- arg0 = CALL_EXPR_ARG (exp, 0);
- op4 = expand_normal (arg0);
- if (!address_operand (op4, VOIDmode))
- {
- op4 = convert_memory_address (Pmode, op4);
- op4 = copy_addr_to_reg (op4);
- }
- emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
- }
-
- if (target == 0
- || !register_operand (target, DImode))
- target = gen_reg_rtx (DImode);
-
- if (TARGET_64BIT)
- {
- op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
- op1, 1, OPTAB_DIRECT);
- op0 = expand_simple_binop (DImode, IOR, op0, op1,
- op0, 1, OPTAB_DIRECT);
- }
-
- emit_move_insn (target, op0);
- return target;
-
- case IX86_BUILTIN_MOVDIR64B:
-
- arg0 = CALL_EXPR_ARG (exp, 0);
- arg1 = CALL_EXPR_ARG (exp, 1);
- op0 = expand_normal (arg0);
- op1 = expand_normal (arg1);
-
- op0 = ix86_zero_extend_to_Pmode (op0);
- if (!address_operand (op1, VOIDmode))
- {
- op1 = convert_memory_address (Pmode, op1);
- op1 = copy_addr_to_reg (op1);
- }
- op1 = gen_rtx_MEM (XImode, op1);
-
- insn = (TARGET_64BIT
- ? gen_movdir64b_di (op0, op1)
- : gen_movdir64b_si (op0, op1));
- emit_insn (insn);
- return 0;
-
- case IX86_BUILTIN_FXSAVE:
- case IX86_BUILTIN_FXRSTOR:
- case IX86_BUILTIN_FXSAVE64:
- case IX86_BUILTIN_FXRSTOR64:
- case IX86_BUILTIN_FNSTENV:
- case IX86_BUILTIN_FLDENV:
- mode0 = BLKmode;
- switch (fcode)
- {
- case IX86_BUILTIN_FXSAVE:
- icode = CODE_FOR_fxsave;
- break;
- case IX86_BUILTIN_FXRSTOR:
- icode = CODE_FOR_fxrstor;
- break;
- case IX86_BUILTIN_FXSAVE64:
- icode = CODE_FOR_fxsave64;
- break;
- case IX86_BUILTIN_FXRSTOR64:
- icode = CODE_FOR_fxrstor64;
- break;
- case IX86_BUILTIN_FNSTENV:
- icode = CODE_FOR_fnstenv;
- break;
- case IX86_BUILTIN_FLDENV:
- icode = CODE_FOR_fldenv;
- break;
- default:
- gcc_unreachable ();
- }
-
- arg0 = CALL_EXPR_ARG (exp, 0);
- op0 = expand_normal (arg0);
-
- if (!address_operand (op0, VOIDmode))
- {
- op0 = convert_memory_address (Pmode, op0);
- op0 = copy_addr_to_reg (op0);
- }
- op0 = gen_rtx_MEM (mode0, op0);
-
- pat = GEN_FCN (icode) (op0);
- if (pat)
- emit_insn (pat);
- return 0;
-
- case IX86_BUILTIN_XSETBV:
- arg0 = CALL_EXPR_ARG (exp, 0);
- arg1 = CALL_EXPR_ARG (exp, 1);
- op0 = expand_normal (arg0);
- op1 = expand_normal (arg1);
-
- if (!REG_P (op0))
- op0 = copy_to_mode_reg (SImode, op0);
-
- op1 = force_reg (DImode, op1);
-
- if (TARGET_64BIT)
- {
- op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
- NULL, 1, OPTAB_DIRECT);
-
- icode = CODE_FOR_xsetbv_rex64;
-
- op2 = gen_lowpart (SImode, op2);
- op1 = gen_lowpart (SImode, op1);
- pat = GEN_FCN (icode) (op0, op1, op2);
- }
- else
- {
- icode = CODE_FOR_xsetbv;
-
- pat = GEN_FCN (icode) (op0, op1);
- }
- if (pat)
- emit_insn (pat);
- return 0;
-
- case IX86_BUILTIN_XSAVE:
- case IX86_BUILTIN_XRSTOR:
- case IX86_BUILTIN_XSAVE64:
- case IX86_BUILTIN_XRSTOR64:
- case IX86_BUILTIN_XSAVEOPT:
- case IX86_BUILTIN_XSAVEOPT64:
- case IX86_BUILTIN_XSAVES:
- case IX86_BUILTIN_XRSTORS:
- case IX86_BUILTIN_XSAVES64:
- case IX86_BUILTIN_XRSTORS64:
- case IX86_BUILTIN_XSAVEC:
- case IX86_BUILTIN_XSAVEC64:
- arg0 = CALL_EXPR_ARG (exp, 0);
- arg1 = CALL_EXPR_ARG (exp, 1);
- op0 = expand_normal (arg0);
- op1 = expand_normal (arg1);
-
- if (!address_operand (op0, VOIDmode))
- {
- op0 = convert_memory_address (Pmode, op0);
- op0 = copy_addr_to_reg (op0);
- }
- op0 = gen_rtx_MEM (BLKmode, op0);
-
- op1 = force_reg (DImode, op1);
-
- if (TARGET_64BIT)
- {
- op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
- NULL, 1, OPTAB_DIRECT);
- switch (fcode)
- {
- case IX86_BUILTIN_XSAVE:
- icode = CODE_FOR_xsave_rex64;
- break;
- case IX86_BUILTIN_XRSTOR:
- icode = CODE_FOR_xrstor_rex64;
- break;
- case IX86_BUILTIN_XSAVE64:
- icode = CODE_FOR_xsave64;
- break;
- case IX86_BUILTIN_XRSTOR64:
- icode = CODE_FOR_xrstor64;
- break;
- case IX86_BUILTIN_XSAVEOPT:
- icode = CODE_FOR_xsaveopt_rex64;
- break;
- case IX86_BUILTIN_XSAVEOPT64:
- icode = CODE_FOR_xsaveopt64;
- break;
- case IX86_BUILTIN_XSAVES:
- icode = CODE_FOR_xsaves_rex64;
- break;
- case IX86_BUILTIN_XRSTORS:
- icode = CODE_FOR_xrstors_rex64;
- break;
- case IX86_BUILTIN_XSAVES64:
- icode = CODE_FOR_xsaves64;
- break;
- case IX86_BUILTIN_XRSTORS64:
- icode = CODE_FOR_xrstors64;
- break;
- case IX86_BUILTIN_XSAVEC:
- icode = CODE_FOR_xsavec_rex64;
- break;
- case IX86_BUILTIN_XSAVEC64:
- icode = CODE_FOR_xsavec64;
- break;
- default:
- gcc_unreachable ();
- }
-
- op2 = gen_lowpart (SImode, op2);
- op1 = gen_lowpart (SImode, op1);
- pat = GEN_FCN (icode) (op0, op1, op2);
- }
- else
- {
- switch (fcode)
- {
- case IX86_BUILTIN_XSAVE:
- icode = CODE_FOR_xsave;
- break;
- case IX86_BUILTIN_XRSTOR:
- icode = CODE_FOR_xrstor;
- break;
- case IX86_BUILTIN_XSAVEOPT:
- icode = CODE_FOR_xsaveopt;
- break;
- case IX86_BUILTIN_XSAVES:
- icode = CODE_FOR_xsaves;
- break;
- case IX86_BUILTIN_XRSTORS:
- icode = CODE_FOR_xrstors;
- break;
- case IX86_BUILTIN_XSAVEC:
- icode = CODE_FOR_xsavec;
- break;
- default:
- gcc_unreachable ();
- }
- pat = GEN_FCN (icode) (op0, op1);
- }
-
- if (pat)
- emit_insn (pat);
- return 0;
-
- case IX86_BUILTIN_LLWPCB:
- arg0 = CALL_EXPR_ARG (exp, 0);
- op0 = expand_normal (arg0);
- icode = CODE_FOR_lwp_llwpcb;
- if (!insn_data[icode].operand[0].predicate (op0, Pmode))
- op0 = ix86_zero_extend_to_Pmode (op0);
- emit_insn (gen_lwp_llwpcb (op0));
- return 0;
-
- case IX86_BUILTIN_SLWPCB:
- icode = CODE_FOR_lwp_slwpcb;
- if (!target
- || !insn_data[icode].operand[0].predicate (target, Pmode))
- target = gen_reg_rtx (Pmode);
- emit_insn (gen_lwp_slwpcb (target));
- return target;
-
- case IX86_BUILTIN_BEXTRI32:
- case IX86_BUILTIN_BEXTRI64:
- arg0 = CALL_EXPR_ARG (exp, 0);
- arg1 = CALL_EXPR_ARG (exp, 1);
- op0 = expand_normal (arg0);
- op1 = expand_normal (arg1);
- icode = (fcode == IX86_BUILTIN_BEXTRI32
- ? CODE_FOR_tbm_bextri_si
- : CODE_FOR_tbm_bextri_di);
- if (!CONST_INT_P (op1))
- {
- error ("last argument must be an immediate");
- return const0_rtx;
- }
- else
- {
- unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
- unsigned char lsb_index = INTVAL (op1) & 0xFF;
- op1 = GEN_INT (length);
- op2 = GEN_INT (lsb_index);
-
- mode1 = insn_data[icode].operand[1].mode;
- if (!insn_data[icode].operand[1].predicate (op0, mode1))
- op0 = copy_to_mode_reg (mode1, op0);
-
- mode0 = insn_data[icode].operand[0].mode;
- if (target == 0
- || !register_operand (target, mode0))
- target = gen_reg_rtx (mode0);
-
- pat = GEN_FCN (icode) (target, op0, op1, op2);
- if (pat)
- emit_insn (pat);
- return target;
- }
-
- case IX86_BUILTIN_RDRAND16_STEP:
- icode = CODE_FOR_rdrandhi_1;
- mode0 = HImode;
- goto rdrand_step;
-
- case IX86_BUILTIN_RDRAND32_STEP:
- icode = CODE_FOR_rdrandsi_1;
- mode0 = SImode;
- goto rdrand_step;
-
- case IX86_BUILTIN_RDRAND64_STEP:
- icode = CODE_FOR_rdranddi_1;
- mode0 = DImode;
-
-rdrand_step:
- arg0 = CALL_EXPR_ARG (exp, 0);
- op1 = expand_normal (arg0);
- if (!address_operand (op1, VOIDmode))
- {
- op1 = convert_memory_address (Pmode, op1);
- op1 = copy_addr_to_reg (op1);
- }
-
- op0 = gen_reg_rtx (mode0);
- emit_insn (GEN_FCN (icode) (op0));
-
- emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
-
- op1 = gen_reg_rtx (SImode);
- emit_move_insn (op1, CONST1_RTX (SImode));
-
- /* Emit SImode conditional move. */
- if (mode0 == HImode)
- {
- if (TARGET_ZERO_EXTEND_WITH_AND
- && optimize_function_for_speed_p (cfun))
- {
- op2 = force_reg (SImode, const0_rtx);
-
- emit_insn (gen_movstricthi
- (gen_lowpart (HImode, op2), op0));
- }
- else
- {
- op2 = gen_reg_rtx (SImode);
-
- emit_insn (gen_zero_extendhisi2 (op2, op0));
- }
- }
- else if (mode0 == SImode)
- op2 = op0;
- else
- op2 = gen_rtx_SUBREG (SImode, op0, 0);
-
- if (target == 0
- || !register_operand (target, SImode))
- target = gen_reg_rtx (SImode);
-
- pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
- const0_rtx);
- emit_insn (gen_rtx_SET (target,
- gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
- return target;
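-
- /* The conditional move above makes the builtin return 1 when RDRAND
- succeeded (CF set) and 0 otherwise, while the random value itself is
- stored through the pointer argument. A usage sketch, assuming the
- <immintrin.h> _rdrand32_step intrinsic:
-
- #include <immintrin.h>
-
- unsigned int
- get_random (void)
- {
- unsigned int value;
- while (!_rdrand32_step (&value))
- ; // retry until the hardware delivers a value
- return value;
- }
- */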
-
- case IX86_BUILTIN_RDSEED16_STEP:
- icode = CODE_FOR_rdseedhi_1;
- mode0 = HImode;
- goto rdseed_step;
-
- case IX86_BUILTIN_RDSEED32_STEP:
- icode = CODE_FOR_rdseedsi_1;
- mode0 = SImode;
- goto rdseed_step;
-
- case IX86_BUILTIN_RDSEED64_STEP:
- icode = CODE_FOR_rdseeddi_1;
- mode0 = DImode;
-
-rdseed_step:
- arg0 = CALL_EXPR_ARG (exp, 0);
- op1 = expand_normal (arg0);
- if (!address_operand (op1, VOIDmode))
- {
- op1 = convert_memory_address (Pmode, op1);
- op1 = copy_addr_to_reg (op1);
- }
-
- op0 = gen_reg_rtx (mode0);
- emit_insn (GEN_FCN (icode) (op0));
-
- emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
-
- op2 = gen_reg_rtx (QImode);
-
- pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
- const0_rtx);
- emit_insn (gen_rtx_SET (op2, pat));
-
- if (target == 0
- || !register_operand (target, SImode))
- target = gen_reg_rtx (SImode);
-
- emit_insn (gen_zero_extendqisi2 (target, op2));
- return target;
-
- case IX86_BUILTIN_SBB32:
- icode = CODE_FOR_subborrowsi;
- icode2 = CODE_FOR_subborrowsi_0;
- mode0 = SImode;
- mode1 = DImode;
- mode2 = CCmode;
- goto handlecarry;
-
- case IX86_BUILTIN_SBB64:
- icode = CODE_FOR_subborrowdi;
- icode2 = CODE_FOR_subborrowdi_0;
- mode0 = DImode;
- mode1 = TImode;
- mode2 = CCmode;
- goto handlecarry;
-
- case IX86_BUILTIN_ADDCARRYX32:
- icode = CODE_FOR_addcarrysi;
- icode2 = CODE_FOR_addcarrysi_0;
- mode0 = SImode;
- mode1 = DImode;
- mode2 = CCCmode;
- goto handlecarry;
-
- case IX86_BUILTIN_ADDCARRYX64:
- icode = CODE_FOR_addcarrydi;
- icode2 = CODE_FOR_addcarrydi_0;
- mode0 = DImode;
- mode1 = TImode;
- mode2 = CCCmode;
-
- handlecarry:
- arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
- arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
- arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
- arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
-
- op1 = expand_normal (arg0);
- if (!integer_zerop (arg0))
- op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
-
- op2 = expand_normal (arg1);
- if (!register_operand (op2, mode0))
- op2 = copy_to_mode_reg (mode0, op2);
-
- op3 = expand_normal (arg2);
- if (!register_operand (op3, mode0))
- op3 = copy_to_mode_reg (mode0, op3);
-
- op4 = expand_normal (arg3);
- if (!address_operand (op4, VOIDmode))
- {
- op4 = convert_memory_address (Pmode, op4);
- op4 = copy_addr_to_reg (op4);
- }
-
- op0 = gen_reg_rtx (mode0);
- if (integer_zerop (arg0))
- {
- /* If arg0 is 0, optimize right away into an add or sub
- instruction that sets the CCCmode flags. */
- op1 = gen_rtx_REG (mode2, FLAGS_REG);
- emit_insn (GEN_FCN (icode2) (op0, op2, op3));
- }
- else
- {
- /* Generate CF from input operand. */
- emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
-
- /* Generate instruction that consumes CF. */
- op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
- pat = gen_rtx_LTU (mode1, op1, const0_rtx);
- pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
- emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
- }
-
- /* Return current CF value. */
- if (target == 0)
- target = gen_reg_rtx (QImode);
-
- pat = gen_rtx_LTU (QImode, op1, const0_rtx);
- emit_insn (gen_rtx_SET (target, pat));
-
- /* Store the result. */
- emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
-
- return target;
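-
- /* A usage sketch for the carry-chaining builtins handled above,
- assuming the <immintrin.h> _addcarry_u32 intrinsic: the carry-out of
- each step feeds the carry-in of the next, so a multi-word addition
- can compile down to an ADD followed by ADC instructions.
-
- #include <immintrin.h>
-
- unsigned char
- add_u128 (const unsigned int a[4], const unsigned int b[4],
- unsigned int out[4])
- {
- unsigned char c = 0;
- for (int i = 0; i < 4; i++)
- c = _addcarry_u32 (c, a[i], b[i], &out[i]);
- return c; // final carry-out
- }
- */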
-
- case IX86_BUILTIN_READ_FLAGS:
- emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
-
- if (optimize
- || target == NULL_RTX
- || !nonimmediate_operand (target, word_mode)
- || GET_MODE (target) != word_mode)
- target = gen_reg_rtx (word_mode);
-
- emit_insn (gen_pop (target));
- return target;
-
- case IX86_BUILTIN_WRITE_FLAGS:
-
- arg0 = CALL_EXPR_ARG (exp, 0);
- op0 = expand_normal (arg0);
- if (!general_no_elim_operand (op0, word_mode))
- op0 = copy_to_mode_reg (word_mode, op0);
-
- emit_insn (gen_push (op0));
- emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
- return 0;
-
- case IX86_BUILTIN_KTESTC8:
- icode = CODE_FOR_ktestqi;
- mode3 = CCCmode;
- goto kortest;
-
- case IX86_BUILTIN_KTESTZ8:
- icode = CODE_FOR_ktestqi;
- mode3 = CCZmode;
- goto kortest;
-
- case IX86_BUILTIN_KTESTC16:
- icode = CODE_FOR_ktesthi;
- mode3 = CCCmode;
- goto kortest;
-
- case IX86_BUILTIN_KTESTZ16:
- icode = CODE_FOR_ktesthi;
- mode3 = CCZmode;
- goto kortest;
-
- case IX86_BUILTIN_KTESTC32:
- icode = CODE_FOR_ktestsi;
- mode3 = CCCmode;
- goto kortest;
-
- case IX86_BUILTIN_KTESTZ32:
- icode = CODE_FOR_ktestsi;
- mode3 = CCZmode;
- goto kortest;
-
- case IX86_BUILTIN_KTESTC64:
- icode = CODE_FOR_ktestdi;
- mode3 = CCCmode;
- goto kortest;
-
- case IX86_BUILTIN_KTESTZ64:
- icode = CODE_FOR_ktestdi;
- mode3 = CCZmode;
- goto kortest;
-
- case IX86_BUILTIN_KORTESTC8:
- icode = CODE_FOR_kortestqi;
- mode3 = CCCmode;
- goto kortest;
-
- case IX86_BUILTIN_KORTESTZ8:
- icode = CODE_FOR_kortestqi;
- mode3 = CCZmode;
- goto kortest;
-
- case IX86_BUILTIN_KORTESTC16:
- icode = CODE_FOR_kortesthi;
- mode3 = CCCmode;
- goto kortest;
-
- case IX86_BUILTIN_KORTESTZ16:
- icode = CODE_FOR_kortesthi;
- mode3 = CCZmode;
- goto kortest;
-
- case IX86_BUILTIN_KORTESTC32:
- icode = CODE_FOR_kortestsi;
- mode3 = CCCmode;
- goto kortest;
-
- case IX86_BUILTIN_KORTESTZ32:
- icode = CODE_FOR_kortestsi;
- mode3 = CCZmode;
- goto kortest;
-
- case IX86_BUILTIN_KORTESTC64:
- icode = CODE_FOR_kortestdi;
- mode3 = CCCmode;
- goto kortest;
-
- case IX86_BUILTIN_KORTESTZ64:
- icode = CODE_FOR_kortestdi;
- mode3 = CCZmode;
-
- kortest:
- arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
- arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
- op0 = expand_normal (arg0);
- op1 = expand_normal (arg1);
-
- mode0 = insn_data[icode].operand[0].mode;
- mode1 = insn_data[icode].operand[1].mode;
-
- if (GET_MODE (op0) != VOIDmode)
- op0 = force_reg (GET_MODE (op0), op0);
-
- op0 = gen_lowpart (mode0, op0);
-
- if (!insn_data[icode].operand[0].predicate (op0, mode0))
- op0 = copy_to_mode_reg (mode0, op0);
-
- if (GET_MODE (op1) != VOIDmode)
- op1 = force_reg (GET_MODE (op1), op1);
-
- op1 = gen_lowpart (mode1, op1);
-
- if (!insn_data[icode].operand[1].predicate (op1, mode1))
- op1 = copy_to_mode_reg (mode1, op1);
-
- target = gen_reg_rtx (QImode);
-
- /* Emit kortest. */
- emit_insn (GEN_FCN (icode) (op0, op1));
- /* And use setcc to return result from flags. */
- ix86_expand_setcc (target, EQ,
- gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
- return target;
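-
- /* A usage sketch for the KORTEST expansion above, assuming the
- AVX-512 _mm512_cmpeq_epi32_mask and _kortestz_mask16_u8 intrinsics
- from <immintrin.h>; the setcc on the chosen CC mode (CCZmode here)
- turns the flags result into the 0/1 value the intrinsic returns.
-
- #include <immintrin.h>
-
- int
- no_lane_equal (__m512i a, __m512i b)
- {
- __mmask16 m = _mm512_cmpeq_epi32_mask (a, b);
- return _kortestz_mask16_u8 (m, m); // 1 iff (m | m) == 0
- }
- */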
-
- case IX86_BUILTIN_GATHERSIV2DF:
- icode = CODE_FOR_avx2_gathersiv2df;
- goto gather_gen;
- case IX86_BUILTIN_GATHERSIV4DF:
- icode = CODE_FOR_avx2_gathersiv4df;
- goto gather_gen;
- case IX86_BUILTIN_GATHERDIV2DF:
- icode = CODE_FOR_avx2_gatherdiv2df;
- goto gather_gen;
- case IX86_BUILTIN_GATHERDIV4DF:
- icode = CODE_FOR_avx2_gatherdiv4df;
- goto gather_gen;
- case IX86_BUILTIN_GATHERSIV4SF:
- icode = CODE_FOR_avx2_gathersiv4sf;
- goto gather_gen;
- case IX86_BUILTIN_GATHERSIV8SF:
- icode = CODE_FOR_avx2_gathersiv8sf;
- goto gather_gen;
- case IX86_BUILTIN_GATHERDIV4SF:
- icode = CODE_FOR_avx2_gatherdiv4sf;
- goto gather_gen;
- case IX86_BUILTIN_GATHERDIV8SF:
- icode = CODE_FOR_avx2_gatherdiv8sf;
- goto gather_gen;
- case IX86_BUILTIN_GATHERSIV2DI:
- icode = CODE_FOR_avx2_gathersiv2di;
- goto gather_gen;
- case IX86_BUILTIN_GATHERSIV4DI:
- icode = CODE_FOR_avx2_gathersiv4di;
- goto gather_gen;
- case IX86_BUILTIN_GATHERDIV2DI:
- icode = CODE_FOR_avx2_gatherdiv2di;
- goto gather_gen;
- case IX86_BUILTIN_GATHERDIV4DI:
- icode = CODE_FOR_avx2_gatherdiv4di;
- goto gather_gen;
- case IX86_BUILTIN_GATHERSIV4SI:
- icode = CODE_FOR_avx2_gathersiv4si;
- goto gather_gen;
- case IX86_BUILTIN_GATHERSIV8SI:
- icode = CODE_FOR_avx2_gathersiv8si;
- goto gather_gen;
- case IX86_BUILTIN_GATHERDIV4SI:
- icode = CODE_FOR_avx2_gatherdiv4si;
- goto gather_gen;
- case IX86_BUILTIN_GATHERDIV8SI:
- icode = CODE_FOR_avx2_gatherdiv8si;
- goto gather_gen;
- case IX86_BUILTIN_GATHERALTSIV4DF:
- icode = CODE_FOR_avx2_gathersiv4df;
- goto gather_gen;
- case IX86_BUILTIN_GATHERALTDIV8SF:
- icode = CODE_FOR_avx2_gatherdiv8sf;
- goto gather_gen;
- case IX86_BUILTIN_GATHERALTSIV4DI:
- icode = CODE_FOR_avx2_gathersiv4di;
- goto gather_gen;
- case IX86_BUILTIN_GATHERALTDIV8SI:
- icode = CODE_FOR_avx2_gatherdiv8si;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3SIV16SF:
- icode = CODE_FOR_avx512f_gathersiv16sf;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3SIV8DF:
- icode = CODE_FOR_avx512f_gathersiv8df;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3DIV16SF:
- icode = CODE_FOR_avx512f_gatherdiv16sf;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3DIV8DF:
- icode = CODE_FOR_avx512f_gatherdiv8df;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3SIV16SI:
- icode = CODE_FOR_avx512f_gathersiv16si;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3SIV8DI:
- icode = CODE_FOR_avx512f_gathersiv8di;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3DIV16SI:
- icode = CODE_FOR_avx512f_gatherdiv16si;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3DIV8DI:
- icode = CODE_FOR_avx512f_gatherdiv8di;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3ALTSIV8DF:
- icode = CODE_FOR_avx512f_gathersiv8df;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3ALTDIV16SF:
- icode = CODE_FOR_avx512f_gatherdiv16sf;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3ALTSIV8DI:
- icode = CODE_FOR_avx512f_gathersiv8di;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3ALTDIV16SI:
- icode = CODE_FOR_avx512f_gatherdiv16si;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3SIV2DF:
- icode = CODE_FOR_avx512vl_gathersiv2df;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3SIV4DF:
- icode = CODE_FOR_avx512vl_gathersiv4df;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3DIV2DF:
- icode = CODE_FOR_avx512vl_gatherdiv2df;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3DIV4DF:
- icode = CODE_FOR_avx512vl_gatherdiv4df;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3SIV4SF:
- icode = CODE_FOR_avx512vl_gathersiv4sf;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3SIV8SF:
- icode = CODE_FOR_avx512vl_gathersiv8sf;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3DIV4SF:
- icode = CODE_FOR_avx512vl_gatherdiv4sf;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3DIV8SF:
- icode = CODE_FOR_avx512vl_gatherdiv8sf;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3SIV2DI:
- icode = CODE_FOR_avx512vl_gathersiv2di;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3SIV4DI:
- icode = CODE_FOR_avx512vl_gathersiv4di;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3DIV2DI:
- icode = CODE_FOR_avx512vl_gatherdiv2di;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3DIV4DI:
- icode = CODE_FOR_avx512vl_gatherdiv4di;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3SIV4SI:
- icode = CODE_FOR_avx512vl_gathersiv4si;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3SIV8SI:
- icode = CODE_FOR_avx512vl_gathersiv8si;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3DIV4SI:
- icode = CODE_FOR_avx512vl_gatherdiv4si;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3DIV8SI:
- icode = CODE_FOR_avx512vl_gatherdiv8si;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3ALTSIV4DF:
- icode = CODE_FOR_avx512vl_gathersiv4df;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3ALTDIV8SF:
- icode = CODE_FOR_avx512vl_gatherdiv8sf;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3ALTSIV4DI:
- icode = CODE_FOR_avx512vl_gathersiv4di;
- goto gather_gen;
- case IX86_BUILTIN_GATHER3ALTDIV8SI:
- icode = CODE_FOR_avx512vl_gatherdiv8si;
- goto gather_gen;
- case IX86_BUILTIN_SCATTERSIV16SF:
- icode = CODE_FOR_avx512f_scattersiv16sf;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERSIV8DF:
- icode = CODE_FOR_avx512f_scattersiv8df;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERDIV16SF:
- icode = CODE_FOR_avx512f_scatterdiv16sf;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERDIV8DF:
- icode = CODE_FOR_avx512f_scatterdiv8df;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERSIV16SI:
- icode = CODE_FOR_avx512f_scattersiv16si;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERSIV8DI:
- icode = CODE_FOR_avx512f_scattersiv8di;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERDIV16SI:
- icode = CODE_FOR_avx512f_scatterdiv16si;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERDIV8DI:
- icode = CODE_FOR_avx512f_scatterdiv8di;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERSIV8SF:
- icode = CODE_FOR_avx512vl_scattersiv8sf;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERSIV4SF:
- icode = CODE_FOR_avx512vl_scattersiv4sf;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERSIV4DF:
- icode = CODE_FOR_avx512vl_scattersiv4df;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERSIV2DF:
- icode = CODE_FOR_avx512vl_scattersiv2df;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERDIV8SF:
- icode = CODE_FOR_avx512vl_scatterdiv8sf;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERDIV4SF:
- icode = CODE_FOR_avx512vl_scatterdiv4sf;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERDIV4DF:
- icode = CODE_FOR_avx512vl_scatterdiv4df;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERDIV2DF:
- icode = CODE_FOR_avx512vl_scatterdiv2df;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERSIV8SI:
- icode = CODE_FOR_avx512vl_scattersiv8si;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERSIV4SI:
- icode = CODE_FOR_avx512vl_scattersiv4si;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERSIV4DI:
- icode = CODE_FOR_avx512vl_scattersiv4di;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERSIV2DI:
- icode = CODE_FOR_avx512vl_scattersiv2di;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERDIV8SI:
- icode = CODE_FOR_avx512vl_scatterdiv8si;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERDIV4SI:
- icode = CODE_FOR_avx512vl_scatterdiv4si;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERDIV4DI:
- icode = CODE_FOR_avx512vl_scatterdiv4di;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERDIV2DI:
- icode = CODE_FOR_avx512vl_scatterdiv2di;
- goto scatter_gen;
- case IX86_BUILTIN_GATHERPFDPD:
- icode = CODE_FOR_avx512pf_gatherpfv8sidf;
- goto vec_prefetch_gen;
- case IX86_BUILTIN_SCATTERALTSIV8DF:
- icode = CODE_FOR_avx512f_scattersiv8df;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERALTDIV16SF:
- icode = CODE_FOR_avx512f_scatterdiv16sf;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERALTSIV8DI:
- icode = CODE_FOR_avx512f_scattersiv8di;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERALTDIV16SI:
- icode = CODE_FOR_avx512f_scatterdiv16si;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERALTSIV4DF:
- icode = CODE_FOR_avx512vl_scattersiv4df;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERALTDIV8SF:
- icode = CODE_FOR_avx512vl_scatterdiv8sf;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERALTSIV4DI:
- icode = CODE_FOR_avx512vl_scattersiv4di;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERALTDIV8SI:
- icode = CODE_FOR_avx512vl_scatterdiv8si;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERALTSIV2DF:
- icode = CODE_FOR_avx512vl_scattersiv2df;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERALTDIV4SF:
- icode = CODE_FOR_avx512vl_scatterdiv4sf;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERALTSIV2DI:
- icode = CODE_FOR_avx512vl_scattersiv2di;
- goto scatter_gen;
- case IX86_BUILTIN_SCATTERALTDIV4SI:
- icode = CODE_FOR_avx512vl_scatterdiv4si;
- goto scatter_gen;
- case IX86_BUILTIN_GATHERPFDPS:
- icode = CODE_FOR_avx512pf_gatherpfv16sisf;
- goto vec_prefetch_gen;
- case IX86_BUILTIN_GATHERPFQPD:
- icode = CODE_FOR_avx512pf_gatherpfv8didf;
- goto vec_prefetch_gen;
- case IX86_BUILTIN_GATHERPFQPS:
- icode = CODE_FOR_avx512pf_gatherpfv8disf;
- goto vec_prefetch_gen;
- case IX86_BUILTIN_SCATTERPFDPD:
- icode = CODE_FOR_avx512pf_scatterpfv8sidf;
- goto vec_prefetch_gen;
- case IX86_BUILTIN_SCATTERPFDPS:
- icode = CODE_FOR_avx512pf_scatterpfv16sisf;
- goto vec_prefetch_gen;
- case IX86_BUILTIN_SCATTERPFQPD:
- icode = CODE_FOR_avx512pf_scatterpfv8didf;
- goto vec_prefetch_gen;
- case IX86_BUILTIN_SCATTERPFQPS:
- icode = CODE_FOR_avx512pf_scatterpfv8disf;
- goto vec_prefetch_gen;
-
- gather_gen:
- rtx half;
- rtx (*gen) (rtx, rtx);
-
- arg0 = CALL_EXPR_ARG (exp, 0);
- arg1 = CALL_EXPR_ARG (exp, 1);
- arg2 = CALL_EXPR_ARG (exp, 2);
- arg3 = CALL_EXPR_ARG (exp, 3);
- arg4 = CALL_EXPR_ARG (exp, 4);
- op0 = expand_normal (arg0);
- op1 = expand_normal (arg1);
- op2 = expand_normal (arg2);
- op3 = expand_normal (arg3);
- op4 = expand_normal (arg4);
- /* Note the arg order is different from the operand order. */
- mode0 = insn_data[icode].operand[1].mode;
- mode2 = insn_data[icode].operand[3].mode;
- mode3 = insn_data[icode].operand[4].mode;
- mode4 = insn_data[icode].operand[5].mode;
-
- if (target == NULL_RTX
- || GET_MODE (target) != insn_data[icode].operand[0].mode
- || !insn_data[icode].operand[0].predicate (target,
- GET_MODE (target)))
- subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
- else
- subtarget = target;
-
- switch (fcode)
- {
- case IX86_BUILTIN_GATHER3ALTSIV8DF:
- case IX86_BUILTIN_GATHER3ALTSIV8DI:
- half = gen_reg_rtx (V8SImode);
- if (!nonimmediate_operand (op2, V16SImode))
- op2 = copy_to_mode_reg (V16SImode, op2);
- emit_insn (gen_vec_extract_lo_v16si (half, op2));
- op2 = half;
- break;
- case IX86_BUILTIN_GATHER3ALTSIV4DF:
- case IX86_BUILTIN_GATHER3ALTSIV4DI:
- case IX86_BUILTIN_GATHERALTSIV4DF:
- case IX86_BUILTIN_GATHERALTSIV4DI:
- half = gen_reg_rtx (V4SImode);
- if (!nonimmediate_operand (op2, V8SImode))
- op2 = copy_to_mode_reg (V8SImode, op2);
- emit_insn (gen_vec_extract_lo_v8si (half, op2));
- op2 = half;
- break;
- case IX86_BUILTIN_GATHER3ALTDIV16SF:
- case IX86_BUILTIN_GATHER3ALTDIV16SI:
- half = gen_reg_rtx (mode0);
- if (mode0 == V8SFmode)
- gen = gen_vec_extract_lo_v16sf;
- else
- gen = gen_vec_extract_lo_v16si;
- if (!nonimmediate_operand (op0, GET_MODE (op0)))
- op0 = copy_to_mode_reg (GET_MODE (op0), op0);
- emit_insn (gen (half, op0));
- op0 = half;
- op3 = lowpart_subreg (QImode, op3, HImode);
- break;
- case IX86_BUILTIN_GATHER3ALTDIV8SF:
- case IX86_BUILTIN_GATHER3ALTDIV8SI:
- case IX86_BUILTIN_GATHERALTDIV8SF:
- case IX86_BUILTIN_GATHERALTDIV8SI:
- half = gen_reg_rtx (mode0);
- if (mode0 == V4SFmode)
- gen = gen_vec_extract_lo_v8sf;
- else
- gen = gen_vec_extract_lo_v8si;
- if (!nonimmediate_operand (op0, GET_MODE (op0)))
- op0 = copy_to_mode_reg (GET_MODE (op0), op0);
- emit_insn (gen (half, op0));
- op0 = half;
- if (VECTOR_MODE_P (GET_MODE (op3)))
- {
- half = gen_reg_rtx (mode0);
- if (!nonimmediate_operand (op3, GET_MODE (op3)))
- op3 = copy_to_mode_reg (GET_MODE (op3), op3);
- emit_insn (gen (half, op3));
- op3 = half;
- }
- break;
- default:
- break;
- }
-
- /* Force the memory operand to use only a base register here;
- we don't want to do this for the memory operands of other
- builtin functions. */
- op1 = ix86_zero_extend_to_Pmode (op1);
-
- if (!insn_data[icode].operand[1].predicate (op0, mode0))
- op0 = copy_to_mode_reg (mode0, op0);
- if (!insn_data[icode].operand[2].predicate (op1, Pmode))
- op1 = copy_to_mode_reg (Pmode, op1);
- if (!insn_data[icode].operand[3].predicate (op2, mode2))
- op2 = copy_to_mode_reg (mode2, op2);
-
- op3 = fixup_modeless_constant (op3, mode3);
-
- if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
- {
- if (!insn_data[icode].operand[4].predicate (op3, mode3))
- op3 = copy_to_mode_reg (mode3, op3);
- }
- else
- {
- op3 = copy_to_reg (op3);
- op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
- }
- if (!insn_data[icode].operand[5].predicate (op4, mode4))
- {
- error ("the last argument must be scale 1, 2, 4, 8");
- return const0_rtx;
- }
-
- /* Optimize. If the mask is known to have the most significant
- bit of every element set, replace op0 with pc_rtx to signal
- that the instruction overwrites the whole destination and
- doesn't use its previous contents. */
- if (optimize)
- {
- if (TREE_CODE (arg3) == INTEGER_CST)
- {
- if (integer_all_onesp (arg3))
- op0 = pc_rtx;
- }
- else if (TREE_CODE (arg3) == VECTOR_CST)
- {
- unsigned int negative = 0;
- for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
- {
- tree cst = VECTOR_CST_ELT (arg3, i);
- if (TREE_CODE (cst) == INTEGER_CST
- && tree_int_cst_sign_bit (cst))
- negative++;
- else if (TREE_CODE (cst) == REAL_CST
- && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
- negative++;
- }
- if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
- op0 = pc_rtx;
- }
- else if (TREE_CODE (arg3) == SSA_NAME
- && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
- {
- /* Recognize also when mask is like:
- __v2df src = _mm_setzero_pd ();
- __v2df mask = _mm_cmpeq_pd (src, src);
- or
- __v8sf src = _mm256_setzero_ps ();
- __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
- as that is a cheaper way to load all ones into
- a register than having to load a constant from
- memory. */
- gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
- if (is_gimple_call (def_stmt))
- {
- tree fndecl = gimple_call_fndecl (def_stmt);
- if (fndecl
- && fndecl_built_in_p (fndecl, BUILT_IN_MD))
- switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
- {
- case IX86_BUILTIN_CMPPD:
- case IX86_BUILTIN_CMPPS:
- case IX86_BUILTIN_CMPPD256:
- case IX86_BUILTIN_CMPPS256:
- if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
- break;
- /* FALLTHRU */
- case IX86_BUILTIN_CMPEQPD:
- case IX86_BUILTIN_CMPEQPS:
- if (initializer_zerop (gimple_call_arg (def_stmt, 0))
- && initializer_zerop (gimple_call_arg (def_stmt,
- 1)))
- op0 = pc_rtx;
- break;
- default:
- break;
- }
- }
- }
- }
-
- pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
- if (! pat)
- return const0_rtx;
- emit_insn (pat);
-
- switch (fcode)
- {
- case IX86_BUILTIN_GATHER3DIV16SF:
- if (target == NULL_RTX)
- target = gen_reg_rtx (V8SFmode);
- emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
- break;
- case IX86_BUILTIN_GATHER3DIV16SI:
- if (target == NULL_RTX)
- target = gen_reg_rtx (V8SImode);
- emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
- break;
- case IX86_BUILTIN_GATHER3DIV8SF:
- case IX86_BUILTIN_GATHERDIV8SF:
- if (target == NULL_RTX)
- target = gen_reg_rtx (V4SFmode);
- emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
- break;
- case IX86_BUILTIN_GATHER3DIV8SI:
- case IX86_BUILTIN_GATHERDIV8SI:
- if (target == NULL_RTX)
- target = gen_reg_rtx (V4SImode);
- emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
- break;
- default:
- target = subtarget;
- break;
- }
- return target;
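-
- /* A usage sketch for the gather expansion above, assuming the AVX2
- _mm256_i32gather_pd intrinsic from <immintrin.h>; the scale argument
- must be a literal 1, 2, 4 or 8, matching the operand check above.
-
- #include <immintrin.h>
-
- __m256d
- gather4 (const double *base, __m128i idx)
- {
- return _mm256_i32gather_pd (base, idx, 8); // loads base[idx[i]]
- }
- */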
-
- scatter_gen:
- arg0 = CALL_EXPR_ARG (exp, 0);
- arg1 = CALL_EXPR_ARG (exp, 1);
- arg2 = CALL_EXPR_ARG (exp, 2);
- arg3 = CALL_EXPR_ARG (exp, 3);
- arg4 = CALL_EXPR_ARG (exp, 4);
- op0 = expand_normal (arg0);
- op1 = expand_normal (arg1);
- op2 = expand_normal (arg2);
- op3 = expand_normal (arg3);
- op4 = expand_normal (arg4);
- mode1 = insn_data[icode].operand[1].mode;
- mode2 = insn_data[icode].operand[2].mode;
- mode3 = insn_data[icode].operand[3].mode;
- mode4 = insn_data[icode].operand[4].mode;
-
- /* The scatter instruction stores operand op3 to memory with
- indices from op2 and scale from op4 under writemask op1.
- If the index operand op2 has more elements than the source
- operand op3, only its low half is used, and vice versa. */
- switch (fcode)
- {
- case IX86_BUILTIN_SCATTERALTSIV8DF:
- case IX86_BUILTIN_SCATTERALTSIV8DI:
- half = gen_reg_rtx (V8SImode);
- if (!nonimmediate_operand (op2, V16SImode))
- op2 = copy_to_mode_reg (V16SImode, op2);
- emit_insn (gen_vec_extract_lo_v16si (half, op2));
- op2 = half;
- break;
- case IX86_BUILTIN_SCATTERALTDIV16SF:
- case IX86_BUILTIN_SCATTERALTDIV16SI:
- half = gen_reg_rtx (mode3);
- if (mode3 == V8SFmode)
- gen = gen_vec_extract_lo_v16sf;
- else
- gen = gen_vec_extract_lo_v16si;
- if (!nonimmediate_operand (op3, GET_MODE (op3)))
- op3 = copy_to_mode_reg (GET_MODE (op3), op3);
- emit_insn (gen (half, op3));
- op3 = half;
- break;
- case IX86_BUILTIN_SCATTERALTSIV4DF:
- case IX86_BUILTIN_SCATTERALTSIV4DI:
- half = gen_reg_rtx (V4SImode);
- if (!nonimmediate_operand (op2, V8SImode))
- op2 = copy_to_mode_reg (V8SImode, op2);
- emit_insn (gen_vec_extract_lo_v8si (half, op2));
- op2 = half;
- break;
- case IX86_BUILTIN_SCATTERALTDIV8SF:
- case IX86_BUILTIN_SCATTERALTDIV8SI:
- half = gen_reg_rtx (mode3);
- if (mode3 == V4SFmode)
- gen = gen_vec_extract_lo_v8sf;
- else
- gen = gen_vec_extract_lo_v8si;
- if (!nonimmediate_operand (op3, GET_MODE (op3)))
- op3 = copy_to_mode_reg (GET_MODE (op3), op3);
- emit_insn (gen (half, op3));
- op3 = half;
- break;
- case IX86_BUILTIN_SCATTERALTSIV2DF:
- case IX86_BUILTIN_SCATTERALTSIV2DI:
- if (!nonimmediate_operand (op2, V4SImode))
- op2 = copy_to_mode_reg (V4SImode, op2);
- break;
- case IX86_BUILTIN_SCATTERALTDIV4SF:
- case IX86_BUILTIN_SCATTERALTDIV4SI:
- if (!nonimmediate_operand (op3, GET_MODE (op3)))
- op3 = copy_to_mode_reg (GET_MODE (op3), op3);
- break;
- default:
- break;
- }
-
- /* Force the memory operand to use only a base register here;
- we don't want to do this for the memory operands of other
- builtin functions. */
- op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
-
- if (!insn_data[icode].operand[0].predicate (op0, Pmode))
- op0 = copy_to_mode_reg (Pmode, op0);
-
- op1 = fixup_modeless_constant (op1, mode1);
-
- if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
- {
- if (!insn_data[icode].operand[1].predicate (op1, mode1))
- op1 = copy_to_mode_reg (mode1, op1);
- }
- else
- {
- op1 = copy_to_reg (op1);
- op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
- }
-
- if (!insn_data[icode].operand[2].predicate (op2, mode2))
- op2 = copy_to_mode_reg (mode2, op2);
-
- if (!insn_data[icode].operand[3].predicate (op3, mode3))
- op3 = copy_to_mode_reg (mode3, op3);
-
- if (!insn_data[icode].operand[4].predicate (op4, mode4))
- {
- error ("the last argument must be scale 1, 2, 4, 8");
- return const0_rtx;
- }
-
- pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
- if (! pat)
- return const0_rtx;
-
- emit_insn (pat);
- return 0;
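-
- /* A usage sketch for the scatter expansion above, assuming the
- AVX-512F _mm512_i32scatter_pd intrinsic from <immintrin.h>; as with
- gather, the scale must be a literal 1, 2, 4 or 8.
-
- #include <immintrin.h>
-
- void
- scatter8 (double *base, __m256i idx, __m512d val)
- {
- _mm512_i32scatter_pd (base, idx, val, 8); // base[idx[i]] = val[i]
- }
- */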
-
- vec_prefetch_gen:
- arg0 = CALL_EXPR_ARG (exp, 0);
- arg1 = CALL_EXPR_ARG (exp, 1);
- arg2 = CALL_EXPR_ARG (exp, 2);
- arg3 = CALL_EXPR_ARG (exp, 3);
- arg4 = CALL_EXPR_ARG (exp, 4);
- op0 = expand_normal (arg0);
- op1 = expand_normal (arg1);
- op2 = expand_normal (arg2);
- op3 = expand_normal (arg3);
- op4 = expand_normal (arg4);
- mode0 = insn_data[icode].operand[0].mode;
- mode1 = insn_data[icode].operand[1].mode;
- mode3 = insn_data[icode].operand[3].mode;
- mode4 = insn_data[icode].operand[4].mode;
-
- op0 = fixup_modeless_constant (op0, mode0);
-
- if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
- {
- if (!insn_data[icode].operand[0].predicate (op0, mode0))
- op0 = copy_to_mode_reg (mode0, op0);
- }
- else
- {
- op0 = copy_to_reg (op0);
- op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
- }
-
- if (!insn_data[icode].operand[1].predicate (op1, mode1))
- op1 = copy_to_mode_reg (mode1, op1);
-
- /* Force the memory operand to use only a base register here;
- we don't want to do this for the memory operands of other
- builtin functions. */
- op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
-
- if (!insn_data[icode].operand[2].predicate (op2, Pmode))
- op2 = copy_to_mode_reg (Pmode, op2);
-
- if (!insn_data[icode].operand[3].predicate (op3, mode3))
- {
- error ("the forth argument must be scale 1, 2, 4, 8");
- return const0_rtx;
- }
-
- if (!insn_data[icode].operand[4].predicate (op4, mode4))
- {
- error ("incorrect hint operand");
- return const0_rtx;
- }
-
- pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
- if (! pat)
- return const0_rtx;
-
- emit_insn (pat);
-
- return 0;
-
- case IX86_BUILTIN_XABORT:
- icode = CODE_FOR_xabort;
- arg0 = CALL_EXPR_ARG (exp, 0);
- op0 = expand_normal (arg0);
- mode0 = insn_data[icode].operand[0].mode;
- if (!insn_data[icode].operand[0].predicate (op0, mode0))
- {
- error ("the argument to %<xabort%> intrinsic must "
- "be an 8-bit immediate");
- return const0_rtx;
- }
- emit_insn (gen_xabort (op0));
- return 0;
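-
- /* A usage sketch for the XABORT expansion above, assuming the RTM
- _xbegin/_xend/_xabort intrinsics from <immintrin.h>; the abort code
- must be an 8-bit compile-time constant, as the operand check above
- enforces.
-
- #include <immintrin.h>
-
- int
- try_increment (int *p)
- {
- if (_xbegin () == _XBEGIN_STARTED)
- {
- if (*p < 0)
- _xabort (1); // roll back with abort code 1
- ++*p;
- _xend ();
- return 1;
- }
- return 0; // aborted or RTM unavailable
- }
- */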
-
- case IX86_BUILTIN_RSTORSSP:
- case IX86_BUILTIN_CLRSSBSY:
- arg0 = CALL_EXPR_ARG (exp, 0);
- op0 = expand_normal (arg0);
- icode = (fcode == IX86_BUILTIN_RSTORSSP
- ? CODE_FOR_rstorssp
- : CODE_FOR_clrssbsy);
- if (!address_operand (op0, VOIDmode))
- {
- op1 = convert_memory_address (Pmode, op0);
- op0 = copy_addr_to_reg (op1);
- }
- emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
- return 0;
-
- case IX86_BUILTIN_WRSSD:
- case IX86_BUILTIN_WRSSQ:
- case IX86_BUILTIN_WRUSSD:
- case IX86_BUILTIN_WRUSSQ:
- arg0 = CALL_EXPR_ARG (exp, 0);
- op0 = expand_normal (arg0);
- arg1 = CALL_EXPR_ARG (exp, 1);
- op1 = expand_normal (arg1);
- switch (fcode)
- {
- case IX86_BUILTIN_WRSSD:
- icode = CODE_FOR_wrsssi;
- mode = SImode;
- break;
- case IX86_BUILTIN_WRSSQ:
- icode = CODE_FOR_wrssdi;
- mode = DImode;
- break;
- case IX86_BUILTIN_WRUSSD:
- icode = CODE_FOR_wrusssi;
- mode = SImode;
- break;
- case IX86_BUILTIN_WRUSSQ:
- icode = CODE_FOR_wrussdi;
- mode = DImode;
- break;
- }
- op0 = force_reg (mode, op0);
- if (!address_operand (op1, VOIDmode))
- {
- op2 = convert_memory_address (Pmode, op1);
- op1 = copy_addr_to_reg (op2);
- }
- emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
- return 0;
-
- default:
- break;
- }
-
- if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
- && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
- {
- i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
- return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
- target);
- }
-
- if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
- && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
- {
- i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
- rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
- rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
- rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
- int masked = 1;
- machine_mode mode, wide_mode, nar_mode;
-
- nar_mode = V4SFmode;
- mode = V16SFmode;
- wide_mode = V64SFmode;
- fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
- fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
-
- switch (fcode)
- {
- case IX86_BUILTIN_4FMAPS:
- fcn = gen_avx5124fmaddps_4fmaddps;
- masked = 0;
- goto v4fma_expand;
-
- case IX86_BUILTIN_4DPWSSD:
- nar_mode = V4SImode;
- mode = V16SImode;
- wide_mode = V64SImode;
- fcn = gen_avx5124vnniw_vp4dpwssd;
- masked = 0;
- goto v4fma_expand;
-
- case IX86_BUILTIN_4DPWSSDS:
- nar_mode = V4SImode;
- mode = V16SImode;
- wide_mode = V64SImode;
- fcn = gen_avx5124vnniw_vp4dpwssds;
- masked = 0;
- goto v4fma_expand;
-
- case IX86_BUILTIN_4FNMAPS:
- fcn = gen_avx5124fmaddps_4fnmaddps;
- masked = 0;
- goto v4fma_expand;
-
- case IX86_BUILTIN_4FNMAPS_MASK:
- fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
- fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
- goto v4fma_expand;
-
- case IX86_BUILTIN_4DPWSSD_MASK:
- nar_mode = V4SImode;
- mode = V16SImode;
- wide_mode = V64SImode;
- fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
- fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
- goto v4fma_expand;
-
- case IX86_BUILTIN_4DPWSSDS_MASK:
- nar_mode = V4SImode;
- mode = V16SImode;
- wide_mode = V64SImode;
- fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
- fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
- goto v4fma_expand;
-
- case IX86_BUILTIN_4FMAPS_MASK:
- {
- tree args[4];
- rtx ops[4];
- rtx wide_reg;
- rtx accum;
- rtx addr;
- rtx mem;
-
-v4fma_expand:
- wide_reg = gen_reg_rtx (wide_mode);
- for (i = 0; i < 4; i++)
- {
- args[i] = CALL_EXPR_ARG (exp, i);
- ops[i] = expand_normal (args[i]);
-
- emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
- ops[i]);
- }
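-
- /* At this point the four 512-bit source operands occupy consecutive
- 512-bit chunks of WIDE_REG, which is the layout the 4FMAPS/4VNNIW
- patterns expect for their operand group. */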
-
- accum = expand_normal (CALL_EXPR_ARG (exp, 4));
- accum = force_reg (mode, accum);
-
- addr = expand_normal (CALL_EXPR_ARG (exp, 5));
- addr = force_reg (Pmode, addr);
-
- mem = gen_rtx_MEM (nar_mode, addr);
-
- target = gen_reg_rtx (mode);
-
- emit_move_insn (target, accum);
-
- if (! masked)
- emit_insn (fcn (target, accum, wide_reg, mem));
- else
- {
- rtx merge, mask;
- merge = expand_normal (CALL_EXPR_ARG (exp, 6));
-
- mask = expand_normal (CALL_EXPR_ARG (exp, 7));
-
- if (CONST_INT_P (mask))
- mask = fixup_modeless_constant (mask, HImode);
-
- mask = force_reg (HImode, mask);
-
- if (GET_MODE (mask) != HImode)
- mask = gen_rtx_SUBREG (HImode, mask, 0);
-
- /* If merge is 0 then we're about to emit the z-masked variant. */
- if (const0_operand (merge, mode))
- emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
- /* If merge is the same as accum then emit the merge-masked variant. */
- else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
- {
- merge = force_reg (mode, merge);
- emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
- }
- /* Merging with an unknown value can happen when z-masking at -O0. */
- else
- {
- target = gen_reg_rtx (mode);
- emit_move_insn (target, merge);
- emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
- }
- }
- return target;
- }
-
- case IX86_BUILTIN_4FNMASS:
- fcn = gen_avx5124fmaddps_4fnmaddss;
- masked = 0;
- goto s4fma_expand;
-
- case IX86_BUILTIN_4FMASS:
- fcn = gen_avx5124fmaddps_4fmaddss;
- masked = 0;
- goto s4fma_expand;
-
- case IX86_BUILTIN_4FNMASS_MASK:
- fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
- fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
- goto s4fma_expand;
-
- case IX86_BUILTIN_4FMASS_MASK:
- {
- tree args[4];
- rtx ops[4];
- rtx wide_reg;
- rtx accum;
- rtx addr;
- rtx mem;
-
- fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
- fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
-
-s4fma_expand:
- mode = V4SFmode;
- wide_reg = gen_reg_rtx (V64SFmode);
- for (i = 0; i < 4; i++)
- {
- rtx tmp;
- args[i] = CALL_EXPR_ARG (exp, i);
- ops[i] = expand_normal (args[i]);
-
- tmp = gen_reg_rtx (SFmode);
- emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
-
- emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
- gen_rtx_SUBREG (V16SFmode, tmp, 0));
- }
-
- accum = expand_normal (CALL_EXPR_ARG (exp, 4));
- accum = force_reg (V4SFmode, accum);
-
- addr = expand_normal (CALL_EXPR_ARG (exp, 5));
- addr = force_reg (Pmode, addr);
-
- mem = gen_rtx_MEM (V4SFmode, addr);
-
- target = gen_reg_rtx (V4SFmode);
-
- emit_move_insn (target, accum);
-
- if (! masked)
- emit_insn (fcn (target, accum, wide_reg, mem));
- else
- {
- rtx merge, mask;
- merge = expand_normal (CALL_EXPR_ARG (exp, 6));
-
- mask = expand_normal (CALL_EXPR_ARG (exp, 7));
-
- if (CONST_INT_P (mask))
- mask = fixup_modeless_constant (mask, QImode);
-
- mask = force_reg (QImode, mask);
-
- if (GET_MODE (mask) != QImode)
- mask = gen_rtx_SUBREG (QImode, mask, 0);
-
- /* If merge is 0 then we're about to emit the z-masked variant. */
- if (const0_operand (merge, mode))
- emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
- /* If merge is the same as accum then emit the merge-masked
- variant. */
- else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
- {
- merge = force_reg (mode, merge);
- emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
- }
- /* Merging with an unknown value can happen when z-masking
- at -O0. */
- else
- {
- target = gen_reg_rtx (mode);
- emit_move_insn (target, merge);
- emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
- }
- }
- return target;
- }
- case IX86_BUILTIN_RDPID:
- return ix86_expand_special_args_builtin (bdesc_args + i, exp,
- target);
- case IX86_BUILTIN_FABSQ:
- case IX86_BUILTIN_COPYSIGNQ:
- if (!TARGET_SSE)
- /* Emit a normal call if SSE isn't available. */
- return expand_call (exp, target, ignore);
- /* FALLTHRU */
- default:
- return ix86_expand_args_builtin (bdesc_args + i, exp, target);
- }
- }
-
- if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
- && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
- {
- i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
- return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
- }
-
- if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
- && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
- {
- i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
- return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
- }
-
- if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
- && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
- {
- i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
- return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
- }
-
- if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
- && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
- {
- i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
- return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
- }
-
- if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
- && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
- {
- i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
- const struct builtin_description *d = bdesc_multi_arg + i;
- return ix86_expand_multi_arg_builtin (d->icode, exp, target,
- (enum ix86_builtin_func_type)
- d->flag, d->comparison);
- }
-
- if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
- && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
- {
- i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
- return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
- target);
- }
-
- if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
- && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
- {
- i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
- return ix86_expand_special_args_builtin (bdesc_cet_rdssp + i, exp,
- target);
- }
-
- gcc_unreachable ();
-}
-
-/* This returns the target-specific builtin with code CODE if
- current_function_decl has visibility on this builtin, which is checked
- using isa flags. Returns NULL_TREE otherwise. */
-
-static tree ix86_get_builtin (enum ix86_builtins code)
-{
- struct cl_target_option *opts;
- tree target_tree = NULL_TREE;
-
- /* Determine the isa flags of current_function_decl. */
-
- if (current_function_decl)
- target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
-
- if (target_tree == NULL)
- target_tree = target_option_default_node;
-
- opts = TREE_TARGET_OPTION (target_tree);
-
- if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
- || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2))
- return ix86_builtin_decl (code, true);
- else
- return NULL_TREE;
-}
-
-/* Returns a function decl for a vectorized version of the combined function
- with combined_fn code FN and the result vector type TYPE, or NULL_TREE
- if it is not available. */
-
-static tree
-ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
- tree type_in)
-{
- machine_mode in_mode, out_mode;
- int in_n, out_n;
-
- if (TREE_CODE (type_out) != VECTOR_TYPE
- || TREE_CODE (type_in) != VECTOR_TYPE)
- return NULL_TREE;
-
- out_mode = TYPE_MODE (TREE_TYPE (type_out));
- out_n = TYPE_VECTOR_SUBPARTS (type_out);
- in_mode = TYPE_MODE (TREE_TYPE (type_in));
- in_n = TYPE_VECTOR_SUBPARTS (type_in);
-
- switch (fn)
- {
- CASE_CFN_EXP2:
- if (out_mode == SFmode && in_mode == SFmode)
- {
- if (out_n == 16 && in_n == 16)
- return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
- }
- break;
-
- CASE_CFN_IFLOOR:
- CASE_CFN_LFLOOR:
- CASE_CFN_LLFLOOR:
- /* The round insn does not trap on denormals. */
- if (flag_trapping_math || !TARGET_SSE4_1)
- break;
-
- if (out_mode == SImode && in_mode == DFmode)
- {
- if (out_n == 4 && in_n == 2)
- return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
- else if (out_n == 8 && in_n == 4)
- return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
- else if (out_n == 16 && in_n == 8)
- return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
- }
- if (out_mode == SImode && in_mode == SFmode)
- {
- if (out_n == 4 && in_n == 4)
- return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
- else if (out_n == 8 && in_n == 8)
- return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
- else if (out_n == 16 && in_n == 16)
- return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
- }
- break;
-
- CASE_CFN_ICEIL:
- CASE_CFN_LCEIL:
- CASE_CFN_LLCEIL:
- /* The round insn does not trap on denormals. */
- if (flag_trapping_math || !TARGET_SSE4_1)
- break;
-
- if (out_mode == SImode && in_mode == DFmode)
- {
- if (out_n == 4 && in_n == 2)
- return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
- else if (out_n == 8 && in_n == 4)
- return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
- else if (out_n == 16 && in_n == 8)
- return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
- }
- if (out_mode == SImode && in_mode == SFmode)
- {
- if (out_n == 4 && in_n == 4)
- return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
- else if (out_n == 8 && in_n == 8)
- return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
- else if (out_n == 16 && in_n == 16)
- return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
- }
- break;
-
- CASE_CFN_IRINT:
- CASE_CFN_LRINT:
- CASE_CFN_LLRINT:
- if (out_mode == SImode && in_mode == DFmode)
- {
- if (out_n == 4 && in_n == 2)
- return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
- else if (out_n == 8 && in_n == 4)
- return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
- else if (out_n == 16 && in_n == 8)
- return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
- }
- if (out_mode == SImode && in_mode == SFmode)
- {
- if (out_n == 4 && in_n == 4)
- return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
- else if (out_n == 8 && in_n == 8)
- return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
- else if (out_n == 16 && in_n == 16)
- return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
- }
- break;
-
- CASE_CFN_IROUND:
- CASE_CFN_LROUND:
- CASE_CFN_LLROUND:
- /* The round insn does not trap on denormals. */
- if (flag_trapping_math || !TARGET_SSE4_1)
- break;
-
- if (out_mode == SImode && in_mode == DFmode)
- {
- if (out_n == 4 && in_n == 2)
- return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
- else if (out_n == 8 && in_n == 4)
- return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
- else if (out_n == 16 && in_n == 8)
- return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
- }
- if (out_mode == SImode && in_mode == SFmode)
- {
- if (out_n == 4 && in_n == 4)
- return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
- else if (out_n == 8 && in_n == 8)
- return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
- else if (out_n == 16 && in_n == 16)
- return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
- }
- break;
-
- CASE_CFN_FLOOR:
- /* The round insn does not trap on denormals. */
- if (flag_trapping_math || !TARGET_SSE4_1)
- break;
-
- if (out_mode == DFmode && in_mode == DFmode)
- {
- if (out_n == 2 && in_n == 2)
- return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
- else if (out_n == 4 && in_n == 4)
- return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
- else if (out_n == 8 && in_n == 8)
- return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
- }
- if (out_mode == SFmode && in_mode == SFmode)
- {
- if (out_n == 4 && in_n == 4)
- return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
- else if (out_n == 8 && in_n == 8)
- return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
- else if (out_n == 16 && in_n == 16)
- return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
- }
- break;
-
- CASE_CFN_CEIL:
- /* The round insn does not trap on denormals. */
- if (flag_trapping_math || !TARGET_SSE4_1)
- break;
-
- if (out_mode == DFmode && in_mode == DFmode)
- {
- if (out_n == 2 && in_n == 2)
- return ix86_get_builtin (IX86_BUILTIN_CEILPD);
- else if (out_n == 4 && in_n == 4)
- return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
- else if (out_n == 8 && in_n == 8)
- return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
- }
- if (out_mode == SFmode && in_mode == SFmode)
- {
- if (out_n == 4 && in_n == 4)
- return ix86_get_builtin (IX86_BUILTIN_CEILPS);
- else if (out_n == 8 && in_n == 8)
- return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
- else if (out_n == 16 && in_n == 16)
- return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
- }
- break;
-
- CASE_CFN_TRUNC:
- /* The round insn does not trap on denormals. */
- if (flag_trapping_math || !TARGET_SSE4_1)
- break;
-
- if (out_mode == DFmode && in_mode == DFmode)
- {
- if (out_n == 2 && in_n == 2)
- return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
- else if (out_n == 4 && in_n == 4)
- return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
- else if (out_n == 8 && in_n == 8)
- return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
- }
- if (out_mode == SFmode && in_mode == SFmode)
- {
- if (out_n == 4 && in_n == 4)
- return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
- else if (out_n == 8 && in_n == 8)
- return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
- else if (out_n == 16 && in_n == 16)
- return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
- }
- break;
-
- CASE_CFN_RINT:
- /* The round insn does not trap on denormals. */
- if (flag_trapping_math || !TARGET_SSE4_1)
- break;
-
- if (out_mode == DFmode && in_mode == DFmode)
- {
- if (out_n == 2 && in_n == 2)
- return ix86_get_builtin (IX86_BUILTIN_RINTPD);
- else if (out_n == 4 && in_n == 4)
- return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
- }
- if (out_mode == SFmode && in_mode == SFmode)
- {
- if (out_n == 4 && in_n == 4)
- return ix86_get_builtin (IX86_BUILTIN_RINTPS);
- else if (out_n == 8 && in_n == 8)
- return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
- }
- break;
-
- CASE_CFN_FMA:
- if (out_mode == DFmode && in_mode == DFmode)
- {
- if (out_n == 2 && in_n == 2)
- return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
- if (out_n == 4 && in_n == 4)
- return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
- }
- if (out_mode == SFmode && in_mode == SFmode)
- {
- if (out_n == 4 && in_n == 4)
- return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
- if (out_n == 8 && in_n == 8)
- return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
- }
- break;
-
- default:
- break;
- }
-
- /* Dispatch to a handler for a vectorization library. */
- if (ix86_veclib_handler)
- return ix86_veclib_handler (combined_fn (fn), type_out, type_in);
-
- return NULL_TREE;
-}
-
-/* Handler for an SVML-style interface to
- a library with vectorized intrinsics. */
-
-static tree
-ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
-{
- char name[20];
- tree fntype, new_fndecl, args;
- unsigned arity;
- const char *bname;
- machine_mode el_mode, in_mode;
- int n, in_n;
-
- /* The SVML is suitable for unsafe math only. */
- if (!flag_unsafe_math_optimizations)
- return NULL_TREE;
-
- el_mode = TYPE_MODE (TREE_TYPE (type_out));
- n = TYPE_VECTOR_SUBPARTS (type_out);
- in_mode = TYPE_MODE (TREE_TYPE (type_in));
- in_n = TYPE_VECTOR_SUBPARTS (type_in);
- if (el_mode != in_mode
- || n != in_n)
- return NULL_TREE;
-
- switch (fn)
- {
- CASE_CFN_EXP:
- CASE_CFN_LOG:
- CASE_CFN_LOG10:
- CASE_CFN_POW:
- CASE_CFN_TANH:
- CASE_CFN_TAN:
- CASE_CFN_ATAN:
- CASE_CFN_ATAN2:
- CASE_CFN_ATANH:
- CASE_CFN_CBRT:
- CASE_CFN_SINH:
- CASE_CFN_SIN:
- CASE_CFN_ASINH:
- CASE_CFN_ASIN:
- CASE_CFN_COSH:
- CASE_CFN_COS:
- CASE_CFN_ACOSH:
- CASE_CFN_ACOS:
- if ((el_mode != DFmode || n != 2)
- && (el_mode != SFmode || n != 4))
- return NULL_TREE;
- break;
-
- default:
- return NULL_TREE;
- }
-
- tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
- bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
-
- if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
- strcpy (name, "vmlsLn4");
- else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
- strcpy (name, "vmldLn2");
- else if (n == 4)
- {
- sprintf (name, "vmls%s", bname+10);
- name[strlen (name)-1] = '4';
- }
- else
- sprintf (name, "vmld%s2", bname+10);
-
- /* Convert to uppercase. */
- name[4] &= ~0x20;
-
- arity = 0;
- for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
- arity++;
-
- if (arity == 1)
- fntype = build_function_type_list (type_out, type_in, NULL);
- else
- fntype = build_function_type_list (type_out, type_in, type_in, NULL);
-
- /* Build a function declaration for the vectorized function. */
- new_fndecl = build_decl (BUILTINS_LOCATION,
- FUNCTION_DECL, get_identifier (name), fntype);
- TREE_PUBLIC (new_fndecl) = 1;
- DECL_EXTERNAL (new_fndecl) = 1;
- DECL_IS_NOVOPS (new_fndecl) = 1;
- TREE_READONLY (new_fndecl) = 1;
-
- return new_fndecl;
-}
-
-/* Handler for an ACML-style interface to
- a library with vectorized intrinsics. */
-
-static tree
-ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
-{
- char name[20] = "__vr.._";
- tree fntype, new_fndecl, args;
- unsigned arity;
- const char *bname;
- machine_mode el_mode, in_mode;
- int n, in_n;
-
- /* ACML is 64-bit only and suitable for unsafe math only, as it
- does not correctly support parts of IEEE (such as denormals)
- with the required precision. */
- if (!TARGET_64BIT
- || !flag_unsafe_math_optimizations)
- return NULL_TREE;
-
- el_mode = TYPE_MODE (TREE_TYPE (type_out));
- n = TYPE_VECTOR_SUBPARTS (type_out);
- in_mode = TYPE_MODE (TREE_TYPE (type_in));
- in_n = TYPE_VECTOR_SUBPARTS (type_in);
- if (el_mode != in_mode
- || n != in_n)
- return NULL_TREE;
-
- switch (fn)
- {
- CASE_CFN_SIN:
- CASE_CFN_COS:
- CASE_CFN_EXP:
- CASE_CFN_LOG:
- CASE_CFN_LOG2:
- CASE_CFN_LOG10:
- if (el_mode == DFmode && n == 2)
- {
- name[4] = 'd';
- name[5] = '2';
- }
- else if (el_mode == SFmode && n == 4)
- {
- name[4] = 's';
- name[5] = '4';
- }
- else
- return NULL_TREE;
- break;
-
- default:
- return NULL_TREE;
- }
-
- tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
- bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
- sprintf (name + 7, "%s", bname+10);
-
- arity = 0;
- for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
- arity++;
-
- if (arity == 1)
- fntype = build_function_type_list (type_out, type_in, NULL);
- else
- fntype = build_function_type_list (type_out, type_in, type_in, NULL);
-
- /* Build a function declaration for the vectorized function. */
- new_fndecl = build_decl (BUILTINS_LOCATION,
- FUNCTION_DECL, get_identifier (name), fntype);
- TREE_PUBLIC (new_fndecl) = 1;
- DECL_EXTERNAL (new_fndecl) = 1;
- DECL_IS_NOVOPS (new_fndecl) = 1;
- TREE_READONLY (new_fndecl) = 1;
-
- return new_fndecl;
-}
-
- /* Returns a decl of a function that implements a gather load with
- memory type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
- Return NULL_TREE if it is not available. */
-
-static tree
-ix86_vectorize_builtin_gather (const_tree mem_vectype,
- const_tree index_type, int scale)
-{
- bool si;
- enum ix86_builtins code;
-
- if (! TARGET_AVX2 || !TARGET_USE_GATHER)
- return NULL_TREE;
-
- if ((TREE_CODE (index_type) != INTEGER_TYPE
- && !POINTER_TYPE_P (index_type))
- || (TYPE_MODE (index_type) != SImode
- && TYPE_MODE (index_type) != DImode))
- return NULL_TREE;
-
- if (TYPE_PRECISION (index_type) > POINTER_SIZE)
- return NULL_TREE;
-
- /* v*gather* insn sign extends index to pointer mode. */
- if (TYPE_PRECISION (index_type) < POINTER_SIZE
- && TYPE_UNSIGNED (index_type))
- return NULL_TREE;
-
- if (scale <= 0
- || scale > 8
- || (scale & (scale - 1)) != 0)
- return NULL_TREE;
-
- si = TYPE_MODE (index_type) == SImode;
- switch (TYPE_MODE (mem_vectype))
- {
- case E_V2DFmode:
- if (TARGET_AVX512VL)
- code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF;
- else
- code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
- break;
- case E_V4DFmode:
- if (TARGET_AVX512VL)
- code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF;
- else
- code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
- break;
- case E_V2DImode:
- if (TARGET_AVX512VL)
- code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI;
- else
- code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
- break;
- case E_V4DImode:
- if (TARGET_AVX512VL)
- code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI;
- else
- code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
- break;
- case E_V4SFmode:
- if (TARGET_AVX512VL)
- code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF;
- else
- code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
- break;
- case E_V8SFmode:
- if (TARGET_AVX512VL)
- code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF;
- else
- code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
- break;
- case E_V4SImode:
- if (TARGET_AVX512VL)
- code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI;
- else
- code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
- break;
- case E_V8SImode:
- if (TARGET_AVX512VL)
- code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI;
- else
- code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
- break;
- case E_V8DFmode:
- if (TARGET_AVX512F)
- code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
- else
- return NULL_TREE;
- break;
- case E_V8DImode:
- if (TARGET_AVX512F)
- code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
- else
- return NULL_TREE;
- break;
- case E_V16SFmode:
- if (TARGET_AVX512F)
- code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
- else
- return NULL_TREE;
- break;
- case E_V16SImode:
- if (TARGET_AVX512F)
- code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
- else
- return NULL_TREE;
- break;
- default:
- return NULL_TREE;
- }
-
- return ix86_get_builtin (code);
-}
-
- /* Returns a decl of a function that implements a scatter store with
- register type VECTYPE, index type INDEX_TYPE and scale SCALE.
- Return NULL_TREE if it is not available. */
-
-static tree
-ix86_vectorize_builtin_scatter (const_tree vectype,
- const_tree index_type, int scale)
-{
- bool si;
- enum ix86_builtins code;
-
- if (!TARGET_AVX512F)
- return NULL_TREE;
-
- if ((TREE_CODE (index_type) != INTEGER_TYPE
- && !POINTER_TYPE_P (index_type))
- || (TYPE_MODE (index_type) != SImode
- && TYPE_MODE (index_type) != DImode))
- return NULL_TREE;
-
- if (TYPE_PRECISION (index_type) > POINTER_SIZE)
- return NULL_TREE;
-
- /* v*scatter* insn sign extends index to pointer mode. */
- if (TYPE_PRECISION (index_type) < POINTER_SIZE
- && TYPE_UNSIGNED (index_type))
- return NULL_TREE;
-
- /* Scale can be 1, 2, 4 or 8. */
- if (scale <= 0
- || scale > 8
- || (scale & (scale - 1)) != 0)
- return NULL_TREE;
-
- si = TYPE_MODE (index_type) == SImode;
- switch (TYPE_MODE (vectype))
- {
- case E_V8DFmode:
- code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
- break;
- case E_V8DImode:
- code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
- break;
- case E_V16SFmode:
- code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
- break;
- case E_V16SImode:
- code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
- break;
- case E_V4DFmode:
- if (TARGET_AVX512VL)
- code = si ? IX86_BUILTIN_SCATTERALTSIV4DF : IX86_BUILTIN_SCATTERDIV4DF;
- else
- return NULL_TREE;
- break;
- case E_V4DImode:
- if (TARGET_AVX512VL)
- code = si ? IX86_BUILTIN_SCATTERALTSIV4DI : IX86_BUILTIN_SCATTERDIV4DI;
- else
- return NULL_TREE;
- break;
- case E_V8SFmode:
- if (TARGET_AVX512VL)
- code = si ? IX86_BUILTIN_SCATTERSIV8SF : IX86_BUILTIN_SCATTERALTDIV8SF;
- else
- return NULL_TREE;
- break;
- case E_V8SImode:
- if (TARGET_AVX512VL)
- code = si ? IX86_BUILTIN_SCATTERSIV8SI : IX86_BUILTIN_SCATTERALTDIV8SI;
- else
- return NULL_TREE;
- break;
- case E_V2DFmode:
- if (TARGET_AVX512VL)
- code = si ? IX86_BUILTIN_SCATTERALTSIV2DF : IX86_BUILTIN_SCATTERDIV2DF;
- else
- return NULL_TREE;
- break;
- case E_V2DImode:
- if (TARGET_AVX512VL)
- code = si ? IX86_BUILTIN_SCATTERALTSIV2DI : IX86_BUILTIN_SCATTERDIV2DI;
- else
- return NULL_TREE;
- break;
- case E_V4SFmode:
- if (TARGET_AVX512VL)
- code = si ? IX86_BUILTIN_SCATTERSIV4SF : IX86_BUILTIN_SCATTERALTDIV4SF;
- else
- return NULL_TREE;
- break;
- case E_V4SImode:
- if (TARGET_AVX512VL)
- code = si ? IX86_BUILTIN_SCATTERSIV4SI : IX86_BUILTIN_SCATTERALTDIV4SI;
- else
- return NULL_TREE;
- break;
- default:
- return NULL_TREE;
- }
-
- return ix86_builtins[code];
-}
-
-/* Return true if it is safe to use the rsqrt optabs to optimize
- 1.0/sqrt. */
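-/* These conditions are, for example, all satisfied when SSE math is in
- use and -ffast-math (which implies -ffinite-math-only, -fno-trapping-math
- and -funsafe-math-optimizations) is in effect. */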
-
-static bool
-use_rsqrt_p ()
-{
- return (TARGET_SSE && TARGET_SSE_MATH
- && flag_finite_math_only
- && !flag_trapping_math
- && flag_unsafe_math_optimizations);
-}
-
-/* Returns a decl of a target-specific builtin that implements the
- reciprocal of the function FNDECL, or NULL_TREE if it is not available. */
-
-static tree
-ix86_builtin_reciprocal (tree fndecl)
-{
- enum ix86_builtins fn_code
- = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
- switch (fn_code)
- {
- /* Vectorized version of sqrt to rsqrt conversion. */
- case IX86_BUILTIN_SQRTPS_NR:
- return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
-
- case IX86_BUILTIN_SQRTPS_NR256:
- return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
-
- default:
- return NULL_TREE;
- }
-}
-\f
-/* Helper for avx_vpermilps256_operand et al. This is also used by
- the expansion functions to turn the parallel back into a mask.
- The return value is 0 for no match and the imm8+1 for a match. */
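-/* As an illustrative example, for V4SFmode the selector (parallel [2 3 0 1])
- encodes the imm8 0x4e (each two-bit field picks a source element), so the
- function returns 0x4f. */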
-
-int
-avx_vpermilp_parallel (rtx par, machine_mode mode)
-{
- unsigned i, nelt = GET_MODE_NUNITS (mode);
- unsigned mask = 0;
- unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */
-
- if (XVECLEN (par, 0) != (int) nelt)
- return 0;
-
- /* Validate that all of the elements are constants, and not totally
- out of range. Copy the data into an integral array to make the
- subsequent checks easier. */
- for (i = 0; i < nelt; ++i)
- {
- rtx er = XVECEXP (par, 0, i);
- unsigned HOST_WIDE_INT ei;
-
- if (!CONST_INT_P (er))
- return 0;
- ei = INTVAL (er);
- if (ei >= nelt)
- return 0;
- ipar[i] = ei;
- }
-
- switch (mode)
- {
- case E_V8DFmode:
- /* In the 512-bit DFmode case, we can only move elements within
- a 128-bit lane. First fill the second part of the mask,
- then fallthru. */
- for (i = 4; i < 6; ++i)
- {
- if (ipar[i] < 4 || ipar[i] >= 6)
- return 0;
- mask |= (ipar[i] - 4) << i;
- }
- for (i = 6; i < 8; ++i)
- {
- if (ipar[i] < 6)
- return 0;
- mask |= (ipar[i] - 6) << i;
- }
- /* FALLTHRU */
-
- case E_V4DFmode:
- /* In the 256-bit DFmode case, we can only move elements within
- a 128-bit lane. */
- for (i = 0; i < 2; ++i)
- {
- if (ipar[i] >= 2)
- return 0;
- mask |= ipar[i] << i;
- }
- for (i = 2; i < 4; ++i)
- {
- if (ipar[i] < 2)
- return 0;
- mask |= (ipar[i] - 2) << i;
- }
- break;
-
- case E_V16SFmode:
- /* In the 512-bit SFmode case, the permutation in the upper 256 bits
- must mirror the permutation in the lower 256 bits. */
- for (i = 0; i < 8; ++i)
- if (ipar[i] + 8 != ipar[i + 8])
- return 0;
- /* FALLTHRU */
-
- case E_V8SFmode:
- /* In the 256-bit SFmode case, we have full freedom of
- movement within the low 128-bit lane, but the high 128-bit
- lane must mirror the exact same pattern. */
- for (i = 0; i < 4; ++i)
- if (ipar[i] + 4 != ipar[i + 4])
- return 0;
- nelt = 4;
- /* FALLTHRU */
-
- case E_V2DFmode:
- case E_V4SFmode:
- /* In the 128-bit case, we have full freedom in the placement of
- the elements from the source operand. */
- for (i = 0; i < nelt; ++i)
- mask |= ipar[i] << (i * (nelt / 2));
- break;
-
- default:
- gcc_unreachable ();
- }
-
- /* Make sure success has a non-zero value by adding one. */
- return mask + 1;
-}
-
-/* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
- the expansion functions to turn the parallel back into a mask.
- The return value is 0 for no match and the imm8+1 for a match. */
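-/* For instance, for V8SFmode the selector (parallel [4 5 6 7 8 9 10 11]),
- i.e. the high half of operand 0 followed by the low half of operand 1,
- encodes the imm8 0x21, so the function returns 0x22. */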
-
-int
-avx_vperm2f128_parallel (rtx par, machine_mode mode)
-{
- unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
- unsigned mask = 0;
- unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
-
- if (XVECLEN (par, 0) != (int) nelt)
- return 0;
-
- /* Validate that all of the elements are constants, and not totally
- out of range. Copy the data into an integral array to make the
- subsequent checks easier. */
- for (i = 0; i < nelt; ++i)
- {
- rtx er = XVECEXP (par, 0, i);
- unsigned HOST_WIDE_INT ei;
-
- if (!CONST_INT_P (er))
- return 0;
- ei = INTVAL (er);
- if (ei >= 2 * nelt)
- return 0;
- ipar[i] = ei;
- }
-
- /* Validate that each half of the permute selects consecutive elements. */
- for (i = 0; i < nelt2 - 1; ++i)
- if (ipar[i] + 1 != ipar[i + 1])
- return 0;
- for (i = nelt2; i < nelt - 1; ++i)
- if (ipar[i] + 1 != ipar[i + 1])
- return 0;
-
- /* Reconstruct the mask. */
- for (i = 0; i < 2; ++i)
- {
- unsigned e = ipar[i * nelt2];
- if (e % nelt2)
- return 0;
- e /= nelt2;
- mask |= e << (i * 4);
- }
-
- /* Make sure success has a non-zero value by adding one. */
- return mask + 1;
-}
-\f
-/* Return a register priority for hard reg REGNO. */
-static int
-ix86_register_priority (int hard_regno)
-{
- /* ebp and r13 as a base always want a displacement, and r12 as a
- base always wants an index. So discourage their use in an
- address. */
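- /* (In the instruction encoding, ebp/r13 as a base cannot use the
- zero-displacement form, so a displacement byte is emitted anyway, and
- esp/r12 as a base always requires a SIB byte.) */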
- if (hard_regno == R12_REG || hard_regno == R13_REG)
- return 0;
- if (hard_regno == BP_REG)
- return 1;
- /* New x86-64 int registers result in bigger code size. Discourage
- them. */
- if (IN_RANGE (hard_regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
- return 2;
- /* New x86-64 SSE registers result in bigger code size. Discourage
- them. */
- if (IN_RANGE (hard_regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
- return 2;
- if (IN_RANGE (hard_regno, FIRST_EXT_REX_SSE_REG, LAST_EXT_REX_SSE_REG))
- return 1;
- /* Usage of AX register results in smaller code. Prefer it. */
- if (hard_regno == AX_REG)
- return 4;
- return 3;
-}
-
-/* Implement TARGET_PREFERRED_RELOAD_CLASS.
-
- Put float CONST_DOUBLE in the constant pool instead of fp regs.
- QImode must go into class Q_REGS.
- Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
- movdf to do mem-to-mem moves through integer regs. */
-
-static reg_class_t
-ix86_preferred_reload_class (rtx x, reg_class_t regclass)
-{
- machine_mode mode = GET_MODE (x);
-
- /* We're only allowed to return a subclass of CLASS. Many of the
- following checks fail for NO_REGS, so eliminate that early. */
- if (regclass == NO_REGS)
- return NO_REGS;
-
- /* All classes can load zeros. */
- if (x == CONST0_RTX (mode))
- return regclass;
-
- /* Force constants into memory if we are loading a (nonzero) constant into
- an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
- instructions to load from a constant. */
- if (CONSTANT_P (x)
- && (MAYBE_MMX_CLASS_P (regclass)
- || MAYBE_SSE_CLASS_P (regclass)
- || MAYBE_MASK_CLASS_P (regclass)))
- return NO_REGS;
-
- /* Floating-point constants need more complex checks. */
- if (CONST_DOUBLE_P (x))
- {
- /* General regs can load everything. */
- if (INTEGER_CLASS_P (regclass))
- return regclass;
-
- /* Floats can load 0 and 1 plus some others. Note that we eliminated
- zero above. We only want to wind up preferring 80387 registers if
- we plan on doing computation with them. */
- if (IS_STACK_MODE (mode)
- && standard_80387_constant_p (x) > 0)
- {
- /* Limit class to FP regs. */
- if (FLOAT_CLASS_P (regclass))
- return FLOAT_REGS;
- }
-
- return NO_REGS;
- }
-
- /* Prefer SSE regs only, if we can use them for math. */
- if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
- return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
-
- /* Generally when we see PLUS here, it's the function invariant
- (plus soft-fp const_int), which can only be computed into general
- regs. */
- if (GET_CODE (x) == PLUS)
- return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
-
- /* QImode constants are easy to load, but non-constant QImode data
- must go into Q_REGS. */
- if (GET_MODE (x) == QImode && !CONSTANT_P (x))
- {
- if (Q_CLASS_P (regclass))
- return regclass;
- else if (reg_class_subset_p (Q_REGS, regclass))
- return Q_REGS;
- else
- return NO_REGS;
- }
-
- return regclass;
-}
-
-/* Discourage putting floating-point values in SSE registers unless
- SSE math is being used, and likewise for the 387 registers. */
-static reg_class_t
-ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
-{
- machine_mode mode = GET_MODE (x);
-
- /* Restrict the output reload class to the register bank that we are doing
- math on. If we would like not to return a subset of CLASS, reject this
- alternative: if reload cannot do this, it will still use its choice. */
- if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
- return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
-
- if (IS_STACK_MODE (mode))
- return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
-
- return regclass;
-}
-
-static reg_class_t
-ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
- machine_mode mode, secondary_reload_info *sri)
-{
- /* Double-word spills from general registers to non-offsettable memory
- references (zero-extended addresses) require special handling. */
- if (TARGET_64BIT
- && MEM_P (x)
- && GET_MODE_SIZE (mode) > UNITS_PER_WORD
- && INTEGER_CLASS_P (rclass)
- && !offsettable_memref_p (x))
- {
- sri->icode = (in_p
- ? CODE_FOR_reload_noff_load
- : CODE_FOR_reload_noff_store);
- /* Add the cost of moving address to a temporary. */
- sri->extra_cost = 1;
-
- return NO_REGS;
- }
-
- /* QImode spills from non-QI registers require an
- intermediate register on 32-bit targets. */
- if (mode == QImode
- && ((!TARGET_64BIT && !in_p
- && INTEGER_CLASS_P (rclass)
- && MAYBE_NON_Q_CLASS_P (rclass))
- || (!TARGET_AVX512DQ
- && MAYBE_MASK_CLASS_P (rclass))))
- {
- int regno = true_regnum (x);
-
- /* Return Q_REGS if the operand is in memory. */
- if (regno == -1)
- return Q_REGS;
-
- return NO_REGS;
- }
-
- /* This condition handles the corner case where an expression involving
- pointers gets vectorized. We're trying to use the address of a
- stack slot as a vector initializer.
-
- (set (reg:V2DI 74 [ vect_cst_.2 ])
- (vec_duplicate:V2DI (reg/f:DI 20 frame)))
-
- Eventually frame gets turned into sp+offset like this:
-
- (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
- (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
- (const_int 392 [0x188]))))
-
- That later gets turned into:
-
- (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
- (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
- (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
-
- We'll have the following reload recorded:
-
- Reload 0: reload_in (DI) =
- (plus:DI (reg/f:DI 7 sp)
- (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
- reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
- SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
- reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
- reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
- reload_reg_rtx: (reg:V2DI 22 xmm1)
-
- This isn't going to work since SSE instructions can't handle scalar
- additions. Returning GENERAL_REGS forces the addition into an integer
- register, and reload can handle the subsequent reloads without problems. */
-
- if (in_p && GET_CODE (x) == PLUS
- && SSE_CLASS_P (rclass)
- && SCALAR_INT_MODE_P (mode))
- return GENERAL_REGS;
-
- return NO_REGS;
-}
-
-/* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
-
-static bool
-ix86_class_likely_spilled_p (reg_class_t rclass)
-{
- switch (rclass)
- {
- case AREG:
- case DREG:
- case CREG:
- case BREG:
- case AD_REGS:
- case SIREG:
- case DIREG:
- case SSE_FIRST_REG:
- case FP_TOP_REG:
- case FP_SECOND_REG:
- return true;
-
- default:
- break;
- }
-
- return false;
-}
-
-/* If we are copying between registers from different register sets
- (e.g. FP and integer), we may need a memory location.
-
- The function can't work reliably when one of the CLASSES is a class
- containing registers from multiple sets. We avoid this by never combining
- different sets in a single alternative in the machine description.
- Ensure that this constraint holds to avoid unexpected surprises.
-
- When STRICT is false, we are being called from REGISTER_MOVE_COST,
- so do not enforce these sanity checks.
-
- To optimize register_move_cost performance, define an inline variant. */
-
-static inline bool
-inline_secondary_memory_needed (machine_mode mode, reg_class_t class1,
- reg_class_t class2, int strict)
-{
- if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
- return false;
-
- if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
- || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
- || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
- || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
- || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
- || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2)
- || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1)
- || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2))
- {
- gcc_assert (!strict || lra_in_progress);
- return true;
- }
-
- if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
- return true;
-
- /* Between mask and general, we have moves no larger than word size. */
- if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2))
- && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
- return true;
-
- /* ??? This is a lie. We do have moves between mmx/general and between
- mmx/sse2. But by saying we need secondary memory we discourage the
- register allocator from using the mmx registers unless needed. */
- if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
- return true;
-
- if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
- {
- /* SSE1 doesn't have any direct moves from other classes. */
- if (!TARGET_SSE2)
- return true;
-
- /* If the target says that inter-unit moves are more expensive
- than moving through memory, then don't generate them. */
- if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
- || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
- return true;
-
- /* Between SSE and general, we have moves no larger than word size. */
- if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
- return true;
- }
-
- return false;
-}
-
-/* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
-
-static bool
-ix86_secondary_memory_needed (machine_mode mode, reg_class_t class1,
- reg_class_t class2)
-{
- return inline_secondary_memory_needed (mode, class1, class2, true);
-}
-
-/* Implement TARGET_SECONDARY_MEMORY_NEEDED_MODE.
-
- get_secondary_mem widens integral modes to BITS_PER_WORD.
- There is no need to emit a full 64-bit move on 64-bit targets
- for integral modes that can be moved using a 32-bit move. */
-
-static machine_mode
-ix86_secondary_memory_needed_mode (machine_mode mode)
-{
- if (GET_MODE_BITSIZE (mode) < 32 && INTEGRAL_MODE_P (mode))
- return mode_for_size (32, GET_MODE_CLASS (mode), 0).require ();
- return mode;
-}
-
-/* Implement the TARGET_CLASS_MAX_NREGS hook.
-
- On the 80386, this is the size of MODE in words,
- except in the FP regs, where a single reg is always enough. */
-
-static unsigned char
-ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
-{
- if (MAYBE_INTEGER_CLASS_P (rclass))
- {
- if (mode == XFmode)
- return (TARGET_64BIT ? 2 : 3);
- else if (mode == XCmode)
- return (TARGET_64BIT ? 4 : 6);
- else
- return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
- }
- else
- {
- if (COMPLEX_MODE_P (mode))
- return 2;
- else
- return 1;
- }
-}
-
-/* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
-
-static bool
-ix86_can_change_mode_class (machine_mode from, machine_mode to,
- reg_class_t regclass)
-{
- if (from == to)
- return true;
-
- /* x87 registers can't do subreg at all, as all values are reformatted
- to extended precision. */
- if (MAYBE_FLOAT_CLASS_P (regclass))
- return false;
-
- if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
- {
- /* Vector registers do not support QI or HImode loads. If we don't
- disallow a change to these modes, reload will assume it's ok to
- drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
- the vec_dupv4hi pattern. */
- if (GET_MODE_SIZE (from) < 4)
- return false;
- }
-
- return true;
-}
-
-/* Return index of MODE in the sse load/store tables. */
-
-static inline int
-sse_store_index (machine_mode mode)
-{
- switch (GET_MODE_SIZE (mode))
- {
- case 4:
- return 0;
- case 8:
- return 1;
- case 16:
- return 2;
- case 32:
- return 3;
- case 64:
- return 4;
- default:
- return -1;
- }
-}
-
-/* Return the cost of moving data of mode M between a
- register and memory. A value of 2 is the default; this cost is
- relative to those in `REGISTER_MOVE_COST'.
-
- This function is used extensively by register_move_cost that is used to
- build tables at startup. Make it inline in this case.
- When IN is 2, return maximum of in and out move cost.
-
- If moving between registers and memory is more expensive than
- between two registers, you should define this macro to express the
- relative cost.
-
- Also model the increased cost of moving QImode registers in non-Q_REGS
- classes.
- */
-static inline int
-inline_memory_move_cost (machine_mode mode, enum reg_class regclass, int in)
-{
- int cost;
- if (FLOAT_CLASS_P (regclass))
- {
- int index;
- switch (mode)
- {
- case E_SFmode:
- index = 0;
- break;
- case E_DFmode:
- index = 1;
- break;
- case E_XFmode:
- index = 2;
- break;
- default:
- return 100;
- }
- if (in == 2)
- return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
- return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
- }
- if (SSE_CLASS_P (regclass))
- {
- int index = sse_store_index (mode);
- if (index == -1)
- return 100;
- if (in == 2)
- return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
- return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
- }
- if (MMX_CLASS_P (regclass))
- {
- int index;
- switch (GET_MODE_SIZE (mode))
- {
- case 4:
- index = 0;
- break;
- case 8:
- index = 1;
- break;
- default:
- return 100;
- }
- if (in == 2)
- return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
- return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
- }
- switch (GET_MODE_SIZE (mode))
- {
- case 1:
- if (Q_CLASS_P (regclass) || TARGET_64BIT)
- {
- if (!in)
- return ix86_cost->int_store[0];
- if (TARGET_PARTIAL_REG_DEPENDENCY
- && optimize_function_for_speed_p (cfun))
- cost = ix86_cost->movzbl_load;
- else
- cost = ix86_cost->int_load[0];
- if (in == 2)
- return MAX (cost, ix86_cost->int_store[0]);
- return cost;
- }
- else
- {
- if (in == 2)
- return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
- if (in)
- return ix86_cost->movzbl_load;
- else
- return ix86_cost->int_store[0] + 4;
- }
- break;
- case 2:
- if (in == 2)
- return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
- return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
- default:
- if (in == 2)
- cost = MAX (ix86_cost->int_load[2], ix86_cost->int_store[2]);
- else if (in)
- cost = ix86_cost->int_load[2];
- else
- cost = ix86_cost->int_store[2];
- /* Multiply with the number of GPR moves needed. */
- return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
- }
-}
-
-static int
-ix86_memory_move_cost (machine_mode mode, reg_class_t regclass, bool in)
-{
- return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
-}
-
-
-/* Return the cost of moving data from a register in class CLASS1 to
- one in class CLASS2.
-
- It is not required that the cost always equal 2 when FROM is the same as TO;
- on some machines it is expensive to move between registers if they are not
- general registers. */
-
-static int
-ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
- reg_class_t class2_i)
-{
- enum reg_class class1 = (enum reg_class) class1_i;
- enum reg_class class2 = (enum reg_class) class2_i;
-
- /* In case we require secondary memory, compute cost of the store followed
- by load. In order to avoid bad register allocation choices, we need
- for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
-
- if (inline_secondary_memory_needed (mode, class1, class2, false))
- {
- int cost = 1;
-
- cost += inline_memory_move_cost (mode, class1, 2);
- cost += inline_memory_move_cost (mode, class2, 2);
-
- /* When copying from a general purpose register we may emit multiple
- stores followed by a single load, causing a memory size mismatch stall.
- Count this as an arbitrarily high cost of 20. */
- if (GET_MODE_BITSIZE (mode) > BITS_PER_WORD
- && TARGET_MEMORY_MISMATCH_STALL
- && targetm.class_max_nregs (class1, mode)
- > targetm.class_max_nregs (class2, mode))
- cost += 20;
-
- /* In the case of FP/MMX moves, the registers actually overlap, and we
- have to switch modes in order to treat them differently. */
- if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
- || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
- cost += 20;
-
- return cost;
- }
-
- /* Moves between SSE/MMX and integer unit are expensive. */
- if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
- || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
-
- /* ??? By keeping returned value relatively high, we limit the number
- of moves between integer and MMX/SSE registers for all targets.
- Additionally, high value prevents problem with x86_modes_tieable_p(),
- where integer modes in MMX/SSE registers are not tieable
- because of missing QImode and HImode moves to, from or between
- MMX/SSE registers. */
- return MAX (8, MMX_CLASS_P (class1) || MMX_CLASS_P (class2)
- ? ix86_cost->mmxsse_to_integer : ix86_cost->ssemmx_to_integer);
-
- if (MAYBE_FLOAT_CLASS_P (class1))
- return ix86_cost->fp_move;
- if (MAYBE_SSE_CLASS_P (class1))
- {
- if (GET_MODE_BITSIZE (mode) <= 128)
- return ix86_cost->xmm_move;
- if (GET_MODE_BITSIZE (mode) <= 256)
- return ix86_cost->ymm_move;
- return ix86_cost->zmm_move;
- }
- if (MAYBE_MMX_CLASS_P (class1))
- return ix86_cost->mmx_move;
- return 2;
-}
-
-/* Implement TARGET_HARD_REGNO_NREGS. This is ordinarily the length in
- words of a value of mode MODE but can be less for certain modes in
- special long registers.
-
- Actually there are no two word move instructions for consecutive
- registers. And only registers 0-3 may have mov byte instructions
- applied to them. */
-
-static unsigned int
-ix86_hard_regno_nregs (unsigned int regno, machine_mode mode)
-{
- if (GENERAL_REGNO_P (regno))
- {
- if (mode == XFmode)
- return TARGET_64BIT ? 2 : 3;
- if (mode == XCmode)
- return TARGET_64BIT ? 4 : 6;
- return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
- }
- if (COMPLEX_MODE_P (mode))
- return 2;
- if (mode == V64SFmode || mode == V64SImode)
- return 4;
- return 1;
-}
-
-/* Implement TARGET_HARD_REGNO_MODE_OK. */
-
-static bool
-ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
-{
- /* The flags register, and only the flags register, can hold CCmode values. */
- if (CC_REGNO_P (regno))
- return GET_MODE_CLASS (mode) == MODE_CC;
- if (GET_MODE_CLASS (mode) == MODE_CC
- || GET_MODE_CLASS (mode) == MODE_RANDOM
- || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
- return false;
- if (STACK_REGNO_P (regno))
- return VALID_FP_MODE_P (mode);
- if (MASK_REGNO_P (regno))
- return (VALID_MASK_REG_MODE (mode)
- || (TARGET_AVX512BW
- && VALID_MASK_AVX512BW_MODE (mode)));
- if (SSE_REGNO_P (regno))
- {
- /* We implement the move patterns for all vector modes into and
- out of SSE registers, even when no operation instructions
- are available. */
-
- /* For AVX-512 we allow, regardless of regno:
- - XI mode
- - any of 512-bit wide vector mode
- - any scalar mode. */
- if (TARGET_AVX512F
- && (mode == XImode
- || VALID_AVX512F_REG_MODE (mode)
- || VALID_AVX512F_SCALAR_MODE (mode)))
- return true;
-
- /* For AVX-5124FMAPS or AVX-5124VNNIW
- allow V64SF and V64SI modes for special regnos. */
- if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
- && (mode == V64SFmode || mode == V64SImode)
- && MOD4_SSE_REGNO_P (regno))
- return true;
-
- /* TODO check for QI/HI scalars. */
- /* AVX512VL allows sse regs16+ for 128/256 bit modes. */
- if (TARGET_AVX512VL
- && (mode == OImode
- || mode == TImode
- || VALID_AVX256_REG_MODE (mode)
- || VALID_AVX512VL_128_REG_MODE (mode)))
- return true;
-
- /* xmm16-xmm31 are only available for AVX-512. */
- if (EXT_REX_SSE_REGNO_P (regno))
- return false;
-
- /* OImode and AVX modes are available only when AVX is enabled. */
- return ((TARGET_AVX
- && VALID_AVX256_REG_OR_OI_MODE (mode))
- || VALID_SSE_REG_MODE (mode)
- || VALID_SSE2_REG_MODE (mode)
- || VALID_MMX_REG_MODE (mode)
- || VALID_MMX_REG_MODE_3DNOW (mode));
- }
- if (MMX_REGNO_P (regno))
- {
- /* We implement the move patterns for 3DNOW modes even in MMX mode,
- so if the register is available at all, then we can move data of
- the given mode into or out of it. */
- return (VALID_MMX_REG_MODE (mode)
- || VALID_MMX_REG_MODE_3DNOW (mode));
- }
-
- if (mode == QImode)
- {
- /* Take care for QImode values - they can be in non-QI regs,
- but then they do cause partial register stalls. */
- if (ANY_QI_REGNO_P (regno))
- return true;
- if (!TARGET_PARTIAL_REG_STALL)
- return true;
- /* LRA checks if the hard register is OK for the given mode.
- QImode values can live in non-QI regs, so we allow all
- registers here. */
- if (lra_in_progress)
- return true;
- return !can_create_pseudo_p ();
- }
- /* We handle both integers and floats in the general purpose registers. */
- else if (VALID_INT_MODE_P (mode))
- return true;
- else if (VALID_FP_MODE_P (mode))
- return true;
- else if (VALID_DFP_MODE_P (mode))
- return true;
- /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
- on to use that value in smaller contexts, this can easily force a
- pseudo to be allocated to GENERAL_REGS. Since this is no worse than
- supporting DImode, allow it. */
- else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
- return true;
-
- return false;
-}
-
-/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The only ABI that
- saves SSE registers across calls is Win64 (thus no need to check the
- current ABI here), and with AVX enabled Win64 only guarantees that
- the low 16 bytes are saved. */
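-/* For example, a 256-bit value live in %ymm6 across a Win64 call survives
- only in its low 128 bits (the call-saved %xmm6), so its upper half must
- be treated as clobbered. */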
-
-static bool
-ix86_hard_regno_call_part_clobbered (rtx_insn *insn ATTRIBUTE_UNUSED,
- unsigned int regno, machine_mode mode)
-{
- return SSE_REGNO_P (regno) && GET_MODE_SIZE (mode) > 16;
-}
-
-/* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
- tieable integer mode. */
-
-static bool
-ix86_tieable_integer_mode_p (machine_mode mode)
-{
- switch (mode)
- {
- case E_HImode:
- case E_SImode:
- return true;
-
- case E_QImode:
- return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
-
- case E_DImode:
- return TARGET_64BIT;
-
- default:
- return false;
- }
-}
-
-/* Implement TARGET_MODES_TIEABLE_P.
-
- Return true if MODE1 is accessible in a register that can hold MODE2
- without copying. That is, all register classes that can hold MODE2
- can also hold MODE1. */
-
-static bool
-ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
-{
- if (mode1 == mode2)
- return true;
-
- if (ix86_tieable_integer_mode_p (mode1)
- && ix86_tieable_integer_mode_p (mode2))
- return true;
-
- /* MODE2 being XFmode implies fp stack or general regs, which means we
- can tie any smaller floating point modes to it. Note that we do not
- tie this with TFmode. */
- if (mode2 == XFmode)
- return mode1 == SFmode || mode1 == DFmode;
-
- /* MODE2 being DFmode implies fp stack, general or sse regs, which means
- that we can tie it with SFmode. */
- if (mode2 == DFmode)
- return mode1 == SFmode;
-
- /* If MODE2 is only appropriate for an SSE register, then tie with
- any other mode acceptable to SSE registers. */
- if (GET_MODE_SIZE (mode2) == 64
- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
- return (GET_MODE_SIZE (mode1) == 64
- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
- if (GET_MODE_SIZE (mode2) == 32
- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
- return (GET_MODE_SIZE (mode1) == 32
- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
- if (GET_MODE_SIZE (mode2) == 16
- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
- return (GET_MODE_SIZE (mode1) == 16
- && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
-
- /* If MODE2 is appropriate for an MMX register, then tie
- with any other mode acceptable to MMX registers. */
- if (GET_MODE_SIZE (mode2) == 8
- && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
- return (GET_MODE_SIZE (mode1) == 8
- && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
-
- return false;
-}
-
-/* Return the cost of moving between two registers of mode MODE. */
-
-static int
-ix86_set_reg_reg_cost (machine_mode mode)
-{
- unsigned int units = UNITS_PER_WORD;
-
- switch (GET_MODE_CLASS (mode))
- {
- default:
- break;
-
- case MODE_CC:
- units = GET_MODE_SIZE (CCmode);
- break;
-
- case MODE_FLOAT:
- if ((TARGET_SSE && mode == TFmode)
- || (TARGET_80387 && mode == XFmode)
- || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
- || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
- units = GET_MODE_SIZE (mode);
- break;
-
- case MODE_COMPLEX_FLOAT:
- if ((TARGET_SSE && mode == TCmode)
- || (TARGET_80387 && mode == XCmode)
- || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
- || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
- units = GET_MODE_SIZE (mode);
- break;
-
- case MODE_VECTOR_INT:
- case MODE_VECTOR_FLOAT:
- if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
- || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
- || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
- || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
- || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
- units = GET_MODE_SIZE (mode);
- }
-
- /* Return the cost of moving between two registers of mode MODE,
- assuming that the move will be in pieces of at most UNITS bytes. */
- return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
-}
-
-/* Return cost of vector operation in MODE given that scalar version has
- COST. */
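-/* For example, with TARGET_AVX128_OPTIMAL a 256-bit operation is costed as
- two 128-bit operations, and with TARGET_SSE_SPLIT_REGS a 128-bit operation
- is costed as two halves. */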
-
-static int
-ix86_vec_cost (machine_mode mode, int cost)
-{
- if (!VECTOR_MODE_P (mode))
- return cost;
-
- if (GET_MODE_BITSIZE (mode) == 128
- && TARGET_SSE_SPLIT_REGS)
- return cost * 2;
- if (GET_MODE_BITSIZE (mode) > 128
- && TARGET_AVX128_OPTIMAL)
- return cost * GET_MODE_BITSIZE (mode) / 128;
- return cost;
-}
-
-/* Return cost of multiplication in MODE. */
-
-static int
-ix86_multiplication_cost (const struct processor_costs *cost,
- enum machine_mode mode)
-{
- machine_mode inner_mode = mode;
- if (VECTOR_MODE_P (mode))
- inner_mode = GET_MODE_INNER (mode);
-
- if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
- return inner_mode == DFmode ? cost->mulsd : cost->mulss;
- else if (X87_FLOAT_MODE_P (mode))
- return cost->fmul;
- else if (FLOAT_MODE_P (mode))
- return ix86_vec_cost (mode,
- inner_mode == DFmode ? cost->mulsd : cost->mulss);
- else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
- {
- /* vpmullq is used in this case. No emulation is needed. */
- if (TARGET_AVX512DQ)
- return ix86_vec_cost (mode, cost->mulss);
-
- /* V*QImode is emulated with 7-13 insns. */
- if (mode == V16QImode || mode == V32QImode)
- {
- int extra = 11;
- if (TARGET_XOP && mode == V16QImode)
- extra = 5;
- else if (TARGET_SSSE3)
- extra = 6;
- return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * extra);
- }
- /* V*DImode is emulated with 5-8 insns. */
- else if (mode == V2DImode || mode == V4DImode)
- {
- if (TARGET_XOP && mode == V2DImode)
- return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 3);
- else
- return ix86_vec_cost (mode, cost->mulss * 3 + cost->sse_op * 5);
- }
- /* Without sse4.1, we don't have PMULLD; it's emulated with 7
- insns, including two PMULUDQ. */
- else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
- return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5);
- else
- return ix86_vec_cost (mode, cost->mulss);
- }
- else
- return (cost->mult_init[MODE_INDEX (mode)] + cost->mult_bit * 7);
-}
-
-/* Return cost of division in MODE. */
-
-static int
-ix86_division_cost (const struct processor_costs *cost,
- enum machine_mode mode)
-{
- machine_mode inner_mode = mode;
- if (VECTOR_MODE_P (mode))
- inner_mode = GET_MODE_INNER (mode);
-
- if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
- return inner_mode == DFmode ? cost->divsd : cost->divss;
- else if (X87_FLOAT_MODE_P (mode))
- return cost->fdiv;
- else if (FLOAT_MODE_P (mode))
- return ix86_vec_cost (mode,
- inner_mode == DFmode ? cost->divsd : cost->divss);
- else
- return cost->divide[MODE_INDEX (mode)];
-}
-
-/* Return cost of shift in MODE.
- If CONSTANT_OP1 is true, the value of op1 is known and given in OP1_VAL.
- AND_IN_OP1 specifies whether op1 is the result of an AND, and
- SHIFT_AND_TRUNCATE whether op1 is a SUBREG of such an AND.
-
- SKIP_OP0/1 is set to true if cost of OP0/1 should be ignored. */
-
-static int
-ix86_shift_rotate_cost (const struct processor_costs *cost,
- enum machine_mode mode, bool constant_op1,
- HOST_WIDE_INT op1_val,
- bool speed,
- bool and_in_op1,
- bool shift_and_truncate,
- bool *skip_op0, bool *skip_op1)
-{
- if (skip_op0)
- *skip_op0 = *skip_op1 = false;
- if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
- {
- /* V*QImode is emulated with 1-11 insns. */
- if (mode == V16QImode || mode == V32QImode)
- {
- int count = 11;
- if (TARGET_XOP && mode == V16QImode)
- {
- /* For XOP we use vpshab, which requires a broadcast of the
- value to the variable shift insn. For constants this
- means a V16QImode constant in memory; even when we can
- perform the shift with one insn, set the cost to prefer paddb. */
- if (constant_op1)
- {
- if (skip_op1)
- *skip_op1 = true;
- return ix86_vec_cost (mode,
- cost->sse_op
- + (speed
- ? 2
- : COSTS_N_BYTES
- (GET_MODE_UNIT_SIZE (mode))));
- }
- count = 3;
- }
- else if (TARGET_SSSE3)
- count = 7;
- return ix86_vec_cost (mode, cost->sse_op * count);
- }
- else
- return ix86_vec_cost (mode, cost->sse_op);
- }
- if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
- {
- if (constant_op1)
- {
- if (op1_val > 32)
- return cost->shift_const + COSTS_N_INSNS (2);
- else
- return cost->shift_const * 2;
- }
- else
- {
- if (and_in_op1)
- return cost->shift_var * 2;
- else
- return cost->shift_var * 6 + COSTS_N_INSNS (2);
- }
- }
- else
- {
- if (constant_op1)
- return cost->shift_const;
- else if (shift_and_truncate)
- {
- if (skip_op0)
- *skip_op0 = *skip_op1 = true;
- /* Return the cost after shift-and truncation. */
- return cost->shift_var;
- }
- else
- return cost->shift_var;
- }
- return cost->shift_const;
-}
-
-/* Compute a (partial) cost for rtx X. Return true if the complete
- cost has been computed, and false if subexpressions should be
- scanned. In either case, *TOTAL contains the cost result. */
-
-static bool
-ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
- int *total, bool speed)
-{
- rtx mask;
- enum rtx_code code = GET_CODE (x);
- enum rtx_code outer_code = (enum rtx_code) outer_code_i;
- const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
- int src_cost;
-
- switch (code)
- {
- case SET:
- if (register_operand (SET_DEST (x), VOIDmode)
- && register_operand (SET_SRC (x), VOIDmode))
- {
- *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
- return true;
- }
-
- if (register_operand (SET_SRC (x), VOIDmode))
- /* Avoid potentially incorrect high cost from rtx_costs
- for non-tieable SUBREGs. */
- src_cost = 0;
- else
- {
- src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed);
-
- if (CONSTANT_P (SET_SRC (x)))
- /* Constant costs assume a base value of COSTS_N_INSNS (1) and add
- a small value, possibly zero for cheap constants. */
- src_cost += COSTS_N_INSNS (1);
- }
-
- *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed);
- return true;
-
- case CONST_INT:
- case CONST:
- case LABEL_REF:
- case SYMBOL_REF:
- if (x86_64_immediate_operand (x, VOIDmode))
- *total = 0;
- else
- *total = 1;
- return true;
-
- case CONST_DOUBLE:
- if (IS_STACK_MODE (mode))
- switch (standard_80387_constant_p (x))
- {
- case -1:
- case 0:
- break;
- case 1: /* 0.0 */
- *total = 1;
- return true;
- default: /* Other constants */
- *total = 2;
- return true;
- }
- /* FALLTHRU */
-
- case CONST_VECTOR:
- switch (standard_sse_constant_p (x, mode))
- {
- case 0:
- break;
- case 1: /* 0: xor eliminates false dependency */
- *total = 0;
- return true;
- default: /* -1: cmp contains false dependency */
- *total = 1;
- return true;
- }
- /* FALLTHRU */
-
- case CONST_WIDE_INT:
- /* Fall back to (MEM (SYMBOL_REF)), since that's where
- it'll probably end up. Add a penalty for size. */
- *total = (COSTS_N_INSNS (1)
- + (!TARGET_64BIT && flag_pic)
- + (GET_MODE_SIZE (mode) <= 4
- ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
- return true;
-
- case ZERO_EXTEND:
- /* The zero extension is often completely free on x86_64, so make
- it as cheap as possible. */
- if (TARGET_64BIT && mode == DImode
- && GET_MODE (XEXP (x, 0)) == SImode)
- *total = 1;
- else if (TARGET_ZERO_EXTEND_WITH_AND)
- *total = cost->add;
- else
- *total = cost->movzx;
- return false;
-
- case SIGN_EXTEND:
- *total = cost->movsx;
- return false;
-
- case ASHIFT:
- if (SCALAR_INT_MODE_P (mode)
- && GET_MODE_SIZE (mode) < UNITS_PER_WORD
- && CONST_INT_P (XEXP (x, 1)))
- {
- HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
- if (value == 1)
- {
- *total = cost->add;
- return false;
- }
- if ((value == 2 || value == 3)
- && cost->lea <= cost->shift_const)
- {
- *total = cost->lea;
- return false;
- }
- }
- /* FALLTHRU */
-
- case ROTATE:
- case ASHIFTRT:
- case LSHIFTRT:
- case ROTATERT:
- bool skip_op0, skip_op1;
- *total = ix86_shift_rotate_cost (cost, mode, CONSTANT_P (XEXP (x, 1)),
- CONST_INT_P (XEXP (x, 1))
- ? INTVAL (XEXP (x, 1)) : -1,
- speed,
- GET_CODE (XEXP (x, 1)) == AND,
- SUBREG_P (XEXP (x, 1))
- && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND,
- &skip_op0, &skip_op1);
- if (skip_op0 || skip_op1)
- {
- if (!skip_op0)
- *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
- if (!skip_op1)
- *total += rtx_cost (XEXP (x, 1), mode, code, 0, speed);
- return true;
- }
- return false;
-
- case FMA:
- {
- rtx sub;
-
- gcc_assert (FLOAT_MODE_P (mode));
- gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
-
- *total = ix86_vec_cost (mode,
- GET_MODE_INNER (mode) == SFmode
- ? cost->fmass : cost->fmasd);
- *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
-
- /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
- sub = XEXP (x, 0);
- if (GET_CODE (sub) == NEG)
- sub = XEXP (sub, 0);
- *total += rtx_cost (sub, mode, FMA, 0, speed);
-
- sub = XEXP (x, 2);
- if (GET_CODE (sub) == NEG)
- sub = XEXP (sub, 0);
- *total += rtx_cost (sub, mode, FMA, 2, speed);
- return true;
- }
-
- case MULT:
- if (!FLOAT_MODE_P (mode) && !VECTOR_MODE_P (mode))
- {
- rtx op0 = XEXP (x, 0);
- rtx op1 = XEXP (x, 1);
- int nbits;
- if (CONST_INT_P (XEXP (x, 1)))
- {
- unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
- for (nbits = 0; value != 0; value &= value - 1)
- nbits++;
- }
- else
- /* This is arbitrary. */
- nbits = 7;
-
- /* Compute costs correctly for widening multiplication. */
- if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
- && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
- == GET_MODE_SIZE (mode))
- {
- int is_mulwiden = 0;
- machine_mode inner_mode = GET_MODE (op0);
-
- if (GET_CODE (op0) == GET_CODE (op1))
- is_mulwiden = 1, op1 = XEXP (op1, 0);
- else if (CONST_INT_P (op1))
- {
- if (GET_CODE (op0) == SIGN_EXTEND)
- is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
- == INTVAL (op1);
- else
- is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
- }
-
- if (is_mulwiden)
- op0 = XEXP (op0, 0), mode = GET_MODE (op0);
- }
-
- *total = (cost->mult_init[MODE_INDEX (mode)]
- + nbits * cost->mult_bit
- + rtx_cost (op0, mode, outer_code, opno, speed)
- + rtx_cost (op1, mode, outer_code, opno, speed));
-
- return true;
- }
- *total = ix86_multiplication_cost (cost, mode);
- return false;
-
- case DIV:
- case UDIV:
- case MOD:
- case UMOD:
- *total = ix86_division_cost (cost, mode);
- return false;
-
- case PLUS:
- if (GET_MODE_CLASS (mode) == MODE_INT
- && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
- {
- if (GET_CODE (XEXP (x, 0)) == PLUS
- && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
- && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
- && CONSTANT_P (XEXP (x, 1)))
- {
- HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
- if (val == 2 || val == 4 || val == 8)
- {
- *total = cost->lea;
- *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
- outer_code, opno, speed);
- *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
- outer_code, opno, speed);
- *total += rtx_cost (XEXP (x, 1), mode,
- outer_code, opno, speed);
- return true;
- }
- }
- else if (GET_CODE (XEXP (x, 0)) == MULT
- && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
- {
- HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
- if (val == 2 || val == 4 || val == 8)
- {
- *total = cost->lea;
- *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
- outer_code, opno, speed);
- *total += rtx_cost (XEXP (x, 1), mode,
- outer_code, opno, speed);
- return true;
- }
- }
- else if (GET_CODE (XEXP (x, 0)) == PLUS)
- {
- /* Add with carry, ignore the cost of adding a carry flag. */
- if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode))
- *total = cost->add;
- else
- {
- *total = cost->lea;
- *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
- outer_code, opno, speed);
- }
-
- *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
- outer_code, opno, speed);
- *total += rtx_cost (XEXP (x, 1), mode,
- outer_code, opno, speed);
- return true;
- }
- }
- /* FALLTHRU */
-
- case MINUS:
- /* Subtract with borrow, ignore the cost of subtracting a carry flag. */
- if (GET_MODE_CLASS (mode) == MODE_INT
- && GET_MODE_SIZE (mode) <= UNITS_PER_WORD
- && GET_CODE (XEXP (x, 0)) == MINUS
- && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode))
- {
- *total = cost->add;
- *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
- outer_code, opno, speed);
- *total += rtx_cost (XEXP (x, 1), mode,
- outer_code, opno, speed);
- return true;
- }
-
- if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
- {
- *total = cost->addss;
- return false;
- }
- else if (X87_FLOAT_MODE_P (mode))
- {
- *total = cost->fadd;
- return false;
- }
- else if (FLOAT_MODE_P (mode))
- {
- *total = ix86_vec_cost (mode, cost->addss);
- return false;
- }
- /* FALLTHRU */
-
- case AND:
- case IOR:
- case XOR:
- if (GET_MODE_CLASS (mode) == MODE_INT
- && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
- {
- *total = (cost->add * 2
- + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
- << (GET_MODE (XEXP (x, 0)) != DImode))
- + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
- << (GET_MODE (XEXP (x, 1)) != DImode)));
- return true;
- }
- /* FALLTHRU */
-
- case NEG:
- if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
- {
- *total = cost->sse_op;
- return false;
- }
- else if (X87_FLOAT_MODE_P (mode))
- {
- *total = cost->fchs;
- return false;
- }
- else if (FLOAT_MODE_P (mode))
- {
- *total = ix86_vec_cost (mode, cost->sse_op);
- return false;
- }
- /* FALLTHRU */
-
- case NOT:
- if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
- *total = ix86_vec_cost (mode, cost->sse_op);
- else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
- *total = cost->add * 2;
- else
- *total = cost->add;
- return false;
-
- case COMPARE:
- if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
- && XEXP (XEXP (x, 0), 1) == const1_rtx
- && CONST_INT_P (XEXP (XEXP (x, 0), 2))
- && XEXP (x, 1) == const0_rtx)
- {
- /* This kind of construct is implemented using test[bwl].
- Treat it as if we had an AND. */
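- /* E.g. (compare (zero_extract:SI (reg:SI x) (const_int 1) (const_int 3))
- (const_int 0)) tests bit 3 and is typically emitted as testb $8. */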
- mode = GET_MODE (XEXP (XEXP (x, 0), 0));
- *total = (cost->add
- + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
- opno, speed)
- + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
- return true;
- }
-
- /* The embedded comparison operand is completely free. */
- if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
- && XEXP (x, 1) == const0_rtx)
- *total = 0;
-
- return false;
-
- case FLOAT_EXTEND:
- if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
- *total = 0;
- else
- *total = ix86_vec_cost (mode, cost->addss);
- return false;
-
- case FLOAT_TRUNCATE:
- if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
- *total = cost->fadd;
- else
- *total = ix86_vec_cost (mode, cost->addss);
- return false;
-
- case ABS:
- /* SSE requires memory load for the constant operand. It may make
- sense to account for this. Of course the constant operand may or
- may not be reused. */
- if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
- *total = cost->sse_op;
- else if (X87_FLOAT_MODE_P (mode))
- *total = cost->fabs;
- else if (FLOAT_MODE_P (mode))
- *total = ix86_vec_cost (mode, cost->sse_op);
- return false;
-
- case SQRT:
- if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
- *total = mode == SFmode ? cost->sqrtss : cost->sqrtsd;
- else if (X87_FLOAT_MODE_P (mode))
- *total = cost->fsqrt;
- else if (FLOAT_MODE_P (mode))
- *total = ix86_vec_cost (mode,
- mode == SFmode ? cost->sqrtss : cost->sqrtsd);
- return false;
-
- case UNSPEC:
- if (XINT (x, 1) == UNSPEC_TP)
- *total = 0;
- return false;
-
- case VEC_SELECT:
- case VEC_CONCAT:
- case VEC_DUPLICATE:
- /* ??? Assume all of these vector manipulation patterns are
- recognizable. In which case they all pretty much have the
- same cost. */
- *total = cost->sse_op;
- return true;
- case VEC_MERGE:
- mask = XEXP (x, 2);
- /* This is a masked instruction; assume the same cost
- as the non-masked variant. */
- if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
- *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
- else
- *total = cost->sse_op;
- return true;
-
- default:
- return false;
- }
-}
-
-#if TARGET_MACHO
-
-static int current_machopic_label_num;
-
-/* Given a symbol name and its associated stub, write out the
- definition of the stub. */
-
-void
-machopic_output_stub (FILE *file, const char *symb, const char *stub)
-{
- unsigned int length;
- char *binder_name, *symbol_name, lazy_ptr_name[32];
- int label = ++current_machopic_label_num;
-
- /* For 64-bit we shouldn't get here. */
- gcc_assert (!TARGET_64BIT);
-
- /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
- symb = targetm.strip_name_encoding (symb);
-
- length = strlen (stub);
- binder_name = XALLOCAVEC (char, length + 32);
- GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
-
- length = strlen (symb);
- symbol_name = XALLOCAVEC (char, length + 32);
- GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
-
- sprintf (lazy_ptr_name, "L%d$lz", label);
-
- if (MACHOPIC_ATT_STUB)
- switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
- else if (MACHOPIC_PURE)
- switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
- else
- switch_to_section (darwin_sections[machopic_symbol_stub_section]);
-
- fprintf (file, "%s:\n", stub);
- fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
-
- if (MACHOPIC_ATT_STUB)
- {
- fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
- }
- else if (MACHOPIC_PURE)
- {
- /* PIC stub. */
- /* 25-byte PIC stub using "CALL get_pc_thunk". */
- rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
- output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
- fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
- label, lazy_ptr_name, label);
- fprintf (file, "\tjmp\t*%%ecx\n");
- }
- else
- fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
-
- /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
- it needs no stub-binding-helper. */
- if (MACHOPIC_ATT_STUB)
- return;
-
- fprintf (file, "%s:\n", binder_name);
-
- if (MACHOPIC_PURE)
- {
- fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
- fprintf (file, "\tpushl\t%%ecx\n");
- }
- else
- fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
-
- fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
-
- /* N.B. Keep the correspondence of these
- 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
- old-pic/new-pic/non-pic stubs; altering this will break
- compatibility with existing dylibs. */
- if (MACHOPIC_PURE)
- {
- /* 25-byte PIC stub using "CALL get_pc_thunk". */
- switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
- }
- else
- /* 16-byte -mdynamic-no-pic stub. */
- switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
-
- fprintf (file, "%s:\n", lazy_ptr_name);
- fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
- fprintf (file, ASM_LONG "%s\n", binder_name);
-}
-#endif /* TARGET_MACHO */
-
-/* Order the registers for the register allocator. */
-
-void
-x86_order_regs_for_local_alloc (void)
-{
- int pos = 0;
- int i;
-
- /* First allocate the local general purpose registers. */
- for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
- if (GENERAL_REGNO_P (i) && call_used_regs[i])
- reg_alloc_order [pos++] = i;
-
- /* Global general purpose registers. */
- for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
- if (GENERAL_REGNO_P (i) && !call_used_regs[i])
- reg_alloc_order [pos++] = i;
-
- /* x87 registers come first in case we are doing FP math
- using them. */
- if (!TARGET_SSE_MATH)
- for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
- reg_alloc_order [pos++] = i;
-
- /* SSE registers. */
- for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
- reg_alloc_order [pos++] = i;
- for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
- reg_alloc_order [pos++] = i;
-
- /* Extended REX SSE registers. */
- for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
- reg_alloc_order [pos++] = i;
-
- /* Mask registers. */
- for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
- reg_alloc_order [pos++] = i;
-
- /* x87 registers. */
- if (TARGET_SSE_MATH)
- for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
- reg_alloc_order [pos++] = i;
-
- for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
- reg_alloc_order [pos++] = i;
-
- /* Initialize the rest of the array, as we do not allocate some registers
- at all. */
- while (pos < FIRST_PSEUDO_REGISTER)
- reg_alloc_order [pos++] = 0;
-}
-
-/* Handle a "callee_pop_aggregate_return" attribute; arguments as
- in struct attribute_spec handler. */
-static tree
-ix86_handle_callee_pop_aggregate_return (tree *node, tree name, tree args, int,
- bool *no_add_attrs)
-{
- if (TREE_CODE (*node) != FUNCTION_TYPE
- && TREE_CODE (*node) != METHOD_TYPE
- && TREE_CODE (*node) != FIELD_DECL
- && TREE_CODE (*node) != TYPE_DECL)
- {
- warning (OPT_Wattributes, "%qE attribute only applies to functions",
- name);
- *no_add_attrs = true;
- return NULL_TREE;
- }
- if (TARGET_64BIT)
- {
- warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
- name);
- *no_add_attrs = true;
- return NULL_TREE;
- }
- if (is_attribute_p ("callee_pop_aggregate_return", name))
- {
- tree cst;
-
- cst = TREE_VALUE (args);
- if (TREE_CODE (cst) != INTEGER_CST)
- {
- warning (OPT_Wattributes,
- "%qE attribute requires an integer constant argument",
- name);
- *no_add_attrs = true;
- }
- else if (compare_tree_int (cst, 0) != 0
- && compare_tree_int (cst, 1) != 0)
- {
- warning (OPT_Wattributes,
- "argument to %qE attribute is neither zero, nor one",
- name);
- *no_add_attrs = true;
- }
-
- return NULL_TREE;
- }
-
- return NULL_TREE;
-}
-
-/* Handle a "ms_abi" or "sysv" attribute; arguments as in
- struct attribute_spec.handler. */
-static tree
-ix86_handle_abi_attribute (tree *node, tree name, tree, int,
- bool *no_add_attrs)
-{
- if (TREE_CODE (*node) != FUNCTION_TYPE
- && TREE_CODE (*node) != METHOD_TYPE
- && TREE_CODE (*node) != FIELD_DECL
- && TREE_CODE (*node) != TYPE_DECL)
- {
- warning (OPT_Wattributes, "%qE attribute only applies to functions",
- name);
- *no_add_attrs = true;
- return NULL_TREE;
- }
-
- /* Can combine regparm with all attributes but fastcall. */
- if (is_attribute_p ("ms_abi", name))
- {
- if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
- {
- error ("ms_abi and sysv_abi attributes are not compatible");
- }
-
- return NULL_TREE;
- }
- else if (is_attribute_p ("sysv_abi", name))
- {
- if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
- {
- error ("ms_abi and sysv_abi attributes are not compatible");
- }
-
- return NULL_TREE;
- }
-
- return NULL_TREE;
-}
-
-/* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
- struct attribute_spec.handler. */
-static tree
-ix86_handle_struct_attribute (tree *node, tree name, tree, int,
- bool *no_add_attrs)
-{
- tree *type = NULL;
- if (DECL_P (*node))
- {
- if (TREE_CODE (*node) == TYPE_DECL)
- type = &TREE_TYPE (*node);
- }
- else
- type = node;
-
- if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
- {
- warning (OPT_Wattributes, "%qE attribute ignored",
- name);
- *no_add_attrs = true;
- }
-
- else if ((is_attribute_p ("ms_struct", name)
- && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
- || ((is_attribute_p ("gcc_struct", name)
- && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
- {
- warning (OPT_Wattributes, "%qE incompatible attribute ignored",
- name);
- *no_add_attrs = true;
- }
-
- return NULL_TREE;
-}
-
-static tree
-ix86_handle_fndecl_attribute (tree *node, tree name, tree args, int,
- bool *no_add_attrs)
-{
- if (TREE_CODE (*node) != FUNCTION_DECL)
- {
- warning (OPT_Wattributes, "%qE attribute only applies to functions",
- name);
- *no_add_attrs = true;
- }
-
- if (is_attribute_p ("indirect_branch", name))
- {
- tree cst = TREE_VALUE (args);
- if (TREE_CODE (cst) != STRING_CST)
- {
- warning (OPT_Wattributes,
- "%qE attribute requires a string constant argument",
- name);
- *no_add_attrs = true;
- }
- else if (strcmp (TREE_STRING_POINTER (cst), "keep") != 0
- && strcmp (TREE_STRING_POINTER (cst), "thunk") != 0
- && strcmp (TREE_STRING_POINTER (cst), "thunk-inline") != 0
- && strcmp (TREE_STRING_POINTER (cst), "thunk-extern") != 0)
- {
- warning (OPT_Wattributes,
- "argument to %qE attribute is not "
- "(keep|thunk|thunk-inline|thunk-extern)", name);
- *no_add_attrs = true;
- }
- }
-
- if (is_attribute_p ("function_return", name))
- {
- tree cst = TREE_VALUE (args);
- if (TREE_CODE (cst) != STRING_CST)
- {
- warning (OPT_Wattributes,
- "%qE attribute requires a string constant argument",
- name);
- *no_add_attrs = true;
- }
- else if (strcmp (TREE_STRING_POINTER (cst), "keep") != 0
- && strcmp (TREE_STRING_POINTER (cst), "thunk") != 0
- && strcmp (TREE_STRING_POINTER (cst), "thunk-inline") != 0
- && strcmp (TREE_STRING_POINTER (cst), "thunk-extern") != 0)
- {
- warning (OPT_Wattributes,
- "argument to %qE attribute is not "
- "(keep|thunk|thunk-inline|thunk-extern)", name);
- *no_add_attrs = true;
- }
- }
-
- return NULL_TREE;
-}
-
-static tree
-ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree,
- int, bool *)
-{
- return NULL_TREE;
-}
-
-static tree
-ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *)
-{
- /* DECL_RESULT and DECL_ARGUMENTS do not exist there yet,
- but the function type contains args and return type data. */
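- /* Thus, on a 64-bit target, an accepted handler has a prototype roughly
- of the form void fn (struct interrupt_frame *frame,
- unsigned long int error_code), the second argument being optional
- (illustrative example only). */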
- tree func_type = *node;
- tree return_type = TREE_TYPE (func_type);
-
- int nargs = 0;
- tree current_arg_type = TYPE_ARG_TYPES (func_type);
- while (current_arg_type
- && ! VOID_TYPE_P (TREE_VALUE (current_arg_type)))
- {
- if (nargs == 0)
- {
- if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type)))
- error ("interrupt service routine should have a pointer "
- "as the first argument");
- }
- else if (nargs == 1)
- {
- if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE
- || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode)
- error ("interrupt service routine should have %qs "
- "as the second argument",
- TARGET_64BIT
- ? (TARGET_X32 ? "unsigned long long int"
- : "unsigned long int")
- : "unsigned int");
- }
- nargs++;
- current_arg_type = TREE_CHAIN (current_arg_type);
- }
- if (!nargs || nargs > 2)
- error ("interrupt service routine can only have a pointer argument "
- "and an optional integer argument");
- if (! VOID_TYPE_P (return_type))
- error ("interrupt service routine can%'t have non-void return value");
-
- return NULL_TREE;
-}
-
-static bool
-ix86_ms_bitfield_layout_p (const_tree record_type)
-{
- return ((TARGET_MS_BITFIELD_LAYOUT
- && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
- || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
-}
-
-/* Returns an expression indicating where the this parameter is
- located on entry to the FUNCTION. */
-
-static rtx
-x86_this_parameter (tree function)
-{
- tree type = TREE_TYPE (function);
- bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
- int nregs;
-
- if (TARGET_64BIT)
- {
- const int *parm_regs;
-
- if (ix86_function_type_abi (type) == MS_ABI)
- parm_regs = x86_64_ms_abi_int_parameter_registers;
- else
- parm_regs = x86_64_int_parameter_registers;
- return gen_rtx_REG (Pmode, parm_regs[aggr]);
- }
-
- nregs = ix86_function_regparm (type, function);
-
- if (nregs > 0 && !stdarg_p (type))
- {
- int regno;
- unsigned int ccvt = ix86_get_callcvt (type);
-
- if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
- regno = aggr ? DX_REG : CX_REG;
- else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
- {
- regno = CX_REG;
- if (aggr)
- return gen_rtx_MEM (SImode,
- plus_constant (Pmode, stack_pointer_rtx, 4));
- }
- else
- {
- regno = AX_REG;
- if (aggr)
- {
- regno = DX_REG;
- if (nregs == 1)
- return gen_rtx_MEM (SImode,
- plus_constant (Pmode,
- stack_pointer_rtx, 4));
- }
- }
- return gen_rtx_REG (SImode, regno);
- }
-
- return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
- aggr ? 8 : 4));
-}
-
-/* Determine whether x86_output_mi_thunk can succeed. */
-
-static bool
-x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
- const_tree function)
-{
- /* 64-bit can handle anything. */
- if (TARGET_64BIT)
- return true;
-
- /* For 32-bit, everything's fine if we have one free register. */
- if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
- return true;
-
- /* Need a free register for vcall_offset. */
- if (vcall_offset)
- return false;
-
- /* Need a free register for GOT references. */
- if (flag_pic && !targetm.binds_local_p (function))
- return false;
-
- /* Otherwise ok. */
- return true;
-}
-
-/* Output the assembler code for a thunk function. THUNK_DECL is the
- declaration for the thunk function itself, FUNCTION is the decl for
- the target function. DELTA is an immediate constant offset to be
- added to THIS. If VCALL_OFFSET is nonzero, the word at
- *(*this + vcall_offset) should be added to THIS. */
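- /* In C terms the emitted thunk roughly performs
- this += DELTA;
- if (VCALL_OFFSET)
- this += *(intptr_t *) (*(char **) this + VCALL_OFFSET);
- and then tail-calls FUNCTION, without setting up a stack frame. */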
-
-static void
-x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
- HOST_WIDE_INT vcall_offset, tree function)
-{
- rtx this_param = x86_this_parameter (function);
- rtx this_reg, tmp, fnaddr;
- unsigned int tmp_regno;
- rtx_insn *insn;
-
- if (TARGET_64BIT)
- tmp_regno = R10_REG;
- else
- {
- unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
- if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
- tmp_regno = AX_REG;
- else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
- tmp_regno = DX_REG;
- else
- tmp_regno = CX_REG;
- }
-
- emit_note (NOTE_INSN_PROLOGUE_END);
-
- /* If CET is enabled, insert an ENDBR instruction. */
- if ((flag_cf_protection & CF_BRANCH))
- emit_insn (gen_nop_endbr ());
-
- /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
- pull it in now and let DELTA benefit. */
- if (REG_P (this_param))
- this_reg = this_param;
- else if (vcall_offset)
- {
- /* Put the this parameter into %eax. */
- this_reg = gen_rtx_REG (Pmode, AX_REG);
- emit_move_insn (this_reg, this_param);
- }
- else
- this_reg = NULL_RTX;
-
- /* Adjust the this parameter by a fixed constant. */
- if (delta)
- {
- rtx delta_rtx = GEN_INT (delta);
- rtx delta_dst = this_reg ? this_reg : this_param;
-
- if (TARGET_64BIT)
- {
- if (!x86_64_general_operand (delta_rtx, Pmode))
- {
- tmp = gen_rtx_REG (Pmode, tmp_regno);
- emit_move_insn (tmp, delta_rtx);
- delta_rtx = tmp;
- }
- }
-
- ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
- }
-
- /* Adjust the this parameter by a value stored in the vtable. */
- if (vcall_offset)
- {
- rtx vcall_addr, vcall_mem, this_mem;
-
- tmp = gen_rtx_REG (Pmode, tmp_regno);
-
- this_mem = gen_rtx_MEM (ptr_mode, this_reg);
- if (Pmode != ptr_mode)
- this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
- emit_move_insn (tmp, this_mem);
-
- /* Adjust the this parameter. */
- vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
- if (TARGET_64BIT
- && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
- {
- rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
- emit_move_insn (tmp2, GEN_INT (vcall_offset));
- vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
- }
-
- vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
- if (Pmode != ptr_mode)
- emit_insn (gen_addsi_1_zext (this_reg,
- gen_rtx_REG (ptr_mode,
- REGNO (this_reg)),
- vcall_mem));
- else
- ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
- }
-
- /* If necessary, drop THIS back to its stack slot. */
- if (this_reg && this_reg != this_param)
- emit_move_insn (this_param, this_reg);
-
- fnaddr = XEXP (DECL_RTL (function), 0);
- if (TARGET_64BIT)
- {
- if (!flag_pic || targetm.binds_local_p (function)
- || TARGET_PECOFF)
- ;
- else
- {
- tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
- tmp = gen_rtx_CONST (Pmode, tmp);
- fnaddr = gen_const_mem (Pmode, tmp);
- }
- }
- else
- {
- if (!flag_pic || targetm.binds_local_p (function))
- ;
-#if TARGET_MACHO
- else if (TARGET_MACHO)
- {
- fnaddr = machopic_indirect_call_target (DECL_RTL (function));
- fnaddr = XEXP (fnaddr, 0);
- }
-#endif /* TARGET_MACHO */
- else
- {
- tmp = gen_rtx_REG (Pmode, CX_REG);
- output_set_got (tmp, NULL_RTX);
-
- fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
- fnaddr = gen_rtx_CONST (Pmode, fnaddr);
- fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
- fnaddr = gen_const_mem (Pmode, fnaddr);
- }
- }
-
- /* Our sibling call patterns do not allow memories, because we have no
- predicate that can distinguish between frame and non-frame memory.
- For our purposes here, we can get away with (ab)using a jump pattern,
- because we're going to do no optimization. */
- if (MEM_P (fnaddr))
- {
- if (sibcall_insn_operand (fnaddr, word_mode))
- {
- fnaddr = XEXP (DECL_RTL (function), 0);
- tmp = gen_rtx_MEM (QImode, fnaddr);
- tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
- tmp = emit_call_insn (tmp);
- SIBLING_CALL_P (tmp) = 1;
- }
- else
- emit_jump_insn (gen_indirect_jump (fnaddr));
- }
- else
- {
- if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
- {
- // CM_LARGE_PIC always uses pseudo PIC register which is
- // uninitialized. Since FUNCTION is local and calling it
- // doesn't go through PLT, we use scratch register %r11 as
- // PIC register and initialize it here.
- pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
- ix86_init_large_pic_reg (tmp_regno);
- fnaddr = legitimize_pic_address (fnaddr,
- gen_rtx_REG (Pmode, tmp_regno));
- }
-
- if (!sibcall_insn_operand (fnaddr, word_mode))
- {
- tmp = gen_rtx_REG (word_mode, tmp_regno);
- if (GET_MODE (fnaddr) != word_mode)
- fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
- emit_move_insn (tmp, fnaddr);
- fnaddr = tmp;
- }
-
- tmp = gen_rtx_MEM (QImode, fnaddr);
- tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
- tmp = emit_call_insn (tmp);
- SIBLING_CALL_P (tmp) = 1;
- }
- emit_barrier ();
-
- /* Emit just enough of rest_of_compilation to get the insns emitted.
- Note that use_thunk calls assemble_start_function et al. */
- insn = get_insns ();
- shorten_branches (insn);
- final_start_function (insn, file, 1);
- final (insn, file, 1);
- final_end_function ();
-}
-
-static void
-x86_file_start (void)
-{
- default_file_start ();
- if (TARGET_16BIT)
- fputs ("\t.code16gcc\n", asm_out_file);
-#if TARGET_MACHO
- darwin_file_start ();
-#endif
- if (X86_FILE_START_VERSION_DIRECTIVE)
- fputs ("\t.version\t\"01.01\"\n", asm_out_file);
- if (X86_FILE_START_FLTUSED)
- fputs ("\t.global\t__fltused\n", asm_out_file);
- if (ix86_asm_dialect == ASM_INTEL)
- fputs ("\t.intel_syntax noprefix\n", asm_out_file);
-}
-
-int
-x86_field_alignment (tree type, int computed)
-{
- machine_mode mode;
-
- if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
- return computed;
- if (TARGET_IAMCU)
- return iamcu_alignment (type, computed);
- mode = TYPE_MODE (strip_array_types (type));
- if (mode == DFmode || mode == DCmode
- || GET_MODE_CLASS (mode) == MODE_INT
- || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
- return MIN (32, computed);
- return computed;
-}
-
-/* Print call to TARGET to FILE. */
-
-static void
-x86_print_call_or_nop (FILE *file, const char *target)
-{
- if (flag_nop_mcount || !strcmp (target, "nop"))
- /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */
- fprintf (file, "1:" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n");
- else
- fprintf (file, "1:\tcall\t%s\n", target);
-}
-
-static bool
-current_fentry_name (const char **name)
-{
- tree attr = lookup_attribute ("fentry_name",
- DECL_ATTRIBUTES (current_function_decl));
- if (!attr)
- return false;
- *name = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (attr)));
- return true;
-}
-
-static bool
-current_fentry_section (const char **name)
-{
- tree attr = lookup_attribute ("fentry_section",
- DECL_ATTRIBUTES (current_function_decl));
- if (!attr)
- return false;
- *name = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (attr)));
- return true;
-}
-
-/* Output assembler code to FILE to increment profiler label # LABELNO
- for profiling a function entry. */
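- /* For example, 64-bit non-PIC code gets
- 1: call <mcount name>
- (or a 5-byte NOP under -mnop-mcount), and with -mrecord-mcount the
- address of label 1 is also recorded in the __mcount_loc section. */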
-void
-x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
-{
- if (cfun->machine->endbr_queued_at_entrance)
- fprintf (file, "\t%s\n", TARGET_64BIT ? "endbr64" : "endbr32");
-
- const char *mcount_name = MCOUNT_NAME;
-
- if (current_fentry_name (&mcount_name))
- ;
- else if (fentry_name)
- mcount_name = fentry_name;
- else if (flag_fentry)
- mcount_name = MCOUNT_NAME_BEFORE_PROLOGUE;
-
- if (TARGET_64BIT)
- {
-#ifndef NO_PROFILE_COUNTERS
- fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
-#endif
-
- if (!TARGET_PECOFF && flag_pic)
- fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
- else
- x86_print_call_or_nop (file, mcount_name);
- }
- else if (flag_pic)
- {
-#ifndef NO_PROFILE_COUNTERS
- fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
- LPREFIX, labelno);
-#endif
- fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
- }
- else
- {
-#ifndef NO_PROFILE_COUNTERS
- fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
- LPREFIX, labelno);
-#endif
- x86_print_call_or_nop (file, mcount_name);
- }
-
- if (flag_record_mcount
- || lookup_attribute ("fentry_section",
- DECL_ATTRIBUTES (current_function_decl)))
- {
- const char *sname = "__mcount_loc";
-
- if (current_fentry_section (&sname))
- ;
- else if (fentry_section)
- sname = fentry_section;
-
- fprintf (file, "\t.section %s, \"a\",@progbits\n", sname);
- fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
- fprintf (file, "\t.previous\n");
- }
-}
-
- /* We don't have exact information about the insn sizes, but we may quite
- safely assume that we know about all 1-byte insns and about memory
- address sizes. This is enough to eliminate unnecessary padding in
- 99% of cases. */
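- /* For example, a non-sibling call with a symbolic target is counted as
- exactly 5 bytes, and for the insn types refined below a symbolic
- reference in the pattern is assumed to need at least a 4-byte address
- part. */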
-
-int
-ix86_min_insn_size (rtx_insn *insn)
-{
- int l = 0, len;
-
- if (!INSN_P (insn) || !active_insn_p (insn))
- return 0;
-
- /* Discard alignments we've emitted and jump instructions. */
- if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
- && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
- return 0;
-
- /* Important case - calls are always 5 bytes.
- It is common to have many calls in a row. */
- if (CALL_P (insn)
- && symbolic_reference_mentioned_p (PATTERN (insn))
- && !SIBLING_CALL_P (insn))
- return 5;
- len = get_attr_length (insn);
- if (len <= 1)
- return 1;
-
- /* For normal instructions we rely on get_attr_length being exact,
- with a few exceptions. */
- if (!JUMP_P (insn))
- {
- enum attr_type type = get_attr_type (insn);
-
- switch (type)
- {
- case TYPE_MULTI:
- if (GET_CODE (PATTERN (insn)) == ASM_INPUT
- || asm_noperands (PATTERN (insn)) >= 0)
- return 0;
- break;
- case TYPE_OTHER:
- case TYPE_FCMP:
- break;
- default:
- /* Otherwise trust get_attr_length. */
- return len;
- }
-
- l = get_attr_length_address (insn);
- if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
- l = 4;
- }
- if (l)
- return 1+l;
- else
- return 2;
-}
-
-#ifdef ASM_OUTPUT_MAX_SKIP_PAD
-
- /* The AMD K8 core mispredicts jumps when there are more than 3 jumps in a
- 16-byte window. */
-
-static void
-ix86_avoid_jump_mispredicts (void)
-{
- rtx_insn *insn, *start = get_insns ();
- int nbytes = 0, njumps = 0;
- bool isjump = false;
-
- /* Look for all minimal intervals of instructions containing 4 jumps.
- The intervals are bounded by START and INSN. NBYTES is the total
- size of the instructions in the interval, including INSN but not
- including START. When NBYTES is smaller than 16, the ends of START
- and INSN may land in the same 16-byte window.
-
- The smallest offset at which INSN can start within its window occurs
- when START ends at offset 0; the offset of INSN is then
- NBYTES - sizeof (INSN). We therefore emit a p2align to a 16-byte
- window with maxskip 15 - NBYTES + sizeof (INSN).
-
- Don't treat asm goto as a jump: while it can contain a jump it doesn't
- have to, control transfer to its label(s) can be performed through
- other means, and we also estimate the minimum length of all asm stmts
- as 0. */
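- /* For example, if a minimal interval holding 4 jumps spans
- NBYTES == 10 bytes and INSN itself is 2 bytes long, we emit a pad
- insn with max skip 15 - 10 + 2 = 7 before INSN. */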
- for (insn = start; insn; insn = NEXT_INSN (insn))
- {
- int min_size;
-
- if (LABEL_P (insn))
- {
- align_flags alignment = label_to_alignment (insn);
- int align = alignment.levels[0].log;
- int max_skip = alignment.levels[0].maxskip;
-
- if (max_skip > 15)
- max_skip = 15;
- /* If align > 3, only up to 16 - max_skip - 1 bytes can be
- already in the current 16 byte page, because otherwise
- ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
- bytes to reach 16 byte boundary. */
- if (align <= 0
- || (align <= 3 && max_skip != (1 << align) - 1))
- max_skip = 0;
- if (dump_file)
- fprintf (dump_file, "Label %i with max_skip %i\n",
- INSN_UID (insn), max_skip);
- if (max_skip)
- {
- while (nbytes + max_skip >= 16)
- {
- start = NEXT_INSN (start);
- if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
- || CALL_P (start))
- njumps--, isjump = true;
- else
- isjump = false;
- nbytes -= ix86_min_insn_size (start);
- }
- }
- continue;
- }
-
- min_size = ix86_min_insn_size (insn);
- nbytes += min_size;
- if (dump_file)
- fprintf (dump_file, "Insn %i estimated to %i bytes\n",
- INSN_UID (insn), min_size);
- if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
- || CALL_P (insn))
- njumps++;
- else
- continue;
-
- while (njumps > 3)
- {
- start = NEXT_INSN (start);
- if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
- || CALL_P (start))
- njumps--, isjump = true;
- else
- isjump = false;
- nbytes -= ix86_min_insn_size (start);
- }
- gcc_assert (njumps >= 0);
- if (dump_file)
- fprintf (dump_file, "Interval %i to %i has %i bytes\n",
- INSN_UID (start), INSN_UID (insn), nbytes);
-
- if (njumps == 3 && isjump && nbytes < 16)
- {
- int padsize = 15 - nbytes + ix86_min_insn_size (insn);
-
- if (dump_file)
- fprintf (dump_file, "Padding insn %i by %i bytes!\n",
- INSN_UID (insn), padsize);
- emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
- }
- }
-}
-#endif
-
- /* The AMD Athlon works faster when RET is not the destination of a
- conditional jump and is not directly preceded by another jump
- instruction. We avoid the penalty by emitting the longer form of the
- return instruction in such cases. */
-static void
-ix86_pad_returns (void)
-{
- edge e;
- edge_iterator ei;
-
- FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
- {
- basic_block bb = e->src;
- rtx_insn *ret = BB_END (bb);
- rtx_insn *prev;
- bool replace = false;
-
- if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
- || optimize_bb_for_size_p (bb))
- continue;
- for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
- if (active_insn_p (prev) || LABEL_P (prev))
- break;
- if (prev && LABEL_P (prev))
- {
- edge e;
- edge_iterator ei;
-
- FOR_EACH_EDGE (e, ei, bb->preds)
- if (EDGE_FREQUENCY (e) && e->src->index >= 0
- && !(e->flags & EDGE_FALLTHRU))
- {
- replace = true;
- break;
- }
- }
- if (!replace)
- {
- prev = prev_active_insn (ret);
- if (prev
- && ((JUMP_P (prev) && any_condjump_p (prev))
- || CALL_P (prev)))
- replace = true;
- /* Empty functions get a branch mispredict even when
- the jump destination is not visible to us. */
- if (!prev && !optimize_function_for_size_p (cfun))
- replace = true;
- }
- if (replace)
- {
- emit_jump_insn_before (gen_simple_return_internal_long (), ret);
- delete_insn (ret);
- }
- }
-}
-
-/* Count the minimum number of instructions in BB. Return 4 if the
- number of instructions >= 4. */
-
-static int
-ix86_count_insn_bb (basic_block bb)
-{
- rtx_insn *insn;
- int insn_count = 0;
-
- /* Count number of instructions in this block. Return 4 if the number
- of instructions >= 4. */
- FOR_BB_INSNS (bb, insn)
- {
- /* This only happens in exit blocks. */
- if (JUMP_P (insn)
- && ANY_RETURN_P (PATTERN (insn)))
- break;
-
- if (NONDEBUG_INSN_P (insn)
- && GET_CODE (PATTERN (insn)) != USE
- && GET_CODE (PATTERN (insn)) != CLOBBER)
- {
- insn_count++;
- if (insn_count >= 4)
- return insn_count;
- }
- }
-
- return insn_count;
-}
-
-
- /* Count the minimum number of instructions in a code path through BB.
- Return 4 if the number of instructions >= 4. */
-
-static int
-ix86_count_insn (basic_block bb)
-{
- edge e;
- edge_iterator ei;
- int min_prev_count;
-
- /* Only bother counting instructions along paths with no
- more than 2 basic blocks between entry and exit. Given
- that BB has an edge to exit, determine if a predecessor
- of BB has an edge from entry. If so, compute the number
- of instructions in the predecessor block. If there
- happen to be multiple such blocks, compute the minimum. */
- min_prev_count = 4;
- FOR_EACH_EDGE (e, ei, bb->preds)
- {
- edge prev_e;
- edge_iterator prev_ei;
-
- if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
- {
- min_prev_count = 0;
- break;
- }
- FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
- {
- if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
- {
- int count = ix86_count_insn_bb (e->src);
- if (count < min_prev_count)
- min_prev_count = count;
- break;
- }
- }
- }
-
- if (min_prev_count < 4)
- min_prev_count += ix86_count_insn_bb (bb);
-
- return min_prev_count;
-}
-
- /* Pad short functions to 4 instructions. */
-
-static void
-ix86_pad_short_function (void)
-{
- edge e;
- edge_iterator ei;
-
- FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
- {
- rtx_insn *ret = BB_END (e->src);
- if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
- {
- int insn_count = ix86_count_insn (e->src);
-
- /* Pad short function. */
- if (insn_count < 4)
- {
- rtx_insn *insn = ret;
-
- /* Find epilogue. */
- while (insn
- && (!NOTE_P (insn)
- || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
- insn = PREV_INSN (insn);
-
- if (!insn)
- insn = ret;
-
- /* Two NOPs count as one instruction. */
- insn_count = 2 * (4 - insn_count);
- emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
- }
- }
- }
-}
-
-/* Fix up a Windows system unwinder issue. If an EH region falls through into
- the epilogue, the Windows system unwinder will apply epilogue logic and
- produce incorrect offsets. This can be avoided by adding a nop between
- the last insn that can throw and the first insn of the epilogue. */
-
-static void
-ix86_seh_fixup_eh_fallthru (void)
-{
- edge e;
- edge_iterator ei;
-
- FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
- {
- rtx_insn *insn, *next;
-
- /* Find the beginning of the epilogue. */
- for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
- if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
- break;
- if (insn == NULL)
- continue;
-
- /* We only care about preceding insns that can throw. */
- insn = prev_active_insn (insn);
- if (insn == NULL || !can_throw_internal (insn))
- continue;
-
- /* Do not separate calls from their debug information. */
- for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
- if (NOTE_P (next) && NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION)
- insn = next;
- else
- break;
-
- emit_insn_after (gen_nops (const1_rtx), insn);
- }
-}
-
- /* Implement machine specific optimizations. We implement padding of returns
- for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window. */
-static void
-ix86_reorg (void)
-{
- /* We are freeing block_for_insn in the toplev to keep compatibility
- with old MDEP_REORGS that are not CFG based. Recompute it now. */
- compute_bb_for_insn ();
-
- if (TARGET_SEH && current_function_has_exception_handlers ())
- ix86_seh_fixup_eh_fallthru ();
-
- if (optimize && optimize_function_for_speed_p (cfun))
- {
- if (TARGET_PAD_SHORT_FUNCTION)
- ix86_pad_short_function ();
- else if (TARGET_PAD_RETURNS)
- ix86_pad_returns ();
-#ifdef ASM_OUTPUT_MAX_SKIP_PAD
- if (TARGET_FOUR_JUMP_LIMIT)
- ix86_avoid_jump_mispredicts ();
-#endif
- }
-}
-
- /* Return nonzero when a QImode register that must be represented via a REX
- prefix is used. */
-bool
-x86_extended_QIreg_mentioned_p (rtx_insn *insn)
-{
- int i;
- extract_insn_cached (insn);
- for (i = 0; i < recog_data.n_operands; i++)
- if (GENERAL_REG_P (recog_data.operand[i])
- && !QI_REGNO_P (REGNO (recog_data.operand[i])))
- return true;
- return false;
-}
-
- /* Return true when INSN mentions a register that must be encoded using a
- REX prefix. */
-bool
-x86_extended_reg_mentioned_p (rtx insn)
-{
- subrtx_iterator::array_type array;
- FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
- {
- const_rtx x = *iter;
- if (REG_P (x)
- && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
- return true;
- }
- return false;
-}
-
- /* If profitable, negate (without causing overflow) the integer constant of
- mode MODE at location LOC. Return true in this case. */
-bool
-x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
-{
- HOST_WIDE_INT val;
-
- if (!CONST_INT_P (*loc))
- return false;
-
- switch (mode)
- {
- case E_DImode:
- /* DImode x86_64 constants must fit in 32 bits. */
- gcc_assert (x86_64_immediate_operand (*loc, mode));
-
- mode = SImode;
- break;
-
- case E_SImode:
- case E_HImode:
- case E_QImode:
- break;
-
- default:
- gcc_unreachable ();
- }
-
- /* Avoid overflows. */
- if (mode_signbit_p (mode, *loc))
- return false;
-
- val = INTVAL (*loc);
-
- /* Make things pretty: emit `subl $4,%eax' rather than `addl $-4,%eax'.
- Exception: -128 encodes smaller than +128, so keep -128 and negate +128
- instead. */
- if ((val < 0 && val != -128)
- || val == 128)
- {
- *loc = GEN_INT (-val);
- return true;
- }
-
- return false;
-}
-
-/* Generate an unsigned DImode/SImode to FP conversion. This is the same code
- optabs would emit if we didn't have TFmode patterns. */
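- /* The expansion below corresponds to
- if ((signed) in >= 0)
- out = (fp) in;
- else
- {
- in2 = (in >> 1) | (in & 1);
- out = (fp) in2;
- out += out;
- }
- where ORing in the low bit keeps the final rounding correct. */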
-
-void
-x86_emit_floatuns (rtx operands[2])
-{
- rtx_code_label *neglab, *donelab;
- rtx i0, i1, f0, in, out;
- machine_mode mode, inmode;
-
- inmode = GET_MODE (operands[1]);
- gcc_assert (inmode == SImode || inmode == DImode);
-
- out = operands[0];
- in = force_reg (inmode, operands[1]);
- mode = GET_MODE (out);
- neglab = gen_label_rtx ();
- donelab = gen_label_rtx ();
- f0 = gen_reg_rtx (mode);
-
- emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
-
- expand_float (out, in, 0);
-
- emit_jump_insn (gen_jump (donelab));
- emit_barrier ();
-
- emit_label (neglab);
-
- i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
- 1, OPTAB_DIRECT);
- i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
- 1, OPTAB_DIRECT);
- i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
-
- expand_float (f0, i0, 0);
-
- emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
-
- emit_label (donelab);
-}
-\f
-static bool canonicalize_perm (struct expand_vec_perm_d *d);
-static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
-static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
-static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
-
-/* Get a vector mode of the same size as the original but with elements
- twice as wide. This is only guaranteed to apply to integral vectors. */
-
-static inline machine_mode
-get_mode_wider_vector (machine_mode o)
-{
- /* ??? Rely on the ordering that genmodes.c gives to vectors. */
- machine_mode n = GET_MODE_WIDER_MODE (o).require ();
- gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
- gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
- return n;
-}
-
-/* A subroutine of ix86_expand_vector_init_duplicate. Tries to
- fill target with val via vec_duplicate. */
-
-static bool
-ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
-{
- bool ok;
- rtx_insn *insn;
- rtx dup;
-
- /* First attempt to recognize VAL as-is. */
- dup = gen_vec_duplicate (mode, val);
- insn = emit_insn (gen_rtx_SET (target, dup));
- if (recog_memoized (insn) < 0)
- {
- rtx_insn *seq;
- machine_mode innermode = GET_MODE_INNER (mode);
- rtx reg;
-
- /* If that fails, force VAL into a register. */
-
- start_sequence ();
- reg = force_reg (innermode, val);
- if (GET_MODE (reg) != innermode)
- reg = gen_lowpart (innermode, reg);
- SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
- seq = get_insns ();
- end_sequence ();
- if (seq)
- emit_insn_before (seq, insn);
-
- ok = recog_memoized (insn) >= 0;
- gcc_assert (ok);
- }
- return true;
-}
-
-/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
- with all elements equal to VAR. Return true if successful. */
-
-static bool
-ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
- rtx target, rtx val)
-{
- bool ok;
-
- switch (mode)
- {
- case E_V2SImode:
- case E_V2SFmode:
- if (!mmx_ok)
- return false;
- /* FALLTHRU */
-
- case E_V4DFmode:
- case E_V4DImode:
- case E_V8SFmode:
- case E_V8SImode:
- case E_V2DFmode:
- case E_V2DImode:
- case E_V4SFmode:
- case E_V4SImode:
- case E_V16SImode:
- case E_V8DImode:
- case E_V16SFmode:
- case E_V8DFmode:
- return ix86_vector_duplicate_value (mode, target, val);
-
- case E_V4HImode:
- if (!mmx_ok)
- return false;
- if (TARGET_SSE || TARGET_3DNOW_A)
- {
- rtx x;
-
- val = gen_lowpart (SImode, val);
- x = gen_rtx_TRUNCATE (HImode, val);
- x = gen_rtx_VEC_DUPLICATE (mode, x);
- emit_insn (gen_rtx_SET (target, x));
- return true;
- }
- goto widen;
-
- case E_V8QImode:
- if (!mmx_ok)
- return false;
- goto widen;
-
- case E_V8HImode:
- if (TARGET_AVX2)
- return ix86_vector_duplicate_value (mode, target, val);
-
- if (TARGET_SSE2)
- {
- struct expand_vec_perm_d dperm;
- rtx tmp1, tmp2;
-
- permute:
- memset (&dperm, 0, sizeof (dperm));
- dperm.target = target;
- dperm.vmode = mode;
- dperm.nelt = GET_MODE_NUNITS (mode);
- dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
- dperm.one_operand_p = true;
-
- /* Extend to SImode using a paradoxical SUBREG. */
- tmp1 = gen_reg_rtx (SImode);
- emit_move_insn (tmp1, gen_lowpart (SImode, val));
-
- /* Insert the SImode value as low element of a V4SImode vector. */
- tmp2 = gen_reg_rtx (V4SImode);
- emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
- emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
-
- ok = (expand_vec_perm_1 (&dperm)
- || expand_vec_perm_broadcast_1 (&dperm));
- gcc_assert (ok);
- return ok;
- }
- goto widen;
-
- case E_V16QImode:
- if (TARGET_AVX2)
- return ix86_vector_duplicate_value (mode, target, val);
-
- if (TARGET_SSE2)
- goto permute;
- goto widen;
-
- widen:
- /* Replicate the value once into the next wider mode and recurse. */
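- /* E.g. a QImode value 0xab is turned into the HImode value 0xabab,
- which is then broadcast across the twice-as-wide vector mode. */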
- {
- machine_mode smode, wsmode, wvmode;
- rtx x;
-
- smode = GET_MODE_INNER (mode);
- wvmode = get_mode_wider_vector (mode);
- wsmode = GET_MODE_INNER (wvmode);
-
- val = convert_modes (wsmode, smode, val, true);
- x = expand_simple_binop (wsmode, ASHIFT, val,
- GEN_INT (GET_MODE_BITSIZE (smode)),
- NULL_RTX, 1, OPTAB_LIB_WIDEN);
- val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
-
- x = gen_reg_rtx (wvmode);
- ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
- gcc_assert (ok);
- emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
- return ok;
- }
-
- case E_V16HImode:
- case E_V32QImode:
- if (TARGET_AVX2)
- return ix86_vector_duplicate_value (mode, target, val);
- else
- {
- machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
- rtx x = gen_reg_rtx (hvmode);
-
- ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
- gcc_assert (ok);
-
- x = gen_rtx_VEC_CONCAT (mode, x, x);
- emit_insn (gen_rtx_SET (target, x));
- }
- return true;
-
- case E_V64QImode:
- case E_V32HImode:
- if (TARGET_AVX512BW)
- return ix86_vector_duplicate_value (mode, target, val);
- else
- {
- machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
- rtx x = gen_reg_rtx (hvmode);
-
- ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
- gcc_assert (ok);
-
- x = gen_rtx_VEC_CONCAT (mode, x, x);
- emit_insn (gen_rtx_SET (target, x));
- }
- return true;
-
- default:
- return false;
- }
-}
-
-/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
- whose ONE_VAR element is VAR, and other elements are zero. Return true
- if successful. */
-
-static bool
-ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
- rtx target, rtx var, int one_var)
-{
- machine_mode vsimode;
- rtx new_target;
- rtx x, tmp;
- bool use_vector_set = false;
- rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
-
- switch (mode)
- {
- case E_V2DImode:
- /* For SSE4.1, we normally use vector set. But if the second
- element is zero and inter-unit moves are OK, we use movq
- instead. */
- use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
- && !(TARGET_INTER_UNIT_MOVES_TO_VEC
- && one_var == 0));
- break;
- case E_V16QImode:
- case E_V4SImode:
- case E_V4SFmode:
- use_vector_set = TARGET_SSE4_1;
- break;
- case E_V8HImode:
- use_vector_set = TARGET_SSE2;
- break;
- case E_V4HImode:
- use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
- break;
- case E_V32QImode:
- case E_V16HImode:
- use_vector_set = TARGET_AVX;
- break;
- case E_V8SImode:
- use_vector_set = TARGET_AVX;
- gen_vec_set_0 = gen_vec_setv8si_0;
- break;
- case E_V8SFmode:
- use_vector_set = TARGET_AVX;
- gen_vec_set_0 = gen_vec_setv8sf_0;
- break;
- case E_V4DFmode:
- use_vector_set = TARGET_AVX;
- gen_vec_set_0 = gen_vec_setv4df_0;
- break;
- case E_V4DImode:
- /* Use ix86_expand_vector_set in 64bit mode only. */
- use_vector_set = TARGET_AVX && TARGET_64BIT;
- gen_vec_set_0 = gen_vec_setv4di_0;
- break;
- case E_V16SImode:
- use_vector_set = TARGET_AVX512F && one_var == 0;
- gen_vec_set_0 = gen_vec_setv16si_0;
- break;
- case E_V16SFmode:
- use_vector_set = TARGET_AVX512F && one_var == 0;
- gen_vec_set_0 = gen_vec_setv16sf_0;
- break;
- case E_V8DFmode:
- use_vector_set = TARGET_AVX512F && one_var == 0;
- gen_vec_set_0 = gen_vec_setv8df_0;
- break;
- case E_V8DImode:
- /* Use ix86_expand_vector_set in 64bit mode only. */
- use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
- gen_vec_set_0 = gen_vec_setv8di_0;
- break;
- default:
- break;
- }
-
- if (use_vector_set)
- {
- if (gen_vec_set_0 && one_var == 0)
- {
- var = force_reg (GET_MODE_INNER (mode), var);
- emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
- return true;
- }
- emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
- var = force_reg (GET_MODE_INNER (mode), var);
- ix86_expand_vector_set (mmx_ok, target, var, one_var);
- return true;
- }
-
- switch (mode)
- {
- case E_V2SFmode:
- case E_V2SImode:
- if (!mmx_ok)
- return false;
- /* FALLTHRU */
-
- case E_V2DFmode:
- case E_V2DImode:
- if (one_var != 0)
- return false;
- var = force_reg (GET_MODE_INNER (mode), var);
- x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
- emit_insn (gen_rtx_SET (target, x));
- return true;
-
- case E_V4SFmode:
- case E_V4SImode:
- if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
- new_target = gen_reg_rtx (mode);
- else
- new_target = target;
- var = force_reg (GET_MODE_INNER (mode), var);
- x = gen_rtx_VEC_DUPLICATE (mode, var);
- x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
- emit_insn (gen_rtx_SET (new_target, x));
- if (one_var != 0)
- {
- /* We need to shuffle the value to the correct position, so
- create a new pseudo to store the intermediate result. */
-
- /* With SSE2, we can use the integer shuffle insns. */
- if (mode != V4SFmode && TARGET_SSE2)
- {
- emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
- const1_rtx,
- GEN_INT (one_var == 1 ? 0 : 1),
- GEN_INT (one_var == 2 ? 0 : 1),
- GEN_INT (one_var == 3 ? 0 : 1)));
- if (target != new_target)
- emit_move_insn (target, new_target);
- return true;
- }
-
- /* Otherwise convert the intermediate result to V4SFmode and
- use the SSE1 shuffle instructions. */
- if (mode != V4SFmode)
- {
- tmp = gen_reg_rtx (V4SFmode);
- emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
- }
- else
- tmp = new_target;
-
- emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
- const1_rtx,
- GEN_INT (one_var == 1 ? 0 : 1),
- GEN_INT (one_var == 2 ? 0+4 : 1+4),
- GEN_INT (one_var == 3 ? 0+4 : 1+4)));
-
- if (mode != V4SFmode)
- emit_move_insn (target, gen_lowpart (V4SImode, tmp));
- else if (tmp != target)
- emit_move_insn (target, tmp);
- }
- else if (target != new_target)
- emit_move_insn (target, new_target);
- return true;
-
- case E_V8HImode:
- case E_V16QImode:
- vsimode = V4SImode;
- goto widen;
- case E_V4HImode:
- case E_V8QImode:
- if (!mmx_ok)
- return false;
- vsimode = V2SImode;
- goto widen;
- widen:
- if (one_var != 0)
- return false;
-
- /* Zero extend the variable element to SImode and recurse. */
- var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
-
- x = gen_reg_rtx (vsimode);
- if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
- var, one_var))
- gcc_unreachable ();
-
- emit_move_insn (target, gen_lowpart (mode, x));
- return true;
-
- default:
- return false;
- }
-}
-
-/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
- consisting of the values in VALS. It is known that all elements
- except ONE_VAR are constants. Return true if successful. */
-
-static bool
-ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
- rtx target, rtx vals, int one_var)
-{
- rtx var = XVECEXP (vals, 0, one_var);
- machine_mode wmode;
- rtx const_vec, x;
-
- const_vec = copy_rtx (vals);
- XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
- const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
-
- switch (mode)
- {
- case E_V2DFmode:
- case E_V2DImode:
- case E_V2SFmode:
- case E_V2SImode:
- /* For the two element vectors, it's just as easy to use
- the general case. */
- return false;
-
- case E_V4DImode:
- /* Use ix86_expand_vector_set in 64bit mode only. */
- if (!TARGET_64BIT)
- return false;
- /* FALLTHRU */
- case E_V4DFmode:
- case E_V8SFmode:
- case E_V8SImode:
- case E_V16HImode:
- case E_V32QImode:
- case E_V4SFmode:
- case E_V4SImode:
- case E_V8HImode:
- case E_V4HImode:
- break;
-
- case E_V16QImode:
- if (TARGET_SSE4_1)
- break;
- wmode = V8HImode;
- goto widen;
- case E_V8QImode:
- wmode = V4HImode;
- goto widen;
- widen:
- /* There's no way to set one QImode entry easily. Combine
- the variable value with its adjacent constant value, and
- promote to an HImode set. */
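- /* E.g. for ONE_VAR == 3 the variable byte is shifted into the high
- half of an HImode value, IORed with the low byte of the constant at
- index 2, and the result is inserted at HImode element 1. */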
- x = XVECEXP (vals, 0, one_var ^ 1);
- if (one_var & 1)
- {
- var = convert_modes (HImode, QImode, var, true);
- var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
- NULL_RTX, 1, OPTAB_LIB_WIDEN);
- x = GEN_INT (INTVAL (x) & 0xff);
- }
- else
- {
- var = convert_modes (HImode, QImode, var, true);
- x = gen_int_mode (UINTVAL (x) << 8, HImode);
- }
- if (x != const0_rtx)
- var = expand_simple_binop (HImode, IOR, var, x, var,
- 1, OPTAB_LIB_WIDEN);
-
- x = gen_reg_rtx (wmode);
- emit_move_insn (x, gen_lowpart (wmode, const_vec));
- ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
-
- emit_move_insn (target, gen_lowpart (mode, x));
- return true;
-
- default:
- return false;
- }
-
- emit_move_insn (target, const_vec);
- ix86_expand_vector_set (mmx_ok, target, var, one_var);
- return true;
-}
-
-/* A subroutine of ix86_expand_vector_init_general. Use vector
- concatenate to handle the most general case: all values variable,
- and none identical. */
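- /* E.g. a V8SImode initializer is first reduced to four V2SImode
- registers, those are concatenated pairwise into two V4SImode halves,
- and the halves are concatenated into the V8SImode target. */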
-
-static void
-ix86_expand_vector_init_concat (machine_mode mode,
- rtx target, rtx *ops, int n)
-{
- machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
- rtx first[16], second[8], third[4];
- rtvec v;
- int i, j;
-
- switch (n)
- {
- case 2:
- switch (mode)
- {
- case E_V16SImode:
- cmode = V8SImode;
- break;
- case E_V16SFmode:
- cmode = V8SFmode;
- break;
- case E_V8DImode:
- cmode = V4DImode;
- break;
- case E_V8DFmode:
- cmode = V4DFmode;
- break;
- case E_V8SImode:
- cmode = V4SImode;
- break;
- case E_V8SFmode:
- cmode = V4SFmode;
- break;
- case E_V4DImode:
- cmode = V2DImode;
- break;
- case E_V4DFmode:
- cmode = V2DFmode;
- break;
- case E_V4SImode:
- cmode = V2SImode;
- break;
- case E_V4SFmode:
- cmode = V2SFmode;
- break;
- case E_V2DImode:
- cmode = DImode;
- break;
- case E_V2SImode:
- cmode = SImode;
- break;
- case E_V2DFmode:
- cmode = DFmode;
- break;
- case E_V2SFmode:
- cmode = SFmode;
- break;
- default:
- gcc_unreachable ();
- }
-
- if (!register_operand (ops[1], cmode))
- ops[1] = force_reg (cmode, ops[1]);
- if (!register_operand (ops[0], cmode))
- ops[0] = force_reg (cmode, ops[0]);
- emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
- ops[1])));
- break;
-
- case 4:
- switch (mode)
- {
- case E_V4DImode:
- cmode = V2DImode;
- break;
- case E_V4DFmode:
- cmode = V2DFmode;
- break;
- case E_V4SImode:
- cmode = V2SImode;
- break;
- case E_V4SFmode:
- cmode = V2SFmode;
- break;
- default:
- gcc_unreachable ();
- }
- goto half;
-
- case 8:
- switch (mode)
- {
- case E_V8DImode:
- cmode = V2DImode;
- hmode = V4DImode;
- break;
- case E_V8DFmode:
- cmode = V2DFmode;
- hmode = V4DFmode;
- break;
- case E_V8SImode:
- cmode = V2SImode;
- hmode = V4SImode;
- break;
- case E_V8SFmode:
- cmode = V2SFmode;
- hmode = V4SFmode;
- break;
- default:
- gcc_unreachable ();
- }
- goto half;
-
- case 16:
- switch (mode)
- {
- case E_V16SImode:
- cmode = V2SImode;
- hmode = V4SImode;
- gmode = V8SImode;
- break;
- case E_V16SFmode:
- cmode = V2SFmode;
- hmode = V4SFmode;
- gmode = V8SFmode;
- break;
- default:
- gcc_unreachable ();
- }
- goto half;
-
-half:
- /* FIXME: We process inputs backward to help RA. PR 36222. */
- i = n - 1;
- j = (n >> 1) - 1;
- for (; i > 0; i -= 2, j--)
- {
- first[j] = gen_reg_rtx (cmode);
- v = gen_rtvec (2, ops[i - 1], ops[i]);
- ix86_expand_vector_init (false, first[j],
- gen_rtx_PARALLEL (cmode, v));
- }
-
- n >>= 1;
- if (n > 4)
- {
- gcc_assert (hmode != VOIDmode);
- gcc_assert (gmode != VOIDmode);
- for (i = j = 0; i < n; i += 2, j++)
- {
- second[j] = gen_reg_rtx (hmode);
- ix86_expand_vector_init_concat (hmode, second [j],
- &first [i], 2);
- }
- n >>= 1;
- for (i = j = 0; i < n; i += 2, j++)
- {
- third[j] = gen_reg_rtx (gmode);
- ix86_expand_vector_init_concat (gmode, third[j],
- &second[i], 2);
- }
- n >>= 1;
- ix86_expand_vector_init_concat (mode, target, third, n);
- }
- else if (n > 2)
- {
- gcc_assert (hmode != VOIDmode);
- for (i = j = 0; i < n; i += 2, j++)
- {
- second[j] = gen_reg_rtx (hmode);
- ix86_expand_vector_init_concat (hmode, second [j],
- &first [i], 2);
- }
- n >>= 1;
- ix86_expand_vector_init_concat (mode, target, second, n);
- }
- else
- ix86_expand_vector_init_concat (mode, target, first, n);
- break;
-
- default:
- gcc_unreachable ();
- }
-}
-
-/* A subroutine of ix86_expand_vector_init_general. Use vector
- interleave to handle the most general case: all values variable,
- and none identical. */
-
-static void
-ix86_expand_vector_init_interleave (machine_mode mode,
- rtx target, rtx *ops, int n)
-{
- machine_mode first_imode, second_imode, third_imode, inner_mode;
- int i, j;
- rtx op0, op1;
- rtx (*gen_load_even) (rtx, rtx, rtx);
- rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
- rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
-
- switch (mode)
- {
- case E_V8HImode:
- gen_load_even = gen_vec_setv8hi;
- gen_interleave_first_low = gen_vec_interleave_lowv4si;
- gen_interleave_second_low = gen_vec_interleave_lowv2di;
- inner_mode = HImode;
- first_imode = V4SImode;
- second_imode = V2DImode;
- third_imode = VOIDmode;
- break;
- case E_V16QImode:
- gen_load_even = gen_vec_setv16qi;
- gen_interleave_first_low = gen_vec_interleave_lowv8hi;
- gen_interleave_second_low = gen_vec_interleave_lowv4si;
- inner_mode = QImode;
- first_imode = V8HImode;
- second_imode = V4SImode;
- third_imode = V2DImode;
- break;
- default:
- gcc_unreachable ();
- }
-
- for (i = 0; i < n; i++)
- {
- /* Extend the odd element to SImode using a paradoxical SUBREG. */
- op0 = gen_reg_rtx (SImode);
- emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
-
- /* Insert the SImode value as low element of V4SImode vector. */
- op1 = gen_reg_rtx (V4SImode);
- op0 = gen_rtx_VEC_MERGE (V4SImode,
- gen_rtx_VEC_DUPLICATE (V4SImode,
- op0),
- CONST0_RTX (V4SImode),
- const1_rtx);
- emit_insn (gen_rtx_SET (op1, op0));
-
- /* Cast the V4SImode vector back to a vector in the original mode. */
- op0 = gen_reg_rtx (mode);
- emit_move_insn (op0, gen_lowpart (mode, op1));
-
- /* Load even elements into the second position. */
- emit_insn (gen_load_even (op0,
- force_reg (inner_mode,
- ops [i + i + 1]),
- const1_rtx));
-
- /* Cast vector to FIRST_IMODE vector. */
- ops[i] = gen_reg_rtx (first_imode);
- emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
- }
-
- /* Interleave low FIRST_IMODE vectors. */
- for (i = j = 0; i < n; i += 2, j++)
- {
- op0 = gen_reg_rtx (first_imode);
- emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
-
- /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
- ops[j] = gen_reg_rtx (second_imode);
- emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
- }
-
- /* Interleave low SECOND_IMODE vectors. */
- switch (second_imode)
- {
- case E_V4SImode:
- for (i = j = 0; i < n / 2; i += 2, j++)
- {
- op0 = gen_reg_rtx (second_imode);
- emit_insn (gen_interleave_second_low (op0, ops[i],
- ops[i + 1]));
-
- /* Cast the SECOND_IMODE vector to the THIRD_IMODE
- vector. */
- ops[j] = gen_reg_rtx (third_imode);
- emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
- }
- second_imode = V2DImode;
- gen_interleave_second_low = gen_vec_interleave_lowv2di;
- /* FALLTHRU */
-
- case E_V2DImode:
- op0 = gen_reg_rtx (second_imode);
- emit_insn (gen_interleave_second_low (op0, ops[0],
- ops[1]));
-
- /* Cast the SECOND_IMODE vector back to a vector in the original
- mode. */
- emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
- break;
-
- default:
- gcc_unreachable ();
- }
-}
-
-/* A subroutine of ix86_expand_vector_init. Handle the most general case:
- all values variable, and none identical. */
-
-static void
-ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
- rtx target, rtx vals)
-{
- rtx ops[64], op0, op1, op2, op3, op4, op5;
- machine_mode half_mode = VOIDmode;
- machine_mode quarter_mode = VOIDmode;
- int n, i;
-
- switch (mode)
- {
- case E_V2SFmode:
- case E_V2SImode:
- if (!mmx_ok && !TARGET_SSE)
- break;
- /* FALLTHRU */
-
- case E_V16SImode:
- case E_V16SFmode:
- case E_V8DFmode:
- case E_V8DImode:
- case E_V8SFmode:
- case E_V8SImode:
- case E_V4DFmode:
- case E_V4DImode:
- case E_V4SFmode:
- case E_V4SImode:
- case E_V2DFmode:
- case E_V2DImode:
- n = GET_MODE_NUNITS (mode);
- for (i = 0; i < n; i++)
- ops[i] = XVECEXP (vals, 0, i);
- ix86_expand_vector_init_concat (mode, target, ops, n);
- return;
-
- case E_V2TImode:
- for (i = 0; i < 2; i++)
- ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
- op0 = gen_reg_rtx (V4DImode);
- ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
- emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
- return;
-
- case E_V4TImode:
- for (i = 0; i < 4; i++)
- ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
- ops[4] = gen_reg_rtx (V4DImode);
- ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
- ops[5] = gen_reg_rtx (V4DImode);
- ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
- op0 = gen_reg_rtx (V8DImode);
- ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
- emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
- return;
-
- case E_V32QImode:
- half_mode = V16QImode;
- goto half;
-
- case E_V16HImode:
- half_mode = V8HImode;
- goto half;
-
-half:
- n = GET_MODE_NUNITS (mode);
- for (i = 0; i < n; i++)
- ops[i] = XVECEXP (vals, 0, i);
- op0 = gen_reg_rtx (half_mode);
- op1 = gen_reg_rtx (half_mode);
- ix86_expand_vector_init_interleave (half_mode, op0, ops,
- n >> 2);
- ix86_expand_vector_init_interleave (half_mode, op1,
- &ops [n >> 1], n >> 2);
- emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
- return;
-
- case E_V64QImode:
- quarter_mode = V16QImode;
- half_mode = V32QImode;
- goto quarter;
-
- case E_V32HImode:
- quarter_mode = V8HImode;
- half_mode = V16HImode;
- goto quarter;
-
-quarter:
- n = GET_MODE_NUNITS (mode);
- for (i = 0; i < n; i++)
- ops[i] = XVECEXP (vals, 0, i);
- op0 = gen_reg_rtx (quarter_mode);
- op1 = gen_reg_rtx (quarter_mode);
- op2 = gen_reg_rtx (quarter_mode);
- op3 = gen_reg_rtx (quarter_mode);
- op4 = gen_reg_rtx (half_mode);
- op5 = gen_reg_rtx (half_mode);
- ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
- n >> 3);
- ix86_expand_vector_init_interleave (quarter_mode, op1,
- &ops [n >> 2], n >> 3);
- ix86_expand_vector_init_interleave (quarter_mode, op2,
- &ops [n >> 1], n >> 3);
- ix86_expand_vector_init_interleave (quarter_mode, op3,
- &ops [(n >> 1) | (n >> 2)], n >> 3);
- emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
- emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
- emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
- return;
-
- case E_V16QImode:
- if (!TARGET_SSE4_1)
- break;
- /* FALLTHRU */
-
- case E_V8HImode:
- if (!TARGET_SSE2)
- break;
-
- /* Don't use ix86_expand_vector_init_interleave if we can't
- move from GPR to SSE register directly. */
- if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
- break;
-
- n = GET_MODE_NUNITS (mode);
- for (i = 0; i < n; i++)
- ops[i] = XVECEXP (vals, 0, i);
- ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
- return;
-
- case E_V4HImode:
- case E_V8QImode:
- break;
-
- default:
- gcc_unreachable ();
- }
-
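- /* Fallback for the narrow-element cases above: pack the elements into
- word_mode chunks with shifts and IORs, then assemble the vector from
- those words. */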
- {
- int i, j, n_elts, n_words, n_elt_per_word;
- machine_mode inner_mode;
- rtx words[4], shift;
-
- inner_mode = GET_MODE_INNER (mode);
- n_elts = GET_MODE_NUNITS (mode);
- n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
- n_elt_per_word = n_elts / n_words;
- shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
-
- for (i = 0; i < n_words; ++i)
- {
- rtx word = NULL_RTX;
-
- for (j = 0; j < n_elt_per_word; ++j)
- {
- rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
- elt = convert_modes (word_mode, inner_mode, elt, true);
-
- if (j == 0)
- word = elt;
- else
- {
- word = expand_simple_binop (word_mode, ASHIFT, word, shift,
- word, 1, OPTAB_LIB_WIDEN);
- word = expand_simple_binop (word_mode, IOR, word, elt,
- word, 1, OPTAB_LIB_WIDEN);
- }
- }
-
- words[i] = word;
- }
-
- if (n_words == 1)
- emit_move_insn (target, gen_lowpart (mode, words[0]));
- else if (n_words == 2)
- {
- rtx tmp = gen_reg_rtx (mode);
- emit_clobber (tmp);
- emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
- emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
- emit_move_insn (target, tmp);
- }
- else if (n_words == 4)
- {
- rtx tmp = gen_reg_rtx (V4SImode);
- gcc_assert (word_mode == SImode);
- vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
- ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
- emit_move_insn (target, gen_lowpart (mode, tmp));
- }
- else
- gcc_unreachable ();
- }
-}
-
-/* Initialize vector TARGET via VALS. Suppress the use of MMX
- instructions unless MMX_OK is true. */
-
-void
-ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
-{
- machine_mode mode = GET_MODE (target);
- machine_mode inner_mode = GET_MODE_INNER (mode);
- int n_elts = GET_MODE_NUNITS (mode);
- int n_var = 0, one_var = -1;
- bool all_same = true, all_const_zero = true;
- int i;
- rtx x;
-
- /* Handle first the case where the initializer elements are themselves vectors. */
- if (n_elts != XVECLEN (vals, 0))
- {
- rtx subtarget = target;
- x = XVECEXP (vals, 0, 0);
- gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
- if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
- {
- rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
- if (inner_mode == QImode || inner_mode == HImode)
- {
- unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
- mode = mode_for_vector (SImode, n_bits / 4).require ();
- inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
- ops[0] = gen_lowpart (inner_mode, ops[0]);
- ops[1] = gen_lowpart (inner_mode, ops[1]);
- subtarget = gen_reg_rtx (mode);
- }
- ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
- if (subtarget != target)
- emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
- return;
- }
- gcc_unreachable ();
- }
-
- for (i = 0; i < n_elts; ++i)
- {
- x = XVECEXP (vals, 0, i);
- if (!(CONST_SCALAR_INT_P (x)
- || CONST_DOUBLE_P (x)
- || CONST_FIXED_P (x)))
- n_var++, one_var = i;
- else if (x != CONST0_RTX (inner_mode))
- all_const_zero = false;
- if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
- all_same = false;
- }
-
- /* Constants are best loaded from the constant pool. */
- if (n_var == 0)
- {
- emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
- return;
- }
-
- /* If all values are identical, broadcast the value. */
- if (all_same
- && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
- XVECEXP (vals, 0, 0)))
- return;
-
- /* Values where only one field is non-constant are best loaded from
- the pool and overwritten via move later. */
- if (n_var == 1)
- {
- if (all_const_zero
- && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
- XVECEXP (vals, 0, one_var),
- one_var))
- return;
-
- if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
- return;
- }
-
- ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
-}
-
-void
-ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
-{
- machine_mode mode = GET_MODE (target);
- machine_mode inner_mode = GET_MODE_INNER (mode);
- machine_mode half_mode;
- bool use_vec_merge = false;
- rtx tmp;
- static rtx (*gen_extract[6][2]) (rtx, rtx)
- = {
- { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
- { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
- { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
- { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
- { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
- { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
- };
- static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
- = {
- { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
- { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
- { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
- { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
- { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
- { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
- };
- int i, j, n;
- machine_mode mmode = VOIDmode;
- rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
-
- switch (mode)
- {
- case E_V2SFmode:
- case E_V2SImode:
- if (mmx_ok)
- {
- tmp = gen_reg_rtx (GET_MODE_INNER (mode));
- ix86_expand_vector_extract (true, tmp, target, 1 - elt);
- if (elt == 0)
- tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
- else
- tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
- emit_insn (gen_rtx_SET (target, tmp));
- return;
- }
- break;
-
- case E_V2DImode:
- use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
- if (use_vec_merge)
- break;
-
- tmp = gen_reg_rtx (GET_MODE_INNER (mode));
- ix86_expand_vector_extract (false, tmp, target, 1 - elt);
- if (elt == 0)
- tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
- else
- tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
- emit_insn (gen_rtx_SET (target, tmp));
- return;
-
- case E_V2DFmode:
- {
- rtx op0, op1;
-
- /* For the two element vectors, we implement a VEC_CONCAT with
- the extraction of the other element. */
-
- tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
- tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
-
- if (elt == 0)
- op0 = val, op1 = tmp;
- else
- op0 = tmp, op1 = val;
-
- tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
- emit_insn (gen_rtx_SET (target, tmp));
- }
- return;
-
- case E_V4SFmode:
- use_vec_merge = TARGET_SSE4_1;
- if (use_vec_merge)
- break;
-
- switch (elt)
- {
- case 0:
- use_vec_merge = true;
- break;
-
- case 1:
- /* tmp = target = A B C D */
- tmp = copy_to_reg (target);
- /* target = A A B B */
- emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
- /* target = X A B B */
- ix86_expand_vector_set (false, target, val, 0);
- /* target = A X C D */
- emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
- const1_rtx, const0_rtx,
- GEN_INT (2+4), GEN_INT (3+4)));
- return;
-
- case 2:
- /* tmp = target = A B C D */
- tmp = copy_to_reg (target);
- /* tmp = X B C D */
- ix86_expand_vector_set (false, tmp, val, 0);
- /* target = A B X D */
- emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
- const0_rtx, const1_rtx,
- GEN_INT (0+4), GEN_INT (3+4)));
- return;
-
- case 3:
- /* tmp = target = A B C D */
- tmp = copy_to_reg (target);
- /* tmp = X B C D */
- ix86_expand_vector_set (false, tmp, val, 0);
- /* target = A B C X */
- emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
- const0_rtx, const1_rtx,
- GEN_INT (2+4), GEN_INT (0+4)));
- return;
-
- default:
- gcc_unreachable ();
- }
- break;
-
- case E_V4SImode:
- use_vec_merge = TARGET_SSE4_1;
- if (use_vec_merge)
- break;
-
- /* Element 0 handled by vec_merge below. */
- if (elt == 0)
- {
- use_vec_merge = true;
- break;
- }
-
- if (TARGET_SSE2)
- {
- /* With SSE2, use integer shuffles to swap element 0 and ELT,
- store into element 0, then shuffle them back. */
-
- rtx order[4];
-
- order[0] = GEN_INT (elt);
- order[1] = const1_rtx;
- order[2] = const2_rtx;
- order[3] = GEN_INT (3);
- order[elt] = const0_rtx;
-
- emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
- order[1], order[2], order[3]));
-
- ix86_expand_vector_set (false, target, val, 0);
-
- emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
- order[1], order[2], order[3]));
- }
- else
- {
- /* For SSE1, we have to reuse the V4SF code. */
- rtx t = gen_reg_rtx (V4SFmode);
- emit_move_insn (t, gen_lowpart (V4SFmode, target));
- ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
- emit_move_insn (target, gen_lowpart (mode, t));
- }
- return;
-
- case E_V8HImode:
- use_vec_merge = TARGET_SSE2;
- break;
- case E_V4HImode:
- use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
- break;
-
- case E_V16QImode:
- use_vec_merge = TARGET_SSE4_1;
- break;
-
- case E_V8QImode:
- break;
-
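- /* For the 256-bit modes below, extract the 128-bit half that contains
- element ELT, set the element within that half, and insert the half
- back. J indexes the gen_extract/gen_insert tables above and N is the
- number of elements in one half. */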
- case E_V32QImode:
- half_mode = V16QImode;
- j = 0;
- n = 16;
- goto half;
-
- case E_V16HImode:
- half_mode = V8HImode;
- j = 1;
- n = 8;
- goto half;
-
- case E_V8SImode:
- half_mode = V4SImode;
- j = 2;
- n = 4;
- goto half;
-
- case E_V4DImode:
- half_mode = V2DImode;
- j = 3;
- n = 2;
- goto half;
-
- case E_V8SFmode:
- half_mode = V4SFmode;
- j = 4;
- n = 4;
- goto half;
-
- case E_V4DFmode:
- half_mode = V2DFmode;
- j = 5;
- n = 2;
- goto half;
-
-half:
- /* Compute offset. */
- i = elt / n;
- elt %= n;
-
- gcc_assert (i <= 1);
-
- /* Extract the half. */
- tmp = gen_reg_rtx (half_mode);
- emit_insn (gen_extract[j][i] (tmp, target));
-
- /* Put val in tmp at elt. */
- ix86_expand_vector_set (false, tmp, val, elt);
-
- /* Put it back. */
- emit_insn (gen_insert[j][i] (target, target, tmp));
- return;
-
- case E_V8DFmode:
- if (TARGET_AVX512F)
- {
- mmode = QImode;
- gen_blendm = gen_avx512f_blendmv8df;
- }
- break;
-
- case E_V8DImode:
- if (TARGET_AVX512F)
- {
- mmode = QImode;
- gen_blendm = gen_avx512f_blendmv8di;
- }
- break;
-
- case E_V16SFmode:
- if (TARGET_AVX512F)
- {
- mmode = HImode;
- gen_blendm = gen_avx512f_blendmv16sf;
- }
- break;
-
- case E_V16SImode:
- if (TARGET_AVX512F)
- {
- mmode = HImode;
- gen_blendm = gen_avx512f_blendmv16si;
- }
- break;
-
- case E_V32HImode:
- if (TARGET_AVX512BW)
- {
- mmode = SImode;
- gen_blendm = gen_avx512bw_blendmv32hi;
- }
- else if (TARGET_AVX512F)
- {
- half_mode = E_V8HImode;
- n = 8;
- goto quarter;
- }
- break;
-
- case E_V64QImode:
- if (TARGET_AVX512BW)
- {
- mmode = DImode;
- gen_blendm = gen_avx512bw_blendmv64qi;
- }
- else if (TARGET_AVX512F)
- {
- half_mode = E_V16QImode;
- n = 16;
- goto quarter;
- }
- break;
-
-quarter:
- /* Compute offset. */
- i = elt / n;
- elt %= n;
-
- gcc_assert (i <= 3);
-
- {
- /* Extract the quarter. */
- tmp = gen_reg_rtx (V4SImode);
- rtx tmp2 = gen_lowpart (V16SImode, target);
- rtx mask = gen_reg_rtx (QImode);
-
- emit_move_insn (mask, constm1_rtx);
- emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
- tmp, mask));
-
- tmp2 = gen_reg_rtx (half_mode);
- emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
- tmp = tmp2;
-
- /* Put val in tmp at elt. */
- ix86_expand_vector_set (false, tmp, val, elt);
-
- /* Put it back. */
- tmp2 = gen_reg_rtx (V16SImode);
- rtx tmp3 = gen_lowpart (V16SImode, target);
- mask = gen_reg_rtx (HImode);
- emit_move_insn (mask, constm1_rtx);
- tmp = gen_lowpart (V4SImode, tmp);
- emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
- tmp3, mask));
- emit_move_insn (target, gen_lowpart (mode, tmp2));
- }
- return;
-
- default:
- break;
- }
-
- if (mmode != VOIDmode)
- {
- tmp = gen_reg_rtx (mode);
- emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
- /* The avx512*_blendm<mode> expanders have a different operand order
- from VEC_MERGE. In VEC_MERGE, the first input operand is used for
- elements where the mask is set and the second input operand otherwise;
- in {sse,avx}*_*blend* the first input operand is used for elements
- where the mask is clear and the second input operand otherwise. */
- emit_insn (gen_blendm (target, target, tmp,
- force_reg (mmode,
- gen_int_mode (HOST_WIDE_INT_1U << elt,
- mmode))));
- }
- else if (use_vec_merge)
- {
- tmp = gen_rtx_VEC_DUPLICATE (mode, val);
- tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
- GEN_INT (HOST_WIDE_INT_1U << elt));
- emit_insn (gen_rtx_SET (target, tmp));
- }
- else
- {
- rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
-
- emit_move_insn (mem, target);
-
- tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
- emit_move_insn (tmp, val);
-
- emit_move_insn (target, mem);
- }
-}
-
-void
-ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
-{
- machine_mode mode = GET_MODE (vec);
- machine_mode inner_mode = GET_MODE_INNER (mode);
- bool use_vec_extr = false;
- rtx tmp;
-
- switch (mode)
- {
- case E_V2SImode:
- case E_V2SFmode:
- if (!mmx_ok)
- break;
- /* FALLTHRU */
-
- case E_V2DFmode:
- case E_V2DImode:
- case E_V2TImode:
- case E_V4TImode:
- use_vec_extr = true;
- break;
-
- case E_V4SFmode:
- use_vec_extr = TARGET_SSE4_1;
- if (use_vec_extr)
- break;
-
- switch (elt)
- {
- case 0:
- tmp = vec;
- break;
-
- case 1:
- case 3:
- tmp = gen_reg_rtx (mode);
- emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
- GEN_INT (elt), GEN_INT (elt),
- GEN_INT (elt+4), GEN_INT (elt+4)));
- break;
-
- case 2:
- tmp = gen_reg_rtx (mode);
- emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
- break;
-
- default:
- gcc_unreachable ();
- }
- vec = tmp;
- use_vec_extr = true;
- elt = 0;
- break;
-
- case E_V4SImode:
- use_vec_extr = TARGET_SSE4_1;
- if (use_vec_extr)
- break;
-
- if (TARGET_SSE2)
- {
- switch (elt)
- {
- case 0:
- tmp = vec;
- break;
-
- case 1:
- case 3:
- tmp = gen_reg_rtx (mode);
- emit_insn (gen_sse2_pshufd_1 (tmp, vec,
- GEN_INT (elt), GEN_INT (elt),
- GEN_INT (elt), GEN_INT (elt)));
- break;
-
- case 2:
- tmp = gen_reg_rtx (mode);
- emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
- break;
-
- default:
- gcc_unreachable ();
- }
- vec = tmp;
- use_vec_extr = true;
- elt = 0;
- }
- else
- {
- /* For SSE1, we have to reuse the V4SF code. */
- ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
- gen_lowpart (V4SFmode, vec), elt);
- return;
- }
- break;
-
- case E_V8HImode:
- use_vec_extr = TARGET_SSE2;
- break;
- case E_V4HImode:
- use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
- break;
-
- case E_V16QImode:
- use_vec_extr = TARGET_SSE4_1;
- break;
-
- case E_V8SFmode:
- if (TARGET_AVX)
- {
- tmp = gen_reg_rtx (V4SFmode);
- if (elt < 4)
- emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
- else
- emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
- ix86_expand_vector_extract (false, target, tmp, elt & 3);
- return;
- }
- break;
-
- case E_V4DFmode:
- if (TARGET_AVX)
- {
- tmp = gen_reg_rtx (V2DFmode);
- if (elt < 2)
- emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
- else
- emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
- ix86_expand_vector_extract (false, target, tmp, elt & 1);
- return;
- }
- break;
-
- case E_V32QImode:
- if (TARGET_AVX)
- {
- tmp = gen_reg_rtx (V16QImode);
- if (elt < 16)
- emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
- else
- emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
- ix86_expand_vector_extract (false, target, tmp, elt & 15);
- return;
- }
- break;
-
- case E_V16HImode:
- if (TARGET_AVX)
- {
- tmp = gen_reg_rtx (V8HImode);
- if (elt < 8)
- emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
- else
- emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
- ix86_expand_vector_extract (false, target, tmp, elt & 7);
- return;
- }
- break;
-
- case E_V8SImode:
- if (TARGET_AVX)
- {
- tmp = gen_reg_rtx (V4SImode);
- if (elt < 4)
- emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
- else
- emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
- ix86_expand_vector_extract (false, target, tmp, elt & 3);
- return;
- }
- break;
-
- case E_V4DImode:
- if (TARGET_AVX)
- {
- tmp = gen_reg_rtx (V2DImode);
- if (elt < 2)
- emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
- else
- emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
- ix86_expand_vector_extract (false, target, tmp, elt & 1);
- return;
- }
- break;
-
- case E_V32HImode:
- if (TARGET_AVX512BW)
- {
- tmp = gen_reg_rtx (V16HImode);
- if (elt < 16)
- emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
- else
- emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
- ix86_expand_vector_extract (false, target, tmp, elt & 15);
- return;
- }
- break;
-
- case E_V64QImode:
- if (TARGET_AVX512BW)
- {
- tmp = gen_reg_rtx (V32QImode);
- if (elt < 32)
- emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
- else
- emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
- ix86_expand_vector_extract (false, target, tmp, elt & 31);
- return;
- }
- break;
-
- case E_V16SFmode:
- tmp = gen_reg_rtx (V8SFmode);
- if (elt < 8)
- emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
- else
- emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
- ix86_expand_vector_extract (false, target, tmp, elt & 7);
- return;
-
- case E_V8DFmode:
- tmp = gen_reg_rtx (V4DFmode);
- if (elt < 4)
- emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
- else
- emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
- ix86_expand_vector_extract (false, target, tmp, elt & 3);
- return;
-
- case E_V16SImode:
- tmp = gen_reg_rtx (V8SImode);
- if (elt < 8)
- emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
- else
- emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
- ix86_expand_vector_extract (false, target, tmp, elt & 7);
- return;
-
- case E_V8DImode:
- tmp = gen_reg_rtx (V4DImode);
- if (elt < 4)
- emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
- else
- emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
- ix86_expand_vector_extract (false, target, tmp, elt & 3);
- return;
-
- case E_V8QImode:
- /* ??? Could extract the appropriate HImode element and shift. */
- default:
- break;
- }
-
- if (use_vec_extr)
- {
- tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
- tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
-
- /* Let the rtl optimizers know about the zero extension performed. */
- if (inner_mode == QImode || inner_mode == HImode)
- {
- tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
- target = gen_lowpart (SImode, target);
- }
-
- emit_insn (gen_rtx_SET (target, tmp));
- }
- else
- {
- rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
-
- emit_move_insn (mem, vec);
-
- tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
- emit_move_insn (target, tmp);
- }
-}
-
-/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
- to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
- The upper bits of DEST are undefined, though they shouldn't cause
- exceptions (some bits from src or all zeros are ok). */
-
-static void
-emit_reduc_half (rtx dest, rtx src, int i)
-{
- rtx tem, d = dest;
- switch (GET_MODE (src))
- {
- case E_V4SFmode:
- if (i == 128)
- tem = gen_sse_movhlps (dest, src, src);
- else
- tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
- GEN_INT (1 + 4), GEN_INT (1 + 4));
- break;
- case E_V2DFmode:
- tem = gen_vec_interleave_highv2df (dest, src, src);
- break;
- case E_V16QImode:
- case E_V8HImode:
- case E_V4SImode:
- case E_V2DImode:
- d = gen_reg_rtx (V1TImode);
- tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
- GEN_INT (i / 2));
- break;
- case E_V8SFmode:
- if (i == 256)
- tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
- else
- tem = gen_avx_shufps256 (dest, src, src,
- GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
- break;
- case E_V4DFmode:
- if (i == 256)
- tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
- else
- tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
- break;
- case E_V32QImode:
- case E_V16HImode:
- case E_V8SImode:
- case E_V4DImode:
- if (i == 256)
- {
- if (GET_MODE (dest) != V4DImode)
- d = gen_reg_rtx (V4DImode);
- tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
- gen_lowpart (V4DImode, src),
- const1_rtx);
- }
- else
- {
- d = gen_reg_rtx (V2TImode);
- tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
- GEN_INT (i / 2));
- }
- break;
- case E_V64QImode:
- case E_V32HImode:
- case E_V16SImode:
- case E_V16SFmode:
- case E_V8DImode:
- case E_V8DFmode:
- if (i > 128)
- tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
- gen_lowpart (V16SImode, src),
- gen_lowpart (V16SImode, src),
- GEN_INT (0x4 + (i == 512 ? 4 : 0)),
- GEN_INT (0x5 + (i == 512 ? 4 : 0)),
- GEN_INT (0x6 + (i == 512 ? 4 : 0)),
- GEN_INT (0x7 + (i == 512 ? 4 : 0)),
- GEN_INT (0xC), GEN_INT (0xD),
- GEN_INT (0xE), GEN_INT (0xF),
- GEN_INT (0x10), GEN_INT (0x11),
- GEN_INT (0x12), GEN_INT (0x13),
- GEN_INT (0x14), GEN_INT (0x15),
- GEN_INT (0x16), GEN_INT (0x17));
- else
- tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
- gen_lowpart (V16SImode, src),
- GEN_INT (i == 128 ? 0x2 : 0x1),
- GEN_INT (0x3),
- GEN_INT (0x3),
- GEN_INT (0x3),
- GEN_INT (i == 128 ? 0x6 : 0x5),
- GEN_INT (0x7),
- GEN_INT (0x7),
- GEN_INT (0x7),
- GEN_INT (i == 128 ? 0xA : 0x9),
- GEN_INT (0xB),
- GEN_INT (0xB),
- GEN_INT (0xB),
- GEN_INT (i == 128 ? 0xE : 0xD),
- GEN_INT (0xF),
- GEN_INT (0xF),
- GEN_INT (0xF));
- break;
- default:
- gcc_unreachable ();
- }
- emit_insn (tem);
- if (d != dest)
- emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
-}
-
-/* Expand a vector reduction. FN is the binary pattern to reduce;
- DEST is the destination; IN is the input vector. */
-
-void
-ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
-{
- rtx half, dst, vec = in;
- machine_mode mode = GET_MODE (in);
- int i;
-
- /* SSE4 has a special instruction for V8HImode UMIN reduction. */
- if (TARGET_SSE4_1
- && mode == V8HImode
- && fn == gen_uminv8hi3)
- {
- emit_insn (gen_sse4_1_phminposuw (dest, in));
- return;
- }
-
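- /* The loop below repeatedly folds the high half of the remaining
-    width onto the low half with FN, so the full reduction ends up in
-    the low element of DEST.  An illustrative V4SImode sequence:
-	half = high 64 bits of IN shifted down;  t = FN (half, IN);
-	half = high 32 bits of t shifted down;   DEST = FN (half, t);  */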
- for (i = GET_MODE_BITSIZE (mode);
- i > GET_MODE_UNIT_BITSIZE (mode);
- i >>= 1)
- {
- half = gen_reg_rtx (mode);
- emit_reduc_half (half, vec, i);
- if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
- dst = dest;
- else
- dst = gen_reg_rtx (mode);
- emit_insn (fn (dst, half, vec));
- vec = dst;
- }
-}
-\f
-/* Target hook for scalar_mode_supported_p. */
-static bool
-ix86_scalar_mode_supported_p (scalar_mode mode)
-{
- if (DECIMAL_FLOAT_MODE_P (mode))
- return default_decimal_float_supported_p ();
- else if (mode == TFmode)
- return true;
- else
- return default_scalar_mode_supported_p (mode);
-}
-
-/* Implements target hook vector_mode_supported_p. */
-static bool
-ix86_vector_mode_supported_p (machine_mode mode)
-{
- if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
- return true;
- if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
- return true;
- if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
- return true;
- if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
- return true;
- if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
- return true;
- if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
- return true;
- return false;
-}
-
-/* Target hook for c_mode_for_suffix. */
-static machine_mode
-ix86_c_mode_for_suffix (char suffix)
-{
- if (suffix == 'q')
- return TFmode;
- if (suffix == 'w')
- return XFmode;
-
- return VOIDmode;
-}
-
-/* Worker function for TARGET_MD_ASM_ADJUST.
-
- We implement asm flag outputs, and maintain source compatibility
- with the old cc0-based compiler. */
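- /* For illustration (not emitted literally): given
-	bool c;
-	asm ("bt %2, %1" : "=@ccc" (c) : "r" (x), "r" (bit));
-    the "=@ccc" output is rewritten below so that the asm's real output
-    is the flags register, and a compare/zero-extend sequence then
-    stores the carry condition into C.  */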
-
-static rtx_insn *
-ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
- vec<const char *> &constraints,
- vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
-{
- bool saw_asm_flag = false;
-
- start_sequence ();
- for (unsigned i = 0, n = outputs.length (); i < n; ++i)
- {
- const char *con = constraints[i];
- if (strncmp (con, "=@cc", 4) != 0)
- continue;
- con += 4;
- if (strchr (con, ',') != NULL)
- {
- error ("alternatives not allowed in asm flag output");
- continue;
- }
-
- bool invert = false;
- if (con[0] == 'n')
- invert = true, con++;
-
- machine_mode mode = CCmode;
- rtx_code code = UNKNOWN;
-
- switch (con[0])
- {
- case 'a':
- if (con[1] == 0)
- mode = CCAmode, code = EQ;
- else if (con[1] == 'e' && con[2] == 0)
- mode = CCCmode, code = NE;
- break;
- case 'b':
- if (con[1] == 0)
- mode = CCCmode, code = EQ;
- else if (con[1] == 'e' && con[2] == 0)
- mode = CCAmode, code = NE;
- break;
- case 'c':
- if (con[1] == 0)
- mode = CCCmode, code = EQ;
- break;
- case 'e':
- if (con[1] == 0)
- mode = CCZmode, code = EQ;
- break;
- case 'g':
- if (con[1] == 0)
- mode = CCGCmode, code = GT;
- else if (con[1] == 'e' && con[2] == 0)
- mode = CCGCmode, code = GE;
- break;
- case 'l':
- if (con[1] == 0)
- mode = CCGCmode, code = LT;
- else if (con[1] == 'e' && con[2] == 0)
- mode = CCGCmode, code = LE;
- break;
- case 'o':
- if (con[1] == 0)
- mode = CCOmode, code = EQ;
- break;
- case 'p':
- if (con[1] == 0)
- mode = CCPmode, code = EQ;
- break;
- case 's':
- if (con[1] == 0)
- mode = CCSmode, code = EQ;
- break;
- case 'z':
- if (con[1] == 0)
- mode = CCZmode, code = EQ;
- break;
- }
- if (code == UNKNOWN)
- {
- error ("unknown asm flag output %qs", constraints[i]);
- continue;
- }
- if (invert)
- code = reverse_condition (code);
-
- rtx dest = outputs[i];
- if (!saw_asm_flag)
- {
- /* This is the first asm flag output. Here we put the flags
- register in as the real output and adjust the condition to
- allow it. */
- constraints[i] = "=Bf";
- outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
- saw_asm_flag = true;
- }
- else
- {
- /* We don't need the flags register as output twice. */
- constraints[i] = "=X";
- outputs[i] = gen_rtx_SCRATCH (SImode);
- }
-
- rtx x = gen_rtx_REG (mode, FLAGS_REG);
- x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
-
- machine_mode dest_mode = GET_MODE (dest);
- if (!SCALAR_INT_MODE_P (dest_mode))
- {
- error ("invalid type for asm flag output");
- continue;
- }
-
- if (dest_mode == DImode && !TARGET_64BIT)
- dest_mode = SImode;
-
- if (dest_mode != QImode)
- {
- rtx destqi = gen_reg_rtx (QImode);
- emit_insn (gen_rtx_SET (destqi, x));
-
- if (TARGET_ZERO_EXTEND_WITH_AND
- && optimize_function_for_speed_p (cfun))
- {
- x = force_reg (dest_mode, const0_rtx);
-
- emit_insn (gen_movstrictqi
- (gen_lowpart (QImode, x), destqi));
- }
- else
- x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
- }
-
- if (dest_mode != GET_MODE (dest))
- {
- rtx tmp = gen_reg_rtx (SImode);
-
- emit_insn (gen_rtx_SET (tmp, x));
- emit_insn (gen_zero_extendsidi2 (dest, tmp));
- }
- else
- emit_insn (gen_rtx_SET (dest, x));
- }
- rtx_insn *seq = get_insns ();
- end_sequence ();
-
- if (saw_asm_flag)
- return seq;
- else
- {
- /* If we had no asm flag outputs, clobber the flags. */
- clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
- SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
- return NULL;
- }
-}
-
-/* Implements the targetm.encode_section_info target hook. */
-
-static void ATTRIBUTE_UNUSED
-ix86_encode_section_info (tree decl, rtx rtl, int first)
-{
- default_encode_section_info (decl, rtl, first);
-
- if (ix86_in_large_data_p (decl))
- SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
-}
-
-/* Worker function for REVERSE_CONDITION. */
-
-enum rtx_code
-ix86_reverse_condition (enum rtx_code code, machine_mode mode)
-{
- return (mode == CCFPmode
- ? reverse_condition_maybe_unordered (code)
- : reverse_condition (code));
-}
-
-/* Output code to perform an x87 FP register move, from OPERANDS[1]
- to OPERANDS[0]. */
-
-const char *
-output_387_reg_move (rtx_insn *insn, rtx *operands)
-{
- if (REG_P (operands[0]))
- {
- if (REG_P (operands[1])
- && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
- {
- if (REGNO (operands[0]) == FIRST_STACK_REG)
- return output_387_ffreep (operands, 0);
- return "fstp\t%y0";
- }
- if (STACK_TOP_P (operands[0]))
- return "fld%Z1\t%y1";
- return "fst\t%y0";
- }
- else if (MEM_P (operands[0]))
- {
- gcc_assert (REG_P (operands[1]));
- if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
- return "fstp%Z0\t%y0";
- else
- {
- /* There is no non-popping store to memory for XFmode.
- So if we need one, follow the store with a load. */
- if (GET_MODE (operands[0]) == XFmode)
- return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
- else
- return "fst%Z0\t%y0";
- }
- }
- else
- gcc_unreachable();
-}
-
-/* Output code to perform a conditional jump to LABEL, if C2 flag in
- FP status register is set. */
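- /* C2 is bit 10 of the FPU status word, i.e. bit 2 of the high byte
-    of the FNSTSW result.  With SAHF that byte is copied into EFLAGS,
-    where C2 lands in PF (hence the UNORDERED test); otherwise the high
-    byte is tested directly against 0x04.  */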
-
-void
-ix86_emit_fp_unordered_jump (rtx label)
-{
- rtx reg = gen_reg_rtx (HImode);
- rtx_insn *insn;
- rtx temp;
-
- emit_insn (gen_x86_fnstsw_1 (reg));
-
- if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
- {
- emit_insn (gen_x86_sahf_1 (reg));
-
- temp = gen_rtx_REG (CCmode, FLAGS_REG);
- temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
- }
- else
- {
- emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
-
- temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
- temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
- }
-
- temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
- gen_rtx_LABEL_REF (VOIDmode, label),
- pc_rtx);
- insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
- predict_jump (REG_BR_PROB_BASE * 10 / 100);
- JUMP_LABEL (insn) = label;
-}
-
-/* Output code to perform a sinh XFmode calculation. */
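- /* With e1 = expm1 (|x|) = exp (|x|) - 1, the sequence below computes
-	e1 / (e1 + 1.0) + e1 = (1 - exp (-|x|)) + (exp (|x|) - 1)
-			     = 2 * sinh (|x|),
-    flips the sign for negative x and scales by 0.5.  The expm1 form
-    stays accurate for small |x|.  */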
-
-void ix86_emit_i387_sinh (rtx op0, rtx op1)
-{
- rtx e1 = gen_reg_rtx (XFmode);
- rtx e2 = gen_reg_rtx (XFmode);
- rtx scratch = gen_reg_rtx (HImode);
- rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
- rtx half = const_double_from_real_value (dconsthalf, XFmode);
- rtx cst1, tmp;
- rtx_code_label *jump_label = gen_label_rtx ();
- rtx_insn *insn;
-
- /* scratch = fxam (op1) */
- emit_insn (gen_fxamxf2_i387 (scratch, op1));
-
- /* e1 = expm1 (|op1|) */
- emit_insn (gen_absxf2 (e2, op1));
- emit_insn (gen_expm1xf2 (e1, e2));
-
- /* e2 = e1 / (e1 + 1.0) + e1 */
- cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
- emit_insn (gen_addxf3 (e2, e1, cst1));
- emit_insn (gen_divxf3 (e2, e1, e2));
- emit_insn (gen_addxf3 (e2, e2, e1));
-
- /* flags = signbit (op1) */
- emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
-
- /* if (flags) then e2 = -e2 */
- tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
- gen_rtx_EQ (VOIDmode, flags, const0_rtx),
- gen_rtx_LABEL_REF (VOIDmode, jump_label),
- pc_rtx);
- insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
- predict_jump (REG_BR_PROB_BASE * 50 / 100);
- JUMP_LABEL (insn) = jump_label;
-
- emit_insn (gen_negxf2 (e2, e2));
-
- emit_label (jump_label);
- LABEL_NUSES (jump_label) = 1;
-
- /* op0 = 0.5 * e2 */
- half = force_reg (XFmode, half);
- emit_insn (gen_mulxf3 (op0, e2, half));
-}
-
-/* Output code to perform a cosh XFmode calculation. */
-
-void ix86_emit_i387_cosh (rtx op0, rtx op1)
-{
- rtx e1 = gen_reg_rtx (XFmode);
- rtx e2 = gen_reg_rtx (XFmode);
- rtx half = const_double_from_real_value (dconsthalf, XFmode);
- rtx cst1;
-
- /* e1 = exp (op1) */
- emit_insn (gen_expxf2 (e1, op1));
-
- /* e2 = e1 + 1.0 / e1 */
- cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
- emit_insn (gen_divxf3 (e2, cst1, e1));
- emit_insn (gen_addxf3 (e2, e1, e2));
-
- /* op0 = 0.5 * e2 */
- half = force_reg (XFmode, half);
- emit_insn (gen_mulxf3 (op0, e2, half));
-}
-
-/* Output code to perform a tanh XFmode calculation. */
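- /* With e1 = expm1 (-2*|x|) = exp (-2*|x|) - 1, the code below forms
-	e1 / (e1 + 2.0) = (exp (-2*|x|) - 1) / (exp (-2*|x|) + 1)
-			= -tanh (|x|)
-    and then negates the result when x is positive, yielding tanh (x).  */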
-
-void ix86_emit_i387_tanh (rtx op0, rtx op1)
-{
- rtx e1 = gen_reg_rtx (XFmode);
- rtx e2 = gen_reg_rtx (XFmode);
- rtx scratch = gen_reg_rtx (HImode);
- rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
- rtx cst2, tmp;
- rtx_code_label *jump_label = gen_label_rtx ();
- rtx_insn *insn;
-
- /* scratch = fxam (op1) */
- emit_insn (gen_fxamxf2_i387 (scratch, op1));
-
- /* e1 = expm1 (-|2 * op1|) */
- emit_insn (gen_addxf3 (e2, op1, op1));
- emit_insn (gen_absxf2 (e2, e2));
- emit_insn (gen_negxf2 (e2, e2));
- emit_insn (gen_expm1xf2 (e1, e2));
-
- /* e2 = e1 / (e1 + 2.0) */
- cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
- emit_insn (gen_addxf3 (e2, e1, cst2));
- emit_insn (gen_divxf3 (e2, e1, e2));
-
- /* flags = signbit (op1) */
- emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
-
- /* if (!flags) then e2 = -e2 */
- tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
- gen_rtx_NE (VOIDmode, flags, const0_rtx),
- gen_rtx_LABEL_REF (VOIDmode, jump_label),
- pc_rtx);
- insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
- predict_jump (REG_BR_PROB_BASE * 50 / 100);
- JUMP_LABEL (insn) = jump_label;
-
- emit_insn (gen_negxf2 (e2, e2));
-
- emit_label (jump_label);
- LABEL_NUSES (jump_label) = 1;
-
- emit_move_insn (op0, e2);
-}
-
-/* Output code to perform an asinh XFmode calculation. */
-
-void ix86_emit_i387_asinh (rtx op0, rtx op1)
-{
- rtx e1 = gen_reg_rtx (XFmode);
- rtx e2 = gen_reg_rtx (XFmode);
- rtx scratch = gen_reg_rtx (HImode);
- rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
- rtx cst1, tmp;
- rtx_code_label *jump_label = gen_label_rtx ();
- rtx_insn *insn;
-
- /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
- emit_insn (gen_mulxf3 (e1, op1, op1));
- cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
- emit_insn (gen_addxf3 (e2, e1, cst1));
- emit_insn (gen_sqrtxf2 (e2, e2));
- emit_insn (gen_addxf3 (e2, e2, cst1));
-
- /* e1 = e1 / e2 */
- emit_insn (gen_divxf3 (e1, e1, e2));
-
- /* scratch = fxam (op1) */
- emit_insn (gen_fxamxf2_i387 (scratch, op1));
-
- /* e1 = e1 + |op1| */
- emit_insn (gen_absxf2 (e2, op1));
- emit_insn (gen_addxf3 (e1, e1, e2));
-
- /* e2 = log1p (e1) */
- ix86_emit_i387_log1p (e2, e1);
-
- /* flags = signbit (op1) */
- emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
-
- /* if (flags) then e2 = -e2 */
- tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
- gen_rtx_EQ (VOIDmode, flags, const0_rtx),
- gen_rtx_LABEL_REF (VOIDmode, jump_label),
- pc_rtx);
- insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
- predict_jump (REG_BR_PROB_BASE * 50 / 100);
- JUMP_LABEL (insn) = jump_label;
-
- emit_insn (gen_negxf2 (e2, e2));
-
- emit_label (jump_label);
- LABEL_NUSES (jump_label) = 1;
-
- emit_move_insn (op0, e2);
-}
-
-/* Output code to perform an acosh XFmode calculation. */
-
-void ix86_emit_i387_acosh (rtx op0, rtx op1)
-{
- rtx e1 = gen_reg_rtx (XFmode);
- rtx e2 = gen_reg_rtx (XFmode);
- rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
-
- /* e2 = sqrt (op1 + 1.0) */
- emit_insn (gen_addxf3 (e2, op1, cst1));
- emit_insn (gen_sqrtxf2 (e2, e2));
-
- /* e1 = sqrt (op1 - 1.0) */
- emit_insn (gen_subxf3 (e1, op1, cst1));
- emit_insn (gen_sqrtxf2 (e1, e1));
-
- /* e1 = e1 * e2 */
- emit_insn (gen_mulxf3 (e1, e1, e2));
-
- /* e1 = e1 + op1 */
- emit_insn (gen_addxf3 (e1, e1, op1));
-
- /* op0 = log (e1) */
- emit_insn (gen_logxf2 (op0, e1));
-}
-
-/* Output code to perform an atanh XFmode calculation. */
-
-void ix86_emit_i387_atanh (rtx op0, rtx op1)
-{
- rtx e1 = gen_reg_rtx (XFmode);
- rtx e2 = gen_reg_rtx (XFmode);
- rtx scratch = gen_reg_rtx (HImode);
- rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
- rtx half = const_double_from_real_value (dconsthalf, XFmode);
- rtx cst1, tmp;
- rtx_code_label *jump_label = gen_label_rtx ();
- rtx_insn *insn;
-
- /* scratch = fxam (op1) */
- emit_insn (gen_fxamxf2_i387 (scratch, op1));
-
- /* e2 = |op1| */
- emit_insn (gen_absxf2 (e2, op1));
-
- /* e1 = -(e2 + e2) / (e2 + 1.0) */
- cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
- emit_insn (gen_addxf3 (e1, e2, cst1));
- emit_insn (gen_addxf3 (e2, e2, e2));
- emit_insn (gen_negxf2 (e2, e2));
- emit_insn (gen_divxf3 (e1, e2, e1));
-
- /* e2 = log1p (e1) */
- ix86_emit_i387_log1p (e2, e1);
-
- /* flags = signbit (op1) */
- emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
-
- /* if (!flags) then e2 = -e2 */
- tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
- gen_rtx_NE (VOIDmode, flags, const0_rtx),
- gen_rtx_LABEL_REF (VOIDmode, jump_label),
- pc_rtx);
- insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
- predict_jump (REG_BR_PROB_BASE * 50 / 100);
- JUMP_LABEL (insn) = jump_label;
-
- emit_insn (gen_negxf2 (e2, e2));
-
- emit_label (jump_label);
- LABEL_NUSES (jump_label) = 1;
-
- /* op0 = 0.5 * e2 */
- half = force_reg (XFmode, half);
- emit_insn (gen_mulxf3 (op0, e2, half));
-}
-
-/* Output code to perform a log1p XFmode calculation. */
-
-void ix86_emit_i387_log1p (rtx op0, rtx op1)
-{
- rtx_code_label *label1 = gen_label_rtx ();
- rtx_code_label *label2 = gen_label_rtx ();
-
- rtx tmp = gen_reg_rtx (XFmode);
- rtx res = gen_reg_rtx (XFmode);
- rtx cst, cstln2, cst1;
- rtx_insn *insn;
-
- cst = const_double_from_real_value
- (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
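- /* 0.29289... is 1 - sqrt(2)/2, the bound on |x| up to which fyl2xp1
-    is specified to operate; larger arguments take the fyl2x path on
-    1.0 + x below.  */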
- cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
-
- emit_insn (gen_absxf2 (tmp, op1));
-
- cst = force_reg (XFmode, cst);
- ix86_expand_branch (GE, tmp, cst, label1);
- predict_jump (REG_BR_PROB_BASE * 10 / 100);
- insn = get_last_insn ();
- JUMP_LABEL (insn) = label1;
-
- emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
- emit_jump (label2);
-
- emit_label (label1);
- LABEL_NUSES (label1) = 1;
-
- cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
- emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
- emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
-
- emit_label (label2);
- LABEL_NUSES (label2) = 1;
-
- emit_move_insn (op0, res);
-}
-
-/* Emit code for round calculation. */
-void ix86_emit_i387_round (rtx op0, rtx op1)
-{
- machine_mode inmode = GET_MODE (op1);
- machine_mode outmode = GET_MODE (op0);
- rtx e1 = gen_reg_rtx (XFmode);
- rtx e2 = gen_reg_rtx (XFmode);
- rtx scratch = gen_reg_rtx (HImode);
- rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
- rtx half = const_double_from_real_value (dconsthalf, XFmode);
- rtx res = gen_reg_rtx (outmode);
- rtx_code_label *jump_label = gen_label_rtx ();
- rtx (*floor_insn) (rtx, rtx);
- rtx (*neg_insn) (rtx, rtx);
- rtx_insn *insn;
- rtx tmp;
-
- switch (inmode)
- {
- case E_SFmode:
- case E_DFmode:
- tmp = gen_reg_rtx (XFmode);
-
- emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
- op1 = tmp;
- break;
- case E_XFmode:
- break;
- default:
- gcc_unreachable ();
- }
-
- switch (outmode)
- {
- case E_SFmode:
- floor_insn = gen_frndintxf2_floor;
- neg_insn = gen_negsf2;
- break;
- case E_DFmode:
- floor_insn = gen_frndintxf2_floor;
- neg_insn = gen_negdf2;
- break;
- case E_XFmode:
- floor_insn = gen_frndintxf2_floor;
- neg_insn = gen_negxf2;
- break;
- case E_HImode:
- floor_insn = gen_lfloorxfhi2;
- neg_insn = gen_neghi2;
- break;
- case E_SImode:
- floor_insn = gen_lfloorxfsi2;
- neg_insn = gen_negsi2;
- break;
- case E_DImode:
- floor_insn = gen_lfloorxfdi2;
- neg_insn = gen_negdi2;
- break;
- default:
- gcc_unreachable ();
- }
-
- /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
-
- /* scratch = fxam(op1) */
- emit_insn (gen_fxamxf2_i387 (scratch, op1));
-
- /* e1 = fabs(op1) */
- emit_insn (gen_absxf2 (e1, op1));
-
- /* e2 = e1 + 0.5 */
- half = force_reg (XFmode, half);
- emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
-
- /* res = floor(e2) */
- switch (outmode)
- {
- case E_SFmode:
- case E_DFmode:
- {
- tmp = gen_reg_rtx (XFmode);
-
- emit_insn (floor_insn (tmp, e2));
- emit_insn (gen_rtx_SET (res,
- gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
- UNSPEC_TRUNC_NOOP)));
- }
- break;
- default:
- emit_insn (floor_insn (res, e2));
- }
-
- /* flags = signbit(a) */
- emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
-
- /* if (flags) then res = -res */
- tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
- gen_rtx_EQ (VOIDmode, flags, const0_rtx),
- gen_rtx_LABEL_REF (VOIDmode, jump_label),
- pc_rtx);
- insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
- predict_jump (REG_BR_PROB_BASE * 50 / 100);
- JUMP_LABEL (insn) = jump_label;
-
- emit_insn (neg_insn (res, res));
-
- emit_label (jump_label);
- LABEL_NUSES (jump_label) = 1;
-
- emit_move_insn (op0, res);
-}
-
-/* Output code to perform a Newton-Raphson approximation of a single precision
- floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
-
-void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
-{
- rtx x0, x1, e0, e1;
-
- x0 = gen_reg_rtx (mode);
- e0 = gen_reg_rtx (mode);
- e1 = gen_reg_rtx (mode);
- x1 = gen_reg_rtx (mode);
-
- /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
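- /* With x0 = rcp (b), the expression above is x0 * (2.0 - b * x0), a
-    single Newton-Raphson step on 1/b; it roughly doubles the precision
-    of the hardware reciprocal estimate, which is enough for
-    single-precision -ffast-math division.  */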
-
- b = force_reg (mode, b);
-
- /* x0 = rcp(b) estimate */
- if (mode == V16SFmode || mode == V8DFmode)
- {
- if (TARGET_AVX512ER)
- {
- emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
- UNSPEC_RCP28)));
- /* res = a * x0 */
- emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
- return;
- }
- else
- emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
- UNSPEC_RCP14)));
- }
- else
- emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
- UNSPEC_RCP)));
-
- /* e0 = x0 * b */
- emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
-
- /* e0 = x0 * e0 */
- emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
-
- /* e1 = x0 + x0 */
- emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
-
- /* x1 = e1 - e0 */
- emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
-
- /* res = a * x1 */
- emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
-}
-
-/* Output code to perform a Newton-Raphson approximation of a
- single precision floating point [reciprocal] square root. */
-
-void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
-{
- rtx x0, e0, e1, e2, e3, mthree, mhalf;
- REAL_VALUE_TYPE r;
- int unspec;
-
- x0 = gen_reg_rtx (mode);
- e0 = gen_reg_rtx (mode);
- e1 = gen_reg_rtx (mode);
- e2 = gen_reg_rtx (mode);
- e3 = gen_reg_rtx (mode);
-
- if (TARGET_AVX512ER && mode == V16SFmode)
- {
- if (recip)
- /* res = rsqrt28(a) estimate */
- emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
- UNSPEC_RSQRT28)));
- else
- {
- /* x0 = rsqrt28(a) estimate */
- emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
- UNSPEC_RSQRT28)));
- /* res = rcp28(x0) estimate */
- emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
- UNSPEC_RCP28)));
- }
- return;
- }
-
- real_from_integer (&r, VOIDmode, -3, SIGNED);
- mthree = const_double_from_real_value (r, SFmode);
-
- real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
- mhalf = const_double_from_real_value (r, SFmode);
- unspec = UNSPEC_RSQRT;
-
- if (VECTOR_MODE_P (mode))
- {
- mthree = ix86_build_const_vector (mode, true, mthree);
- mhalf = ix86_build_const_vector (mode, true, mhalf);
- /* There is no 512-bit rsqrt. There is however rsqrt14. */
- if (GET_MODE_SIZE (mode) == 64)
- unspec = UNSPEC_RSQRT14;
- }
-
- /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
- rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
-
- a = force_reg (mode, a);
-
- /* x0 = rsqrt(a) estimate */
- emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
- unspec)));
-
- /* If a == 0.0, filter out the infinite rsqrt estimate to avoid
- a NaN result for sqrt (0.0). */
- if (!recip)
- {
- rtx zero = force_reg (mode, CONST0_RTX(mode));
- rtx mask;
-
- /* Handle masked compare. */
- if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
- {
- mask = gen_reg_rtx (HImode);
- /* Imm value 0x4 corresponds to not-equal comparison. */
- emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
- emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
- }
- else
- {
- mask = gen_reg_rtx (mode);
- emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
- emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
+ default:
+ break;
}
}
- /* e0 = x0 * a */
- emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
- /* e1 = e0 * x0 */
- emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
-
- /* e2 = e1 - 3. */
- mthree = force_reg (mode, mthree);
- emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
+#ifdef SUBTARGET_FOLD_BUILTIN
+ return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
+#endif
- mhalf = force_reg (mode, mhalf);
- if (recip)
- /* e3 = -.5 * x0 */
- emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
- else
- /* e3 = -.5 * e0 */
- emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
- /* ret = e2 * e3 */
- emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
+ return NULL_TREE;
}
-#ifdef TARGET_SOLARIS
-/* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
+/* Fold a MD builtin (use ix86_fold_builtin for folding into
+ constant) in GIMPLE. */
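+/* The folds performed below are simplifications such as: tzcnt/lzcnt
+   of a provably non-zero argument becomes __builtin_c[tl]z{,ll}; bzhi
+   with an index covering the whole operand, and pdep/pext with an
+   all-ones mask, become their first argument; vector shifts by zero
+   become their first argument, and logical shifts by at least the
+   element precision become a zero vector.  */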
-static void
-i386_solaris_elf_named_section (const char *name, unsigned int flags,
- tree decl)
+bool
+ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
{
- /* With Binutils 2.15, the "@unwind" marker must be specified on
- every occurrence of the ".eh_frame" section, not just the first
- one. */
- if (TARGET_64BIT
- && strcmp (name, ".eh_frame") == 0)
- {
- fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
- flags & SECTION_WRITE ? "aw" : "a");
- return;
- }
+ gimple *stmt = gsi_stmt (*gsi);
+ tree fndecl = gimple_call_fndecl (stmt);
+ gcc_checking_assert (fndecl && fndecl_built_in_p (fndecl, BUILT_IN_MD));
+ int n_args = gimple_call_num_args (stmt);
+ enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
+ tree decl = NULL_TREE;
+ tree arg0, arg1;
+ enum rtx_code rcode;
+ unsigned HOST_WIDE_INT count;
+ bool is_vshift;
-#ifndef USE_GAS
- if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
+ switch (fn_code)
{
- solaris_elf_asm_comdat_section (name, flags, decl);
- return;
- }
+ case IX86_BUILTIN_TZCNT32:
+ decl = builtin_decl_implicit (BUILT_IN_CTZ);
+ goto fold_tzcnt_lzcnt;
- /* Solaris/x86 as uses the same syntax for the SHF_EXCLUDE flags as the
- SPARC assembler. One cannot mix single-letter flags and #exclude, so
- only emit the latter here. */
- if (flags & SECTION_EXCLUDE)
- {
- fprintf (asm_out_file, "\t.section\t%s,#exclude\n", name);
- return;
- }
-#endif
+ case IX86_BUILTIN_TZCNT64:
+ decl = builtin_decl_implicit (BUILT_IN_CTZLL);
+ goto fold_tzcnt_lzcnt;
- default_elf_asm_named_section (name, flags, decl);
-}
-#endif /* TARGET_SOLARIS */
+ case IX86_BUILTIN_LZCNT32:
+ decl = builtin_decl_implicit (BUILT_IN_CLZ);
+ goto fold_tzcnt_lzcnt;
-/* Return the mangling of TYPE if it is an extended fundamental type. */
+ case IX86_BUILTIN_LZCNT64:
+ decl = builtin_decl_implicit (BUILT_IN_CLZLL);
+ goto fold_tzcnt_lzcnt;
-static const char *
-ix86_mangle_type (const_tree type)
-{
- type = TYPE_MAIN_VARIANT (type);
+ fold_tzcnt_lzcnt:
+ gcc_assert (n_args == 1);
+ arg0 = gimple_call_arg (stmt, 0);
+ if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt))
+ {
+ int prec = TYPE_PRECISION (TREE_TYPE (arg0));
+ /* If arg0 is provably non-zero, optimize into the generic
+ __builtin_c[tl]z{,ll} functions, which the middle-end
+ handles better. */
+ if (!expr_not_equal_to (arg0, wi::zero (prec)))
+ return false;
- if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
- && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
- return NULL;
+ location_t loc = gimple_location (stmt);
+ gimple *g = gimple_build_call (decl, 1, arg0);
+ gimple_set_location (g, loc);
+ tree lhs = make_ssa_name (integer_type_node);
+ gimple_call_set_lhs (g, lhs);
+ gsi_insert_before (gsi, g, GSI_SAME_STMT);
+ g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs);
+ gimple_set_location (g, loc);
+ gsi_replace (gsi, g, false);
+ return true;
+ }
+ break;
- switch (TYPE_MODE (type))
- {
- case E_TFmode:
- /* __float128 is "g". */
- return "g";
- case E_XFmode:
- /* "long double" or __float80 is "e". */
- return "e";
- default:
- return NULL;
- }
-}
+ case IX86_BUILTIN_BZHI32:
+ case IX86_BUILTIN_BZHI64:
+ gcc_assert (n_args == 2);
+ arg1 = gimple_call_arg (stmt, 1);
+ if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
+ {
+ unsigned int idx = tree_to_uhwi (arg1) & 0xff;
+ arg0 = gimple_call_arg (stmt, 0);
+ if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
+ break;
+ location_t loc = gimple_location (stmt);
+ gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
+ gimple_set_location (g, loc);
+ gsi_replace (gsi, g, false);
+ return true;
+ }
+ break;
-static GTY(()) tree ix86_tls_stack_chk_guard_decl;
+ case IX86_BUILTIN_PDEP32:
+ case IX86_BUILTIN_PDEP64:
+ case IX86_BUILTIN_PEXT32:
+ case IX86_BUILTIN_PEXT64:
+ gcc_assert (n_args == 2);
+ arg1 = gimple_call_arg (stmt, 1);
+ if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
+ {
+ location_t loc = gimple_location (stmt);
+ arg0 = gimple_call_arg (stmt, 0);
+ gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
+ gimple_set_location (g, loc);
+ gsi_replace (gsi, g, false);
+ return true;
+ }
+ break;
-static tree
-ix86_stack_protect_guard (void)
-{
- if (TARGET_SSP_TLS_GUARD)
- {
- tree type_node = lang_hooks.types.type_for_mode (ptr_mode, 1);
- int qual = ENCODE_QUAL_ADDR_SPACE (ix86_stack_protector_guard_reg);
- tree type = build_qualified_type (type_node, qual);
- tree t;
+ case IX86_BUILTIN_PSLLD:
+ case IX86_BUILTIN_PSLLD128:
+ case IX86_BUILTIN_PSLLD128_MASK:
+ case IX86_BUILTIN_PSLLD256:
+ case IX86_BUILTIN_PSLLD256_MASK:
+ case IX86_BUILTIN_PSLLD512:
+ case IX86_BUILTIN_PSLLDI:
+ case IX86_BUILTIN_PSLLDI128:
+ case IX86_BUILTIN_PSLLDI128_MASK:
+ case IX86_BUILTIN_PSLLDI256:
+ case IX86_BUILTIN_PSLLDI256_MASK:
+ case IX86_BUILTIN_PSLLDI512:
+ case IX86_BUILTIN_PSLLQ:
+ case IX86_BUILTIN_PSLLQ128:
+ case IX86_BUILTIN_PSLLQ128_MASK:
+ case IX86_BUILTIN_PSLLQ256:
+ case IX86_BUILTIN_PSLLQ256_MASK:
+ case IX86_BUILTIN_PSLLQ512:
+ case IX86_BUILTIN_PSLLQI:
+ case IX86_BUILTIN_PSLLQI128:
+ case IX86_BUILTIN_PSLLQI128_MASK:
+ case IX86_BUILTIN_PSLLQI256:
+ case IX86_BUILTIN_PSLLQI256_MASK:
+ case IX86_BUILTIN_PSLLQI512:
+ case IX86_BUILTIN_PSLLW:
+ case IX86_BUILTIN_PSLLW128:
+ case IX86_BUILTIN_PSLLW128_MASK:
+ case IX86_BUILTIN_PSLLW256:
+ case IX86_BUILTIN_PSLLW256_MASK:
+ case IX86_BUILTIN_PSLLW512_MASK:
+ case IX86_BUILTIN_PSLLWI:
+ case IX86_BUILTIN_PSLLWI128:
+ case IX86_BUILTIN_PSLLWI128_MASK:
+ case IX86_BUILTIN_PSLLWI256:
+ case IX86_BUILTIN_PSLLWI256_MASK:
+ case IX86_BUILTIN_PSLLWI512_MASK:
+ rcode = ASHIFT;
+ is_vshift = false;
+ goto do_shift;
+ case IX86_BUILTIN_PSRAD:
+ case IX86_BUILTIN_PSRAD128:
+ case IX86_BUILTIN_PSRAD128_MASK:
+ case IX86_BUILTIN_PSRAD256:
+ case IX86_BUILTIN_PSRAD256_MASK:
+ case IX86_BUILTIN_PSRAD512:
+ case IX86_BUILTIN_PSRADI:
+ case IX86_BUILTIN_PSRADI128:
+ case IX86_BUILTIN_PSRADI128_MASK:
+ case IX86_BUILTIN_PSRADI256:
+ case IX86_BUILTIN_PSRADI256_MASK:
+ case IX86_BUILTIN_PSRADI512:
+ case IX86_BUILTIN_PSRAQ128_MASK:
+ case IX86_BUILTIN_PSRAQ256_MASK:
+ case IX86_BUILTIN_PSRAQ512:
+ case IX86_BUILTIN_PSRAQI128_MASK:
+ case IX86_BUILTIN_PSRAQI256_MASK:
+ case IX86_BUILTIN_PSRAQI512:
+ case IX86_BUILTIN_PSRAW:
+ case IX86_BUILTIN_PSRAW128:
+ case IX86_BUILTIN_PSRAW128_MASK:
+ case IX86_BUILTIN_PSRAW256:
+ case IX86_BUILTIN_PSRAW256_MASK:
+ case IX86_BUILTIN_PSRAW512:
+ case IX86_BUILTIN_PSRAWI:
+ case IX86_BUILTIN_PSRAWI128:
+ case IX86_BUILTIN_PSRAWI128_MASK:
+ case IX86_BUILTIN_PSRAWI256:
+ case IX86_BUILTIN_PSRAWI256_MASK:
+ case IX86_BUILTIN_PSRAWI512:
+ rcode = ASHIFTRT;
+ is_vshift = false;
+ goto do_shift;
+ case IX86_BUILTIN_PSRLD:
+ case IX86_BUILTIN_PSRLD128:
+ case IX86_BUILTIN_PSRLD128_MASK:
+ case IX86_BUILTIN_PSRLD256:
+ case IX86_BUILTIN_PSRLD256_MASK:
+ case IX86_BUILTIN_PSRLD512:
+ case IX86_BUILTIN_PSRLDI:
+ case IX86_BUILTIN_PSRLDI128:
+ case IX86_BUILTIN_PSRLDI128_MASK:
+ case IX86_BUILTIN_PSRLDI256:
+ case IX86_BUILTIN_PSRLDI256_MASK:
+ case IX86_BUILTIN_PSRLDI512:
+ case IX86_BUILTIN_PSRLQ:
+ case IX86_BUILTIN_PSRLQ128:
+ case IX86_BUILTIN_PSRLQ128_MASK:
+ case IX86_BUILTIN_PSRLQ256:
+ case IX86_BUILTIN_PSRLQ256_MASK:
+ case IX86_BUILTIN_PSRLQ512:
+ case IX86_BUILTIN_PSRLQI:
+ case IX86_BUILTIN_PSRLQI128:
+ case IX86_BUILTIN_PSRLQI128_MASK:
+ case IX86_BUILTIN_PSRLQI256:
+ case IX86_BUILTIN_PSRLQI256_MASK:
+ case IX86_BUILTIN_PSRLQI512:
+ case IX86_BUILTIN_PSRLW:
+ case IX86_BUILTIN_PSRLW128:
+ case IX86_BUILTIN_PSRLW128_MASK:
+ case IX86_BUILTIN_PSRLW256:
+ case IX86_BUILTIN_PSRLW256_MASK:
+ case IX86_BUILTIN_PSRLW512:
+ case IX86_BUILTIN_PSRLWI:
+ case IX86_BUILTIN_PSRLWI128:
+ case IX86_BUILTIN_PSRLWI128_MASK:
+ case IX86_BUILTIN_PSRLWI256:
+ case IX86_BUILTIN_PSRLWI256_MASK:
+ case IX86_BUILTIN_PSRLWI512:
+ rcode = LSHIFTRT;
+ is_vshift = false;
+ goto do_shift;
+ case IX86_BUILTIN_PSLLVV16HI:
+ case IX86_BUILTIN_PSLLVV16SI:
+ case IX86_BUILTIN_PSLLVV2DI:
+ case IX86_BUILTIN_PSLLVV2DI_MASK:
+ case IX86_BUILTIN_PSLLVV32HI:
+ case IX86_BUILTIN_PSLLVV4DI:
+ case IX86_BUILTIN_PSLLVV4DI_MASK:
+ case IX86_BUILTIN_PSLLVV4SI:
+ case IX86_BUILTIN_PSLLVV4SI_MASK:
+ case IX86_BUILTIN_PSLLVV8DI:
+ case IX86_BUILTIN_PSLLVV8HI:
+ case IX86_BUILTIN_PSLLVV8SI:
+ case IX86_BUILTIN_PSLLVV8SI_MASK:
+ rcode = ASHIFT;
+ is_vshift = true;
+ goto do_shift;
+ case IX86_BUILTIN_PSRAVQ128:
+ case IX86_BUILTIN_PSRAVQ256:
+ case IX86_BUILTIN_PSRAVV16HI:
+ case IX86_BUILTIN_PSRAVV16SI:
+ case IX86_BUILTIN_PSRAVV32HI:
+ case IX86_BUILTIN_PSRAVV4SI:
+ case IX86_BUILTIN_PSRAVV4SI_MASK:
+ case IX86_BUILTIN_PSRAVV8DI:
+ case IX86_BUILTIN_PSRAVV8HI:
+ case IX86_BUILTIN_PSRAVV8SI:
+ case IX86_BUILTIN_PSRAVV8SI_MASK:
+ rcode = ASHIFTRT;
+ is_vshift = true;
+ goto do_shift;
+ case IX86_BUILTIN_PSRLVV16HI:
+ case IX86_BUILTIN_PSRLVV16SI:
+ case IX86_BUILTIN_PSRLVV2DI:
+ case IX86_BUILTIN_PSRLVV2DI_MASK:
+ case IX86_BUILTIN_PSRLVV32HI:
+ case IX86_BUILTIN_PSRLVV4DI:
+ case IX86_BUILTIN_PSRLVV4DI_MASK:
+ case IX86_BUILTIN_PSRLVV4SI:
+ case IX86_BUILTIN_PSRLVV4SI_MASK:
+ case IX86_BUILTIN_PSRLVV8DI:
+ case IX86_BUILTIN_PSRLVV8HI:
+ case IX86_BUILTIN_PSRLVV8SI:
+ case IX86_BUILTIN_PSRLVV8SI_MASK:
+ rcode = LSHIFTRT;
+ is_vshift = true;
+ goto do_shift;
- if (global_options_set.x_ix86_stack_protector_guard_symbol_str)
+ do_shift:
+ gcc_assert (n_args >= 2);
+ arg0 = gimple_call_arg (stmt, 0);
+ arg1 = gimple_call_arg (stmt, 1);
+ if (n_args > 2)
{
- t = ix86_tls_stack_chk_guard_decl;
-
- if (t == NULL)
- {
- rtx x;
-
- t = build_decl
- (UNKNOWN_LOCATION, VAR_DECL,
- get_identifier (ix86_stack_protector_guard_symbol_str),
- type);
- TREE_STATIC (t) = 1;
- TREE_PUBLIC (t) = 1;
- DECL_EXTERNAL (t) = 1;
- TREE_USED (t) = 1;
- TREE_THIS_VOLATILE (t) = 1;
- DECL_ARTIFICIAL (t) = 1;
- DECL_IGNORED_P (t) = 1;
-
- /* Do not share RTL as the declaration is visible outside of
- current function. */
- x = DECL_RTL (t);
- RTX_FLAG (x, used) = 1;
-
- ix86_tls_stack_chk_guard_decl = t;
- }
+ /* This is a masked shift. Only optimize if the mask is all ones. */
+ tree argl = gimple_call_arg (stmt, n_args - 1);
+ if (!tree_fits_uhwi_p (argl))
+ break;
+ unsigned HOST_WIDE_INT mask = tree_to_uhwi (argl);
+ unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+ if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U)
+ break;
}
- else
+ if (is_vshift)
{
- tree asptrtype = build_pointer_type (type);
-
- t = build_int_cst (asptrtype, ix86_stack_protector_guard_offset);
- t = build2 (MEM_REF, asptrtype, t,
- build_int_cst (asptrtype, 0));
- TREE_THIS_VOLATILE (t) = 1;
+ if (TREE_CODE (arg1) != VECTOR_CST)
+ break;
+ count = TYPE_PRECISION (TREE_TYPE (TREE_TYPE (arg0)));
+ if (integer_zerop (arg1))
+ count = 0;
+ else if (rcode == ASHIFTRT)
+ break;
+ else
+ for (unsigned int i = 0; i < VECTOR_CST_NELTS (arg1); ++i)
+ {
+ tree elt = VECTOR_CST_ELT (arg1, i);
+ if (!wi::neg_p (wi::to_wide (elt))
+ && wi::to_widest (elt) < count)
+ return false;
+ }
}
-
- return t;
- }
-
- return default_stack_protect_guard ();
-}
-
-/* For 32-bit code we can save PIC register setup by using
- __stack_chk_fail_local hidden function instead of calling
- __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC
- register, so it is better to call __stack_chk_fail directly. */
-
-static tree ATTRIBUTE_UNUSED
-ix86_stack_protect_fail (void)
-{
- return TARGET_64BIT
- ? default_external_stack_protect_fail ()
- : default_hidden_stack_protect_fail ();
-}
-
-/* Select a format to encode pointers in exception handling data. CODE
- is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
- true if the symbol may be affected by dynamic relocations.
-
- ??? All x86 object file formats are capable of representing this.
- After all, the relocation needed is the same as for the call insn.
- Whether or not a particular assembler allows us to enter such, I
- guess we'll have to see. */
-int
-asm_preferred_eh_data_format (int code, int global)
-{
- if (flag_pic)
- {
- int type = DW_EH_PE_sdata8;
- if (!TARGET_64BIT
- || ix86_cmodel == CM_SMALL_PIC
- || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
- type = DW_EH_PE_sdata4;
- return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
- }
- if (ix86_cmodel == CM_SMALL
- || (ix86_cmodel == CM_MEDIUM && code))
- return DW_EH_PE_udata4;
- return DW_EH_PE_absptr;
-}
-\f
-/* Expand copysign from SIGN to the positive value ABS_VALUE
- storing in RESULT. If MASK is non-null, it shall be a mask to mask out
- the sign-bit. */
-static void
-ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
-{
- machine_mode mode = GET_MODE (sign);
- rtx sgn = gen_reg_rtx (mode);
- if (mask == NULL_RTX)
- {
- machine_mode vmode;
-
- if (mode == SFmode)
- vmode = V4SFmode;
- else if (mode == DFmode)
- vmode = V2DFmode;
else
- vmode = mode;
-
- mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
- if (!VECTOR_MODE_P (mode))
{
- /* We need to generate a scalar mode mask in this case. */
- rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
- tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
- mask = gen_reg_rtx (mode);
- emit_insn (gen_rtx_SET (mask, tmp));
+ arg1 = ix86_vector_shift_count (arg1);
+ if (!arg1)
+ break;
+ count = tree_to_uhwi (arg1);
}
- }
- else
- mask = gen_rtx_NOT (mode, mask);
- emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
- emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
-}
-
-/* Expand fabs (OP0) and return a new rtx that holds the result. The
- mask for masking out the sign-bit is stored in *SMASK, if that is
- non-null. */
-static rtx
-ix86_expand_sse_fabs (rtx op0, rtx *smask)
-{
- machine_mode vmode, mode = GET_MODE (op0);
- rtx xa, mask;
+ if (count == 0)
+ {
+ /* Just return the first argument for shift by 0. */
+ location_t loc = gimple_location (stmt);
+ gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
+ gimple_set_location (g, loc);
+ gsi_replace (gsi, g, false);
+ return true;
+ }
+ if (rcode != ASHIFTRT
+ && count >= TYPE_PRECISION (TREE_TYPE (TREE_TYPE (arg0))))
+ {
+ /* For shift counts equal to or greater than the precision, the
+ result is zero, except for arithmetic right shifts. */
+ location_t loc = gimple_location (stmt);
+ gimple *g = gimple_build_assign (gimple_call_lhs (stmt),
+ build_zero_cst (TREE_TYPE (arg0)));
+ gimple_set_location (g, loc);
+ gsi_replace (gsi, g, false);
+ return true;
+ }
+ break;
- xa = gen_reg_rtx (mode);
- if (mode == SFmode)
- vmode = V4SFmode;
- else if (mode == DFmode)
- vmode = V2DFmode;
- else
- vmode = mode;
- mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
- if (!VECTOR_MODE_P (mode))
- {
- /* We need to generate a scalar mode mask in this case. */
- rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
- tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
- mask = gen_reg_rtx (mode);
- emit_insn (gen_rtx_SET (mask, tmp));
+ default:
+ break;
}
- emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
-
- if (smask)
- *smask = mask;
-
- return xa;
-}
-
-/* Expands a comparison of OP0 with OP1 using comparison code CODE,
- swapping the operands if SWAP_OPERANDS is true. The expanded
- code is a forward jump to a newly created label in case the
- comparison is true. The generated label rtx is returned. */
-static rtx_code_label *
-ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
- bool swap_operands)
-{
- bool unordered_compare = ix86_unordered_fp_compare (code);
- rtx_code_label *label;
- rtx tmp, reg;
-
- if (swap_operands)
- std::swap (op0, op1);
-
- label = gen_label_rtx ();
- tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
- if (unordered_compare)
- tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
- reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
- emit_insn (gen_rtx_SET (reg, tmp));
- tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
- tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
- gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
- tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
- JUMP_LABEL (tmp) = label;
-
- return label;
-}
-
-/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
- using comparison code CODE. Operands are swapped for the comparison if
- SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
-static rtx
-ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
- bool swap_operands)
-{
- rtx (*insn)(rtx, rtx, rtx, rtx);
- machine_mode mode = GET_MODE (op0);
- rtx mask = gen_reg_rtx (mode);
-
- if (swap_operands)
- std::swap (op0, op1);
-
- insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
-
- emit_insn (insn (mask, op0, op1,
- gen_rtx_fmt_ee (code, mode, op0, op1)));
- return mask;
-}
-
-/* Generate and return a rtx of mode MODE for 2**n where n is the number
- of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
-static rtx
-ix86_gen_TWO52 (machine_mode mode)
-{
- REAL_VALUE_TYPE TWO52r;
- rtx TWO52;
-
- real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
- TWO52 = const_double_from_real_value (TWO52r, mode);
- TWO52 = force_reg (mode, TWO52);
- return TWO52;
+ return false;
}
-/* Expand SSE sequence for computing lround from OP1 storing
- into OP0. */
-void
-ix86_expand_lround (rtx op0, rtx op1)
-{
- /* C code for the stuff we're doing below:
- tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
- return (long)tmp;
- */
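- /* Using nextafter (0.5, 0.0) instead of 0.5 keeps values just below
-    a halfway point from being rounded up by the addition itself,
-    while exact halfway cases still round away from zero as lround
-    requires.  */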
- machine_mode mode = GET_MODE (op1);
- const struct real_format *fmt;
- REAL_VALUE_TYPE pred_half, half_minus_pred_half;
- rtx adj;
-
- /* load nextafter (0.5, 0.0) */
- fmt = REAL_MODE_FORMAT (mode);
- real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
- real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
-
- /* adj = copysign (0.5, op1) */
- adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
- ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
-
- /* adj = op1 + adj */
- adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
-
- /* op0 = (imode)adj */
- expand_fix (op0, adj, 0);
-}
+/* Handler for an SVML-style interface to
+ a library with vectorized intrinsics. */
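+/* The SVML routine name is derived from the scalar builtin name, e.g.
+   sinf on 4 floats maps to vmlsSin4 and sin on 2 doubles to vmldSin2,
+   with log special-cased to vmlsLn4 / vmldLn2 below.  */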
-/* Expand SSE2 sequence for computing lfloor or lceil
- from OPERAND1 storing into OPERAND0. */
-void
-ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
+tree
+ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
{
- /* C code for the stuff we're doing below (for do_floor):
- xi = (long)op1;
- xi -= (double)xi > op1 ? 1 : 0;
- return xi;
- */
- machine_mode fmode = GET_MODE (op1);
- machine_mode imode = GET_MODE (op0);
- rtx ireg, freg, tmp;
- rtx_code_label *label;
-
- /* reg = (long)op1 */
- ireg = gen_reg_rtx (imode);
- expand_fix (ireg, op1, 0);
-
- /* freg = (double)reg */
- freg = gen_reg_rtx (fmode);
- expand_float (freg, ireg, 0);
-
- /* ireg = (freg > op1) ? ireg - 1 : ireg */
- label = ix86_expand_sse_compare_and_jump (UNLE,
- freg, op1, !do_floor);
- tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
- ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
- emit_move_insn (ireg, tmp);
-
- emit_label (label);
- LABEL_NUSES (label) = 1;
+ char name[20];
+ tree fntype, new_fndecl, args;
+ unsigned arity;
+ const char *bname;
+ machine_mode el_mode, in_mode;
+ int n, in_n;
- emit_move_insn (op0, ireg);
-}
+ /* The SVML is suitable for unsafe math only. */
+ if (!flag_unsafe_math_optimizations)
+ return NULL_TREE;
-/* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
-void
-ix86_expand_rint (rtx operand0, rtx operand1)
-{
- /* C code for the stuff we're doing below:
- xa = fabs (operand1);
- if (!isless (xa, 2**52))
- return operand1;
- two52 = 2**52;
- if (flag_rounding_math)
- {
- two52 = copysign (two52, operand1);
- xa = operand1;
- }
- xa = xa + two52 - two52;
- return copysign (xa, operand1);
- */
- machine_mode mode = GET_MODE (operand0);
- rtx res, xa, TWO52, two52, mask;
- rtx_code_label *label;
+ el_mode = TYPE_MODE (TREE_TYPE (type_out));
+ n = TYPE_VECTOR_SUBPARTS (type_out);
+ in_mode = TYPE_MODE (TREE_TYPE (type_in));
+ in_n = TYPE_VECTOR_SUBPARTS (type_in);
+ if (el_mode != in_mode
+ || n != in_n)
+ return NULL_TREE;
- res = gen_reg_rtx (mode);
- emit_move_insn (res, operand1);
+ switch (fn)
+ {
+ CASE_CFN_EXP:
+ CASE_CFN_LOG:
+ CASE_CFN_LOG10:
+ CASE_CFN_POW:
+ CASE_CFN_TANH:
+ CASE_CFN_TAN:
+ CASE_CFN_ATAN:
+ CASE_CFN_ATAN2:
+ CASE_CFN_ATANH:
+ CASE_CFN_CBRT:
+ CASE_CFN_SINH:
+ CASE_CFN_SIN:
+ CASE_CFN_ASINH:
+ CASE_CFN_ASIN:
+ CASE_CFN_COSH:
+ CASE_CFN_COS:
+ CASE_CFN_ACOSH:
+ CASE_CFN_ACOS:
+ if ((el_mode != DFmode || n != 2)
+ && (el_mode != SFmode || n != 4))
+ return NULL_TREE;
+ break;
- /* xa = abs (operand1) */
- xa = ix86_expand_sse_fabs (res, &mask);
+ default:
+ return NULL_TREE;
+ }
- /* if (!isless (xa, TWO52)) goto label; */
- TWO52 = ix86_gen_TWO52 (mode);
- label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+ tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
+ bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
- two52 = TWO52;
- if (flag_rounding_math)
+ if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
+ strcpy (name, "vmlsLn4");
+ else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
+ strcpy (name, "vmldLn2");
+ else if (n == 4)
{
- two52 = gen_reg_rtx (mode);
- ix86_sse_copysign_to_positive (two52, TWO52, res, mask);
- xa = res;
+ sprintf (name, "vmls%s", bname+10);
+ name[strlen (name)-1] = '4';
}
+ else
+ sprintf (name, "vmld%s2", bname+10);
- xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT);
- xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT);
+ /* Uppercase the first letter of the math function name. */
+ name[4] &= ~0x20;
- ix86_sse_copysign_to_positive (res, xa, res, mask);
+ arity = 0;
+ for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
+ arity++;
- emit_label (label);
- LABEL_NUSES (label) = 1;
+ if (arity == 1)
+ fntype = build_function_type_list (type_out, type_in, NULL);
+ else
+ fntype = build_function_type_list (type_out, type_in, type_in, NULL);
+
+ /* Build a function declaration for the vectorized function. */
+ new_fndecl = build_decl (BUILTINS_LOCATION,
+ FUNCTION_DECL, get_identifier (name), fntype);
+ TREE_PUBLIC (new_fndecl) = 1;
+ DECL_EXTERNAL (new_fndecl) = 1;
+ DECL_IS_NOVOPS (new_fndecl) = 1;
+ TREE_READONLY (new_fndecl) = 1;
- emit_move_insn (operand0, res);
+ return new_fndecl;
}
-/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
- into OPERAND0. */
-void
-ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
-{
- /* C code for the stuff we expand below.
- double xa = fabs (x), x2;
- if (!isless (xa, TWO52))
- return x;
- xa = xa + TWO52 - TWO52;
- x2 = copysign (xa, x);
- Compensate. Floor:
- if (x2 > x)
- x2 -= 1;
- Compensate. Ceil:
- if (x2 < x)
- x2 += 1;
- if (HONOR_SIGNED_ZEROS (mode))
- x2 = copysign (x2, x);
- return x2;
- */
- machine_mode mode = GET_MODE (operand0);
- rtx xa, TWO52, tmp, one, res, mask;
- rtx_code_label *label;
+/* Handler for an ACML-style interface to
+ a library with vectorized intrinsics. */
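+/* The ACML routine name follows the __vr<d2|s4>_<func> scheme, e.g.
+   sin on 2 doubles maps to __vrd2_sin and sinf on 4 floats to
+   __vrs4_sinf.  */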
- TWO52 = ix86_gen_TWO52 (mode);
+tree
+ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
+{
+ char name[20] = "__vr.._";
+ tree fntype, new_fndecl, args;
+ unsigned arity;
+ const char *bname;
+ machine_mode el_mode, in_mode;
+ int n, in_n;
- /* Temporary for holding the result, initialized to the input
- operand to ease control flow. */
- res = gen_reg_rtx (mode);
- emit_move_insn (res, operand1);
+ /* The ACML is 64-bit only and suitable for unsafe math only, as
+ it does not correctly support parts of IEEE with the required
+ precision such as denormals. */
+ if (!TARGET_64BIT
+ || !flag_unsafe_math_optimizations)
+ return NULL_TREE;
- /* xa = abs (operand1) */
- xa = ix86_expand_sse_fabs (res, &mask);
+ el_mode = TYPE_MODE (TREE_TYPE (type_out));
+ n = TYPE_VECTOR_SUBPARTS (type_out);
+ in_mode = TYPE_MODE (TREE_TYPE (type_in));
+ in_n = TYPE_VECTOR_SUBPARTS (type_in);
+ if (el_mode != in_mode
+ || n != in_n)
+ return NULL_TREE;
- /* if (!isless (xa, TWO52)) goto label; */
- label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+ switch (fn)
+ {
+ CASE_CFN_SIN:
+ CASE_CFN_COS:
+ CASE_CFN_EXP:
+ CASE_CFN_LOG:
+ CASE_CFN_LOG2:
+ CASE_CFN_LOG10:
+ if (el_mode == DFmode && n == 2)
+ {
+ name[4] = 'd';
+ name[5] = '2';
+ }
+ else if (el_mode == SFmode && n == 4)
+ {
+ name[4] = 's';
+ name[5] = '4';
+ }
+ else
+ return NULL_TREE;
+ break;
- /* xa = xa + TWO52 - TWO52; */
- xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
- xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
+ default:
+ return NULL_TREE;
+ }
- /* xa = copysign (xa, operand1) */
- ix86_sse_copysign_to_positive (xa, xa, res, mask);
+ tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
+ bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
+ sprintf (name + 7, "%s", bname+10);
- /* generate 1.0 */
- one = force_reg (mode, const_double_from_real_value (dconst1, mode));
+ arity = 0;
+ for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
+ arity++;
- /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
- tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
- emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
- tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
- xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
- if (!do_floor && HONOR_SIGNED_ZEROS (mode))
- ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
- emit_move_insn (res, tmp);
+ if (arity == 1)
+ fntype = build_function_type_list (type_out, type_in, NULL);
+ else
+ fntype = build_function_type_list (type_out, type_in, type_in, NULL);
- emit_label (label);
- LABEL_NUSES (label) = 1;
+ /* Build a function declaration for the vectorized function. */
+ new_fndecl = build_decl (BUILTINS_LOCATION,
+ FUNCTION_DECL, get_identifier (name), fntype);
+ TREE_PUBLIC (new_fndecl) = 1;
+ DECL_EXTERNAL (new_fndecl) = 1;
+ DECL_IS_NOVOPS (new_fndecl) = 1;
+ TREE_READONLY (new_fndecl) = 1;
- emit_move_insn (operand0, res);
+ return new_fndecl;
}
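+
+/* A minimal sketch of the ACML name construction above, assuming (as the
+ code does) that the builtin's identifier starts with "__builtin_", hence
+ the + 10.  The helper name is hypothetical; compiled on its own it prints
+ __vrd2_sin for the DFmode/2-lane case of sin and __vrs4_sinf for the
+ SFmode/4-lane case of sinf.
+
+   #include <stdio.h>
+   #include <string.h>
+
+   static void
+   acml_mangle (char name[32], const char *bname, int double_p)
+   {
+     strcpy (name, "__vr.._");
+     name[4] = double_p ? 'd' : 's';
+     name[5] = double_p ? '2' : '4';
+     sprintf (name + 7, "%s", bname + 10);
+   }
+
+   int
+   main (void)
+   {
+     char buf[32];
+     acml_mangle (buf, "__builtin_sin", 1);
+     printf ("%s\n", buf);
+     acml_mangle (buf, "__builtin_sinf", 0);
+     printf ("%s\n", buf);
+     return 0;
+   }  */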
-/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
- into OPERAND0. */
-void
-ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
-{
- /* C code for the stuff we expand below.
- double xa = fabs (x), x2;
- if (!isless (xa, TWO52))
- return x;
- x2 = (double)(long)x;
- Compensate. Floor:
- if (x2 > x)
- x2 -= 1;
- Compensate. Ceil:
- if (x2 < x)
- x2 += 1;
- if (HONOR_SIGNED_ZEROS (mode))
- return copysign (x2, x);
- return x2;
- */
- machine_mode mode = GET_MODE (operand0);
- rtx xa, xi, TWO52, tmp, one, res, mask;
- rtx_code_label *label;
+/* Return the decl of a function that implements a scatter store with
+ register type VECTYPE, index type INDEX_TYPE and scale SCALE.
+ Return NULL_TREE if it is not available. */
- TWO52 = ix86_gen_TWO52 (mode);
+static tree
+ix86_vectorize_builtin_scatter (const_tree vectype,
+ const_tree index_type, int scale)
+{
+ bool si;
+ enum ix86_builtins code;
- /* Temporary for holding the result, initialized to the input
- operand to ease control flow. */
- res = gen_reg_rtx (mode);
- emit_move_insn (res, operand1);
+ if (!TARGET_AVX512F)
+ return NULL_TREE;
- /* xa = abs (operand1) */
- xa = ix86_expand_sse_fabs (res, &mask);
+ if ((TREE_CODE (index_type) != INTEGER_TYPE
+ && !POINTER_TYPE_P (index_type))
+ || (TYPE_MODE (index_type) != SImode
+ && TYPE_MODE (index_type) != DImode))
+ return NULL_TREE;
- /* if (!isless (xa, TWO52)) goto label; */
- label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+ if (TYPE_PRECISION (index_type) > POINTER_SIZE)
+ return NULL_TREE;
- /* xa = (double)(long)x */
- xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
- expand_fix (xi, res, 0);
- expand_float (xa, xi, 0);
+ /* The v*scatter* insns sign-extend the index to pointer mode. */
+ if (TYPE_PRECISION (index_type) < POINTER_SIZE
+ && TYPE_UNSIGNED (index_type))
+ return NULL_TREE;
- /* generate 1.0 */
- one = force_reg (mode, const_double_from_real_value (dconst1, mode));
+ /* Scale can be 1, 2, 4 or 8. */
+ if (scale <= 0
+ || scale > 8
+ || (scale & (scale - 1)) != 0)
+ return NULL_TREE;
- /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
- tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
- emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
- tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
- xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
- emit_move_insn (res, tmp);
+ si = TYPE_MODE (index_type) == SImode;
+ switch (TYPE_MODE (vectype))
+ {
+ case E_V8DFmode:
+ code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
+ break;
+ case E_V8DImode:
+ code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
+ break;
+ case E_V16SFmode:
+ code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
+ break;
+ case E_V16SImode:
+ code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
+ break;
+ case E_V4DFmode:
+ if (TARGET_AVX512VL)
+ code = si ? IX86_BUILTIN_SCATTERALTSIV4DF : IX86_BUILTIN_SCATTERDIV4DF;
+ else
+ return NULL_TREE;
+ break;
+ case E_V4DImode:
+ if (TARGET_AVX512VL)
+ code = si ? IX86_BUILTIN_SCATTERALTSIV4DI : IX86_BUILTIN_SCATTERDIV4DI;
+ else
+ return NULL_TREE;
+ break;
+ case E_V8SFmode:
+ if (TARGET_AVX512VL)
+ code = si ? IX86_BUILTIN_SCATTERSIV8SF : IX86_BUILTIN_SCATTERALTDIV8SF;
+ else
+ return NULL_TREE;
+ break;
+ case E_V8SImode:
+ if (TARGET_AVX512VL)
+ code = si ? IX86_BUILTIN_SCATTERSIV8SI : IX86_BUILTIN_SCATTERALTDIV8SI;
+ else
+ return NULL_TREE;
+ break;
+ case E_V2DFmode:
+ if (TARGET_AVX512VL)
+ code = si ? IX86_BUILTIN_SCATTERALTSIV2DF : IX86_BUILTIN_SCATTERDIV2DF;
+ else
+ return NULL_TREE;
+ break;
+ case E_V2DImode:
+ if (TARGET_AVX512VL)
+ code = si ? IX86_BUILTIN_SCATTERALTSIV2DI : IX86_BUILTIN_SCATTERDIV2DI;
+ else
+ return NULL_TREE;
+ break;
+ case E_V4SFmode:
+ if (TARGET_AVX512VL)
+ code = si ? IX86_BUILTIN_SCATTERSIV4SF : IX86_BUILTIN_SCATTERALTDIV4SF;
+ else
+ return NULL_TREE;
+ break;
+ case E_V4SImode:
+ if (TARGET_AVX512VL)
+ code = si ? IX86_BUILTIN_SCATTERSIV4SI : IX86_BUILTIN_SCATTERALTDIV4SI;
+ else
+ return NULL_TREE;
+ break;
+ default:
+ return NULL_TREE;
+ }
- if (HONOR_SIGNED_ZEROS (mode))
- ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
+ return get_ix86_builtin (code);
+}
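+
+/* The scale check above accepts exactly 1, 2, 4 and 8 by combining a range
+ test with the usual power-of-two bit trick.  A standalone sketch of just
+ that predicate (hypothetical name), compilable on its own:
+
+   #include <stdio.h>
+
+   static int
+   scatter_scale_ok (int scale)
+   {
+     return scale > 0 && scale <= 8 && (scale & (scale - 1)) == 0;
+   }
+
+   int
+   main (void)
+   {
+     int s;
+     for (s = 0; s <= 9; s++)
+       printf ("scale %d: %s\n", s, scatter_scale_ok (s) ? "ok" : "rejected");
+     return 0;
+   }
+
+ Only scales 1, 2, 4 and 8 are reported as ok.  */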
- emit_label (label);
- LABEL_NUSES (label) = 1;
+/* Return true if it is safe to use the rsqrt optabs to optimize
+ 1.0/sqrt. */
- emit_move_insn (operand0, res);
+static bool
+use_rsqrt_p ()
+{
+ return (TARGET_SSE && TARGET_SSE_MATH
+ && flag_finite_math_only
+ && !flag_trapping_math
+ && flag_unsafe_math_optimizations);
}
+\f
+/* Helper for avx_vpermilps256_operand et al. This is also used by
+ the expansion functions to turn the parallel back into a mask.
+ The return value is 0 for no match and the imm8+1 for a match. */
-/* Expand SSE sequence for computing round from OPERAND1 storing
- into OPERAND0. Sequence that works without relying on DImode truncation
- via cvttsd2siq that is only available on 64bit targets. */
-void
-ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
-{
- /* C code for the stuff we expand below.
- double xa = fabs (x), xa2, x2;
- if (!isless (xa, TWO52))
- return x;
- Using the absolute value and copying back sign makes
- -0.0 -> -0.0 correct.
- xa2 = xa + TWO52 - TWO52;
- Compensate.
- dxa = xa2 - xa;
- if (dxa <= -0.5)
- xa2 += 1;
- else if (dxa > 0.5)
- xa2 -= 1;
- x2 = copysign (xa2, x);
- return x2;
- */
- machine_mode mode = GET_MODE (operand0);
- rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
- rtx_code_label *label;
-
- TWO52 = ix86_gen_TWO52 (mode);
+int
+avx_vpermilp_parallel (rtx par, machine_mode mode)
+{
+ unsigned i, nelt = GET_MODE_NUNITS (mode);
+ unsigned mask = 0;
+ unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */
- /* Temporary for holding the result, initialized to the input
- operand to ease control flow. */
- res = gen_reg_rtx (mode);
- emit_move_insn (res, operand1);
+ if (XVECLEN (par, 0) != (int) nelt)
+ return 0;
- /* xa = abs (operand1) */
- xa = ix86_expand_sse_fabs (res, &mask);
+ /* Validate that all of the elements are constants, and not totally
+ out of range. Copy the data into an integral array to make the
+ subsequent checks easier. */
+ for (i = 0; i < nelt; ++i)
+ {
+ rtx er = XVECEXP (par, 0, i);
+ unsigned HOST_WIDE_INT ei;
- /* if (!isless (xa, TWO52)) goto label; */
- label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+ if (!CONST_INT_P (er))
+ return 0;
+ ei = INTVAL (er);
+ if (ei >= nelt)
+ return 0;
+ ipar[i] = ei;
+ }
- /* xa2 = xa + TWO52 - TWO52; */
- xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
- xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
+ switch (mode)
+ {
+ case E_V8DFmode:
+ /* In the 512-bit DFmode case, we can only move elements within
+ a 128-bit lane. First fill the second part of the mask,
+ then fallthru. */
+ for (i = 4; i < 6; ++i)
+ {
+ if (ipar[i] < 4 || ipar[i] >= 6)
+ return 0;
+ mask |= (ipar[i] - 4) << i;
+ }
+ for (i = 6; i < 8; ++i)
+ {
+ if (ipar[i] < 6)
+ return 0;
+ mask |= (ipar[i] - 6) << i;
+ }
+ /* FALLTHRU */
- /* dxa = xa2 - xa; */
- dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
+ case E_V4DFmode:
+ /* In the 256-bit DFmode case, we can only move elements within
+ a 128-bit lane. */
+ for (i = 0; i < 2; ++i)
+ {
+ if (ipar[i] >= 2)
+ return 0;
+ mask |= ipar[i] << i;
+ }
+ for (i = 2; i < 4; ++i)
+ {
+ if (ipar[i] < 2)
+ return 0;
+ mask |= (ipar[i] - 2) << i;
+ }
+ break;
- /* generate 0.5, 1.0 and -0.5 */
- half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
- one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
- mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
- 0, OPTAB_DIRECT);
+ case E_V16SFmode:
+ /* In the 512-bit SFmode case, the permutation in the upper 256 bits
+ must mirror the permutation in the lower 256 bits. */
+ for (i = 0; i < 8; ++i)
+ if (ipar[i] + 8 != ipar[i + 8])
+ return 0;
+ /* FALLTHRU */
- /* Compensate. */
- tmp = gen_reg_rtx (mode);
- /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
- tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
- emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
- xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
- /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
- tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
- emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
- xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
+ case E_V8SFmode:
+ /* In the 256-bit SFmode case, we have full freedom of
+ movement within the low 128-bit lane, but the high 128-bit
+ lane must mirror exactly the same pattern. */
+ for (i = 0; i < 4; ++i)
+ if (ipar[i] + 4 != ipar[i + 4])
+ return 0;
+ nelt = 4;
+ /* FALLTHRU */
- /* res = copysign (xa2, operand1) */
- ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
+ case E_V2DFmode:
+ case E_V4SFmode:
+ /* In the 128-bit case, we have full freedom in the placement of
+ the elements from the source operand. */
+ for (i = 0; i < nelt; ++i)
+ mask |= ipar[i] << (i * (nelt / 2));
+ break;
- emit_label (label);
- LABEL_NUSES (label) = 1;
+ default:
+ gcc_unreachable ();
+ }
- emit_move_insn (operand0, res);
+ /* Make sure success has a non-zero value by adding one. */
+ return mask + 1;
}
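+
+/* A minimal sketch of the 128-bit V4SFmode branch above, assuming a
+ hypothetical helper that takes the selector as a plain array instead of
+ a PARALLEL.  Each two-bit field of the imm8 holds one element index, and
+ success is encoded as imm8 + 1 exactly as documented above.  Compiled on
+ its own it prints 0x4f for the selector { 2, 3, 0, 1 }, i.e. imm8 0x4e.
+
+   #include <stdio.h>
+
+   static unsigned
+   vpermilps_v4sf_imm (const unsigned char sel[4])
+   {
+     unsigned mask = 0, i;
+     for (i = 0; i < 4; ++i)
+       {
+         if (sel[i] >= 4)
+           return 0;
+         mask |= (unsigned) sel[i] << (i * 2);
+       }
+     return mask + 1;
+   }
+
+   int
+   main (void)
+   {
+     static const unsigned char sel[4] = { 2, 3, 0, 1 };
+     printf ("0x%x\n", vpermilps_v4sf_imm (sel));
+     return 0;
+   }  */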
-/* Expand SSE sequence for computing trunc from OPERAND1 storing
- into OPERAND0. */
-void
-ix86_expand_trunc (rtx operand0, rtx operand1)
-{
- /* C code for SSE variant we expand below.
- double xa = fabs (x), x2;
- if (!isless (xa, TWO52))
- return x;
- x2 = (double)(long)x;
- if (HONOR_SIGNED_ZEROS (mode))
- return copysign (x2, x);
- return x2;
- */
- machine_mode mode = GET_MODE (operand0);
- rtx xa, xi, TWO52, res, mask;
- rtx_code_label *label;
-
- TWO52 = ix86_gen_TWO52 (mode);
+/* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
+ the expansion functions to turn the parallel back into a mask.
+ The return value is 0 for no match and the imm8+1 for a match. */
- /* Temporary for holding the result, initialized to the input
- operand to ease control flow. */
- res = gen_reg_rtx (mode);
- emit_move_insn (res, operand1);
+int
+avx_vperm2f128_parallel (rtx par, machine_mode mode)
+{
+ unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
+ unsigned mask = 0;
+ unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
- /* xa = abs (operand1) */
- xa = ix86_expand_sse_fabs (res, &mask);
+ if (XVECLEN (par, 0) != (int) nelt)
+ return 0;
- /* if (!isless (xa, TWO52)) goto label; */
- label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+ /* Validate that all of the elements are constants, and not totally
+ out of range. Copy the data into an integral array to make the
+ subsequent checks easier. */
+ for (i = 0; i < nelt; ++i)
+ {
+ rtx er = XVECEXP (par, 0, i);
+ unsigned HOST_WIDE_INT ei;
- /* x = (double)(long)x */
- xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
- expand_fix (xi, res, 0);
- expand_float (res, xi, 0);
+ if (!CONST_INT_P (er))
+ return 0;
+ ei = INTVAL (er);
+ if (ei >= 2 * nelt)
+ return 0;
+ ipar[i] = ei;
+ }
- if (HONOR_SIGNED_ZEROS (mode))
- ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
+ /* Validate that each half of the permute selects a consecutive
+ run of elements. */
+ for (i = 0; i < nelt2 - 1; ++i)
+ if (ipar[i] + 1 != ipar[i + 1])
+ return 0;
+ for (i = nelt2; i < nelt - 1; ++i)
+ if (ipar[i] + 1 != ipar[i + 1])
+ return 0;
- emit_label (label);
- LABEL_NUSES (label) = 1;
+ /* Reconstruct the mask. */
+ for (i = 0; i < 2; ++i)
+ {
+ unsigned e = ipar[i * nelt2];
+ if (e % nelt2)
+ return 0;
+ e /= nelt2;
+ mask |= e << (i * 4);
+ }
- emit_move_insn (operand0, res);
+ /* Make sure success has a non-zero value by adding one. */
+ return mask + 1;
}
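+
+/* A minimal sketch of the same reconstruction for the V4DFmode case, with
+ the selector passed as a plain array (hypothetical helper name).  Indices
+ 0-3 refer to the first operand and 4-7 to the second; each result half
+ must be a whole aligned source half, and its lane number lands in one
+ nibble of the imm8.  Compiled on its own it prints 0x22 for the selector
+ { 2, 3, 4, 5 }, i.e. imm8 0x21.
+
+   #include <stdio.h>
+
+   static unsigned
+   vperm2f128_v4df_imm (const unsigned char sel[4])
+   {
+     unsigned mask = 0, i;
+     for (i = 0; i < 4; ++i)
+       if (sel[i] >= 8)
+         return 0;
+     if (sel[1] != sel[0] + 1 || sel[3] != sel[2] + 1)
+       return 0;
+     for (i = 0; i < 2; ++i)
+       {
+         unsigned e = sel[i * 2];
+         if (e % 2)
+           return 0;
+         mask |= (e / 2) << (i * 4);
+       }
+     return mask + 1;
+   }
+
+   int
+   main (void)
+   {
+     static const unsigned char sel[4] = { 2, 3, 4, 5 };
+     printf ("0x%x\n", vperm2f128_v4df_imm (sel));
+     return 0;
+   }  */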
-
-/* Expand SSE sequence for computing trunc from OPERAND1 storing
- into OPERAND0. */
-void
-ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
+\f
+/* Return a register priority for hard reg REGNO. */
+static int
+ix86_register_priority (int hard_regno)
{
- machine_mode mode = GET_MODE (operand0);
- rtx xa, mask, TWO52, one, res, smask, tmp;
- rtx_code_label *label;
-
- /* C code for SSE variant we expand below.
- double xa = fabs (x), x2;
- if (!isless (xa, TWO52))
- return x;
- xa2 = xa + TWO52 - TWO52;
- Compensate:
- if (xa2 > xa)
- xa2 -= 1.0;
- x2 = copysign (xa2, x);
- return x2;
- */
-
- TWO52 = ix86_gen_TWO52 (mode);
-
- /* Temporary for holding the result, initialized to the input
- operand to ease control flow. */
- res = gen_reg_rtx (mode);
- emit_move_insn (res, operand1);
-
- /* xa = abs (operand1) */
- xa = ix86_expand_sse_fabs (res, &smask);
-
- /* if (!isless (xa, TWO52)) goto label; */
- label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
-
- /* res = xa + TWO52 - TWO52; */
- tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
- tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
- emit_move_insn (res, tmp);
-
- /* generate 1.0 */
- one = force_reg (mode, const_double_from_real_value (dconst1, mode));
-
- /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
- mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
- emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
- tmp = expand_simple_binop (mode, MINUS,
- res, mask, NULL_RTX, 0, OPTAB_DIRECT);
- emit_move_insn (res, tmp);
-
- /* res = copysign (res, operand1) */
- ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
-
- emit_label (label);
- LABEL_NUSES (label) = 1;
-
- emit_move_insn (operand0, res);
+ /* ebp and r13 as the base always want a displacement, and r12 as the
+ base always wants an index. So discourage their use in an
+ address. */
+ if (hard_regno == R12_REG || hard_regno == R13_REG)
+ return 0;
+ if (hard_regno == BP_REG)
+ return 1;
+ /* New x86-64 int registers result in bigger code size. Discourage
+ them. */
+ if (IN_RANGE (hard_regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
+ return 2;
+ /* New x86-64 SSE registers result in bigger code size. Discourage
+ them. */
+ if (IN_RANGE (hard_regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
+ return 2;
+ if (IN_RANGE (hard_regno, FIRST_EXT_REX_SSE_REG, LAST_EXT_REX_SSE_REG))
+ return 1;
+ /* Usage of AX register results in smaller code. Prefer it. */
+ if (hard_regno == AX_REG)
+ return 4;
+ return 3;
}
-/* Expand SSE sequence for computing round from OPERAND1 storing
- into OPERAND0. */
-void
-ix86_expand_round (rtx operand0, rtx operand1)
-{
- /* C code for the stuff we're doing below:
- double xa = fabs (x);
- if (!isless (xa, TWO52))
- return x;
- xa = (double)(long)(xa + nextafter (0.5, 0.0));
- return copysign (xa, x);
- */
- machine_mode mode = GET_MODE (operand0);
- rtx res, TWO52, xa, xi, half, mask;
- rtx_code_label *label;
- const struct real_format *fmt;
- REAL_VALUE_TYPE pred_half, half_minus_pred_half;
+/* Implement TARGET_PREFERRED_RELOAD_CLASS.
+
+ Put float CONST_DOUBLE in the constant pool instead of fp regs.
+ QImode must go into class Q_REGS.
+ Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
+ movdf to do mem-to-mem moves through integer regs. */
- /* Temporary for holding the result, initialized to the input
- operand to ease control flow. */
- res = gen_reg_rtx (mode);
- emit_move_insn (res, operand1);
+static reg_class_t
+ix86_preferred_reload_class (rtx x, reg_class_t regclass)
+{
+ machine_mode mode = GET_MODE (x);
- TWO52 = ix86_gen_TWO52 (mode);
- xa = ix86_expand_sse_fabs (res, &mask);
- label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+ /* We're only allowed to return a subclass of CLASS. Many of the
+ following checks fail for NO_REGS, so eliminate that early. */
+ if (regclass == NO_REGS)
+ return NO_REGS;
- /* load nextafter (0.5, 0.0) */
- fmt = REAL_MODE_FORMAT (mode);
- real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
- real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
+ /* All classes can load zeros. */
+ if (x == CONST0_RTX (mode))
+ return regclass;
- /* xa = xa + 0.5 */
- half = force_reg (mode, const_double_from_real_value (pred_half, mode));
- xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
+ /* Force constants into memory if we are loading a (nonzero) constant into
+ an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
+ instructions to load from a constant. */
+ if (CONSTANT_P (x)
+ && (MAYBE_MMX_CLASS_P (regclass)
+ || MAYBE_SSE_CLASS_P (regclass)
+ || MAYBE_MASK_CLASS_P (regclass)))
+ return NO_REGS;
- /* xa = (double)(int64_t)xa */
- xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
- expand_fix (xi, xa, 0);
- expand_float (xa, xi, 0);
+ /* Floating-point constants need more complex checks. */
+ if (CONST_DOUBLE_P (x))
+ {
+ /* General regs can load everything. */
+ if (INTEGER_CLASS_P (regclass))
+ return regclass;
- /* res = copysign (xa, operand1) */
- ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
+ /* Floats can load 0 and 1 plus some others. Note that we eliminated
+ zero above. We only want to wind up preferring 80387 registers if
+ we plan on doing computation with them. */
+ if (IS_STACK_MODE (mode)
+ && standard_80387_constant_p (x) > 0)
+ {
+ /* Limit class to FP regs. */
+ if (FLOAT_CLASS_P (regclass))
+ return FLOAT_REGS;
+ }
- emit_label (label);
- LABEL_NUSES (label) = 1;
+ return NO_REGS;
+ }
- emit_move_insn (operand0, res);
-}
+ /* Prefer SSE regs only, if we can use them for math. */
+ if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
+ return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
-/* Expand SSE sequence for computing round
- from OP1 storing into OP0 using sse4 round insn. */
-void
-ix86_expand_round_sse4 (rtx op0, rtx op1)
-{
- machine_mode mode = GET_MODE (op0);
- rtx e1, e2, res, half;
- const struct real_format *fmt;
- REAL_VALUE_TYPE pred_half, half_minus_pred_half;
- rtx (*gen_copysign) (rtx, rtx, rtx);
- rtx (*gen_round) (rtx, rtx, rtx);
+ /* Generally when we see PLUS here, it's the function invariant
+ (plus soft-fp const_int), which can only be computed into general
+ regs. */
+ if (GET_CODE (x) == PLUS)
+ return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
- switch (mode)
+ /* QImode constants are easy to load, but non-constant QImode data
+ must go into Q_REGS. */
+ if (GET_MODE (x) == QImode && !CONSTANT_P (x))
{
- case E_SFmode:
- gen_copysign = gen_copysignsf3;
- gen_round = gen_sse4_1_roundsf2;
- break;
- case E_DFmode:
- gen_copysign = gen_copysigndf3;
- gen_round = gen_sse4_1_rounddf2;
- break;
- default:
- gcc_unreachable ();
+ if (Q_CLASS_P (regclass))
+ return regclass;
+ else if (reg_class_subset_p (Q_REGS, regclass))
+ return Q_REGS;
+ else
+ return NO_REGS;
}
- /* round (a) = trunc (a + copysign (0.5, a)) */
-
- /* load nextafter (0.5, 0.0) */
- fmt = REAL_MODE_FORMAT (mode);
- real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
- real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
- half = const_double_from_real_value (pred_half, mode);
+ return regclass;
+}
- /* e1 = copysign (0.5, op1) */
- e1 = gen_reg_rtx (mode);
- emit_insn (gen_copysign (e1, half, op1));
+/* Discourage putting floating-point values in SSE registers unless
+ SSE math is being used, and likewise for the 387 registers. */
+static reg_class_t
+ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
+{
+ machine_mode mode = GET_MODE (x);
- /* e2 = op1 + e1 */
- e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
+ /* Restrict the output reload class to the register bank that we are doing
+ math on. If we would prefer not to return a subset of CLASS, reject this
+ alternative: if reload cannot do this, it will still use its choice. */
+ if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
+ return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
- /* res = trunc (e2) */
- res = gen_reg_rtx (mode);
- emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
+ if (IS_STACK_MODE (mode))
+ return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
- emit_move_insn (op0, res);
+ return regclass;
}
-/* Handle fentry_name / fentry_section attribute. */
-
-static tree
-ix86_handle_fentry_name (tree *node, tree name, tree args,
- int, bool *no_add_attrs)
+static reg_class_t
+ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
+ machine_mode mode, secondary_reload_info *sri)
{
- if (TREE_CODE (*node) == FUNCTION_DECL
- && TREE_CODE (TREE_VALUE (args)) == STRING_CST)
- /* Do nothing else, just set the attribute. We'll get at
- it later with lookup_attribute. */
- ;
- else
+ /* Double-word spills from general registers to non-offsettable memory
+ references (zero-extended addresses) require special handling. */
+ if (TARGET_64BIT
+ && MEM_P (x)
+ && GET_MODE_SIZE (mode) > UNITS_PER_WORD
+ && INTEGER_CLASS_P (rclass)
+ && !offsettable_memref_p (x))
{
- warning (OPT_Wattributes, "%qE attribute ignored", name);
- *no_add_attrs = true;
- }
-
- return NULL_TREE;
-}
-\f
-
-/* Table of valid machine attributes. */
-static const struct attribute_spec ix86_attribute_table[] =
-{
- /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
- affects_type_identity, handler, exclude } */
- /* Stdcall attribute says callee is responsible for popping arguments
- if they are not variable. */
- { "stdcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
- NULL },
- /* Fastcall attribute says callee is responsible for popping arguments
- if they are not variable. */
- { "fastcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
- NULL },
- /* Thiscall attribute says callee is responsible for popping arguments
- if they are not variable. */
- { "thiscall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
- NULL },
- /* Cdecl attribute says the callee is a normal C declaration */
- { "cdecl", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
- NULL },
- /* Regparm attribute specifies how many integer arguments are to be
- passed in registers. */
- { "regparm", 1, 1, false, true, true, true, ix86_handle_cconv_attribute,
- NULL },
- /* Sseregparm attribute says we are using x86_64 calling conventions
- for FP arguments. */
- { "sseregparm", 0, 0, false, true, true, true, ix86_handle_cconv_attribute,
- NULL },
- /* The transactional memory builtins are implicitly regparm or fastcall
- depending on the ABI. Override the generic do-nothing attribute that
- these builtins were declared with. */
- { "*tm regparm", 0, 0, false, true, true, true,
- ix86_handle_tm_regparm_attribute, NULL },
- /* force_align_arg_pointer says this function realigns the stack at entry. */
- { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
- false, true, true, false, ix86_handle_force_align_arg_pointer_attribute,
- NULL },
-#if TARGET_DLLIMPORT_DECL_ATTRIBUTES
- { "dllimport", 0, 0, false, false, false, false, handle_dll_attribute,
- NULL },
- { "dllexport", 0, 0, false, false, false, false, handle_dll_attribute,
- NULL },
- { "shared", 0, 0, true, false, false, false,
- ix86_handle_shared_attribute, NULL },
-#endif
- { "ms_struct", 0, 0, false, false, false, false,
- ix86_handle_struct_attribute, NULL },
- { "gcc_struct", 0, 0, false, false, false, false,
- ix86_handle_struct_attribute, NULL },
-#ifdef SUBTARGET_ATTRIBUTE_TABLE
- SUBTARGET_ATTRIBUTE_TABLE,
-#endif
- /* ms_abi and sysv_abi calling convention function attributes. */
- { "ms_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute, NULL },
- { "sysv_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute,
- NULL },
- { "ms_abi va_list", 0, 0, false, false, false, false, NULL, NULL },
- { "sysv_abi va_list", 0, 0, false, false, false, false, NULL, NULL },
- { "ms_hook_prologue", 0, 0, true, false, false, false,
- ix86_handle_fndecl_attribute, NULL },
- { "callee_pop_aggregate_return", 1, 1, false, true, true, true,
- ix86_handle_callee_pop_aggregate_return, NULL },
- { "interrupt", 0, 0, false, true, true, false,
- ix86_handle_interrupt_attribute, NULL },
- { "no_caller_saved_registers", 0, 0, false, true, true, false,
- ix86_handle_no_caller_saved_registers_attribute, NULL },
- { "naked", 0, 0, true, false, false, false,
- ix86_handle_fndecl_attribute, NULL },
- { "indirect_branch", 1, 1, true, false, false, false,
- ix86_handle_fndecl_attribute, NULL },
- { "function_return", 1, 1, true, false, false, false,
- ix86_handle_fndecl_attribute, NULL },
- { "indirect_return", 0, 0, false, true, true, false,
- NULL, NULL },
- { "fentry_name", 1, 1, true, false, false, false,
- ix86_handle_fentry_name, NULL },
- { "fentry_section", 1, 1, true, false, false, false,
- ix86_handle_fentry_name, NULL },
- { "cf_check", 0, 0, true, false, false, false,
- ix86_handle_fndecl_attribute, NULL },
-
- /* End element. */
- { NULL, 0, 0, false, false, false, false, NULL, NULL }
-};
+ sri->icode = (in_p
+ ? CODE_FOR_reload_noff_load
+ : CODE_FOR_reload_noff_store);
+ /* Add the cost of moving address to a temporary. */
+ sri->extra_cost = 1;
-/* Implement targetm.vectorize.builtin_vectorization_cost. */
-static int
-ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
- tree vectype, int)
-{
- bool fp = false;
- machine_mode mode = TImode;
- int index;
- if (vectype != NULL)
- {
- fp = FLOAT_TYPE_P (vectype);
- mode = TYPE_MODE (vectype);
+ return NO_REGS;
}
- switch (type_of_cost)
+ /* QImode spills from non-QI registers require an
+ intermediate register on 32-bit targets. */
+ if (mode == QImode
+ && ((!TARGET_64BIT && !in_p
+ && INTEGER_CLASS_P (rclass)
+ && MAYBE_NON_Q_CLASS_P (rclass))
+ || (!TARGET_AVX512DQ
+ && MAYBE_MASK_CLASS_P (rclass))))
{
- case scalar_stmt:
- return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
+ int regno = true_regnum (x);
- case scalar_load:
- /* load/store costs are relative to register move which is 2. Recompute
- it to COSTS_N_INSNS so everything have same base. */
- return COSTS_N_INSNS (fp ? ix86_cost->sse_load[0]
- : ix86_cost->int_load [2]) / 2;
+ /* Return Q_REGS if the operand is in memory. */
+ if (regno == -1)
+ return Q_REGS;
- case scalar_store:
- return COSTS_N_INSNS (fp ? ix86_cost->sse_store[0]
- : ix86_cost->int_store [2]) / 2;
+ return NO_REGS;
+ }
- case vector_stmt:
- return ix86_vec_cost (mode,
- fp ? ix86_cost->addss : ix86_cost->sse_op);
+ /* This condition handles the corner case where an expression involving
+ pointers gets vectorized. We're trying to use the address of a
+ stack slot as a vector initializer.
- case vector_load:
- index = sse_store_index (mode);
- /* See PR82713 - we may end up being called on non-vector type. */
- if (index < 0)
- index = 2;
- return COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2;
+ (set (reg:V2DI 74 [ vect_cst_.2 ])
+ (vec_duplicate:V2DI (reg/f:DI 20 frame)))
- case vector_store:
- index = sse_store_index (mode);
- /* See PR82713 - we may end up being called on non-vector type. */
- if (index < 0)
- index = 2;
- return COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2;
+ Eventually frame gets turned into sp+offset like this:
- case vec_to_scalar:
- case scalar_to_vec:
- return ix86_vec_cost (mode, ix86_cost->sse_op);
+ (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
+ (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
+ (const_int 392 [0x188]))))
- /* We should have separate costs for unaligned loads and gather/scatter.
- Do that incrementally. */
- case unaligned_load:
- index = sse_store_index (mode);
- /* See PR82713 - we may end up being called on non-vector type. */
- if (index < 0)
- index = 2;
- return COSTS_N_INSNS (ix86_cost->sse_unaligned_load[index]) / 2;
+ That later gets turned into:
- case unaligned_store:
- index = sse_store_index (mode);
- /* See PR82713 - we may end up being called on non-vector type. */
- if (index < 0)
- index = 2;
- return COSTS_N_INSNS (ix86_cost->sse_unaligned_store[index]) / 2;
+ (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
+ (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
+ (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
- case vector_gather_load:
- return ix86_vec_cost (mode,
- COSTS_N_INSNS
- (ix86_cost->gather_static
- + ix86_cost->gather_per_elt
- * TYPE_VECTOR_SUBPARTS (vectype)) / 2);
+ We'll have the following reload recorded:
- case vector_scatter_store:
- return ix86_vec_cost (mode,
- COSTS_N_INSNS
- (ix86_cost->scatter_static
- + ix86_cost->scatter_per_elt
- * TYPE_VECTOR_SUBPARTS (vectype)) / 2);
+ Reload 0: reload_in (DI) =
+ (plus:DI (reg/f:DI 7 sp)
+ (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
+ reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
+ SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
+ reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
+ reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
+ reload_reg_rtx: (reg:V2DI 22 xmm1)
- case cond_branch_taken:
- return ix86_cost->cond_taken_branch_cost;
+ That isn't going to work, since SSE instructions can't handle scalar
+ additions. Returning GENERAL_REGS forces the addition into an integer
+ register, and reload can handle the subsequent reloads without problems. */
- case cond_branch_not_taken:
- return ix86_cost->cond_not_taken_branch_cost;
+ if (in_p && GET_CODE (x) == PLUS
+ && SSE_CLASS_P (rclass)
+ && SCALAR_INT_MODE_P (mode))
+ return GENERAL_REGS;
- case vec_perm:
- case vec_promote_demote:
- return ix86_vec_cost (mode, ix86_cost->sse_op);
+ return NO_REGS;
+}
- case vec_construct:
- {
- /* N element inserts into SSE vectors. */
- int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
- /* One vinserti128 for combining two SSE vectors for AVX256. */
- if (GET_MODE_BITSIZE (mode) == 256)
- cost += ix86_vec_cost (mode, ix86_cost->addss);
- /* One vinserti64x4 and two vinserti128 for combining SSE
- and AVX256 vectors to AVX512. */
- else if (GET_MODE_BITSIZE (mode) == 512)
- cost += 3 * ix86_vec_cost (mode, ix86_cost->addss);
- return cost;
- }
+/* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
+
+static bool
+ix86_class_likely_spilled_p (reg_class_t rclass)
+{
+ switch (rclass)
+ {
+ case AREG:
+ case DREG:
+ case CREG:
+ case BREG:
+ case AD_REGS:
+ case SIREG:
+ case DIREG:
+ case SSE_FIRST_REG:
+ case FP_TOP_REG:
+ case FP_SECOND_REG:
+ return true;
default:
- gcc_unreachable ();
+ break;
}
+
+ return false;
}
-/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
- insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
- insn every time. */
+/* If we are copying between registers from different register sets
+ (e.g. FP and integer), we may need a memory location.
+
+ The function can't work reliably when one of the CLASSES is a class
+ containing registers from multiple sets. We avoid this by never combining
+ different sets in a single alternative in the machine description.
+ Ensure that this constraint holds to avoid unexpected surprises.
-static GTY(()) rtx_insn *vselect_insn;
+ When STRICT is false, we are being called from REGISTER_MOVE_COST,
+ so do not enforce these sanity checks.
-/* Initialize vselect_insn. */
+ To optimize register_move_cost performance, define an inline variant. */
-static void
-init_vselect_insn (void)
+static inline bool
+inline_secondary_memory_needed (machine_mode mode, reg_class_t class1,
+ reg_class_t class2, int strict)
{
- unsigned i;
- rtx x;
+ if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
+ return false;
- x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
- for (i = 0; i < MAX_VECT_LEN; ++i)
- XVECEXP (x, 0, i) = const0_rtx;
- x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
- const0_rtx), x);
- x = gen_rtx_SET (const0_rtx, x);
- start_sequence ();
- vselect_insn = emit_insn (x);
- end_sequence ();
-}
+ if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
+ || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
+ || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
+ || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
+ || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
+ || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2)
+ || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1)
+ || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2))
+ {
+ gcc_assert (!strict || lra_in_progress);
+ return true;
+ }
-/* Construct (set target (vec_select op0 (parallel perm))) and
- return true if that's a valid instruction in the active ISA. */
+ if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
+ return true;
-static bool
-expand_vselect (rtx target, rtx op0, const unsigned char *perm,
- unsigned nelt, bool testing_p)
-{
- unsigned int i;
- rtx x, save_vconcat;
- int icode;
+ /* Between mask and general, we have moves no larger than word size. */
+ if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2))
+ && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
+ return true;
- if (vselect_insn == NULL_RTX)
- init_vselect_insn ();
+ /* ??? This is a lie. We do have moves between mmx/general, and between
+ mmx/sse2. But by saying we need secondary memory we discourage the
+ register allocator from using the mmx registers unless needed. */
+ if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
+ return true;
- x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
- PUT_NUM_ELEM (XVEC (x, 0), nelt);
- for (i = 0; i < nelt; ++i)
- XVECEXP (x, 0, i) = GEN_INT (perm[i]);
- save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
- XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
- PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
- SET_DEST (PATTERN (vselect_insn)) = target;
- icode = recog_memoized (vselect_insn);
+ if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
+ {
+ /* SSE1 doesn't have any direct moves from other classes. */
+ if (!TARGET_SSE2)
+ return true;
- if (icode >= 0 && !testing_p)
- emit_insn (copy_rtx (PATTERN (vselect_insn)));
+ /* If the target says that inter-unit moves are more expensive
+ than moving through memory, then don't generate them. */
+ if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
+ || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
+ return true;
- SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
- XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
- INSN_CODE (vselect_insn) = -1;
+ /* Between SSE and general, we have moves no larger than word size. */
+ if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
+ return true;
+ }
- return icode >= 0;
+ return false;
}
-/* Similar, but generate a vec_concat from op0 and op1 as well. */
+/* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
static bool
-expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
- const unsigned char *perm, unsigned nelt,
- bool testing_p)
+ix86_secondary_memory_needed (machine_mode mode, reg_class_t class1,
+ reg_class_t class2)
{
- machine_mode v2mode;
- rtx x;
- bool ok;
-
- if (vselect_insn == NULL_RTX)
- init_vselect_insn ();
+ return inline_secondary_memory_needed (mode, class1, class2, true);
+}
- if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
- return false;
- x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
- PUT_MODE (x, v2mode);
- XEXP (x, 0) = op0;
- XEXP (x, 1) = op1;
- ok = expand_vselect (target, x, perm, nelt, testing_p);
- XEXP (x, 0) = const0_rtx;
- XEXP (x, 1) = const0_rtx;
- return ok;
-}
-
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
- using movss or movsd. */
-static bool
-expand_vec_perm_movs (struct expand_vec_perm_d *d)
-{
- machine_mode vmode = d->vmode;
- unsigned i, nelt = d->nelt;
- rtx x;
+/* Implement TARGET_SECONDARY_MEMORY_NEEDED_MODE.
- if (d->one_operand_p)
- return false;
+ get_secondary_mem widens integral modes to BITS_PER_WORD.
+ There is no need to emit a full 64-bit move on 64-bit targets
+ for integral modes that can be moved with a 32-bit move. */
- if (!(TARGET_SSE && vmode == V4SFmode)
- && !(TARGET_SSE2 && vmode == V2DFmode))
- return false;
+static machine_mode
+ix86_secondary_memory_needed_mode (machine_mode mode)
+{
+ if (GET_MODE_BITSIZE (mode) < 32 && INTEGRAL_MODE_P (mode))
+ return mode_for_size (32, GET_MODE_CLASS (mode), 0).require ();
+ return mode;
+}
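+
+/* The effect of the hook above is simply that sub-word integral spill
+ slots are widened to 32 bits while everything else keeps its mode.  A
+ standalone sketch working on bit sizes (hypothetical helper, none of the
+ machine_mode plumbing):
+
+   #include <stdio.h>
+
+   static unsigned
+   secondary_mem_bits (unsigned mode_bits, int integral_p)
+   {
+     if (integral_p && mode_bits < 32)
+       return 32;
+     return mode_bits;
+   }
+
+   int
+   main (void)
+   {
+     printf ("QImode -> %u bits\n", secondary_mem_bits (8, 1));
+     printf ("HImode -> %u bits\n", secondary_mem_bits (16, 1));
+     printf ("DFmode -> %u bits\n", secondary_mem_bits (64, 0));
+     return 0;
+   }
+
+ QImode and HImode become 32-bit slots; DFmode is left alone.  */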
- /* Only the first element is changed. */
- if (d->perm[0] != nelt && d->perm[0] != 0)
- return false;
- for (i = 1; i < nelt; ++i)
- if (d->perm[i] != i + nelt - d->perm[0])
- return false;
+/* Implement the TARGET_CLASS_MAX_NREGS hook.
- if (d->testing_p)
- return true;
+ On the 80386, this is the size of MODE in words,
+ except in the FP regs, where a single reg is always enough. */
- if (d->perm[0] == nelt)
- x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
+static unsigned char
+ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
+{
+ if (MAYBE_INTEGER_CLASS_P (rclass))
+ {
+ if (mode == XFmode)
+ return (TARGET_64BIT ? 2 : 3);
+ else if (mode == XCmode)
+ return (TARGET_64BIT ? 4 : 6);
+ else
+ return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
+ }
else
- x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
-
- emit_insn (gen_rtx_SET (d->target, x));
-
- return true;
+ {
+ if (COMPLEX_MODE_P (mode))
+ return 2;
+ else
+ return 1;
+ }
}
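+
+/* For the integer classes the common path above is just a ceiling division
+ by the word size.  A standalone sketch (hypothetical helper; the
+ XFmode/XCmode special cases are omitted):
+
+   #include <stdio.h>
+
+   static unsigned
+   gpr_nregs (unsigned mode_size, unsigned units_per_word)
+   {
+     return (mode_size + units_per_word - 1) / units_per_word;
+   }
+
+   int
+   main (void)
+   {
+     printf ("DImode on 32-bit: %u regs\n", gpr_nregs (8, 4));
+     printf ("DImode on 64-bit: %u regs\n", gpr_nregs (8, 8));
+     printf ("TImode on 64-bit: %u regs\n", gpr_nregs (16, 8));
+     return 0;
+   }
+
+ This prints 2, 1 and 2 registers respectively.  */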
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
- in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
+/* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
static bool
-expand_vec_perm_blend (struct expand_vec_perm_d *d)
+ix86_can_change_mode_class (machine_mode from, machine_mode to,
+ reg_class_t regclass)
{
- machine_mode mmode, vmode = d->vmode;
- unsigned i, mask, nelt = d->nelt;
- rtx target, op0, op1, maskop, x;
- rtx rperm[32], vperm;
+ if (from == to)
+ return true;
- if (d->one_operand_p)
- return false;
- if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
- && (TARGET_AVX512BW
- || GET_MODE_UNIT_SIZE (vmode) >= 4))
- ;
- else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
- ;
- else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
- ;
- else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
- ;
- else
+ /* x87 registers can't do subreg at all, as all values are reformatted
+ to extended precision. */
+ if (MAYBE_FLOAT_CLASS_P (regclass))
return false;
- /* This is a blend, not a permute. Elements must stay in their
- respective lanes. */
- for (i = 0; i < nelt; ++i)
+ if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
{
- unsigned e = d->perm[i];
- if (!(e == i || e == i + nelt))
+ /* Vector registers do not support QI or HImode loads. If we don't
+ disallow a change to these modes, reload will assume it's ok to
+ drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
+ the vec_dupv4hi pattern. */
+ if (GET_MODE_SIZE (from) < 4)
return false;
}
- if (d->testing_p)
- return true;
-
- /* ??? Without SSE4.1, we could implement this with and/andn/or. This
- decision should be extracted elsewhere, so that we only try that
- sequence once all budget==3 options have been tried. */
- target = d->target;
- op0 = d->op0;
- op1 = d->op1;
- mask = 0;
-
- switch (vmode)
- {
- case E_V8DFmode:
- case E_V16SFmode:
- case E_V4DFmode:
- case E_V8SFmode:
- case E_V2DFmode:
- case E_V4SFmode:
- case E_V8HImode:
- case E_V8SImode:
- case E_V32HImode:
- case E_V64QImode:
- case E_V16SImode:
- case E_V8DImode:
- for (i = 0; i < nelt; ++i)
- mask |= (d->perm[i] >= nelt) << i;
- break;
-
- case E_V2DImode:
- for (i = 0; i < 2; ++i)
- mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
- vmode = V8HImode;
- goto do_subreg;
-
- case E_V4SImode:
- for (i = 0; i < 4; ++i)
- mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
- vmode = V8HImode;
- goto do_subreg;
+ return true;
+}
- case E_V16QImode:
- /* See if bytes move in pairs so we can use pblendw with
- an immediate argument, rather than pblendvb with a vector
- argument. */
- for (i = 0; i < 16; i += 2)
- if (d->perm[i] + 1 != d->perm[i + 1])
- {
- use_pblendvb:
- for (i = 0; i < nelt; ++i)
- rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
+/* Return the index of MODE in the SSE load/store cost tables. */
- finish_pblendvb:
- vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
- vperm = force_reg (vmode, vperm);
+static inline int
+sse_store_index (machine_mode mode)
+{
+ switch (GET_MODE_SIZE (mode))
+ {
+ case 4:
+ return 0;
+ case 8:
+ return 1;
+ case 16:
+ return 2;
+ case 32:
+ return 3;
+ case 64:
+ return 4;
+ default:
+ return -1;
+ }
+}
- if (GET_MODE_SIZE (vmode) == 16)
- emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
- else
- emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
- if (target != d->target)
- emit_move_insn (d->target, gen_lowpart (d->vmode, target));
- return true;
- }
+/* Return the cost of moving data of mode M between a
+ register and memory. A value of 2 is the default; this cost is
+ relative to those in `REGISTER_MOVE_COST'.
- for (i = 0; i < 8; ++i)
- mask |= (d->perm[i * 2] >= 16) << i;
- vmode = V8HImode;
- /* FALLTHRU */
+ This function is used extensively by register_move_cost, which is used to
+ build tables at startup. Make it inline in this case.
+ When IN is 2, return the maximum of the in and out move costs.
- do_subreg:
- target = gen_reg_rtx (vmode);
- op0 = gen_lowpart (vmode, op0);
- op1 = gen_lowpart (vmode, op1);
- break;
+ If moving between registers and memory is more expensive than
+ between two registers, you should define this macro to express the
+ relative cost.
- case E_V32QImode:
- /* See if bytes move in pairs. If not, vpblendvb must be used. */
- for (i = 0; i < 32; i += 2)
- if (d->perm[i] + 1 != d->perm[i + 1])
- goto use_pblendvb;
- /* See if bytes move in quadruplets. If yes, vpblendd
- with immediate can be used. */
- for (i = 0; i < 32; i += 4)
- if (d->perm[i] + 2 != d->perm[i + 2])
- break;
- if (i < 32)
+ Also model the increased cost of moving QImode registers in
+ non-Q_REGS classes.
+ */
+static inline int
+inline_memory_move_cost (machine_mode mode, enum reg_class regclass, int in)
+{
+ int cost;
+ if (FLOAT_CLASS_P (regclass))
+ {
+ int index;
+ switch (mode)
{
- /* See if bytes move the same in both lanes. If yes,
- vpblendw with immediate can be used. */
- for (i = 0; i < 16; i += 2)
- if (d->perm[i] + 16 != d->perm[i + 16])
- goto use_pblendvb;
-
- /* Use vpblendw. */
- for (i = 0; i < 16; ++i)
- mask |= (d->perm[i * 2] >= 32) << i;
- vmode = V16HImode;
- goto do_subreg;
+ case E_SFmode:
+ index = 0;
+ break;
+ case E_DFmode:
+ index = 1;
+ break;
+ case E_XFmode:
+ index = 2;
+ break;
+ default:
+ return 100;
}
-
- /* Use vpblendd. */
- for (i = 0; i < 8; ++i)
- mask |= (d->perm[i * 4] >= 32) << i;
- vmode = V8SImode;
- goto do_subreg;
-
- case E_V16HImode:
- /* See if words move in pairs. If yes, vpblendd can be used. */
- for (i = 0; i < 16; i += 2)
- if (d->perm[i] + 1 != d->perm[i + 1])
- break;
- if (i < 16)
+ if (in == 2)
+ return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
+ return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
+ }
+ if (SSE_CLASS_P (regclass))
+ {
+ int index = sse_store_index (mode);
+ if (index == -1)
+ return 100;
+ if (in == 2)
+ return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
+ return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
+ }
+ if (MMX_CLASS_P (regclass))
+ {
+ int index;
+ switch (GET_MODE_SIZE (mode))
{
- /* See if words move the same in both lanes. If not,
- vpblendvb must be used. */
- for (i = 0; i < 8; i++)
- if (d->perm[i] + 8 != d->perm[i + 8])
- {
- /* Use vpblendvb. */
- for (i = 0; i < 32; ++i)
- rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
-
- vmode = V32QImode;
- nelt = 32;
- target = gen_reg_rtx (vmode);
- op0 = gen_lowpart (vmode, op0);
- op1 = gen_lowpart (vmode, op1);
- goto finish_pblendvb;
- }
-
- /* Use vpblendw. */
- for (i = 0; i < 16; ++i)
- mask |= (d->perm[i] >= 16) << i;
- break;
+ case 4:
+ index = 0;
+ break;
+ case 8:
+ index = 1;
+ break;
+ default:
+ return 100;
}
-
- /* Use vpblendd. */
- for (i = 0; i < 8; ++i)
- mask |= (d->perm[i * 2] >= 16) << i;
- vmode = V8SImode;
- goto do_subreg;
-
- case E_V4DImode:
- /* Use vpblendd. */
- for (i = 0; i < 4; ++i)
- mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
- vmode = V8SImode;
- goto do_subreg;
-
- default:
- gcc_unreachable ();
+ if (in == 2)
+ return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
+ return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
}
-
- switch (vmode)
+ switch (GET_MODE_SIZE (mode))
{
- case E_V8DFmode:
- case E_V8DImode:
- mmode = QImode;
- break;
- case E_V16SFmode:
- case E_V16SImode:
- mmode = HImode;
- break;
- case E_V32HImode:
- mmode = SImode;
- break;
- case E_V64QImode:
- mmode = DImode;
- break;
- default:
- mmode = VOIDmode;
+ case 1:
+ if (Q_CLASS_P (regclass) || TARGET_64BIT)
+ {
+ if (!in)
+ return ix86_cost->int_store[0];
+ if (TARGET_PARTIAL_REG_DEPENDENCY
+ && optimize_function_for_speed_p (cfun))
+ cost = ix86_cost->movzbl_load;
+ else
+ cost = ix86_cost->int_load[0];
+ if (in == 2)
+ return MAX (cost, ix86_cost->int_store[0]);
+ return cost;
+ }
+ else
+ {
+ if (in == 2)
+ return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
+ if (in)
+ return ix86_cost->movzbl_load;
+ else
+ return ix86_cost->int_store[0] + 4;
+ }
+ break;
+ case 2:
+ if (in == 2)
+ return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
+ return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
+ default:
+ if (in == 2)
+ cost = MAX (ix86_cost->int_load[2], ix86_cost->int_store[2]);
+ else if (in)
+ cost = ix86_cost->int_load[2];
+ else
+ cost = ix86_cost->int_store[2];
+ /* Multiply by the number of GPR moves needed. */
+ return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
}
+}
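+
+/* The IN convention used throughout the function above is 0 for a store,
+ 1 for a load and 2 for the maximum of both.  A standalone sketch of that
+ selection with made-up cost numbers (the helper name and the values 6/8
+ are illustrative only):
+
+   #include <stdio.h>
+
+   static int
+   pick_cost (int load_cost, int store_cost, int in)
+   {
+     if (in == 2)
+       return load_cost > store_cost ? load_cost : store_cost;
+     return in ? load_cost : store_cost;
+   }
+
+   int
+   main (void)
+   {
+     printf ("load %d, store %d, max %d\n",
+             pick_cost (6, 8, 1), pick_cost (6, 8, 0), pick_cost (6, 8, 2));
+     return 0;
+   }  */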
- if (mmode != VOIDmode)
- maskop = force_reg (mmode, gen_int_mode (mask, mmode));
- else
- maskop = GEN_INT (mask);
-
- /* This matches five different patterns with the different modes. */
- x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
- x = gen_rtx_SET (target, x);
- emit_insn (x);
- if (target != d->target)
- emit_move_insn (d->target, gen_lowpart (d->vmode, target));
-
- return true;
+static int
+ix86_memory_move_cost (machine_mode mode, reg_class_t regclass, bool in)
+{
+ return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
- in terms of the variable form of vpermilps.
- Note that we will have already failed the immediate input vpermilps,
- which requires that the high and low part shuffle be identical; the
- variable form doesn't require that. */
+/* Return the cost of moving data from a register in class CLASS1 to
+ one in class CLASS2.
+
+ It is not required that the cost always equal 2 when FROM is the same as TO;
+ on some machines it is expensive to move between registers if they are not
+ general registers. */
-static bool
-expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
+static int
+ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
+ reg_class_t class2_i)
{
- rtx rperm[8], vperm;
- unsigned i;
+ enum reg_class class1 = (enum reg_class) class1_i;
+ enum reg_class class2 = (enum reg_class) class2_i;
- if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
- return false;
+ /* If we require secondary memory, compute the cost of the store followed
+ by the load. To avoid bad register allocation choices, we need
+ this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
- /* We can only permute within the 128-bit lane. */
- for (i = 0; i < 8; ++i)
+ if (inline_secondary_memory_needed (mode, class1, class2, false))
{
- unsigned e = d->perm[i];
- if (i < 4 ? e >= 4 : e < 4)
- return false;
- }
+ int cost = 1;
- if (d->testing_p)
- return true;
+ cost += inline_memory_move_cost (mode, class1, 2);
+ cost += inline_memory_move_cost (mode, class2, 2);
- for (i = 0; i < 8; ++i)
- {
- unsigned e = d->perm[i];
+ /* When copying from a general purpose register we may emit multiple
+ stores followed by a single load, causing a memory size mismatch stall.
+ Count this as an arbitrarily high cost of 20. */
+ if (GET_MODE_BITSIZE (mode) > BITS_PER_WORD
+ && TARGET_MEMORY_MISMATCH_STALL
+ && targetm.class_max_nregs (class1, mode)
+ > targetm.class_max_nregs (class2, mode))
+ cost += 20;
- /* Within each 128-bit lane, the elements of op0 are numbered
- from 0 and the elements of op1 are numbered from 4. */
- if (e >= 8 + 4)
- e -= 8;
- else if (e >= 4)
- e -= 4;
+ /* In the case of FP/MMX moves, the registers actually overlap, and we
+ have to switch modes in order to treat them differently. */
+ if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
+ || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
+ cost += 20;
- rperm[i] = GEN_INT (e);
+ return cost;
}
- vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
- vperm = force_reg (V8SImode, vperm);
- emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
-
- return true;
-}
-
-/* Return true if permutation D can be performed as VMODE permutation
- instead. */
+ /* Moves between the SSE/MMX units and the integer unit are expensive. */
+ if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
+ || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
-static bool
-valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
-{
- unsigned int i, j, chunk;
+ /* ??? By keeping the returned value relatively high, we limit the number
+ of moves between integer and MMX/SSE registers for all targets.
+ Additionally, a high value prevents a problem with x86_modes_tieable_p (),
+ where integer modes in MMX/SSE registers are not tieable
+ because of missing QImode and HImode moves to, from or between
+ MMX/SSE registers. */
+ return MAX (8, MMX_CLASS_P (class1) || MMX_CLASS_P (class2)
+ ? ix86_cost->mmxsse_to_integer : ix86_cost->ssemmx_to_integer);
- if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
- || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
- || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
- return false;
+ if (MAYBE_FLOAT_CLASS_P (class1))
+ return ix86_cost->fp_move;
+ if (MAYBE_SSE_CLASS_P (class1))
+ {
+ if (GET_MODE_BITSIZE (mode) <= 128)
+ return ix86_cost->xmm_move;
+ if (GET_MODE_BITSIZE (mode) <= 256)
+ return ix86_cost->ymm_move;
+ return ix86_cost->zmm_move;
+ }
+ if (MAYBE_MMX_CLASS_P (class1))
+ return ix86_cost->mmx_move;
+ return 2;
+}
- if (GET_MODE_NUNITS (vmode) >= d->nelt)
- return true;
+/* Implement TARGET_HARD_REGNO_NREGS. This is ordinarily the length in
+ words of a value of mode MODE but can be less for certain modes in
+ special long registers.
- chunk = d->nelt / GET_MODE_NUNITS (vmode);
- for (i = 0; i < d->nelt; i += chunk)
- if (d->perm[i] & (chunk - 1))
- return false;
- else
- for (j = 1; j < chunk; ++j)
- if (d->perm[i] + j != d->perm[i + j])
- return false;
+ Actually there are no two-word move instructions for consecutive
+ registers. And only registers 0-3 may have mov byte instructions
+ applied to them. */
- return true;
+static unsigned int
+ix86_hard_regno_nregs (unsigned int regno, machine_mode mode)
+{
+ if (GENERAL_REGNO_P (regno))
+ {
+ if (mode == XFmode)
+ return TARGET_64BIT ? 2 : 3;
+ if (mode == XCmode)
+ return TARGET_64BIT ? 4 : 6;
+ return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
+ }
+ if (COMPLEX_MODE_P (mode))
+ return 2;
+ if (mode == V64SFmode || mode == V64SImode)
+ return 4;
+ return 1;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
- in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
+/* Implement TARGET_HARD_REGNO_MODE_OK. */
static bool
-expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
+ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
{
- unsigned i, nelt, eltsz, mask;
- unsigned char perm[64];
- machine_mode vmode = V16QImode;
- rtx rperm[64], vperm, target, op0, op1;
-
- nelt = d->nelt;
-
- if (!d->one_operand_p)
- {
- if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
- {
- if (TARGET_AVX2
- && valid_perm_using_mode_p (V2TImode, d))
- {
- if (d->testing_p)
- return true;
-
- /* Use vperm2i128 insn. The pattern uses
- V4DImode instead of V2TImode. */
- target = d->target;
- if (d->vmode != V4DImode)
- target = gen_reg_rtx (V4DImode);
- op0 = gen_lowpart (V4DImode, d->op0);
- op1 = gen_lowpart (V4DImode, d->op1);
- rperm[0]
- = GEN_INT ((d->perm[0] / (nelt / 2))
- | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
- emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
- if (target != d->target)
- emit_move_insn (d->target, gen_lowpart (d->vmode, target));
- return true;
- }
- return false;
- }
- }
- else
+ /* Flags and only flags can only hold CCmode values. */
+ if (CC_REGNO_P (regno))
+ return GET_MODE_CLASS (mode) == MODE_CC;
+ if (GET_MODE_CLASS (mode) == MODE_CC
+ || GET_MODE_CLASS (mode) == MODE_RANDOM
+ || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
+ return false;
+ if (STACK_REGNO_P (regno))
+ return VALID_FP_MODE_P (mode);
+ if (MASK_REGNO_P (regno))
+ return (VALID_MASK_REG_MODE (mode)
+ || (TARGET_AVX512BW
+ && VALID_MASK_AVX512BW_MODE (mode)));
+ if (SSE_REGNO_P (regno))
{
- if (GET_MODE_SIZE (d->vmode) == 16)
- {
- if (!TARGET_SSSE3)
- return false;
- }
- else if (GET_MODE_SIZE (d->vmode) == 32)
- {
- if (!TARGET_AVX2)
- return false;
-
- /* V4DImode should be already handled through
- expand_vselect by vpermq instruction. */
- gcc_assert (d->vmode != V4DImode);
-
- vmode = V32QImode;
- if (d->vmode == V8SImode
- || d->vmode == V16HImode
- || d->vmode == V32QImode)
- {
- /* First see if vpermq can be used for
- V8SImode/V16HImode/V32QImode. */
- if (valid_perm_using_mode_p (V4DImode, d))
- {
- for (i = 0; i < 4; i++)
- perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
- if (d->testing_p)
- return true;
- target = gen_reg_rtx (V4DImode);
- if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
- perm, 4, false))
- {
- emit_move_insn (d->target,
- gen_lowpart (d->vmode, target));
- return true;
- }
- return false;
- }
-
- /* Next see if vpermd can be used. */
- if (valid_perm_using_mode_p (V8SImode, d))
- vmode = V8SImode;
- }
- /* Or if vpermps can be used. */
- else if (d->vmode == V8SFmode)
- vmode = V8SImode;
+ /* We implement the move patterns for all vector modes into and
+ out of SSE registers, even when no operation instructions
+ are available. */
- if (vmode == V32QImode)
- {
- /* vpshufb only works intra lanes, it is not
- possible to shuffle bytes in between the lanes. */
- for (i = 0; i < nelt; ++i)
- if ((d->perm[i] ^ i) & (nelt / 2))
- return false;
- }
- }
- else if (GET_MODE_SIZE (d->vmode) == 64)
- {
- if (!TARGET_AVX512BW)
- return false;
+ /* For AVX-512 we allow, regardless of regno:
+ - XI mode
+ - any 512-bit wide vector mode
+ - any scalar mode. */
+ if (TARGET_AVX512F
+ && (mode == XImode
+ || VALID_AVX512F_REG_MODE (mode)
+ || VALID_AVX512F_SCALAR_MODE (mode)))
+ return true;
- /* If vpermq didn't work, vpshufb won't work either. */
- if (d->vmode == V8DFmode || d->vmode == V8DImode)
- return false;
+ /* For AVX-5124FMAPS or AVX-5124VNNIW
+ allow V64SF and V64SI modes for special regnos. */
+ if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
+ && (mode == V64SFmode || mode == V64SImode)
+ && MOD4_SSE_REGNO_P (regno))
+ return true;
- vmode = V64QImode;
- if (d->vmode == V16SImode
- || d->vmode == V32HImode
- || d->vmode == V64QImode)
- {
- /* First see if vpermq can be used for
- V16SImode/V32HImode/V64QImode. */
- if (valid_perm_using_mode_p (V8DImode, d))
- {
- for (i = 0; i < 8; i++)
- perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
- if (d->testing_p)
- return true;
- target = gen_reg_rtx (V8DImode);
- if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
- perm, 8, false))
- {
- emit_move_insn (d->target,
- gen_lowpart (d->vmode, target));
- return true;
- }
- return false;
- }
+ /* TODO: check for QI/HI scalars. */
+ /* AVX-512VL allows the extended SSE registers (xmm16 and up) to be
+ used for 128-bit and 256-bit modes. */
+ if (TARGET_AVX512VL
+ && (mode == OImode
+ || mode == TImode
+ || VALID_AVX256_REG_MODE (mode)
+ || VALID_AVX512VL_128_REG_MODE (mode)))
+ return true;
- /* Next see if vpermd can be used. */
- if (valid_perm_using_mode_p (V16SImode, d))
- vmode = V16SImode;
- }
- /* Or if vpermps can be used. */
- else if (d->vmode == V16SFmode)
- vmode = V16SImode;
- if (vmode == V64QImode)
- {
- /* vpshufb only works intra lanes, it is not
- possible to shuffle bytes in between the lanes. */
- for (i = 0; i < nelt; ++i)
- if ((d->perm[i] ^ i) & (nelt / 4))
- return false;
- }
- }
- else
+ /* xmm16-xmm31 are only available for AVX-512. */
+ if (EXT_REX_SSE_REGNO_P (regno))
return false;
- }
-
- if (d->testing_p)
- return true;
- if (vmode == V8SImode)
- for (i = 0; i < 8; ++i)
- rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
- else if (vmode == V16SImode)
- for (i = 0; i < 16; ++i)
- rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
- else
+ /* OImode and AVX modes are available only when AVX is enabled. */
+ return ((TARGET_AVX
+ && VALID_AVX256_REG_OR_OI_MODE (mode))
+ || VALID_SSE_REG_MODE (mode)
+ || VALID_SSE2_REG_MODE (mode)
+ || VALID_MMX_REG_MODE (mode)
+ || VALID_MMX_REG_MODE_3DNOW (mode));
+ }
+ if (MMX_REGNO_P (regno))
{
- eltsz = GET_MODE_UNIT_SIZE (d->vmode);
- if (!d->one_operand_p)
- mask = 2 * nelt - 1;
- else if (vmode == V16QImode)
- mask = nelt - 1;
- else if (vmode == V64QImode)
- mask = nelt / 4 - 1;
- else
- mask = nelt / 2 - 1;
-
- for (i = 0; i < nelt; ++i)
- {
- unsigned j, e = d->perm[i] & mask;
- for (j = 0; j < eltsz; ++j)
- rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
- }
- }
-
- vperm = gen_rtx_CONST_VECTOR (vmode,
- gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
- vperm = force_reg (vmode, vperm);
-
- target = d->target;
- if (d->vmode != vmode)
- target = gen_reg_rtx (vmode);
- op0 = gen_lowpart (vmode, d->op0);
- if (d->one_operand_p)
- {
- if (vmode == V16QImode)
- emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
- else if (vmode == V32QImode)
- emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
- else if (vmode == V64QImode)
- emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
- else if (vmode == V8SFmode)
- emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
- else if (vmode == V8SImode)
- emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
- else if (vmode == V16SFmode)
- emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
- else if (vmode == V16SImode)
- emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
- else
- gcc_unreachable ();
+ /* We implement the move patterns for 3DNOW modes even in MMX mode,
+ so if the register is available at all, then we can move data of
+ the given mode into or out of it. */
+ return (VALID_MMX_REG_MODE (mode)
+ || VALID_MMX_REG_MODE_3DNOW (mode));
}
- else
+
+ if (mode == QImode)
{
- op1 = gen_lowpart (vmode, d->op1);
- emit_insn (gen_xop_pperm (target, op0, op1, vperm));
+ /* Take care with QImode values - they can live in non-QI regs,
+ but then they do cause partial register stalls. */
+ if (ANY_QI_REGNO_P (regno))
+ return true;
+ if (!TARGET_PARTIAL_REG_STALL)
+ return true;
+ /* LRA checks if the hard register is OK for the given mode.
+ QImode values can live in non-QI regs, so we allow all
+ registers here. */
+ if (lra_in_progress)
+ return true;
+ return !can_create_pseudo_p ();
}
- if (target != d->target)
- emit_move_insn (d->target, gen_lowpart (d->vmode, target));
+ /* We handle both integer and floating point values in the general purpose registers. */
+ else if (VALID_INT_MODE_P (mode))
+ return true;
+ else if (VALID_FP_MODE_P (mode))
+ return true;
+ else if (VALID_DFP_MODE_P (mode))
+ return true;
+ /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
+ on to use that value in smaller contexts, this can easily force a
+ pseudo to be allocated to GENERAL_REGS. Since this is no worse than
+ supporting DImode, allow it. */
+ else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
+ return true;
- return true;
+ return false;
}
-/* For V*[QHS]Imode permutations, check if the same permutation
- can't be performed in a 2x, 4x or 8x wider inner mode. */
+/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The only ABI that
+ saves SSE registers across calls is Win64 (thus no need to check the
+ current ABI here), and with AVX enabled Win64 only guarantees that
+ the low 16 bytes are saved. */
static bool
-canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
- struct expand_vec_perm_d *nd)
+ix86_hard_regno_call_part_clobbered (rtx_insn *insn ATTRIBUTE_UNUSED,
+ unsigned int regno, machine_mode mode)
{
- int i;
- machine_mode mode = VOIDmode;
-
- switch (d->vmode)
- {
- case E_V16QImode: mode = V8HImode; break;
- case E_V32QImode: mode = V16HImode; break;
- case E_V64QImode: mode = V32HImode; break;
- case E_V8HImode: mode = V4SImode; break;
- case E_V16HImode: mode = V8SImode; break;
- case E_V32HImode: mode = V16SImode; break;
- case E_V4SImode: mode = V2DImode; break;
- case E_V8SImode: mode = V4DImode; break;
- case E_V16SImode: mode = V8DImode; break;
- default: return false;
- }
- for (i = 0; i < d->nelt; i += 2)
- if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
- return false;
- nd->vmode = mode;
- nd->nelt = d->nelt / 2;
- for (i = 0; i < nd->nelt; i++)
- nd->perm[i] = d->perm[2 * i] / 2;
- if (GET_MODE_INNER (mode) != DImode)
- canonicalize_vector_int_perm (nd, nd);
- if (nd != d)
- {
- nd->one_operand_p = d->one_operand_p;
- nd->testing_p = d->testing_p;
- if (d->op0 == d->op1)
- nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
- else
- {
- nd->op0 = gen_lowpart (nd->vmode, d->op0);
- nd->op1 = gen_lowpart (nd->vmode, d->op1);
- }
- if (d->testing_p)
- nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
- else
- nd->target = gen_reg_rtx (nd->vmode);
- }
- return true;
+ return SSE_REGNO_P (regno) && GET_MODE_SIZE (mode) > 16;
}
-/* Try to expand one-operand permutation with constant mask. */
+/* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
+ tieable integer mode. */
static bool
-ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
+ix86_tieable_integer_mode_p (machine_mode mode)
{
- machine_mode mode = GET_MODE (d->op0);
- machine_mode maskmode = mode;
- rtx (*gen) (rtx, rtx, rtx) = NULL;
- rtx target, op0, mask;
- rtx vec[64];
+ switch (mode)
+ {
+ case E_HImode:
+ case E_SImode:
+ return true;
- if (!rtx_equal_p (d->op0, d->op1))
- return false;
+ case E_QImode:
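+ /* QImode ties with the wider integer modes on 64-bit targets, and on
+ 32-bit targets whenever partial-register stalls are not a concern. */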
+ return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
- if (!TARGET_AVX512F)
- return false;
+ case E_DImode:
+ return TARGET_64BIT;
- switch (mode)
- {
- case E_V16SImode:
- gen = gen_avx512f_permvarv16si;
- break;
- case E_V16SFmode:
- gen = gen_avx512f_permvarv16sf;
- maskmode = V16SImode;
- break;
- case E_V8DImode:
- gen = gen_avx512f_permvarv8di;
- break;
- case E_V8DFmode:
- gen = gen_avx512f_permvarv8df;
- maskmode = V8DImode;
- break;
default:
return false;
}
-
- target = d->target;
- op0 = d->op0;
- for (int i = 0; i < d->nelt; ++i)
- vec[i] = GEN_INT (d->perm[i]);
- mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
- emit_insn (gen (target, op0, force_reg (maskmode, mask)));
- return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
- in a single instruction. */
+/* Implement TARGET_MODES_TIEABLE_P.
+
+ Return true if MODE1 is accessible in a register that can hold MODE2
+ without copying. That is, all register classes that can hold MODE2
+ can also hold MODE1. */
static bool
-expand_vec_perm_1 (struct expand_vec_perm_d *d)
+ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
{
- unsigned i, nelt = d->nelt;
- struct expand_vec_perm_d nd;
-
- /* Check plain VEC_SELECT first, because AVX has instructions that could
- match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
- input where SEL+CONCAT may not. */
- if (d->one_operand_p)
- {
- int mask = nelt - 1;
- bool identity_perm = true;
- bool broadcast_perm = true;
-
- for (i = 0; i < nelt; i++)
- {
- nd.perm[i] = d->perm[i] & mask;
- if (nd.perm[i] != i)
- identity_perm = false;
- if (nd.perm[i])
- broadcast_perm = false;
- }
+ if (mode1 == mode2)
+ return true;
- if (identity_perm)
- {
- if (!d->testing_p)
- emit_move_insn (d->target, d->op0);
- return true;
- }
- else if (broadcast_perm && TARGET_AVX2)
- {
- /* Use vpbroadcast{b,w,d}. */
- rtx (*gen) (rtx, rtx) = NULL;
- switch (d->vmode)
- {
- case E_V64QImode:
- if (TARGET_AVX512BW)
- gen = gen_avx512bw_vec_dupv64qi_1;
- break;
- case E_V32QImode:
- gen = gen_avx2_pbroadcastv32qi_1;
- break;
- case E_V32HImode:
- if (TARGET_AVX512BW)
- gen = gen_avx512bw_vec_dupv32hi_1;
- break;
- case E_V16HImode:
- gen = gen_avx2_pbroadcastv16hi_1;
- break;
- case E_V16SImode:
- if (TARGET_AVX512F)
- gen = gen_avx512f_vec_dupv16si_1;
- break;
- case E_V8SImode:
- gen = gen_avx2_pbroadcastv8si_1;
- break;
- case E_V16QImode:
- gen = gen_avx2_pbroadcastv16qi;
- break;
- case E_V8HImode:
- gen = gen_avx2_pbroadcastv8hi;
- break;
- case E_V16SFmode:
- if (TARGET_AVX512F)
- gen = gen_avx512f_vec_dupv16sf_1;
- break;
- case E_V8SFmode:
- gen = gen_avx2_vec_dupv8sf_1;
- break;
- case E_V8DFmode:
- if (TARGET_AVX512F)
- gen = gen_avx512f_vec_dupv8df_1;
- break;
- case E_V8DImode:
- if (TARGET_AVX512F)
- gen = gen_avx512f_vec_dupv8di_1;
- break;
- /* For other modes prefer other shuffles this function creates. */
- default: break;
- }
- if (gen != NULL)
- {
- if (!d->testing_p)
- emit_insn (gen (d->target, d->op0));
- return true;
- }
- }
+ if (ix86_tieable_integer_mode_p (mode1)
+ && ix86_tieable_integer_mode_p (mode2))
+ return true;
- if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
- return true;
+ /* MODE2 being XFmode implies fp stack or general regs, which means we
+ can tie any smaller floating point modes to it. Note that we do not
+ tie this with TFmode. */
+ if (mode2 == XFmode)
+ return mode1 == SFmode || mode1 == DFmode;
- /* There are plenty of patterns in sse.md that are written for
- SEL+CONCAT and are not replicated for a single op. Perhaps
- that should be changed, to avoid the nastiness here. */
+ /* MODE2 being DFmode implies fp stack, general or sse regs, which means
+ that we can tie it with SFmode. */
+ if (mode2 == DFmode)
+ return mode1 == SFmode;
- /* Recognize interleave style patterns, which means incrementing
- every other permutation operand. */
- for (i = 0; i < nelt; i += 2)
- {
- nd.perm[i] = d->perm[i] & mask;
- nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
- }
- if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
- d->testing_p))
- return true;
+ /* If MODE2 is only appropriate for an SSE register, then tie with
+ any other mode acceptable to SSE registers. */
+ if (GET_MODE_SIZE (mode2) == 64
+ && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
+ return (GET_MODE_SIZE (mode1) == 64
+ && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
+ if (GET_MODE_SIZE (mode2) == 32
+ && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
+ return (GET_MODE_SIZE (mode1) == 32
+ && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
+ if (GET_MODE_SIZE (mode2) == 16
+ && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
+ return (GET_MODE_SIZE (mode1) == 16
+ && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
- /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
- if (nelt >= 4)
- {
- for (i = 0; i < nelt; i += 4)
- {
- nd.perm[i + 0] = d->perm[i + 0] & mask;
- nd.perm[i + 1] = d->perm[i + 1] & mask;
- nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
- nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
- }
+ /* If MODE2 is appropriate for an MMX register, then tie
+ with any other mode acceptable to MMX registers. */
+ if (GET_MODE_SIZE (mode2) == 8
+ && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
+ return (GET_MODE_SIZE (mode1) == 8
+ && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
- if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
- d->testing_p))
- return true;
- }
- }
+ return false;
+}
- /* Try movss/movsd instructions. */
- if (expand_vec_perm_movs (d))
- return true;
+/* Return the cost of moving between two registers of mode MODE. */
- /* Finally, try the fully general two operand permute. */
- if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
- d->testing_p))
- return true;
+static int
+ix86_set_reg_reg_cost (machine_mode mode)
+{
+ unsigned int units = UNITS_PER_WORD;
- /* Recognize interleave style patterns with reversed operands. */
- if (!d->one_operand_p)
+ switch (GET_MODE_CLASS (mode))
{
- for (i = 0; i < nelt; ++i)
- {
- unsigned e = d->perm[i];
- if (e >= nelt)
- e -= nelt;
- else
- e += nelt;
- nd.perm[i] = e;
- }
-
- if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
- d->testing_p))
- return true;
- }
-
- /* Try the SSE4.1 blend variable merge instructions. */
- if (expand_vec_perm_blend (d))
- return true;
-
- /* Try one of the AVX vpermil variable permutations. */
- if (expand_vec_perm_vpermil (d))
- return true;
-
- /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
- vpshufb, vpermd, vpermps or vpermq variable permutation. */
- if (expand_vec_perm_pshufb (d))
- return true;
+ default:
+ break;
- /* Try the AVX2 vpalignr instruction. */
- if (expand_vec_perm_palignr (d, true))
- return true;
+ case MODE_CC:
+ units = GET_MODE_SIZE (CCmode);
+ break;
- /* Try the AVX512F vperm{s,d} instructions. */
- if (ix86_expand_vec_one_operand_perm_avx512 (d))
- return true;
+ case MODE_FLOAT:
+ if ((TARGET_SSE && mode == TFmode)
+ || (TARGET_80387 && mode == XFmode)
+ || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
+ || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
+ units = GET_MODE_SIZE (mode);
+ break;
- /* Try the AVX512F vpermt2/vpermi2 instructions. */
- if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
- return true;
+ case MODE_COMPLEX_FLOAT:
+ if ((TARGET_SSE && mode == TCmode)
+ || (TARGET_80387 && mode == XCmode)
+ || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
+ || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
+ units = GET_MODE_SIZE (mode);
+ break;
- /* See if we can get the same permutation in different vector integer
- mode. */
- if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
- {
- if (!d->testing_p)
- emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
- return true;
+ case MODE_VECTOR_INT:
+ case MODE_VECTOR_FLOAT:
+ if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
+ || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
+ || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
+ || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
+ || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
+ units = GET_MODE_SIZE (mode);
}
- return false;
+
+ /* Return the cost of moving between two registers of mode MODE,
+ assuming that the move will be in pieces of at most UNITS bytes. */
+ return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
- in terms of a pair of pshuflw + pshufhw instructions. */
+/* Return cost of vector operation in MODE given that scalar version has
+ COST. */
-static bool
-expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
+static int
+ix86_vec_cost (machine_mode mode, int cost)
{
- unsigned char perm2[MAX_VECT_LEN];
- unsigned i;
- bool ok;
-
- if (d->vmode != V8HImode || !d->one_operand_p)
- return false;
+ if (!VECTOR_MODE_P (mode))
+ return cost;
- /* The two permutations only operate in 64-bit lanes. */
- for (i = 0; i < 4; ++i)
- if (d->perm[i] >= 4)
- return false;
- for (i = 4; i < 8; ++i)
- if (d->perm[i] < 4)
- return false;
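+ /* Tunings that split 128-bit SSE operations (TARGET_SSE_SPLIT_REGS)
+ pay roughly double. */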
+ if (GET_MODE_BITSIZE (mode) == 128
+ && TARGET_SSE_SPLIT_REGS)
+ return cost * 2;
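+ /* On tunings that prefer 128-bit vector operations, assume wider
+ operations are carried out as multiple 128-bit chunks and scale
+ the cost accordingly. */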
+ if (GET_MODE_BITSIZE (mode) > 128
+ && TARGET_AVX128_OPTIMAL)
+ return cost * GET_MODE_BITSIZE (mode) / 128;
+ return cost;
+}
- if (d->testing_p)
- return true;
+/* Return cost of multiplication in MODE. */
- /* Emit the pshuflw. */
- memcpy (perm2, d->perm, 4);
- for (i = 4; i < 8; ++i)
- perm2[i] = i;
- ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
- gcc_assert (ok);
+static int
+ix86_multiplication_cost (const struct processor_costs *cost,
+ enum machine_mode mode)
+{
+ machine_mode inner_mode = mode;
+ if (VECTOR_MODE_P (mode))
+ inner_mode = GET_MODE_INNER (mode);
- /* Emit the pshufhw. */
- memcpy (perm2 + 4, d->perm + 4, 4);
- for (i = 0; i < 4; ++i)
- perm2[i] = i;
- ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
- gcc_assert (ok);
+ if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
+ return inner_mode == DFmode ? cost->mulsd : cost->mulss;
+ else if (X87_FLOAT_MODE_P (mode))
+ return cost->fmul;
+ else if (FLOAT_MODE_P (mode))
+ return ix86_vec_cost (mode,
+ inner_mode == DFmode ? cost->mulsd : cost->mulss);
+ else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+ {
+ /* vpmullq is used in this case. No emulation is needed. */
+ if (TARGET_AVX512DQ)
+ return ix86_vec_cost (mode, cost->mulss);
- return true;
+ /* V*QImode is emulated with 7-13 insns. */
+ if (mode == V16QImode || mode == V32QImode)
+ {
+ int extra = 11;
+ if (TARGET_XOP && mode == V16QImode)
+ extra = 5;
+ else if (TARGET_SSSE3)
+ extra = 6;
+ return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * extra);
+ }
+ /* V*DImode is emulated with 5-8 insns. */
+ else if (mode == V2DImode || mode == V4DImode)
+ {
+ if (TARGET_XOP && mode == V2DImode)
+ return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 3);
+ else
+ return ix86_vec_cost (mode, cost->mulss * 3 + cost->sse_op * 5);
+ }
+ /* Without sse4.1, we don't have PMULLD; it's emulated with 7
+ insns, including two PMULUDQ. */
+ else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
+ return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5);
+ else
+ return ix86_vec_cost (mode, cost->mulss);
+ }
+ else
+ return (cost->mult_init[MODE_INDEX (mode)] + cost->mult_bit * 7);
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
- the permutation using the SSSE3 palignr instruction. This succeeds
- when all of the elements in PERM fit within one vector and we merely
- need to shift them down so that a single vector permutation has a
- chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only
- the vpalignr instruction itself can perform the requested permutation. */
+/* Return cost of division in MODE. */
+
+static int
+ix86_division_cost (const struct processor_costs *cost,
+ enum machine_mode mode)
+{
+ machine_mode inner_mode = mode;
+ if (VECTOR_MODE_P (mode))
+ inner_mode = GET_MODE_INNER (mode);
-static bool
-expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
-{
- unsigned i, nelt = d->nelt;
- unsigned min, max, minswap, maxswap;
- bool in_order, ok, swap = false;
- rtx shift, target;
- struct expand_vec_perm_d dcopy;
-
- /* Even with AVX, palignr only operates on 128-bit vectors,
- in AVX2 palignr operates on both 128-bit lanes. */
- if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
- && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
- return false;
+ if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
+ return inner_mode == DFmode ? cost->divsd : cost->divss;
+ else if (X87_FLOAT_MODE_P (mode))
+ return cost->fdiv;
+ else if (FLOAT_MODE_P (mode))
+ return ix86_vec_cost (mode,
+ inner_mode == DFmode ? cost->divsd : cost->divss);
+ else
+ return cost->divide[MODE_INDEX (mode)];
+}
- min = 2 * nelt;
- max = 0;
- minswap = 2 * nelt;
- maxswap = 0;
- for (i = 0; i < nelt; ++i)
- {
- unsigned e = d->perm[i];
- unsigned eswap = d->perm[i] ^ nelt;
- if (GET_MODE_SIZE (d->vmode) == 32)
- {
- e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
- eswap = e ^ (nelt / 2);
- }
- if (e < min)
- min = e;
- if (e > max)
- max = e;
- if (eswap < minswap)
- minswap = eswap;
- if (eswap > maxswap)
- maxswap = eswap;
- }
- if (min == 0
- || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
- {
- if (d->one_operand_p
- || minswap == 0
- || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
- ? nelt / 2 : nelt))
- return false;
- swap = true;
- min = minswap;
- max = maxswap;
- }
+#define COSTS_N_BYTES(N) ((N) * 2)
- /* Given that we have SSSE3, we know we'll be able to implement the
- single operand permutation after the palignr with pshufb for
- 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
- first. */
- if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
- return true;
+/* Return cost of shift in MODE.
+ If CONSTANT_OP1 is true, the op1 value is known and set in OP1_VAL.
+ AND_IN_OP1 says whether op1 is the result of an AND, and
+ SHIFT_AND_TRUNCATE whether op1 is a SUBREG of such an AND
+ (i.e. a truncated shift count).
- dcopy = *d;
- if (swap)
- {
- dcopy.op0 = d->op1;
- dcopy.op1 = d->op0;
- for (i = 0; i < nelt; ++i)
- dcopy.perm[i] ^= nelt;
- }
+ SKIP_OP0/1 is set to true if cost of OP0/1 should be ignored. */
- in_order = true;
- for (i = 0; i < nelt; ++i)
+static int
+ix86_shift_rotate_cost (const struct processor_costs *cost,
+ enum machine_mode mode, bool constant_op1,
+ HOST_WIDE_INT op1_val,
+ bool speed,
+ bool and_in_op1,
+ bool shift_and_truncate,
+ bool *skip_op0, bool *skip_op1)
+{
+ if (skip_op0)
+ *skip_op0 = *skip_op1 = false;
+ if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
{
- unsigned e = dcopy.perm[i];
- if (GET_MODE_SIZE (d->vmode) == 32
- && e >= nelt
- && (e & (nelt / 2 - 1)) < min)
- e = e - min - (nelt / 2);
+ /* V*QImode is emulated with 1-11 insns. */
+ if (mode == V16QImode || mode == V32QImode)
+ {
+ int count = 11;
+ if (TARGET_XOP && mode == V16QImode)
+ {
+ /* For XOP we use vpshab, which requires a broadcast of the
+ value to the variable shift insn. For constants this
+ means a V16QImode constant in memory; even when we can perform
+ the shift with one insn, set the cost to prefer paddb. */
+ if (constant_op1)
+ {
+ if (skip_op1)
+ *skip_op1 = true;
+ return ix86_vec_cost (mode,
+ cost->sse_op
+ + (speed
+ ? 2
+ : COSTS_N_BYTES
+ (GET_MODE_UNIT_SIZE (mode))));
+ }
+ count = 3;
+ }
+ else if (TARGET_SSSE3)
+ count = 7;
+ return ix86_vec_cost (mode, cost->sse_op * count);
+ }
else
- e = e - min;
- if (e != i)
- in_order = false;
- dcopy.perm[i] = e;
- }
- dcopy.one_operand_p = true;
-
- if (single_insn_only_p && !in_order)
- return false;
-
- /* For AVX2, test whether we can permute the result in one instruction. */
- if (d->testing_p)
- {
- if (in_order)
- return true;
- dcopy.op1 = dcopy.op0;
- return expand_vec_perm_1 (&dcopy);
+ return ix86_vec_cost (mode, cost->sse_op);
}
-
- shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
- if (GET_MODE_SIZE (d->vmode) == 16)
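+ /* Wider-than-word shifts (e.g. DImode on 32-bit targets) are assumed
+ to expand to a pair of single-word shifts plus fix-up code. */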
+ if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
{
- target = gen_reg_rtx (TImode);
- emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
- gen_lowpart (TImode, dcopy.op0), shift));
+ if (constant_op1)
+ {
+ if (op1_val > 32)
+ return cost->shift_const + COSTS_N_INSNS (2);
+ else
+ return cost->shift_const * 2;
+ }
+ else
+ {
+ if (and_in_op1)
+ return cost->shift_var * 2;
+ else
+ return cost->shift_var * 6 + COSTS_N_INSNS (2);
+ }
}
else
{
- target = gen_reg_rtx (V2TImode);
- emit_insn (gen_avx2_palignrv2ti (target,
- gen_lowpart (V2TImode, dcopy.op1),
- gen_lowpart (V2TImode, dcopy.op0),
- shift));
- }
-
- dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
-
- /* Test for the degenerate case where the alignment by itself
- produces the desired permutation. */
- if (in_order)
- {
- emit_move_insn (d->target, dcopy.op0);
- return true;
+ if (constant_op1)
+ return cost->shift_const;
+ else if (shift_and_truncate)
+ {
+ if (skip_op0)
+ *skip_op0 = *skip_op1 = true;
+ /* The count truncation is folded into the shift itself, so
+ return just the variable-shift cost. */
+ return cost->shift_var;
+ }
+ else
+ return cost->shift_var;
}
-
- ok = expand_vec_perm_1 (&dcopy);
- gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
-
- return ok;
+ return cost->shift_const;
}
-/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
- the permutation using the SSE4_1 pblendv instruction. Potentially
- reduces permutation from 2 pshufb and or to 1 pshufb and pblendv. */
+/* Compute a (partial) cost for rtx X. Return true if the complete
+ cost has been computed, and false if subexpressions should be
+ scanned. In either case, *TOTAL contains the cost result. */
static bool
-expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
+ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
+ int *total, bool speed)
{
- unsigned i, which, nelt = d->nelt;
- struct expand_vec_perm_d dcopy, dcopy1;
- machine_mode vmode = d->vmode;
- bool ok;
-
- /* Use the same checks as in expand_vec_perm_blend. */
- if (d->one_operand_p)
- return false;
- if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
- ;
- else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
- ;
- else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
- ;
- else
- return false;
-
- /* Figure out where permutation elements stay not in their
- respective lanes. */
- for (i = 0, which = 0; i < nelt; ++i)
- {
- unsigned e = d->perm[i];
- if (e != i)
- which |= (e < nelt ? 1 : 2);
- }
- /* We can pblend the part where elements stay not in their
- respective lanes only when these elements are all in one
- half of a permutation.
- {0 1 8 3 4 5 9 7} is ok as 8, 9 are at not at their respective
- lanes, but both 8 and 9 >= 8
- {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their
- respective lanes and 8 >= 8, but 2 not. */
- if (which != 1 && which != 2)
- return false;
- if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
- return true;
-
- /* First we apply one operand permutation to the part where
- elements stay not in their respective lanes. */
- dcopy = *d;
- if (which == 2)
- dcopy.op0 = dcopy.op1 = d->op1;
- else
- dcopy.op0 = dcopy.op1 = d->op0;
- if (!d->testing_p)
- dcopy.target = gen_reg_rtx (vmode);
- dcopy.one_operand_p = true;
-
- for (i = 0; i < nelt; ++i)
- dcopy.perm[i] = d->perm[i] & (nelt - 1);
-
- ok = expand_vec_perm_1 (&dcopy);
- if (GET_MODE_SIZE (vmode) != 16 && !ok)
- return false;
- else
- gcc_assert (ok);
- if (d->testing_p)
- return true;
-
- /* Next we put permuted elements into their positions. */
- dcopy1 = *d;
- if (which == 2)
- dcopy1.op1 = dcopy.target;
- else
- dcopy1.op0 = dcopy.target;
-
- for (i = 0; i < nelt; ++i)
- dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
+ rtx mask;
+ enum rtx_code code = GET_CODE (x);
+ enum rtx_code outer_code = (enum rtx_code) outer_code_i;
+ const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
+ int src_cost;
- ok = expand_vec_perm_blend (&dcopy1);
- gcc_assert (ok);
+ switch (code)
+ {
+ case SET:
+ if (register_operand (SET_DEST (x), VOIDmode)
+ && register_operand (SET_SRC (x), VOIDmode))
+ {
+ *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
+ return true;
+ }
- return true;
-}
+ if (register_operand (SET_SRC (x), VOIDmode))
+ /* Avoid potentially incorrect high cost from rtx_costs
+ for non-tieable SUBREGs. */
+ src_cost = 0;
+ else
+ {
+ src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed);
-static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
+ if (CONSTANT_P (SET_SRC (x)))
+ /* Constant costs assume a base value of COSTS_N_INSNS (1) and add
+ a small value, possibly zero for cheap constants. */
+ src_cost += COSTS_N_INSNS (1);
+ }
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
- a two vector permutation into a single vector permutation by using
- an interleave operation to merge the vectors. */
+ *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed);
+ return true;
-static bool
-expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
-{
- struct expand_vec_perm_d dremap, dfinal;
- unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
- unsigned HOST_WIDE_INT contents;
- unsigned char remap[2 * MAX_VECT_LEN];
- rtx_insn *seq;
- bool ok, same_halves = false;
+ case CONST_INT:
+ case CONST:
+ case LABEL_REF:
+ case SYMBOL_REF:
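+ /* Constants that can be used directly as immediate operands are
+ essentially free; others get a token extra cost. */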
+ if (x86_64_immediate_operand (x, VOIDmode))
+ *total = 0;
+ else
+ *total = 1;
+ return true;
- if (GET_MODE_SIZE (d->vmode) == 16)
- {
- if (d->one_operand_p)
- return false;
- }
- else if (GET_MODE_SIZE (d->vmode) == 32)
- {
- if (!TARGET_AVX)
- return false;
- /* For 32-byte modes allow even d->one_operand_p.
- The lack of cross-lane shuffling in some instructions
- might prevent a single insn shuffle. */
- dfinal = *d;
- dfinal.testing_p = true;
- /* If expand_vec_perm_interleave3 can expand this into
- a 3 insn sequence, give up and let it be expanded as
- 3 insn sequence. While that is one insn longer,
- it doesn't need a memory operand and in the common
- case that both interleave low and high permutations
- with the same operands are adjacent needs 4 insns
- for both after CSE. */
- if (expand_vec_perm_interleave3 (&dfinal))
- return false;
- }
- else
- return false;
+ case CONST_DOUBLE:
+ if (IS_STACK_MODE (mode))
+ switch (standard_80387_constant_p (x))
+ {
+ case -1:
+ case 0:
+ break;
+ case 1: /* 0.0 */
+ *total = 1;
+ return true;
+ default: /* Other constants */
+ *total = 2;
+ return true;
+ }
+ /* FALLTHRU */
- /* Examine from whence the elements come. */
- contents = 0;
- for (i = 0; i < nelt; ++i)
- contents |= HOST_WIDE_INT_1U << d->perm[i];
+ case CONST_VECTOR:
+ switch (standard_sse_constant_p (x, mode))
+ {
+ case 0:
+ break;
+ case 1: /* 0: xor eliminates false dependency */
+ *total = 0;
+ return true;
+ default: /* -1: cmp contains false dependency */
+ *total = 1;
+ return true;
+ }
+ /* FALLTHRU */
- memset (remap, 0xff, sizeof (remap));
- dremap = *d;
+ case CONST_WIDE_INT:
+ /* Fall back to (MEM (SYMBOL_REF)), since that's where
+ it'll probably end up. Add a penalty for size. */
+ *total = (COSTS_N_INSNS (1)
+ + (!TARGET_64BIT && flag_pic)
+ + (GET_MODE_SIZE (mode) <= 4
+ ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
+ return true;
- if (GET_MODE_SIZE (d->vmode) == 16)
- {
- unsigned HOST_WIDE_INT h1, h2, h3, h4;
+ case ZERO_EXTEND:
+ /* Zero extension is often completely free on x86_64, so make
+ it as cheap as possible. */
+ if (TARGET_64BIT && mode == DImode
+ && GET_MODE (XEXP (x, 0)) == SImode)
+ *total = 1;
+ else if (TARGET_ZERO_EXTEND_WITH_AND)
+ *total = cost->add;
+ else
+ *total = cost->movzx;
+ return false;
- /* Split the two input vectors into 4 halves. */
- h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
- h2 = h1 << nelt2;
- h3 = h2 << nelt2;
- h4 = h3 << nelt2;
+ case SIGN_EXTEND:
+ *total = cost->movsx;
+ return false;
- /* If the elements from the low halves use interleave low, and similarly
- for interleave high. If the elements are from mis-matched halves, we
- can use shufps for V4SF/V4SI or do a DImode shuffle. */
- if ((contents & (h1 | h3)) == contents)
- {
- /* punpckl* */
- for (i = 0; i < nelt2; ++i)
- {
- remap[i] = i * 2;
- remap[i + nelt] = i * 2 + 1;
- dremap.perm[i * 2] = i;
- dremap.perm[i * 2 + 1] = i + nelt;
- }
- if (!TARGET_SSE2 && d->vmode == V4SImode)
- dremap.vmode = V4SFmode;
- }
- else if ((contents & (h2 | h4)) == contents)
- {
- /* punpckh* */
- for (i = 0; i < nelt2; ++i)
- {
- remap[i + nelt2] = i * 2;
- remap[i + nelt + nelt2] = i * 2 + 1;
- dremap.perm[i * 2] = i + nelt2;
- dremap.perm[i * 2 + 1] = i + nelt + nelt2;
- }
- if (!TARGET_SSE2 && d->vmode == V4SImode)
- dremap.vmode = V4SFmode;
- }
- else if ((contents & (h1 | h4)) == contents)
+ case ASHIFT:
+ if (SCALAR_INT_MODE_P (mode)
+ && GET_MODE_SIZE (mode) < UNITS_PER_WORD
+ && CONST_INT_P (XEXP (x, 1)))
{
- /* shufps */
- for (i = 0; i < nelt2; ++i)
+ HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
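+ /* A left shift by 1 is just an add; shifts by 2 or 3 can use an
+ lea when that is no more expensive than a shift. */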
+ if (value == 1)
{
- remap[i] = i;
- remap[i + nelt + nelt2] = i + nelt2;
- dremap.perm[i] = i;
- dremap.perm[i + nelt2] = i + nelt + nelt2;
+ *total = cost->add;
+ return false;
}
- if (nelt != 4)
+ if ((value == 2 || value == 3)
+ && cost->lea <= cost->shift_const)
{
- /* shufpd */
- dremap.vmode = V2DImode;
- dremap.nelt = 2;
- dremap.perm[0] = 0;
- dremap.perm[1] = 3;
+ *total = cost->lea;
+ return false;
}
}
- else if ((contents & (h2 | h3)) == contents)
+ /* FALLTHRU */
+
+ case ROTATE:
+ case ASHIFTRT:
+ case LSHIFTRT:
+ case ROTATERT:
+ bool skip_op0, skip_op1;
+ *total = ix86_shift_rotate_cost (cost, mode, CONSTANT_P (XEXP (x, 1)),
+ CONST_INT_P (XEXP (x, 1))
+ ? INTVAL (XEXP (x, 1)) : -1,
+ speed,
+ GET_CODE (XEXP (x, 1)) == AND,
+ SUBREG_P (XEXP (x, 1))
+ && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND,
+ &skip_op0, &skip_op1);
+ if (skip_op0 || skip_op1)
{
- /* shufps */
- for (i = 0; i < nelt2; ++i)
- {
- remap[i + nelt2] = i;
- remap[i + nelt] = i + nelt2;
- dremap.perm[i] = i + nelt2;
- dremap.perm[i + nelt2] = i + nelt;
- }
- if (nelt != 4)
- {
- /* shufpd */
- dremap.vmode = V2DImode;
- dremap.nelt = 2;
- dremap.perm[0] = 1;
- dremap.perm[1] = 2;
- }
+ if (!skip_op0)
+ *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
+ if (!skip_op1)
+ *total += rtx_cost (XEXP (x, 1), mode, code, 0, speed);
+ return true;
}
- else
- return false;
- }
- else
- {
- unsigned int nelt4 = nelt / 4, nzcnt = 0;
- unsigned HOST_WIDE_INT q[8];
- unsigned int nonzero_halves[4];
+ return false;
- /* Split the two input vectors into 8 quarters. */
- q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
- for (i = 1; i < 8; ++i)
- q[i] = q[0] << (nelt4 * i);
- for (i = 0; i < 4; ++i)
- if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
- {
- nonzero_halves[nzcnt] = i;
- ++nzcnt;
- }
+ case FMA:
+ {
+ rtx sub;
- if (nzcnt == 1)
- {
- gcc_assert (d->one_operand_p);
- nonzero_halves[1] = nonzero_halves[0];
- same_halves = true;
- }
- else if (d->one_operand_p)
- {
- gcc_assert (nonzero_halves[0] == 0);
- gcc_assert (nonzero_halves[1] == 1);
- }
+ gcc_assert (FLOAT_MODE_P (mode));
+ gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
+
+ *total = ix86_vec_cost (mode,
+ GET_MODE_INNER (mode) == SFmode
+ ? cost->fmass : cost->fmasd);
+ *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
+
+ /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
+ sub = XEXP (x, 0);
+ if (GET_CODE (sub) == NEG)
+ sub = XEXP (sub, 0);
+ *total += rtx_cost (sub, mode, FMA, 0, speed);
- if (nzcnt <= 2)
+ sub = XEXP (x, 2);
+ if (GET_CODE (sub) == NEG)
+ sub = XEXP (sub, 0);
+ *total += rtx_cost (sub, mode, FMA, 2, speed);
+ return true;
+ }
+
+ case MULT:
+ if (!FLOAT_MODE_P (mode) && !VECTOR_MODE_P (mode))
{
- if (d->perm[0] / nelt2 == nonzero_halves[1])
+ rtx op0 = XEXP (x, 0);
+ rtx op1 = XEXP (x, 1);
+ int nbits;
+ if (CONST_INT_P (XEXP (x, 1)))
{
- /* Attempt to increase the likelihood that dfinal
- shuffle will be intra-lane. */
- std::swap (nonzero_halves[0], nonzero_halves[1]);
+ unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
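+ /* Count the set bits in the constant multiplier; each one adds
+ cost->mult_bit below (value &= value - 1 clears the lowest set bit). */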
+ for (nbits = 0; value != 0; value &= value - 1)
+ nbits++;
}
+ else
+ /* This is arbitrary. */
+ nbits = 7;
- /* vperm2f128 or vperm2i128. */
- for (i = 0; i < nelt2; ++i)
+ /* Compute costs correctly for widening multiplication. */
+ if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
+ && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
+ == GET_MODE_SIZE (mode))
{
- remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
- remap[i + nonzero_halves[0] * nelt2] = i;
- dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
- dremap.perm[i] = i + nonzero_halves[0] * nelt2;
+ int is_mulwiden = 0;
+ machine_mode inner_mode = GET_MODE (op0);
+
+ if (GET_CODE (op0) == GET_CODE (op1))
+ is_mulwiden = 1, op1 = XEXP (op1, 0);
+ else if (CONST_INT_P (op1))
+ {
+ if (GET_CODE (op0) == SIGN_EXTEND)
+ is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
+ == INTVAL (op1);
+ else
+ is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
+ }
+
+ if (is_mulwiden)
+ op0 = XEXP (op0, 0), mode = GET_MODE (op0);
}
- if (d->vmode != V8SFmode
- && d->vmode != V4DFmode
- && d->vmode != V8SImode)
+ *total = (cost->mult_init[MODE_INDEX (mode)]
+ + nbits * cost->mult_bit
+ + rtx_cost (op0, mode, outer_code, opno, speed)
+ + rtx_cost (op1, mode, outer_code, opno, speed));
+
+ return true;
+ }
+ *total = ix86_multiplication_cost (cost, mode);
+ return false;
+
+ case DIV:
+ case UDIV:
+ case MOD:
+ case UMOD:
+ *total = ix86_division_cost (cost, mode);
+ return false;
+
+ case PLUS:
+ if (GET_MODE_CLASS (mode) == MODE_INT
+ && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
+ {
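+ /* Several addition shapes map onto a single lea
+ (base + index*scale + disp) or an add-with-carry;
+ cost them accordingly. */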
+ if (GET_CODE (XEXP (x, 0)) == PLUS
+ && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
+ && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
+ && CONSTANT_P (XEXP (x, 1)))
{
- dremap.vmode = V8SImode;
- dremap.nelt = 8;
- for (i = 0; i < 4; ++i)
+ HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
+ if (val == 2 || val == 4 || val == 8)
{
- dremap.perm[i] = i + nonzero_halves[0] * 4;
- dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
+ *total = cost->lea;
+ *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
+ outer_code, opno, speed);
+ *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
+ outer_code, opno, speed);
+ *total += rtx_cost (XEXP (x, 1), mode,
+ outer_code, opno, speed);
+ return true;
}
}
- }
- else if (d->one_operand_p)
- return false;
- else if (TARGET_AVX2
- && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
- {
- /* vpunpckl* */
- for (i = 0; i < nelt4; ++i)
+ else if (GET_CODE (XEXP (x, 0)) == MULT
+ && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
{
- remap[i] = i * 2;
- remap[i + nelt] = i * 2 + 1;
- remap[i + nelt2] = i * 2 + nelt2;
- remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
- dremap.perm[i * 2] = i;
- dremap.perm[i * 2 + 1] = i + nelt;
- dremap.perm[i * 2 + nelt2] = i + nelt2;
- dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
+ HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
+ if (val == 2 || val == 4 || val == 8)
+ {
+ *total = cost->lea;
+ *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
+ outer_code, opno, speed);
+ *total += rtx_cost (XEXP (x, 1), mode,
+ outer_code, opno, speed);
+ return true;
+ }
}
- }
- else if (TARGET_AVX2
- && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
- {
- /* vpunpckh* */
- for (i = 0; i < nelt4; ++i)
+ else if (GET_CODE (XEXP (x, 0)) == PLUS)
{
- remap[i + nelt4] = i * 2;
- remap[i + nelt + nelt4] = i * 2 + 1;
- remap[i + nelt2 + nelt4] = i * 2 + nelt2;
- remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
- dremap.perm[i * 2] = i + nelt4;
- dremap.perm[i * 2 + 1] = i + nelt + nelt4;
- dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
- dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
+ /* Add with carry, ignore the cost of adding a carry flag. */
+ if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode))
+ *total = cost->add;
+ else
+ {
+ *total = cost->lea;
+ *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
+ outer_code, opno, speed);
+ }
+
+ *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
+ outer_code, opno, speed);
+ *total += rtx_cost (XEXP (x, 1), mode,
+ outer_code, opno, speed);
+ return true;
}
}
- else
- return false;
- }
+ /* FALLTHRU */
- /* Use the remapping array set up above to move the elements from their
- swizzled locations into their final destinations. */
- dfinal = *d;
- for (i = 0; i < nelt; ++i)
- {
- unsigned e = remap[d->perm[i]];
- gcc_assert (e < nelt);
- /* If same_halves is true, both halves of the remapped vector are the
- same. Avoid cross-lane accesses if possible. */
- if (same_halves && i >= nelt2)
+ case MINUS:
+ /* Subtract with borrow, ignore the cost of subtracting a carry flag. */
+ if (GET_MODE_CLASS (mode) == MODE_INT
+ && GET_MODE_SIZE (mode) <= UNITS_PER_WORD
+ && GET_CODE (XEXP (x, 0)) == MINUS
+ && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode))
{
- gcc_assert (e < nelt2);
- dfinal.perm[i] = e + nelt2;
+ *total = cost->add;
+ *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
+ outer_code, opno, speed);
+ *total += rtx_cost (XEXP (x, 1), mode,
+ outer_code, opno, speed);
+ return true;
}
- else
- dfinal.perm[i] = e;
- }
- if (!d->testing_p)
- {
- dremap.target = gen_reg_rtx (dremap.vmode);
- dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
- }
- dfinal.op1 = dfinal.op0;
- dfinal.one_operand_p = true;
-
- /* Test if the final remap can be done with a single insn. For V4SFmode or
- V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
- start_sequence ();
- ok = expand_vec_perm_1 (&dfinal);
- seq = get_insns ();
- end_sequence ();
-
- if (!ok)
- return false;
-
- if (d->testing_p)
- return true;
-
- if (dremap.vmode != dfinal.vmode)
- {
- dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
- dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
- }
-
- ok = expand_vec_perm_1 (&dremap);
- gcc_assert (ok);
-
- emit_insn (seq);
- return true;
-}
-
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
- a single vector cross-lane permutation into vpermq followed
- by any of the single insn permutations. */
-
-static bool
-expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
-{
- struct expand_vec_perm_d dremap, dfinal;
- unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
- unsigned contents[2];
- bool ok;
-
- if (!(TARGET_AVX2
- && (d->vmode == V32QImode || d->vmode == V16HImode)
- && d->one_operand_p))
- return false;
-
- contents[0] = 0;
- contents[1] = 0;
- for (i = 0; i < nelt2; ++i)
- {
- contents[0] |= 1u << (d->perm[i] / nelt4);
- contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
- }
- for (i = 0; i < 2; ++i)
- {
- unsigned int cnt = 0;
- for (j = 0; j < 4; ++j)
- if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
+ if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
+ {
+ *total = cost->addss;
return false;
- }
-
- if (d->testing_p)
- return true;
-
- dremap = *d;
- dremap.vmode = V4DImode;
- dremap.nelt = 4;
- dremap.target = gen_reg_rtx (V4DImode);
- dremap.op0 = gen_lowpart (V4DImode, d->op0);
- dremap.op1 = dremap.op0;
- dremap.one_operand_p = true;
- for (i = 0; i < 2; ++i)
- {
- unsigned int cnt = 0;
- for (j = 0; j < 4; ++j)
- if ((contents[i] & (1u << j)) != 0)
- dremap.perm[2 * i + cnt++] = j;
- for (; cnt < 2; ++cnt)
- dremap.perm[2 * i + cnt] = 0;
- }
-
- dfinal = *d;
- dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
- dfinal.op1 = dfinal.op0;
- dfinal.one_operand_p = true;
- for (i = 0, j = 0; i < nelt; ++i)
- {
- if (i == nelt2)
- j = 2;
- dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
- if ((d->perm[i] / nelt4) == dremap.perm[j])
- ;
- else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
- dfinal.perm[i] |= nelt4;
- else
- gcc_unreachable ();
- }
-
- ok = expand_vec_perm_1 (&dremap);
- gcc_assert (ok);
-
- ok = expand_vec_perm_1 (&dfinal);
- gcc_assert (ok);
-
- return true;
-}
-
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
- a vector permutation using two instructions, vperm2f128 resp.
- vperm2i128 followed by any single in-lane permutation. */
-
-static bool
-expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
-{
- struct expand_vec_perm_d dfirst, dsecond;
- unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
- bool ok;
+ }
+ else if (X87_FLOAT_MODE_P (mode))
+ {
+ *total = cost->fadd;
+ return false;
+ }
+ else if (FLOAT_MODE_P (mode))
+ {
+ *total = ix86_vec_cost (mode, cost->addss);
+ return false;
+ }
+ /* FALLTHRU */
- if (!TARGET_AVX
- || GET_MODE_SIZE (d->vmode) != 32
- || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
- return false;
+ case AND:
+ case IOR:
+ case XOR:
+ if (GET_MODE_CLASS (mode) == MODE_INT
+ && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
+ {
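+ /* Wider-than-word logical operations are carried out piecewise,
+ one word at a time. */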
+ *total = (cost->add * 2
+ + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
+ << (GET_MODE (XEXP (x, 0)) != DImode))
+ + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
+ << (GET_MODE (XEXP (x, 1)) != DImode)));
+ return true;
+ }
+ /* FALLTHRU */
- dsecond = *d;
- dsecond.one_operand_p = false;
- dsecond.testing_p = true;
-
- /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
- immediate. For perm < 16 the second permutation uses
- d->op0 as first operand, for perm >= 16 it uses d->op1
- as first operand. The second operand is the result of
- vperm2[fi]128. */
- for (perm = 0; perm < 32; perm++)
- {
- /* Ignore permutations which do not move anything cross-lane. */
- if (perm < 16)
- {
- /* The second shuffle for e.g. V4DFmode has
- 0123 and ABCD operands.
- Ignore AB23, as 23 is already in the second lane
- of the first operand. */
- if ((perm & 0xc) == (1 << 2)) continue;
- /* And 01CD, as 01 is in the first lane of the first
- operand. */
- if ((perm & 3) == 0) continue;
- /* And 4567, as then the vperm2[fi]128 doesn't change
- anything on the original 4567 second operand. */
- if ((perm & 0xf) == ((3 << 2) | 2)) continue;
+ case NEG:
+ if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
+ {
+ *total = cost->sse_op;
+ return false;
}
- else
+ else if (X87_FLOAT_MODE_P (mode))
{
- /* The second shuffle for e.g. V4DFmode has
- 4567 and ABCD operands.
- Ignore AB67, as 67 is already in the second lane
- of the first operand. */
- if ((perm & 0xc) == (3 << 2)) continue;
- /* And 45CD, as 45 is in the first lane of the first
- operand. */
- if ((perm & 3) == 2) continue;
- /* And 0123, as then the vperm2[fi]128 doesn't change
- anything on the original 0123 first operand. */
- if ((perm & 0xf) == (1 << 2)) continue;
- }
-
- for (i = 0; i < nelt; i++)
- {
- j = d->perm[i] / nelt2;
- if (j == ((perm >> (2 * (i >= nelt2))) & 3))
- dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
- else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
- dsecond.perm[i] = d->perm[i] & (nelt - 1);
- else
- break;
+ *total = cost->fchs;
+ return false;
}
-
- if (i == nelt)
+ else if (FLOAT_MODE_P (mode))
{
- start_sequence ();
- ok = expand_vec_perm_1 (&dsecond);
- end_sequence ();
+ *total = ix86_vec_cost (mode, cost->sse_op);
+ return false;
}
+ /* FALLTHRU */
+
+ case NOT:
+ if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+ *total = ix86_vec_cost (mode, cost->sse_op);
+ else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
+ *total = cost->add * 2;
else
- ok = false;
+ *total = cost->add;
+ return false;
- if (ok)
+ case COMPARE:
+ if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
+ && XEXP (XEXP (x, 0), 1) == const1_rtx
+ && CONST_INT_P (XEXP (XEXP (x, 0), 2))
+ && XEXP (x, 1) == const0_rtx)
{
- if (d->testing_p)
- return true;
-
- /* Found a usable second shuffle. dfirst will be
- vperm2f128 on d->op0 and d->op1. */
- dsecond.testing_p = false;
- dfirst = *d;
- dfirst.target = gen_reg_rtx (d->vmode);
- for (i = 0; i < nelt; i++)
- dfirst.perm[i] = (i & (nelt2 - 1))
- + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
-
- canonicalize_perm (&dfirst);
- ok = expand_vec_perm_1 (&dfirst);
- gcc_assert (ok);
-
- /* And dsecond is some single insn shuffle, taking
- d->op0 and result of vperm2f128 (if perm < 16) or
- d->op1 and result of vperm2f128 (otherwise). */
- if (perm >= 16)
- dsecond.op0 = dsecond.op1;
- dsecond.op1 = dfirst.target;
-
- ok = expand_vec_perm_1 (&dsecond);
- gcc_assert (ok);
-
+ /* This kind of construct is implemented using test[bwl].
+ Treat it as if we had an AND. */
+ mode = GET_MODE (XEXP (XEXP (x, 0), 0));
+ *total = (cost->add
+ + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
+ opno, speed)
+ + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
return true;
}
- /* For one operand, the only useful vperm2f128 permutation is 0x01
- aka lanes swap. */
- if (d->one_operand_p)
- return false;
- }
+ /* The embedded comparison operand is completely free. */
+ if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
+ && XEXP (x, 1) == const0_rtx)
+ *total = 0;
- return false;
-}
+ return false;
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
- a two vector permutation using 2 intra-lane interleave insns
- and cross-lane shuffle for 32-byte vectors. */
+ case FLOAT_EXTEND:
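+ /* With x87 math the extension is free; with SSE math it costs
+ roughly an addss-class conversion. */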
+ if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
+ *total = 0;
+ else
+ *total = ix86_vec_cost (mode, cost->addss);
+ return false;
-static bool
-expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
-{
- unsigned i, nelt;
- rtx (*gen) (rtx, rtx, rtx);
+ case FLOAT_TRUNCATE:
+ if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
+ *total = cost->fadd;
+ else
+ *total = ix86_vec_cost (mode, cost->addss);
+ return false;
- if (d->one_operand_p)
- return false;
- if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
- ;
- else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
- ;
- else
- return false;
+ case ABS:
+ /* SSE requires memory load for the constant operand. It may make
+ sense to account for this. Of course the constant operand may or
+ may not be reused. */
+ if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
+ *total = cost->sse_op;
+ else if (X87_FLOAT_MODE_P (mode))
+ *total = cost->fabs;
+ else if (FLOAT_MODE_P (mode))
+ *total = ix86_vec_cost (mode, cost->sse_op);
+ return false;
- nelt = d->nelt;
- if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
- return false;
- for (i = 0; i < nelt; i += 2)
- if (d->perm[i] != d->perm[0] + i / 2
- || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
+ case SQRT:
+ if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
+ *total = mode == SFmode ? cost->sqrtss : cost->sqrtsd;
+ else if (X87_FLOAT_MODE_P (mode))
+ *total = cost->fsqrt;
+ else if (FLOAT_MODE_P (mode))
+ *total = ix86_vec_cost (mode,
+ mode == SFmode ? cost->sqrtss : cost->sqrtsd);
return false;
- if (d->testing_p)
- return true;
+ case UNSPEC:
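+ /* Reading the TLS thread pointer (UNSPEC_TP) is a single
+ segment-relative access; treat it as free. */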
+ if (XINT (x, 1) == UNSPEC_TP)
+ *total = 0;
+ return false;
- switch (d->vmode)
- {
- case E_V32QImode:
- if (d->perm[0])
- gen = gen_vec_interleave_highv32qi;
- else
- gen = gen_vec_interleave_lowv32qi;
- break;
- case E_V16HImode:
- if (d->perm[0])
- gen = gen_vec_interleave_highv16hi;
- else
- gen = gen_vec_interleave_lowv16hi;
- break;
- case E_V8SImode:
- if (d->perm[0])
- gen = gen_vec_interleave_highv8si;
- else
- gen = gen_vec_interleave_lowv8si;
- break;
- case E_V4DImode:
- if (d->perm[0])
- gen = gen_vec_interleave_highv4di;
- else
- gen = gen_vec_interleave_lowv4di;
- break;
- case E_V8SFmode:
- if (d->perm[0])
- gen = gen_vec_interleave_highv8sf;
- else
- gen = gen_vec_interleave_lowv8sf;
- break;
- case E_V4DFmode:
- if (d->perm[0])
- gen = gen_vec_interleave_highv4df;
+ case VEC_SELECT:
+ case VEC_CONCAT:
+ case VEC_DUPLICATE:
+ /* ??? Assume all of these vector manipulation patterns are
+ recognizable, in which case they all have pretty much the
+ same cost. */
+ *total = cost->sse_op;
+ return true;
+ case VEC_MERGE:
+ mask = XEXP (x, 2);
+ /* This is a masked instruction; assume the same cost
+ as the non-masked variant. */
+ if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
+ *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
else
- gen = gen_vec_interleave_lowv4df;
- break;
+ *total = cost->sse_op;
+ return true;
+
default:
- gcc_unreachable ();
+ return false;
}
-
- emit_insn (gen (d->target, d->op0, d->op1));
- return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
- a single vector permutation using a single intra-lane vector
- permutation, vperm2f128 swapping the lanes and vblend* insn blending
- the non-swapped and swapped vectors together. */
-
-static bool
-expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
-{
- struct expand_vec_perm_d dfirst, dsecond;
- unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
- rtx_insn *seq;
- bool ok;
- rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
-
- if (!TARGET_AVX
- || TARGET_AVX2
- || (d->vmode != V8SFmode && d->vmode != V4DFmode)
- || !d->one_operand_p)
- return false;
-
- dfirst = *d;
- for (i = 0; i < nelt; i++)
- dfirst.perm[i] = 0xff;
- for (i = 0, msk = 0; i < nelt; i++)
- {
- j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
- if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
- return false;
- dfirst.perm[j] = d->perm[i];
- if (j != i)
- msk |= (1 << i);
- }
- for (i = 0; i < nelt; i++)
- if (dfirst.perm[i] == 0xff)
- dfirst.perm[i] = i;
-
- if (!d->testing_p)
- dfirst.target = gen_reg_rtx (dfirst.vmode);
-
- start_sequence ();
- ok = expand_vec_perm_1 (&dfirst);
- seq = get_insns ();
- end_sequence ();
-
- if (!ok)
- return false;
-
- if (d->testing_p)
- return true;
-
- emit_insn (seq);
-
- dsecond = *d;
- dsecond.op0 = dfirst.target;
- dsecond.op1 = dfirst.target;
- dsecond.one_operand_p = true;
- dsecond.target = gen_reg_rtx (dsecond.vmode);
- for (i = 0; i < nelt; i++)
- dsecond.perm[i] = i ^ nelt2;
-
- ok = expand_vec_perm_1 (&dsecond);
- gcc_assert (ok);
+#if TARGET_MACHO
- blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
- emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
- return true;
-}
+static int current_machopic_label_num;
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
- permutation using two vperm2f128, followed by a vshufpd insn blending
- the two vectors together. */
+/* Given a symbol name and its associated stub, write out the
+ definition of the stub. */
-static bool
-expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
+void
+machopic_output_stub (FILE *file, const char *symb, const char *stub)
{
- struct expand_vec_perm_d dfirst, dsecond, dthird;
- bool ok;
-
- if (!TARGET_AVX || (d->vmode != V4DFmode))
- return false;
-
- if (d->testing_p)
- return true;
-
- dfirst = *d;
- dsecond = *d;
- dthird = *d;
-
- dfirst.perm[0] = (d->perm[0] & ~1);
- dfirst.perm[1] = (d->perm[0] & ~1) + 1;
- dfirst.perm[2] = (d->perm[2] & ~1);
- dfirst.perm[3] = (d->perm[2] & ~1) + 1;
- dsecond.perm[0] = (d->perm[1] & ~1);
- dsecond.perm[1] = (d->perm[1] & ~1) + 1;
- dsecond.perm[2] = (d->perm[3] & ~1);
- dsecond.perm[3] = (d->perm[3] & ~1) + 1;
- dthird.perm[0] = (d->perm[0] % 2);
- dthird.perm[1] = (d->perm[1] % 2) + 4;
- dthird.perm[2] = (d->perm[2] % 2) + 2;
- dthird.perm[3] = (d->perm[3] % 2) + 6;
-
- dfirst.target = gen_reg_rtx (dfirst.vmode);
- dsecond.target = gen_reg_rtx (dsecond.vmode);
- dthird.op0 = dfirst.target;
- dthird.op1 = dsecond.target;
- dthird.one_operand_p = false;
-
- canonicalize_perm (&dfirst);
- canonicalize_perm (&dsecond);
-
- ok = expand_vec_perm_1 (&dfirst)
- && expand_vec_perm_1 (&dsecond)
- && expand_vec_perm_1 (&dthird);
+ unsigned int length;
+ char *binder_name, *symbol_name, lazy_ptr_name[32];
+ int label = ++current_machopic_label_num;
- gcc_assert (ok);
+ /* For 64-bit we shouldn't get here. */
+ gcc_assert (!TARGET_64BIT);
- return true;
-}
+ /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
+ symb = targetm.strip_name_encoding (symb);
-/* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
- permutation with two pshufb insns and an ior. We should have already
- failed all two instruction sequences. */
+ length = strlen (stub);
+ binder_name = XALLOCAVEC (char, length + 32);
+ GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
-static bool
-expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
-{
- rtx rperm[2][16], vperm, l, h, op, m128;
- unsigned int i, nelt, eltsz;
+ length = strlen (symb);
+ symbol_name = XALLOCAVEC (char, length + 32);
+ GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
- if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
- return false;
- gcc_assert (!d->one_operand_p);
+ sprintf (lazy_ptr_name, "L%d$lz", label);
- if (d->testing_p)
- return true;
+ if (MACHOPIC_ATT_STUB)
+ switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
+ else if (MACHOPIC_PURE)
+ switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
+ else
+ switch_to_section (darwin_sections[machopic_symbol_stub_section]);
- nelt = d->nelt;
- eltsz = GET_MODE_UNIT_SIZE (d->vmode);
+ fprintf (file, "%s:\n", stub);
+ fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
- /* Generate two permutation masks. If the required element is within
- the given vector it is shuffled into the proper lane. If the required
- element is in the other vector, force a zero into the lane by setting
- bit 7 in the permutation mask. */
- m128 = GEN_INT (-128);
- for (i = 0; i < nelt; ++i)
+ if (MACHOPIC_ATT_STUB)
{
- unsigned j, e = d->perm[i];
- unsigned which = (e >= nelt);
- if (e >= nelt)
- e -= nelt;
-
- for (j = 0; j < eltsz; ++j)
- {
- rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
- rperm[1-which][i*eltsz + j] = m128;
- }
+ fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
}
-
- vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
- vperm = force_reg (V16QImode, vperm);
-
- l = gen_reg_rtx (V16QImode);
- op = gen_lowpart (V16QImode, d->op0);
- emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
-
- vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
- vperm = force_reg (V16QImode, vperm);
-
- h = gen_reg_rtx (V16QImode);
- op = gen_lowpart (V16QImode, d->op1);
- emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
-
- op = d->target;
- if (d->vmode != V16QImode)
- op = gen_reg_rtx (V16QImode);
- emit_insn (gen_iorv16qi3 (op, l, h));
- if (op != d->target)
- emit_move_insn (d->target, gen_lowpart (d->vmode, op));
-
- return true;
-}
-
-/* Implement arbitrary permutation of one V32QImode and V16QImode operand
- with two vpshufb insns, vpermq and vpor. We should have already failed
- all two or three instruction sequences. */
-
-static bool
-expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
-{
- rtx rperm[2][32], vperm, l, h, hp, op, m128;
- unsigned int i, nelt, eltsz;
-
- if (!TARGET_AVX2
- || !d->one_operand_p
- || (d->vmode != V32QImode && d->vmode != V16HImode))
- return false;
-
- if (d->testing_p)
- return true;
-
- nelt = d->nelt;
- eltsz = GET_MODE_UNIT_SIZE (d->vmode);
-
- /* Generate two permutation masks. If the required element is within
- the same lane, it is shuffled in. If the required element from the
- other lane, force a zero by setting bit 7 in the permutation mask.
- In the other mask the mask has non-negative elements if element
- is requested from the other lane, but also moved to the other lane,
- so that the result of vpshufb can have the two V2TImode halves
- swapped. */
- m128 = GEN_INT (-128);
- for (i = 0; i < nelt; ++i)
+ else if (MACHOPIC_PURE)
{
- unsigned j, e = d->perm[i] & (nelt / 2 - 1);
- unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
-
- for (j = 0; j < eltsz; ++j)
- {
- rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
- rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
- }
+ /* PIC stub. */
+ /* 25-byte PIC stub using "CALL get_pc_thunk". */
+ rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
+ output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
+ fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
+ label, lazy_ptr_name, label);
+ fprintf (file, "\tjmp\t*%%ecx\n");
}
+ else
+ fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
- vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
- vperm = force_reg (V32QImode, vperm);
-
- h = gen_reg_rtx (V32QImode);
- op = gen_lowpart (V32QImode, d->op0);
- emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
+ /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
+ it needs no stub-binding-helper. */
+ if (MACHOPIC_ATT_STUB)
+ return;
- /* Swap the 128-byte lanes of h into hp. */
- hp = gen_reg_rtx (V4DImode);
- op = gen_lowpart (V4DImode, h);
- emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
- const1_rtx));
+ fprintf (file, "%s:\n", binder_name);
- vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
- vperm = force_reg (V32QImode, vperm);
+ if (MACHOPIC_PURE)
+ {
+ fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
+ fprintf (file, "\tpushl\t%%ecx\n");
+ }
+ else
+ fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
- l = gen_reg_rtx (V32QImode);
- op = gen_lowpart (V32QImode, d->op0);
- emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
+ fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
- op = d->target;
- if (d->vmode != V32QImode)
- op = gen_reg_rtx (V32QImode);
- emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
- if (op != d->target)
- emit_move_insn (d->target, gen_lowpart (d->vmode, op));
+ /* N.B. Keep the correspondence of these
+ 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
+ old-pic/new-pic/non-pic stubs; altering this will break
+ compatibility with existing dylibs. */
+ if (MACHOPIC_PURE)
+ {
+ /* 25-byte PIC stub using "CALL get_pc_thunk". */
+ switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
+ }
+ else
+ /* 16-byte -mdynamic-no-pic stub. */
+ switch_to_section (darwin_sections[machopic_lazy_symbol_ptr3_section]);
- return true;
+ fprintf (file, "%s:\n", lazy_ptr_name);
+ fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
+ fprintf (file, ASM_LONG "%s\n", binder_name);
}
+#endif /* TARGET_MACHO */
-/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
- and extract-odd permutations of two V32QImode and V16QImode operand
- with two vpshufb insns, vpor and vpermq. We should have already
- failed all two or three instruction sequences. */
+/* Order the registers for the register allocator. */
-static bool
-expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
+void
+x86_order_regs_for_local_alloc (void)
{
- rtx rperm[2][32], vperm, l, h, ior, op, m128;
- unsigned int i, nelt, eltsz;
-
- if (!TARGET_AVX2
- || d->one_operand_p
- || (d->vmode != V32QImode && d->vmode != V16HImode))
- return false;
-
- for (i = 0; i < d->nelt; ++i)
- if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
- return false;
-
- if (d->testing_p)
- return true;
+ int pos = 0;
+ int i;
- nelt = d->nelt;
- eltsz = GET_MODE_UNIT_SIZE (d->vmode);
-
- /* Generate two permutation masks. In the first permutation mask
- the first quarter will contain indexes for the first half
- of the op0, the second quarter will contain bit 7 set, third quarter
- will contain indexes for the second half of the op0 and the
- last quarter bit 7 set. In the second permutation mask
- the first quarter will contain bit 7 set, the second quarter
- indexes for the first half of the op1, the third quarter bit 7 set
- and last quarter indexes for the second half of the op1.
- I.e. the first mask e.g. for V32QImode extract even will be:
- 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
- (all values masked with 0xf except for -128) and second mask
- for extract even will be
- -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
- m128 = GEN_INT (-128);
- for (i = 0; i < nelt; ++i)
- {
- unsigned j, e = d->perm[i] & (nelt / 2 - 1);
- unsigned which = d->perm[i] >= nelt;
- unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
+ /* First allocate the call-clobbered general purpose registers. */
+ for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
+ if (GENERAL_REGNO_P (i) && call_used_regs[i])
+ reg_alloc_order [pos++] = i;
- for (j = 0; j < eltsz; ++j)
- {
- rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
- rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
- }
- }
+ /* Then the call-preserved general purpose registers. */
+ for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
+ if (GENERAL_REGNO_P (i) && !call_used_regs[i])
+ reg_alloc_order [pos++] = i;
- vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
- vperm = force_reg (V32QImode, vperm);
+ /* x87 registers come before the SSE registers when we are doing FP
+ math with them. */
+ if (!TARGET_SSE_MATH)
+ for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
+ reg_alloc_order [pos++] = i;
- l = gen_reg_rtx (V32QImode);
- op = gen_lowpart (V32QImode, d->op0);
- emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
+ /* SSE registers. */
+ for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
+ reg_alloc_order [pos++] = i;
+ for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
+ reg_alloc_order [pos++] = i;
- vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
- vperm = force_reg (V32QImode, vperm);
+ /* Extended REX SSE registers. */
+ for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
+ reg_alloc_order [pos++] = i;
- h = gen_reg_rtx (V32QImode);
- op = gen_lowpart (V32QImode, d->op1);
- emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
+ /* Mask registers. */
+ for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
+ reg_alloc_order [pos++] = i;
- ior = gen_reg_rtx (V32QImode);
- emit_insn (gen_iorv32qi3 (ior, l, h));
+ /* x87 registers. */
+ if (TARGET_SSE_MATH)
+ for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
+ reg_alloc_order [pos++] = i;
- /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
- op = gen_reg_rtx (V4DImode);
- ior = gen_lowpart (V4DImode, ior);
- emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
- const1_rtx, GEN_INT (3)));
- emit_move_insn (d->target, gen_lowpart (d->vmode, op));
+ for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
+ reg_alloc_order [pos++] = i;
- return true;
+ /* Initialize the rest of the array; some registers are never
+ allocated at all. */
+ while (pos < FIRST_PSEUDO_REGISTER)
+ reg_alloc_order [pos++] = 0;
}
-/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
- and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
- with two "and" and "pack" or two "shift" and "pack" insns. We should
- have already failed all two instruction sequences. */
-
static bool
-expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
+ix86_ms_bitfield_layout_p (const_tree record_type)
{
- rtx op, dop0, dop1, t;
- unsigned i, odd, c, s, nelt = d->nelt;
- bool end_perm = false;
- machine_mode half_mode;
- rtx (*gen_and) (rtx, rtx, rtx);
- rtx (*gen_pack) (rtx, rtx, rtx);
- rtx (*gen_shift) (rtx, rtx, rtx);
+ return ((TARGET_MS_BITFIELD_LAYOUT
+ && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
+ || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
+}
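+
+/* Illustrative use of the attributes checked by the hook above (the
+ attribute names are real, the struct is only an example):
+
+ struct __attribute__ ((ms_struct)) A { char c; int i : 8; };
+ struct __attribute__ ((gcc_struct)) B { char c; int i : 8; };
+
+ A is always laid out with the MS bitfield rules and B always with the
+ native GCC rules, regardless of -mms-bitfields. */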
- if (d->one_operand_p)
- return false;
+/* Return an expression indicating where the "this" parameter is
+ located on entry to FUNCTION. */
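+/* Roughly (a summary of the code below, not an ABI statement): in 64-bit
+ code "this" is in the first integer argument register (%rdi, or %rcx for
+ the MS ABI), moving to the next register when the return value goes
+ through a hidden pointer; in 32-bit code it is in %ecx for fastcall and
+ thiscall, in %eax for regparm functions, and otherwise on the stack just
+ above the return address. */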
- switch (d->vmode)
- {
- case E_V8HImode:
- /* Required for "pack". */
- if (!TARGET_SSE4_1)
- return false;
- c = 0xffff;
- s = 16;
- half_mode = V4SImode;
- gen_and = gen_andv4si3;
- gen_pack = gen_sse4_1_packusdw;
- gen_shift = gen_lshrv4si3;
- break;
- case E_V16QImode:
- /* No check as all instructions are SSE2. */
- c = 0xff;
- s = 8;
- half_mode = V8HImode;
- gen_and = gen_andv8hi3;
- gen_pack = gen_sse2_packuswb;
- gen_shift = gen_lshrv8hi3;
- break;
- case E_V16HImode:
- if (!TARGET_AVX2)
- return false;
- c = 0xffff;
- s = 16;
- half_mode = V8SImode;
- gen_and = gen_andv8si3;
- gen_pack = gen_avx2_packusdw;
- gen_shift = gen_lshrv8si3;
- end_perm = true;
- break;
- case E_V32QImode:
- if (!TARGET_AVX2)
- return false;
- c = 0xff;
- s = 8;
- half_mode = V16HImode;
- gen_and = gen_andv16hi3;
- gen_pack = gen_avx2_packuswb;
- gen_shift = gen_lshrv16hi3;
- end_perm = true;
- break;
- default:
- /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
- general shuffles. */
- return false;
- }
+static rtx
+x86_this_parameter (tree function)
+{
+ tree type = TREE_TYPE (function);
+ bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
+ int nregs;
- /* Check that permutation is even or odd. */
- odd = d->perm[0];
- if (odd > 1)
- return false;
+ if (TARGET_64BIT)
+ {
+ const int *parm_regs;
- for (i = 1; i < nelt; ++i)
- if (d->perm[i] != 2 * i + odd)
- return false;
+ if (ix86_function_type_abi (type) == MS_ABI)
+ parm_regs = x86_64_ms_abi_int_parameter_registers;
+ else
+ parm_regs = x86_64_int_parameter_registers;
+ return gen_rtx_REG (Pmode, parm_regs[aggr]);
+ }
- if (d->testing_p)
- return true;
+ nregs = ix86_function_regparm (type, function);
- dop0 = gen_reg_rtx (half_mode);
- dop1 = gen_reg_rtx (half_mode);
- if (odd == 0)
- {
- t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
- t = force_reg (half_mode, t);
- emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
- emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
- }
- else
+ if (nregs > 0 && !stdarg_p (type))
{
- emit_insn (gen_shift (dop0,
- gen_lowpart (half_mode, d->op0),
- GEN_INT (s)));
- emit_insn (gen_shift (dop1,
- gen_lowpart (half_mode, d->op1),
- GEN_INT (s)));
- }
- /* In AVX2 for 256 bit case we need to permute pack result. */
- if (TARGET_AVX2 && end_perm)
- {
- op = gen_reg_rtx (d->vmode);
- t = gen_reg_rtx (V4DImode);
- emit_insn (gen_pack (op, dop0, dop1));
- emit_insn (gen_avx2_permv4di_1 (t,
- gen_lowpart (V4DImode, op),
- const0_rtx,
- const2_rtx,
- const1_rtx,
- GEN_INT (3)));
- emit_move_insn (d->target, gen_lowpart (d->vmode, t));
+ int regno;
+ unsigned int ccvt = ix86_get_callcvt (type);
+
+ if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
+ regno = aggr ? DX_REG : CX_REG;
+ else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
+ {
+ regno = CX_REG;
+ if (aggr)
+ return gen_rtx_MEM (SImode,
+ plus_constant (Pmode, stack_pointer_rtx, 4));
+ }
+ else
+ {
+ regno = AX_REG;
+ if (aggr)
+ {
+ regno = DX_REG;
+ if (nregs == 1)
+ return gen_rtx_MEM (SImode,
+ plus_constant (Pmode,
+ stack_pointer_rtx, 4));
+ }
+ }
+ return gen_rtx_REG (SImode, regno);
}
- else
- emit_insn (gen_pack (d->target, dop0, dop1));
- return true;
+ return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
+ aggr ? 8 : 4));
}
-/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
- and extract-odd permutations of two V64QI operands
- with two "shifts", two "truncs" and one "concat" insns for "odd"
- and two "truncs" and one concat insn for "even."
- Have already failed all two instruction sequences. */
+/* Determine whether x86_output_mi_thunk can succeed. */
static bool
-expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
+x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
+ const_tree function)
{
- rtx t1, t2, t3, t4;
- unsigned i, odd, nelt = d->nelt;
-
- if (!TARGET_AVX512BW
- || d->one_operand_p
- || d->vmode != V64QImode)
- return false;
-
- /* Check that permutation is even or odd. */
- odd = d->perm[0];
- if (odd > 1)
- return false;
-
- for (i = 1; i < nelt; ++i)
- if (d->perm[i] != 2 * i + odd)
- return false;
-
- if (d->testing_p)
+ /* 64-bit can handle anything. */
+ if (TARGET_64BIT)
return true;
+ /* For 32-bit, everything's fine if we have one free register. */
+ if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
+ return true;
- if (odd)
- {
- t1 = gen_reg_rtx (V32HImode);
- t2 = gen_reg_rtx (V32HImode);
- emit_insn (gen_lshrv32hi3 (t1,
- gen_lowpart (V32HImode, d->op0),
- GEN_INT (8)));
- emit_insn (gen_lshrv32hi3 (t2,
- gen_lowpart (V32HImode, d->op1),
- GEN_INT (8)));
- }
- else
- {
- t1 = gen_lowpart (V32HImode, d->op0);
- t2 = gen_lowpart (V32HImode, d->op1);
- }
+ /* Need a free register for vcall_offset. */
+ if (vcall_offset)
+ return false;
- t3 = gen_reg_rtx (V32QImode);
- t4 = gen_reg_rtx (V32QImode);
- emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
- emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
- emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
+ /* Need a free register for GOT references. */
+ if (flag_pic && !targetm.binds_local_p (function))
+ return false;
+ /* Otherwise ok. */
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
- and extract-odd permutations. */
+/* Output the assembler code for a thunk function. THUNK_DECL is the
+ declaration for the thunk function itself, FUNCTION is the decl for
+ the target function. DELTA is an immediate constant offset to be
+ added to THIS. If VCALL_OFFSET is nonzero, the word at
+ *(*THIS + VCALL_OFFSET) should be added to THIS. */
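+
+/* In rough pseudo-C the emitted thunk does
+
+ this += DELTA;
+ if (VCALL_OFFSET)
+ this += *(ptrdiff_t *) (*(char **) this + VCALL_OFFSET);
+ goto FUNCTION;
+
+ as a tail call with no frame of its own; the code below emits that
+ directly as RTL. */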
-static bool
-expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
+static void
+x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
+ HOST_WIDE_INT vcall_offset, tree function)
{
- rtx t1, t2, t3, t4, t5;
+ rtx this_param = x86_this_parameter (function);
+ rtx this_reg, tmp, fnaddr;
+ unsigned int tmp_regno;
+ rtx_insn *insn;
- switch (d->vmode)
+ if (TARGET_64BIT)
+ tmp_regno = R10_REG;
+ else
{
- case E_V4DFmode:
- if (d->testing_p)
- break;
- t1 = gen_reg_rtx (V4DFmode);
- t2 = gen_reg_rtx (V4DFmode);
-
- /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
- emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
- emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
-
- /* Now an unpck[lh]pd will produce the result required. */
- if (odd)
- t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
+ unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
+ if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
+ tmp_regno = AX_REG;
+ else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
+ tmp_regno = DX_REG;
else
- t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
- emit_insn (t3);
- break;
-
- case E_V8SFmode:
- {
- int mask = odd ? 0xdd : 0x88;
-
- if (d->testing_p)
- break;
- t1 = gen_reg_rtx (V8SFmode);
- t2 = gen_reg_rtx (V8SFmode);
- t3 = gen_reg_rtx (V8SFmode);
-
- /* Shuffle within the 128-bit lanes to produce:
- { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
- emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
- GEN_INT (mask)));
-
- /* Shuffle the lanes around to produce:
- { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
- emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
- GEN_INT (0x3)));
-
- /* Shuffle within the 128-bit lanes to produce:
- { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
- emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
-
- /* Shuffle within the 128-bit lanes to produce:
- { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
- emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
-
- /* Shuffle the lanes around to produce:
- { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
- emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
- GEN_INT (0x20)));
- }
- break;
-
- case E_V2DFmode:
- case E_V4SFmode:
- case E_V2DImode:
- case E_V4SImode:
- /* These are always directly implementable by expand_vec_perm_1. */
- gcc_unreachable ();
+ tmp_regno = CX_REG;
+ }
- case E_V8HImode:
- if (TARGET_SSE4_1)
- return expand_vec_perm_even_odd_pack (d);
- else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
- return expand_vec_perm_pshufb2 (d);
- else
- {
- if (d->testing_p)
- break;
- /* We need 2*log2(N)-1 operations to achieve odd/even
- with interleave. */
- t1 = gen_reg_rtx (V8HImode);
- t2 = gen_reg_rtx (V8HImode);
- emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
- emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
- emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
- emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
- if (odd)
- t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
- else
- t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
- emit_insn (t3);
- }
- break;
+ emit_note (NOTE_INSN_PROLOGUE_END);
- case E_V16QImode:
- return expand_vec_perm_even_odd_pack (d);
+ /* If CET branch protection is enabled, insert an ENDBR instruction. */
+ if ((flag_cf_protection & CF_BRANCH))
+ emit_insn (gen_nop_endbr ());
- case E_V16HImode:
- case E_V32QImode:
- return expand_vec_perm_even_odd_pack (d);
+ /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
+ pull it in now and let DELTA benefit. */
+ if (REG_P (this_param))
+ this_reg = this_param;
+ else if (vcall_offset)
+ {
+ /* Put the this parameter into %eax. */
+ this_reg = gen_rtx_REG (Pmode, AX_REG);
+ emit_move_insn (this_reg, this_param);
+ }
+ else
+ this_reg = NULL_RTX;
- case E_V64QImode:
- return expand_vec_perm_even_odd_trunc (d);
+ /* Adjust the this parameter by a fixed constant. */
+ if (delta)
+ {
+ rtx delta_rtx = GEN_INT (delta);
+ rtx delta_dst = this_reg ? this_reg : this_param;
- case E_V4DImode:
- if (!TARGET_AVX2)
+ if (TARGET_64BIT)
{
- struct expand_vec_perm_d d_copy = *d;
- d_copy.vmode = V4DFmode;
- if (d->testing_p)
- d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
- else
- d_copy.target = gen_reg_rtx (V4DFmode);
- d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
- d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
- if (expand_vec_perm_even_odd_1 (&d_copy, odd))
+ if (!x86_64_general_operand (delta_rtx, Pmode))
{
- if (!d->testing_p)
- emit_move_insn (d->target,
- gen_lowpart (V4DImode, d_copy.target));
- return true;
+ tmp = gen_rtx_REG (Pmode, tmp_regno);
+ emit_move_insn (tmp, delta_rtx);
+ delta_rtx = tmp;
}
- return false;
}
- if (d->testing_p)
- break;
+ ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
+ }
- t1 = gen_reg_rtx (V4DImode);
- t2 = gen_reg_rtx (V4DImode);
+ /* Adjust the this parameter by a value stored in the vtable. */
+ if (vcall_offset)
+ {
+ rtx vcall_addr, vcall_mem, this_mem;
- /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
- emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
- emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
+ tmp = gen_rtx_REG (Pmode, tmp_regno);
- /* Now an vpunpck[lh]qdq will produce the result required. */
- if (odd)
- t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
- else
- t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
- emit_insn (t3);
- break;
+ this_mem = gen_rtx_MEM (ptr_mode, this_reg);
+ if (Pmode != ptr_mode)
+ this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
+ emit_move_insn (tmp, this_mem);
- case E_V8SImode:
- if (!TARGET_AVX2)
+ /* Adjust the this parameter. */
+ vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
+ if (TARGET_64BIT
+ && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
{
- struct expand_vec_perm_d d_copy = *d;
- d_copy.vmode = V8SFmode;
- if (d->testing_p)
- d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
- else
- d_copy.target = gen_reg_rtx (V8SFmode);
- d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
- d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
- if (expand_vec_perm_even_odd_1 (&d_copy, odd))
- {
- if (!d->testing_p)
- emit_move_insn (d->target,
- gen_lowpart (V8SImode, d_copy.target));
- return true;
- }
- return false;
+ rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
+ emit_move_insn (tmp2, GEN_INT (vcall_offset));
+ vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
}
- if (d->testing_p)
- break;
-
- t1 = gen_reg_rtx (V8SImode);
- t2 = gen_reg_rtx (V8SImode);
- t3 = gen_reg_rtx (V4DImode);
- t4 = gen_reg_rtx (V4DImode);
- t5 = gen_reg_rtx (V4DImode);
-
- /* Shuffle the lanes around into
- { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
- emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
- gen_lowpart (V4DImode, d->op1),
- GEN_INT (0x20)));
- emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
- gen_lowpart (V4DImode, d->op1),
- GEN_INT (0x31)));
-
- /* Swap the 2nd and 3rd position in each lane into
- { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
- emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
- GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
- emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
- GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
-
- /* Now an vpunpck[lh]qdq will produce
- { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
- if (odd)
- t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
- gen_lowpart (V4DImode, t2));
+ vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
+ if (Pmode != ptr_mode)
+ emit_insn (gen_addsi_1_zext (this_reg,
+ gen_rtx_REG (ptr_mode,
+ REGNO (this_reg)),
+ vcall_mem));
else
- t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
- gen_lowpart (V4DImode, t2));
- emit_insn (t3);
- emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
- break;
-
- default:
- gcc_unreachable ();
+ ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
}
- return true;
-}
-
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
- extract-even and extract-odd permutations. */
-
-static bool
-expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
-{
- unsigned i, odd, nelt = d->nelt;
-
- odd = d->perm[0];
- if (odd != 0 && odd != 1)
- return false;
-
- for (i = 1; i < nelt; ++i)
- if (d->perm[i] != 2 * i + odd)
- return false;
-
- return expand_vec_perm_even_odd_1 (d, odd);
-}
-
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
- permutations. We assume that expand_vec_perm_1 has already failed. */
-
-static bool
-expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
-{
- unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
- machine_mode vmode = d->vmode;
- unsigned char perm2[4];
- rtx op0 = d->op0, dest;
- bool ok;
+ /* If necessary, drop THIS back to its stack slot. */
+ if (this_reg && this_reg != this_param)
+ emit_move_insn (this_param, this_reg);
- switch (vmode)
+ fnaddr = XEXP (DECL_RTL (function), 0);
+ if (TARGET_64BIT)
{
- case E_V4DFmode:
- case E_V8SFmode:
- /* These are special-cased in sse.md so that we can optionally
- use the vbroadcast instruction. They expand to two insns
- if the input happens to be in a register. */
- gcc_unreachable ();
-
- case E_V2DFmode:
- case E_V2DImode:
- case E_V4SFmode:
- case E_V4SImode:
- /* These are always implementable using standard shuffle patterns. */
- gcc_unreachable ();
-
- case E_V8HImode:
- case E_V16QImode:
- /* These can be implemented via interleave. We save one insn by
- stopping once we have promoted to V4SImode and then use pshufd. */
- if (d->testing_p)
- return true;
- do
+ if (!flag_pic || targetm.binds_local_p (function)
+ || TARGET_PECOFF)
+ ;
+ else
{
- rtx dest;
- rtx (*gen) (rtx, rtx, rtx)
- = vmode == V16QImode ? gen_vec_interleave_lowv16qi
- : gen_vec_interleave_lowv8hi;
-
- if (elt >= nelt2)
- {
- gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
- : gen_vec_interleave_highv8hi;
- elt -= nelt2;
- }
- nelt2 /= 2;
+ tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
+ tmp = gen_rtx_CONST (Pmode, tmp);
+ fnaddr = gen_const_mem (Pmode, tmp);
+ }
+ }
+ else
+ {
+ if (!flag_pic || targetm.binds_local_p (function))
+ ;
+#if TARGET_MACHO
+ else if (TARGET_MACHO)
+ {
+ fnaddr = machopic_indirect_call_target (DECL_RTL (function));
+ fnaddr = XEXP (fnaddr, 0);
+ }
+#endif /* TARGET_MACHO */
+ else
+ {
+ tmp = gen_rtx_REG (Pmode, CX_REG);
+ output_set_got (tmp, NULL_RTX);
- dest = gen_reg_rtx (vmode);
- emit_insn (gen (dest, op0, op0));
- vmode = get_mode_wider_vector (vmode);
- op0 = gen_lowpart (vmode, dest);
+ fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
+ fnaddr = gen_rtx_CONST (Pmode, fnaddr);
+ fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
+ fnaddr = gen_const_mem (Pmode, fnaddr);
}
- while (vmode != V4SImode);
+ }
- memset (perm2, elt, 4);
- dest = gen_reg_rtx (V4SImode);
- ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
- gcc_assert (ok);
- if (!d->testing_p)
- emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
- return true;
+ /* Our sibling call patterns do not allow memories, because we have no
+ predicate that can distinguish between frame and non-frame memory.
+ For our purposes here, we can get away with (ab)using a jump pattern,
+ because we're going to do no optimization. */
+ if (MEM_P (fnaddr))
+ {
+ if (sibcall_insn_operand (fnaddr, word_mode))
+ {
+ fnaddr = XEXP (DECL_RTL (function), 0);
+ tmp = gen_rtx_MEM (QImode, fnaddr);
+ tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
+ tmp = emit_call_insn (tmp);
+ SIBLING_CALL_P (tmp) = 1;
+ }
+ else
+ emit_jump_insn (gen_indirect_jump (fnaddr));
+ }
+ else
+ {
+ if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
+ {
+ // CM_LARGE_PIC always uses a pseudo PIC register, which is
+ // uninitialized.  Since FUNCTION is local and calling it
+ // doesn't go through the PLT, we use scratch register %r11 as
+ // the PIC register and initialize it here.
+ pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
+ ix86_init_large_pic_reg (tmp_regno);
+ fnaddr = legitimize_pic_address (fnaddr,
+ gen_rtx_REG (Pmode, tmp_regno));
+ }
- case E_V64QImode:
- case E_V32QImode:
- case E_V16HImode:
- case E_V8SImode:
- case E_V4DImode:
- /* For AVX2 broadcasts of the first element vpbroadcast* or
- vpermq should be used by expand_vec_perm_1. */
- gcc_assert (!TARGET_AVX2 || d->perm[0]);
- return false;
+ if (!sibcall_insn_operand (fnaddr, word_mode))
+ {
+ tmp = gen_rtx_REG (word_mode, tmp_regno);
+ if (GET_MODE (fnaddr) != word_mode)
+ fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
+ emit_move_insn (tmp, fnaddr);
+ fnaddr = tmp;
+ }
- default:
- gcc_unreachable ();
+ tmp = gen_rtx_MEM (QImode, fnaddr);
+ tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
+ tmp = emit_call_insn (tmp);
+ SIBLING_CALL_P (tmp) = 1;
}
+ emit_barrier ();
+
+ /* Emit just enough of rest_of_compilation to get the insns emitted.
+ Note that use_thunk calls assemble_start_function et al. */
+ insn = get_insns ();
+ shorten_branches (insn);
+ final_start_function (insn, file, 1);
+ final (insn, file, 1);
+ final_end_function ();
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
- broadcast permutations. */
+static void
+x86_file_start (void)
+{
+ default_file_start ();
+ if (TARGET_16BIT)
+ fputs ("\t.code16gcc\n", asm_out_file);
+#if TARGET_MACHO
+ darwin_file_start ();
+#endif
+ if (X86_FILE_START_VERSION_DIRECTIVE)
+ fputs ("\t.version\t\"01.01\"\n", asm_out_file);
+ if (X86_FILE_START_FLTUSED)
+ fputs ("\t.global\t__fltused\n", asm_out_file);
+ if (ix86_asm_dialect == ASM_INTEL)
+ fputs ("\t.intel_syntax noprefix\n", asm_out_file);
+}
-static bool
-expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
+int
+x86_field_alignment (tree type, int computed)
{
- unsigned i, elt, nelt = d->nelt;
+ machine_mode mode;
- if (!d->one_operand_p)
- return false;
+ if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
+ return computed;
+ if (TARGET_IAMCU)
+ return iamcu_alignment (type, computed);
+ mode = TYPE_MODE (strip_array_types (type));
+ if (mode == DFmode || mode == DCmode
+ || GET_MODE_CLASS (mode) == MODE_INT
+ || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
+ return MIN (32, computed);
+ return computed;
+}
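+
+/* For instance, with the hook above a plain 32-bit compilation (no
+ -malign-double, not IAMCU) caps struct fields of type double or long long
+ at 32-bit alignment, as the traditional ia32 ABI does; e.g. for the
+ illustrative struct S { char c; double d; } the field d ends up at
+ offset 4 rather than 8. */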
- elt = d->perm[0];
- for (i = 1; i < nelt; ++i)
- if (d->perm[i] != elt)
- return false;
+/* Print call to TARGET to FILE. */
- return expand_vec_perm_broadcast_1 (d);
+static void
+x86_print_call_or_nop (FILE *file, const char *target)
+{
+ if (flag_nop_mcount || !strcmp (target, "nop"))
+ /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */
+ fprintf (file, "1:" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n");
+ else
+ fprintf (file, "1:\tcall\t%s\n", target);
}
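+
+/* With -mnop-mcount the routine above thus emits a 5-byte nop of the same
+ length as the profiling call, so that (together with the __mcount_loc
+ records emitted further below) the call site can later be patched into a
+ real call, as e.g. the Linux kernel's ftrace machinery does. */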
-/* Implement arbitrary permutations of two V64QImode operands
- with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
static bool
-expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
+current_fentry_name (const char **name)
{
- if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
+ tree attr = lookup_attribute ("fentry_name",
+ DECL_ATTRIBUTES (current_function_decl));
+ if (!attr)
return false;
+ *name = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (attr)));
+ return true;
+}
- if (d->testing_p)
- return true;
-
- struct expand_vec_perm_d ds[2];
- rtx rperm[128], vperm, target0, target1;
- unsigned int i, nelt;
- machine_mode vmode;
-
- nelt = d->nelt;
- vmode = V64QImode;
-
- for (i = 0; i < 2; i++)
- {
- ds[i] = *d;
- ds[i].vmode = V32HImode;
- ds[i].nelt = 32;
- ds[i].target = gen_reg_rtx (V32HImode);
- ds[i].op0 = gen_lowpart (V32HImode, d->op0);
- ds[i].op1 = gen_lowpart (V32HImode, d->op1);
- }
-
- /* Prepare permutations such that the first one takes care of
- putting the even bytes into the right positions or one higher
- positions (ds[0]) and the second one takes care of
- putting the odd bytes into the right positions or one below
- (ds[1]). */
-
- for (i = 0; i < nelt; i++)
- {
- ds[i & 1].perm[i / 2] = d->perm[i] / 2;
- if (i & 1)
- {
- rperm[i] = constm1_rtx;
- rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
- }
- else
- {
- rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
- rperm[i + 64] = constm1_rtx;
- }
- }
-
- bool ok = expand_vec_perm_1 (&ds[0]);
- gcc_assert (ok);
- ds[0].target = gen_lowpart (V64QImode, ds[0].target);
-
- ok = expand_vec_perm_1 (&ds[1]);
- gcc_assert (ok);
- ds[1].target = gen_lowpart (V64QImode, ds[1].target);
-
- vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
- vperm = force_reg (vmode, vperm);
- target0 = gen_reg_rtx (V64QImode);
- emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
-
- vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
- vperm = force_reg (vmode, vperm);
- target1 = gen_reg_rtx (V64QImode);
- emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
-
- emit_insn (gen_iorv64qi3 (d->target, target0, target1));
+static bool
+current_fentry_section (const char **name)
+{
+ tree attr = lookup_attribute ("fentry_section",
+ DECL_ATTRIBUTES (current_function_decl));
+ if (!attr)
+ return false;
+ *name = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (attr)));
return true;
}
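+
+/* The two helpers above pick up per-function overrides such as
+
+ void f (void)
+ __attribute__ ((fentry_name ("bar"), fentry_section ("my_locs")));
+
+ (illustrative names), which replace the default mcount symbol and the
+ default __mcount_loc section for that function only. */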
-/* Implement arbitrary permutation of two V32QImode and V16QImode operands
- with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
- all the shorter instruction sequences. */
-
-static bool
-expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
+/* Output assembler code to FILE to increment profiler label # LABELNO
+ for profiling a function entry. */
+void
+x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
{
- rtx rperm[4][32], vperm, l[2], h[2], op, m128;
- unsigned int i, nelt, eltsz;
- bool used[4];
+ if (cfun->machine->endbr_queued_at_entrance)
+ fprintf (file, "\t%s\n", TARGET_64BIT ? "endbr64" : "endbr32");
- if (!TARGET_AVX2
- || d->one_operand_p
- || (d->vmode != V32QImode && d->vmode != V16HImode))
- return false;
+ const char *mcount_name = MCOUNT_NAME;
- if (d->testing_p)
- return true;
+ if (current_fentry_name (&mcount_name))
+ ;
+ else if (fentry_name)
+ mcount_name = fentry_name;
+ else if (flag_fentry)
+ mcount_name = MCOUNT_NAME_BEFORE_PROLOGUE;
- nelt = d->nelt;
- eltsz = GET_MODE_UNIT_SIZE (d->vmode);
-
- /* Generate 4 permutation masks. If the required element is within
- the same lane, it is shuffled in. If the required element from the
- other lane, force a zero by setting bit 7 in the permutation mask.
- In the other mask the mask has non-negative elements if element
- is requested from the other lane, but also moved to the other lane,
- so that the result of vpshufb can have the two V2TImode halves
- swapped. */
- m128 = GEN_INT (-128);
- for (i = 0; i < 32; ++i)
- {
- rperm[0][i] = m128;
- rperm[1][i] = m128;
- rperm[2][i] = m128;
- rperm[3][i] = m128;
- }
- used[0] = false;
- used[1] = false;
- used[2] = false;
- used[3] = false;
- for (i = 0; i < nelt; ++i)
+ if (TARGET_64BIT)
{
- unsigned j, e = d->perm[i] & (nelt / 2 - 1);
- unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
- unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
+#ifndef NO_PROFILE_COUNTERS
+ fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
+#endif
- for (j = 0; j < eltsz; ++j)
- rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
- used[which] = true;
+ if (!TARGET_PECOFF && flag_pic)
+ fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
+ else
+ x86_print_call_or_nop (file, mcount_name);
}
-
- for (i = 0; i < 2; ++i)
+ else if (flag_pic)
{
- if (!used[2 * i + 1])
- {
- h[i] = NULL_RTX;
- continue;
- }
- vperm = gen_rtx_CONST_VECTOR (V32QImode,
- gen_rtvec_v (32, rperm[2 * i + 1]));
- vperm = force_reg (V32QImode, vperm);
- h[i] = gen_reg_rtx (V32QImode);
- op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
- emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
+#ifndef NO_PROFILE_COUNTERS
+ fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
+ LPREFIX, labelno);
+#endif
+ fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
}
-
- /* Swap the 128-byte lanes of h[X]. */
- for (i = 0; i < 2; ++i)
- {
- if (h[i] == NULL_RTX)
- continue;
- op = gen_reg_rtx (V4DImode);
- emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
- const2_rtx, GEN_INT (3), const0_rtx,
- const1_rtx));
- h[i] = gen_lowpart (V32QImode, op);
- }
-
- for (i = 0; i < 2; ++i)
+ else
{
- if (!used[2 * i])
- {
- l[i] = NULL_RTX;
- continue;
- }
- vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
- vperm = force_reg (V32QImode, vperm);
- l[i] = gen_reg_rtx (V32QImode);
- op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
- emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
+#ifndef NO_PROFILE_COUNTERS
+ fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
+ LPREFIX, labelno);
+#endif
+ x86_print_call_or_nop (file, mcount_name);
}
- for (i = 0; i < 2; ++i)
+ if (flag_record_mcount
+ || lookup_attribute ("fentry_section",
+ DECL_ATTRIBUTES (current_function_decl)))
{
- if (h[i] && l[i])
- {
- op = gen_reg_rtx (V32QImode);
- emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
- l[i] = op;
- }
- else if (h[i])
- l[i] = h[i];
- }
+ const char *sname = "__mcount_loc";
- gcc_assert (l[0] && l[1]);
- op = d->target;
- if (d->vmode != V32QImode)
- op = gen_reg_rtx (V32QImode);
- emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
- if (op != d->target)
- emit_move_insn (d->target, gen_lowpart (d->vmode, op));
- return true;
+ if (current_fentry_section (&sname))
+ ;
+ else if (fentry_section)
+ sname = fentry_section;
+
+ fprintf (file, "\t.section %s, \"a\",@progbits\n", sname);
+ fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
+ fprintf (file, "\t.previous\n");
+ }
}
-/* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
- taken care of, perform the expansion in D and return true on success. */
+/* We don't have exact information about the insn sizes, but we can quite
+ safely assume that we know about all 1-byte insns and about memory
+ address sizes. This is enough to eliminate unnecessary padding in
+ 99% of cases. */
-static bool
-ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
+int
+ix86_min_insn_size (rtx_insn *insn)
{
- /* Try a single instruction expansion. */
- if (expand_vec_perm_1 (d))
- return true;
+ int l = 0, len;
- /* Try sequences of two instructions. */
+ if (!INSN_P (insn) || !active_insn_p (insn))
+ return 0;
- if (expand_vec_perm_pshuflw_pshufhw (d))
- return true;
+ /* Discard alignments we've emitted and jump instructions. */
+ if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
+ && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
+ return 0;
- if (expand_vec_perm_palignr (d, false))
- return true;
+ /* Important case: calls are always 5 bytes.
+ It is common to have many calls in a row. */
+ if (CALL_P (insn)
+ && symbolic_reference_mentioned_p (PATTERN (insn))
+ && !SIBLING_CALL_P (insn))
+ return 5;
+ len = get_attr_length (insn);
+ if (len <= 1)
+ return 1;
- if (expand_vec_perm_interleave2 (d))
- return true;
+ /* For normal instructions we rely on get_attr_length being exact,
+ with a few exceptions. */
+ if (!JUMP_P (insn))
+ {
+ enum attr_type type = get_attr_type (insn);
- if (expand_vec_perm_broadcast (d))
- return true;
+ switch (type)
+ {
+ case TYPE_MULTI:
+ if (GET_CODE (PATTERN (insn)) == ASM_INPUT
+ || asm_noperands (PATTERN (insn)) >= 0)
+ return 0;
+ break;
+ case TYPE_OTHER:
+ case TYPE_FCMP:
+ break;
+ default:
+ /* Otherwise trust get_attr_length. */
+ return len;
+ }
- if (expand_vec_perm_vpermq_perm_1 (d))
- return true;
+ l = get_attr_length_address (insn);
+ if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
+ l = 4;
+ }
+ if (l)
+ return 1+l;
+ else
+ return 2;
+}
- if (expand_vec_perm_vperm2f128 (d))
- return true;
+#ifdef ASM_OUTPUT_MAX_SKIP_PAD
- if (expand_vec_perm_pblendv (d))
- return true;
+/* The AMD K8 core mispredicts jumps when there are more than 3 jumps in a
+ 16-byte window. */
- /* Try sequences of three instructions. */
+static void
+ix86_avoid_jump_mispredicts (void)
+{
+ rtx_insn *insn, *start = get_insns ();
+ int nbytes = 0, njumps = 0;
+ bool isjump = false;
- if (expand_vec_perm_even_odd_pack (d))
- return true;
+ /* Look for all minimal intervals of instructions containing 4 jumps.
+ The intervals are bounded by START and INSN. NBYTES is the total
+ size of the instructions in the interval, including INSN and not
+ including START. When NBYTES is smaller than 16 bytes, it is possible
+ that START and INSN end up in the same 16-byte window.
- if (expand_vec_perm_2vperm2f128_vshuf (d))
- return true;
+ The smallest offset within the window at which INSN can start is the
+ case where START ends at offset 0; the offset of INSN is then
+ NBYTES - sizeof (INSN). We add a p2align to the 16-byte window with
+ maxskip 15 - NBYTES + sizeof (INSN).
- if (expand_vec_perm_pshufb2 (d))
- return true;
+ Don't consider asm goto as a jump; while it can contain a jump, it
+ doesn't have to, since control transfer to its label(s) can be
+ performed through other means; also, we estimate the minimum length
+ of all asm stmts as 0. */
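+ /* For example (illustrative numbers): if NBYTES is 12 when a fourth
+ jump of minimum size 2 is reached, the code below pads by
+ 15 - 12 + 2 = 5 bytes in front of it, so the four jumps can no
+ longer share one 16-byte window. */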
+ for (insn = start; insn; insn = NEXT_INSN (insn))
+ {
+ int min_size;
- if (expand_vec_perm_interleave3 (d))
- return true;
+ if (LABEL_P (insn))
+ {
+ align_flags alignment = label_to_alignment (insn);
+ int align = alignment.levels[0].log;
+ int max_skip = alignment.levels[0].maxskip;
- if (expand_vec_perm_vperm2f128_vblend (d))
- return true;
+ if (max_skip > 15)
+ max_skip = 15;
+ /* If align > 3, only up to 16 - max_skip - 1 bytes can already
+ be in the current 16-byte page, because otherwise
+ ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
+ bytes to reach a 16-byte boundary. */
+ if (align <= 0
+ || (align <= 3 && max_skip != (1 << align) - 1))
+ max_skip = 0;
+ if (dump_file)
+ fprintf (dump_file, "Label %i with max_skip %i\n",
+ INSN_UID (insn), max_skip);
+ if (max_skip)
+ {
+ while (nbytes + max_skip >= 16)
+ {
+ start = NEXT_INSN (start);
+ if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
+ || CALL_P (start))
+ njumps--, isjump = true;
+ else
+ isjump = false;
+ nbytes -= ix86_min_insn_size (start);
+ }
+ }
+ continue;
+ }
+
+ min_size = ix86_min_insn_size (insn);
+ nbytes += min_size;
+ if (dump_file)
+ fprintf (dump_file, "Insn %i estimated to %i bytes\n",
+ INSN_UID (insn), min_size);
+ if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
+ || CALL_P (insn))
+ njumps++;
+ else
+ continue;
- /* Try sequences of four instructions. */
+ while (njumps > 3)
+ {
+ start = NEXT_INSN (start);
+ if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
+ || CALL_P (start))
+ njumps--, isjump = true;
+ else
+ isjump = false;
+ nbytes -= ix86_min_insn_size (start);
+ }
+ gcc_assert (njumps >= 0);
+ if (dump_file)
+ fprintf (dump_file, "Interval %i to %i has %i bytes\n",
+ INSN_UID (start), INSN_UID (insn), nbytes);
- if (expand_vec_perm_even_odd_trunc (d))
- return true;
- if (expand_vec_perm_vpshufb2_vpermq (d))
- return true;
+ if (njumps == 3 && isjump && nbytes < 16)
+ {
+ int padsize = 15 - nbytes + ix86_min_insn_size (insn);
- if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
- return true;
+ if (dump_file)
+ fprintf (dump_file, "Padding insn %i by %i bytes!\n",
+ INSN_UID (insn), padsize);
+ emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
+ }
+ }
+}
+#endif
- if (expand_vec_perm_vpermt2_vpshub2 (d))
- return true;
+/* The AMD Athlon works faster
+ when RET is neither the destination of a conditional jump nor directly
+ preceded by another jump instruction. We avoid the penalty by inserting
+ a NOP just before the RET instruction in such cases. */
+static void
+ix86_pad_returns (void)
+{
+ edge e;
+ edge_iterator ei;
- /* ??? Look for narrow permutations whose element orderings would
- allow the promotion to a wider mode. */
+ FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
+ {
+ basic_block bb = e->src;
+ rtx_insn *ret = BB_END (bb);
+ rtx_insn *prev;
+ bool replace = false;
- /* ??? Look for sequences of interleave or a wider permute that place
- the data into the correct lanes for a half-vector shuffle like
- pshuf[lh]w or vpermilps. */
+ if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
+ || optimize_bb_for_size_p (bb))
+ continue;
+ for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
+ if (active_insn_p (prev) || LABEL_P (prev))
+ break;
+ if (prev && LABEL_P (prev))
+ {
+ edge e;
+ edge_iterator ei;
- /* ??? Look for sequences of interleave that produce the desired results.
- The combinatorics of punpck[lh] get pretty ugly... */
+ FOR_EACH_EDGE (e, ei, bb->preds)
+ if (EDGE_FREQUENCY (e) && e->src->index >= 0
+ && !(e->flags & EDGE_FALLTHRU))
+ {
+ replace = true;
+ break;
+ }
+ }
+ if (!replace)
+ {
+ prev = prev_active_insn (ret);
+ if (prev
+ && ((JUMP_P (prev) && any_condjump_p (prev))
+ || CALL_P (prev)))
+ replace = true;
+ /* Empty functions get a branch mispredict even when
+ the jump destination is not visible to us. */
+ if (!prev && !optimize_function_for_size_p (cfun))
+ replace = true;
+ }
+ if (replace)
+ {
+ emit_jump_insn_before (gen_simple_return_internal_long (), ret);
+ delete_insn (ret);
+ }
+ }
+}
- if (expand_vec_perm_even_odd (d))
- return true;
+/* Count the minimum number of instructions in BB. Return 4 if the
+ number of instructions >= 4. */
- /* Even longer sequences. */
- if (expand_vec_perm_vpshufb4_vpermq2 (d))
- return true;
+static int
+ix86_count_insn_bb (basic_block bb)
+{
+ rtx_insn *insn;
+ int insn_count = 0;
- /* See if we can get the same permutation in different vector integer
- mode. */
- struct expand_vec_perm_d nd;
- if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
+ /* Count number of instructions in this block. Return 4 if the number
+ of instructions >= 4. */
+ FOR_BB_INSNS (bb, insn)
{
- if (!d->testing_p)
- emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
- return true;
+ /* This only happens in exit blocks. */
+ if (JUMP_P (insn)
+ && ANY_RETURN_P (PATTERN (insn)))
+ break;
+
+ if (NONDEBUG_INSN_P (insn)
+ && GET_CODE (PATTERN (insn)) != USE
+ && GET_CODE (PATTERN (insn)) != CLOBBER)
+ {
+ insn_count++;
+ if (insn_count >= 4)
+ return insn_count;
+ }
}
- return false;
+ return insn_count;
}
-/* If a permutation only uses one operand, make it clear. Returns true
- if the permutation references both operands. */
-static bool
-canonicalize_perm (struct expand_vec_perm_d *d)
-{
- int i, which, nelt = d->nelt;
+/* Count the minimum number of instructions in the code path ending in BB.
+ Return 4 if the number of instructions is >= 4. */
- for (i = which = 0; i < nelt; ++i)
- which |= (d->perm[i] < nelt ? 1 : 2);
+static int
+ix86_count_insn (basic_block bb)
+{
+ edge e;
+ edge_iterator ei;
+ int min_prev_count;
- d->one_operand_p = true;
- switch (which)
+ /* Only bother counting instructions along paths with no
+ more than 2 basic blocks between entry and exit. Given
+ that BB has an edge to exit, determine if a predecessor
+ of BB has an edge from entry. If so, compute the number
+ of instructions in the predecessor block. If there
+ happen to be multiple such blocks, compute the minimum. */
+ min_prev_count = 4;
+ FOR_EACH_EDGE (e, ei, bb->preds)
{
- default:
- gcc_unreachable();
+ edge prev_e;
+ edge_iterator prev_ei;
- case 3:
- if (!rtx_equal_p (d->op0, d->op1))
- {
- d->one_operand_p = false;
+ if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
+ {
+ min_prev_count = 0;
break;
- }
- /* The elements of PERM do not suggest that only the first operand
- is used, but both operands are identical. Allow easier matching
- of the permutation by folding the permutation into the single
- input vector. */
- /* FALLTHRU */
-
- case 2:
- for (i = 0; i < nelt; ++i)
- d->perm[i] &= nelt - 1;
- d->op0 = d->op1;
- break;
-
- case 1:
- d->op1 = d->op0;
- break;
+ }
+ FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
+ {
+ if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
+ {
+ int count = ix86_count_insn_bb (e->src);
+ if (count < min_prev_count)
+ min_prev_count = count;
+ break;
+ }
+ }
}
- return (which == 3);
+ if (min_prev_count < 4)
+ min_prev_count += ix86_count_insn_bb (bb);
+
+ return min_prev_count;
}
-/* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
+/* Pad short functions to 4 instructions. */
-static bool
-ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
- rtx op1, const vec_perm_indices &sel)
+static void
+ix86_pad_short_function (void)
{
- struct expand_vec_perm_d d;
- unsigned char perm[MAX_VECT_LEN];
- unsigned int i, nelt, which;
- bool two_args;
+ edge e;
+ edge_iterator ei;
- d.target = target;
- d.op0 = op0;
- d.op1 = op1;
+ FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
+ {
+ rtx_insn *ret = BB_END (e->src);
+ if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
+ {
+ int insn_count = ix86_count_insn (e->src);
- d.vmode = vmode;
- gcc_assert (VECTOR_MODE_P (d.vmode));
- d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
- d.testing_p = !target;
+ /* Pad short function. */
+ if (insn_count < 4)
+ {
+ rtx_insn *insn = ret;
- gcc_assert (sel.length () == nelt);
- gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
+ /* Find epilogue. */
+ while (insn
+ && (!NOTE_P (insn)
+ || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
+ insn = PREV_INSN (insn);
- /* Given sufficient ISA support we can just return true here
- for selected vector modes. */
- switch (d.vmode)
- {
- case E_V16SFmode:
- case E_V16SImode:
- case E_V8DImode:
- case E_V8DFmode:
- if (!TARGET_AVX512F)
- return false;
- /* All implementable with a single vperm[it]2 insn. */
- if (d.testing_p)
- return true;
- break;
- case E_V32HImode:
- if (!TARGET_AVX512BW)
- return false;
- if (d.testing_p)
- /* All implementable with a single vperm[it]2 insn. */
- return true;
- break;
- case E_V64QImode:
- if (!TARGET_AVX512BW)
- return false;
- if (d.testing_p)
- /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
- return true;
- break;
- case E_V8SImode:
- case E_V8SFmode:
- case E_V4DFmode:
- case E_V4DImode:
- if (!TARGET_AVX)
- return false;
- if (d.testing_p && TARGET_AVX512VL)
- /* All implementable with a single vperm[it]2 insn. */
- return true;
- break;
- case E_V16HImode:
- if (!TARGET_SSE2)
- return false;
- if (d.testing_p && TARGET_AVX2)
- /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
- return true;
- break;
- case E_V32QImode:
- if (!TARGET_SSE2)
- return false;
- if (d.testing_p && TARGET_AVX2)
- /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
- return true;
- break;
- case E_V8HImode:
- case E_V16QImode:
- if (!TARGET_SSE2)
- return false;
- /* Fall through. */
- case E_V4SImode:
- case E_V4SFmode:
- if (!TARGET_SSE)
- return false;
- /* All implementable with a single vpperm insn. */
- if (d.testing_p && TARGET_XOP)
- return true;
- /* All implementable with 2 pshufb + 1 ior. */
- if (d.testing_p && TARGET_SSSE3)
- return true;
- break;
- case E_V2DImode:
- case E_V2DFmode:
- if (!TARGET_SSE)
- return false;
- /* All implementable with shufpd or unpck[lh]pd. */
- if (d.testing_p)
- return true;
- break;
- default:
- return false;
- }
+ if (!insn)
+ insn = ret;
- for (i = which = 0; i < nelt; ++i)
- {
- unsigned char e = sel[i];
- gcc_assert (e < 2 * nelt);
- d.perm[i] = e;
- perm[i] = e;
- which |= (e < nelt ? 1 : 2);
+ /* Two NOPs count as one instruction. */
+ insn_count = 2 * (4 - insn_count);
+ emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
+ }
+ }
}
+}
- if (d.testing_p)
- {
- /* For all elements from second vector, fold the elements to first. */
- if (which == 2)
- for (i = 0; i < nelt; ++i)
- d.perm[i] -= nelt;
+/* Fix up a Windows system unwinder issue. If an EH region falls through into
+ the epilogue, the Windows system unwinder will apply epilogue logic and
+ produce incorrect offsets. This can be avoided by adding a nop between
+ the last insn that can throw and the first insn of the epilogue. */
+
+static void
+ix86_seh_fixup_eh_fallthru (void)
+{
+ edge e;
+ edge_iterator ei;
- /* Check whether the mask can be applied to the vector type. */
- d.one_operand_p = (which != 3);
+ FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
+ {
+ rtx_insn *insn, *next;
- /* Implementable with shufps or pshufd. */
- if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
- return true;
+ /* Find the beginning of the epilogue. */
+ for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
+ if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
+ break;
+ if (insn == NULL)
+ continue;
- /* Otherwise we have to go through the motions and see if we can
- figure out how to generate the requested permutation. */
- d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
- d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
- if (!d.one_operand_p)
- d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
+ /* We only care about preceding insns that can throw. */
+ insn = prev_active_insn (insn);
+ if (insn == NULL || !can_throw_internal (insn))
+ continue;
- start_sequence ();
- bool ret = ix86_expand_vec_perm_const_1 (&d);
- end_sequence ();
+ /* Do not separate calls from their debug information. */
+ for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
+ if (NOTE_P (next) && NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION)
+ insn = next;
+ else
+ break;
- return ret;
+ emit_insn_after (gen_nops (const1_rtx), insn);
}
+}
- two_args = canonicalize_perm (&d);
+/* Implement machine specific optimizations. We implement padding of returns
+ for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window. */
+static void
+ix86_reorg (void)
+{
+ /* We are freeing block_for_insn in the toplev to keep compatibility
+ with old MDEP_REORGS that are not CFG based. Recompute it now. */
+ compute_bb_for_insn ();
- if (ix86_expand_vec_perm_const_1 (&d))
- return true;
+ if (TARGET_SEH && current_function_has_exception_handlers ())
+ ix86_seh_fixup_eh_fallthru ();
- /* If the selector says both arguments are needed, but the operands are the
- same, the above tried to expand with one_operand_p and flattened selector.
- If that didn't work, retry without one_operand_p; we succeeded with that
- during testing. */
- if (two_args && d.one_operand_p)
+ if (optimize && optimize_function_for_speed_p (cfun))
{
- d.one_operand_p = false;
- memcpy (d.perm, perm, sizeof (perm));
- return ix86_expand_vec_perm_const_1 (&d);
+ if (TARGET_PAD_SHORT_FUNCTION)
+ ix86_pad_short_function ();
+ else if (TARGET_PAD_RETURNS)
+ ix86_pad_returns ();
+#ifdef ASM_OUTPUT_MAX_SKIP_PAD
+ if (TARGET_FOUR_JUMP_LIMIT)
+ ix86_avoid_jump_mispredicts ();
+#endif
}
+}
+/* Return nonzero when a QImode register that must be represented via a REX
+ prefix is used. */
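+/* (For instance, the byte registers %sil, %dil, %bpl, %spl and %r8b ...
+ %r15b can only be encoded with a REX prefix.) */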
+bool
+x86_extended_QIreg_mentioned_p (rtx_insn *insn)
+{
+ int i;
+ extract_insn_cached (insn);
+ for (i = 0; i < recog_data.n_operands; i++)
+ if (GENERAL_REG_P (recog_data.operand[i])
+ && !QI_REGNO_P (REGNO (recog_data.operand[i])))
+ return true;
return false;
}
-void
-ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
-{
- struct expand_vec_perm_d d;
- unsigned i, nelt;
-
- d.target = targ;
- d.op0 = op0;
- d.op1 = op1;
- d.vmode = GET_MODE (targ);
- d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
- d.one_operand_p = false;
- d.testing_p = false;
-
- for (i = 0; i < nelt; ++i)
- d.perm[i] = i * 2 + odd;
-
- /* We'll either be able to implement the permutation directly... */
- if (expand_vec_perm_1 (&d))
- return;
-
- /* ... or we use the special-case patterns. */
- expand_vec_perm_even_odd_1 (&d, odd);
-}
-
-static void
-ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
+/* Return true when INSN mentions a register that must be encoded using a
+ REX prefix. */
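+/* (These are %r8 ... %r15 and %xmm8 ... %xmm15; encoding them needs the
+ REX.R/.X/.B extension bits.) */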
+bool
+x86_extended_reg_mentioned_p (rtx insn)
{
- struct expand_vec_perm_d d;
- unsigned i, nelt, base;
- bool ok;
-
- d.target = targ;
- d.op0 = op0;
- d.op1 = op1;
- d.vmode = GET_MODE (targ);
- d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
- d.one_operand_p = false;
- d.testing_p = false;
-
- base = high_p ? nelt / 2 : 0;
- for (i = 0; i < nelt / 2; ++i)
+ subrtx_iterator::array_type array;
+ FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
{
- d.perm[i * 2] = i + base;
- d.perm[i * 2 + 1] = i + base + nelt;
+ const_rtx x = *iter;
+ if (REG_P (x)
+ && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
+ return true;
}
-
- /* Note that for AVX this isn't one instruction. */
- ok = ix86_expand_vec_perm_const_1 (&d);
- gcc_assert (ok);
+ return false;
}
+/* If profitable, negate (without causing overflow) integer constant
+ of mode MODE at location LOC. Return true in this case. */
+bool
+x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
+{
+ HOST_WIDE_INT val;
-/* Expand a vector operation CODE for a V*QImode in terms of the
- same operation on V*HImode. */
-
-void
-ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
-{
- machine_mode qimode = GET_MODE (dest);
- machine_mode himode;
- rtx (*gen_il) (rtx, rtx, rtx);
- rtx (*gen_ih) (rtx, rtx, rtx);
- rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
- struct expand_vec_perm_d d;
- bool ok, full_interleave;
- bool uns_p = false;
- int i;
+ if (!CONST_INT_P (*loc))
+ return false;
- switch (qimode)
+ switch (mode)
{
- case E_V16QImode:
- himode = V8HImode;
- gen_il = gen_vec_interleave_lowv16qi;
- gen_ih = gen_vec_interleave_highv16qi;
- break;
- case E_V32QImode:
- himode = V16HImode;
- gen_il = gen_avx2_interleave_lowv32qi;
- gen_ih = gen_avx2_interleave_highv32qi;
- break;
- case E_V64QImode:
- himode = V32HImode;
- gen_il = gen_avx512bw_interleave_lowv64qi;
- gen_ih = gen_avx512bw_interleave_highv64qi;
- break;
- default:
- gcc_unreachable ();
- }
+ case E_DImode:
+ /* DImode x86_64 constants must fit in 32 bits. */
+ gcc_assert (x86_64_immediate_operand (*loc, mode));
- op2_l = op2_h = op2;
- switch (code)
- {
- case MULT:
- /* Unpack data such that we've got a source byte in each low byte of
- each word. We don't care what goes into the high byte of each word.
- Rather than trying to get zero in there, most convenient is to let
- it be a copy of the low byte. */
- op2_l = gen_reg_rtx (qimode);
- op2_h = gen_reg_rtx (qimode);
- emit_insn (gen_il (op2_l, op2, op2));
- emit_insn (gen_ih (op2_h, op2, op2));
-
- op1_l = gen_reg_rtx (qimode);
- op1_h = gen_reg_rtx (qimode);
- emit_insn (gen_il (op1_l, op1, op1));
- emit_insn (gen_ih (op1_h, op1, op1));
- full_interleave = qimode == V16QImode;
+ mode = SImode;
break;
- case ASHIFT:
- case LSHIFTRT:
- uns_p = true;
- /* FALLTHRU */
- case ASHIFTRT:
- op1_l = gen_reg_rtx (himode);
- op1_h = gen_reg_rtx (himode);
- ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
- ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
- full_interleave = true;
+ case E_SImode:
+ case E_HImode:
+ case E_QImode:
break;
+
default:
gcc_unreachable ();
}
- /* Perform the operation. */
- res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
- 1, OPTAB_DIRECT);
- res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
- 1, OPTAB_DIRECT);
- gcc_assert (res_l && res_h);
+ /* Avoid overflows. */
+ if (mode_signbit_p (mode, *loc))
+ return false;
- /* Merge the data back into the right place. */
- d.target = dest;
- d.op0 = gen_lowpart (qimode, res_l);
- d.op1 = gen_lowpart (qimode, res_h);
- d.vmode = qimode;
- d.nelt = GET_MODE_NUNITS (qimode);
- d.one_operand_p = false;
- d.testing_p = false;
+ val = INTVAL (*loc);
- if (full_interleave)
+ /* Make things pretty: prefer `subl $4,%eax' to `addl $-4,%eax'.
+ Exception: -128 encodes smaller than 128, so swap the sign and the op. */
+ if ((val < 0 && val != -128)
+ || val == 128)
{
- /* For SSE2, we used an full interleave, so the desired
- results are in the even elements. */
- for (i = 0; i < d.nelt; ++i)
- d.perm[i] = i * 2;
+ *loc = GEN_INT (-val);
+ return true;
}
- else
- {
- /* For AVX, the interleave used above was not cross-lane. So the
- extraction is evens but with the second and third quarter swapped.
- Happily, that is even one insn shorter than even extraction.
- For AVX512BW we have 4 lanes. We extract evens from within a lane,
- always first from the first and then from the second source operand,
- the index bits above the low 4 bits remains the same.
- Thus, for d.nelt == 32 we want permutation
- 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
- and for d.nelt == 64 we want permutation
- 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
- 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
- for (i = 0; i < d.nelt; ++i)
- d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
- }
-
- ok = ix86_expand_vec_perm_const_1 (&d);
- gcc_assert (ok);
- set_unique_reg_note (get_last_insn (), REG_EQUAL,
- gen_rtx_fmt_ee (code, qimode, op1, op2));
+ return false;
}
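+
+/* As a sketch of the intended effect (assuming the caller flips the ADD/SUB
+ or the comparison direction when this returns true):
+
+     addl $-4, %eax  ->  subl $4, %eax
+     addl $128, %eax ->  subl $-128, %eax   (128 needs imm32, -128 fits imm8)
+
+ while "addl $-128, %eax" is left untouched for the same imm8 reason. */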
-/* Helper function of ix86_expand_mul_widen_evenodd. Return true
- if op is CONST_VECTOR with all odd elements equal to their
- preceding element. */
-
-static bool
-const_vector_equal_evenodd_p (rtx op)
-{
- machine_mode mode = GET_MODE (op);
- int i, nunits = GET_MODE_NUNITS (mode);
- if (GET_CODE (op) != CONST_VECTOR
- || nunits != CONST_VECTOR_NUNITS (op))
- return false;
- for (i = 0; i < nunits; i += 2)
- if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
- return false;
- return true;
-}
+/* Generate an unsigned DImode/SImode to FP conversion. This is the same code
+ optabs would emit if we didn't have TFmode patterns. */
void
-ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
- bool uns_p, bool odd_p)
+x86_emit_floatuns (rtx operands[2])
{
- machine_mode mode = GET_MODE (op1);
- machine_mode wmode = GET_MODE (dest);
- rtx x;
- rtx orig_op1 = op1, orig_op2 = op2;
-
- if (!nonimmediate_operand (op1, mode))
- op1 = force_reg (mode, op1);
- if (!nonimmediate_operand (op2, mode))
- op2 = force_reg (mode, op2);
+ rtx_code_label *neglab, *donelab;
+ rtx i0, i1, f0, in, out;
+ machine_mode mode, inmode;
- /* We only play even/odd games with vectors of SImode. */
- gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
+ inmode = GET_MODE (operands[1]);
+ gcc_assert (inmode == SImode || inmode == DImode);
- /* If we're looking for the odd results, shift those members down to
- the even slots. For some cpus this is faster than a PSHUFD. */
- if (odd_p)
- {
- /* For XOP use vpmacsdqh, but only for smult, as it is only
- signed. */
- if (TARGET_XOP && mode == V4SImode && !uns_p)
- {
- x = force_reg (wmode, CONST0_RTX (wmode));
- emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
- return;
- }
+ out = operands[0];
+ in = force_reg (inmode, operands[1]);
+ mode = GET_MODE (out);
+ neglab = gen_label_rtx ();
+ donelab = gen_label_rtx ();
+ f0 = gen_reg_rtx (mode);
- x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
- if (!const_vector_equal_evenodd_p (orig_op1))
- op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
- x, NULL, 1, OPTAB_DIRECT);
- if (!const_vector_equal_evenodd_p (orig_op2))
- op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
- x, NULL, 1, OPTAB_DIRECT);
- op1 = gen_lowpart (mode, op1);
- op2 = gen_lowpart (mode, op2);
- }
+ emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
- if (mode == V16SImode)
- {
- if (uns_p)
- x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
- else
- x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
- }
- else if (mode == V8SImode)
- {
- if (uns_p)
- x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
- else
- x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
- }
- else if (uns_p)
- x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
- else if (TARGET_SSE4_1)
- x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
- else
- {
- rtx s1, s2, t0, t1, t2;
+ expand_float (out, in, 0);
- /* The easiest way to implement this without PMULDQ is to go through
- the motions as if we are performing a full 64-bit multiply. With
- the exception that we need to do less shuffling of the elements. */
+ emit_jump_insn (gen_jump (donelab));
+ emit_barrier ();
- /* Compute the sign-extension, aka highparts, of the two operands. */
- s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
- op1, pc_rtx, pc_rtx);
- s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
- op2, pc_rtx, pc_rtx);
+ emit_label (neglab);
- /* Multiply LO(A) * HI(B), and vice-versa. */
- t1 = gen_reg_rtx (wmode);
- t2 = gen_reg_rtx (wmode);
- emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
- emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
+ i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
+ 1, OPTAB_DIRECT);
+ i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
+ 1, OPTAB_DIRECT);
+ i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
- /* Multiply LO(A) * LO(B). */
- t0 = gen_reg_rtx (wmode);
- emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
+ expand_float (f0, i0, 0);
- /* Combine and shift the highparts into place. */
- t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
- t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
- 1, OPTAB_DIRECT);
+ emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
- /* Combine high and low parts. */
- force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
- return;
- }
- emit_insn (x);
+ emit_label (donelab);
}
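+
+/* A plain C sketch of the sequence emitted above, assuming a 64-bit input
+ and a double result (the function itself works on any FP output mode):
+
+     double u64_to_fp (unsigned long long v)
+     {
+       if ((long long) v >= 0)
+         return (double) (long long) v;            // signed convert suffices
+       unsigned long long h = (v >> 1) | (v & 1);  // halve, keep rounding bit
+       double f = (double) (long long) h;
+       return f + f;                               // scale back up
+     }  */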
-
-void
-ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
- bool uns_p, bool high_p)
+\f
+/* Target hook for scalar_mode_supported_p. */
+static bool
+ix86_scalar_mode_supported_p (scalar_mode mode)
{
- machine_mode wmode = GET_MODE (dest);
- machine_mode mode = GET_MODE (op1);
- rtx t1, t2, t3, t4, mask;
-
- switch (mode)
- {
- case E_V4SImode:
- t1 = gen_reg_rtx (mode);
- t2 = gen_reg_rtx (mode);
- if (TARGET_XOP && !uns_p)
- {
- /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
- shuffle the elements once so that all elements are in the right
- place for immediate use: { A C B D }. */
- emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
- const1_rtx, GEN_INT (3)));
- emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
- const1_rtx, GEN_INT (3)));
- }
- else
- {
- /* Put the elements into place for the multiply. */
- ix86_expand_vec_interleave (t1, op1, op1, high_p);
- ix86_expand_vec_interleave (t2, op2, op2, high_p);
- high_p = false;
- }
- ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
- break;
-
- case E_V8SImode:
- /* Shuffle the elements between the lanes. After this we
- have { A B E F | C D G H } for each operand. */
- t1 = gen_reg_rtx (V4DImode);
- t2 = gen_reg_rtx (V4DImode);
- emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
- const0_rtx, const2_rtx,
- const1_rtx, GEN_INT (3)));
- emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
- const0_rtx, const2_rtx,
- const1_rtx, GEN_INT (3)));
-
- /* Shuffle the elements within the lanes. After this we
- have { A A B B | C C D D } or { E E F F | G G H H }. */
- t3 = gen_reg_rtx (V8SImode);
- t4 = gen_reg_rtx (V8SImode);
- mask = GEN_INT (high_p
- ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
- : 0 + (0 << 2) + (1 << 4) + (1 << 6));
- emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
- emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
-
- ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
- break;
-
- case E_V8HImode:
- case E_V16HImode:
- t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
- uns_p, OPTAB_DIRECT);
- t2 = expand_binop (mode,
- uns_p ? umul_highpart_optab : smul_highpart_optab,
- op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
- gcc_assert (t1 && t2);
-
- t3 = gen_reg_rtx (mode);
- ix86_expand_vec_interleave (t3, t1, t2, high_p);
- emit_move_insn (dest, gen_lowpart (wmode, t3));
- break;
-
- case E_V16QImode:
- case E_V32QImode:
- case E_V32HImode:
- case E_V16SImode:
- case E_V64QImode:
- t1 = gen_reg_rtx (wmode);
- t2 = gen_reg_rtx (wmode);
- ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
- ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
-
- emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
- break;
-
- default:
- gcc_unreachable ();
- }
+ if (DECIMAL_FLOAT_MODE_P (mode))
+ return default_decimal_float_supported_p ();
+ else if (mode == TFmode)
+ return true;
+ else
+ return default_scalar_mode_supported_p (mode);
}
-void
-ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
+/* Implements target hook vector_mode_supported_p. */
+static bool
+ix86_vector_mode_supported_p (machine_mode mode)
{
- rtx res_1, res_2, res_3, res_4;
-
- res_1 = gen_reg_rtx (V4SImode);
- res_2 = gen_reg_rtx (V4SImode);
- res_3 = gen_reg_rtx (V2DImode);
- res_4 = gen_reg_rtx (V2DImode);
- ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
- ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
-
- /* Move the results in element 2 down to element 1; we don't care
- what goes in elements 2 and 3. Then we can merge the parts
- back together with an interleave.
-
- Note that two other sequences were tried:
- (1) Use interleaves at the start instead of psrldq, which allows
- us to use a single shufps to merge things back at the end.
- (2) Use shufps here to combine the two vectors, then pshufd to
- put the elements in the correct order.
- In both cases the cost of the reformatting stall was too high
- and the overall sequence slower. */
-
- emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
- const0_rtx, const2_rtx,
- const0_rtx, const0_rtx));
- emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
- const0_rtx, const2_rtx,
- const0_rtx, const0_rtx));
- res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
-
- set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
+ if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
+ return true;
+ if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
+ return true;
+ if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
+ return true;
+ if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
+ return true;
+ if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
+ return true;
+ if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
+ return true;
+ return false;
}
-void
-ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
+/* Target hook for c_mode_for_suffix. */
+static machine_mode
+ix86_c_mode_for_suffix (char suffix)
{
- machine_mode mode = GET_MODE (op0);
- rtx t1, t2, t3, t4, t5, t6;
+ if (suffix == 'q')
+ return TFmode;
+ if (suffix == 'w')
+ return XFmode;
- if (TARGET_AVX512DQ && mode == V8DImode)
- emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
- else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
- emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
- else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
- emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
- else if (TARGET_XOP && mode == V2DImode)
- {
- /* op1: A,B,C,D, op2: E,F,G,H */
- op1 = gen_lowpart (V4SImode, op1);
- op2 = gen_lowpart (V4SImode, op2);
+ return VOIDmode;
+}
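+
+/* For example, this hook is what lets the C family front ends accept
+ constants such as 1.0q (__float128, TFmode) and 1.0w (__float80, XFmode). */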
- t1 = gen_reg_rtx (V4SImode);
- t2 = gen_reg_rtx (V4SImode);
- t3 = gen_reg_rtx (V2DImode);
- t4 = gen_reg_rtx (V2DImode);
+/* Worker function for TARGET_MD_ASM_ADJUST.
- /* t1: B,A,D,C */
- emit_insn (gen_sse2_pshufd_1 (t1, op1,
- GEN_INT (1),
- GEN_INT (0),
- GEN_INT (3),
- GEN_INT (2)));
+ We implement asm flag outputs, and maintain source compatibility
+ with the old cc0-based compiler. */
- /* t2: (B*E),(A*F),(D*G),(C*H) */
- emit_insn (gen_mulv4si3 (t2, t1, op2));
+static rtx_insn *
+ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
+ vec<const char *> &constraints,
+ vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
+{
+ bool saw_asm_flag = false;
- /* t3: (B*E)+(A*F), (D*G)+(C*H) */
- emit_insn (gen_xop_phadddq (t3, t2));
+ start_sequence ();
+ for (unsigned i = 0, n = outputs.length (); i < n; ++i)
+ {
+ const char *con = constraints[i];
+ if (strncmp (con, "=@cc", 4) != 0)
+ continue;
+ con += 4;
+ if (strchr (con, ',') != NULL)
+ {
+ error ("alternatives not allowed in asm flag output");
+ continue;
+ }
- /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
- emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
+ bool invert = false;
+ if (con[0] == 'n')
+ invert = true, con++;
- /* Multiply lower parts and add all */
- t5 = gen_reg_rtx (V2DImode);
- emit_insn (gen_vec_widen_umult_even_v4si (t5,
- gen_lowpart (V4SImode, op1),
- gen_lowpart (V4SImode, op2)));
- op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
+ machine_mode mode = CCmode;
+ rtx_code code = UNKNOWN;
- }
- else
- {
- machine_mode nmode;
- rtx (*umul) (rtx, rtx, rtx);
+ switch (con[0])
+ {
+ case 'a':
+ if (con[1] == 0)
+ mode = CCAmode, code = EQ;
+ else if (con[1] == 'e' && con[2] == 0)
+ mode = CCCmode, code = NE;
+ break;
+ case 'b':
+ if (con[1] == 0)
+ mode = CCCmode, code = EQ;
+ else if (con[1] == 'e' && con[2] == 0)
+ mode = CCAmode, code = NE;
+ break;
+ case 'c':
+ if (con[1] == 0)
+ mode = CCCmode, code = EQ;
+ break;
+ case 'e':
+ if (con[1] == 0)
+ mode = CCZmode, code = EQ;
+ break;
+ case 'g':
+ if (con[1] == 0)
+ mode = CCGCmode, code = GT;
+ else if (con[1] == 'e' && con[2] == 0)
+ mode = CCGCmode, code = GE;
+ break;
+ case 'l':
+ if (con[1] == 0)
+ mode = CCGCmode, code = LT;
+ else if (con[1] == 'e' && con[2] == 0)
+ mode = CCGCmode, code = LE;
+ break;
+ case 'o':
+ if (con[1] == 0)
+ mode = CCOmode, code = EQ;
+ break;
+ case 'p':
+ if (con[1] == 0)
+ mode = CCPmode, code = EQ;
+ break;
+ case 's':
+ if (con[1] == 0)
+ mode = CCSmode, code = EQ;
+ break;
+ case 'z':
+ if (con[1] == 0)
+ mode = CCZmode, code = EQ;
+ break;
+ }
+ if (code == UNKNOWN)
+ {
+ error ("unknown asm flag output %qs", constraints[i]);
+ continue;
+ }
+ if (invert)
+ code = reverse_condition (code);
- if (mode == V2DImode)
+ rtx dest = outputs[i];
+ if (!saw_asm_flag)
{
- umul = gen_vec_widen_umult_even_v4si;
- nmode = V4SImode;
+ /* This is the first asm flag output. Here we put the flags
+ register in as the real output and adjust the condition to
+ allow it. */
+ constraints[i] = "=Bf";
+ outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
+ saw_asm_flag = true;
}
- else if (mode == V4DImode)
+ else
{
- umul = gen_vec_widen_umult_even_v8si;
- nmode = V8SImode;
+ /* We don't need the flags register as output twice. */
+ constraints[i] = "=X";
+ outputs[i] = gen_rtx_SCRATCH (SImode);
}
- else if (mode == V8DImode)
+
+ rtx x = gen_rtx_REG (mode, FLAGS_REG);
+ x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
+
+ machine_mode dest_mode = GET_MODE (dest);
+ if (!SCALAR_INT_MODE_P (dest_mode))
{
- umul = gen_vec_widen_umult_even_v16si;
- nmode = V16SImode;
+ error ("invalid type for asm flag output");
+ continue;
}
- else
- gcc_unreachable ();
+ if (dest_mode == DImode && !TARGET_64BIT)
+ dest_mode = SImode;
- /* Multiply low parts. */
- t1 = gen_reg_rtx (mode);
- emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
+ if (dest_mode != QImode)
+ {
+ rtx destqi = gen_reg_rtx (QImode);
+ emit_insn (gen_rtx_SET (destqi, x));
- /* Shift input vectors right 32 bits so we can multiply high parts. */
- t6 = GEN_INT (32);
- t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
- t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
+ if (TARGET_ZERO_EXTEND_WITH_AND
+ && optimize_function_for_speed_p (cfun))
+ {
+ x = force_reg (dest_mode, const0_rtx);
- /* Multiply high parts by low parts. */
- t4 = gen_reg_rtx (mode);
- t5 = gen_reg_rtx (mode);
- emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
- emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
+ emit_insn (gen_movstrictqi
+ (gen_lowpart (QImode, x), destqi));
+ }
+ else
+ x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
+ }
- /* Combine and shift the highparts back. */
- t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
- t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
+ if (dest_mode != GET_MODE (dest))
+ {
+ rtx tmp = gen_reg_rtx (SImode);
- /* Combine high and low parts. */
- force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
+ emit_insn (gen_rtx_SET (tmp, x));
+ emit_insn (gen_zero_extendsidi2 (dest, tmp));
+ }
+ else
+ emit_insn (gen_rtx_SET (dest, x));
}
+ rtx_insn *seq = get_insns ();
+ end_sequence ();
- set_unique_reg_note (get_last_insn (), REG_EQUAL,
- gen_rtx_MULT (mode, op1, op2));
+ if (saw_asm_flag)
+ return seq;
+ else
+ {
+ /* If we had no asm flag outputs, clobber the flags. */
+ clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
+ SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
+ return NULL;
+ }
}
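+
+/* A user-level sketch of the "=@cc<cond>" asm flag outputs handled above
+ (illustrative variables, not from any testcase):
+
+     unsigned long x, y;
+     int carry;
+     asm ("addq %2, %0" : "+r" (x), "=@ccc" (carry) : "r" (y));
+
+ Afterwards CARRY is 1 iff the addition set the carry flag; the "n" prefix,
+ as in "=@ccnc", gives the inverted condition. */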
-/* Return 1 if control tansfer instruction INSN
- should be encoded with notrack prefix. */
+/* Implements the target hook targetm.asm.encode_section_info. */
-static bool
-ix86_notrack_prefixed_insn_p (rtx insn)
+static void ATTRIBUTE_UNUSED
+ix86_encode_section_info (tree decl, rtx rtl, int first)
{
- if (!insn || !((flag_cf_protection & CF_BRANCH)))
- return false;
-
- if (CALL_P (insn))
- {
- rtx call = get_call_rtx_from (insn);
- gcc_assert (call != NULL_RTX);
- rtx addr = XEXP (call, 0);
+ default_encode_section_info (decl, rtl, first);
- /* Do not emit 'notrack' if it's not an indirect call. */
- if (MEM_P (addr)
- && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
- return false;
- else
- return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
- }
+ if (ix86_in_large_data_p (decl))
+ SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
+}
- if (JUMP_P (insn) && !flag_cet_switch)
- {
- rtx target = JUMP_LABEL (insn);
- if (target == NULL_RTX || ANY_RETURN_P (target))
- return false;
+/* Worker function for REVERSE_CONDITION. */
- /* Check the jump is a switch table. */
- rtx_insn *label = as_a<rtx_insn *> (target);
- rtx_insn *table = next_insn (label);
- if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
- return false;
- else
- return true;
- }
- return false;
+enum rtx_code
+ix86_reverse_condition (enum rtx_code code, machine_mode mode)
+{
+ return (mode == CCFPmode
+ ? reverse_condition_maybe_unordered (code)
+ : reverse_condition (code));
}
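+
+/* For example, reversing LT in CCFPmode yields UNGE, so the unordered (NaN)
+ case stays on the reversed branch; in the integer CC modes it is simply
+ GE. */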
-/* Calculate integer abs() using only SSE2 instructions. */
+/* Output code to perform an x87 FP register move, from OPERANDS[1]
+ to OPERANDS[0]. */
-void
-ix86_expand_sse2_abs (rtx target, rtx input)
+const char *
+output_387_reg_move (rtx_insn *insn, rtx *operands)
{
- machine_mode mode = GET_MODE (target);
- rtx tmp0, tmp1, x;
-
- switch (mode)
+ if (REG_P (operands[0]))
{
- case E_V2DImode:
- case E_V4DImode:
- /* For 64-bit signed integer X, with SSE4.2 use
- pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
- Otherwise handle it similarly to V4SImode, except use 64 as W instead of
- 32 and use logical instead of arithmetic right shift (which is
- unimplemented) and subtract. */
- if (TARGET_SSE4_2)
- {
- tmp0 = gen_reg_rtx (mode);
- tmp1 = gen_reg_rtx (mode);
- emit_move_insn (tmp1, CONST0_RTX (mode));
- if (mode == E_V2DImode)
- emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
- else
- emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
+ if (REG_P (operands[1])
+ && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
+ {
+ if (REGNO (operands[0]) == FIRST_STACK_REG)
+ return output_387_ffreep (operands, 0);
+ return "fstp\t%y0";
}
+ if (STACK_TOP_P (operands[0]))
+ return "fld%Z1\t%y1";
+ return "fst\t%y0";
+ }
+ else if (MEM_P (operands[0]))
+ {
+ gcc_assert (REG_P (operands[1]));
+ if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
+ return "fstp%Z0\t%y0";
else
{
- tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
- GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
- - 1), NULL, 0, OPTAB_DIRECT);
- tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
+ /* There is no non-popping store to memory for XFmode.
+ So if we need one, follow the store with a load. */
+ if (GET_MODE (operands[0]) == XFmode)
+ return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
+ else
+ return "fst%Z0\t%y0";
}
-
- tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
- NULL, 0, OPTAB_DIRECT);
- x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
- target, 0, OPTAB_DIRECT);
- break;
-
- case E_V4SImode:
- /* For 32-bit signed integer X, the best way to calculate the absolute
- value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
- tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
- GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
- NULL, 0, OPTAB_DIRECT);
- tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
- NULL, 0, OPTAB_DIRECT);
- x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
- target, 0, OPTAB_DIRECT);
- break;
-
- case E_V8HImode:
- /* For 16-bit signed integer X, the best way to calculate the absolute
- value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
- tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
-
- x = expand_simple_binop (mode, SMAX, tmp0, input,
- target, 0, OPTAB_DIRECT);
- break;
-
- case E_V16QImode:
- /* For 8-bit signed integer X, the best way to calculate the absolute
- value of X is min ((unsigned char) X, (unsigned char) (-X)),
- as SSE2 provides the PMINUB insn. */
- tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
-
- x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
- target, 0, OPTAB_DIRECT);
- break;
-
- default:
- gcc_unreachable ();
}
-
- if (x != target)
- emit_move_insn (target, x);
+ else
+ gcc_unreachable ();
}
+#ifdef TARGET_SOLARIS
+/* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
-/* Expand an extract from a vector register through pextr insn.
- Return true if successful. */
-
-bool
-ix86_expand_pextr (rtx *operands)
+static void
+i386_solaris_elf_named_section (const char *name, unsigned int flags,
+ tree decl)
{
- rtx dst = operands[0];
- rtx src = operands[1];
-
- unsigned int size = INTVAL (operands[2]);
- unsigned int pos = INTVAL (operands[3]);
-
- if (SUBREG_P (dst))
+ /* With Binutils 2.15, the "@unwind" marker must be specified on
+ every occurrence of the ".eh_frame" section, not just the first
+ one. */
+ if (TARGET_64BIT
+ && strcmp (name, ".eh_frame") == 0)
{
- /* Reject non-lowpart subregs. */
- if (SUBREG_BYTE (dst) > 0)
- return false;
- dst = SUBREG_REG (dst);
+ fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
+ flags & SECTION_WRITE ? "aw" : "a");
+ return;
}
-
- if (SUBREG_P (src))
+
+#ifndef USE_GAS
+ if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
{
- pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
- src = SUBREG_REG (src);
+ solaris_elf_asm_comdat_section (name, flags, decl);
+ return;
}
- switch (GET_MODE (src))
+ /* Solaris/x86 as uses the same syntax for the SHF_EXCLUDE flag as the
+ SPARC assembler. One cannot mix single-letter flags and #exclude, so
+ only emit the latter here. */
+ if (flags & SECTION_EXCLUDE)
{
- case E_V16QImode:
- case E_V8HImode:
- case E_V4SImode:
- case E_V2DImode:
- case E_V1TImode:
- case E_TImode:
- {
- machine_mode srcmode, dstmode;
- rtx d, pat;
+ fprintf (asm_out_file, "\t.section\t%s,#exclude\n", name);
+ return;
+ }
+#endif
- if (!int_mode_for_size (size, 0).exists (&dstmode))
- return false;
+ default_elf_asm_named_section (name, flags, decl);
+}
+#endif /* TARGET_SOLARIS */
- switch (dstmode)
- {
- case E_QImode:
- if (!TARGET_SSE4_1)
- return false;
- srcmode = V16QImode;
- break;
+/* Return the mangling of TYPE if it is an extended fundamental type. */
- case E_HImode:
- if (!TARGET_SSE2)
- return false;
- srcmode = V8HImode;
- break;
+static const char *
+ix86_mangle_type (const_tree type)
+{
+ type = TYPE_MAIN_VARIANT (type);
- case E_SImode:
- if (!TARGET_SSE4_1)
- return false;
- srcmode = V4SImode;
- break;
+ if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
+ && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
+ return NULL;
- case E_DImode:
- gcc_assert (TARGET_64BIT);
- if (!TARGET_SSE4_1)
- return false;
- srcmode = V2DImode;
- break;
+ switch (TYPE_MODE (type))
+ {
+ case E_TFmode:
+ /* __float128 is "g". */
+ return "g";
+ case E_XFmode:
+ /* "long double" or __float80 is "e". */
+ return "e";
+ default:
+ return NULL;
+ }
+}
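+
+/* For instance, with this hook "void f (__float128)" mangles as "_Z1fg",
+ while the 80-bit "long double" keeps its usual "e" encoding. */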
- default:
- return false;
- }
+static GTY(()) tree ix86_tls_stack_chk_guard_decl;
+
+static tree
+ix86_stack_protect_guard (void)
+{
+ if (TARGET_SSP_TLS_GUARD)
+ {
+ tree type_node = lang_hooks.types.type_for_mode (ptr_mode, 1);
+ int qual = ENCODE_QUAL_ADDR_SPACE (ix86_stack_protector_guard_reg);
+ tree type = build_qualified_type (type_node, qual);
+ tree t;
- /* Reject extractions from misaligned positions. */
- if (pos & (size-1))
- return false;
+ if (global_options_set.x_ix86_stack_protector_guard_symbol_str)
+ {
+ t = ix86_tls_stack_chk_guard_decl;
- if (GET_MODE (dst) == dstmode)
- d = dst;
- else
- d = gen_reg_rtx (dstmode);
+ if (t == NULL)
+ {
+ rtx x;
- /* Construct insn pattern. */
- pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
- pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
+ t = build_decl
+ (UNKNOWN_LOCATION, VAR_DECL,
+ get_identifier (ix86_stack_protector_guard_symbol_str),
+ type);
+ TREE_STATIC (t) = 1;
+ TREE_PUBLIC (t) = 1;
+ DECL_EXTERNAL (t) = 1;
+ TREE_USED (t) = 1;
+ TREE_THIS_VOLATILE (t) = 1;
+ DECL_ARTIFICIAL (t) = 1;
+ DECL_IGNORED_P (t) = 1;
- /* Let the rtl optimizers know about the zero extension performed. */
- if (dstmode == QImode || dstmode == HImode)
- {
- pat = gen_rtx_ZERO_EXTEND (SImode, pat);
- d = gen_lowpart (SImode, d);
- }
+ /* Do not share RTL, as the declaration is visible outside of the
+ current function. */
+ x = DECL_RTL (t);
+ RTX_FLAG (x, used) = 1;
- emit_insn (gen_rtx_SET (d, pat));
+ ix86_tls_stack_chk_guard_decl = t;
+ }
+ }
+ else
+ {
+ tree asptrtype = build_pointer_type (type);
- if (d != dst)
- emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
- return true;
- }
+ t = build_int_cst (asptrtype, ix86_stack_protector_guard_offset);
+ t = build2 (MEM_REF, asptrtype, t,
+ build_int_cst (asptrtype, 0));
+ TREE_THIS_VOLATILE (t) = 1;
+ }
- default:
- return false;
+ return t;
}
+
+ return default_stack_protect_guard ();
}
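+
+/* For example, the TLS path above is what
+
+     -mstack-protector-guard=tls -mstack-protector-guard-reg=fs
+     -mstack-protector-guard-offset=40
+
+ selects: the canary is read from %fs:40 (the usual x86-64 glibc slot)
+ rather than from a global __stack_chk_guard symbol. */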
-/* Expand an insert into a vector register through pinsr insn.
- Return true if successful. */
+/* For 32-bit code we can save the PIC register setup by calling the hidden
+ function __stack_chk_fail_local instead of calling __stack_chk_fail
+ directly. 64-bit code doesn't need to set up any PIC register, so it is
+ better to call __stack_chk_fail directly. */
-bool
-ix86_expand_pinsr (rtx *operands)
+static tree ATTRIBUTE_UNUSED
+ix86_stack_protect_fail (void)
{
- rtx dst = operands[0];
- rtx src = operands[3];
+ return TARGET_64BIT
+ ? default_external_stack_protect_fail ()
+ : default_hidden_stack_protect_fail ();
+}
- unsigned int size = INTVAL (operands[1]);
- unsigned int pos = INTVAL (operands[2]);
+/* Select a format to encode pointers in exception handling data. CODE
+ is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
+ true if the symbol may be affected by dynamic relocations.
- if (SUBREG_P (dst))
+ ??? All x86 object file formats are capable of representing this.
+ After all, the relocation needed is the same as for the call insn.
+ Whether or not a particular assembler allows us to enter such, I
+ guess we'll have to see. */
+int
+asm_preferred_eh_data_format (int code, int global)
+{
+ if (flag_pic)
+ {
+ int type = DW_EH_PE_sdata8;
+ if (!TARGET_64BIT
+ || ix86_cmodel == CM_SMALL_PIC
+ || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
+ type = DW_EH_PE_sdata4;
+ return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
+ }
+ if (ix86_cmodel == CM_SMALL
+ || (ix86_cmodel == CM_MEDIUM && code))
+ return DW_EH_PE_udata4;
+ return DW_EH_PE_absptr;
+}
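+
+/* As an illustration: for -fpic with the small code model this returns
+ DW_EH_PE_pcrel | DW_EH_PE_sdata4 (0x1b), plus DW_EH_PE_indirect (0x9b in
+ total) when GLOBAL is set. */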
+\f
+/* Implement targetm.vectorize.builtin_vectorization_cost. */
+static int
+ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
+ tree vectype, int)
+{
+ bool fp = false;
+ machine_mode mode = TImode;
+ int index;
+ if (vectype != NULL)
{
- pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
- dst = SUBREG_REG (dst);
+ fp = FLOAT_TYPE_P (vectype);
+ mode = TYPE_MODE (vectype);
}
- switch (GET_MODE (dst))
+ switch (type_of_cost)
{
- case E_V16QImode:
- case E_V8HImode:
- case E_V4SImode:
- case E_V2DImode:
- case E_V1TImode:
- case E_TImode:
- {
- machine_mode srcmode, dstmode;
- rtx (*pinsr)(rtx, rtx, rtx, rtx);
- rtx d;
+ case scalar_stmt:
+ return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
- if (!int_mode_for_size (size, 0).exists (&srcmode))
- return false;
+ case scalar_load:
+ /* Load/store costs are relative to a register move, which is 2. Recompute
+ them with COSTS_N_INSNS so everything has the same base. */
+ return COSTS_N_INSNS (fp ? ix86_cost->sse_load[0]
+ : ix86_cost->int_load [2]) / 2;
- switch (srcmode)
- {
- case E_QImode:
- if (!TARGET_SSE4_1)
- return false;
- dstmode = V16QImode;
- pinsr = gen_sse4_1_pinsrb;
- break;
+ case scalar_store:
+ return COSTS_N_INSNS (fp ? ix86_cost->sse_store[0]
+ : ix86_cost->int_store [2]) / 2;
- case E_HImode:
- if (!TARGET_SSE2)
- return false;
- dstmode = V8HImode;
- pinsr = gen_sse2_pinsrw;
- break;
+ case vector_stmt:
+ return ix86_vec_cost (mode,
+ fp ? ix86_cost->addss : ix86_cost->sse_op);
- case E_SImode:
- if (!TARGET_SSE4_1)
- return false;
- dstmode = V4SImode;
- pinsr = gen_sse4_1_pinsrd;
- break;
+ case vector_load:
+ index = sse_store_index (mode);
+ /* See PR82713 - we may end up being called on non-vector type. */
+ if (index < 0)
+ index = 2;
+ return COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2;
- case E_DImode:
- gcc_assert (TARGET_64BIT);
- if (!TARGET_SSE4_1)
- return false;
- dstmode = V2DImode;
- pinsr = gen_sse4_1_pinsrq;
- break;
+ case vector_store:
+ index = sse_store_index (mode);
+ /* See PR82713 - we may end up being called on non-vector type. */
+ if (index < 0)
+ index = 2;
+ return COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2;
- default:
- return false;
- }
+ case vec_to_scalar:
+ case scalar_to_vec:
+ return ix86_vec_cost (mode, ix86_cost->sse_op);
- /* Reject insertions to misaligned positions. */
- if (pos & (size-1))
- return false;
+ /* We should have separate costs for unaligned loads and gather/scatter.
+ Do that incrementally. */
+ case unaligned_load:
+ index = sse_store_index (mode);
+ /* See PR82713 - we may end up being called on non-vector type. */
+ if (index < 0)
+ index = 2;
+ return COSTS_N_INSNS (ix86_cost->sse_unaligned_load[index]) / 2;
- if (SUBREG_P (src))
- {
- unsigned int srcpos = SUBREG_BYTE (src);
+ case unaligned_store:
+ index = sse_store_index (mode);
+ /* See PR82713 - we may end up being called on non-vector type. */
+ if (index < 0)
+ index = 2;
+ return COSTS_N_INSNS (ix86_cost->sse_unaligned_store[index]) / 2;
- if (srcpos > 0)
- {
- rtx extr_ops[4];
+ case vector_gather_load:
+ return ix86_vec_cost (mode,
+ COSTS_N_INSNS
+ (ix86_cost->gather_static
+ + ix86_cost->gather_per_elt
+ * TYPE_VECTOR_SUBPARTS (vectype)) / 2);
- extr_ops[0] = gen_reg_rtx (srcmode);
- extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
- extr_ops[2] = GEN_INT (size);
- extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
+ case vector_scatter_store:
+ return ix86_vec_cost (mode,
+ COSTS_N_INSNS
+ (ix86_cost->scatter_static
+ + ix86_cost->scatter_per_elt
+ * TYPE_VECTOR_SUBPARTS (vectype)) / 2);
- if (!ix86_expand_pextr (extr_ops))
- return false;
+ case cond_branch_taken:
+ return ix86_cost->cond_taken_branch_cost;
- src = extr_ops[0];
- }
- else
- src = gen_lowpart (srcmode, SUBREG_REG (src));
- }
+ case cond_branch_not_taken:
+ return ix86_cost->cond_not_taken_branch_cost;
- if (GET_MODE (dst) == dstmode)
- d = dst;
- else
- d = gen_reg_rtx (dstmode);
+ case vec_perm:
+ case vec_promote_demote:
+ return ix86_vec_cost (mode, ix86_cost->sse_op);
- emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
- gen_lowpart (srcmode, src),
- GEN_INT (1 << (pos / size))));
- if (d != dst)
- emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
- return true;
- }
+ case vec_construct:
+ {
+ /* N element inserts into SSE vectors. */
+ int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
+ /* One vinserti128 for combining two SSE vectors for AVX256. */
+ if (GET_MODE_BITSIZE (mode) == 256)
+ cost += ix86_vec_cost (mode, ix86_cost->addss);
+ /* One vinserti64x4 and two vinserti128 for combining SSE
+ and AVX256 vectors to AVX512. */
+ else if (GET_MODE_BITSIZE (mode) == 512)
+ cost += 3 * ix86_vec_cost (mode, ix86_cost->addss);
+ return cost;
+ }
- default:
- return false;
+ default:
+ gcc_unreachable ();
}
}
+
\f
/* This function returns the calling abi specific va_list type node.
It returns the FNDECL specific va_list type. */
}
}
-/* All CPUs prefer to avoid cross-lane operations so perform reductions
- upper against lower halves up to SSE reg size. */
-
-static machine_mode
-ix86_split_reduction (machine_mode mode)
-{
- /* Reduce lowpart against highpart until we reach SSE reg width to
- avoid cross-lane operations. */
- switch (mode)
- {
- case E_V8DImode:
- case E_V4DImode:
- return V2DImode;
- case E_V16SImode:
- case E_V8SImode:
- return V4SImode;
- case E_V32HImode:
- case E_V16HImode:
- return V8HImode;
- case E_V64QImode:
- case E_V32QImode:
- return V16QImode;
- case E_V16SFmode:
- case E_V8SFmode:
- return V4SFmode;
- case E_V8DFmode:
- case E_V4DFmode:
- return V2DFmode;
- default:
- return mode;
- }
-}
-
/* If AVX is enabled then try vectorizing with both 256bit and 128bit
vectors. If AVX512F is enabled then try vectorizing with 512bit,
256bit and 128bit vectors. */
return ret;
}
-/* Add target attribute to SIMD clone NODE if needed. */
-
-static void
-ix86_simd_clone_adjust (struct cgraph_node *node)
-{
- const char *str = NULL;
-
- /* Attributes need to be adjusted for definitions, not declarations. */
- if (!node->definition)
- return;
-
- gcc_assert (node->decl == cfun->decl);
- switch (node->simdclone->vecsize_mangle)
- {
- case 'b':
- if (!TARGET_SSE2)
- str = "sse2";
- break;
- case 'c':
- if (!TARGET_AVX)
- str = "avx";
- break;
- case 'd':
- if (!TARGET_AVX2)
- str = "avx2";
- break;
- case 'e':
- if (!TARGET_AVX512F)
- str = "avx512f";
- break;
- default:
- gcc_unreachable ();
- }
- if (str == NULL)
- return;
- push_cfun (NULL);
- tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
- bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
- gcc_assert (ok);
- pop_cfun ();
- ix86_reset_previous_fndecl ();
- ix86_set_current_function (node->decl);
-}
-
/* If SIMD clone NODE can't be used in a vectorized loop
in current function, return -1, otherwise return a badness of using it
(0 if it is most desirable from vecsize_mangle point of view, 1
tree fenv_ptr = build_pointer_type (fenv_type);
tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
fenv_addr = fold_convert (ptr_type_node, fenv_addr);
- tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
- tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
- tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
- tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
+ tree fnstenv = get_ix86_builtin (IX86_BUILTIN_FNSTENV);
+ tree fldenv = get_ix86_builtin (IX86_BUILTIN_FLDENV);
+ tree fnstsw = get_ix86_builtin (IX86_BUILTIN_FNSTSW);
+ tree fnclex = get_ix86_builtin (IX86_BUILTIN_FNCLEX);
tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
tree hold_fnclex = build_call_expr (fnclex, 0);
fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv,
{
tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node);
tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node);
- tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
- tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
+ tree stmxcsr = get_ix86_builtin (IX86_BUILTIN_STMXCSR);
+ tree ldmxcsr = get_ix86_builtin (IX86_BUILTIN_LDMXCSR);
tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
mxcsr_orig_var, stmxcsr_hold_call);
#endif
}
-/* Generate call to __divmoddi4. */
-
-static void
-ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
- rtx op0, rtx op1,
- rtx *quot_p, rtx *rem_p)
-{
- rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
-
- rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
- mode, op0, mode, op1, mode,
- XEXP (rem, 0), Pmode);
- *quot_p = quot;
- *rem_p = rem;
-}
-
/* Set the value of FLT_EVAL_METHOD in float.h. When using only the
FPU, assume that the fpcw is set to extended precision; when using
only SSE, rounding is correct; when using both SSE and the FPU,