-/* Copyright (C) 1988-2019 Free Software Foundation, Inc.
+/* Copyright (C) 1988-2020 Free Software Foundation, Inc.
This file is part of GCC.
#include "gimplify.h"
#include "dwarf2.h"
#include "tm-constrs.h"
-#include "params.h"
#include "cselib.h"
#include "sched-int.h"
#include "opts.h"
{
machine_mode half_mode;
unsigned int byte;
+ rtx mem_op = NULL_RTX;
+ int mem_num = 0;
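+  /* Remember the first memory operand we split so that a second
+     occurrence of the same memory RTX can reuse its halves instead
+     of calling adjust_address again.  */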
switch (mode)
{
case E_DImode:
half_mode = SImode;
break;
+ case E_P2HImode:
+ half_mode = HImode;
+ break;
+ case E_P2QImode:
+ half_mode = QImode;
+ break;
default:
gcc_unreachable ();
}
but we still have to handle it. */
if (MEM_P (op))
{
- lo_half[num] = adjust_address (op, half_mode, 0);
- hi_half[num] = adjust_address (op, half_mode, byte);
+ if (mem_op && rtx_equal_p (op, mem_op))
+ {
+ lo_half[num] = lo_half[mem_num];
+ hi_half[num] = hi_half[mem_num];
+ }
+ else
+ {
+ mem_op = op;
+ mem_num = num;
+ lo_half[num] = adjust_address (op, half_mode, 0);
+ hi_half[num] = adjust_address (op, half_mode, byte);
+ }
}
else
{
JUMP_LABEL (insn) = qimode_label;
/* Generate original signed/unsigned divmod.  */
- div = gen_divmod4_1 (operands[0], operands[1],
- operands[2], operands[3]);
- emit_insn (div);
+ emit_insn (gen_divmod4_1 (operands[0], operands[1],
+ operands[2], operands[3]));
/* Branch to the end. */
emit_jump_insn (gen_jump (end_label));
}
/* Extract remainder from AH. */
- tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]),
- tmp0, GEN_INT (8), GEN_INT (8));
- if (REG_P (operands[1]))
- insn = emit_move_insn (operands[1], tmp1);
- else
- {
- /* Need a new scratch register since the old one has result
- of 8bit divide. */
- scratch = gen_reg_rtx (GET_MODE (operands[1]));
- emit_move_insn (scratch, tmp1);
- insn = emit_move_insn (operands[1], scratch);
- }
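+  /* The 8-bit divide left the quotient in AL and the remainder in AH;
+     bits [8,16) of the scratch register hold the remainder.  */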
+ scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
+ tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
+ GEN_INT (8), GEN_INT (8));
+ insn = emit_move_insn (operands[1], tmp1);
set_unique_reg_note (insn, REG_EQUAL, mod);
/* Zero extend quotient from AL. */
OPTAB_DIRECT);
else
{
- rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
+ rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
two31 = ix86_build_const_vector (intmode, 1, two31);
*xorp = expand_simple_binop (intmode, AND,
gen_lowpart (intmode, tmp[0]),
machine_mode vmode = mode;
rtvec par;
- if (vector_mode)
- use_sse = true;
- else if (mode == TFmode)
+ if (vector_mode || mode == TFmode)
use_sse = true;
else if (TARGET_SSE_MATH)
{
Create the appropriate mask now. */
mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
use = gen_rtx_USE (VOIDmode, mask);
- if (vector_mode)
+ if (vector_mode || mode == TFmode)
par = gen_rtvec (2, set, use);
else
{
switch (code)
{
- case GT:
- case GE:
case LT:
case LE:
+ case GT:
+ case GE:
+ case LTGT:
return false;
case EQ:
case NE:
- case LTGT:
case UNORDERED:
case ORDERED:
case UNLT:
{
gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
- /* We may be reversing unordered compare to normal compare, that
- is not valid in general (we may convert non-trapping condition
- to trapping one), however on i386 we currently emit all
- comparisons unordered. */
- new_code = reverse_condition_maybe_unordered (code);
+ /* We may be reversing a non-trapping
+ comparison to a trapping comparison. */
+ if (HONOR_NANS (cmp_mode) && flag_trapping_math
+ && code != EQ && code != NE
+ && code != ORDERED && code != UNORDERED)
+ new_code = UNKNOWN;
+ else
+ new_code = reverse_condition_maybe_unordered (code);
}
else
new_code = ix86_reverse_condition (code, cmp_mode);
}
if (cf != 0)
{
- tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
+ tmp = plus_constant (mode, tmp, cf);
nops++;
}
if (!rtx_equal_p (tmp, out))
{
gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
- /* We may be reversing unordered compare to normal compare,
- that is not valid in general (we may convert non-trapping
- condition to trapping one), however on i386 we currently
- emit all comparisons unordered. */
- new_code = reverse_condition_maybe_unordered (code);
+ /* We may be reversing a non-trapping
+ comparison to a trapping comparison. */
+ if (HONOR_NANS (cmp_mode) && flag_trapping_math
+ && code != EQ && code != NE
+ && code != ORDERED && code != UNORDERED)
+ new_code = UNKNOWN;
+ else
+ new_code = reverse_condition_maybe_unordered (code);
+
}
else
{
{
var = operands[2];
if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
- operands[2] = constm1_rtx, op = and_optab;
+ {
+ /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
+ "x <= 0 ? x : 0" to enable sign_bit_compare_p. */
+ if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
+ operands[1] = simplify_gen_relational (LT, VOIDmode,
+ GET_MODE (op0),
+ op0, const0_rtx);
+
+ operands[2] = constm1_rtx;
+ op = and_optab;
+ }
else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
operands[2] = const0_rtx, op = ior_optab;
else
return true;
}
+/* Return true if MODE is valid for a vector compare to a mask register;
+   the same holds for a conditional vector move with a mask register.  */
+static bool
+ix86_valid_mask_cmp_mode (machine_mode mode)
+{
+ /* XOP has its own vector conditional movement. */
+ if (TARGET_XOP && !TARGET_AVX512F)
+ return false;
+
+  /* AVX512F is needed for mask operations.  */
+ if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
+ return false;
+
+  /* AVX512BW is needed for vector QI/HImode,
+     AVX512VL is needed for 128/256-bit vectors.  */
+ machine_mode inner_mode = GET_MODE_INNER (mode);
+ int vector_size = GET_MODE_SIZE (mode);
+ if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
+ return false;
+
+ return vector_size == 64 || TARGET_AVX512VL;
+}
+
/* Expand an SSE comparison. Return the register with the result. */
static rtx
bool maskcmp = false;
rtx x;
- if (GET_MODE_SIZE (cmp_ops_mode) == 64)
+ if (ix86_valid_mask_cmp_mode (cmp_ops_mode))
{
unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
- cmp_mode = int_mode_for_size (nbits, 0).require ();
maskcmp = true;
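+      /* The mask has one bit per vector element and QImode is the
+	 narrowest scalar integer mode, so masks with fewer than 8
+	 elements still use QImode.  */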
+ cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
}
else
cmp_mode = cmp_ops_mode;
|| (op_false && reg_overlap_mentioned_p (dest, op_false)))
dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
- /* Compare patterns for int modes are unspec in AVX512F only. */
- if (maskcmp && (code == GT || code == EQ))
+ if (maskcmp)
{
- rtx (*gen)(rtx, rtx, rtx);
-
- switch (cmp_ops_mode)
- {
- case E_V64QImode:
- gcc_assert (TARGET_AVX512BW);
- gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
- break;
- case E_V32HImode:
- gcc_assert (TARGET_AVX512BW);
- gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
- break;
- case E_V16SImode:
- gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
- break;
- case E_V8DImode:
- gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
- break;
- default:
- gen = NULL;
- }
-
- if (gen)
- {
- emit_insn (gen (dest, cmp_op0, cmp_op1));
- return dest;
- }
+ bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
+ gcc_assert (ok);
+ return dest;
}
+
x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
if (cmp_mode != mode && !maskcmp)
machine_mode cmpmode = GET_MODE (cmp);
/* In AVX512F the result of comparison is an integer mask. */
- bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
+ bool maskcmp = mode != cmpmode && ix86_valid_mask_cmp_mode (mode);
rtx t2, t3, x;
if (maskcmp)
{
- rtx (*gen) (rtx, rtx) = NULL;
- if ((op_true == CONST0_RTX (mode)
- && vector_all_ones_operand (op_false, mode))
- || (op_false == CONST0_RTX (mode)
- && vector_all_ones_operand (op_true, mode)))
- switch (mode)
- {
- case E_V64QImode:
- if (TARGET_AVX512BW)
- gen = gen_avx512bw_cvtmask2bv64qi;
- break;
- case E_V32QImode:
- if (TARGET_AVX512VL && TARGET_AVX512BW)
- gen = gen_avx512vl_cvtmask2bv32qi;
- break;
- case E_V16QImode:
- if (TARGET_AVX512VL && TARGET_AVX512BW)
- gen = gen_avx512vl_cvtmask2bv16qi;
- break;
- case E_V32HImode:
- if (TARGET_AVX512BW)
- gen = gen_avx512bw_cvtmask2wv32hi;
- break;
- case E_V16HImode:
- if (TARGET_AVX512VL && TARGET_AVX512BW)
- gen = gen_avx512vl_cvtmask2wv16hi;
- break;
- case E_V8HImode:
- if (TARGET_AVX512VL && TARGET_AVX512BW)
- gen = gen_avx512vl_cvtmask2wv8hi;
- break;
- case E_V16SImode:
- if (TARGET_AVX512DQ)
- gen = gen_avx512f_cvtmask2dv16si;
- break;
- case E_V8SImode:
- if (TARGET_AVX512VL && TARGET_AVX512DQ)
- gen = gen_avx512vl_cvtmask2dv8si;
- break;
- case E_V4SImode:
- if (TARGET_AVX512VL && TARGET_AVX512DQ)
- gen = gen_avx512vl_cvtmask2dv4si;
- break;
- case E_V8DImode:
- if (TARGET_AVX512DQ)
- gen = gen_avx512f_cvtmask2qv8di;
- break;
- case E_V4DImode:
- if (TARGET_AVX512VL && TARGET_AVX512DQ)
- gen = gen_avx512vl_cvtmask2qv4di;
- break;
- case E_V2DImode:
- if (TARGET_AVX512VL && TARGET_AVX512DQ)
- gen = gen_avx512vl_cvtmask2qv2di;
- break;
- default:
- break;
- }
- if (gen && SCALAR_INT_MODE_P (cmpmode))
- {
- cmp = force_reg (cmpmode, cmp);
- if (op_true == CONST0_RTX (mode))
+ /* Using vector move with mask register. */
+ cmp = force_reg (cmpmode, cmp);
+ /* Optimize for mask zero. */
+ op_true = (op_true != CONST0_RTX (mode)
+ ? force_reg (mode, op_true) : op_true);
+ op_false = (op_false != CONST0_RTX (mode)
+ ? force_reg (mode, op_false) : op_false);
+ if (op_true == CONST0_RTX (mode))
+ {
+ rtx (*gen_not) (rtx, rtx);
+ switch (cmpmode)
{
- rtx (*gen_not) (rtx, rtx);
- switch (cmpmode)
- {
- case E_QImode: gen_not = gen_knotqi; break;
- case E_HImode: gen_not = gen_knothi; break;
- case E_SImode: gen_not = gen_knotsi; break;
- case E_DImode: gen_not = gen_knotdi; break;
- default: gcc_unreachable ();
- }
- rtx n = gen_reg_rtx (cmpmode);
- emit_insn (gen_not (n, cmp));
- cmp = n;
+ case E_QImode: gen_not = gen_knotqi; break;
+ case E_HImode: gen_not = gen_knothi; break;
+ case E_SImode: gen_not = gen_knotsi; break;
+ case E_DImode: gen_not = gen_knotdi; break;
+ default: gcc_unreachable ();
}
- emit_insn (gen (dest, cmp));
- return;
+ rtx n = gen_reg_rtx (cmpmode);
+ emit_insn (gen_not (n, cmp));
+ cmp = n;
+	  /* The mask is inverted, so swap op_true and op_false.  */
+ std::swap (op_true, op_false);
}
+
+ rtx vec_merge = gen_rtx_VEC_MERGE (mode, op_true, op_false, cmp);
+ emit_insn (gen_rtx_SET (dest, vec_merge));
+ return;
}
else if (vector_all_ones_operand (op_true, mode)
&& op_false == CONST0_RTX (mode))
/* Expand AVX-512 vector comparison. */
bool
-ix86_expand_mask_vec_cmp (rtx operands[])
+ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
{
- machine_mode mask_mode = GET_MODE (operands[0]);
- machine_mode cmp_mode = GET_MODE (operands[2]);
- enum rtx_code code = GET_CODE (operands[1]);
+ machine_mode mask_mode = GET_MODE (dest);
+ machine_mode cmp_mode = GET_MODE (cmp_op0);
rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
int unspec_code;
rtx unspec;
unspec_code = UNSPEC_PCMP;
}
- unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
- operands[3], imm),
+ unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
unspec_code);
- emit_insn (gen_rtx_SET (operands[0], unspec));
+ emit_insn (gen_rtx_SET (dest, unspec));
return true;
}
&& (mode == V16QImode || mode == V8HImode
|| mode == V4SImode || mode == V2DImode))
;
+  /* AVX512F supports all of the comparisons
+     on all 128/256/512-bit vector int types.  */
+ else if (ix86_valid_mask_cmp_mode (mode))
+ ;
else
{
/* Canonicalize the comparison to EQ, GT, GTU. */
emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
HOST_WIDE_INT size_to_move)
{
- rtx dst = destmem, src = *srcmem, adjust, tempreg;
+ rtx dst = destmem, src = *srcmem, tempreg;
enum insn_code code;
machine_mode move_mode;
int piece_size, i;
/* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
gcc_assert (size_to_move % piece_size == 0);
- adjust = GEN_INT (piece_size);
+
for (i = 0; i < size_to_move; i += piece_size)
{
/* We move from memory to memory, so we'll need to do it via
emit_insn (GEN_FCN (code) (dst, tempreg));
emit_move_insn (destptr,
- gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
+ plus_constant (Pmode, copy_rtx (destptr), piece_size));
emit_move_insn (srcptr,
- gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
+ plus_constant (Pmode, copy_rtx (srcptr), piece_size));
dst = adjust_automodify_address_nv (dst, move_mode, destptr,
piece_size);
emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
HOST_WIDE_INT size_to_move)
{
- rtx dst = destmem, adjust;
+ rtx dst = destmem;
enum insn_code code;
machine_mode move_mode;
int piece_size, i;
/* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
gcc_assert (size_to_move % piece_size == 0);
- adjust = GEN_INT (piece_size);
+
for (i = 0; i < size_to_move; i += piece_size)
{
if (piece_size <= GET_MODE_SIZE (word_mode))
emit_insn (GEN_FCN (code) (dst, promoted_val));
emit_move_insn (destptr,
- gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
+ plus_constant (Pmode, copy_rtx (destptr), piece_size));
dst = adjust_automodify_address_nv (dst, move_mode, destptr,
piece_size);
rtx reg = convert_modes (mode, QImode, val, true);
if (!TARGET_PARTIAL_REG_STALL)
- if (mode == SImode)
- emit_insn (gen_insvsi_1 (reg, reg));
- else
- emit_insn (gen_insvdi_1 (reg, reg));
+ emit_insn (gen_insv_1 (mode, reg, reg));
else
{
tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
&& optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
move_mode = wider_mode;
- if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128)
+ if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
move_mode = TImode;
/* Find the corresponding vector mode with the same size as MOVE_MODE.
reg,
tmpreg)));
/* Emit lea manually to avoid clobbering of flags. */
- emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
+ emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));
tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
case VOID_FTYPE_PV8SI_V8DI_UQI:
case VOID_FTYPE_PV8HI_V8DI_UQI:
case VOID_FTYPE_PV16HI_V16SI_UHI:
- case VOID_FTYPE_PV16QI_V8DI_UQI:
+ case VOID_FTYPE_PUDI_V8DI_UQI:
case VOID_FTYPE_PV16QI_V16SI_UHI:
case VOID_FTYPE_PV4SI_V4DI_UQI:
- case VOID_FTYPE_PV4SI_V2DI_UQI:
- case VOID_FTYPE_PV8HI_V4DI_UQI:
- case VOID_FTYPE_PV8HI_V2DI_UQI:
+ case VOID_FTYPE_PUDI_V2DI_UQI:
+ case VOID_FTYPE_PUDI_V4DI_UQI:
+ case VOID_FTYPE_PUSI_V2DI_UQI:
case VOID_FTYPE_PV8HI_V8SI_UQI:
- case VOID_FTYPE_PV8HI_V4SI_UQI:
- case VOID_FTYPE_PV16QI_V4DI_UQI:
- case VOID_FTYPE_PV16QI_V2DI_UQI:
- case VOID_FTYPE_PV16QI_V8SI_UQI:
- case VOID_FTYPE_PV16QI_V4SI_UQI:
+ case VOID_FTYPE_PUDI_V4SI_UQI:
+ case VOID_FTYPE_PUSI_V4DI_UQI:
+ case VOID_FTYPE_PUHI_V2DI_UQI:
+ case VOID_FTYPE_PUDI_V8SI_UQI:
+ case VOID_FTYPE_PUSI_V4SI_UQI:
case VOID_FTYPE_PCHAR_V64QI_UDI:
case VOID_FTYPE_PCHAR_V32QI_USI:
case VOID_FTYPE_PCHAR_V16QI_UHI:
case VOID_FTYPE_PFLOAT_V4SF_UQI:
case VOID_FTYPE_PV32QI_V32HI_USI:
case VOID_FTYPE_PV16QI_V16HI_UHI:
- case VOID_FTYPE_PV8QI_V8HI_UQI:
+ case VOID_FTYPE_PUDI_V8HI_UQI:
nargs = 2;
klass = store;
/* Reserve memory operand for target. */
klass = load;
memory = 0;
break;
- case VOID_FTYPE_UINT_UINT_UINT:
- case VOID_FTYPE_UINT64_UINT_UINT:
- case UCHAR_FTYPE_UINT_UINT_UINT:
- case UCHAR_FTYPE_UINT64_UINT_UINT:
- nargs = 3;
- klass = load;
- memory = ARRAY_SIZE (args);
- last_arg_constant = true;
- break;
default:
gcc_unreachable ();
}
{
if (!match)
{
- if (icode == CODE_FOR_lwp_lwpvalsi3
- || icode == CODE_FOR_lwp_lwpinssi3
- || icode == CODE_FOR_lwp_lwpvaldi3
- || icode == CODE_FOR_lwp_lwpinsdi3)
- error ("the last argument must be a 32-bit immediate");
- else
- error ("the last argument must be an 8-bit immediate");
+ error ("the last argument must be an 8-bit immediate");
return const0_rtx;
}
}
tree arg0, arg1, arg2, arg3, arg4;
rtx op0, op1, op2, op3, op4, pat, pat2, insn;
machine_mode mode0, mode1, mode2, mode3, mode4;
- unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
+ unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
/* For CPU builtins that can be folded, fold first and expand the fold. */
switch (fcode)
OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
- where for each this pair it is sufficient if either of the ISAs is
- enabled, plus if it is ored with other options also those others. */
+   where for each such pair it is sufficient if either of the ISAs is
+   enabled; if the pair is ORed with other options, those others must
+   be enabled as well.
+ OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */
if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
== (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
&& (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
== (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
&& (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
- /* Use SSE/SSE2/SSSE3 to emulate MMX intrinsics in 64-bit mode when
- MMX is disabled. NB: Since MMX intrinsics are marked with
- SSE/SSE2/SSSE3, enable them without SSE/SSE2/SSSE3 if MMX is
- enabled. */
- if (TARGET_MMX || TARGET_MMX_WITH_SSE)
- {
- if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX))
- == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX))
- && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX)) != 0)
- isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX);
- if (((bisa & (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX))
- == (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX))
- && (isa & (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX)) != 0)
- isa |= (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX);
- if (((bisa & (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX))
- == (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX))
- && (isa & (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX)) != 0)
- isa |= (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX);
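+  /* When MMX intrinsics are emulated with SSE (TARGET_MMX_WITH_SSE),
+     an MMX requirement is satisfied by SSE2 instead.  */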
+ if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE)
+ {
+ bisa &= ~OPTION_MASK_ISA_MMX;
+ bisa |= OPTION_MASK_ISA_SSE2;
}
if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
{
else
bisa |= OPTION_MASK_ABI_64;
char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
- (enum fpmath_unit) 0, false, add_abi_p);
+ (enum fpmath_unit) 0,
+ (enum prefer_vector_width) 0,
+ false, add_abi_p);
if (!opts)
error ("%qE needs unknown isa option", fndecl);
else
}
else
{
- rtx pat;
+ if (target == 0
+ || !register_operand (target, SImode))
+ target = gen_reg_rtx (SImode);
- target = gen_reg_rtx (SImode);
emit_move_insn (target, const0_rtx);
target = gen_rtx_SUBREG (QImode, target, 0);
- if (fcode == IX86_BUILTIN_ENQCMD)
- pat = gen_enqcmd (UNSPECV_ENQCMD, Pmode, op0, op1);
- else
- pat = gen_enqcmd (UNSPECV_ENQCMDS, Pmode, op0, op1);
-
- emit_insn (pat);
-
- emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
- gen_rtx_fmt_ee (EQ, QImode,
- SET_DEST (pat),
- const0_rtx)));
+ int unspecv = (fcode == IX86_BUILTIN_ENQCMD
+ ? UNSPECV_ENQCMD
+ : UNSPECV_ENQCMDS);
+ icode = code_for_enqcmd (unspecv, Pmode);
+ emit_insn (GEN_FCN (icode) (op0, op1));
+ emit_insn
+ (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
+ gen_rtx_fmt_ee (EQ, QImode,
+ gen_rtx_REG (CCZmode, FLAGS_REG),
+ const0_rtx)));
return SUBREG_REG (target);
}
case IX86_BUILTIN_LLWPCB:
arg0 = CALL_EXPR_ARG (exp, 0);
op0 = expand_normal (arg0);
- icode = CODE_FOR_lwp_llwpcb;
- if (!insn_data[icode].operand[0].predicate (op0, Pmode))
+
+ if (!register_operand (op0, Pmode))
op0 = ix86_zero_extend_to_Pmode (op0);
- emit_insn (gen_lwp_llwpcb (op0));
+ emit_insn (gen_lwp_llwpcb (Pmode, op0));
return 0;
case IX86_BUILTIN_SLWPCB:
- icode = CODE_FOR_lwp_slwpcb;
if (!target
- || !insn_data[icode].operand[0].predicate (target, Pmode))
+ || !register_operand (target, Pmode))
target = gen_reg_rtx (Pmode);
- emit_insn (gen_lwp_slwpcb (target));
+ emit_insn (gen_lwp_slwpcb (Pmode, target));
return target;
+ case IX86_BUILTIN_LWPVAL32:
+ case IX86_BUILTIN_LWPVAL64:
+ case IX86_BUILTIN_LWPINS32:
+ case IX86_BUILTIN_LWPINS64:
+ mode = ((fcode == IX86_BUILTIN_LWPVAL32
+ || fcode == IX86_BUILTIN_LWPINS32)
+ ? SImode : DImode);
+
+ if (fcode == IX86_BUILTIN_LWPVAL32
+ || fcode == IX86_BUILTIN_LWPVAL64)
+ icode = code_for_lwp_lwpval (mode);
+ else
+ icode = code_for_lwp_lwpins (mode);
+
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ arg1 = CALL_EXPR_ARG (exp, 1);
+ arg2 = CALL_EXPR_ARG (exp, 2);
+ op0 = expand_normal (arg0);
+ op1 = expand_normal (arg1);
+ op2 = expand_normal (arg2);
+ mode0 = insn_data[icode].operand[0].mode;
+
+ if (!insn_data[icode].operand[0].predicate (op0, mode0))
+ op0 = copy_to_mode_reg (mode0, op0);
+ if (!insn_data[icode].operand[1].predicate (op1, SImode))
+ op1 = copy_to_mode_reg (SImode, op1);
+
+ if (!CONST_INT_P (op2))
+ {
+ error ("the last argument must be a 32-bit immediate");
+ return const0_rtx;
+ }
+
+ emit_insn (GEN_FCN (icode) (op0, op1, op2));
+
+ if (fcode == IX86_BUILTIN_LWPINS32
+ || fcode == IX86_BUILTIN_LWPINS64)
+ {
+ if (target == 0
+ || !nonimmediate_operand (target, QImode))
+ target = gen_reg_rtx (QImode);
+
+ pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
+ const0_rtx);
+ emit_insn (gen_rtx_SET (target, pat));
+
+ return target;
+ }
+ else
+ return 0;
+
case IX86_BUILTIN_BEXTRI32:
case IX86_BUILTIN_BEXTRI64:
+ mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
+
arg0 = CALL_EXPR_ARG (exp, 0);
arg1 = CALL_EXPR_ARG (exp, 1);
op0 = expand_normal (arg0);
op1 = expand_normal (arg1);
- icode = (fcode == IX86_BUILTIN_BEXTRI32
- ? CODE_FOR_tbm_bextri_si
- : CODE_FOR_tbm_bextri_di);
+
if (!CONST_INT_P (op1))
- {
- error ("last argument must be an immediate");
- return const0_rtx;
- }
+ {
+ error ("last argument must be an immediate");
+ return const0_rtx;
+ }
else
- {
- unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
- unsigned char lsb_index = INTVAL (op1) & 0xFF;
- op1 = GEN_INT (length);
- op2 = GEN_INT (lsb_index);
+ {
+ unsigned char lsb_index = UINTVAL (op1);
+ unsigned char length = UINTVAL (op1) >> 8;
+
+ unsigned char bitsize = GET_MODE_BITSIZE (mode);
+
+ icode = code_for_tbm_bextri (mode);
mode1 = insn_data[icode].operand[1].mode;
if (!insn_data[icode].operand[1].predicate (op0, mode1))
|| !register_operand (target, mode0))
target = gen_reg_rtx (mode0);
- pat = GEN_FCN (icode) (target, op0, op1, op2);
- if (pat)
- emit_insn (pat);
- return target;
- }
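+	  /* BEXTR of a zero-length field, or of one starting past the
+	     operand width, yields zero; fold it at expand time.  */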
+ if (length == 0 || lsb_index >= bitsize)
+ {
+ emit_move_insn (target, const0_rtx);
+ return target;
+ }
+
+ if (length + lsb_index > bitsize)
+ length = bitsize - lsb_index;
+
+ op1 = GEN_INT (length);
+ op2 = GEN_INT (lsb_index);
+
+ emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
+ return target;
+ }
case IX86_BUILTIN_RDRAND16_STEP:
- icode = CODE_FOR_rdrandhi_1;
- mode0 = HImode;
+ mode = HImode;
goto rdrand_step;
case IX86_BUILTIN_RDRAND32_STEP:
- icode = CODE_FOR_rdrandsi_1;
- mode0 = SImode;
+ mode = SImode;
goto rdrand_step;
case IX86_BUILTIN_RDRAND64_STEP:
- icode = CODE_FOR_rdranddi_1;
- mode0 = DImode;
+ mode = DImode;
rdrand_step:
arg0 = CALL_EXPR_ARG (exp, 0);
op1 = copy_addr_to_reg (op1);
}
- op0 = gen_reg_rtx (mode0);
- emit_insn (GEN_FCN (icode) (op0));
+ op0 = gen_reg_rtx (mode);
+ emit_insn (gen_rdrand (mode, op0));
- emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
+ emit_move_insn (gen_rtx_MEM (mode, op1), op0);
- op1 = gen_reg_rtx (SImode);
- emit_move_insn (op1, CONST1_RTX (SImode));
+ op1 = force_reg (SImode, const1_rtx);
/* Emit SImode conditional move. */
- if (mode0 == HImode)
+ if (mode == HImode)
{
if (TARGET_ZERO_EXTEND_WITH_AND
&& optimize_function_for_speed_p (cfun))
emit_insn (gen_zero_extendhisi2 (op2, op0));
}
}
- else if (mode0 == SImode)
+ else if (mode == SImode)
op2 = op0;
else
op2 = gen_rtx_SUBREG (SImode, op0, 0);
return target;
case IX86_BUILTIN_RDSEED16_STEP:
- icode = CODE_FOR_rdseedhi_1;
- mode0 = HImode;
+ mode = HImode;
goto rdseed_step;
case IX86_BUILTIN_RDSEED32_STEP:
- icode = CODE_FOR_rdseedsi_1;
- mode0 = SImode;
+ mode = SImode;
goto rdseed_step;
case IX86_BUILTIN_RDSEED64_STEP:
- icode = CODE_FOR_rdseeddi_1;
- mode0 = DImode;
+ mode = DImode;
rdseed_step:
arg0 = CALL_EXPR_ARG (exp, 0);
op1 = copy_addr_to_reg (op1);
}
- op0 = gen_reg_rtx (mode0);
- emit_insn (GEN_FCN (icode) (op0));
+ op0 = gen_reg_rtx (mode);
+ emit_insn (gen_rdseed (mode, op0));
- emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
+ emit_move_insn (gen_rtx_MEM (mode, op1), op0);
op2 = gen_reg_rtx (QImode);
tree fndecl = gimple_call_fndecl (def_stmt);
if (fndecl
&& fndecl_built_in_p (fndecl, BUILT_IN_MD))
- switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
+ switch (DECL_MD_FUNCTION_CODE (fndecl))
{
case IX86_BUILTIN_CMPPD:
case IX86_BUILTIN_CMPPS:
emit_insn (gen_xabort (op0));
return 0;
+ case IX86_BUILTIN_RDSSPD:
+ case IX86_BUILTIN_RDSSPQ:
+ mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
+
+ if (target == 0
+ || !register_operand (target, mode))
+ target = gen_reg_rtx (mode);
+
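+      /* RDSSP leaves its operand untouched when shadow stacks are
+	 disabled, so seed it with zero for a well-defined result.  */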
+ op0 = force_reg (mode, const0_rtx);
+
+ emit_insn (gen_rdssp (mode, target, op0));
+ return target;
+
+ case IX86_BUILTIN_INCSSPD:
+ case IX86_BUILTIN_INCSSPQ:
+ mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
+
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ op0 = expand_normal (arg0);
+
+ op0 = force_reg (mode, op0);
+
+ emit_insn (gen_incssp (mode, op0));
+ return 0;
+
case IX86_BUILTIN_RSTORSSP:
case IX86_BUILTIN_CLRSSBSY:
arg0 = CALL_EXPR_ARG (exp, 0);
op0 = expand_normal (arg0);
icode = (fcode == IX86_BUILTIN_RSTORSSP
- ? CODE_FOR_rstorssp
- : CODE_FOR_clrssbsy);
+ ? CODE_FOR_rstorssp
+ : CODE_FOR_clrssbsy);
+
if (!address_operand (op0, VOIDmode))
{
- op1 = convert_memory_address (Pmode, op0);
- op0 = copy_addr_to_reg (op1);
+ op0 = convert_memory_address (Pmode, op0);
+ op0 = copy_addr_to_reg (op0);
}
- emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
+ emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
return 0;
case IX86_BUILTIN_WRSSD:
case IX86_BUILTIN_WRSSQ:
case IX86_BUILTIN_WRUSSD:
case IX86_BUILTIN_WRUSSQ:
+ mode = ((fcode == IX86_BUILTIN_WRSSD
+ || fcode == IX86_BUILTIN_WRUSSD)
+ ? SImode : DImode);
+
arg0 = CALL_EXPR_ARG (exp, 0);
op0 = expand_normal (arg0);
arg1 = CALL_EXPR_ARG (exp, 1);
op1 = expand_normal (arg1);
- switch (fcode)
- {
- case IX86_BUILTIN_WRSSD:
- icode = CODE_FOR_wrsssi;
- mode = SImode;
- break;
- case IX86_BUILTIN_WRSSQ:
- icode = CODE_FOR_wrssdi;
- mode = DImode;
- break;
- case IX86_BUILTIN_WRUSSD:
- icode = CODE_FOR_wrusssi;
- mode = SImode;
- break;
- case IX86_BUILTIN_WRUSSQ:
- icode = CODE_FOR_wrussdi;
- mode = DImode;
- break;
- }
+
op0 = force_reg (mode, op0);
+
if (!address_operand (op1, VOIDmode))
{
- op2 = convert_memory_address (Pmode, op1);
- op1 = copy_addr_to_reg (op2);
+ op1 = convert_memory_address (Pmode, op1);
+ op1 = copy_addr_to_reg (op1);
}
- emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
+ op1 = gen_rtx_MEM (mode, op1);
+
+ icode = ((fcode == IX86_BUILTIN_WRSSD
+ || fcode == IX86_BUILTIN_WRSSQ)
+ ? code_for_wrss (mode)
+ : code_for_wruss (mode));
+ emit_insn (GEN_FCN (icode) (op0, op1));
+
return 0;
default:
target);
}
- if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
- && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
- {
- i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
- return ix86_expand_special_args_builtin (bdesc_cet_rdssp + i, exp,
- target);
- }
-
gcc_unreachable ();
}
case E_V8HImode:
use_vector_set = TARGET_SSE2;
break;
+ case E_V8QImode:
+ use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
+ break;
case E_V4HImode:
use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
break;
wmode = V8HImode;
goto widen;
case E_V8QImode:
+ if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
+ break;
wmode = V4HImode;
goto widen;
widen:
ix86_expand_vector_init_concat (machine_mode mode,
rtx target, rtx *ops, int n)
{
- machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
- rtx first[16], second[8], third[4];
+ machine_mode half_mode = VOIDmode;
+ rtx half[2];
rtvec v;
int i, j;
switch (mode)
{
case E_V16SImode:
- cmode = V8SImode;
+ half_mode = V8SImode;
break;
case E_V16SFmode:
- cmode = V8SFmode;
+ half_mode = V8SFmode;
break;
case E_V8DImode:
- cmode = V4DImode;
+ half_mode = V4DImode;
break;
case E_V8DFmode:
- cmode = V4DFmode;
+ half_mode = V4DFmode;
break;
case E_V8SImode:
- cmode = V4SImode;
+ half_mode = V4SImode;
break;
case E_V8SFmode:
- cmode = V4SFmode;
+ half_mode = V4SFmode;
break;
case E_V4DImode:
- cmode = V2DImode;
+ half_mode = V2DImode;
break;
case E_V4DFmode:
- cmode = V2DFmode;
+ half_mode = V2DFmode;
break;
case E_V4SImode:
- cmode = V2SImode;
+ half_mode = V2SImode;
break;
case E_V4SFmode:
- cmode = V2SFmode;
+ half_mode = V2SFmode;
break;
case E_V2DImode:
- cmode = DImode;
+ half_mode = DImode;
break;
case E_V2SImode:
- cmode = SImode;
+ half_mode = SImode;
break;
case E_V2DFmode:
- cmode = DFmode;
+ half_mode = DFmode;
break;
case E_V2SFmode:
- cmode = SFmode;
+ half_mode = SFmode;
break;
default:
gcc_unreachable ();
}
- if (!register_operand (ops[1], cmode))
- ops[1] = force_reg (cmode, ops[1]);
- if (!register_operand (ops[0], cmode))
- ops[0] = force_reg (cmode, ops[0]);
+ if (!register_operand (ops[1], half_mode))
+ ops[1] = force_reg (half_mode, ops[1]);
+ if (!register_operand (ops[0], half_mode))
+ ops[0] = force_reg (half_mode, ops[0]);
emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
ops[1])));
break;
switch (mode)
{
case E_V4DImode:
- cmode = V2DImode;
+ half_mode = V2DImode;
break;
case E_V4DFmode:
- cmode = V2DFmode;
+ half_mode = V2DFmode;
break;
case E_V4SImode:
- cmode = V2SImode;
+ half_mode = V2SImode;
break;
case E_V4SFmode:
- cmode = V2SFmode;
+ half_mode = V2SFmode;
break;
default:
gcc_unreachable ();
switch (mode)
{
case E_V8DImode:
- cmode = V2DImode;
- hmode = V4DImode;
+ half_mode = V4DImode;
break;
case E_V8DFmode:
- cmode = V2DFmode;
- hmode = V4DFmode;
+ half_mode = V4DFmode;
break;
case E_V8SImode:
- cmode = V2SImode;
- hmode = V4SImode;
+ half_mode = V4SImode;
break;
case E_V8SFmode:
- cmode = V2SFmode;
- hmode = V4SFmode;
+ half_mode = V4SFmode;
break;
default:
gcc_unreachable ();
switch (mode)
{
case E_V16SImode:
- cmode = V2SImode;
- hmode = V4SImode;
- gmode = V8SImode;
+ half_mode = V8SImode;
break;
case E_V16SFmode:
- cmode = V2SFmode;
- hmode = V4SFmode;
- gmode = V8SFmode;
+ half_mode = V8SFmode;
break;
default:
gcc_unreachable ();
half:
/* FIXME: We process inputs backward to help RA. PR 36222. */
i = n - 1;
- j = (n >> 1) - 1;
- for (; i > 0; i -= 2, j--)
- {
- first[j] = gen_reg_rtx (cmode);
- v = gen_rtvec (2, ops[i - 1], ops[i]);
- ix86_expand_vector_init (false, first[j],
- gen_rtx_PARALLEL (cmode, v));
- }
-
- n >>= 1;
- if (n > 4)
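+      /* Build the high half (j == 1) and then the low half (j == 0),
+	 each from its n/2 input elements, and finally concatenate the
+	 two halves into TARGET.  */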
+ for (j = 1; j != -1; j--)
{
- gcc_assert (hmode != VOIDmode);
- gcc_assert (gmode != VOIDmode);
- for (i = j = 0; i < n; i += 2, j++)
+ half[j] = gen_reg_rtx (half_mode);
+ switch (n >> 1)
{
- second[j] = gen_reg_rtx (hmode);
- ix86_expand_vector_init_concat (hmode, second [j],
- &first [i], 2);
- }
- n >>= 1;
- for (i = j = 0; i < n; i += 2, j++)
- {
- third[j] = gen_reg_rtx (gmode);
- ix86_expand_vector_init_concat (gmode, third[j],
- &second[i], 2);
- }
- n >>= 1;
- ix86_expand_vector_init_concat (mode, target, third, n);
- }
- else if (n > 2)
- {
- gcc_assert (hmode != VOIDmode);
- for (i = j = 0; i < n; i += 2, j++)
- {
- second[j] = gen_reg_rtx (hmode);
- ix86_expand_vector_init_concat (hmode, second [j],
- &first [i], 2);
+ case 2:
+ v = gen_rtvec (2, ops[i-1], ops[i]);
+ i -= 2;
+ break;
+ case 4:
+ v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
+ i -= 4;
+ break;
+ case 8:
+ v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
+ ops[i-3], ops[i-2], ops[i-1], ops[i]);
+ i -= 8;
+ break;
+ default:
+ gcc_unreachable ();
}
- n >>= 1;
- ix86_expand_vector_init_concat (mode, target, second, n);
+ ix86_expand_vector_init (false, half[j],
+ gen_rtx_PARALLEL (half_mode, v));
}
- else
- ix86_expand_vector_init_concat (mode, target, first, n);
+
+ ix86_expand_vector_init_concat (mode, target, half, 2);
break;
default:
switch (mode)
{
- case E_V2SFmode:
case E_V2SImode:
+ use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
+ if (use_vec_merge)
+ break;
+ /* FALLTHRU */
+
+ case E_V2SFmode:
if (mmx_ok)
{
tmp = gen_reg_rtx (GET_MODE_INNER (mode));
break;
case E_V8QImode:
+ use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
break;
case E_V32QImode:
switch (mode)
{
case E_V2SImode:
+ use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
+ if (use_vec_extr)
+ break;
+ /* FALLTHRU */
+
case E_V2SFmode:
if (!mmx_ok)
break;
case E_V16QImode:
use_vec_extr = TARGET_SSE4_1;
+ if (!use_vec_extr
+ && TARGET_SSE2
+ && elt == 0
+ && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
+ {
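+	  /* Without SSE4.1's pextrb, element 0 can still be had by
+	     extracting the containing SImode element and taking its
+	     low byte.  */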
+ tmp = gen_reg_rtx (SImode);
+ ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
+ 0);
+ emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
+ return;
+ }
break;
case E_V8SFmode:
return;
case E_V8QImode:
+ use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
/* ??? Could extract the appropriate HImode element and shift. */
+ break;
+
default:
break;
}
break;
case E_V64QImode:
case E_V32HImode:
+ if (i < 64)
+ {
+ d = gen_reg_rtx (V4TImode);
+ tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
+ GEN_INT (i / 2));
+ break;
+ }
+ /* FALLTHRU */
case E_V16SImode:
case E_V16SFmode:
case E_V8DImode:
case E_V8DFmode:
if (i > 128)
tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
- gen_lowpart (V16SImode, src),
- gen_lowpart (V16SImode, src),
- GEN_INT (0x4 + (i == 512 ? 4 : 0)),
- GEN_INT (0x5 + (i == 512 ? 4 : 0)),
- GEN_INT (0x6 + (i == 512 ? 4 : 0)),
- GEN_INT (0x7 + (i == 512 ? 4 : 0)),
- GEN_INT (0xC), GEN_INT (0xD),
- GEN_INT (0xE), GEN_INT (0xF),
- GEN_INT (0x10), GEN_INT (0x11),
- GEN_INT (0x12), GEN_INT (0x13),
- GEN_INT (0x14), GEN_INT (0x15),
- GEN_INT (0x16), GEN_INT (0x17));
+ gen_lowpart (V16SImode, src),
+ gen_lowpart (V16SImode, src),
+ GEN_INT (0x4 + (i == 512 ? 4 : 0)),
+ GEN_INT (0x5 + (i == 512 ? 4 : 0)),
+ GEN_INT (0x6 + (i == 512 ? 4 : 0)),
+ GEN_INT (0x7 + (i == 512 ? 4 : 0)),
+ GEN_INT (0xC), GEN_INT (0xD),
+ GEN_INT (0xE), GEN_INT (0xF),
+ GEN_INT (0x10), GEN_INT (0x11),
+ GEN_INT (0x12), GEN_INT (0x13),
+ GEN_INT (0x14), GEN_INT (0x15),
+ GEN_INT (0x16), GEN_INT (0x17));
else
tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
- gen_lowpart (V16SImode, src),
- GEN_INT (i == 128 ? 0x2 : 0x1),
- GEN_INT (0x3),
- GEN_INT (0x3),
- GEN_INT (0x3),
- GEN_INT (i == 128 ? 0x6 : 0x5),
- GEN_INT (0x7),
- GEN_INT (0x7),
- GEN_INT (0x7),
- GEN_INT (i == 128 ? 0xA : 0x9),
- GEN_INT (0xB),
- GEN_INT (0xB),
- GEN_INT (0xB),
- GEN_INT (i == 128 ? 0xE : 0xD),
- GEN_INT (0xF),
- GEN_INT (0xF),
- GEN_INT (0xF));
+ gen_lowpart (V16SImode, src),
+ GEN_INT (i == 128 ? 0x2 : 0x1),
+ GEN_INT (0x3),
+ GEN_INT (0x3),
+ GEN_INT (0x3),
+ GEN_INT (i == 128 ? 0x6 : 0x5),
+ GEN_INT (0x7),
+ GEN_INT (0x7),
+ GEN_INT (0x7),
+ GEN_INT (i == 128 ? 0xA : 0x9),
+ GEN_INT (0xB),
+ GEN_INT (0xB),
+ GEN_INT (0xB),
+ GEN_INT (i == 128 ? 0xE : 0xD),
+ GEN_INT (0xF),
+ GEN_INT (0xF),
+ GEN_INT (0xF));
break;
default:
gcc_unreachable ();
}
}
+ mthree = force_reg (mode, mthree);
+
/* e0 = x0 * a */
emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
- /* e1 = e0 * x0 */
- emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
- /* e2 = e1 - 3. */
- mthree = force_reg (mode, mthree);
- emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
+ unsigned vector_size = GET_MODE_SIZE (mode);
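+  /* With FMA, or with AVX512 at this vector size, compute e1 and e2
+     in one fused step: e2 = e0 * x0 - 3.  */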
+ if (TARGET_FMA
+ || (TARGET_AVX512F && vector_size == 64)
+ || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
+ emit_insn (gen_rtx_SET (e2,
+ gen_rtx_FMA (mode, e0, x0, mthree)));
+ else
+ {
+ /* e1 = e0 * x0 */
+ emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
+
+ /* e2 = e1 - 3. */
+ emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
+ }
mhalf = force_reg (mode, mhalf);
if (recip)
emit_move_insn (operand0, res);
}
-/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
- into OPERAND0. */
+/* Expand SSE2 sequence for computing floor or ceil
+ from OPERAND1 storing into OPERAND0. */
void
-ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
+ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
{
/* C code for the stuff we expand below.
- double xa = fabs (x), x2;
+ double xa = fabs (x), x2;
if (!isless (xa, TWO52))
return x;
- xa = xa + TWO52 - TWO52;
- x2 = copysign (xa, x);
+ x2 = (double)(long)x;
Compensate. Floor:
- if (x2 > x)
- x2 -= 1;
+ if (x2 > x)
+ x2 -= 1;
Compensate. Ceil:
- if (x2 < x)
- x2 += 1;
+ if (x2 < x)
+ x2 += 1;
if (HONOR_SIGNED_ZEROS (mode))
- x2 = copysign (x2, x);
+ return copysign (x2, x);
return x2;
*/
machine_mode mode = GET_MODE (operand0);
- rtx xa, TWO52, tmp, one, res, mask;
+ rtx xa, xi, TWO52, tmp, one, res, mask;
rtx_code_label *label;
TWO52 = ix86_gen_TWO52 (mode);
/* if (!isless (xa, TWO52)) goto label; */
label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
- /* xa = xa + TWO52 - TWO52; */
- xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
- xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
-
- /* xa = copysign (xa, operand1) */
- ix86_sse_copysign_to_positive (xa, xa, res, mask);
+ /* xa = (double)(long)x */
+ xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
+ expand_fix (xi, res, 0);
+ expand_float (xa, xi, 0);
/* generate 1.0 */
one = force_reg (mode, const_double_from_real_value (dconst1, mode));
emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
- if (!do_floor && HONOR_SIGNED_ZEROS (mode))
- ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
emit_move_insn (res, tmp);
+ if (HONOR_SIGNED_ZEROS (mode))
+ ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
+
emit_label (label);
LABEL_NUSES (label) = 1;
}
/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
- into OPERAND0. */
+ into OPERAND0 without relying on DImode truncation via cvttsd2siq
+ that is only available on 64bit targets. */
void
-ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
+ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
{
/* C code for the stuff we expand below.
- double xa = fabs (x), x2;
+ double xa = fabs (x), x2;
if (!isless (xa, TWO52))
return x;
- x2 = (double)(long)x;
+ xa = xa + TWO52 - TWO52;
+ x2 = copysign (xa, x);
Compensate. Floor:
- if (x2 > x)
- x2 -= 1;
+ if (x2 > x)
+ x2 -= 1;
Compensate. Ceil:
- if (x2 < x)
- x2 += 1;
+ if (x2 < x)
+ x2 += 1;
if (HONOR_SIGNED_ZEROS (mode))
- return copysign (x2, x);
+ x2 = copysign (x2, x);
return x2;
*/
machine_mode mode = GET_MODE (operand0);
- rtx xa, xi, TWO52, tmp, one, res, mask;
+ rtx xa, TWO52, tmp, one, res, mask;
rtx_code_label *label;
TWO52 = ix86_gen_TWO52 (mode);
/* if (!isless (xa, TWO52)) goto label; */
label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
- /* xa = (double)(long)x */
- xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
- expand_fix (xi, res, 0);
- expand_float (xa, xi, 0);
+ /* xa = xa + TWO52 - TWO52; */
+ xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
+ xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
+
+ /* xa = copysign (xa, operand1) */
+ ix86_sse_copysign_to_positive (xa, xa, res, mask);
/* generate 1.0 */
one = force_reg (mode, const_double_from_real_value (dconst1, mode));
emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
+ if (!do_floor && HONOR_SIGNED_ZEROS (mode))
+ ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
emit_move_insn (res, tmp);
- if (HONOR_SIGNED_ZEROS (mode))
- ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
-
emit_label (label);
LABEL_NUSES (label) = 1;
emit_move_insn (operand0, res);
}
-/* Expand SSE sequence for computing round from OPERAND1 storing
- into OPERAND0. Sequence that works without relying on DImode truncation
- via cvttsd2siq that is only available on 64bit targets. */
-void
-ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
-{
- /* C code for the stuff we expand below.
- double xa = fabs (x), xa2, x2;
- if (!isless (xa, TWO52))
- return x;
- Using the absolute value and copying back sign makes
- -0.0 -> -0.0 correct.
- xa2 = xa + TWO52 - TWO52;
- Compensate.
- dxa = xa2 - xa;
- if (dxa <= -0.5)
- xa2 += 1;
- else if (dxa > 0.5)
- xa2 -= 1;
- x2 = copysign (xa2, x);
- return x2;
- */
- machine_mode mode = GET_MODE (operand0);
- rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
- rtx_code_label *label;
-
- TWO52 = ix86_gen_TWO52 (mode);
-
- /* Temporary for holding the result, initialized to the input
- operand to ease control flow. */
- res = gen_reg_rtx (mode);
- emit_move_insn (res, operand1);
-
- /* xa = abs (operand1) */
- xa = ix86_expand_sse_fabs (res, &mask);
-
- /* if (!isless (xa, TWO52)) goto label; */
- label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
-
- /* xa2 = xa + TWO52 - TWO52; */
- xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
- xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
-
- /* dxa = xa2 - xa; */
- dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
-
- /* generate 0.5, 1.0 and -0.5 */
- half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
- one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
- mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
- 0, OPTAB_DIRECT);
-
- /* Compensate. */
- /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
- tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
- emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
- xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
- /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
- tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
- emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
- xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
-
- /* res = copysign (xa2, operand1) */
- ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
-
- emit_label (label);
- LABEL_NUSES (label) = 1;
-
- emit_move_insn (operand0, res);
-}
-
-/* Expand SSE sequence for computing trunc from OPERAND1 storing
- into OPERAND0. */
+/* Expand SSE sequence for computing trunc
+ from OPERAND1 storing into OPERAND0. */
void
ix86_expand_trunc (rtx operand0, rtx operand1)
{
}
/* Expand SSE sequence for computing trunc from OPERAND1 storing
- into OPERAND0. */
+ into OPERAND0 without relying on DImode truncation via cvttsd2siq
+ that is only available on 64bit targets. */
void
ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
{
emit_move_insn (operand0, res);
}
-/* Expand SSE sequence for computing round from OPERAND1 storing
- into OPERAND0. */
+/* Expand SSE sequence for computing round
+ from OPERAND1 storing into OPERAND0. */
void
ix86_expand_round (rtx operand0, rtx operand1)
{
emit_move_insn (operand0, res);
}
+/* Expand SSE sequence for computing round from OPERAND1 storing
+ into OPERAND0 without relying on DImode truncation via cvttsd2siq
+ that is only available on 64bit targets. */
+void
+ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
+{
+ /* C code for the stuff we expand below.
+ double xa = fabs (x), xa2, x2;
+ if (!isless (xa, TWO52))
+ return x;
+ Using the absolute value and copying back sign makes
+ -0.0 -> -0.0 correct.
+ xa2 = xa + TWO52 - TWO52;
+ Compensate.
+ dxa = xa2 - xa;
+ if (dxa <= -0.5)
+ xa2 += 1;
+ else if (dxa > 0.5)
+ xa2 -= 1;
+ x2 = copysign (xa2, x);
+ return x2;
+ */
+ machine_mode mode = GET_MODE (operand0);
+ rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
+ rtx_code_label *label;
+
+ TWO52 = ix86_gen_TWO52 (mode);
+
+ /* Temporary for holding the result, initialized to the input
+ operand to ease control flow. */
+ res = gen_reg_rtx (mode);
+ emit_move_insn (res, operand1);
+
+ /* xa = abs (operand1) */
+ xa = ix86_expand_sse_fabs (res, &mask);
+
+ /* if (!isless (xa, TWO52)) goto label; */
+ label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+ /* xa2 = xa + TWO52 - TWO52; */
+ xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
+ xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
+
+ /* dxa = xa2 - xa; */
+ dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
+
+ /* generate 0.5, 1.0 and -0.5 */
+ half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
+ one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
+ mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
+ 0, OPTAB_DIRECT);
+
+ /* Compensate. */
+ /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
+ tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
+ emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
+ xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
+ /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
+ tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
+ emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
+ xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
+
+ /* res = copysign (xa2, operand1) */
+ ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
+
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+
+ emit_move_insn (operand0, res);
+}
+
/* Expand SSE sequence for computing round
from OP1 storing into OP0 using sse4 round insn. */
void
return ok;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
using movss or movsd. */
static bool
expand_vec_perm_movs (struct expand_vec_perm_d *d)
return false;
if (!(TARGET_SSE && vmode == V4SFmode)
+ && !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
&& !(TARGET_SSE2 && vmode == V2DFmode))
return false;
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
static bool
expand_vec_perm_blend (struct expand_vec_perm_d *d)
{
machine_mode mmode, vmode = d->vmode;
- unsigned i, mask, nelt = d->nelt;
+ unsigned i, nelt = d->nelt;
+ unsigned HOST_WIDE_INT mask;
rtx target, op0, op1, maskop, x;
rtx rperm[32], vperm;
case E_V16SImode:
case E_V8DImode:
for (i = 0; i < nelt; ++i)
- mask |= (d->perm[i] >= nelt) << i;
+ mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
break;
case E_V2DImode:
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
in terms of the variable form of vpermilps.
Note that we will have already failed the immediate input vpermilps,
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
static bool
/* vpshufb only works intra lanes, it is not
possible to shuffle bytes in between the lanes. */
for (i = 0; i < nelt; ++i)
- if ((d->perm[i] ^ i) & (nelt / 4))
+ if ((d->perm[i] ^ i) & (3 * nelt / 4))
return false;
}
}
static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
in a single instruction. */
static bool
return false;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
in terms of a pair of pshuflw + pshufhw instructions. */
static bool
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
the permutation using the SSSE3 palignr instruction. This succeeds
when all of the elements in PERM fit within one vector and we merely
need to shift them down so that a single vector permutation has a
static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
a two vector permutation into a single vector permutation by using
an interleave operation to merge the vectors. */
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
a single vector cross-lane permutation into vpermq followed
by any of the single insn permutations. */
static bool canonicalize_perm (struct expand_vec_perm_d *d);
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
a vector permutation using two instructions, vperm2f128 resp.
vperm2i128 followed by any single in-lane permutation. */
return false;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
a two vector permutation using 2 intra-lane interleave insns
and cross-lane shuffle for 32-byte vectors. */
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
a single vector permutation using a single intra-lane vector
permutation, vperm2f128 swapping the lanes and vblend* insn blending
the non-swapped and swapped vectors together. */
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
+/* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
permutation using two vperm2f128, followed by a vshufpd insn blending
the two vectors together. */
return true;
}
+static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
+
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
+ a two vector permutation using two intra-lane vector
+ permutations, vperm2f128 swapping the lanes and vblend* insn blending
+ the non-swapped and swapped vectors together. */
+
+static bool
+expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
+{
+ struct expand_vec_perm_d dfirst, dsecond, dthird;
+ unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
+ rtx_insn *seq1, *seq2;
+ bool ok;
+ rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
+
+ if (!TARGET_AVX
+ || TARGET_AVX2
+ || (d->vmode != V8SFmode && d->vmode != V4DFmode)
+ || d->one_operand_p)
+ return false;
+
+ dfirst = *d;
+ dsecond = *d;
+ for (i = 0; i < nelt; i++)
+ {
+ dfirst.perm[i] = 0xff;
+ dsecond.perm[i] = 0xff;
+ }
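+  /* Elements already in the right lane go to DFIRST; the rest go to
+     DSECOND at the position a later lane swap will move them from,
+     with MSK recording which final positions come from the swapped
+     vector.  */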
+ for (i = 0, msk = 0; i < nelt; i++)
+ {
+ j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
+ if (j == i)
+ {
+ dfirst.perm[j] = d->perm[i];
+ which1 |= (d->perm[i] < nelt ? 1 : 2);
+ }
+ else
+ {
+ dsecond.perm[j] = d->perm[i];
+ which2 |= (d->perm[i] < nelt ? 1 : 2);
+ msk |= (1U << i);
+ }
+ }
+ if (msk == 0 || msk == (1U << nelt) - 1)
+ return false;
+
+ if (!d->testing_p)
+ {
+ dfirst.target = gen_reg_rtx (dfirst.vmode);
+ dsecond.target = gen_reg_rtx (dsecond.vmode);
+ }
+
+ for (i = 0; i < nelt; i++)
+ {
+ if (dfirst.perm[i] == 0xff)
+ dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
+ if (dsecond.perm[i] == 0xff)
+ dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
+ }
+ canonicalize_perm (&dfirst);
+ start_sequence ();
+ ok = ix86_expand_vec_perm_const_1 (&dfirst);
+ seq1 = get_insns ();
+ end_sequence ();
+
+ if (!ok)
+ return false;
+
+ canonicalize_perm (&dsecond);
+ start_sequence ();
+ ok = ix86_expand_vec_perm_const_1 (&dsecond);
+ seq2 = get_insns ();
+ end_sequence ();
+
+ if (!ok)
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ emit_insn (seq1);
+ emit_insn (seq2);
+
+ dthird = *d;
+ dthird.op0 = dsecond.target;
+ dthird.op1 = dsecond.target;
+ dthird.one_operand_p = true;
+ dthird.target = gen_reg_rtx (dthird.vmode);
+ for (i = 0; i < nelt; i++)
+ dthird.perm[i] = i ^ nelt2;
+
+ ok = expand_vec_perm_1 (&dthird);
+ gcc_assert (ok);
+
+ blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
+ emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
+ return true;
+}
+
/* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
permutation with two pshufb insns and an ior. We should have already
failed all two instruction sequences. */
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
+/* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
and extract-odd permutations. */
static bool
case E_V2DFmode:
case E_V4SFmode:
case E_V2DImode:
+ case E_V2SImode:
case E_V4SImode:
/* These are always directly implementable by expand_vec_perm_1. */
gcc_unreachable ();
+ case E_V2SFmode:
+ gcc_assert (TARGET_MMX_WITH_SSE);
+ /* We have no suitable instructions. */
+ if (d->testing_p)
+ return false;
+ break;
+
+ case E_V4HImode:
+ if (d->testing_p)
+ break;
+ /* We need 2*log2(N)-1 operations to achieve odd/even
+ with interleave. */
+ t1 = gen_reg_rtx (V4HImode);
+ emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
+ emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
+ if (odd)
+ t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
+ else
+ t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
+ emit_insn (t2);
+ break;
+
case E_V8HImode:
if (TARGET_SSE4_1)
return expand_vec_perm_even_odd_pack (d);
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
+/* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
extract-even and extract-odd permutations. */
static bool
return expand_vec_perm_even_odd_1 (d, odd);
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
+/* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
permutations. We assume that expand_vec_perm_1 has already failed. */
static bool
gcc_unreachable ();
case E_V2DFmode:
- case E_V2DImode:
+ case E_V2SFmode:
case E_V4SFmode:
+ case E_V2DImode:
+ case E_V2SImode:
case E_V4SImode:
/* These are always implementable using standard shuffle patterns. */
gcc_unreachable ();
}
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
+/* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
broadcast permutations. */
static bool
return true;
}
+ /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
+ if (expand_vec_perm2_vperm2f128_vblend (d))
+ return true;
+
return false;
}
int i, which, nelt = d->nelt;
for (i = which = 0; i < nelt; ++i)
- which |= (d->perm[i] < nelt ? 1 : 2);
+ which |= (d->perm[i] < nelt ? 1 : 2);
d->one_operand_p = true;
switch (which)
if (d.testing_p && TARGET_SSSE3)
return true;
break;
+ case E_V2SFmode:
+ case E_V2SImode:
+ case E_V4HImode:
+ if (!TARGET_MMX_WITH_SSE)
+ return false;
+ break;
case E_V2DImode:
case E_V2DFmode:
if (!TARGET_SSE)
d.one_operand_p = (which != 3);
/* Implementable with shufps or pshufd. */
- if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
+ if (d.one_operand_p
+ && (d.vmode == V4SFmode || d.vmode == V2SFmode
+ || d.vmode == V4SImode || d.vmode == V2SImode))
return true;
/* Otherwise we have to go through the motions and see if we can
gcc_assert (ok);
}
+/* Optimize vector MUL generation for V8QI, V16QI and V32QI
+   under TARGET_AVX512BW.  E.g. for v16qi a * b, it emits
+
+ vpmovzxbw ymm2, xmm0
+ vpmovzxbw ymm3, xmm1
+ vpmullw ymm4, ymm2, ymm3
+ vpmovwb xmm0, ymm4
+
+   which takes fewer instructions than ix86_expand_vecop_qihi.
+   Return true on success.  */
+
+bool
+ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2)
+{
+ machine_mode himode, qimode = GET_MODE (dest);
+ rtx hop1, hop2, hdest;
+ rtx (*gen_extend)(rtx, rtx);
+ rtx (*gen_truncate)(rtx, rtx);
+
+ /* There's no V64HImode multiplication instruction. */
+ if (qimode == E_V64QImode)
+ return false;
+
+ /* vpmovwb only available under AVX512BW. */
+ if (!TARGET_AVX512BW)
+ return false;
+ if ((qimode == V8QImode || qimode == V16QImode)
+ && !TARGET_AVX512VL)
+ return false;
+  /* Don't generate a zmm instruction when 128/256-bit vector width
+     is preferred.  */
+ if (qimode == V32QImode
+ && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
+ return false;
+
+ switch (qimode)
+ {
+ case E_V8QImode:
+ himode = V8HImode;
+ gen_extend = gen_zero_extendv8qiv8hi2;
+ gen_truncate = gen_truncv8hiv8qi2;
+ break;
+ case E_V16QImode:
+ himode = V16HImode;
+ gen_extend = gen_zero_extendv16qiv16hi2;
+ gen_truncate = gen_truncv16hiv16qi2;
+ break;
+ case E_V32QImode:
+ himode = V32HImode;
+ gen_extend = gen_zero_extendv32qiv32hi2;
+ gen_truncate = gen_truncv32hiv32qi2;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ hop1 = gen_reg_rtx (himode);
+ hop2 = gen_reg_rtx (himode);
+ hdest = gen_reg_rtx (himode);
+ emit_insn (gen_extend (hop1, op1));
+ emit_insn (gen_extend (hop2, op2));
+ emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (MULT, himode,
+ hop1, hop2)));
+ emit_insn (gen_truncate (dest, hdest));
+ return true;
+}
+
+/* Expand a vector operation shift by constant for a V*QImode in terms of the
+   same operation on V*HImode.  Return true on success.  */
+bool
+ix86_expand_vec_shift_qihi_constant (enum rtx_code code, rtx dest, rtx op1, rtx op2)
+{
+ machine_mode qimode, himode;
+ HOST_WIDE_INT and_constant, xor_constant;
+ HOST_WIDE_INT shift_amount;
+ rtx vec_const_and, vec_const_xor;
+ rtx tmp, op1_subreg;
+ rtx (*gen_shift) (rtx, rtx, rtx);
+ rtx (*gen_and) (rtx, rtx, rtx);
+ rtx (*gen_xor) (rtx, rtx, rtx);
+ rtx (*gen_sub) (rtx, rtx, rtx);
+
+ /* Only optimize shift by constant. */
+ if (!CONST_INT_P (op2))
+ return false;
+
+ qimode = GET_MODE (dest);
+ shift_amount = INTVAL (op2);
+  /* Do nothing when the shift amount is greater than or equal to 8.  */
+ if (shift_amount > 7)
+ return false;
+
+ gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
+ /* Record sign bit. */
+ xor_constant = 1 << (8 - shift_amount - 1);
+
+  /* Mask that zeros, in each byte, the bits that the HImode shift
+     pulls in from the neighboring byte.  */
+ and_constant
+ = (code == ASHIFT ? 256 - (1 << shift_amount)
+ : (1 << (8 - shift_amount)) - 1);
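+  /* E.g. for shift_amount == 2, and_constant is 0xfc for ASHIFT and
+     0x3f for the right shifts, and xor_constant is 1 << 5 == 0x20;
+     ASHIFTRT then sign-extends with (x ^ 0x20) - 0x20.  */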
+
+ switch (qimode)
+ {
+    case E_V16QImode:
+ himode = V8HImode;
+ gen_shift =
+ ((code == ASHIFT)
+ ? gen_ashlv8hi3
+ : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
+ gen_and = gen_andv16qi3;
+ gen_xor = gen_xorv16qi3;
+ gen_sub = gen_subv16qi3;
+ break;
+    case E_V32QImode:
+ himode = V16HImode;
+ gen_shift =
+ ((code == ASHIFT)
+ ? gen_ashlv16hi3
+ : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
+ gen_and = gen_andv32qi3;
+ gen_xor = gen_xorv32qi3;
+ gen_sub = gen_subv32qi3;
+ break;
+    case E_V64QImode:
+ himode = V32HImode;
+ gen_shift =
+ ((code == ASHIFT)
+ ? gen_ashlv32hi3
+ : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
+ gen_and = gen_andv64qi3;
+ gen_xor = gen_xorv64qi3;
+ gen_sub = gen_subv64qi3;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ tmp = gen_reg_rtx (himode);
+ vec_const_and = gen_reg_rtx (qimode);
+ op1_subreg = lowpart_subreg (himode, op1, qimode);
+
+ /* For ASHIFT and LSHIFTRT, perform operation like
+ vpsllw/vpsrlw $shift_amount, %op1, %dest.
+ vpand %vec_const_and, %dest. */
+ emit_insn (gen_shift (tmp, op1_subreg, op2));
+ emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
+ emit_move_insn (vec_const_and,
+ ix86_build_const_vector (qimode, true,
+ gen_int_mode (and_constant, QImode)));
+ emit_insn (gen_and (dest, dest, vec_const_and));
+
+ /* For ASHIFTRT, perform extra operation like
+ vpxor %vec_const_xor, %dest, %dest
+ vpsubb %vec_const_xor, %dest, %dest */
+ if (code == ASHIFTRT)
+ {
+ vec_const_xor = gen_reg_rtx (qimode);
+ emit_move_insn (vec_const_xor,
+ ix86_build_const_vector (qimode, true,
+ gen_int_mode (xor_constant, QImode)));
+ emit_insn (gen_xor (dest, dest, vec_const_xor));
+ emit_insn (gen_sub (dest, dest, vec_const_xor));
+ }
+ return true;
+}
/* Expand a vector operation CODE for a V*QImode in terms of the
same operation on V*HImode. */
should be encoded with notrack prefix. */
bool
-ix86_notrack_prefixed_insn_p (rtx insn)
+ix86_notrack_prefixed_insn_p (rtx_insn *insn)
{
if (!insn || !((flag_cf_protection & CF_BRANCH)))
return false;
case E_V4SImode:
case E_V2DImode:
case E_V1TImode:
- case E_TImode:
{
machine_mode srcmode, dstmode;
rtx d, pat;
case E_V4SImode:
case E_V2DImode:
case E_V1TImode:
- case E_TImode:
{
machine_mode srcmode, dstmode;
rtx (*pinsr)(rtx, rtx, rtx, rtx);