-/* Copyright (C) 1988-2019 Free Software Foundation, Inc.
+/* Copyright (C) 1988-2020 Free Software Foundation, Inc.
This file is part of GCC.
#include "gimplify.h"
#include "dwarf2.h"
#include "tm-constrs.h"
-#include "params.h"
#include "cselib.h"
#include "sched-int.h"
#include "opts.h"
{
machine_mode half_mode;
unsigned int byte;
+ rtx mem_op = NULL_RTX;
+ int mem_num = 0;
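+  /* Remember the first memory operand we split so that a second
+     occurrence of the same memory RTX can reuse its halves instead
+     of calling adjust_address again.  */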
switch (mode)
{
case E_DImode:
half_mode = SImode;
break;
+ case E_P2HImode:
+ half_mode = HImode;
+ break;
+ case E_P2QImode:
+ half_mode = QImode;
+ break;
default:
gcc_unreachable ();
}
but we still have to handle it. */
if (MEM_P (op))
{
- lo_half[num] = adjust_address (op, half_mode, 0);
- hi_half[num] = adjust_address (op, half_mode, byte);
+ if (mem_op && rtx_equal_p (op, mem_op))
+ {
+ lo_half[num] = lo_half[mem_num];
+ hi_half[num] = hi_half[mem_num];
+ }
+ else
+ {
+ mem_op = op;
+ mem_num = num;
+ lo_half[num] = adjust_address (op, half_mode, 0);
+ hi_half[num] = adjust_address (op, half_mode, byte);
+ }
}
else
{
JUMP_LABEL (insn) = qimode_label;
/* Generate original signed/unsigned divmod.  */
- div = gen_divmod4_1 (operands[0], operands[1],
- operands[2], operands[3]);
- emit_insn (div);
+ emit_insn (gen_divmod4_1 (operands[0], operands[1],
+ operands[2], operands[3]));
/* Branch to the end. */
emit_jump_insn (gen_jump (end_label));
}
/* Extract remainder from AH. */
- tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]),
- tmp0, GEN_INT (8), GEN_INT (8));
- if (REG_P (operands[1]))
- insn = emit_move_insn (operands[1], tmp1);
- else
- {
- /* Need a new scratch register since the old one has result
- of 8bit divide. */
- scratch = gen_reg_rtx (GET_MODE (operands[1]));
- emit_move_insn (scratch, tmp1);
- insn = emit_move_insn (operands[1], scratch);
- }
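+  /* The 8-bit divide left the quotient in AL and the remainder in AH;
+     bits [8,16) of the scratch register hold the remainder.  */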
+ scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
+ tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
+ GEN_INT (8), GEN_INT (8));
+ insn = emit_move_insn (operands[1], tmp1);
set_unique_reg_note (insn, REG_EQUAL, mod);
/* Zero extend quotient from AL. */
OPTAB_DIRECT);
else
{
- rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
+ rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
two31 = ix86_build_const_vector (intmode, 1, two31);
*xorp = expand_simple_binop (intmode, AND,
gen_lowpart (intmode, tmp[0]),
machine_mode vmode = mode;
rtvec par;
- if (vector_mode)
- use_sse = true;
- else if (mode == TFmode)
+ if (vector_mode || mode == TFmode)
use_sse = true;
else if (TARGET_SSE_MATH)
{
Create the appropriate mask now. */
mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
use = gen_rtx_USE (VOIDmode, mask);
- if (vector_mode)
+ if (vector_mode || mode == TFmode)
par = gen_rtvec (2, set, use);
else
{
switch (code)
{
- case GT:
- case GE:
case LT:
case LE:
+ case GT:
+ case GE:
+ case LTGT:
return false;
case EQ:
case NE:
- case LTGT:
case UNORDERED:
case ORDERED:
case UNLT:
{
gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
- /* We may be reversing unordered compare to normal compare, that
- is not valid in general (we may convert non-trapping condition
- to trapping one), however on i386 we currently emit all
- comparisons unordered. */
- new_code = reverse_condition_maybe_unordered (code);
+ /* We may be reversing a non-trapping
+ comparison to a trapping comparison. */
+ if (HONOR_NANS (cmp_mode) && flag_trapping_math
+ && code != EQ && code != NE
+ && code != ORDERED && code != UNORDERED)
+ new_code = UNKNOWN;
+ else
+ new_code = reverse_condition_maybe_unordered (code);
}
else
new_code = ix86_reverse_condition (code, cmp_mode);
}
if (cf != 0)
{
- tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
+ tmp = plus_constant (mode, tmp, cf);
nops++;
}
if (!rtx_equal_p (tmp, out))
{
gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
- /* We may be reversing unordered compare to normal compare,
- that is not valid in general (we may convert non-trapping
- condition to trapping one), however on i386 we currently
- emit all comparisons unordered. */
- new_code = reverse_condition_maybe_unordered (code);
+ /* We may be reversing a non-trapping
+ comparison to a trapping comparison. */
+ if (HONOR_NANS (cmp_mode) && flag_trapping_math
+ && code != EQ && code != NE
+ && code != ORDERED && code != UNORDERED)
+ new_code = UNKNOWN;
+ else
+ new_code = reverse_condition_maybe_unordered (code);
+
}
else
{
{
var = operands[2];
if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
- operands[2] = constm1_rtx, op = and_optab;
+ {
+ /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
+ "x <= 0 ? x : 0" to enable sign_bit_compare_p. */
+ if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
+ operands[1] = simplify_gen_relational (LT, VOIDmode,
+ GET_MODE (op0),
+ op0, const0_rtx);
+
+ operands[2] = constm1_rtx;
+ op = and_optab;
+ }
else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
operands[2] = const0_rtx, op = ior_optab;
else
return true;
}
+/* Return true if MODE is valid for a vector compare to a mask register;
+   the same holds for a conditional vector move with a mask register.  */
+static bool
+ix86_valid_mask_cmp_mode (machine_mode mode)
+{
+ /* XOP has its own vector conditional movement. */
+ if (TARGET_XOP && !TARGET_AVX512F)
+ return false;
+
+  /* AVX512F is needed for mask operations.  */
+ if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
+ return false;
+
+  /* AVX512BW is needed for vector QI/HImode,
+     AVX512VL is needed for 128/256-bit vectors.  */
+ machine_mode inner_mode = GET_MODE_INNER (mode);
+ int vector_size = GET_MODE_SIZE (mode);
+ if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
+ return false;
+
+ return vector_size == 64 || TARGET_AVX512VL;
+}
+
/* Expand an SSE comparison. Return the register with the result. */
static rtx
bool maskcmp = false;
rtx x;
- if (GET_MODE_SIZE (cmp_ops_mode) == 64)
+ if (ix86_valid_mask_cmp_mode (cmp_ops_mode))
{
unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
- cmp_mode = int_mode_for_size (nbits, 0).require ();
maskcmp = true;
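+      /* The mask has one bit per vector element and QImode is the
+	 narrowest scalar integer mode, so masks with fewer than 8
+	 elements still use QImode.  */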
+ cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
}
else
cmp_mode = cmp_ops_mode;
|| (op_false && reg_overlap_mentioned_p (dest, op_false)))
dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
- /* Compare patterns for int modes are unspec in AVX512F only. */
- if (maskcmp && (code == GT || code == EQ))
+ if (maskcmp)
{
- rtx (*gen)(rtx, rtx, rtx);
-
- switch (cmp_ops_mode)
- {
- case E_V64QImode:
- gcc_assert (TARGET_AVX512BW);
- gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
- break;
- case E_V32HImode:
- gcc_assert (TARGET_AVX512BW);
- gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
- break;
- case E_V16SImode:
- gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
- break;
- case E_V8DImode:
- gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
- break;
- default:
- gen = NULL;
- }
-
- if (gen)
- {
- emit_insn (gen (dest, cmp_op0, cmp_op1));
- return dest;
- }
+ bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
+ gcc_assert (ok);
+ return dest;
}
+
x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
if (cmp_mode != mode && !maskcmp)
machine_mode cmpmode = GET_MODE (cmp);
/* In AVX512F the result of comparison is an integer mask. */
- bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
+ bool maskcmp = mode != cmpmode && ix86_valid_mask_cmp_mode (mode);
rtx t2, t3, x;
if (maskcmp)
{
- rtx (*gen) (rtx, rtx) = NULL;
- if ((op_true == CONST0_RTX (mode)
- && vector_all_ones_operand (op_false, mode))
- || (op_false == CONST0_RTX (mode)
- && vector_all_ones_operand (op_true, mode)))
- switch (mode)
- {
- case E_V64QImode:
- if (TARGET_AVX512BW)
- gen = gen_avx512bw_cvtmask2bv64qi;
- break;
- case E_V32QImode:
- if (TARGET_AVX512VL && TARGET_AVX512BW)
- gen = gen_avx512vl_cvtmask2bv32qi;
- break;
- case E_V16QImode:
- if (TARGET_AVX512VL && TARGET_AVX512BW)
- gen = gen_avx512vl_cvtmask2bv16qi;
- break;
- case E_V32HImode:
- if (TARGET_AVX512BW)
- gen = gen_avx512bw_cvtmask2wv32hi;
- break;
- case E_V16HImode:
- if (TARGET_AVX512VL && TARGET_AVX512BW)
- gen = gen_avx512vl_cvtmask2wv16hi;
- break;
- case E_V8HImode:
- if (TARGET_AVX512VL && TARGET_AVX512BW)
- gen = gen_avx512vl_cvtmask2wv8hi;
- break;
- case E_V16SImode:
- if (TARGET_AVX512DQ)
- gen = gen_avx512f_cvtmask2dv16si;
- break;
- case E_V8SImode:
- if (TARGET_AVX512VL && TARGET_AVX512DQ)
- gen = gen_avx512vl_cvtmask2dv8si;
- break;
- case E_V4SImode:
- if (TARGET_AVX512VL && TARGET_AVX512DQ)
- gen = gen_avx512vl_cvtmask2dv4si;
- break;
- case E_V8DImode:
- if (TARGET_AVX512DQ)
- gen = gen_avx512f_cvtmask2qv8di;
- break;
- case E_V4DImode:
- if (TARGET_AVX512VL && TARGET_AVX512DQ)
- gen = gen_avx512vl_cvtmask2qv4di;
- break;
- case E_V2DImode:
- if (TARGET_AVX512VL && TARGET_AVX512DQ)
- gen = gen_avx512vl_cvtmask2qv2di;
- break;
- default:
- break;
- }
- if (gen && SCALAR_INT_MODE_P (cmpmode))
- {
- cmp = force_reg (cmpmode, cmp);
- if (op_true == CONST0_RTX (mode))
+ /* Using vector move with mask register. */
+ cmp = force_reg (cmpmode, cmp);
+ /* Optimize for mask zero. */
+ op_true = (op_true != CONST0_RTX (mode)
+ ? force_reg (mode, op_true) : op_true);
+ op_false = (op_false != CONST0_RTX (mode)
+ ? force_reg (mode, op_false) : op_false);
+ if (op_true == CONST0_RTX (mode))
+ {
+ rtx (*gen_not) (rtx, rtx);
+ switch (cmpmode)
{
- rtx (*gen_not) (rtx, rtx);
- switch (cmpmode)
- {
- case E_QImode: gen_not = gen_knotqi; break;
- case E_HImode: gen_not = gen_knothi; break;
- case E_SImode: gen_not = gen_knotsi; break;
- case E_DImode: gen_not = gen_knotdi; break;
- default: gcc_unreachable ();
- }
- rtx n = gen_reg_rtx (cmpmode);
- emit_insn (gen_not (n, cmp));
- cmp = n;
+ case E_QImode: gen_not = gen_knotqi; break;
+ case E_HImode: gen_not = gen_knothi; break;
+ case E_SImode: gen_not = gen_knotsi; break;
+ case E_DImode: gen_not = gen_knotdi; break;
+ default: gcc_unreachable ();
}
- emit_insn (gen (dest, cmp));
- return;
+ rtx n = gen_reg_rtx (cmpmode);
+ emit_insn (gen_not (n, cmp));
+ cmp = n;
+	  /* The mask is inverted, so swap op_true and op_false.  */
+ std::swap (op_true, op_false);
}
+
+ rtx vec_merge = gen_rtx_VEC_MERGE (mode, op_true, op_false, cmp);
+ emit_insn (gen_rtx_SET (dest, vec_merge));
+ return;
}
else if (vector_all_ones_operand (op_true, mode)
&& op_false == CONST0_RTX (mode))
/* Expand AVX-512 vector comparison. */
bool
-ix86_expand_mask_vec_cmp (rtx operands[])
+ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
{
- machine_mode mask_mode = GET_MODE (operands[0]);
- machine_mode cmp_mode = GET_MODE (operands[2]);
- enum rtx_code code = GET_CODE (operands[1]);
+ machine_mode mask_mode = GET_MODE (dest);
+ machine_mode cmp_mode = GET_MODE (cmp_op0);
rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
int unspec_code;
rtx unspec;
unspec_code = UNSPEC_PCMP;
}
- unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
- operands[3], imm),
+ unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
unspec_code);
- emit_insn (gen_rtx_SET (operands[0], unspec));
+ emit_insn (gen_rtx_SET (dest, unspec));
return true;
}
&& (mode == V16QImode || mode == V8HImode
|| mode == V4SImode || mode == V2DImode))
;
+  /* AVX512F supports all of the comparisons
+     on all 128/256/512-bit vector int types.  */
+ else if (ix86_valid_mask_cmp_mode (mode))
+ ;
else
{
/* Canonicalize the comparison to EQ, GT, GTU. */
emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
HOST_WIDE_INT size_to_move)
{
- rtx dst = destmem, src = *srcmem, adjust, tempreg;
+ rtx dst = destmem, src = *srcmem, tempreg;
enum insn_code code;
machine_mode move_mode;
int piece_size, i;
/* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
gcc_assert (size_to_move % piece_size == 0);
- adjust = GEN_INT (piece_size);
+
for (i = 0; i < size_to_move; i += piece_size)
{
/* We move from memory to memory, so we'll need to do it via
emit_insn (GEN_FCN (code) (dst, tempreg));
emit_move_insn (destptr,
- gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
+ plus_constant (Pmode, copy_rtx (destptr), piece_size));
emit_move_insn (srcptr,
- gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
+ plus_constant (Pmode, copy_rtx (srcptr), piece_size));
dst = adjust_automodify_address_nv (dst, move_mode, destptr,
piece_size);
emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
HOST_WIDE_INT size_to_move)
{
- rtx dst = destmem, adjust;
+ rtx dst = destmem;
enum insn_code code;
machine_mode move_mode;
int piece_size, i;
/* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
gcc_assert (size_to_move % piece_size == 0);
- adjust = GEN_INT (piece_size);
+
for (i = 0; i < size_to_move; i += piece_size)
{
if (piece_size <= GET_MODE_SIZE (word_mode))
emit_insn (GEN_FCN (code) (dst, promoted_val));
emit_move_insn (destptr,
- gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
+ plus_constant (Pmode, copy_rtx (destptr), piece_size));
dst = adjust_automodify_address_nv (dst, move_mode, destptr,
piece_size);
rtx reg = convert_modes (mode, QImode, val, true);
if (!TARGET_PARTIAL_REG_STALL)
- if (mode == SImode)
- emit_insn (gen_insvsi_1 (reg, reg));
- else
- emit_insn (gen_insvdi_1 (reg, reg));
+ emit_insn (gen_insv_1 (mode, reg, reg));
else
{
tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
&& optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
move_mode = wider_mode;
- if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128)
+ if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
move_mode = TImode;
/* Find the corresponding vector mode with the same size as MOVE_MODE.
reg,
tmpreg)));
/* Emit lea manually to avoid clobbering of flags. */
- emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
+ emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));
tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
case VOID_FTYPE_PV8SI_V8DI_UQI:
case VOID_FTYPE_PV8HI_V8DI_UQI:
case VOID_FTYPE_PV16HI_V16SI_UHI:
- case VOID_FTYPE_PV16QI_V8DI_UQI:
+ case VOID_FTYPE_PUDI_V8DI_UQI:
case VOID_FTYPE_PV16QI_V16SI_UHI:
case VOID_FTYPE_PV4SI_V4DI_UQI:
- case VOID_FTYPE_PV4SI_V2DI_UQI:
- case VOID_FTYPE_PV8HI_V4DI_UQI:
- case VOID_FTYPE_PV8HI_V2DI_UQI:
+ case VOID_FTYPE_PUDI_V2DI_UQI:
+ case VOID_FTYPE_PUDI_V4DI_UQI:
+ case VOID_FTYPE_PUSI_V2DI_UQI:
case VOID_FTYPE_PV8HI_V8SI_UQI:
- case VOID_FTYPE_PV8HI_V4SI_UQI:
- case VOID_FTYPE_PV16QI_V4DI_UQI:
- case VOID_FTYPE_PV16QI_V2DI_UQI:
- case VOID_FTYPE_PV16QI_V8SI_UQI:
- case VOID_FTYPE_PV16QI_V4SI_UQI:
+ case VOID_FTYPE_PUDI_V4SI_UQI:
+ case VOID_FTYPE_PUSI_V4DI_UQI:
+ case VOID_FTYPE_PUHI_V2DI_UQI:
+ case VOID_FTYPE_PUDI_V8SI_UQI:
+ case VOID_FTYPE_PUSI_V4SI_UQI:
case VOID_FTYPE_PCHAR_V64QI_UDI:
case VOID_FTYPE_PCHAR_V32QI_USI:
case VOID_FTYPE_PCHAR_V16QI_UHI:
case VOID_FTYPE_PFLOAT_V4SF_UQI:
case VOID_FTYPE_PV32QI_V32HI_USI:
case VOID_FTYPE_PV16QI_V16HI_UHI:
- case VOID_FTYPE_PV8QI_V8HI_UQI:
+ case VOID_FTYPE_PUDI_V8HI_UQI:
nargs = 2;
klass = store;
/* Reserve memory operand for target. */
klass = load;
memory = 0;
break;
- case VOID_FTYPE_UINT_UINT_UINT:
- case VOID_FTYPE_UINT64_UINT_UINT:
- case UCHAR_FTYPE_UINT_UINT_UINT:
- case UCHAR_FTYPE_UINT64_UINT_UINT:
- nargs = 3;
- klass = load;
- memory = ARRAY_SIZE (args);
- last_arg_constant = true;
- break;
default:
gcc_unreachable ();
}
{
if (!match)
{
- if (icode == CODE_FOR_lwp_lwpvalsi3
- || icode == CODE_FOR_lwp_lwpinssi3
- || icode == CODE_FOR_lwp_lwpvaldi3
- || icode == CODE_FOR_lwp_lwpinsdi3)
- error ("the last argument must be a 32-bit immediate");
- else
- error ("the last argument must be an 8-bit immediate");
+ error ("the last argument must be an 8-bit immediate");
return const0_rtx;
}
}
tree arg0, arg1, arg2, arg3, arg4;
rtx op0, op1, op2, op3, op4, pat, pat2, insn;
machine_mode mode0, mode1, mode2, mode3, mode4;
- unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
+ unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
/* For CPU builtins that can be folded, fold first and expand the fold. */
switch (fcode)
OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
- where for each this pair it is sufficient if either of the ISAs is
- enabled, plus if it is ored with other options also those others. */
+   where for each such pair it is sufficient if either of the ISAs is
+   enabled; if the pair is ORed with other options, those others must
+   be enabled as well.
+ OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */
if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
== (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
&& (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
== (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
&& (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
- /* Use SSE/SSE2/SSSE3 to emulate MMX intrinsics in 64-bit mode when
- MMX is disabled. NB: Since MMX intrinsics are marked with
- SSE/SSE2/SSSE3, enable them without SSE/SSE2/SSSE3 if MMX is
- enabled. */
- if (TARGET_MMX || TARGET_MMX_WITH_SSE)
- {
- if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX))
- == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX))
- && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX)) != 0)
- isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX);
- if (((bisa & (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX))
- == (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX))
- && (isa & (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX)) != 0)
- isa |= (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX);
- if (((bisa & (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX))
- == (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX))
- && (isa & (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX)) != 0)
- isa |= (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX);
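+  /* When MMX intrinsics are emulated with SSE (TARGET_MMX_WITH_SSE),
+     an MMX requirement is satisfied by SSE2 instead.  */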
+ if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE)
+ {
+ bisa &= ~OPTION_MASK_ISA_MMX;
+ bisa |= OPTION_MASK_ISA_SSE2;
}
if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2)
{
else
bisa |= OPTION_MASK_ABI_64;
char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
- (enum fpmath_unit) 0, false, add_abi_p);
+ (enum fpmath_unit) 0,
+ (enum prefer_vector_width) 0,
+ false, add_abi_p);
if (!opts)
error ("%qE needs unknown isa option", fndecl);
else
}
else
{
- rtx pat;
+ if (target == 0
+ || !register_operand (target, SImode))
+ target = gen_reg_rtx (SImode);
- target = gen_reg_rtx (SImode);
emit_move_insn (target, const0_rtx);
target = gen_rtx_SUBREG (QImode, target, 0);
- if (fcode == IX86_BUILTIN_ENQCMD)
- pat = gen_enqcmd (UNSPECV_ENQCMD, Pmode, op0, op1);
- else
- pat = gen_enqcmd (UNSPECV_ENQCMDS, Pmode, op0, op1);
-
- emit_insn (pat);
-
- emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
- gen_rtx_fmt_ee (EQ, QImode,
- SET_DEST (pat),
- const0_rtx)));
+ int unspecv = (fcode == IX86_BUILTIN_ENQCMD
+ ? UNSPECV_ENQCMD
+ : UNSPECV_ENQCMDS);
+ icode = code_for_enqcmd (unspecv, Pmode);
+ emit_insn (GEN_FCN (icode) (op0, op1));
+ emit_insn
+ (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
+ gen_rtx_fmt_ee (EQ, QImode,
+ gen_rtx_REG (CCZmode, FLAGS_REG),
+ const0_rtx)));
return SUBREG_REG (target);
}
case IX86_BUILTIN_LLWPCB:
arg0 = CALL_EXPR_ARG (exp, 0);
op0 = expand_normal (arg0);
- icode = CODE_FOR_lwp_llwpcb;
- if (!insn_data[icode].operand[0].predicate (op0, Pmode))
+
+ if (!register_operand (op0, Pmode))
op0 = ix86_zero_extend_to_Pmode (op0);
- emit_insn (gen_lwp_llwpcb (op0));
+ emit_insn (gen_lwp_llwpcb (Pmode, op0));
return 0;
case IX86_BUILTIN_SLWPCB:
- icode = CODE_FOR_lwp_slwpcb;
if (!target
- || !insn_data[icode].operand[0].predicate (target, Pmode))
+ || !register_operand (target, Pmode))
target = gen_reg_rtx (Pmode);
- emit_insn (gen_lwp_slwpcb (target));
+ emit_insn (gen_lwp_slwpcb (Pmode, target));
return target;
+ case IX86_BUILTIN_LWPVAL32:
+ case IX86_BUILTIN_LWPVAL64:
+ case IX86_BUILTIN_LWPINS32:
+ case IX86_BUILTIN_LWPINS64:
+ mode = ((fcode == IX86_BUILTIN_LWPVAL32
+ || fcode == IX86_BUILTIN_LWPINS32)
+ ? SImode : DImode);
+
+ if (fcode == IX86_BUILTIN_LWPVAL32
+ || fcode == IX86_BUILTIN_LWPVAL64)
+ icode = code_for_lwp_lwpval (mode);
+ else
+ icode = code_for_lwp_lwpins (mode);
+
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ arg1 = CALL_EXPR_ARG (exp, 1);
+ arg2 = CALL_EXPR_ARG (exp, 2);
+ op0 = expand_normal (arg0);
+ op1 = expand_normal (arg1);
+ op2 = expand_normal (arg2);
+ mode0 = insn_data[icode].operand[0].mode;
+
+ if (!insn_data[icode].operand[0].predicate (op0, mode0))
+ op0 = copy_to_mode_reg (mode0, op0);
+ if (!insn_data[icode].operand[1].predicate (op1, SImode))
+ op1 = copy_to_mode_reg (SImode, op1);
+
+ if (!CONST_INT_P (op2))
+ {
+ error ("the last argument must be a 32-bit immediate");
+ return const0_rtx;
+ }
+
+ emit_insn (GEN_FCN (icode) (op0, op1, op2));
+
+ if (fcode == IX86_BUILTIN_LWPINS32
+ || fcode == IX86_BUILTIN_LWPINS64)
+ {
+ if (target == 0
+ || !nonimmediate_operand (target, QImode))
+ target = gen_reg_rtx (QImode);
+
+ pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
+ const0_rtx);
+ emit_insn (gen_rtx_SET (target, pat));
+
+ return target;
+ }
+ else
+ return 0;
+
case IX86_BUILTIN_BEXTRI32:
case IX86_BUILTIN_BEXTRI64:
+ mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
+
arg0 = CALL_EXPR_ARG (exp, 0);
arg1 = CALL_EXPR_ARG (exp, 1);
op0 = expand_normal (arg0);
op1 = expand_normal (arg1);
- icode = (fcode == IX86_BUILTIN_BEXTRI32
- ? CODE_FOR_tbm_bextri_si
- : CODE_FOR_tbm_bextri_di);
+
if (!CONST_INT_P (op1))
- {
- error ("last argument must be an immediate");
- return const0_rtx;
- }
+ {
+ error ("last argument must be an immediate");
+ return const0_rtx;
+ }
else
- {
- unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
- unsigned char lsb_index = INTVAL (op1) & 0xFF;
- op1 = GEN_INT (length);
- op2 = GEN_INT (lsb_index);
+ {
+ unsigned char lsb_index = UINTVAL (op1);
+ unsigned char length = UINTVAL (op1) >> 8;
+
+ unsigned char bitsize = GET_MODE_BITSIZE (mode);
+
+ icode = code_for_tbm_bextri (mode);
mode1 = insn_data[icode].operand[1].mode;
if (!insn_data[icode].operand[1].predicate (op0, mode1))
|| !register_operand (target, mode0))
target = gen_reg_rtx (mode0);
- pat = GEN_FCN (icode) (target, op0, op1, op2);
- if (pat)
- emit_insn (pat);
- return target;
- }
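+	  /* BEXTR of a zero-length field, or of one starting past the
+	     operand width, yields zero; fold it at expand time.  */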
+ if (length == 0 || lsb_index >= bitsize)
+ {
+ emit_move_insn (target, const0_rtx);
+ return target;
+ }
+
+ if (length + lsb_index > bitsize)
+ length = bitsize - lsb_index;
+
+ op1 = GEN_INT (length);
+ op2 = GEN_INT (lsb_index);
+
+ emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
+ return target;
+ }
case IX86_BUILTIN_RDRAND16_STEP:
- icode = CODE_FOR_rdrandhi_1;
- mode0 = HImode;
+ mode = HImode;
goto rdrand_step;
case IX86_BUILTIN_RDRAND32_STEP:
- icode = CODE_FOR_rdrandsi_1;
- mode0 = SImode;
+ mode = SImode;
goto rdrand_step;
case IX86_BUILTIN_RDRAND64_STEP:
- icode = CODE_FOR_rdranddi_1;
- mode0 = DImode;
+ mode = DImode;
rdrand_step:
arg0 = CALL_EXPR_ARG (exp, 0);
op1 = copy_addr_to_reg (op1);
}
- op0 = gen_reg_rtx (mode0);
- emit_insn (GEN_FCN (icode) (op0));
+ op0 = gen_reg_rtx (mode);
+ emit_insn (gen_rdrand (mode, op0));
- emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
+ emit_move_insn (gen_rtx_MEM (mode, op1), op0);
- op1 = gen_reg_rtx (SImode);
- emit_move_insn (op1, CONST1_RTX (SImode));
+ op1 = force_reg (SImode, const1_rtx);
/* Emit SImode conditional move. */
- if (mode0 == HImode)
+ if (mode == HImode)
{
if (TARGET_ZERO_EXTEND_WITH_AND
&& optimize_function_for_speed_p (cfun))
emit_insn (gen_zero_extendhisi2 (op2, op0));
}
}
- else if (mode0 == SImode)
+ else if (mode == SImode)
op2 = op0;
else
op2 = gen_rtx_SUBREG (SImode, op0, 0);
return target;
case IX86_BUILTIN_RDSEED16_STEP:
- icode = CODE_FOR_rdseedhi_1;
- mode0 = HImode;
+ mode = HImode;
goto rdseed_step;
case IX86_BUILTIN_RDSEED32_STEP:
- icode = CODE_FOR_rdseedsi_1;
- mode0 = SImode;
+ mode = SImode;
goto rdseed_step;
case IX86_BUILTIN_RDSEED64_STEP:
- icode = CODE_FOR_rdseeddi_1;
- mode0 = DImode;
+ mode = DImode;
rdseed_step:
arg0 = CALL_EXPR_ARG (exp, 0);
op1 = copy_addr_to_reg (op1);
}
- op0 = gen_reg_rtx (mode0);
- emit_insn (GEN_FCN (icode) (op0));
+ op0 = gen_reg_rtx (mode);
+ emit_insn (gen_rdseed (mode, op0));
- emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
+ emit_move_insn (gen_rtx_MEM (mode, op1), op0);
op2 = gen_reg_rtx (QImode);
tree fndecl = gimple_call_fndecl (def_stmt);
if (fndecl
&& fndecl_built_in_p (fndecl, BUILT_IN_MD))
- switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
+ switch (DECL_MD_FUNCTION_CODE (fndecl))
{
case IX86_BUILTIN_CMPPD:
case IX86_BUILTIN_CMPPS:
emit_insn (gen_xabort (op0));
return 0;
+ case IX86_BUILTIN_RDSSPD:
+ case IX86_BUILTIN_RDSSPQ:
+ mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
+
+ if (target == 0
+ || !register_operand (target, mode))
+ target = gen_reg_rtx (mode);
+
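+      /* RDSSP leaves its operand untouched when shadow stacks are
+	 disabled, so seed it with zero for a well-defined result.  */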
+ op0 = force_reg (mode, const0_rtx);
+
+ emit_insn (gen_rdssp (mode, target, op0));
+ return target;
+
+ case IX86_BUILTIN_INCSSPD:
+ case IX86_BUILTIN_INCSSPQ:
+ mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
+
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ op0 = expand_normal (arg0);
+
+ op0 = force_reg (mode, op0);
+
+ emit_insn (gen_incssp (mode, op0));
+ return 0;
+
case IX86_BUILTIN_RSTORSSP:
case IX86_BUILTIN_CLRSSBSY:
arg0 = CALL_EXPR_ARG (exp, 0);
op0 = expand_normal (arg0);
icode = (fcode == IX86_BUILTIN_RSTORSSP
- ? CODE_FOR_rstorssp
- : CODE_FOR_clrssbsy);
+ ? CODE_FOR_rstorssp
+ : CODE_FOR_clrssbsy);
+
if (!address_operand (op0, VOIDmode))
{
- op1 = convert_memory_address (Pmode, op0);
- op0 = copy_addr_to_reg (op1);
+ op0 = convert_memory_address (Pmode, op0);
+ op0 = copy_addr_to_reg (op0);
}
- emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
+ emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
return 0;
case IX86_BUILTIN_WRSSD:
case IX86_BUILTIN_WRSSQ:
case IX86_BUILTIN_WRUSSD:
case IX86_BUILTIN_WRUSSQ:
+ mode = ((fcode == IX86_BUILTIN_WRSSD
+ || fcode == IX86_BUILTIN_WRUSSD)
+ ? SImode : DImode);
+
arg0 = CALL_EXPR_ARG (exp, 0);
op0 = expand_normal (arg0);
arg1 = CALL_EXPR_ARG (exp, 1);
op1 = expand_normal (arg1);
- switch (fcode)
- {
- case IX86_BUILTIN_WRSSD:
- icode = CODE_FOR_wrsssi;
- mode = SImode;
- break;
- case IX86_BUILTIN_WRSSQ:
- icode = CODE_FOR_wrssdi;
- mode = DImode;
- break;
- case IX86_BUILTIN_WRUSSD:
- icode = CODE_FOR_wrusssi;
- mode = SImode;
- break;
- case IX86_BUILTIN_WRUSSQ:
- icode = CODE_FOR_wrussdi;
- mode = DImode;
- break;
- }
+
op0 = force_reg (mode, op0);
+
if (!address_operand (op1, VOIDmode))
{
- op2 = convert_memory_address (Pmode, op1);
- op1 = copy_addr_to_reg (op2);
+ op1 = convert_memory_address (Pmode, op1);
+ op1 = copy_addr_to_reg (op1);
}
- emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
+ op1 = gen_rtx_MEM (mode, op1);
+
+ icode = ((fcode == IX86_BUILTIN_WRSSD
+ || fcode == IX86_BUILTIN_WRSSQ)
+ ? code_for_wrss (mode)
+ : code_for_wruss (mode));
+ emit_insn (GEN_FCN (icode) (op0, op1));
+
return 0;
default:
target);
}
- if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
- && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
- {
- i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
- return ix86_expand_special_args_builtin (bdesc_cet_rdssp + i, exp,
- target);
- }
-
gcc_unreachable ();
}
case E_V8HImode:
use_vector_set = TARGET_SSE2;
break;
+ case E_V8QImode:
+ use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
+ break;
case E_V4HImode:
use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
break;
wmode = V8HImode;
goto widen;
case E_V8QImode:
+ if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
+ break;
wmode = V4HImode;
goto widen;
widen:
ix86_expand_vector_init_concat (machine_mode mode,
rtx target, rtx *ops, int n)
{
- machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
- rtx first[16], second[8], third[4];
+ machine_mode half_mode = VOIDmode;
+ rtx half[2];
rtvec v;
int i, j;
switch (mode)
{
case E_V16SImode:
- cmode = V8SImode;
+ half_mode = V8SImode;
break;
case E_V16SFmode:
- cmode = V8SFmode;
+ half_mode = V8SFmode;
break;
case E_V8DImode:
- cmode = V4DImode;
+ half_mode = V4DImode;
break;
case E_V8DFmode:
- cmode = V4DFmode;
+ half_mode = V4DFmode;
break;
case E_V8SImode:
- cmode = V4SImode;
+ half_mode = V4SImode;
break;
case E_V8SFmode:
- cmode = V4SFmode;
+ half_mode = V4SFmode;
break;
case E_V4DImode:
- cmode = V2DImode;
+ half_mode = V2DImode;
break;
case E_V4DFmode:
- cmode = V2DFmode;
+ half_mode = V2DFmode;
break;
case E_V4SImode:
- cmode = V2SImode;
+ half_mode = V2SImode;
break;
case E_V4SFmode:
- cmode = V2SFmode;
+ half_mode = V2SFmode;
break;
case E_V2DImode:
- cmode = DImode;
+ half_mode = DImode;
break;
case E_V2SImode:
- cmode = SImode;
+ half_mode = SImode;
break;
case E_V2DFmode:
- cmode = DFmode;
+ half_mode = DFmode;
break;
case E_V2SFmode:
- cmode = SFmode;
+ half_mode = SFmode;
break;
default:
gcc_unreachable ();
}
- if (!register_operand (ops[1], cmode))
- ops[1] = force_reg (cmode, ops[1]);
- if (!register_operand (ops[0], cmode))
- ops[0] = force_reg (cmode, ops[0]);
+ if (!register_operand (ops[1], half_mode))
+ ops[1] = force_reg (half_mode, ops[1]);
+ if (!register_operand (ops[0], half_mode))
+ ops[0] = force_reg (half_mode, ops[0]);
emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
ops[1])));
break;
switch (mode)
{
case E_V4DImode:
- cmode = V2DImode;
+ half_mode = V2DImode;
break;
case E_V4DFmode:
- cmode = V2DFmode;
+ half_mode = V2DFmode;
break;
case E_V4SImode:
- cmode = V2SImode;
+ half_mode = V2SImode;
break;
case E_V4SFmode:
- cmode = V2SFmode;
+ half_mode = V2SFmode;
break;
default:
gcc_unreachable ();
switch (mode)
{
case E_V8DImode:
- cmode = V2DImode;
- hmode = V4DImode;
+ half_mode = V4DImode;
break;
case E_V8DFmode:
- cmode = V2DFmode;
- hmode = V4DFmode;
+ half_mode = V4DFmode;
break;
case E_V8SImode:
- cmode = V2SImode;
- hmode = V4SImode;
+ half_mode = V4SImode;
break;
case E_V8SFmode:
- cmode = V2SFmode;
- hmode = V4SFmode;
+ half_mode = V4SFmode;
break;
default:
gcc_unreachable ();
switch (mode)
{
case E_V16SImode:
- cmode = V2SImode;
- hmode = V4SImode;
- gmode = V8SImode;
+ half_mode = V8SImode;
break;
case E_V16SFmode:
- cmode = V2SFmode;
- hmode = V4SFmode;
- gmode = V8SFmode;
+ half_mode = V8SFmode;
break;
default:
gcc_unreachable ();
half:
/* FIXME: We process inputs backward to help RA. PR 36222. */
i = n - 1;
- j = (n >> 1) - 1;
- for (; i > 0; i -= 2, j--)
- {
- first[j] = gen_reg_rtx (cmode);
- v = gen_rtvec (2, ops[i - 1], ops[i]);
- ix86_expand_vector_init (false, first[j],
- gen_rtx_PARALLEL (cmode, v));
- }
-
- n >>= 1;
- if (n > 4)
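+      /* Build the high half (j == 1) and then the low half (j == 0),
+	 each from its n/2 input elements, and finally concatenate the
+	 two halves into TARGET.  */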
+ for (j = 1; j != -1; j--)
{
- gcc_assert (hmode != VOIDmode);
- gcc_assert (gmode != VOIDmode);
- for (i = j = 0; i < n; i += 2, j++)
+ half[j] = gen_reg_rtx (half_mode);
+ switch (n >> 1)
{
- second[j] = gen_reg_rtx (hmode);
- ix86_expand_vector_init_concat (hmode, second [j],
- &first [i], 2);
- }
- n >>= 1;
- for (i = j = 0; i < n; i += 2, j++)
- {
- third[j] = gen_reg_rtx (gmode);
- ix86_expand_vector_init_concat (gmode, third[j],
- &second[i], 2);
- }
- n >>= 1;
- ix86_expand_vector_init_concat (mode, target, third, n);
- }
- else if (n > 2)
- {
- gcc_assert (hmode != VOIDmode);
- for (i = j = 0; i < n; i += 2, j++)
- {
- second[j] = gen_reg_rtx (hmode);
- ix86_expand_vector_init_concat (hmode, second [j],
- &first [i], 2);
+ case 2:
+ v = gen_rtvec (2, ops[i-1], ops[i]);
+ i -= 2;
+ break;
+ case 4:
+ v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
+ i -= 4;
+ break;
+ case 8:
+ v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
+ ops[i-3], ops[i-2], ops[i-1], ops[i]);
+ i -= 8;
+ break;
+ default:
+ gcc_unreachable ();
}
- n >>= 1;
- ix86_expand_vector_init_concat (mode, target, second, n);
+ ix86_expand_vector_init (false, half[j],
+ gen_rtx_PARALLEL (half_mode, v));
}
- else
- ix86_expand_vector_init_concat (mode, target, first, n);
+
+ ix86_expand_vector_init_concat (mode, target, half, 2);
break;
default:
switch (mode)
{
- case E_V2SFmode:
case E_V2SImode:
+ use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
+ if (use_vec_merge)
+ break;
+ /* FALLTHRU */
+
+ case E_V2SFmode:
if (mmx_ok)
{
tmp = gen_reg_rtx (GET_MODE_INNER (mode));
break;
case E_V8QImode:
+ use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
break;
case E_V32QImode:
switch (mode)
{
case E_V2SImode:
+ use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
+ if (use_vec_extr)
+ break;
+ /* FALLTHRU */
+
case E_V2SFmode:
if (!mmx_ok)
break;
case E_V16QImode:
use_vec_extr = TARGET_SSE4_1;
+ if (!use_vec_extr
+ && TARGET_SSE2
+ && elt == 0
+ && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
+ {
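+	  /* Without SSE4.1's pextrb, element 0 can still be had by
+	     extracting the containing SImode element and taking its
+	     low byte.  */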
+ tmp = gen_reg_rtx (SImode);
+ ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
+ 0);
+ emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
+ return;
+ }
break;
case E_V8SFmode:
return;
case E_V8QImode:
+ use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
/* ??? Could extract the appropriate HImode element and shift. */
+ break;
+
default:
break;
}
break;
case E_V64QImode:
case E_V32HImode:
+ if (i < 64)
+ {
+ d = gen_reg_rtx (V4TImode);
+ tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
+ GEN_INT (i / 2));
+ break;
+ }
+ /* FALLTHRU */
case E_V16SImode:
case E_V16SFmode:
case E_V8DImode:
case E_V8DFmode:
if (i > 128)
tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
- gen_lowpart (V16SImode, src),
- gen_lowpart (V16SImode, src),
- GEN_INT (0x4 + (i == 512 ? 4 : 0)),
- GEN_INT (0x5 + (i == 512 ? 4 : 0)),
- GEN_INT (0x6 + (i == 512 ? 4 : 0)),
- GEN_INT (0x7 + (i == 512 ? 4 : 0)),
- GEN_INT (0xC), GEN_INT (0xD),
- GEN_INT (0xE), GEN_INT (0xF),
- GEN_INT (0x10), GEN_INT (0x11),
- GEN_INT (0x12), GEN_INT (0x13),
- GEN_INT (0x14), GEN_INT (0x15),
- GEN_INT (0x16), GEN_INT (0x17));
+ gen_lowpart (V16SImode, src),
+ gen_lowpart (V16SImode, src),
+ GEN_INT (0x4 + (i == 512 ? 4 : 0)),
+ GEN_INT (0x5 + (i == 512 ? 4 : 0)),
+ GEN_INT (0x6 + (i == 512 ? 4 : 0)),
+ GEN_INT (0x7 + (i == 512 ? 4 : 0)),
+ GEN_INT (0xC), GEN_INT (0xD),
+ GEN_INT (0xE), GEN_INT (0xF),
+ GEN_INT (0x10), GEN_INT (0x11),
+ GEN_INT (0x12), GEN_INT (0x13),
+ GEN_INT (0x14), GEN_INT (0x15),
+ GEN_INT (0x16), GEN_INT (0x17));
else
tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
- gen_lowpart (V16SImode, src),
- GEN_INT (i == 128 ? 0x2 : 0x1),
- GEN_INT (0x3),
- GEN_INT (0x3),
- GEN_INT (0x3),
- GEN_INT (i == 128 ? 0x6 : 0x5),
- GEN_INT (0x7),
- GEN_INT (0x7),
- GEN_INT (0x7),
- GEN_INT (i == 128 ? 0xA : 0x9),
- GEN_INT (0xB),
- GEN_INT (0xB),
- GEN_INT (0xB),
- GEN_INT (i == 128 ? 0xE : 0xD),
- GEN_INT (0xF),
- GEN_INT (0xF),
- GEN_INT (0xF));
+ gen_lowpart (V16SImode, src),
+ GEN_INT (i == 128 ? 0x2 : 0x1),
+ GEN_INT (0x3),
+ GEN_INT (0x3),
+ GEN_INT (0x3),
+ GEN_INT (i == 128 ? 0x6 : 0x5),
+ GEN_INT (0x7),
+ GEN_INT (0x7),
+ GEN_INT (0x7),
+ GEN_INT (i == 128 ? 0xA : 0x9),
+ GEN_INT (0xB),
+ GEN_INT (0xB),
+ GEN_INT (0xB),
+ GEN_INT (i == 128 ? 0xE : 0xD),
+ GEN_INT (0xF),
+ GEN_INT (0xF),
+ GEN_INT (0xF));
break;
default:
gcc_unreachable ();
}
}
+ mthree = force_reg (mode, mthree);
+
/* e0 = x0 * a */
emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
- /* e1 = e0 * x0 */
- emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
- /* e2 = e1 - 3. */
- mthree = force_reg (mode, mthree);
- emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
+ unsigned vector_size = GET_MODE_SIZE (mode);
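+  /* With FMA, or with AVX512 at this vector size, compute e1 and e2
+     in one fused step: e2 = e0 * x0 - 3.  */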
+ if (TARGET_FMA
+ || (TARGET_AVX512F && vector_size == 64)
+ || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
+ emit_insn (gen_rtx_SET (e2,
+ gen_rtx_FMA (mode, e0, x0, mthree)));
+ else
+ {
+ /* e1 = e0 * x0 */
+ emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
+
+ /* e2 = e1 - 3. */
+ emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
+ }
mhalf = force_reg (mode, mhalf);
if (recip)
emit_move_insn (operand0, res);
}
-/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
- into OPERAND0. */
+/* Expand SSE2 sequence for computing floor or ceil
+ from OPERAND1 storing into OPERAND0. */
void
-ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
+ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
{
/* C code for the stuff we expand below.
- double xa = fabs (x), x2;
+ double xa = fabs (x), x2;
if (!isless (xa, TWO52))
return x;
- xa = xa + TWO52 - TWO52;
- x2 = copysign (xa, x);
+ x2 = (double)(long)x;
Compensate. Floor:
- if (x2 > x)
- x2 -= 1;
+ if (x2 > x)
+ x2 -= 1;
Compensate. Ceil:
- if (x2 < x)
- x2 += 1;
+ if (x2 < x)
+ x2 += 1;
if (HONOR_SIGNED_ZEROS (mode))
- x2 = copysign (x2, x);
+ return copysign (x2, x);
return x2;
*/
machine_mode mode = GET_MODE (operand0);
- rtx xa, TWO52, tmp, one, res, mask;
+ rtx xa, xi, TWO52, tmp, one, res, mask;
rtx_code_label *label;
TWO52 = ix86_gen_TWO52 (mode);
/* if (!isless (xa, TWO52)) goto label; */
label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
- /* xa = xa + TWO52 - TWO52; */
- xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
- xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
-
- /* xa = copysign (xa, operand1) */
- ix86_sse_copysign_to_positive (xa, xa, res, mask);
+ /* xa = (double)(long)x */
+ xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
+ expand_fix (xi, res, 0);
+ expand_float (xa, xi, 0);
/* generate 1.0 */
one = force_reg (mode, const_double_from_real_value (dconst1, mode));
emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
- if (!do_floor && HONOR_SIGNED_ZEROS (mode))
- ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
emit_move_insn (res, tmp);
+ if (HONOR_SIGNED_ZEROS (mode))
+ ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
+
emit_label (label);
LABEL_NUSES (label) = 1;
}
/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
- into OPERAND0. */
+ into OPERAND0 without relying on DImode truncation via cvttsd2siq
+ that is only available on 64bit targets. */
void
-ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
+ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
{
/* C code for the stuff we expand below.
- double xa = fabs (x), x2;
+ double xa = fabs (x), x2;
if (!isless (xa, TWO52))
return x;
- x2 = (double)(long)x;
+ xa = xa + TWO52 - TWO52;
+ x2 = copysign (xa, x);
Compensate. Floor:
- if (x2 > x)
- x2 -= 1;
+ if (x2 > x)
+ x2 -= 1;
Compensate. Ceil:
- if (x2 < x)
- x2 += 1;
+ if (x2 < x)
+ x2 += 1;
if (HONOR_SIGNED_ZEROS (mode))
- return copysign (x2, x);
+ x2 = copysign (x2, x);
return x2;
*/
machine_mode mode = GET_MODE (operand0);
- rtx xa, xi, TWO52, tmp, one, res, mask;
+ rtx xa, TWO52, tmp, one, res, mask;
rtx_code_label *label;
TWO52 = ix86_gen_TWO52 (mode);
/* if (!isless (xa, TWO52)) goto label; */
label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
- /* xa = (double)(long)x */
- xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
- expand_fix (xi, res, 0);
- expand_float (xa, xi, 0);
+ /* xa = xa + TWO52 - TWO52; */
+ xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
+ xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
+
+ /* xa = copysign (xa, operand1) */
+ ix86_sse_copysign_to_positive (xa, xa, res, mask);
/* generate 1.0 */
one = force_reg (mode, const_double_from_real_value (dconst1, mode));
emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
+ if (!do_floor && HONOR_SIGNED_ZEROS (mode))
+ ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
emit_move_insn (res, tmp);
- if (HONOR_SIGNED_ZEROS (mode))
- ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
-
emit_label (label);
LABEL_NUSES (label) = 1;
emit_move_insn (operand0, res);
}
-/* Expand SSE sequence for computing round from OPERAND1 storing
- into OPERAND0. Sequence that works without relying on DImode truncation
- via cvttsd2siq that is only available on 64bit targets. */
-void
-ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
-{
- /* C code for the stuff we expand below.
- double xa = fabs (x), xa2, x2;
- if (!isless (xa, TWO52))
- return x;
- Using the absolute value and copying back sign makes
- -0.0 -> -0.0 correct.
- xa2 = xa + TWO52 - TWO52;
- Compensate.
- dxa = xa2 - xa;
- if (dxa <= -0.5)
- xa2 += 1;
- else if (dxa > 0.5)
- xa2 -= 1;
- x2 = copysign (xa2, x);
- return x2;
- */
- machine_mode mode = GET_MODE (operand0);
- rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
- rtx_code_label *label;
-
- TWO52 = ix86_gen_TWO52 (mode);
-
- /* Temporary for holding the result, initialized to the input
- operand to ease control flow. */
- res = gen_reg_rtx (mode);
- emit_move_insn (res, operand1);
-
- /* xa = abs (operand1) */
- xa = ix86_expand_sse_fabs (res, &mask);
-
- /* if (!isless (xa, TWO52)) goto label; */
- label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
-
- /* xa2 = xa + TWO52 - TWO52; */
- xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
- xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
-
- /* dxa = xa2 - xa; */
- dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
-
- /* generate 0.5, 1.0 and -0.5 */
- half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
- one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
- mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
- 0, OPTAB_DIRECT);
-
- /* Compensate. */
- /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
- tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
- emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
- xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
- /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
- tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
- emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
- xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
-
- /* res = copysign (xa2, operand1) */
- ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
-
- emit_label (label);
- LABEL_NUSES (label) = 1;
-
- emit_move_insn (operand0, res);
-}
-
-/* Expand SSE sequence for computing trunc from OPERAND1 storing
- into OPERAND0. */
+/* Expand SSE sequence for computing trunc
+ from OPERAND1 storing into OPERAND0. */
void
ix86_expand_trunc (rtx operand0, rtx operand1)
{
}
/* Expand SSE sequence for computing trunc from OPERAND1 storing
- into OPERAND0. */
+ into OPERAND0 without relying on DImode truncation via cvttsd2siq
+ that is only available on 64bit targets. */
void
ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
{
emit_move_insn (operand0, res);
}
-/* Expand SSE sequence for computing round from OPERAND1 storing
- into OPERAND0. */
+/* Expand SSE sequence for computing round
+ from OPERAND1 storing into OPERAND0. */
void
ix86_expand_round (rtx operand0, rtx operand1)
{
emit_move_insn (operand0, res);
}
+/* Expand SSE sequence for computing round from OPERAND1 storing
+ into OPERAND0 without relying on DImode truncation via cvttsd2siq
+ that is only available on 64bit targets. */
+void
+ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
+{
+ /* C code for the stuff we expand below.
+ double xa = fabs (x), xa2, x2;
+ if (!isless (xa, TWO52))
+ return x;
+ Using the absolute value and copying back sign makes
+ -0.0 -> -0.0 correct.
+ xa2 = xa + TWO52 - TWO52;
+ Compensate.
+ dxa = xa2 - xa;
+ if (dxa <= -0.5)
+ xa2 += 1;
+ else if (dxa > 0.5)
+ xa2 -= 1;
+ x2 = copysign (xa2, x);
+ return x2;
+ */
+ machine_mode mode = GET_MODE (operand0);
+ rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
+ rtx_code_label *label;
+
+ TWO52 = ix86_gen_TWO52 (mode);
+
+ /* Temporary for holding the result, initialized to the input
+ operand to ease control flow. */
+ res = gen_reg_rtx (mode);
+ emit_move_insn (res, operand1);
+
+ /* xa = abs (operand1) */
+ xa = ix86_expand_sse_fabs (res, &mask);
+
+ /* if (!isless (xa, TWO52)) goto label; */
+ label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+ /* xa2 = xa + TWO52 - TWO52; */
+ xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
+ xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
+
+ /* dxa = xa2 - xa; */
+ dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
+
+ /* generate 0.5, 1.0 and -0.5 */
+ half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
+ one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
+ mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
+ 0, OPTAB_DIRECT);
+
+ /* Compensate. */
+ /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
+ tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
+ emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
+ xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
+ /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
+ tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
+ emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
+ xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
+
+ /* res = copysign (xa2, operand1) */
+ ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
+
+ emit_label (label);
+ LABEL_NUSES (label) = 1;
+
+ emit_move_insn (operand0, res);
+}
+
/* Expand SSE sequence for computing round
from OP1 storing into OP0 using sse4 round insn. */
void
return ok;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
using movss or movsd. */
static bool
expand_vec_perm_movs (struct expand_vec_perm_d *d)
return false;
if (!(TARGET_SSE && vmode == V4SFmode)
+ && !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
&& !(TARGET_SSE2 && vmode == V2DFmode))
return false;
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
static bool
expand_vec_perm_blend (struct expand_vec_perm_d *d)
{
machine_mode mmode, vmode = d->vmode;
- unsigned i, mask, nelt = d->nelt;
+ unsigned i, nelt = d->nelt;
+ unsigned HOST_WIDE_INT mask;
rtx target, op0, op1, maskop, x;
rtx rperm[32], vperm;
case E_V16SImode:
case E_V8DImode:
for (i = 0; i < nelt; ++i)
- mask |= (d->perm[i] >= nelt) << i;
+ mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
break;
case E_V2DImode:
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
in terms of the variable form of vpermilps.
Note that we will have already failed the immediate input vpermilps,
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
static bool
/* vpshufb only works intra lanes, it is not
possible to shuffle bytes in between the lanes. */
for (i = 0; i < nelt; ++i)
- if ((d->perm[i] ^ i) & (nelt / 4))
+ if ((d->perm[i] ^ i) & (3 * nelt / 4))
return false;
}
}
static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
in a single instruction. */
static bool
return false;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
in terms of a pair of pshuflw + pshufhw instructions. */
static bool
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
the permutation using the SSSE3 palignr instruction. This succeeds
when all of the elements in PERM fit within one vector and we merely
need to shift them down so that a single vector permutation has a
static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
a two vector permutation into a single vector permutation by using
an interleave operation to merge the vectors. */
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
a single vector cross-lane permutation into vpermq followed
by any of the single insn permutations. */
static bool canonicalize_perm (struct expand_vec_perm_d *d);
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
a vector permutation using two instructions, vperm2f128 resp.
vperm2i128 followed by any single in-lane permutation. */
return false;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
a two vector permutation using 2 intra-lane interleave insns
and cross-lane shuffle for 32-byte vectors. */
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
a single vector permutation using a single intra-lane vector
permutation, vperm2f128 swapping the lanes and vblend* insn blending
the non-swapped and swapped vectors together. */
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
+/* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
permutation using two vperm2f128, followed by a vshufpd insn blending
the two vectors together. */
return true;
}
+static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
+
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
+ a two vector permutation using two intra-lane vector
+ permutations, vperm2f128 swapping the lanes and vblend* insn blending
+ the non-swapped and swapped vectors together. */
+
+static bool
+expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
+{
+ struct expand_vec_perm_d dfirst, dsecond, dthird;
+ unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
+ rtx_insn *seq1, *seq2;
+ bool ok;
+ rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
+
+ if (!TARGET_AVX
+ || TARGET_AVX2
+ || (d->vmode != V8SFmode && d->vmode != V4DFmode)
+ || d->one_operand_p)
+ return false;
+
+ dfirst = *d;
+ dsecond = *d;
+ for (i = 0; i < nelt; i++)
+ {
+ dfirst.perm[i] = 0xff;
+ dsecond.perm[i] = 0xff;
+ }
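+  /* Elements already in the right lane go to DFIRST; the rest go to
+     DSECOND at the position a later lane swap will move them from,
+     with MSK recording which final positions come from the swapped
+     vector.  */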
+ for (i = 0, msk = 0; i < nelt; i++)
+ {
+ j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
+ if (j == i)
+ {
+ dfirst.perm[j] = d->perm[i];
+ which1 |= (d->perm[i] < nelt ? 1 : 2);
+ }
+ else
+ {
+ dsecond.perm[j] = d->perm[i];
+ which2 |= (d->perm[i] < nelt ? 1 : 2);
+ msk |= (1U << i);
+ }
+ }
+ if (msk == 0 || msk == (1U << nelt) - 1)
+ return false;
+
+ if (!d->testing_p)
+ {
+ dfirst.target = gen_reg_rtx (dfirst.vmode);
+ dsecond.target = gen_reg_rtx (dsecond.vmode);
+ }
+
+ for (i = 0; i < nelt; i++)
+ {
+ if (dfirst.perm[i] == 0xff)
+ dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
+ if (dsecond.perm[i] == 0xff)
+ dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
+ }
+ canonicalize_perm (&dfirst);
+ start_sequence ();
+ ok = ix86_expand_vec_perm_const_1 (&dfirst);
+ seq1 = get_insns ();
+ end_sequence ();
+
+ if (!ok)
+ return false;
+
+ canonicalize_perm (&dsecond);
+ start_sequence ();
+ ok = ix86_expand_vec_perm_const_1 (&dsecond);
+ seq2 = get_insns ();
+ end_sequence ();
+
+ if (!ok)
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ emit_insn (seq1);
+ emit_insn (seq2);
+
+ dthird = *d;
+ dthird.op0 = dsecond.target;
+ dthird.op1 = dsecond.target;
+ dthird.one_operand_p = true;
+ dthird.target = gen_reg_rtx (dthird.vmode);
+ for (i = 0; i < nelt; i++)
+ dthird.perm[i] = i ^ nelt2;
+
+ ok = expand_vec_perm_1 (&dthird);
+ gcc_assert (ok);
+
+ blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
+ emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
+ return true;
+}
+
/* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
permutation with two pshufb insns and an ior. We should have already
failed all two instruction sequences. */
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
+/* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
and extract-odd permutations. */
static bool
case E_V2DFmode:
case E_V4SFmode:
case E_V2DImode:
+ case E_V2SImode:
case E_V4SImode:
/* These are always directly implementable by expand_vec_perm_1. */
gcc_unreachable ();
+ case E_V2SFmode:
+ gcc_assert (TARGET_MMX_WITH_SSE);
+ /* We have no suitable instructions. */
+ if (d->testing_p)
+ return false;
+ break;
+
+ case E_V4HImode:
+ if (d->testing_p)
+ break;
+ /* We need 2*log2(N)-1 operations to achieve odd/even
+ with interleave. */
+ t1 = gen_reg_rtx (V4HImode);
+ emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
+ emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
+ if (odd)
+ t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
+ else
+ t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
+ emit_insn (t2);
+ break;
+
case E_V8HImode:
if (TARGET_SSE4_1)
return expand_vec_perm_even_odd_pack (d);
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
+/* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
extract-even and extract-odd permutations. */
static bool
return expand_vec_perm_even_odd_1 (d, odd);
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
+/* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
permutations. We assume that expand_vec_perm_1 has already failed. */
static bool
gcc_unreachable ();
case E_V2DFmode:
- case E_V2DImode:
+ case E_V2SFmode:
case E_V4SFmode:
+ case E_V2DImode:
+ case E_V2SImode:
case E_V4SImode:
/* These are always implementable using standard shuffle patterns. */
gcc_unreachable ();
}
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
+/* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
broadcast permutations. */
static bool
return true;
}
+ /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
+ if (expand_vec_perm2_vperm2f128_vblend (d))
+ return true;
+
return false;
}
int i, which, nelt = d->nelt;
for (i = which = 0; i < nelt; ++i)
- which |= (d->perm[i] < nelt ? 1 : 2);
+ which |= (d->perm[i] < nelt ? 1 : 2);
d->one_operand_p = true;
switch (which)
if (d.testing_p && TARGET_SSSE3)
return true;
break;
+ case E_V2SFmode:
+ case E_V2SImode:
+ case E_V4HImode:
+ if (!TARGET_MMX_WITH_SSE)
+ return false;
+ break;
case E_V2DImode:
case E_V2DFmode:
if (!TARGET_SSE)
d.one_operand_p = (which != 3);
/* Implementable with shufps or pshufd. */
- if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
+ if (d.one_operand_p
+ && (d.vmode == V4SFmode || d.vmode == V2SFmode
+ || d.vmode == V4SImode || d.vmode == V2SImode))
return true;
/* Otherwise we have to go through the motions and see if we can
gcc_assert (ok);
}
+/* Optimize vector MUL generation for V8QI, V16QI and V32QI
+   under TARGET_AVX512BW.  E.g. for v16qi a * b, it emits
+
+ vpmovzxbw ymm2, xmm0
+ vpmovzxbw ymm3, xmm1
+ vpmullw ymm4, ymm2, ymm3
+ vpmovwb xmm0, ymm4
+
+   which takes fewer instructions than ix86_expand_vecop_qihi.
+   Return true on success.  */
+
+bool
+ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2)
+{
+ machine_mode himode, qimode = GET_MODE (dest);
+ rtx hop1, hop2, hdest;
+ rtx (*gen_extend)(rtx, rtx);
+ rtx (*gen_truncate)(rtx, rtx);
+
+ /* There's no V64HImode multiplication instruction. */
+ if (qimode == E_V64QImode)
+ return false;
+
+ /* vpmovwb only available under AVX512BW. */
+ if (!TARGET_AVX512BW)
+ return false;
+ if ((qimode == V8QImode || qimode == V16QImode)
+ && !TARGET_AVX512VL)
+ return false;
+  /* Don't generate a zmm instruction when 128/256-bit vector width
+     is preferred.  */
+ if (qimode == V32QImode
+ && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
+ return false;
+
+ switch (qimode)
+ {
+ case E_V8QImode:
+ himode = V8HImode;
+ gen_extend = gen_zero_extendv8qiv8hi2;
+ gen_truncate = gen_truncv8hiv8qi2;
+ break;
+ case E_V16QImode:
+ himode = V16HImode;
+ gen_extend = gen_zero_extendv16qiv16hi2;
+ gen_truncate = gen_truncv16hiv16qi2;
+ break;
+ case E_V32QImode:
+ himode = V32HImode;
+ gen_extend = gen_zero_extendv32qiv32hi2;
+ gen_truncate = gen_truncv32hiv32qi2;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ hop1 = gen_reg_rtx (himode);
+ hop2 = gen_reg_rtx (himode);
+ hdest = gen_reg_rtx (himode);
+ emit_insn (gen_extend (hop1, op1));
+ emit_insn (gen_extend (hop2, op2));
+ emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (MULT, himode,
+ hop1, hop2)));
+ emit_insn (gen_truncate (dest, hdest));
+ return true;
+}
+
+/* Expand a vector operation shift by constant for a V*QImode in terms of the
+   same operation on V*HImode.  Return true on success.  */
+bool
+ix86_expand_vec_shift_qihi_constant (enum rtx_code code, rtx dest, rtx op1, rtx op2)
+{
+ machine_mode qimode, himode;
+ HOST_WIDE_INT and_constant, xor_constant;
+ HOST_WIDE_INT shift_amount;
+ rtx vec_const_and, vec_const_xor;
+ rtx tmp, op1_subreg;
+ rtx (*gen_shift) (rtx, rtx, rtx);
+ rtx (*gen_and) (rtx, rtx, rtx);
+ rtx (*gen_xor) (rtx, rtx, rtx);
+ rtx (*gen_sub) (rtx, rtx, rtx);
+
+ /* Only optimize shift by constant. */
+ if (!CONST_INT_P (op2))
+ return false;
+
+ qimode = GET_MODE (dest);
+ shift_amount = INTVAL (op2);
+  /* Do nothing when the shift amount is greater than or equal to 8.  */
+ if (shift_amount > 7)
+ return false;
+
+ gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
+ /* Record sign bit. */
+ xor_constant = 1 << (8 - shift_amount - 1);
+
+  /* Mask that zeros, in each byte, the bits that the HImode shift
+     pulls in from the neighboring byte.  */
+ and_constant
+ = (code == ASHIFT ? 256 - (1 << shift_amount)
+ : (1 << (8 - shift_amount)) - 1);
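+  /* E.g. for shift_amount == 2, and_constant is 0xfc for ASHIFT and
+     0x3f for the right shifts, and xor_constant is 1 << 5 == 0x20;
+     ASHIFTRT then sign-extends with (x ^ 0x20) - 0x20.  */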
+
+ switch (qimode)
+ {
+    case E_V16QImode:
+ himode = V8HImode;
+ gen_shift =
+ ((code == ASHIFT)
+ ? gen_ashlv8hi3
+ : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
+ gen_and = gen_andv16qi3;
+ gen_xor = gen_xorv16qi3;
+ gen_sub = gen_subv16qi3;
+ break;
+    case E_V32QImode:
+ himode = V16HImode;
+ gen_shift =
+ ((code == ASHIFT)
+ ? gen_ashlv16hi3
+ : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
+ gen_and = gen_andv32qi3;
+ gen_xor = gen_xorv32qi3;
+ gen_sub = gen_subv32qi3;
+ break;
+    case E_V64QImode:
+ himode = V32HImode;
+ gen_shift =
+ ((code == ASHIFT)
+ ? gen_ashlv32hi3
+ : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
+ gen_and = gen_andv64qi3;
+ gen_xor = gen_xorv64qi3;
+ gen_sub = gen_subv64qi3;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ tmp = gen_reg_rtx (himode);
+ vec_const_and = gen_reg_rtx (qimode);
+ op1_subreg = lowpart_subreg (himode, op1, qimode);
+
+ /* For ASHIFT and LSHIFTRT, perform operation like
+ vpsllw/vpsrlw $shift_amount, %op1, %dest.
+ vpand %vec_const_and, %dest. */
+ emit_insn (gen_shift (tmp, op1_subreg, op2));
+ emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
+ emit_move_insn (vec_const_and,
+ ix86_build_const_vector (qimode, true,
+ gen_int_mode (and_constant, QImode)));
+ emit_insn (gen_and (dest, dest, vec_const_and));
+
+ /* For ASHIFTRT, perform extra operation like
+ vpxor %vec_const_xor, %dest, %dest
+ vpsubb %vec_const_xor, %dest, %dest */
+ if (code == ASHIFTRT)
+ {
+ vec_const_xor = gen_reg_rtx (qimode);
+ emit_move_insn (vec_const_xor,
+ ix86_build_const_vector (qimode, true,
+ gen_int_mode (xor_constant, QImode)));
+ emit_insn (gen_xor (dest, dest, vec_const_xor));
+ emit_insn (gen_sub (dest, dest, vec_const_xor));
+ }
+ return true;
+}
/* Expand a vector operation CODE for a V*QImode in terms of the
same operation on V*HImode. */
should be encoded with notrack prefix. */
bool
-ix86_notrack_prefixed_insn_p (rtx insn)
+ix86_notrack_prefixed_insn_p (rtx_insn *insn)
{
if (!insn || !((flag_cf_protection & CF_BRANCH)))
return false;
case E_V4SImode:
case E_V2DImode:
case E_V1TImode:
- case E_TImode:
{
machine_mode srcmode, dstmode;
rtx d, pat;
case E_V4SImode:
case E_V2DImode:
case E_V1TImode:
- case E_TImode:
{
machine_mode srcmode, dstmode;
rtx (*pinsr)(rtx, rtx, rtx, rtx);