re PR target/53961 (internal compiler error: in memory_address_length, at config...

[gcc.git] / gcc / config / i386 / i386.c
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c

index 12719e2b9a6ed4f4b861a1848a4cfcaa7d50b2c8..5770ed26aa2bb33059410ed71b065ef03016690e 100644 (file)
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -61,6 +61,7 @@ along with GCC; see the file COPYING3.  If not see
  #include "fibheap.h"
  #include "opts.h"
  #include "diagnostic.h"
+#include "dumpfile.h"
  
  enum upper_128bits_state
  {
@@ -1874,6 +1875,10 @@ struct processor_costs generic32_cost = {
    1,                                   /* cond_not_taken_branch_cost.  */
  };
  
+/* Set by -mtune.  */
+const struct processor_costs *ix86_tune_cost = &pentium_cost;
+
+/* Set by -mtune or -Os.  */
  const struct processor_costs *ix86_cost = &pentium_cost;
  
  /* Processor feature/optimization bitmasks.  */
@@ -2408,7 +2413,6 @@ struct ix86_frame
    int va_arg_size;
    int red_zone_size;
    int outgoing_arguments_size;
-  HOST_WIDE_INT frame;
  
    /* The offsets relative to ARG_POINTER.  */
    HOST_WIDE_INT frame_pointer_offset;
@@ -3280,7 +3284,7 @@ ix86_option_override_internal (bool main_args_p)
                    "large", "32");
           else if (TARGET_X32)
             error ("code model %qs not supported in x32 mode",
-                  "medium");
+                  "large");
           break;
  
         case CM_32:
@@ -3547,10 +3551,11 @@ ix86_option_override_internal (bool main_args_p)
         flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
      }
  
+  ix86_tune_cost = processor_target_table[ix86_tune].cost;
    if (optimize_size)
      ix86_cost = &ix86_size_cost;
    else
-    ix86_cost = processor_target_table[ix86_tune].cost;
+    ix86_cost = ix86_tune_cost;
  
    /* Arrange to set up i386_stack_locals for all functions.  */
    init_machine_status = ix86_init_machine_status;
@@ -3660,7 +3665,7 @@ ix86_option_override_internal (bool main_args_p)
    ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
    if (global_options_set.x_ix86_preferred_stack_boundary_arg)
      {
-      int min = (TARGET_64BIT ? 4 : 2);
+      int min = (TARGET_64BIT ? (TARGET_SSE ? 4 : 3) : 2);
        int max = (TARGET_SEH ? 4 : 12);
  
        if (ix86_preferred_stack_boundary_arg < min
@@ -3795,16 +3800,19 @@ ix86_option_override_internal (bool main_args_p)
      flag_schedule_insns_after_reload = flag_schedule_insns = 0;
  
    maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
-                        ix86_cost->simultaneous_prefetches,
+                        ix86_tune_cost->simultaneous_prefetches,
                          global_options.x_param_values,
                          global_options_set.x_param_values);
-  maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, ix86_cost->prefetch_block,
+  maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
+                        ix86_tune_cost->prefetch_block,
                          global_options.x_param_values,
                          global_options_set.x_param_values);
-  maybe_set_param_value (PARAM_L1_CACHE_SIZE, ix86_cost->l1_cache_size,
+  maybe_set_param_value (PARAM_L1_CACHE_SIZE,
+                        ix86_tune_cost->l1_cache_size,
                          global_options.x_param_values,
                          global_options_set.x_param_values);
-  maybe_set_param_value (PARAM_L2_CACHE_SIZE, ix86_cost->l2_cache_size,
+  maybe_set_param_value (PARAM_L2_CACHE_SIZE,
+                        ix86_tune_cost->l2_cache_size,
                          global_options.x_param_values,
                          global_options_set.x_param_values);
  
@@ -8551,11 +8559,16 @@ ix86_frame_pointer_required (void)
    if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
      return true;
  
+  /* Win64 SEH, very large frames need a frame-pointer as maximum stack
+     allocation is 4GB.  */
+  if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
+    return true;
+
    /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
       turns off the frame pointer by default.  Turn it back on now if
       we've not got a leaf function.  */
    if (TARGET_OMIT_LEAF_FRAME_POINTER
-      && (!current_function_is_leaf
+      && (!crtl->is_leaf
           || ix86_current_function_calls_tls_descriptor))
      return true;
  
@@ -8802,7 +8815,7 @@ gen_pop (rtx arg)
  static unsigned int
  ix86_select_alt_pic_regnum (void)
  {
-  if (current_function_is_leaf
+  if (crtl->is_leaf
        && !crtl->profile
        && !ix86_current_function_calls_tls_descriptor)
      {
@@ -8951,9 +8964,9 @@ ix86_builtin_setjmp_frame_value (void)
  static void
  ix86_compute_frame_layout (struct ix86_frame *frame)
  {
-  unsigned int stack_alignment_needed;
+  unsigned HOST_WIDE_INT stack_alignment_needed;
    HOST_WIDE_INT offset;
-  unsigned int preferred_alignment;
+  unsigned HOST_WIDE_INT preferred_alignment;
    HOST_WIDE_INT size = get_frame_size ();
    HOST_WIDE_INT to_allocate;
  
@@ -8966,7 +8979,7 @@ ix86_compute_frame_layout (struct ix86_frame *frame)
    /* 64-bit MS ABI seem to require stack alignment to be always 16 except for
       function prologues and leaf.  */
    if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
-      && (!current_function_is_leaf || cfun->calls_alloca != 0
+      && (!crtl->is_leaf || cfun->calls_alloca != 0
            || ix86_current_function_calls_tls_descriptor))
      {
        preferred_alignment = 16;
@@ -9044,6 +9057,11 @@ ix86_compute_frame_layout (struct ix86_frame *frame)
    offset += frame->nregs * UNITS_PER_WORD;
    frame->reg_save_offset = offset;
  
+  /* On SEH target, registers are pushed just before the frame pointer
+     location.  */
+  if (TARGET_SEH)
+    frame->hard_frame_pointer_offset = offset;
+
    /* Align and set SSE register save area.  */
    if (frame->nsseregs)
      {
@@ -9071,7 +9089,7 @@ ix86_compute_frame_layout (struct ix86_frame *frame)
    if (stack_realign_fp
        || offset != frame->sse_reg_save_offset
        || size != 0
-      || !current_function_is_leaf
+      || !crtl->is_leaf
        || cfun->calls_alloca
        || ix86_current_function_calls_tls_descriptor)
      offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
@@ -9087,7 +9105,7 @@ ix86_compute_frame_layout (struct ix86_frame *frame)
       expander assumes that last crtl->outgoing_args_size
       of stack frame are unused.  */
    if (ACCUMULATE_OUTGOING_ARGS
-      && (!current_function_is_leaf || cfun->calls_alloca
+      && (!crtl->is_leaf || cfun->calls_alloca
           || ix86_current_function_calls_tls_descriptor))
      {
        offset += crtl->outgoing_args_size;
@@ -9098,7 +9116,7 @@ ix86_compute_frame_layout (struct ix86_frame *frame)
  
    /* Align stack boundary.  Only needed if we're calling another function
       or using alloca.  */
-  if (!current_function_is_leaf || cfun->calls_alloca
+  if (!crtl->is_leaf || cfun->calls_alloca
        || ix86_current_function_calls_tls_descriptor)
      offset = (offset + preferred_alignment - 1) & -preferred_alignment;
  
@@ -9113,8 +9131,8 @@ ix86_compute_frame_layout (struct ix86_frame *frame)
      frame->save_regs_using_mov = false;
  
    if (ix86_using_red_zone ()
-      && current_function_sp_is_unchanging
-      && current_function_is_leaf
+      && crtl->sp_is_unchanging
+      && crtl->is_leaf
        && !ix86_current_function_calls_tls_descriptor)
      {
        frame->red_zone_size = to_allocate;
@@ -9135,9 +9153,12 @@ ix86_compute_frame_layout (struct ix86_frame *frame)
      {
        HOST_WIDE_INT diff;
  
-      /* If we can leave the frame pointer where it is, do so.  */
+      /* If we can leave the frame pointer where it is, do so.  Also, returns
+        the establisher frame for __builtin_frame_address (0).  */
        diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
-      if (diff > 240 || (diff & 15) != 0)
+      if (diff <= SEH_MAX_FRAME_SIZE
+         && (diff > 240 || (diff & 15) != 0)
+         && !crtl->accesses_prior_frames)
         {
           /* Ideally we'd determine what portion of the local stack frame
              (within the constraint of the lowest 240) is most heavily used.
@@ -9186,7 +9207,7 @@ choose_baseaddr (HOST_WIDE_INT cfa_offset)
    if (m->use_fast_prologue_epilogue)
      {
        /* Choose the base register most likely to allow the most scheduling
-         opportunities.  Generally FP is valid througout the function,
+         opportunities.  Generally FP is valid throughout the function,
           while DRAP must be reloaded within the epilogue.  But choose either
           over the SP due to increased encoding size.  */
  
@@ -10055,7 +10076,7 @@ ix86_finalize_stack_realign_flags (void)
      = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
         ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
    unsigned int stack_realign = (incoming_stack_boundary
-                               < (current_function_is_leaf
+                               < (crtl->is_leaf
                                    ? crtl->max_used_stack_slot_alignment
                                    : crtl->stack_alignment_needed));
  
@@ -10074,9 +10095,9 @@ ix86_finalize_stack_realign_flags (void)
    if (stack_realign
        && !crtl->need_drap
        && frame_pointer_needed
-      && current_function_is_leaf
+      && crtl->is_leaf
        && flag_omit_frame_pointer
-      && current_function_sp_is_unchanging
+      && crtl->sp_is_unchanging
        && !ix86_current_function_calls_tls_descriptor
        && !crtl->accesses_prior_frames
        && !cfun->calls_alloca
@@ -10139,6 +10160,7 @@ ix86_expand_prologue (void)
    struct ix86_frame frame;
    HOST_WIDE_INT allocate;
    bool int_registers_saved;
+  bool sse_registers_saved;
  
    ix86_finalize_stack_realign_flags ();
  
@@ -10291,6 +10313,9 @@ ix86_expand_prologue (void)
        m->fs.realigned = true;
      }
  
+  int_registers_saved = (frame.nregs == 0);
+  sse_registers_saved = (frame.nsseregs == 0);
+
    if (frame_pointer_needed && !m->fs.fp_valid)
      {
        /* Note: AT&T enter does NOT have reversed args.  Enter is probably
@@ -10298,6 +10323,17 @@ ix86_expand_prologue (void)
        insn = emit_insn (gen_push (hard_frame_pointer_rtx));
        RTX_FRAME_RELATED_P (insn) = 1;
  
+      /* Push registers now, before setting the frame pointer
+        on SEH target.  */
+      if (!int_registers_saved
+         && TARGET_SEH
+         && !frame.save_regs_using_mov)
+       {
+         ix86_emit_save_regs ();
+         int_registers_saved = true;
+         gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
+       }
+
        if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
         {
           insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
@@ -10310,8 +10346,6 @@ ix86_expand_prologue (void)
         }
      }
  
-  int_registers_saved = (frame.nregs == 0);
-
    if (!int_registers_saved)
      {
        /* If saving registers via PUSH, do so now.  */
@@ -10388,6 +10422,27 @@ ix86_expand_prologue (void)
        current_function_static_stack_size = stack_size;
      }
  
+  /* On SEH target with very large frame size, allocate an area to save
+     SSE registers (as the very large allocation won't be described).  */
+  if (TARGET_SEH
+      && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
+      && !sse_registers_saved)
+    {
+      HOST_WIDE_INT sse_size =
+       frame.sse_reg_save_offset - frame.reg_save_offset;
+
+      gcc_assert (int_registers_saved);
+
+      /* No need to do stack checking as the area will be immediately
+        written.  */
+      pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+                                GEN_INT (-sse_size), -1,
+                                m->fs.cfa_reg == stack_pointer_rtx);
+      allocate -= sse_size;
+      ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
+      sse_registers_saved = true;
+    }
+
    /* The stack has already been decremented by the instruction calling us
       so probe if the size is non-negative to preserve the protection area.  */
    if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
@@ -10512,7 +10567,7 @@ ix86_expand_prologue (void)
  
    if (!int_registers_saved)
      ix86_emit_save_regs_using_mov (frame.reg_save_offset);
-  if (frame.nsseregs)
+  if (!sse_registers_saved)
      ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
  
    pic_reg_used = false;
@@ -10785,7 +10840,7 @@ ix86_expand_epilogue (int style)
    ix86_compute_frame_layout (&frame);
  
    m->fs.sp_valid = (!frame_pointer_needed
-                   || (current_function_sp_is_unchanging
+                   || (crtl->sp_is_unchanging
                         && !stack_realign_fp));
    gcc_assert (!m->fs.sp_valid
               || m->fs.sp_offset == frame.stack_pointer_offset);
@@ -10968,8 +11023,13 @@ ix86_expand_epilogue (int style)
         }
  
        /* First step is to deallocate the stack frame so that we can
-        pop the registers.  */
-      if (!m->fs.sp_valid)
+        pop the registers.  Also do it on SEH target for very large
+        frame as the emitted instructions aren't allowed by the ABI in
+        epilogues.  */
+      if (!m->fs.sp_valid
+         || (TARGET_SEH
+             && (m->fs.sp_offset - frame.reg_save_offset
+                 >= SEH_MAX_FRAME_SIZE)))
         {
           pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
                                      GEN_INT (m->fs.fp_offset
@@ -11516,9 +11576,18 @@ ix86_decompose_address (rtx addr, struct ix86_address *out)
    int retval = 1;
    enum ix86_address_seg seg = SEG_DEFAULT;
  
+  /* Allow SImode subregs of DImode addresses,
+     they will be emitted with addr32 prefix.  */
+  if (TARGET_64BIT && GET_MODE (addr) == SImode)
+    {
+      if (GET_CODE (addr) == SUBREG
+         && GET_MODE (XEXP (addr, 0)) == DImode)
+       addr = SUBREG_REG (addr);
+    }
+
    /* Allow zero-extended SImode addresses,
       they will be emitted with addr32 prefix.  */
-  if (TARGET_64BIT && GET_MODE (addr) == DImode)
+  else if (TARGET_64BIT && GET_MODE (addr) == DImode)
      {
        if (GET_CODE (addr) == ZERO_EXTEND
           && GET_MODE (XEXP (addr, 0)) == SImode)
@@ -12811,13 +12880,13 @@ legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
      case TLS_MODEL_INITIAL_EXEC:
        if (TARGET_64BIT)
         {
-         if (TARGET_SUN_TLS)
+         if (TARGET_SUN_TLS && !TARGET_X32)
             {
               /* The Sun linker took the AMD64 TLS spec literally
                  and can only handle %rax as destination of the
                  initial executable code sequence.  */
  
-             dest = gen_reg_rtx (Pmode);
+             dest = gen_reg_rtx (DImode);
               emit_insn (gen_tls_initial_exec_64_sun (dest, x));
               return dest;
             }
@@ -13932,8 +14001,8 @@ get_some_local_dynamic_name (void)
     C -- print opcode suffix for set/cmov insn.
     c -- like C, but print reversed condition
     F,f -- likewise, but for floating-point.
-   O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, print the opcode suffix for
-       the size of the current operand, otherwise nothing.
+   O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
+       otherwise nothing
     R -- print the prefix for register names.
     z -- print the opcode suffix for the size of the current operand.
     Z -- likewise, with special suffixes for x87 instructions.
@@ -14062,6 +14131,8 @@ ix86_print_operand (FILE *file, rtx x, int code)
                 ("invalid operand size for operand code 'O'");
               return;
             }
+
+         putc ('.', file);
  #endif
           return;
  
@@ -14321,20 +14392,21 @@ ix86_print_operand (FILE *file, rtx x, int code)
             }
           return;
  
-       case 'C':
-       case 'c':
         case 'F':
         case 'f':
+#ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
+         if (ASSEMBLER_DIALECT == ASM_ATT)
+           putc ('.', file);
+#endif
+
+       case 'C':
+       case 'c':
           if (!COMPARISON_P (x))
             {
               output_operand_lossage ("operand is not a condition code, "
                                       "invalid operand code '%c'", code);
               return;
             }
-#ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
-         if (ASSEMBLER_DIALECT == ASM_ATT)
-           putc ('.', file);
-#endif
           put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
                               code == 'c' || code == 'f',
                               code == 'F' || code == 'f',
@@ -14692,10 +14764,10 @@ ix86_print_operand_address (FILE *file, rtx addr)
      }
    else
      {
-      /* Print SImode register names for zero-extended
-        addresses to force addr32 prefix.  */
+      /* Print SImode register names to force addr32 prefix.  */
        if (TARGET_64BIT
-         && (GET_CODE (addr) == ZERO_EXTEND
+         && (GET_CODE (addr) == SUBREG
+             || GET_CODE (addr) == ZERO_EXTEND
               || GET_CODE (addr) == AND))
         {
           gcc_assert (!code);
@@ -16792,6 +16864,11 @@ ix86_avoid_lea_for_add (rtx insn, rtx operands[])
    unsigned int regno1 = true_regnum (operands[1]);
    unsigned int regno2 = true_regnum (operands[2]);
  
+  /* FIXME: Handle zero-extended addresses.  */
+  if (GET_CODE (operands[1]) == ZERO_EXTEND
+      || GET_CODE (operands[1]) == AND)
+    return false;
+
    /* Check if we need to optimize.  */
    if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
      return false;
@@ -19922,7 +19999,7 @@ ix86_expand_vec_perm (rtx operands[])
               t1 = gen_reg_rtx (V8SImode);
               t2 = gen_reg_rtx (V8SImode);
               emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
-             emit_insn (gen_avx2_permvarv8si (t2, op0, mask));
+             emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
               goto merge_two;
             }
           return;
@@ -19955,10 +20032,10 @@ ix86_expand_vec_perm (rtx operands[])
  
          case V4SFmode:
           t1 = gen_reg_rtx (V8SFmode);
-         t2 = gen_reg_rtx (V8SFmode);
-         mask = gen_lowpart (V4SFmode, mask);
+         t2 = gen_reg_rtx (V8SImode);
+         mask = gen_lowpart (V4SImode, mask);
           emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
-         emit_insn (gen_avx_vec_concatv8sf (t2, mask, mask));
+         emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
           emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
           emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
           return;
@@ -20177,10 +20254,10 @@ ix86_expand_vec_perm (rtx operands[])
     true if we want the N/2 high elements, else the low elements.  */
  
  void
-ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p)
+ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
  {
-  enum machine_mode imode = GET_MODE (operands[1]);
-  rtx tmp, dest;
+  enum machine_mode imode = GET_MODE (src);
+  rtx tmp;
  
    if (TARGET_SSE4_1)
      {
@@ -20242,20 +20319,20 @@ ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p)
        if (GET_MODE_SIZE (imode) == 32)
         {
           tmp = gen_reg_rtx (halfmode);
-         emit_insn (extract (tmp, operands[1]));
+         emit_insn (extract (tmp, src));
         }
        else if (high_p)
         {
           /* Shift higher 8 bytes to lower 8 bytes.  */
           tmp = gen_reg_rtx (imode);
           emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
-                                        gen_lowpart (V1TImode, operands[1]),
+                                        gen_lowpart (V1TImode, src),
                                          GEN_INT (64)));
         }
        else
-       tmp = operands[1];
+       tmp = src;
  
-      emit_insn (unpack (operands[0], tmp));
+      emit_insn (unpack (dest, tmp));
      }
    else
      {
@@ -20285,15 +20362,13 @@ ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p)
           gcc_unreachable ();
         }
  
-      dest = gen_lowpart (imode, operands[0]);
-
        if (unsigned_p)
         tmp = force_reg (imode, CONST0_RTX (imode));
        else
         tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
-                                  operands[1], pc_rtx, pc_rtx);
+                                  src, pc_rtx, pc_rtx);
  
-      emit_insn (unpack (dest, operands[1], tmp));
+      emit_insn (unpack (gen_lowpart (imode, dest), src, tmp));
      }
  }
  
@@ -23030,7 +23105,7 @@ ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
  
  /* For given symbol (function) construct code to compute address of it's PLT
     entry in large x86-64 PIC model.  */
-rtx
+static rtx
  construct_plt_address (rtx symbol)
  {
    rtx tmp, unspec;
@@ -23855,6 +23930,114 @@ ia32_multipass_dfa_lookahead (void)
      }
  }
  
+/* Try to reorder the ready list to take advantage of Atom pipelined IMUL
+   execution.  The reordering is applied if
+   (1) an SImode IMUL instruction is on the top of the ready list, and
+   (2) the ready list holds a non-IMUL instruction that is the only
+       producer of some other (independent) SImode IMUL instruction;
+   in that case the found producer is moved to the top of the ready list.
+   Returns the issue rate in every case.  */
+
+static int
+ix86_sched_reorder(FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
+                   int clock_var ATTRIBUTE_UNUSED)
+{
+  static int issue_rate = -1;
+  int n_ready = *pn_ready;
+  rtx insn, insn1, insn2;
+  int i;
+  sd_iterator_def sd_it;
+  dep_t dep;
+  int index = -1;
+
+  /* Set up issue rate.  */
+  issue_rate = ix86_issue_rate();
+
+  /* Do reordering for Atom only.  */
+  if (ix86_tune != PROCESSOR_ATOM)
+    return issue_rate;
+  /* Nothing to do if ready list contains only 1 instruction.  */
+  if (n_ready <= 1)
+    return issue_rate;
+
+  /* Check that IMUL instruction is on the top of ready list.  */
+  insn = ready[n_ready - 1];
+  if (!NONDEBUG_INSN_P (insn))
+    return issue_rate;
+  insn = PATTERN (insn);
+  if (GET_CODE (insn) == PARALLEL)
+    insn = XVECEXP (insn, 0, 0);
+  if (GET_CODE (insn) != SET)
+    return issue_rate;
+  if (!(GET_CODE (SET_SRC (insn)) == MULT
+      && GET_MODE (SET_SRC (insn)) == SImode))
+    return issue_rate;
+
+  /* Search for producer of independent IMUL instruction.  */
+  for (i = n_ready - 2; i>= 0; i--)
+    {
+      insn = ready[i];
+      if (!NONDEBUG_INSN_P (insn))
+        continue;
+      /* Skip IMUL instruction.  */
+      insn2 = PATTERN (insn);
+      if (GET_CODE (insn2) == PARALLEL)
+        insn2 = XVECEXP (insn2, 0, 0);
+      if (GET_CODE (insn2) == SET
+          && GET_CODE (SET_SRC (insn2)) == MULT
+          && GET_MODE (SET_SRC (insn2)) == SImode)
+        continue;
+
+      FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
+        {
+          rtx con;
+         con = DEP_CON (dep);
+         if (!NONDEBUG_INSN_P (con))
+           continue;
+          insn1 = PATTERN (con);
+          if (GET_CODE (insn1) == PARALLEL)
+            insn1 = XVECEXP (insn1, 0, 0);
+
+          if (GET_CODE (insn1) == SET
+              && GET_CODE (SET_SRC (insn1)) == MULT
+              && GET_MODE (SET_SRC (insn1)) == SImode)
+            {
+              sd_iterator_def sd_it1;
+              dep_t dep1;
+              /* Check that INSN is the only non-debug producer of this IMUL.  */
+              index = i;
+              FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
+                {
+                  rtx pro;
+                  pro = DEP_PRO (dep1);
+                 if (!NONDEBUG_INSN_P (pro))
+                   continue;
+                  if (pro != insn)
+                    index = -1;
+               }
+              if (index >= 0)
+                break;
+            }
+        }
+      if (index >= 0)
+        break;
+    }
+  if (index < 0)
+    return issue_rate; /* Didn't find IMUL producer.  */
+
+  if (sched_verbose > 1)
+    fprintf(dump, ";;\tatom sched_reorder: swap %d and %d insns\n",
+            INSN_UID (ready[index]), INSN_UID (ready[n_ready - 1]));
+
+  /* Move IMUL producer (ready[index]) to the top of the ready list.  */
+  insn1= ready[index];
+  for (i = index; i < n_ready - 1; i++)
+    ready[i] = ready[i + 1];
+  ready[n_ready - 1] = insn1;
+
+  return issue_rate;
+}
+
  \f
  
  /* Model decoder of Core 2/i7.
@@ -26443,7 +26626,7 @@ static const struct builtin_description bdesc_args[] =
    { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
  
    { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
  
    { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
  
@@ -26835,13 +27018,13 @@ static const struct builtin_description bdesc_args[] =
    { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2  , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
    { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2  , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
    { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2  , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
-  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mulv4siv4di3  , "__builtin_ia32_pmuldq256"  , IX86_BUILTIN_PMULDQ256  , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
    { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
    { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
    { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256"  , IX86_BUILTIN_PMULHW256  , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
    { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256"  , IX86_BUILTIN_PMULLW256  , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
    { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256"  , IX86_BUILTIN_PMULLD256  , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
-  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulv4siv4di3  , "__builtin_ia32_pmuludq256" , IX86_BUILTIN_PMULUDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
    { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
    { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
    { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
@@ -27914,6 +28097,10 @@ ix86_fold_builtin (tree fndecl, int n_args,
         }
      }
  
+#ifdef SUBTARGET_FOLD_BUILTIN
+  return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
+#endif
+
    return NULL_TREE;
  }
  
@@ -28974,6 +29161,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
      case V2DI_FTYPE_V2DI_V2DI:
      case V2DI_FTYPE_V16QI_V16QI:
      case V2DI_FTYPE_V4SI_V4SI:
+    case V2UDI_FTYPE_V4USI_V4USI:
      case V2DI_FTYPE_V2DI_V16QI:
      case V2DI_FTYPE_V2DF_V2DF:
      case V2SI_FTYPE_V2SI_V2SI:
@@ -28998,6 +29186,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
      case V8SI_FTYPE_V16HI_V16HI:
      case V4DI_FTYPE_V4DI_V4DI:
      case V4DI_FTYPE_V8SI_V8SI:
+    case V4UDI_FTYPE_V8USI_V8USI:
        if (comparison == UNKNOWN)
         return ix86_expand_binop_builtin (icode, exp, target);
        nargs = 2;
@@ -31788,7 +31977,7 @@ ix86_set_reg_reg_cost (enum machine_mode mode)
        break;
  
      case MODE_FLOAT:
-      if ((TARGET_SSE2 && mode == TFmode)
+      if ((TARGET_SSE && mode == TFmode)
           || (TARGET_80387 && mode == XFmode)
           || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
           || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
@@ -31796,7 +31985,7 @@ ix86_set_reg_reg_cost (enum machine_mode mode)
        break;
  
      case MODE_COMPLEX_FLOAT:
-      if ((TARGET_SSE2 && mode == TCmode)
+      if ((TARGET_SSE && mode == TCmode)
           || (TARGET_80387 && mode == XCmode)
           || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
           || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
@@ -31822,9 +32011,10 @@ ix86_set_reg_reg_cost (enum machine_mode mode)
     scanned.  In either case, *TOTAL contains the cost result.  */
  
  static bool
-ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
+ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
                 bool speed)
  {
+  enum rtx_code code = (enum rtx_code) code_i;
    enum rtx_code outer_code = (enum rtx_code) outer_code_i;
    enum machine_mode mode = GET_MODE (x);
    const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
@@ -31860,25 +32050,42 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
  
      case CONST_DOUBLE:
        if (mode == VOIDmode)
-       *total = 0;
-      else
-       switch (standard_80387_constant_p (x))
-         {
-         case 1: /* 0.0 */
-           *total = 1;
-           break;
-         default: /* Other constants */
-           *total = 2;
-           break;
-         case 0:
-         case -1:
-           /* Start with (MEM (SYMBOL_REF)), since that's where
-              it'll probably end up.  Add a penalty for size.  */
-           *total = (COSTS_N_INSNS (1)
-                     + (flag_pic != 0 && !TARGET_64BIT)
-                     + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
-           break;
-         }
+       {
+         *total = 0;
+         return true;
+       }
+      switch (standard_80387_constant_p (x))
+       {
+       case 1: /* 0.0 */
+         *total = 1;
+         return true;
+       default: /* Other constants */
+         *total = 2;
+         return true;
+       case 0:
+       case -1:
+         break;
+       }
+      if (SSE_FLOAT_MODE_P (mode))
+       {
+    case CONST_VECTOR:
+         switch (standard_sse_constant_p (x))
+           {
+           case 0:
+             break;
+           case 1:  /* 0: xor eliminates false dependency */
+             *total = 0;
+             return true;
+           default: /* -1: cmp contains false dependency */
+             *total = 1;
+             return true;
+           }
+       }
+      /* Fall back to (MEM (SYMBOL_REF)), since that's where
+        it'll probably end up.  Add a penalty for size.  */
+      *total = (COSTS_N_INSNS (1)
+               + (flag_pic != 0 && !TARGET_64BIT)
+               + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
        return true;
  
      case ZERO_EXTEND:
@@ -31898,8 +32105,9 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
        return false;
  
      case ASHIFT:
-      if (CONST_INT_P (XEXP (x, 1))
-         && (GET_MODE (XEXP (x, 0)) != DImode || TARGET_64BIT))
+      if (SCALAR_INT_MODE_P (mode)
+         && GET_MODE_SIZE (mode) < UNITS_PER_WORD
+         && CONST_INT_P (XEXP (x, 1)))
         {
           HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
           if (value == 1)
@@ -31920,7 +32128,38 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
      case ASHIFTRT:
      case LSHIFTRT:
      case ROTATERT:
-      if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode)
+      if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+       {
+         /* ??? Should be SSE vector operation cost.  */
+         /* At least for published AMD latencies, this really is the same
+            as the latency for a simple fpu operation like fabs.  */
+         /* V*QImode is emulated with 1-11 insns.  */
+         if (mode == V16QImode || mode == V32QImode)
+           {
+             int count = 11;
+             if (TARGET_XOP && mode == V16QImode)
+               {
+                 /* For XOP we use vpshab, which requires a broadcast of the
+                    value to the variable shift insn.  For constants this
+                    means a V16Q const in mem; even when we can perform the
+                    shift with one insn set the cost to prefer paddb.  */
+                 if (CONSTANT_P (XEXP (x, 1)))
+                   {
+                     *total = (cost->fabs
+                               + rtx_cost (XEXP (x, 0), code, 0, speed)
+                               + (speed ? 2 : COSTS_N_BYTES (16)));
+                     return true;
+                   }
+                 count = 3;
+               }
+             else if (TARGET_SSSE3)
+               count = 7;
+             *total = cost->fabs * count;
+           }
+         else
+           *total = cost->fabs;
+       }
+      else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
         {
           if (CONST_INT_P (XEXP (x, 1)))
             {
@@ -31989,6 +32228,34 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
           *total = cost->fmul;
           return false;
         }
+      else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+       {
+         /* V*QImode is emulated with 7-13 insns.  */
+         if (mode == V16QImode || mode == V32QImode)
+           {
+             int extra = 11;
+             if (TARGET_XOP && mode == V16QImode)
+               extra = 5;
+             else if (TARGET_SSSE3)
+               extra = 6;
+             *total = cost->fmul * 2 + cost->fabs * extra;
+           }
+         /* V*DImode is emulated with 5-8 insns.  */
+         else if (mode == V2DImode || mode == V4DImode)
+           {
+             if (TARGET_XOP && mode == V2DImode)
+               *total = cost->fmul * 2 + cost->fabs * 3;
+             else
+               *total = cost->fmul * 3 + cost->fabs * 5;
+           }
+         /* Without sse4.1, we don't have PMULLD; it's emulated with 7
+            insns, including two PMULUDQ.  */
+         else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
+           *total = cost->fmul * 2 + cost->fabs * 5;
+         else
+           *total = cost->fmul;
+         return false;
+       }
        else
         {
           rtx op0 = XEXP (x, 0);
@@ -32053,7 +32320,7 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
  
      case PLUS:
        if (GET_MODE_CLASS (mode) == MODE_INT
-              && GET_MODE_BITSIZE (mode) <= GET_MODE_BITSIZE (Pmode))
+         && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
         {
           if (GET_CODE (XEXP (x, 0)) == PLUS
               && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
@@ -32121,7 +32388,7 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
      case AND:
      case IOR:
      case XOR:
-      if (!TARGET_64BIT && mode == DImode)
+      if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
         {
           *total = (cost->add * 2
                     + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
@@ -32153,7 +32420,14 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
        /* FALLTHRU */
  
      case NOT:
-      if (!TARGET_64BIT && mode == DImode)
+      if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+       {
+         /* ??? Should be SSE vector operation cost.  */
+         /* At least for published AMD latencies, this really is the same
+            as the latency for a simple fpu operation like fabs.  */
+         *total = cost->fabs;
+       }
+      else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
         *total = cost->add * 2;
        else
         *total = cost->add;
@@ -32213,7 +32487,7 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
        /* ??? Assume all of these vector manipulation patterns are
          recognizable.  In which case they all pretty much have the
          same cost.  */
-     *total = COSTS_N_INSNS (1);
+     *total = cost->fabs;
       return true;
  
      default:
@@ -32605,6 +32879,18 @@ x86_output_mi_thunk (FILE *file,
  {
    rtx this_param = x86_this_parameter (function);
    rtx this_reg, tmp, fnaddr;
+  unsigned int tmp_regno;
+
+  if (TARGET_64BIT)
+    tmp_regno = R10_REG;
+  else
+    {
+      unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
+      if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
+       tmp_regno = AX_REG;
+      else
+       tmp_regno = CX_REG;
+    }
  
    emit_note (NOTE_INSN_PROLOGUE_END);
  
@@ -32631,7 +32917,7 @@ x86_output_mi_thunk (FILE *file,
         {
           if (!x86_64_general_operand (delta_rtx, Pmode))
             {
-             tmp = gen_rtx_REG (Pmode, R10_REG);
+             tmp = gen_rtx_REG (Pmode, tmp_regno);
               emit_move_insn (tmp, delta_rtx);
               delta_rtx = tmp;
             }
@@ -32644,18 +32930,7 @@ x86_output_mi_thunk (FILE *file,
    if (vcall_offset)
      {
        rtx vcall_addr, vcall_mem, this_mem;
-      unsigned int tmp_regno;
  
-      if (TARGET_64BIT)
-       tmp_regno = R10_REG;
-      else
-       {
-         unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
-         if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
-           tmp_regno = AX_REG;
-         else
-           tmp_regno = CX_REG;
-       }
        tmp = gen_rtx_REG (Pmode, tmp_regno);
  
        this_mem = gen_rtx_MEM (ptr_mode, this_reg);
@@ -32730,6 +33005,19 @@ x86_output_mi_thunk (FILE *file,
      emit_jump_insn (gen_indirect_jump (fnaddr));
    else
      {
+      if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
+       fnaddr = legitimize_pic_address (fnaddr,
+                                        gen_rtx_REG (Pmode, tmp_regno));
+
+      if (!sibcall_insn_operand (fnaddr, word_mode))
+       {
+         tmp = gen_rtx_REG (word_mode, tmp_regno);
+         if (GET_MODE (fnaddr) != word_mode)
+           fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
+         emit_move_insn (tmp, fnaddr);
+         fnaddr = tmp;
+       }
+
        tmp = gen_rtx_MEM (QImode, fnaddr);
        tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
        tmp = emit_call_insn (tmp);
@@ -33110,7 +33398,7 @@ ix86_count_insn (basic_block bb)
    return min_prev_count;
  }
  
-/* Pad short funtion to 4 instructions.   */
+/* Pad short function to 4 instructions.   */
  
  static void
  ix86_pad_short_function (void)
@@ -34408,7 +34696,7 @@ half:
      }
    else
      {
-      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
+      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
  
        emit_move_insn (mem, target);
  
@@ -34625,7 +34913,7 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
      }
    else
      {
-      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
+      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
  
        emit_move_insn (mem, vec);
  
@@ -35962,9 +36250,11 @@ static const struct attribute_spec ix86_attribute_table[] =
  /* Implement targetm.vectorize.builtin_vectorization_cost.  */
  static int
  ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
-                                 tree vectype ATTRIBUTE_UNUSED,
+                                 tree vectype,
                                   int misalign ATTRIBUTE_UNUSED)
  {
+  unsigned elements;
+
    switch (type_of_cost)
      {
        case scalar_stmt:
@@ -36005,6 +36295,10 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
        case vec_promote_demote:
          return ix86_cost->vec_stmt_cost;
  
+      case vec_construct:
+       elements = TYPE_VECTOR_SUBPARTS (vectype);
+       return elements / 2 + 1;
+
        default:
          gcc_unreachable ();
      }
@@ -36475,12 +36769,6 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
                                 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
    vperm = force_reg (vmode, vperm);
  
-  if (vmode == V8SImode && d->vmode == V8SFmode)
-    {
-      vmode = V8SFmode;
-      vperm = gen_lowpart (vmode, vperm);
-    }
-
    target = gen_lowpart (vmode, d->target);
    op0 = gen_lowpart (vmode, d->op0);
    if (d->one_operand_p)
@@ -36913,7 +37201,7 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
         {
           if (d->perm[0] / nelt2 == nonzero_halves[1])
             {
-             /* Attempt to increase the likelyhood that dfinal
+             /* Attempt to increase the likelihood that dfinal
                  shuffle will be intra-lane.  */
               char tmph = nonzero_halves[0];
               nonzero_halves[0] = nonzero_halves[1];
@@ -38290,6 +38578,419 @@ ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
    expand_vec_perm_even_odd_1 (&d, odd);
  }
  
+static void
+ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
+{
+  struct expand_vec_perm_d d;
+  unsigned i, nelt, base;
+  bool ok;
+  /* Interleave the low (or, if HIGH_P, high) halves of OP0 and OP1.  */
+  d.target = targ;
+  d.op0 = op0;
+  d.op1 = op1;
+  d.vmode = GET_MODE (targ);
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+  d.one_operand_p = false;
+  d.testing_p = false;
+  /* BASE picks the half; I + BASE + NELT presumably indexes into OP1.  */
+  base = high_p ? nelt / 2 : 0;
+  for (i = 0; i < nelt / 2; ++i)
+    {
+      d.perm[i * 2] = i + base;
+      d.perm[i * 2 + 1] = i + base + nelt;
+    }
+
+  /* Note that for AVX this isn't one instruction.  */
+  ok = ix86_expand_vec_perm_const_1 (&d);
+  gcc_assert (ok);
+}
+
+
+/* Expand DEST = OP1 CODE OP2 for a V*QImode vector operation in terms of
+   the same operation on V*HImode.  */
+
+void
+ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
+{
+  enum machine_mode qimode = GET_MODE (dest);
+  enum machine_mode himode;
+  rtx (*gen_il) (rtx, rtx, rtx);
+  rtx (*gen_ih) (rtx, rtx, rtx);
+  rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
+  struct expand_vec_perm_d d;
+  bool ok, full_interleave;
+  bool uns_p = false;
+  int i;
+
+  switch (qimode)
+    {
+    case V16QImode:
+      himode = V8HImode;
+      gen_il = gen_vec_interleave_lowv16qi;
+      gen_ih = gen_vec_interleave_highv16qi;
+      break;
+    case V32QImode:
+      himode = V16HImode;
+      gen_il = gen_avx2_interleave_lowv32qi;
+      gen_ih = gen_avx2_interleave_highv32qi;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  op2_l = op2_h = op2;
+  switch (code)
+    {
+    case MULT:
+      /* Unpack data such that we've got a source byte in each low byte of
+        each word.  We don't care what goes into the high byte of each word.
+        Rather than trying to get zero in there, most convenient is to let
+        it be a copy of the low byte.  */
+      op2_l = gen_reg_rtx (qimode);
+      op2_h = gen_reg_rtx (qimode);
+      emit_insn (gen_il (op2_l, op2, op2));
+      emit_insn (gen_ih (op2_h, op2, op2));
+      /* No fallthrough here: unpack OP1 the same way, then break.  */
+
+      op1_l = gen_reg_rtx (qimode);
+      op1_h = gen_reg_rtx (qimode);
+      emit_insn (gen_il (op1_l, op1, op1));
+      emit_insn (gen_ih (op1_h, op1, op1));
+      full_interleave = qimode == V16QImode;
+      break;
+
+    case ASHIFT:
+    case LSHIFTRT:
+      uns_p = true;
+      /* FALLTHRU */
+    case ASHIFTRT:
+      op1_l = gen_reg_rtx (himode);
+      op1_h = gen_reg_rtx (himode);
+      ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
+      ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
+      full_interleave = true;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  /* Perform the operation.  */
+  res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
+                              1, OPTAB_DIRECT);
+  res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
+                              1, OPTAB_DIRECT);
+  gcc_assert (res_l && res_h);
+
+  /* Merge the data back into the right place.  */
+  d.target = dest;
+  d.op0 = gen_lowpart (qimode, res_l);
+  d.op1 = gen_lowpart (qimode, res_h);
+  d.vmode = qimode;
+  d.nelt = GET_MODE_NUNITS (qimode);
+  d.one_operand_p = false;
+  d.testing_p = false;
+
+  if (full_interleave)
+    {
+      /* For SSE2, we used a full interleave, so the desired
+        results are in the even elements.  */
+      for (i = 0; i < 32; ++i)
+       d.perm[i] = i * 2;
+    }
+  else
+    {
+      /* For AVX, the interleave used above was not cross-lane.  So the
+        extraction is evens but with the second and third quarter swapped.
+        Happily, that is even one insn shorter than even extraction.  */
+      for (i = 0; i < 32; ++i)
+       d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
+    }
+
+  ok = ix86_expand_vec_perm_const_1 (&d);
+  gcc_assert (ok);
+  /* Note on the final insn that DEST equals CODE applied to OP1, OP2.  */
+  set_unique_reg_note (get_last_insn (), REG_EQUAL,
+                      gen_rtx_fmt_ee (code, qimode, op1, op2));
+}
+/* Set DEST to the widening product of the even (ODD_P: odd) elements.  */
+void
+ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
+                              bool uns_p, bool odd_p)
+{
+  enum machine_mode mode = GET_MODE (op1);
+  enum machine_mode wmode = GET_MODE (dest);
+  rtx x;
+
+  /* We only play even/odd games with vectors of SImode.  */
+  gcc_assert (mode == V4SImode || mode == V8SImode);
+
+  /* If we're looking for the odd results, shift those members down to
+     the even slots.  For some cpus this is faster than a PSHUFD.  */
+  if (odd_p)
+    {
+      if (TARGET_XOP && mode == V4SImode)
+       {
+         x = force_reg (wmode, CONST0_RTX (wmode));
+         emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
+         return;
+       }
+
+      x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
+      op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
+                         x, NULL, 1, OPTAB_DIRECT);
+      op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
+                         x, NULL, 1, OPTAB_DIRECT);
+      op1 = gen_lowpart (mode, op1);
+      op2 = gen_lowpart (mode, op2);
+    }
+
+  if (mode == V8SImode)
+    {
+      if (uns_p)
+       x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
+      else
+       x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
+    }
+  else if (uns_p)
+    x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
+  else if (TARGET_SSE4_1)
+    x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
+  else
+    {
+      rtx s1, s2, t0, t1, t2;
+
+      /* The easiest way to implement this without PMULDQ is to go through
+        the motions as if we are performing a full 64-bit multiply.  With
+        the exception that we need to do less shuffling of the elements.  */
+
+      /* Compute the sign-extension, aka highparts, of the two operands.  */
+      s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
+                               op1, pc_rtx, pc_rtx);
+      s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
+                               op2, pc_rtx, pc_rtx);
+
+      /* Multiply LO(A) * HI(B), and vice-versa.  */
+      t1 = gen_reg_rtx (wmode);
+      t2 = gen_reg_rtx (wmode);
+      emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
+      emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
+
+      /* Multiply LO(A) * LO(B).  */
+      t0 = gen_reg_rtx (wmode);
+      emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
+
+      /* Combine and shift the highparts into place.  */
+      t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
+      t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
+                        1, OPTAB_DIRECT);
+
+      /* Combine high and low parts.  */
+      force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
+      return;
+    }
+  emit_insn (x);
+}
+/* Set DEST to the widening product of the low (HIGH_P: high) halves.  */
+void
+ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
+                           bool uns_p, bool high_p)
+{
+  enum machine_mode wmode = GET_MODE (dest);
+  enum machine_mode mode = GET_MODE (op1);
+  rtx t1, t2, t3, t4, mask;
+
+  switch (mode)
+    {
+    case V4SImode:
+      t1 = gen_reg_rtx (mode);
+      t2 = gen_reg_rtx (mode);
+      if (TARGET_XOP && !uns_p)
+       {
+         /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
+            shuffle the elements once so that all elements are in the right
+            place for immediate use: { A C B D }.  */
+         emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
+                                       const1_rtx, GEN_INT (3)));
+         emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
+                                       const1_rtx, GEN_INT (3)));
+       }
+      else
+       {
+         /* Put the elements into place for the multiply.  */
+         ix86_expand_vec_interleave (t1, op1, op1, high_p);
+         ix86_expand_vec_interleave (t2, op2, op2, high_p);
+         high_p = false;
+       }
+      ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
+      break;
+
+    case V8SImode:
+      /* Shuffle the elements between the lanes.  After this we
+        have { A B E F | C D G H } for each operand.  */
+      t1 = gen_reg_rtx (V4DImode);
+      t2 = gen_reg_rtx (V4DImode);
+      emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
+                                     const0_rtx, const2_rtx,
+                                     const1_rtx, GEN_INT (3)));
+      emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
+                                     const0_rtx, const2_rtx,
+                                     const1_rtx, GEN_INT (3)));
+
+      /* Shuffle the elements within the lanes.  After this we
+        have { A A B B | C C D D } or { E E F F | G G H H }.  */
+      t3 = gen_reg_rtx (V8SImode);
+      t4 = gen_reg_rtx (V8SImode);
+      mask = GEN_INT (high_p
+                     ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
+                     : 0 + (0 << 2) + (1 << 4) + (1 << 6));
+      emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
+      emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
+
+      ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
+      break;
+
+    case V8HImode:
+    case V16HImode:  /* Interleave the MODE product with its highpart.  */
+      t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
+                        uns_p, OPTAB_DIRECT);
+      t2 = expand_binop (mode,
+                        uns_p ? umul_highpart_optab : smul_highpart_optab,
+                        op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
+      gcc_assert (t1 && t2);
+
+      ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p);
+      break;
+
+    case V16QImode:
+    case V32QImode:  /* Unpack both inputs to WMODE, then multiply.  */
+      t1 = gen_reg_rtx (wmode);
+      t2 = gen_reg_rtx (wmode);
+      ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
+      ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
+
+      emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+}
+/* Expand OP0 = OP1 * OP2 in V4SImode from two widening multiplies.  */
+void
+ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
+{
+  rtx res_1, res_2;
+
+  res_1 = gen_reg_rtx (V4SImode);
+  res_2 = gen_reg_rtx (V4SImode);
+  ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1),
+                                op1, op2, true, false);
+  ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2),
+                                op1, op2, true, true);
+
+  /* Move the results in element 2 down to element 1; we don't care
+     what goes in elements 2 and 3.  Then we can merge the parts
+     back together with an interleave.
+
+     Note that two other sequences were tried:
+     (1) Use interleaves at the start instead of psrldq, which allows
+     us to use a single shufps to merge things back at the end.
+     (2) Use shufps here to combine the two vectors, then pshufd to
+     put the elements in the correct order.
+     In both cases the cost of the reformatting stall was too high
+     and the overall sequence slower.  */
+
+  emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
+                               const0_rtx, const0_rtx));
+  emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
+                               const0_rtx, const0_rtx));
+  res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
+
+  set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
+}
+/* Expand OP0 = OP1 * OP2 in V2DImode or V4DImode from SImode multiplies.  */
+void
+ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
+{
+  enum machine_mode mode = GET_MODE (op0);
+  rtx t1, t2, t3, t4, t5, t6;
+
+  if (TARGET_XOP && mode == V2DImode)
+    {
+      /* op1: A,B,C,D, op2: E,F,G,H */
+      op1 = gen_lowpart (V4SImode, op1);
+      op2 = gen_lowpart (V4SImode, op2);
+
+      t1 = gen_reg_rtx (V4SImode);
+      t2 = gen_reg_rtx (V4SImode);
+      t3 = gen_reg_rtx (V2DImode);
+      t4 = gen_reg_rtx (V2DImode);
+
+      /* t1: B,A,D,C */
+      emit_insn (gen_sse2_pshufd_1 (t1, op1,
+                                   GEN_INT (1),
+                                   GEN_INT (0),
+                                   GEN_INT (3),
+                                   GEN_INT (2)));
+
+      /* t2: (B*E),(A*F),(D*G),(C*H) */
+      emit_insn (gen_mulv4si3 (t2, t1, op2));
+
+      /* t3: (B*E)+(A*F), (D*G)+(C*H) */
+      emit_insn (gen_xop_phadddq (t3, t2));
+
+      /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
+      emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
+
+      /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
+      emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
+    }
+  else
+    {
+      enum machine_mode nmode;
+      rtx (*umul) (rtx, rtx, rtx);
+
+      if (mode == V2DImode)
+       {
+         umul = gen_vec_widen_umult_even_v4si;
+         nmode = V4SImode;
+       }
+      else if (mode == V4DImode)
+       {
+         umul = gen_vec_widen_umult_even_v8si;
+         nmode = V8SImode;
+       }
+      else
+       gcc_unreachable ();
+
+
+      /* Multiply low parts.  */
+      t1 = gen_reg_rtx (mode);
+      emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
+
+      /* Shift input vectors right 32 bits so we can multiply high parts.  */
+      t6 = GEN_INT (32);
+      t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
+      t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
+
+      /* Multiply high parts by low parts.  */
+      t4 = gen_reg_rtx (mode);
+      t5 = gen_reg_rtx (mode);
+      emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
+      emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
+
+      /* Combine and shift the highparts back.  */
+      t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
+      t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
+
+      /* Combine high and low parts.  */
+      force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
+    }
+
+  set_unique_reg_note (get_last_insn (), REG_EQUAL,
+                      gen_rtx_MULT (mode, op1, op2));
+}
+
  /* Expand an insert into a vector register through pinsr insn.
     Return true if successful.  */
  
@@ -38504,6 +39205,8 @@ ix86_enum_va_list (int idx, const char **pname, tree *ptree)
  #define TARGET_SCHED_DISPATCH_DO do_dispatch
  #undef TARGET_SCHED_REASSOCIATION_WIDTH
  #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
+#undef TARGET_SCHED_REORDER
+#define TARGET_SCHED_REORDER ix86_sched_reorder
  
  /* The size of the dispatch window is the total number of bytes of
     object code allowed in a window.  */
@@ -38989,7 +39692,7 @@ fits_dispatch_window (rtx insn)
    /* Make disp_cmp and disp_jcc get scheduled at the latest.  These
       instructions should be given the lowest priority in the
       scheduling process in Haifa scheduler to make sure they will be
-     scheduled in the same dispatch window as the refrence to them.  */
+     scheduled in the same dispatch window as the reference to them.  */
    if (group == disp_jcc || group == disp_cmp)
      return false;
  
@@ -39376,6 +40079,59 @@ ix86_autovectorize_vector_sizes (void)
    return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
  }
  
+/* Implement targetm.vectorize.init_cost: allocate a zeroed cost total.  */
+
+static void *
+ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
+{
+  unsigned *cost = XNEW (unsigned);
+  *cost = 0;
+  return cost;
+}
+
+/* Implement targetm.vectorize.add_stmt_cost; return the cost added.  */
+
+static unsigned
+ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
+                   struct _stmt_vec_info *stmt_info, int misalign)
+{
+  unsigned *cost = (unsigned *) data;
+  unsigned retval = 0;
+
+  if (flag_vect_cost_model)
+    {
+      tree vectype = stmt_vectype (stmt_info);
+      int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
+
+      /* Statements in an inner loop relative to the loop being
+        vectorized are weighted more heavily.  The value here is
+        arbitrary and could potentially be improved with analysis.  */
+      if (stmt_in_inner_loop_p (stmt_info))
+       count *= 50;  /* FIXME.  */
+
+      retval = (unsigned) (count * stmt_cost);
+      *cost += retval;
+    }
+  /* RETVAL is still zero if flag_vect_cost_model is not set.  */
+  return retval;
+}
+
+/* Implement targetm.vectorize.finish_cost: return the total in DATA.  */
+
+static unsigned
+ix86_finish_cost (void *data)
+{
+  return *((unsigned *) data);
+}
+
+/* Implement targetm.vectorize.destroy_cost_data: free DATA.  */
+
+static void
+ix86_destroy_cost_data (void *data)
+{
+  free (data);
+}
+
  /* Validate target specific memory model bits in VAL. */
  
  static unsigned HOST_WIDE_INT
@@ -39681,6 +40437,14 @@ ix86_memmodel_check (unsigned HOST_WIDE_INT val)
  #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
  #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
    ix86_autovectorize_vector_sizes
+#undef TARGET_VECTORIZE_INIT_COST
+#define TARGET_VECTORIZE_INIT_COST ix86_init_cost
+#undef TARGET_VECTORIZE_ADD_STMT_COST
+#define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
+#undef TARGET_VECTORIZE_FINISH_COST
+#define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
+#undef TARGET_VECTORIZE_DESTROY_COST_DATA
+#define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
  
  #undef TARGET_SET_CURRENT_FUNCTION
  #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function