#include "fibheap.h"
#include "opts.h"
#include "diagnostic.h"
+#include "dumpfile.h"
enum upper_128bits_state
{
1, /* cond_not_taken_branch_cost. */
};
+/* Set by -mtune. */
+const struct processor_costs *ix86_tune_cost = &pentium_cost;
+
+/* Set by -mtune or -Os. */
const struct processor_costs *ix86_cost = &pentium_cost;
/* Processor feature/optimization bitmasks. */
int va_arg_size;
int red_zone_size;
int outgoing_arguments_size;
- HOST_WIDE_INT frame;
/* The offsets relative to ARG_POINTER. */
HOST_WIDE_INT frame_pointer_offset;
"large", "32");
else if (TARGET_X32)
error ("code model %qs not supported in x32 mode",
- "medium");
+ "large");
break;
case CM_32:
flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
}
+ ix86_tune_cost = processor_target_table[ix86_tune].cost;
if (optimize_size)
ix86_cost = &ix86_size_cost;
else
- ix86_cost = processor_target_table[ix86_tune].cost;
+ ix86_cost = ix86_tune_cost;
/* Arrange to set up i386_stack_locals for all functions. */
init_machine_status = ix86_init_machine_status;
ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
if (global_options_set.x_ix86_preferred_stack_boundary_arg)
{
- int min = (TARGET_64BIT ? 4 : 2);
+ int min = (TARGET_64BIT ? (TARGET_SSE ? 4 : 3) : 2);
int max = (TARGET_SEH ? 4 : 12);
if (ix86_preferred_stack_boundary_arg < min
flag_schedule_insns_after_reload = flag_schedule_insns = 0;
maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
- ix86_cost->simultaneous_prefetches,
+ ix86_tune_cost->simultaneous_prefetches,
global_options.x_param_values,
global_options_set.x_param_values);
- maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, ix86_cost->prefetch_block,
+ maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
+ ix86_tune_cost->prefetch_block,
global_options.x_param_values,
global_options_set.x_param_values);
- maybe_set_param_value (PARAM_L1_CACHE_SIZE, ix86_cost->l1_cache_size,
+ maybe_set_param_value (PARAM_L1_CACHE_SIZE,
+ ix86_tune_cost->l1_cache_size,
global_options.x_param_values,
global_options_set.x_param_values);
- maybe_set_param_value (PARAM_L2_CACHE_SIZE, ix86_cost->l2_cache_size,
+ maybe_set_param_value (PARAM_L2_CACHE_SIZE,
+ ix86_tune_cost->l2_cache_size,
global_options.x_param_values,
global_options_set.x_param_values);
if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
return true;
+ /* For Win64 SEH, very large frames need a frame pointer, as the maximum
+ stack allocation is 4GB. */
+ if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
+ return true;
+
/* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
turns off the frame pointer by default. Turn it back on now if
we've not got a leaf function. */
if (TARGET_OMIT_LEAF_FRAME_POINTER
- && (!current_function_is_leaf
+ && (!crtl->is_leaf
|| ix86_current_function_calls_tls_descriptor))
return true;
static unsigned int
ix86_select_alt_pic_regnum (void)
{
- if (current_function_is_leaf
+ if (crtl->is_leaf
&& !crtl->profile
&& !ix86_current_function_calls_tls_descriptor)
{
static void
ix86_compute_frame_layout (struct ix86_frame *frame)
{
- unsigned int stack_alignment_needed;
+ unsigned HOST_WIDE_INT stack_alignment_needed;
HOST_WIDE_INT offset;
- unsigned int preferred_alignment;
+ unsigned HOST_WIDE_INT preferred_alignment;
HOST_WIDE_INT size = get_frame_size ();
HOST_WIDE_INT to_allocate;
/* The 64-bit MS ABI seems to require stack alignment to be always 16, except
 for function prologues and leaf functions. */
if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
- && (!current_function_is_leaf || cfun->calls_alloca != 0
+ && (!crtl->is_leaf || cfun->calls_alloca != 0
|| ix86_current_function_calls_tls_descriptor))
{
preferred_alignment = 16;
offset += frame->nregs * UNITS_PER_WORD;
frame->reg_save_offset = offset;
+ /* On SEH target, registers are pushed just before the frame pointer
+ location. */
+ if (TARGET_SEH)
+ frame->hard_frame_pointer_offset = offset;
+
/* Align and set SSE register save area. */
if (frame->nsseregs)
{
if (stack_realign_fp
|| offset != frame->sse_reg_save_offset
|| size != 0
- || !current_function_is_leaf
+ || !crtl->is_leaf
|| cfun->calls_alloca
|| ix86_current_function_calls_tls_descriptor)
offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
expander assumes that last crtl->outgoing_args_size
of stack frame are unused. */
if (ACCUMULATE_OUTGOING_ARGS
- && (!current_function_is_leaf || cfun->calls_alloca
+ && (!crtl->is_leaf || cfun->calls_alloca
|| ix86_current_function_calls_tls_descriptor))
{
offset += crtl->outgoing_args_size;
/* Align stack boundary. Only needed if we're calling another function
or using alloca. */
- if (!current_function_is_leaf || cfun->calls_alloca
+ if (!crtl->is_leaf || cfun->calls_alloca
|| ix86_current_function_calls_tls_descriptor)
offset = (offset + preferred_alignment - 1) & -preferred_alignment;
frame->save_regs_using_mov = false;
if (ix86_using_red_zone ()
- && current_function_sp_is_unchanging
- && current_function_is_leaf
+ && crtl->sp_is_unchanging
+ && crtl->is_leaf
&& !ix86_current_function_calls_tls_descriptor)
{
frame->red_zone_size = to_allocate;
{
HOST_WIDE_INT diff;
- /* If we can leave the frame pointer where it is, do so. */
+ /* If we can leave the frame pointer where it is, do so; that also makes
+ __builtin_frame_address (0) return the establisher frame. */
diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
- if (diff > 240 || (diff & 15) != 0)
+ if (diff <= SEH_MAX_FRAME_SIZE
+ && (diff > 240 || (diff & 15) != 0)
+ && !crtl->accesses_prior_frames)
{
/* Ideally we'd determine what portion of the local stack frame
(within the constraint of the lowest 240) is most heavily used.
if (m->use_fast_prologue_epilogue)
{
/* Choose the base register most likely to allow the most scheduling
- opportunities. Generally FP is valid througout the function,
+ opportunities. Generally FP is valid throughout the function,
while DRAP must be reloaded within the epilogue. But choose either
over the SP due to increased encoding size. */
= (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
unsigned int stack_realign = (incoming_stack_boundary
- < (current_function_is_leaf
+ < (crtl->is_leaf
? crtl->max_used_stack_slot_alignment
: crtl->stack_alignment_needed));
if (stack_realign
&& !crtl->need_drap
&& frame_pointer_needed
- && current_function_is_leaf
+ && crtl->is_leaf
&& flag_omit_frame_pointer
- && current_function_sp_is_unchanging
+ && crtl->sp_is_unchanging
&& !ix86_current_function_calls_tls_descriptor
&& !crtl->accesses_prior_frames
&& !cfun->calls_alloca
struct ix86_frame frame;
HOST_WIDE_INT allocate;
bool int_registers_saved;
+ bool sse_registers_saved;
ix86_finalize_stack_realign_flags ();
m->fs.realigned = true;
}
+ int_registers_saved = (frame.nregs == 0);
+ sse_registers_saved = (frame.nsseregs == 0);
+
if (frame_pointer_needed && !m->fs.fp_valid)
{
/* Note: AT&T enter does NOT have reversed args. Enter is probably
insn = emit_insn (gen_push (hard_frame_pointer_rtx));
RTX_FRAME_RELATED_P (insn) = 1;
+ /* Push registers now, before setting the frame pointer
+ on SEH target. */
+ if (!int_registers_saved
+ && TARGET_SEH
+ && !frame.save_regs_using_mov)
+ {
+ ix86_emit_save_regs ();
+ int_registers_saved = true;
+ gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
+ }
+
if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
{
insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
}
}
- int_registers_saved = (frame.nregs == 0);
-
if (!int_registers_saved)
{
/* If saving registers via PUSH, do so now. */
current_function_static_stack_size = stack_size;
}
+ /* On SEH target with very large frame size, allocate an area to save
+ SSE registers (as the very large allocation won't be described). */
+ if (TARGET_SEH
+ && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
+ && !sse_registers_saved)
+ {
+ HOST_WIDE_INT sse_size =
+ frame.sse_reg_save_offset - frame.reg_save_offset;
+
+ gcc_assert (int_registers_saved);
+
+ /* No need to do stack checking as the area will be immediately
+ written. */
+ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+ GEN_INT (-sse_size), -1,
+ m->fs.cfa_reg == stack_pointer_rtx);
+ allocate -= sse_size;
+ ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
+ sse_registers_saved = true;
+ }
+
/* The stack has already been decremented by the instruction calling us
so probe if the size is non-negative to preserve the protection area. */
if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
if (!int_registers_saved)
ix86_emit_save_regs_using_mov (frame.reg_save_offset);
- if (frame.nsseregs)
+ if (!sse_registers_saved)
ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
pic_reg_used = false;
ix86_compute_frame_layout (&frame);
m->fs.sp_valid = (!frame_pointer_needed
- || (current_function_sp_is_unchanging
+ || (crtl->sp_is_unchanging
&& !stack_realign_fp));
gcc_assert (!m->fs.sp_valid
|| m->fs.sp_offset == frame.stack_pointer_offset);
}
/* First step is to deallocate the stack frame so that we can
- pop the registers. */
- if (!m->fs.sp_valid)
+ pop the registers. Also do this on SEH targets for very large
+ frames, as the emitted instructions aren't allowed by the ABI
+ in epilogues. */
+ if (!m->fs.sp_valid
+ || (TARGET_SEH
+ && (m->fs.sp_offset - frame.reg_save_offset
+ >= SEH_MAX_FRAME_SIZE)))
{
pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
GEN_INT (m->fs.fp_offset
int retval = 1;
enum ix86_address_seg seg = SEG_DEFAULT;
+ /* Allow SImode subregs of DImode addresses,
+ they will be emitted with addr32 prefix. */
+ if (TARGET_64BIT && GET_MODE (addr) == SImode)
+ {
+ if (GET_CODE (addr) == SUBREG
+ && GET_MODE (XEXP (addr, 0)) == DImode)
+ addr = SUBREG_REG (addr);
+ }
+
/* Allow zero-extended SImode addresses,
they will be emitted with addr32 prefix. */
- if (TARGET_64BIT && GET_MODE (addr) == DImode)
+ else if (TARGET_64BIT && GET_MODE (addr) == DImode)
{
if (GET_CODE (addr) == ZERO_EXTEND
&& GET_MODE (XEXP (addr, 0)) == SImode)
case TLS_MODEL_INITIAL_EXEC:
if (TARGET_64BIT)
{
- if (TARGET_SUN_TLS)
+ if (TARGET_SUN_TLS && !TARGET_X32)
{
/* The Sun linker took the AMD64 TLS spec literally
and can only handle %rax as destination of the
initial executable code sequence. */
- dest = gen_reg_rtx (Pmode);
+ dest = gen_reg_rtx (DImode);
emit_insn (gen_tls_initial_exec_64_sun (dest, x));
return dest;
}
C -- print opcode suffix for set/cmov insn.
c -- like C, but print reversed condition
F,f -- likewise, but for floating-point.
- O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, print the opcode suffix for
- the size of the current operand, otherwise nothing.
+ O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
+ otherwise nothing
R -- print the prefix for register names.
z -- print the opcode suffix for the size of the current operand.
Z -- likewise, with special suffixes for x87 instructions.
("invalid operand size for operand code 'O'");
return;
}
+
+ putc ('.', file);
#endif
return;
}
return;
- case 'C':
- case 'c':
case 'F':
case 'f':
+#ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
+ if (ASSEMBLER_DIALECT == ASM_ATT)
+ putc ('.', file);
+#endif
+
+ case 'C':
+ case 'c':
if (!COMPARISON_P (x))
{
output_operand_lossage ("operand is not a condition code, "
"invalid operand code '%c'", code);
return;
}
-#ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
- if (ASSEMBLER_DIALECT == ASM_ATT)
- putc ('.', file);
-#endif
put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
code == 'c' || code == 'f',
code == 'F' || code == 'f',
}
else
{
- /* Print SImode register names for zero-extended
- addresses to force addr32 prefix. */
+ /* Print SImode register names to force addr32 prefix. */
if (TARGET_64BIT
- && (GET_CODE (addr) == ZERO_EXTEND
+ && (GET_CODE (addr) == SUBREG
+ || GET_CODE (addr) == ZERO_EXTEND
|| GET_CODE (addr) == AND))
{
gcc_assert (!code);
unsigned int regno1 = true_regnum (operands[1]);
unsigned int regno2 = true_regnum (operands[2]);
+ /* FIXME: Handle zero-extended addresses. */
+ if (GET_CODE (operands[1]) == ZERO_EXTEND
+ || GET_CODE (operands[1]) == AND)
+ return false;
+
/* Check if we need to optimize. */
if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
return false;
t1 = gen_reg_rtx (V8SImode);
t2 = gen_reg_rtx (V8SImode);
emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
- emit_insn (gen_avx2_permvarv8si (t2, op0, mask));
+ emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
goto merge_two;
}
return;
case V4SFmode:
t1 = gen_reg_rtx (V8SFmode);
- t2 = gen_reg_rtx (V8SFmode);
- mask = gen_lowpart (V4SFmode, mask);
+ t2 = gen_reg_rtx (V8SImode);
+ mask = gen_lowpart (V4SImode, mask);
emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
- emit_insn (gen_avx_vec_concatv8sf (t2, mask, mask));
+ emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
return;
true if we want the N/2 high elements, else the low elements. */
void
-ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p)
+ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
{
- enum machine_mode imode = GET_MODE (operands[1]);
- rtx tmp, dest;
+ enum machine_mode imode = GET_MODE (src);
+ rtx tmp;
if (TARGET_SSE4_1)
{
if (GET_MODE_SIZE (imode) == 32)
{
tmp = gen_reg_rtx (halfmode);
- emit_insn (extract (tmp, operands[1]));
+ emit_insn (extract (tmp, src));
}
else if (high_p)
{
/* Shift higher 8 bytes to lower 8 bytes. */
tmp = gen_reg_rtx (imode);
emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
- gen_lowpart (V1TImode, operands[1]),
+ gen_lowpart (V1TImode, src),
GEN_INT (64)));
}
else
- tmp = operands[1];
+ tmp = src;
- emit_insn (unpack (operands[0], tmp));
+ emit_insn (unpack (dest, tmp));
}
else
{
gcc_unreachable ();
}
- dest = gen_lowpart (imode, operands[0]);
-
if (unsigned_p)
tmp = force_reg (imode, CONST0_RTX (imode));
else
tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
- operands[1], pc_rtx, pc_rtx);
+ src, pc_rtx, pc_rtx);
- emit_insn (unpack (dest, operands[1], tmp));
+ emit_insn (unpack (gen_lowpart (imode, dest), src, tmp));
}
}
/* For a given symbol (function), construct code to compute the address of
 its PLT entry in the large x86-64 PIC model. */
-rtx
+static rtx
construct_plt_address (rtx symbol)
{
rtx tmp, unspec;
}
}
+/* Try to reorder the ready list to take advantage of Atom pipelined IMUL
+ execution. The reordering is applied if
+ (1) an IMUL instruction is at the top of the list, and
+ (2) the ready list contains a producer that is the sole producer of an
+ independent IMUL instruction;
+ in that case the found producer is moved to the top of the ready list.
+ Returns the issue rate. */
+
+static int
+ix86_sched_reorder (FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
+ int clock_var ATTRIBUTE_UNUSED)
+{
+ static int issue_rate = -1;
+ int n_ready = *pn_ready;
+ rtx insn, insn1, insn2;
+ int i;
+ sd_iterator_def sd_it;
+ dep_t dep;
+ int index = -1;
+
+ /* Set up issue rate. */
+ issue_rate = ix86_issue_rate ();
+
+ /* Do reordering for Atom only. */
+ if (ix86_tune != PROCESSOR_ATOM)
+ return issue_rate;
+ /* Nothing to do if ready list contains only 1 instruction. */
+ if (n_ready <= 1)
+ return issue_rate;
+
+ /* Check that IMUL instruction is on the top of ready list. */
+ insn = ready[n_ready - 1];
+ if (!NONDEBUG_INSN_P (insn))
+ return issue_rate;
+ insn = PATTERN (insn);
+ if (GET_CODE (insn) == PARALLEL)
+ insn = XVECEXP (insn, 0, 0);
+ if (GET_CODE (insn) != SET)
+ return issue_rate;
+ if (!(GET_CODE (SET_SRC (insn)) == MULT
+ && GET_MODE (SET_SRC (insn)) == SImode))
+ return issue_rate;
+
+ /* Search for producer of independent IMUL instruction. */
+ for (i = n_ready - 2; i >= 0; i--)
+ {
+ insn = ready[i];
+ if (!NONDEBUG_INSN_P (insn))
+ continue;
+ /* Skip IMUL instruction. */
+ insn2 = PATTERN (insn);
+ if (GET_CODE (insn2) == PARALLEL)
+ insn2 = XVECEXP (insn2, 0, 0);
+ if (GET_CODE (insn2) == SET
+ && GET_CODE (SET_SRC (insn2)) == MULT
+ && GET_MODE (SET_SRC (insn2)) == SImode)
+ continue;
+
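+ /* Scan the forward dependences of this candidate for an SImode IMUL
+ consumer that depends only on this instruction. */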
+ FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
+ {
+ rtx con;
+ con = DEP_CON (dep);
+ if (!NONDEBUG_INSN_P (con))
+ continue;
+ insn1 = PATTERN (con);
+ if (GET_CODE (insn1) == PARALLEL)
+ insn1 = XVECEXP (insn1, 0, 0);
+
+ if (GET_CODE (insn1) == SET
+ && GET_CODE (SET_SRC (insn1)) == MULT
+ && GET_MODE (SET_SRC (insn1)) == SImode)
+ {
+ sd_iterator_def sd_it1;
+ dep_t dep1;
+ /* Check that INSN is the only instruction this IMUL depends on. */
+ index = i;
+ FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
+ {
+ rtx pro;
+ pro = DEP_PRO (dep1);
+ if (!NONDEBUG_INSN_P (pro))
+ continue;
+ if (pro != insn)
+ index = -1;
+ }
+ if (index >= 0)
+ break;
+ }
+ }
+ if (index >= 0)
+ break;
+ }
+ if (index < 0)
+ return issue_rate; /* Didn't find IMUL producer. */
+
+ if (sched_verbose > 1)
+ fprintf (dump, ";;\tatom sched_reorder: swap %d and %d insns\n",
+ INSN_UID (ready[index]), INSN_UID (ready[n_ready - 1]));
+
+ /* Put IMUL producer (ready[index]) at the top of ready list. */
+ insn1 = ready[index];
+ for (i = index; i < n_ready - 1; i++)
+ ready[i] = ready[i + 1];
+ ready[n_ready - 1] = insn1;
+
+ return issue_rate;
+}
+
\f
/* Model decoder of Core 2/i7.
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
- { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
+ { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
- { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mulv4siv4di3 , "__builtin_ia32_pmuldq256" , IX86_BUILTIN_PMULDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
+ { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
- { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulv4siv4di3 , "__builtin_ia32_pmuludq256" , IX86_BUILTIN_PMULUDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
+ { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
}
}
+#ifdef SUBTARGET_FOLD_BUILTIN
+ return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
+#endif
+
return NULL_TREE;
}
case V2DI_FTYPE_V2DI_V2DI:
case V2DI_FTYPE_V16QI_V16QI:
case V2DI_FTYPE_V4SI_V4SI:
+ case V2UDI_FTYPE_V4USI_V4USI:
case V2DI_FTYPE_V2DI_V16QI:
case V2DI_FTYPE_V2DF_V2DF:
case V2SI_FTYPE_V2SI_V2SI:
case V8SI_FTYPE_V16HI_V16HI:
case V4DI_FTYPE_V4DI_V4DI:
case V4DI_FTYPE_V8SI_V8SI:
+ case V4UDI_FTYPE_V8USI_V8USI:
if (comparison == UNKNOWN)
return ix86_expand_binop_builtin (icode, exp, target);
nargs = 2;
break;
case MODE_FLOAT:
- if ((TARGET_SSE2 && mode == TFmode)
+ if ((TARGET_SSE && mode == TFmode)
|| (TARGET_80387 && mode == XFmode)
|| ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
|| ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
break;
case MODE_COMPLEX_FLOAT:
- if ((TARGET_SSE2 && mode == TCmode)
+ if ((TARGET_SSE && mode == TCmode)
|| (TARGET_80387 && mode == XCmode)
|| ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
|| ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
scanned. In either case, *TOTAL contains the cost result. */
static bool
-ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total,
+ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
bool speed)
{
+ enum rtx_code code = (enum rtx_code) code_i;
enum rtx_code outer_code = (enum rtx_code) outer_code_i;
enum machine_mode mode = GET_MODE (x);
const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
case CONST_DOUBLE:
if (mode == VOIDmode)
- *total = 0;
- else
- switch (standard_80387_constant_p (x))
- {
- case 1: /* 0.0 */
- *total = 1;
- break;
- default: /* Other constants */
- *total = 2;
- break;
- case 0:
- case -1:
- /* Start with (MEM (SYMBOL_REF)), since that's where
- it'll probably end up. Add a penalty for size. */
- *total = (COSTS_N_INSNS (1)
- + (flag_pic != 0 && !TARGET_64BIT)
- + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
- break;
- }
+ {
+ *total = 0;
+ return true;
+ }
+ switch (standard_80387_constant_p (x))
+ {
+ case 1: /* 0.0 */
+ *total = 1;
+ return true;
+ default: /* Other constants */
+ *total = 2;
+ return true;
+ case 0:
+ case -1:
+ break;
+ }
+ if (SSE_FLOAT_MODE_P (mode))
+ {
+ case CONST_VECTOR:
+ switch (standard_sse_constant_p (x))
+ {
+ case 0:
+ break;
+ case 1: /* 0: xor eliminates false dependency */
+ *total = 0;
+ return true;
+ default: /* -1: cmp contains false dependency */
+ *total = 1;
+ return true;
+ }
+ }
+ /* Fall back to (MEM (SYMBOL_REF)), since that's where
+ it'll probably end up. Add a penalty for size. */
+ *total = (COSTS_N_INSNS (1)
+ + (flag_pic != 0 && !TARGET_64BIT)
+ + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
return true;
case ZERO_EXTEND:
return false;
case ASHIFT:
- if (CONST_INT_P (XEXP (x, 1))
- && (GET_MODE (XEXP (x, 0)) != DImode || TARGET_64BIT))
+ if (SCALAR_INT_MODE_P (mode)
+ && GET_MODE_SIZE (mode) < UNITS_PER_WORD
+ && CONST_INT_P (XEXP (x, 1)))
{
HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
if (value == 1)
case ASHIFTRT:
case LSHIFTRT:
case ROTATERT:
- if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode)
+ if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+ {
+ /* ??? Should be SSE vector operation cost. */
+ /* At least for published AMD latencies, this really is the same
+ as the latency for a simple fpu operation like fabs. */
+ /* V*QImode is emulated with 1-11 insns. */
+ if (mode == V16QImode || mode == V32QImode)
+ {
+ int count = 11;
+ if (TARGET_XOP && mode == V16QImode)
+ {
+ /* For XOP we use vpshab, which requires a broadcast of the
+ value to the variable shift insn. For constants this
+ means a V16QImode constant in memory; even when we can perform
+ the shift with one insn, set the cost to prefer paddb. */
+ if (CONSTANT_P (XEXP (x, 1)))
+ {
+ *total = (cost->fabs
+ + rtx_cost (XEXP (x, 0), code, 0, speed)
+ + (speed ? 2 : COSTS_N_BYTES (16)));
+ return true;
+ }
+ count = 3;
+ }
+ else if (TARGET_SSSE3)
+ count = 7;
+ *total = cost->fabs * count;
+ }
+ else
+ *total = cost->fabs;
+ }
+ else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
{
if (CONST_INT_P (XEXP (x, 1)))
{
*total = cost->fmul;
return false;
}
+ else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+ {
+ /* V*QImode is emulated with 7-13 insns. */
+ if (mode == V16QImode || mode == V32QImode)
+ {
+ int extra = 11;
+ if (TARGET_XOP && mode == V16QImode)
+ extra = 5;
+ else if (TARGET_SSSE3)
+ extra = 6;
+ *total = cost->fmul * 2 + cost->fabs * extra;
+ }
+ /* V*DImode is emulated with 5-8 insns. */
+ else if (mode == V2DImode || mode == V4DImode)
+ {
+ if (TARGET_XOP && mode == V2DImode)
+ *total = cost->fmul * 2 + cost->fabs * 3;
+ else
+ *total = cost->fmul * 3 + cost->fabs * 5;
+ }
+ /* Without sse4.1, we don't have PMULLD; it's emulated with 7
+ insns, including two PMULUDQ. */
+ else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
+ *total = cost->fmul * 2 + cost->fabs * 5;
+ else
+ *total = cost->fmul;
+ return false;
+ }
else
{
rtx op0 = XEXP (x, 0);
case PLUS:
if (GET_MODE_CLASS (mode) == MODE_INT
- && GET_MODE_BITSIZE (mode) <= GET_MODE_BITSIZE (Pmode))
+ && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
{
if (GET_CODE (XEXP (x, 0)) == PLUS
&& GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
case AND:
case IOR:
case XOR:
- if (!TARGET_64BIT && mode == DImode)
+ if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
{
*total = (cost->add * 2
+ (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
/* FALLTHRU */
case NOT:
- if (!TARGET_64BIT && mode == DImode)
+ if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+ {
+ /* ??? Should be SSE vector operation cost. */
+ /* At least for published AMD latencies, this really is the same
+ as the latency for a simple fpu operation like fabs. */
+ *total = cost->fabs;
+ }
+ else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
*total = cost->add * 2;
else
*total = cost->add;
/* ??? Assume all of these vector manipulation patterns are
recognizable. In which case they all pretty much have the
same cost. */
- *total = COSTS_N_INSNS (1);
+ *total = cost->fabs;
return true;
default:
{
rtx this_param = x86_this_parameter (function);
rtx this_reg, tmp, fnaddr;
+ unsigned int tmp_regno;
+
+ if (TARGET_64BIT)
+ tmp_regno = R10_REG;
+ else
+ {
+ unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
+ if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
+ tmp_regno = AX_REG;
+ else
+ tmp_regno = CX_REG;
+ }
emit_note (NOTE_INSN_PROLOGUE_END);
{
if (!x86_64_general_operand (delta_rtx, Pmode))
{
- tmp = gen_rtx_REG (Pmode, R10_REG);
+ tmp = gen_rtx_REG (Pmode, tmp_regno);
emit_move_insn (tmp, delta_rtx);
delta_rtx = tmp;
}
if (vcall_offset)
{
rtx vcall_addr, vcall_mem, this_mem;
- unsigned int tmp_regno;
- if (TARGET_64BIT)
- tmp_regno = R10_REG;
- else
- {
- unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
- if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
- tmp_regno = AX_REG;
- else
- tmp_regno = CX_REG;
- }
tmp = gen_rtx_REG (Pmode, tmp_regno);
this_mem = gen_rtx_MEM (ptr_mode, this_reg);
emit_jump_insn (gen_indirect_jump (fnaddr));
else
{
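+ /* Under the large PIC code model a symbolic function address must
+ first be legitimized as a PIC address; if the (possibly adjusted)
+ address is not a valid sibcall operand, load it into the scratch
+ register. */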
+ if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
+ fnaddr = legitimize_pic_address (fnaddr,
+ gen_rtx_REG (Pmode, tmp_regno));
+
+ if (!sibcall_insn_operand (fnaddr, word_mode))
+ {
+ tmp = gen_rtx_REG (word_mode, tmp_regno);
+ if (GET_MODE (fnaddr) != word_mode)
+ fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
+ emit_move_insn (tmp, fnaddr);
+ fnaddr = tmp;
+ }
+
tmp = gen_rtx_MEM (QImode, fnaddr);
tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
tmp = emit_call_insn (tmp);
return min_prev_count;
}
-/* Pad short funtion to 4 instructions. */
+/* Pad short function to 4 instructions. */
static void
ix86_pad_short_function (void)
}
else
{
- rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
+ rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
emit_move_insn (mem, target);
}
else
{
- rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
+ rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
emit_move_insn (mem, vec);
/* Implement targetm.vectorize.builtin_vectorization_cost. */
static int
ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
- tree vectype ATTRIBUTE_UNUSED,
+ tree vectype,
int misalign ATTRIBUTE_UNUSED)
{
+ unsigned elements;
+
switch (type_of_cost)
{
case scalar_stmt:
case vec_promote_demote:
return ix86_cost->vec_stmt_cost;
+ case vec_construct:
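+ /* Rough estimate: building a vector from N scalar elements takes
+ about one operation per pair of elements, plus one. */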
+ elements = TYPE_VECTOR_SUBPARTS (vectype);
+ return elements / 2 + 1;
+
default:
gcc_unreachable ();
}
gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
vperm = force_reg (vmode, vperm);
- if (vmode == V8SImode && d->vmode == V8SFmode)
- {
- vmode = V8SFmode;
- vperm = gen_lowpart (vmode, vperm);
- }
-
target = gen_lowpart (vmode, d->target);
op0 = gen_lowpart (vmode, d->op0);
if (d->one_operand_p)
{
if (d->perm[0] / nelt2 == nonzero_halves[1])
{
- /* Attempt to increase the likelyhood that dfinal
+ /* Attempt to increase the likelihood that dfinal
shuffle will be intra-lane. */
char tmph = nonzero_halves[0];
nonzero_halves[0] = nonzero_halves[1];
expand_vec_perm_even_odd_1 (&d, odd);
}
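+
+/* Expand an interleave (unpack) of OP0 and OP1 into TARG, merging the
+ low halves of the two inputs element by element, or the high halves
+ if HIGH_P, via a constant permutation. */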
+static void
+ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
+{
+ struct expand_vec_perm_d d;
+ unsigned i, nelt, base;
+ bool ok;
+
+ d.target = targ;
+ d.op0 = op0;
+ d.op1 = op1;
+ d.vmode = GET_MODE (targ);
+ d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+ d.one_operand_p = false;
+ d.testing_p = false;
+
+ base = high_p ? nelt / 2 : 0;
+ for (i = 0; i < nelt / 2; ++i)
+ {
+ d.perm[i * 2] = i + base;
+ d.perm[i * 2 + 1] = i + base + nelt;
+ }
+
+ /* Note that for AVX this isn't one instruction. */
+ ok = ix86_expand_vec_perm_const_1 (&d);
+ gcc_assert (ok);
+}
+
+/* Expand a vector operation CODE for a V*QImode in terms of the
+ same operation on V*HImode. */
+
+void
+ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
+{
+ enum machine_mode qimode = GET_MODE (dest);
+ enum machine_mode himode;
+ rtx (*gen_il) (rtx, rtx, rtx);
+ rtx (*gen_ih) (rtx, rtx, rtx);
+ rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
+ struct expand_vec_perm_d d;
+ bool ok, full_interleave;
+ bool uns_p = false;
+ int i;
+
+ switch (qimode)
+ {
+ case V16QImode:
+ himode = V8HImode;
+ gen_il = gen_vec_interleave_lowv16qi;
+ gen_ih = gen_vec_interleave_highv16qi;
+ break;
+ case V32QImode:
+ himode = V16HImode;
+ gen_il = gen_avx2_interleave_lowv32qi;
+ gen_ih = gen_avx2_interleave_highv32qi;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ op2_l = op2_h = op2;
+ switch (code)
+ {
+ case MULT:
+ /* Unpack data such that we've got a source byte in each low byte of
+ each word. We don't care what goes into the high byte of each word.
+ Rather than trying to get zero in there, most convenient is to let
+ it be a copy of the low byte. */
+ op2_l = gen_reg_rtx (qimode);
+ op2_h = gen_reg_rtx (qimode);
+ emit_insn (gen_il (op2_l, op2, op2));
+ emit_insn (gen_ih (op2_h, op2, op2));
+
+ op1_l = gen_reg_rtx (qimode);
+ op1_h = gen_reg_rtx (qimode);
+ emit_insn (gen_il (op1_l, op1, op1));
+ emit_insn (gen_ih (op1_h, op1, op1));
+ full_interleave = qimode == V16QImode;
+ break;
+
+ case ASHIFT:
+ case LSHIFTRT:
+ uns_p = true;
+ /* FALLTHRU */
+ case ASHIFTRT:
+ op1_l = gen_reg_rtx (himode);
+ op1_h = gen_reg_rtx (himode);
+ ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
+ ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
+ full_interleave = true;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ /* Perform the operation. */
+ res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
+ 1, OPTAB_DIRECT);
+ res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
+ 1, OPTAB_DIRECT);
+ gcc_assert (res_l && res_h);
+
+ /* Merge the data back into the right place. */
+ d.target = dest;
+ d.op0 = gen_lowpart (qimode, res_l);
+ d.op1 = gen_lowpart (qimode, res_h);
+ d.vmode = qimode;
+ d.nelt = GET_MODE_NUNITS (qimode);
+ d.one_operand_p = false;
+ d.testing_p = false;
+
+ if (full_interleave)
+ {
+ /* For SSE2, we used a full interleave, so the desired
+ results are in the even elements. */
+ for (i = 0; i < 32; ++i)
+ d.perm[i] = i * 2;
+ }
+ else
+ {
+ /* For AVX, the interleave used above was not cross-lane. So the
+ extraction is evens but with the second and third quarter swapped.
+ Happily, that is even one insn shorter than even extraction. */
+ for (i = 0; i < 32; ++i)
+ d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
+ }
+
+ ok = ix86_expand_vec_perm_const_1 (&d);
+ gcc_assert (ok);
+
+ set_unique_reg_note (get_last_insn (), REG_EQUAL,
+ gen_rtx_fmt_ee (code, qimode, op1, op2));
+}
+
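+/* Expand a widening multiply of the even (or, if ODD_P, the odd) SImode
+ elements of OP1 and OP2, producing double-width results in DEST.
+ UNS_P selects an unsigned rather than signed multiply. */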
+void
+ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
+ bool uns_p, bool odd_p)
+{
+ enum machine_mode mode = GET_MODE (op1);
+ enum machine_mode wmode = GET_MODE (dest);
+ rtx x;
+
+ /* We only play even/odd games with vectors of SImode. */
+ gcc_assert (mode == V4SImode || mode == V8SImode);
+
+ /* If we're looking for the odd results, shift those members down to
+ the even slots. For some cpus this is faster than a PSHUFD. */
+ if (odd_p)
+ {
+ if (TARGET_XOP && mode == V4SImode)
+ {
+ x = force_reg (wmode, CONST0_RTX (wmode));
+ emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
+ return;
+ }
+
+ x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
+ op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
+ x, NULL, 1, OPTAB_DIRECT);
+ op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
+ x, NULL, 1, OPTAB_DIRECT);
+ op1 = gen_lowpart (mode, op1);
+ op2 = gen_lowpart (mode, op2);
+ }
+
+ if (mode == V8SImode)
+ {
+ if (uns_p)
+ x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
+ else
+ x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
+ }
+ else if (uns_p)
+ x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
+ else if (TARGET_SSE4_1)
+ x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
+ else
+ {
+ rtx s1, s2, t0, t1, t2;
+
+ /* The easiest way to implement this without PMULDQ is to go through
+ the motions as if we are performing a full 64-bit multiply, except
+ that we need to do less shuffling of the elements. */
+
+ /* Compute the sign-extension, aka highparts, of the two operands. */
+ s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
+ op1, pc_rtx, pc_rtx);
+ s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
+ op2, pc_rtx, pc_rtx);
+
+ /* Multiply LO(A) * HI(B), and vice-versa. */
+ t1 = gen_reg_rtx (wmode);
+ t2 = gen_reg_rtx (wmode);
+ emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
+ emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
+
+ /* Multiply LO(A) * LO(B). */
+ t0 = gen_reg_rtx (wmode);
+ emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
+
+ /* Combine and shift the highparts into place. */
+ t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
+ t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
+ 1, OPTAB_DIRECT);
+
+ /* Combine high and low parts. */
+ force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
+ return;
+ }
+ emit_insn (x);
+}
+
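+/* Expand a widening multiply of the low (or, if HIGH_P, the high) halves
+ of the elements of OP1 and OP2, producing double-width results in DEST.
+ UNS_P selects an unsigned rather than signed multiply. */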
+void
+ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
+ bool uns_p, bool high_p)
+{
+ enum machine_mode wmode = GET_MODE (dest);
+ enum machine_mode mode = GET_MODE (op1);
+ rtx t1, t2, t3, t4, mask;
+
+ switch (mode)
+ {
+ case V4SImode:
+ t1 = gen_reg_rtx (mode);
+ t2 = gen_reg_rtx (mode);
+ if (TARGET_XOP && !uns_p)
+ {
+ /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
+ shuffle the elements once so that all elements are in the right
+ place for immediate use: { A C B D }. */
+ emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
+ const1_rtx, GEN_INT (3)));
+ emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
+ const1_rtx, GEN_INT (3)));
+ }
+ else
+ {
+ /* Put the elements into place for the multiply. */
+ ix86_expand_vec_interleave (t1, op1, op1, high_p);
+ ix86_expand_vec_interleave (t2, op2, op2, high_p);
+ high_p = false;
+ }
+ ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
+ break;
+
+ case V8SImode:
+ /* Shuffle the elements between the lanes. After this we
+ have { A B E F | C D G H } for each operand. */
+ t1 = gen_reg_rtx (V4DImode);
+ t2 = gen_reg_rtx (V4DImode);
+ emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
+ const0_rtx, const2_rtx,
+ const1_rtx, GEN_INT (3)));
+ emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
+ const0_rtx, const2_rtx,
+ const1_rtx, GEN_INT (3)));
+
+ /* Shuffle the elements within the lanes. After this we
+ have { A A B B | C C D D } or { E E F F | G G H H }. */
+ t3 = gen_reg_rtx (V8SImode);
+ t4 = gen_reg_rtx (V8SImode);
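+ /* Each 2-bit field of the pshufd immediate selects one source
+ element: { 2, 2, 3, 3 } for the high halves, { 0, 0, 1, 1 }
+ for the low halves. */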
+ mask = GEN_INT (high_p
+ ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
+ : 0 + (0 << 2) + (1 << 4) + (1 << 6));
+ emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
+ emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
+
+ ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
+ break;
+
+ case V8HImode:
+ case V16HImode:
+ t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
+ uns_p, OPTAB_DIRECT);
+ t2 = expand_binop (mode,
+ uns_p ? umul_highpart_optab : smul_highpart_optab,
+ op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
+ gcc_assert (t1 && t2);
+
+ ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p);
+ break;
+
+ case V16QImode:
+ case V32QImode:
+ t1 = gen_reg_rtx (wmode);
+ t2 = gen_reg_rtx (wmode);
+ ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
+ ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
+
+ emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+}
+
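+/* Expand the V4SImode multiply OP0 = OP1 * OP2 without SSE4.1 PMULLD,
+ using two widening even/odd multiplies whose low parts are then
+ interleaved back together. */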
+void
+ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
+{
+ rtx res_1, res_2;
+
+ res_1 = gen_reg_rtx (V4SImode);
+ res_2 = gen_reg_rtx (V4SImode);
+ ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1),
+ op1, op2, true, false);
+ ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2),
+ op1, op2, true, true);
+
+ /* Move the results in element 2 down to element 1; we don't care
+ what goes in elements 2 and 3. Then we can merge the parts
+ back together with an interleave.
+
+ Note that two other sequences were tried:
+ (1) Use interleaves at the start instead of psrldq, which allows
+ us to use a single shufps to merge things back at the end.
+ (2) Use shufps here to combine the two vectors, then pshufd to
+ put the elements in the correct order.
+ In both cases the cost of the reformatting stall was too high
+ and the overall sequence slower. */
+
+ emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
+ const0_rtx, const0_rtx));
+ emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
+ const0_rtx, const0_rtx));
+ res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
+
+ set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
+}
+
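+/* Expand the V2DImode or V4DImode multiply OP0 = OP1 * OP2 in terms of
+ SImode widening multiplies, or an XOP multiply-add sequence for
+ V2DImode. */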
+void
+ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
+{
+ enum machine_mode mode = GET_MODE (op0);
+ rtx t1, t2, t3, t4, t5, t6;
+
+ if (TARGET_XOP && mode == V2DImode)
+ {
+ /* op1: A,B,C,D, op2: E,F,G,H */
+ op1 = gen_lowpart (V4SImode, op1);
+ op2 = gen_lowpart (V4SImode, op2);
+
+ t1 = gen_reg_rtx (V4SImode);
+ t2 = gen_reg_rtx (V4SImode);
+ t3 = gen_reg_rtx (V2DImode);
+ t4 = gen_reg_rtx (V2DImode);
+
+ /* t1: B,A,D,C */
+ emit_insn (gen_sse2_pshufd_1 (t1, op1,
+ GEN_INT (1),
+ GEN_INT (0),
+ GEN_INT (3),
+ GEN_INT (2)));
+
+ /* t2: (B*E),(A*F),(D*G),(C*H) */
+ emit_insn (gen_mulv4si3 (t2, t1, op2));
+
+ /* t3: (B*E)+(A*F), (D*G)+(C*H) */
+ emit_insn (gen_xop_phadddq (t3, t2));
+
+ /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
+ emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
+
+ /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
+ emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
+ }
+ else
+ {
+ enum machine_mode nmode;
+ rtx (*umul) (rtx, rtx, rtx);
+
+ if (mode == V2DImode)
+ {
+ umul = gen_vec_widen_umult_even_v4si;
+ nmode = V4SImode;
+ }
+ else if (mode == V4DImode)
+ {
+ umul = gen_vec_widen_umult_even_v8si;
+ nmode = V8SImode;
+ }
+ else
+ gcc_unreachable ();
+
+ /* Multiply low parts. */
+ t1 = gen_reg_rtx (mode);
+ emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
+
+ /* Shift input vectors right 32 bits so we can multiply high parts. */
+ t6 = GEN_INT (32);
+ t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
+ t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
+
+ /* Multiply high parts by low parts. */
+ t4 = gen_reg_rtx (mode);
+ t5 = gen_reg_rtx (mode);
+ emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
+ emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
+
+ /* Combine and shift the highparts back. */
+ t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
+ t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
+
+ /* Combine high and low parts. */
+ force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
+ }
+
+ set_unique_reg_note (get_last_insn (), REG_EQUAL,
+ gen_rtx_MULT (mode, op1, op2));
+}
+
/* Expand an insert into a vector register through pinsr insn.
Return true if successful. */
#define TARGET_SCHED_DISPATCH_DO do_dispatch
#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
+#undef TARGET_SCHED_REORDER
+#define TARGET_SCHED_REORDER ix86_sched_reorder
/* The size of the dispatch window is the total number of bytes of
object code allowed in a window. */
/* Make disp_cmp and disp_jcc get scheduled at the latest. These
instructions should be given the lowest priority in the
scheduling process in Haifa scheduler to make sure they will be
- scheduled in the same dispatch window as the refrence to them. */
+ scheduled in the same dispatch window as the reference to them. */
if (group == disp_jcc || group == disp_cmp)
return false;
return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
}
+/* Implement targetm.vectorize.init_cost. */
+
+static void *
+ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
+{
+ unsigned *cost = XNEW (unsigned);
+ *cost = 0;
+ return cost;
+}
+
+/* Implement targetm.vectorize.add_stmt_cost. */
+
+static unsigned
+ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
+ struct _stmt_vec_info *stmt_info, int misalign)
+{
+ unsigned *cost = (unsigned *) data;
+ unsigned retval = 0;
+
+ if (flag_vect_cost_model)
+ {
+ tree vectype = stmt_vectype (stmt_info);
+ int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
+
+ /* Statements in an inner loop relative to the loop being
+ vectorized are weighted more heavily. The value here is
+ arbitrary and could potentially be improved with analysis. */
+ if (stmt_in_inner_loop_p (stmt_info))
+ count *= 50; /* FIXME. */
+
+ retval = (unsigned) (count * stmt_cost);
+ *cost += retval;
+ }
+
+ return retval;
+}
+
+/* Implement targetm.vectorize.finish_cost. */
+
+static unsigned
+ix86_finish_cost (void *data)
+{
+ return *((unsigned *) data);
+}
+
+/* Implement targetm.vectorize.destroy_cost_data. */
+
+static void
+ix86_destroy_cost_data (void *data)
+{
+ free (data);
+}
+
/* Validate target specific memory model bits in VAL. */
static unsigned HOST_WIDE_INT
#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
ix86_autovectorize_vector_sizes
+#undef TARGET_VECTORIZE_INIT_COST
+#define TARGET_VECTORIZE_INIT_COST ix86_init_cost
+#undef TARGET_VECTORIZE_ADD_STMT_COST
+#define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
+#undef TARGET_VECTORIZE_FINISH_COST
+#define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
+#undef TARGET_VECTORIZE_DESTROY_COST_DATA
+#define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function