+2008-05-14 Michael Meissner <michael.meissner@amd.com>
+ Dwarakanath Rajagopal <dwarak.rajagopal@amd.com>
+
+ * optabs.h (optab_index): Add OTI_vashl, OTI_vlshr, OTI_vashr,
+ OTI_vrotl, OTI_vrotr to support vector/vector shifts.
+ (vashl_optab): New optab for vector/vector shifts.
+ (vashr_optab): Ditto.
+ (vlshr_optab): Ditto.
+ (vrotl_optab): Ditto.
+ (vrotr_optab): Ditto.
+ (optab_subtype): New enum for optab_for_tree_code call.
+ (optab_for_tree_code): Add enum optab_subtype argument.
+
+ * optabs.c (optab_for_tree_code): Take an additional argument to
+ distinguish between a vector shift by a scalar and vector shift by
+ a vector.  Make the lshr/ashr/ashl/rotl/rotr optabs handle only
+ vector shifts by a scalar.  Use vlshr/vashr/vashl/vrotl/vrotr for
+ vector shifts by a vector.
+ (expand_widen_pattern_expr): Pass additional argument to
+ optab_for_tree_code.
+
+ * genopinit.c (optabs): Add vashr_optab, vashl_optab, vlshr_optab,
+ vrotl_optab, vrotr_optab.
+
+ * expr.c (expand_expr_real_1): Update calls to
+ optab_for_tree_code to distinguish between vector shifted by a
+ scalar and vector shifted by a vector.
+ * tree-vectorizer.c (supportable_widening_operation): Ditto.
+ (supportable_narrowing_operation): Ditto.
+ * tree-vect-analyze.c (vect_build_slp_tree): Ditto.
+ * tree-vect-patterns.c (vect_pattern_recog_1): Ditto.
+ * tree-vect-transform.c (vect_model_reduction_cost): Ditto.
+ (vect_create_epilog_for_reduction): Ditto.
+ (vectorizable_reduction): Ditto.
+ (vectorizable_operation): Ditto.
+ (vect_strided_store_supported): Ditto.
+ (vect_strided_load_supported): Ditto.
+ * tree-vect-generic.c (expand_vector_operations_1): Ditto.
+ * expmed.c (expand_shift): Ditto.
+
+ * doc/md.texi (ashl@var{m}3): Document that operand 2 is always a
+ scalar type.
+ (ashr@var{m}3): Ditto.
+ (vashl@var{m}3): Document new vector/vector shift standard name.
+ (vashr@var{m}3): Ditto.
+ (vlshr@var{m}3): Ditto.
+ (vrotl@var{m}3): Ditto.
+ (vrotr@var{m}3): Ditto.
+
+ * config/i386/i386.md (PPERM_SRC): Move PPERM masks here from
+ i386.c.
+ (PPERM_INVERT): Ditto.
+ (PPERM_REVERSE): Ditto.
+ (PPERM_REV_INV): Ditto.
+ (PPERM_ZERO): Ditto.
+ (PPERM_ONES): Ditto.
+ (PPERM_SIGN): Ditto.
+ (PPERM_INV_SIGN): Ditto.
+ (PPERM_SRC1): Ditto.
+ (PPERM_SRC2): Ditto.
+
+ * config/i386/sse.md (mulv2di3): Add SSE5 support.
+ (sse5_pmacsdql_mem): New SSE5 define_insn_and_split that
+ temporarily allows a memory operand as the value being added, and
+ is split later to improve vectorization.
+ (sse5_pmacsdqh_mem): Ditto.
+ (sse5_mulv2div2di3_low): New SSE5 32-bit multiply and extend insn.
+ (sse5_mulv2div2di3_high): Ditto.
+ (vec_pack_trunc_v8hi): Add SSE5 pperm support.
+ (vec_pack_trunc_v4si): Ditto.
+ (vec_pack_trunc_v2di): Ditto.
+ (sse5_pcmov_<mode>): Remove code that tried to use andps/andnps
+ instead of pcmov.
+ (vec_widen_smult_hi_v4si): If we have SSE5, use the pmacsdql and
+ pmacsdqh instructions.
+ (vec_widen_smult_lo_v4si): Ditto.
+
+ * config/i386/i386.c (PPERM_SRC): Move PPERM masks to i386.md.
+ (PPERM_INVERT): Ditto.
+ (PPERM_REVERSE): Ditto.
+ (PPERM_REV_INV): Ditto.
+ (PPERM_ZERO): Ditto.
+ (PPERM_ONES): Ditto.
+ (PPERM_SIGN): Ditto.
+ (PPERM_INV_SIGN): Ditto.
+ (PPERM_SRC1): Ditto.
+ (PPERM_SRC2): Ditto.
+ (ix86_expand_sse_movcc): Move the SSE5 test after the if
+ true/false tests.
+ (ix86_expand_int_vcond): If SSE5 generate all possible integer
+ comparisons.
+ (ix86_sse5_valid_op_p): Allow num_memory to be negative, which
+ means to ignore whether the last operand is a memory reference.
+
+2008-05-14 Michael Meissner <michael.meissner@amd.com>
+ Paolo Bonzini <bonzini at gnu dot org>
+
+ * config/rs6000/rs6000.c (bdesc_2arg): Change the names of vector
+ shift patterns.
+
+ * config/rs6000/altivec.md (vashl<mode>3): Rename from
+ ashl<mode>3.
+ (vlshr<mode>3): Rename from lshr<mode>3.
+ (vashr<mode>3): Rename from ashr<mode>3.
+ (mulv4sf3): Change the names of vector shift patterns.
+ (mulv4si3): Ditto.
+ (negv4sf2): Ditto.
+
+ * config/spu/spu.c (spu_initialize_trampoline): Rename vector
+ shift insns.
+
+ * config/spu/spu-builtins.def (SI_SHLH): Rename vector shift
+ insns.
+ (SI_SHLHI): Ditto.
+ (SI_SHL): Ditto.
+ (SI_SHLI): Ditto.
+ (SI_ROTH): Ditto.
+ (SI_ROTHI): Ditto.
+ (SI_ROT): Ditto.
+ (SI_ROTI): Ditto.
+ (SPU_RL_0): Ditto.
+ (SPU_RL_1): Ditto.
+ (SPU_RL_2): Ditto.
+ (SPU_RL_3): Ditto.
+ (SPU_RL_4): Ditto.
+ (SPU_RL_5): Ditto.
+ (SPU_RL_6): Ditto.
+ (SPU_RL_7): Ditto.
+ (SPU_SL_0): Ditto.
+ (SPU_SL_1): Ditto.
+ (SPU_SL_2): Ditto.
+ (SPU_SL_3): Ditto.
+ (SPU_SL_4): Ditto.
+ (SPU_SL_5): Ditto.
+ (SPU_SL_6): Ditto.
+ (SPU_SL_7): Ditto.
+
+ * config/spu/spu.md (v): New mode attribute to add "v" for vector types.
+ (floatunssidf2_internal): Change vector/vector shift names.
+ (floatunsdidf2_internal): Ditto.
+ (mulv8hi3): Ditto.
+ (ashrdi3): Ditto.
+ (ashrti3): Ditto.
+ (cgt_df): Ditto.
+ (cgt_v2df): Ditto.
+ (dftsv): Ditto.
+ (vashl<mode>3): Rename from ashl<mode>3.
+ (vashr<mode>3): Rename from ashr<mode>3.
+ (vlshr<mode>3): Rename from lshr<mode>3.
+ (vrotl<mode>3): Rename from rotl<mode>3.
+
2008-05-14 Michael Meissner <michael.meissner@amd.com>
PR target/36224
enum machine_mode mode = GET_MODE (dest);
rtx t2, t3, x;
- if (TARGET_SSE5)
- {
- rtx pcmov = gen_rtx_SET (mode, dest,
- gen_rtx_IF_THEN_ELSE (mode, cmp,
- op_true,
- op_false));
- emit_insn (pcmov);
- }
- else if (op_false == CONST0_RTX (mode))
+ if (op_false == CONST0_RTX (mode))
{
op_true = force_reg (mode, op_true);
x = gen_rtx_AND (mode, cmp, op_true);
x = gen_rtx_AND (mode, x, op_false);
emit_insn (gen_rtx_SET (VOIDmode, dest, x));
}
+ else if (TARGET_SSE5)
+ {
+ rtx pcmov = gen_rtx_SET (mode, dest,
+ gen_rtx_IF_THEN_ELSE (mode, cmp,
+ op_true,
+ op_false));
+ emit_insn (pcmov);
+ }
else
{
op_true = force_reg (mode, op_true);
cop0 = operands[4];
cop1 = operands[5];
- /* Canonicalize the comparison to EQ, GT, GTU. */
- switch (code)
- {
- case EQ:
- case GT:
- case GTU:
- break;
-
- case NE:
- case LE:
- case LEU:
- code = reverse_condition (code);
- negate = true;
- break;
-
- case GE:
- case GEU:
- code = reverse_condition (code);
- negate = true;
- /* FALLTHRU */
-
- case LT:
- case LTU:
- code = swap_condition (code);
- x = cop0, cop0 = cop1, cop1 = x;
- break;
-
- default:
- gcc_unreachable ();
- }
-
- /* Only SSE4.1/SSE4.2 supports V2DImode. */
- if (mode == V2DImode)
+ /* SSE5 supports all of the comparisons on all vector int types. */
+ if (!TARGET_SSE5)
{
+ /* Canonicalize the comparison to EQ, GT, GTU. */
switch (code)
{
case EQ:
- /* SSE4.1 supports EQ. */
- if (!TARGET_SSE4_1)
- return false;
- break;
-
case GT:
case GTU:
- /* SSE4.2 supports GT/GTU. */
- if (!TARGET_SSE4_2)
- return false;
+ break;
+
+ case NE:
+ case LE:
+ case LEU:
+ code = reverse_condition (code);
+ negate = true;
+ break;
+
+ case GE:
+ case GEU:
+ code = reverse_condition (code);
+ negate = true;
+ /* FALLTHRU */
+
+ case LT:
+ case LTU:
+ code = swap_condition (code);
+ x = cop0, cop0 = cop1, cop1 = x;
break;
default:
gcc_unreachable ();
}
- }
- /* Unsigned parallel compare is not supported by the hardware. Play some
- tricks to turn this into a signed comparison against 0. */
- if (code == GTU)
- {
- cop0 = force_reg (mode, cop0);
+ /* Only SSE4.1/SSE4.2 supports V2DImode. */
+ if (mode == V2DImode)
+ {
+ switch (code)
+ {
+ case EQ:
+ /* SSE4.1 supports EQ. */
+ if (!TARGET_SSE4_1)
+ return false;
+ break;
- switch (mode)
+ case GT:
+ case GTU:
+ /* SSE4.2 supports GT/GTU. */
+ if (!TARGET_SSE4_2)
+ return false;
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+ }
+
+ /* Unsigned parallel compare is not supported by the hardware. Play some
+ tricks to turn this into a signed comparison against 0. */
+ if (code == GTU)
{
- case V4SImode:
- case V2DImode:
- {
- rtx t1, t2, mask;
-
- /* Perform a parallel modulo subtraction. */
- t1 = gen_reg_rtx (mode);
- emit_insn ((mode == V4SImode
- ? gen_subv4si3
- : gen_subv2di3) (t1, cop0, cop1));
-
- /* Extract the original sign bit of op0. */
- mask = ix86_build_signbit_mask (GET_MODE_INNER (mode),
- true, false);
- t2 = gen_reg_rtx (mode);
- emit_insn ((mode == V4SImode
- ? gen_andv4si3
- : gen_andv2di3) (t2, cop0, mask));
-
- /* XOR it back into the result of the subtraction. This results
- in the sign bit set iff we saw unsigned underflow. */
- x = gen_reg_rtx (mode);
- emit_insn ((mode == V4SImode
- ? gen_xorv4si3
- : gen_xorv2di3) (x, t1, t2));
-
- code = GT;
- }
- break;
+ cop0 = force_reg (mode, cop0);
- case V16QImode:
- case V8HImode:
- /* Perform a parallel unsigned saturating subtraction. */
- x = gen_reg_rtx (mode);
- emit_insn (gen_rtx_SET (VOIDmode, x,
- gen_rtx_US_MINUS (mode, cop0, cop1)));
+ switch (mode)
+ {
+ case V4SImode:
+ case V2DImode:
+ {
+ rtx t1, t2, mask;
+
+ /* Perform a parallel modulo subtraction. */
+ t1 = gen_reg_rtx (mode);
+ emit_insn ((mode == V4SImode
+ ? gen_subv4si3
+ : gen_subv2di3) (t1, cop0, cop1));
+
+ /* Extract the original sign bit of op0. */
+ mask = ix86_build_signbit_mask (GET_MODE_INNER (mode),
+ true, false);
+ t2 = gen_reg_rtx (mode);
+ emit_insn ((mode == V4SImode
+ ? gen_andv4si3
+ : gen_andv2di3) (t2, cop0, mask));
+
+ /* XOR it back into the result of the subtraction. This results
+ in the sign bit set iff we saw unsigned underflow. */
+ x = gen_reg_rtx (mode);
+ emit_insn ((mode == V4SImode
+ ? gen_xorv4si3
+ : gen_xorv2di3) (x, t1, t2));
+
+ code = GT;
+ }
+ break;
- code = EQ;
- negate = !negate;
- break;
+ case V16QImode:
+ case V8HImode:
+ /* Perform a parallel unsigned saturating subtraction. */
+ x = gen_reg_rtx (mode);
+ emit_insn (gen_rtx_SET (VOIDmode, x,
+ gen_rtx_US_MINUS (mode, cop0, cop1)));
- default:
- gcc_unreachable ();
- }
+ code = EQ;
+ negate = !negate;
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
- cop0 = x;
- cop1 = CONST0_RTX (mode);
+ cop0 = x;
+ cop1 = CONST0_RTX (mode);
+ }
}
x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
}
/* This function performs the same task as ix86_expand_sse_unpack,
- but with amdfam15 instructions. */
-
-#define PPERM_SRC 0x00 /* copy source */
-#define PPERM_INVERT 0x20 /* invert source */
-#define PPERM_REVERSE 0x40 /* bit reverse source */
-#define PPERM_REV_INV 0x60 /* bit reverse & invert src */
-#define PPERM_ZERO 0x80 /* all 0's */
-#define PPERM_ONES 0xa0 /* all 1's */
-#define PPERM_SIGN 0xc0 /* propagate sign bit */
-#define PPERM_INV_SIGN 0xe0 /* invert & propagate sign */
-
-#define PPERM_SRC1 0x00 /* use first source byte */
-#define PPERM_SRC2 0x10 /* use second source byte */
+ but with sse5 instructions. */
void
ix86_expand_sse5_unpack (rtx operands[2], bool unsigned_p, bool high_p)
{ OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_pmacsdqh, "__builtin_ia32_pmacsdqh", IX86_BUILTIN_PMACSDQH, 0, (int)MULTI_ARG_3_SI_DI },
{ OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_pmadcsswd, "__builtin_ia32_pmadcsswd", IX86_BUILTIN_PMADCSSWD, 0, (int)MULTI_ARG_3_HI_SI },
{ OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_pmadcswd, "__builtin_ia32_pmadcswd", IX86_BUILTIN_PMADCSWD, 0, (int)MULTI_ARG_3_HI_SI },
- { OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_rotlv2di3, "__builtin_ia32_protq", IX86_BUILTIN_PROTQ, 0, (int)MULTI_ARG_2_DI },
- { OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_rotlv4si3, "__builtin_ia32_protd", IX86_BUILTIN_PROTD, 0, (int)MULTI_ARG_2_SI },
- { OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_rotlv8hi3, "__builtin_ia32_protw", IX86_BUILTIN_PROTW, 0, (int)MULTI_ARG_2_HI },
- { OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_rotlv16qi3, "__builtin_ia32_protb", IX86_BUILTIN_PROTB, 0, (int)MULTI_ARG_2_QI },
- { OPTION_MASK_ISA_SSE5, CODE_FOR_rotlv2di3, "__builtin_ia32_protqi", IX86_BUILTIN_PROTQ_IMM, 0, (int)MULTI_ARG_2_DI_IMM },
- { OPTION_MASK_ISA_SSE5, CODE_FOR_rotlv4si3, "__builtin_ia32_protdi", IX86_BUILTIN_PROTD_IMM, 0, (int)MULTI_ARG_2_SI_IMM },
- { OPTION_MASK_ISA_SSE5, CODE_FOR_rotlv8hi3, "__builtin_ia32_protwi", IX86_BUILTIN_PROTW_IMM, 0, (int)MULTI_ARG_2_HI_IMM },
- { OPTION_MASK_ISA_SSE5, CODE_FOR_rotlv16qi3, "__builtin_ia32_protbi", IX86_BUILTIN_PROTB_IMM, 0, (int)MULTI_ARG_2_QI_IMM },
+ { OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_vrotlv2di3, "__builtin_ia32_protq", IX86_BUILTIN_PROTQ, 0, (int)MULTI_ARG_2_DI },
+ { OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_vrotlv4si3, "__builtin_ia32_protd", IX86_BUILTIN_PROTD, 0, (int)MULTI_ARG_2_SI },
+ { OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_vrotlv8hi3, "__builtin_ia32_protw", IX86_BUILTIN_PROTW, 0, (int)MULTI_ARG_2_HI },
+ { OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_vrotlv16qi3, "__builtin_ia32_protb", IX86_BUILTIN_PROTB, 0, (int)MULTI_ARG_2_QI },
+ { OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_rotlv2di3, "__builtin_ia32_protqi", IX86_BUILTIN_PROTQ_IMM, 0, (int)MULTI_ARG_2_DI_IMM },
+ { OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_rotlv4si3, "__builtin_ia32_protdi", IX86_BUILTIN_PROTD_IMM, 0, (int)MULTI_ARG_2_SI_IMM },
+ { OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_rotlv8hi3, "__builtin_ia32_protwi", IX86_BUILTIN_PROTW_IMM, 0, (int)MULTI_ARG_2_HI_IMM },
+ { OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_rotlv16qi3, "__builtin_ia32_protbi", IX86_BUILTIN_PROTB_IMM, 0, (int)MULTI_ARG_2_QI_IMM },
{ OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_ashlv2di3, "__builtin_ia32_pshaq", IX86_BUILTIN_PSHAQ, 0, (int)MULTI_ARG_2_DI },
{ OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_ashlv4si3, "__builtin_ia32_pshad", IX86_BUILTIN_PSHAD, 0, (int)MULTI_ARG_2_SI },
{ OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_ashlv8hi3, "__builtin_ia32_pshaw", IX86_BUILTIN_PSHAW, 0, (int)MULTI_ARG_2_HI },
NUM is the number of operands.
USES_OC0 is true if the instruction uses OC0 and provides 4 variants.
NUM_MEMORY is the maximum number of memory operands to accept. */
+
bool
-ix86_sse5_valid_op_p (rtx operands[], rtx insn, int num, bool uses_oc0, int num_memory)
+ix86_sse5_valid_op_p (rtx operands[], rtx insn ATTRIBUTE_UNUSED, int num,
+ bool uses_oc0, int num_memory)
{
int mem_mask;
int mem_count;
}
}
+ /* Special case pmacsdq{l,h} where we allow the 3rd argument to be
+ a memory operand. */
+ if (num_memory < 0)
+ {
+ num_memory = -num_memory;
+ if ((mem_mask & (1 << (num-1))) != 0)
+ {
+ mem_mask &= ~(1 << (num-1));
+ mem_count--;
+ }
+ }
+
/* If there were no memory operations, allow the insn */
if (mem_mask == 0)
return true;
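
For reference, the new *_mem multiply-add patterns later in this patch pass a
negative count:

    ix86_sse5_valid_op_p (operands, insn, 4, false, -1)

which, as the code above reads, means "allow at most one memory operand, and
do not count the last operand (the value being added) if it happens to be a
memory reference".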
(UNSPEC_SSE5_UNSIGNED_CMP 151)
(UNSPEC_SSE5_TRUEFALSE 152)
(UNSPEC_SSE5_PERMUTE 153)
- (UNSPEC_SSE5_ASHIFT 154)
- (UNSPEC_SSE5_LSHIFT 155)
- (UNSPEC_FRCZ 156)
- (UNSPEC_CVTPH2PS 157)
- (UNSPEC_CVTPS2PH 158)
+ (UNSPEC_FRCZ 154)
+ (UNSPEC_CVTPH2PS 155)
+ (UNSPEC_CVTPS2PH 156)
; For AES support
(UNSPEC_AESENC 159)
(COM_TRUE_P 5)
])
+;; Constants used in the SSE5 pperm instruction
+(define_constants
+ [(PPERM_SRC 0x00) /* copy source */
+ (PPERM_INVERT 0x20) /* invert source */
+ (PPERM_REVERSE 0x40) /* bit reverse source */
+ (PPERM_REV_INV 0x60) /* bit reverse & invert src */
+ (PPERM_ZERO 0x80) /* all 0's */
+ (PPERM_ONES 0xa0) /* all 1's */
+ (PPERM_SIGN 0xc0) /* propagate sign bit */
+ (PPERM_INV_SIGN 0xe0) /* invert & propagate sign */
+ (PPERM_SRC1 0x00) /* use first source byte */
+ (PPERM_SRC2 0x10) /* use second source byte */
+ ])
+
;; Registers by name.
(define_constants
[(AX_REG 0)
(define_mode_attr sserotatemax [(V16QI "7") (V8HI "15") (V4SI "31") (V2DI "63")])
;; Mapping of vector modes back to the scalar modes
-(define_mode_attr ssescalarmode [(V4SF "SF") (V2DF "DF")])
+(define_mode_attr ssescalarmode [(V4SF "SF") (V2DF "DF")
+ (V16QI "QI") (V8HI "HI")
+ (V4SI "SI") (V2DI "DI")])
+
+;; Number of scalar elements in each vector type
+(define_mode_attr ssescalarnum [(V4SF "4") (V2DF "2")
+ (V16QI "16") (V8HI "8")
+ (V4SI "4") (V2DI "2")])
;; Mapping of immediate bits for blend instructions
(define_mode_attr blendbits [(V4SF "15") (V2DF "3")])
;; We don't have a straight 32-bit parallel multiply on SSE5, so fake it with a
;; multiply/add. In general, we expect the define_split to occur before
;; register allocation, so we have to handle the corner case where the target
-;; is used as the base or index register in operands 1/2.
+;; is the same as one of the inputs.
(define_insn_and_split "*sse5_mulv4si3"
[(set (match_operand:V4SI 0 "register_operand" "=&x")
(mult:V4SI (match_operand:V4SI 1 "register_operand" "%x")
rtx t1, t2, t3, t4, t5, t6, thirtytwo;
rtx op0, op1, op2;
+ if (TARGET_SSE5)
+ {
+ /* op1: A,B,C,D, op2: E,F,G,H */
+ op0 = operands[0];
+ op1 = gen_lowpart (V4SImode, operands[1]);
+ op2 = gen_lowpart (V4SImode, operands[2]);
+ t1 = gen_reg_rtx (V4SImode);
+ t2 = gen_reg_rtx (V4SImode);
+ t3 = gen_reg_rtx (V4SImode);
+ t4 = gen_reg_rtx (V2DImode);
+ t5 = gen_reg_rtx (V2DImode);
+
+ /* t1: B,A,D,C */
+ emit_insn (gen_sse2_pshufd_1 (t1, op1,
+ GEN_INT (1),
+ GEN_INT (0),
+ GEN_INT (3),
+ GEN_INT (2)));
+
+ /* t2: 0 */
+ emit_move_insn (t2, CONST0_RTX (V4SImode));
+
+ /* t3: (B*E),(A*F),(D*G),(C*H) */
+ emit_insn (gen_sse5_pmacsdd (t3, t1, op2, t2));
+
+ /* t4: (B*E)+(A*F), (D*G)+(C*H) */
+ emit_insn (gen_sse5_phadddq (t4, t3));
+
+ /* t5: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
+ emit_insn (gen_ashlv2di3 (t5, t4, GEN_INT (32)));
+
+ /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
+ emit_insn (gen_sse5_pmacsdql (op0, op1, op2, t5));
+ DONE;
+ }
+
op0 = operands[0];
op1 = operands[1];
op2 = operands[2];
DONE;
})
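
The step-by-step comments in the SSE5 path above follow the usual double-word
decomposition.  As a plain C sketch of the identity the sequence relies on,
matching the op0 comment above (illustration only, not part of the patch):

    /* Low 64 bits of x*y via 32-bit halves: with x = A*2^32 + B and
       y = E*2^32 + F, (x*y) mod 2^64 = ((A*F + B*E) << 32) + B*F,
       which is what the pmacsdd/phadddq/pmacsdql sequence computes
       per V2DI lane.  */
    static inline unsigned long long
    mul64_low_sketch (unsigned long long x, unsigned long long y)
    {
      unsigned long long a = x >> 32, b = x & 0xffffffffULL;
      unsigned long long e = y >> 32, f = y & 0xffffffffULL;
      return ((a * f + b * e) << 32) + b * f;
    }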
+(define_expand "vec_widen_smult_hi_v4si"
+ [(match_operand:V2DI 0 "register_operand" "")
+ (match_operand:V4SI 1 "register_operand" "")
+ (match_operand:V4SI 2 "register_operand" "")]
+ "TARGET_SSE5"
+{
+ rtx t1, t2;
+
+ t1 = gen_reg_rtx (V4SImode);
+ t2 = gen_reg_rtx (V4SImode);
+
+ emit_insn (gen_sse2_pshufd_1 (t1, operands[1],
+ GEN_INT (0),
+ GEN_INT (2),
+ GEN_INT (1),
+ GEN_INT (3)));
+ emit_insn (gen_sse2_pshufd_1 (t2, operands[2],
+ GEN_INT (0),
+ GEN_INT (2),
+ GEN_INT (1),
+ GEN_INT (3)));
+ emit_insn (gen_sse5_mulv2div2di3_high (operands[0], t1, t2));
+ DONE;
+})
+
+(define_expand "vec_widen_smult_lo_v4si"
+ [(match_operand:V2DI 0 "register_operand" "")
+ (match_operand:V4SI 1 "register_operand" "")
+ (match_operand:V4SI 2 "register_operand" "")]
+ "TARGET_SSE5"
+{
+ rtx t1, t2;
+
+ t1 = gen_reg_rtx (V4SImode);
+ t2 = gen_reg_rtx (V4SImode);
+
+ emit_insn (gen_sse2_pshufd_1 (t1, operands[1],
+ GEN_INT (0),
+ GEN_INT (2),
+ GEN_INT (1),
+ GEN_INT (3)));
+ emit_insn (gen_sse2_pshufd_1 (t2, operands[2],
+ GEN_INT (0),
+ GEN_INT (2),
+ GEN_INT (1),
+ GEN_INT (3)));
+ emit_insn (gen_sse5_mulv2div2di3_low (operands[0], t1, t2));
+ DONE;
+})
+
(define_expand "vec_widen_umult_hi_v4si"
[(match_operand:V2DI 0 "register_operand" "")
(match_operand:V4SI 1 "register_operand" "")
{
rtx op1, op2, h1, l1, h2, l2, h3, l3;
+ if (TARGET_SSE5)
+ {
+ ix86_expand_sse5_pack (operands);
+ DONE;
+ }
+
op1 = gen_lowpart (V16QImode, operands[1]);
op2 = gen_lowpart (V16QImode, operands[2]);
h1 = gen_reg_rtx (V16QImode);
{
rtx op1, op2, h1, l1, h2, l2;
+ if (TARGET_SSE5)
+ {
+ ix86_expand_sse5_pack (operands);
+ DONE;
+ }
+
op1 = gen_lowpart (V8HImode, operands[1]);
op2 = gen_lowpart (V8HImode, operands[2]);
h1 = gen_reg_rtx (V8HImode);
{
rtx op1, op2, h1, l1;
+ if (TARGET_SSE5)
+ {
+ ix86_expand_sse5_pack (operands);
+ DONE;
+ }
+
op1 = gen_lowpart (V4SImode, operands[1]);
op2 = gen_lowpart (V4SImode, operands[2]);
h1 = gen_reg_rtx (V4SImode);
[(set_attr "type" "ssemuladd")
(set_attr "mode" "TI")])
+(define_insn_and_split "*sse5_pmacsdql_mem"
+ [(set (match_operand:V2DI 0 "register_operand" "=&x,&x,&x")
+ (plus:V2DI
+ (mult:V2DI
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_operand:V4SI 1 "nonimmediate_operand" "x,x,m")
+ (parallel [(const_int 1)
+ (const_int 3)])))
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_operand:V4SI 2 "nonimmediate_operand" "x,m,x")
+ (parallel [(const_int 1)
+ (const_int 3)]))))
+ (match_operand:V2DI 3 "memory_operand" "m,m,m")))]
+ "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, false, -1)"
+ "#"
+ "&& (reload_completed
+ || (!reg_mentioned_p (operands[0], operands[1])
+ && !reg_mentioned_p (operands[0], operands[2])))"
+ [(set (match_dup 0)
+ (match_dup 3))
+ (set (match_dup 0)
+ (plus:V2DI
+ (mult:V2DI
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_dup 1)
+ (parallel [(const_int 1)
+ (const_int 3)])))
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_dup 2)
+ (parallel [(const_int 1)
+ (const_int 3)]))))
+ (match_dup 0)))])
+
+;; We don't have a straight 32-bit parallel multiply and extend on SSE5, so
+;; fake it with a multiply/add. In general, we expect the define_split to
+;; occur before register allocation, so we have to handle the corner case where
+;; the target is the same as operands 1/2
+(define_insn_and_split "sse5_mulv2div2di3_low"
+ [(set (match_operand:V2DI 0 "register_operand" "=&x")
+ (mult:V2DI
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_operand:V4SI 1 "nonimmediate_operand" "%x")
+ (parallel [(const_int 1)
+ (const_int 3)])))
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_operand:V4SI 2 "nonimmediate_operand" "xm")
+ (parallel [(const_int 1)
+ (const_int 3)])))))]
+ "TARGET_SSE5"
+ "#"
+ "&& (reload_completed
+ || (!reg_mentioned_p (operands[0], operands[1])
+ && !reg_mentioned_p (operands[0], operands[2])))"
+ [(set (match_dup 0)
+ (match_dup 3))
+ (set (match_dup 0)
+ (plus:V2DI
+ (mult:V2DI
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_dup 1)
+ (parallel [(const_int 1)
+ (const_int 3)])))
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_dup 2)
+ (parallel [(const_int 1)
+ (const_int 3)]))))
+ (match_dup 0)))]
+{
+ operands[3] = CONST0_RTX (V2DImode);
+}
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "TI")])
+
(define_insn "sse5_pmacsdqh"
[(set (match_operand:V2DI 0 "register_operand" "=x,x,x")
(plus:V2DI
[(set_attr "type" "ssemuladd")
(set_attr "mode" "TI")])
+(define_insn_and_split "*sse5_pmacsdqh_mem"
+ [(set (match_operand:V2DI 0 "register_operand" "=&x,&x,&x")
+ (plus:V2DI
+ (mult:V2DI
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_operand:V4SI 1 "nonimmediate_operand" "x,x,m")
+ (parallel [(const_int 0)
+ (const_int 2)])))
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_operand:V4SI 2 "nonimmediate_operand" "x,m,x")
+ (parallel [(const_int 0)
+ (const_int 2)]))))
+ (match_operand:V2DI 3 "memory_operand" "m,m,m")))]
+ "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, false, -1)"
+ "#"
+ "&& (reload_completed
+ || (!reg_mentioned_p (operands[0], operands[1])
+ && !reg_mentioned_p (operands[0], operands[2])))"
+ [(set (match_dup 0)
+ (match_dup 3))
+ (set (match_dup 0)
+ (plus:V2DI
+ (mult:V2DI
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_dup 1)
+ (parallel [(const_int 0)
+ (const_int 2)])))
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_dup 2)
+ (parallel [(const_int 0)
+ (const_int 2)]))))
+ (match_dup 0)))])
+
+;; We don't have a straight 32-bit parallel multiply and extend on SSE5, so
+;; fake it with a multiply/add. In general, we expect the define_split to
+;; occur before register allocation, so we have to handle the corner case where
+;; the target is the same as either operands[1] or operands[2]
+(define_insn_and_split "sse5_mulv2div2di3_high"
+ [(set (match_operand:V2DI 0 "register_operand" "=&x")
+ (mult:V2DI
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_operand:V4SI 1 "nonimmediate_operand" "%x")
+ (parallel [(const_int 0)
+ (const_int 2)])))
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_operand:V4SI 2 "nonimmediate_operand" "xm")
+ (parallel [(const_int 0)
+ (const_int 2)])))))]
+ "TARGET_SSE5"
+ "#"
+ "&& (reload_completed
+ || (!reg_mentioned_p (operands[0], operands[1])
+ && !reg_mentioned_p (operands[0], operands[2])))"
+ [(set (match_dup 0)
+ (match_dup 3))
+ (set (match_dup 0)
+ (plus:V2DI
+ (mult:V2DI
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_dup 1)
+ (parallel [(const_int 0)
+ (const_int 2)])))
+ (sign_extend:V2DI
+ (vec_select:V2SI
+ (match_dup 2)
+ (parallel [(const_int 0)
+ (const_int 2)]))))
+ (match_dup 0)))]
+{
+ operands[3] = CONST0_RTX (V2DImode);
+}
+ [(set_attr "type" "ssemuladd")
+ (set_attr "mode" "TI")])
+
;; SSE5 parallel integer multiply/add instructions for the intrinisics
(define_insn "sse5_pmacsswd"
[(set (match_operand:V4SI 0 "register_operand" "=x,x,x")
;; SSE5 parallel XMM conditional moves
(define_insn "sse5_pcmov_<mode>"
- [(set (match_operand:SSEMODE 0 "register_operand" "=x,x,x,x,x,x")
+ [(set (match_operand:SSEMODE 0 "register_operand" "=x,x,x,x")
(if_then_else:SSEMODE
- (match_operand:SSEMODE 3 "nonimmediate_operand" "0,0,xm,x,0,0")
- (match_operand:SSEMODE 1 "vector_move_operand" "x,xm,0,0,C,x")
- (match_operand:SSEMODE 2 "vector_move_operand" "xm,x,x,xm,x,C")))]
+ (match_operand:SSEMODE 3 "nonimmediate_operand" "0,0,xm,x")
+ (match_operand:SSEMODE 1 "vector_move_operand" "x,xm,0,0")
+ (match_operand:SSEMODE 2 "vector_move_operand" "xm,x,x,xm")))]
"TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, true, 1)"
"@
pcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}
pcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}
pcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}
- pcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}
- andps\t{%2, %0|%0, %2}
- andnps\t{%1, %0|%0, %1}"
+ pcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}"
[(set_attr "type" "sse4arg")])
;; SSE5 horizontal add/subtract instructions
(set_attr "mode" "<MODE>")])
;; SSE5 packed rotate instructions
-(define_insn "rotl<mode>3"
+(define_expand "rotl<mode>3"
+ [(set (match_operand:SSEMODE1248 0 "register_operand" "")
+ (rotate:SSEMODE1248
+ (match_operand:SSEMODE1248 1 "nonimmediate_operand" "")
+ (match_operand:SI 2 "general_operand")))]
+ "TARGET_SSE5"
+{
+ /* If we were given a scalar, convert it to parallel */
+ if (! const_0_to_<sserotatemax>_operand (operands[2], SImode))
+ {
+ rtvec vs = rtvec_alloc (<ssescalarnum>);
+ rtx par = gen_rtx_PARALLEL (<MODE>mode, vs);
+ rtx reg = gen_reg_rtx (<MODE>mode);
+ rtx op2 = operands[2];
+ int i;
+
+ if (GET_MODE (op2) != <ssescalarmode>mode)
+ {
+ op2 = gen_reg_rtx (<ssescalarmode>mode);
+ convert_move (op2, operands[2], false);
+ }
+
+ for (i = 0; i < <ssescalarnum>; i++)
+ RTVEC_ELT (vs, i) = op2;
+
+ emit_insn (gen_vec_init<mode> (reg, par));
+ emit_insn (gen_sse5_vrotl<mode>3 (operands[0], operands[1], reg));
+ DONE;
+ }
+})
+
+(define_expand "rotr<mode>3"
+ [(set (match_operand:SSEMODE1248 0 "register_operand" "")
+ (rotatert:SSEMODE1248
+ (match_operand:SSEMODE1248 1 "nonimmediate_operand" "")
+ (match_operand:SI 2 "general_operand")))]
+ "TARGET_SSE5"
+{
+ /* If we were given a scalar, convert it to parallel */
+ if (! const_0_to_<sserotatemax>_operand (operands[2], SImode))
+ {
+ rtvec vs = rtvec_alloc (<ssescalarnum>);
+ rtx par = gen_rtx_PARALLEL (<MODE>mode, vs);
+ rtx neg = gen_reg_rtx (<MODE>mode);
+ rtx reg = gen_reg_rtx (<MODE>mode);
+ rtx op2 = operands[2];
+ int i;
+
+ if (GET_MODE (op2) != <ssescalarmode>mode)
+ {
+ op2 = gen_reg_rtx (<ssescalarmode>mode);
+ convert_move (op2, operands[2], false);
+ }
+
+ for (i = 0; i < <ssescalarnum>; i++)
+ RTVEC_ELT (vs, i) = op2;
+
+ emit_insn (gen_vec_init<mode> (reg, par));
+ emit_insn (gen_neg<mode>2 (neg, reg));
+ emit_insn (gen_sse5_vrotl<mode>3 (operands[0], operands[1], neg));
+ DONE;
+ }
+})
+
+(define_insn "sse5_rotl<mode>3"
[(set (match_operand:SSEMODE1248 0 "register_operand" "=x")
(rotate:SSEMODE1248
(match_operand:SSEMODE1248 1 "nonimmediate_operand" "xm")
[(set_attr "type" "sseishft")
(set_attr "mode" "TI")])
-(define_insn "sse5_rotl<mode>3"
+(define_insn "sse5_rotr<mode>3"
+ [(set (match_operand:SSEMODE1248 0 "register_operand" "=x")
+ (rotatert:SSEMODE1248
+ (match_operand:SSEMODE1248 1 "nonimmediate_operand" "xm")
+ (match_operand:SI 2 "const_0_to_<sserotatemax>_operand" "n")))]
+ "TARGET_SSE5"
+{
+ operands[3] = GEN_INT ((<ssescalarnum> * 8) - INTVAL (operands[2]));
+ return \"prot<ssevecsize>\t{%3, %1, %0|%0, %1, %3}\";
+}
+ [(set_attr "type" "sseishft")
+ (set_attr "mode" "TI")])
+
+(define_expand "vrotr<mode>3"
+ [(match_operand:SSEMODE1248 0 "register_operand" "")
+ (match_operand:SSEMODE1248 1 "register_operand" "")
+ (match_operand:SSEMODE1248 2 "register_operand" "")]
+ "TARGET_SSE5"
+{
+ rtx reg = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_neg<mode>2 (reg, operands[2]));
+ emit_insn (gen_sse5_vrotl<mode>3 (operands[0], operands[1], reg));
+ DONE;
+})
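
vrotr above is implemented by negating the count and reusing the vrotl
pattern; per element that is the standard identity (plain C sketch,
illustration only):

    /* Rotating right by n equals rotating left by -n modulo the element
       width; for one 32-bit element:  */
    static inline unsigned int
    rotr32_sketch (unsigned int x, unsigned int n)
    {
      n &= 31;
      return (x >> n) | (x << ((32 - n) & 31));
    }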
+
+(define_expand "vrotl<mode>3"
+ [(match_operand:SSEMODE1248 0 "register_operand" "")
+ (match_operand:SSEMODE1248 1 "register_operand" "")
+ (match_operand:SSEMODE1248 2 "register_operand" "")]
+ "TARGET_SSE5"
+{
+ emit_insn (gen_sse5_vrotl<mode>3 (operands[0], operands[1], operands[2]));
+ DONE;
+})
+
+(define_insn "sse5_vrotl<mode>3"
[(set (match_operand:SSEMODE1248 0 "register_operand" "=x,x")
- (rotate:SSEMODE1248
- (match_operand:SSEMODE1248 1 "nonimmediate_operand" "x,xm")
- (match_operand:SSEMODE1248 2 "nonimmediate_operand" "xm,x")))]
+ (if_then_else:SSEMODE1248
+ (ge:SSEMODE1248
+ (match_operand:SSEMODE1248 2 "nonimmediate_operand" "xm,x")
+ (const_int 0))
+ (rotate:SSEMODE1248
+ (match_operand:SSEMODE1248 1 "nonimmediate_operand" "x,xm")
+ (match_dup 2))
+ (rotatert:SSEMODE1248
+ (match_dup 1)
+ (neg:SSEMODE1248 (match_dup 2)))))]
"TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 3, true, 1)"
"prot<ssevecsize>\t{%2, %1, %0|%0, %1, %2}"
[(set_attr "type" "sseishft")
(set_attr "mode" "TI")])
-;; SSE5 packed shift instructions. Note negative values for the shift amount
-;; convert this into a right shift instead of left shift. For now, model this
-;; with an UNSPEC instead of using ashift/lshift since the rest of the x86 does
-;; not have the concept of negating the shift amount. Also, there is no LSHIFT
+;; SSE5 packed shift instructions.
+;; FIXME: add V2DI back in
+(define_expand "vlshr<mode>3"
+ [(match_operand:SSEMODE124 0 "register_operand" "")
+ (match_operand:SSEMODE124 1 "register_operand" "")
+ (match_operand:SSEMODE124 2 "register_operand" "")]
+ "TARGET_SSE5"
+{
+ rtx neg = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_neg<mode>2 (neg, operands[2]));
+ emit_insn (gen_sse5_lshl<mode>3 (operands[0], operands[1], neg));
+ DONE;
+})
+
+(define_expand "vashr<mode>3"
+ [(match_operand:SSEMODE124 0 "register_operand" "")
+ (match_operand:SSEMODE124 1 "register_operand" "")
+ (match_operand:SSEMODE124 2 "register_operand" "")]
+ "TARGET_SSE5"
+{
+ rtx neg = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_neg<mode>2 (neg, operands[2]));
+ emit_insn (gen_sse5_ashl<mode>3 (operands[0], operands[1], neg));
+ DONE;
+})
+
+(define_expand "vashl<mode>3"
+ [(match_operand:SSEMODE124 0 "register_operand" "")
+ (match_operand:SSEMODE124 1 "register_operand" "")
+ (match_operand:SSEMODE124 2 "register_operand" "")]
+ "TARGET_SSE5"
+{
+ emit_insn (gen_sse5_ashl<mode>3 (operands[0], operands[1], operands[2]));
+ DONE;
+})
+
(define_insn "sse5_ashl<mode>3"
[(set (match_operand:SSEMODE1248 0 "register_operand" "=x,x")
- (unspec:SSEMODE1248
- [(match_operand:SSEMODE1248 1 "nonimmediate_operand" "x,xm")
- (match_operand:SSEMODE1248 2 "nonimmediate_operand" "xm,x")]
- UNSPEC_SSE5_ASHIFT))]
+ (if_then_else:SSEMODE1248
+ (ge:SSEMODE1248
+ (match_operand:SSEMODE1248 2 "nonimmediate_operand" "xm,x")
+ (const_int 0))
+ (ashift:SSEMODE1248
+ (match_operand:SSEMODE1248 1 "nonimmediate_operand" "x,xm")
+ (match_dup 2))
+ (ashiftrt:SSEMODE1248
+ (match_dup 1)
+ (neg:SSEMODE1248 (match_dup 2)))))]
"TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 3, true, 1)"
"psha<ssevecsize>\t{%2, %1, %0|%0, %1, %2}"
[(set_attr "type" "sseishft")
(define_insn "sse5_lshl<mode>3"
[(set (match_operand:SSEMODE1248 0 "register_operand" "=x,x")
- (unspec:SSEMODE1248
- [(match_operand:SSEMODE1248 1 "nonimmediate_operand" "x,xm")
- (match_operand:SSEMODE1248 2 "nonimmediate_operand" "xm,x")]
- UNSPEC_SSE5_LSHIFT))]
+ (if_then_else:SSEMODE1248
+ (ge:SSEMODE1248
+ (match_operand:SSEMODE1248 2 "nonimmediate_operand" "xm,x")
+ (const_int 0))
+ (ashift:SSEMODE1248
+ (match_operand:SSEMODE1248 1 "nonimmediate_operand" "x,xm")
+ (match_dup 2))
+ (lshiftrt:SSEMODE1248
+ (match_dup 1)
+ (neg:SSEMODE1248 (match_dup 2)))))]
"TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 3, true, 1)"
"pshl<ssevecsize>\t{%2, %1, %0|%0, %1, %2}"
[(set_attr "type" "sseishft")
(set_attr "mode" "TI")])
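
The if_then_else forms above encode the SSE5 convention that a negative
per-element count shifts in the opposite direction; for one element of the
logical form the behaviour is roughly (plain C sketch, illustration only,
counts assumed in range):

    /* pshl-style logical shift with a signed per-element count.  */
    static inline unsigned int
    pshl_elem_sketch (unsigned int x, int n)
    {
      return n >= 0 ? x << n : x >> -n;
    }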
+;; SSE2 doesn't have some shift variants, so define versions for SSE5
+(define_expand "ashlv16qi3"
+ [(match_operand:V16QI 0 "register_operand" "")
+ (match_operand:V16QI 1 "register_operand" "")
+ (match_operand:SI 2 "nonmemory_operand" "")]
+ "TARGET_SSE5"
+{
+ rtvec vs = rtvec_alloc (16);
+ rtx par = gen_rtx_PARALLEL (V16QImode, vs);
+ rtx reg = gen_reg_rtx (V16QImode);
+ int i;
+ for (i = 0; i < 16; i++)
+ RTVEC_ELT (vs, i) = operands[2];
+
+ emit_insn (gen_vec_initv16qi (reg, par));
+ emit_insn (gen_sse5_ashlv16qi3 (operands[0], operands[1], reg));
+ DONE;
+})
+
+(define_expand "lshlv16qi3"
+ [(match_operand:V16QI 0 "register_operand" "")
+ (match_operand:V16QI 1 "register_operand" "")
+ (match_operand:SI 2 "nonmemory_operand" "")]
+ "TARGET_SSE5"
+{
+ rtvec vs = rtvec_alloc (16);
+ rtx par = gen_rtx_PARALLEL (V16QImode, vs);
+ rtx reg = gen_reg_rtx (V16QImode);
+ int i;
+ for (i = 0; i < 16; i++)
+ RTVEC_ELT (vs, i) = operands[2];
+
+ emit_insn (gen_vec_initv16qi (reg, par));
+ emit_insn (gen_sse5_lshlv16qi3 (operands[0], operands[1], reg));
+ DONE;
+})
+
+(define_expand "ashrv16qi3"
+ [(match_operand:V16QI 0 "register_operand" "")
+ (match_operand:V16QI 1 "register_operand" "")
+ (match_operand:SI 2 "nonmemory_operand" "")]
+ "TARGET_SSE5"
+{
+ rtvec vs = rtvec_alloc (16);
+ rtx par = gen_rtx_PARALLEL (V16QImode, vs);
+ rtx reg = gen_reg_rtx (V16QImode);
+ int i;
+ rtx ele = ((GET_CODE (operands[2]) == CONST_INT)
+ ? GEN_INT (- INTVAL (operands[2]))
+ : operands[2]);
+
+ for (i = 0; i < 16; i++)
+ RTVEC_ELT (vs, i) = ele;
+
+ emit_insn (gen_vec_initv16qi (reg, par));
+
+ if (GET_CODE (operands[2]) != CONST_INT)
+ {
+ rtx neg = gen_reg_rtx (V16QImode);
+ emit_insn (gen_negv16qi2 (neg, reg));
+ emit_insn (gen_sse5_ashlv16qi3 (operands[0], operands[1], neg));
+ }
+ else
+ emit_insn (gen_sse5_ashlv16qi3 (operands[0], operands[1], reg));
+
+ DONE;
+})
+
+(define_expand "ashrv2di3"
+ [(match_operand:V2DI 0 "register_operand" "")
+ (match_operand:V2DI 1 "register_operand" "")
+ (match_operand:DI 2 "nonmemory_operand" "")]
+ "TARGET_SSE5"
+{
+ rtvec vs = rtvec_alloc (2);
+ rtx par = gen_rtx_PARALLEL (V2DImode, vs);
+ rtx reg = gen_reg_rtx (V2DImode);
+ rtx ele;
+
+ if (GET_CODE (operands[2]) == CONST_INT)
+ ele = GEN_INT (- INTVAL (operands[2]));
+ else if (GET_MODE (operands[2]) != DImode)
+ {
+ rtx move = gen_reg_rtx (DImode);
+ ele = gen_reg_rtx (DImode);
+ convert_move (move, operands[2], false);
+ emit_insn (gen_negdi2 (ele, move));
+ }
+ else
+ {
+ ele = gen_reg_rtx (DImode);
+ emit_insn (gen_negdi2 (ele, operands[2]));
+ }
+
+ RTVEC_ELT (vs, 0) = ele;
+ RTVEC_ELT (vs, 1) = ele;
+ emit_insn (gen_vec_initv2di (reg, par));
+ emit_insn (gen_sse5_ashlv2di3 (operands[0], operands[1], reg));
+ DONE;
+})
+
;; SSE5 FRCZ support
;; parallel insns
(define_insn "sse5_frcz<mode>2"
/* Generate [-0.0, -0.0, -0.0, -0.0]. */
neg0 = gen_reg_rtx (V4SImode);
emit_insn (gen_altivec_vspltisw (neg0, constm1_rtx));
- emit_insn (gen_ashlv4si3 (neg0, neg0, neg0));
+ emit_insn (gen_vashlv4si3 (neg0, neg0, neg0));
/* Use the multiply-add. */
emit_insn (gen_altivec_vmaddfp (operands[0], operands[1], operands[2],
high_product = gen_reg_rtx (V4SImode);
emit_insn (gen_altivec_vmsumuhm (high_product, one, small_swap, zero));
- emit_insn (gen_ashlv4si3 (high_product, high_product, sixteen));
+ emit_insn (gen_vashlv4si3 (high_product, high_product, sixteen));
emit_insn (gen_addv4si3 (operands[0], high_product, low_product));
"vslo %0,%1,%2"
[(set_attr "type" "vecperm")])
-(define_insn "ashl<mode>3"
+(define_insn "vashl<mode>3"
[(set (match_operand:VI 0 "register_operand" "=v")
(ashift:VI (match_operand:VI 1 "register_operand" "v")
(match_operand:VI 2 "register_operand" "v") ))]
"vsl<VI_char> %0,%1,%2"
[(set_attr "type" "vecsimple")])
-(define_insn "lshr<mode>3"
+(define_insn "vlshr<mode>3"
[(set (match_operand:VI 0 "register_operand" "=v")
(lshiftrt:VI (match_operand:VI 1 "register_operand" "v")
(match_operand:VI 2 "register_operand" "v") ))]
"vsr<VI_char> %0,%1,%2"
[(set_attr "type" "vecsimple")])
-(define_insn "ashr<mode>3"
+(define_insn "vashr<mode>3"
[(set (match_operand:VI 0 "register_operand" "=v")
(ashiftrt:VI (match_operand:VI 1 "register_operand" "v")
(match_operand:VI 2 "register_operand" "v") ))]
/* Generate [-0.0, -0.0, -0.0, -0.0]. */
neg0 = gen_reg_rtx (V4SImode);
emit_insn (gen_altivec_vspltisw (neg0, constm1_rtx));
- emit_insn (gen_ashlv4si3 (neg0, neg0, neg0));
+ emit_insn (gen_vashlv4si3 (neg0, neg0, neg0));
/* XOR */
emit_insn (gen_xorv4sf3 (operands[0],
{ MASK_ALTIVEC, CODE_FOR_altivec_vrlb, "__builtin_altivec_vrlb", ALTIVEC_BUILTIN_VRLB },
{ MASK_ALTIVEC, CODE_FOR_altivec_vrlh, "__builtin_altivec_vrlh", ALTIVEC_BUILTIN_VRLH },
{ MASK_ALTIVEC, CODE_FOR_altivec_vrlw, "__builtin_altivec_vrlw", ALTIVEC_BUILTIN_VRLW },
- { MASK_ALTIVEC, CODE_FOR_ashlv16qi3, "__builtin_altivec_vslb", ALTIVEC_BUILTIN_VSLB },
- { MASK_ALTIVEC, CODE_FOR_ashlv8hi3, "__builtin_altivec_vslh", ALTIVEC_BUILTIN_VSLH },
- { MASK_ALTIVEC, CODE_FOR_ashlv4si3, "__builtin_altivec_vslw", ALTIVEC_BUILTIN_VSLW },
+ { MASK_ALTIVEC, CODE_FOR_vashlv16qi3, "__builtin_altivec_vslb", ALTIVEC_BUILTIN_VSLB },
+ { MASK_ALTIVEC, CODE_FOR_vashlv8hi3, "__builtin_altivec_vslh", ALTIVEC_BUILTIN_VSLH },
+ { MASK_ALTIVEC, CODE_FOR_vashlv4si3, "__builtin_altivec_vslw", ALTIVEC_BUILTIN_VSLW },
{ MASK_ALTIVEC, CODE_FOR_altivec_vsl, "__builtin_altivec_vsl", ALTIVEC_BUILTIN_VSL },
{ MASK_ALTIVEC, CODE_FOR_altivec_vslo, "__builtin_altivec_vslo", ALTIVEC_BUILTIN_VSLO },
{ MASK_ALTIVEC, CODE_FOR_altivec_vspltb, "__builtin_altivec_vspltb", ALTIVEC_BUILTIN_VSPLTB },
{ MASK_ALTIVEC, CODE_FOR_altivec_vsplth, "__builtin_altivec_vsplth", ALTIVEC_BUILTIN_VSPLTH },
{ MASK_ALTIVEC, CODE_FOR_altivec_vspltw, "__builtin_altivec_vspltw", ALTIVEC_BUILTIN_VSPLTW },
- { MASK_ALTIVEC, CODE_FOR_lshrv16qi3, "__builtin_altivec_vsrb", ALTIVEC_BUILTIN_VSRB },
- { MASK_ALTIVEC, CODE_FOR_lshrv8hi3, "__builtin_altivec_vsrh", ALTIVEC_BUILTIN_VSRH },
- { MASK_ALTIVEC, CODE_FOR_lshrv4si3, "__builtin_altivec_vsrw", ALTIVEC_BUILTIN_VSRW },
- { MASK_ALTIVEC, CODE_FOR_ashrv16qi3, "__builtin_altivec_vsrab", ALTIVEC_BUILTIN_VSRAB },
- { MASK_ALTIVEC, CODE_FOR_ashrv8hi3, "__builtin_altivec_vsrah", ALTIVEC_BUILTIN_VSRAH },
- { MASK_ALTIVEC, CODE_FOR_ashrv4si3, "__builtin_altivec_vsraw", ALTIVEC_BUILTIN_VSRAW },
+ { MASK_ALTIVEC, CODE_FOR_vlshrv16qi3, "__builtin_altivec_vsrb", ALTIVEC_BUILTIN_VSRB },
+ { MASK_ALTIVEC, CODE_FOR_vlshrv8hi3, "__builtin_altivec_vsrh", ALTIVEC_BUILTIN_VSRH },
+ { MASK_ALTIVEC, CODE_FOR_vlshrv4si3, "__builtin_altivec_vsrw", ALTIVEC_BUILTIN_VSRW },
+ { MASK_ALTIVEC, CODE_FOR_vashrv16qi3, "__builtin_altivec_vsrab", ALTIVEC_BUILTIN_VSRAB },
+ { MASK_ALTIVEC, CODE_FOR_vashrv8hi3, "__builtin_altivec_vsrah", ALTIVEC_BUILTIN_VSRAH },
+ { MASK_ALTIVEC, CODE_FOR_vashrv4si3, "__builtin_altivec_vsraw", ALTIVEC_BUILTIN_VSRAW },
{ MASK_ALTIVEC, CODE_FOR_altivec_vsr, "__builtin_altivec_vsr", ALTIVEC_BUILTIN_VSR },
{ MASK_ALTIVEC, CODE_FOR_altivec_vsro, "__builtin_altivec_vsro", ALTIVEC_BUILTIN_VSRO },
{ MASK_ALTIVEC, CODE_FOR_subv16qi3, "__builtin_altivec_vsububm", ALTIVEC_BUILTIN_VSUBUBM },
DEF_BUILTIN (SI_EQV, CODE_FOR_eqv_v16qi, "si_eqv", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD))
DEF_BUILTIN (SI_SELB, CODE_FOR_selb, "si_selb", B_INSN, _A4(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD))
DEF_BUILTIN (SI_SHUFB, CODE_FOR_shufb, "si_shufb", B_INSN, _A4(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD))
-DEF_BUILTIN (SI_SHLH, CODE_FOR_ashlv8hi3, "si_shlh", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD))
-DEF_BUILTIN (SI_SHLHI, CODE_FOR_ashlv8hi3, "si_shlhi", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_7))
-DEF_BUILTIN (SI_SHL, CODE_FOR_ashlv4si3, "si_shl", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD))
-DEF_BUILTIN (SI_SHLI, CODE_FOR_ashlv4si3, "si_shli", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_7))
+DEF_BUILTIN (SI_SHLH, CODE_FOR_vashlv8hi3, "si_shlh", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD))
+DEF_BUILTIN (SI_SHLHI, CODE_FOR_vashlv8hi3, "si_shlhi", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_7))
+DEF_BUILTIN (SI_SHL, CODE_FOR_vashlv4si3, "si_shl", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD))
+DEF_BUILTIN (SI_SHLI, CODE_FOR_vashlv4si3, "si_shli", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_7))
DEF_BUILTIN (SI_SHLQBI, CODE_FOR_shlqbi_ti, "si_shlqbi", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD))
DEF_BUILTIN (SI_SHLQBII, CODE_FOR_shlqbi_ti, "si_shlqbii", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_7))
DEF_BUILTIN (SI_SHLQBY, CODE_FOR_shlqby_ti, "si_shlqby", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD))
DEF_BUILTIN (SI_SHLQBYI, CODE_FOR_shlqby_ti, "si_shlqbyi", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_7))
DEF_BUILTIN (SI_SHLQBYBI, CODE_FOR_shlqbybi_ti, "si_shlqbybi", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD))
-DEF_BUILTIN (SI_ROTH, CODE_FOR_rotlv8hi3, "si_roth", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD))
-DEF_BUILTIN (SI_ROTHI, CODE_FOR_rotlv8hi3, "si_rothi", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_7))
-DEF_BUILTIN (SI_ROT, CODE_FOR_rotlv4si3, "si_rot", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD))
-DEF_BUILTIN (SI_ROTI, CODE_FOR_rotlv4si3, "si_roti", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_7))
+DEF_BUILTIN (SI_ROTH, CODE_FOR_vrotlv8hi3, "si_roth", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD))
+DEF_BUILTIN (SI_ROTHI, CODE_FOR_vrotlv8hi3, "si_rothi", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_7))
+DEF_BUILTIN (SI_ROT, CODE_FOR_vrotlv4si3, "si_rot", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD))
+DEF_BUILTIN (SI_ROTI, CODE_FOR_vrotlv4si3, "si_roti", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_7))
DEF_BUILTIN (SI_ROTQBY, CODE_FOR_rotqby_ti, "si_rotqby", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD))
DEF_BUILTIN (SI_ROTQBYI, CODE_FOR_rotqby_ti, "si_rotqbyi", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_7))
DEF_BUILTIN (SI_ROTQBYBI, CODE_FOR_rotqbybi_ti, "si_rotqbybi", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD))
DEF_BUILTIN (SPU_XOR_14, CODE_FOR_xorv4si3, "spu_xor_14", B_INTERNAL, _A3(SPU_BTI_UV4SI, SPU_BTI_UV4SI, SPU_BTI_UINTSI))
DEF_BUILTIN (SPU_XOR_15, CODE_FOR_xorv4si3, "spu_xor_15", B_INTERNAL, _A3(SPU_BTI_V4SI, SPU_BTI_V4SI, SPU_BTI_INTSI))
DEF_BUILTIN (SPU_RL, CODE_FOR_nothing, "spu_rl", B_OVERLOAD, _A1(SPU_BTI_VOID))
-DEF_BUILTIN (SPU_RL_0, CODE_FOR_rotlv8hi3, "spu_rl_0", B_INTERNAL, _A3(SPU_BTI_UV8HI, SPU_BTI_UV8HI, SPU_BTI_V8HI))
-DEF_BUILTIN (SPU_RL_1, CODE_FOR_rotlv8hi3, "spu_rl_1", B_INTERNAL, _A3(SPU_BTI_V8HI, SPU_BTI_V8HI, SPU_BTI_V8HI))
-DEF_BUILTIN (SPU_RL_2, CODE_FOR_rotlv4si3, "spu_rl_2", B_INTERNAL, _A3(SPU_BTI_UV4SI, SPU_BTI_UV4SI, SPU_BTI_V4SI))
-DEF_BUILTIN (SPU_RL_3, CODE_FOR_rotlv4si3, "spu_rl_3", B_INTERNAL, _A3(SPU_BTI_V4SI, SPU_BTI_V4SI, SPU_BTI_V4SI))
-DEF_BUILTIN (SPU_RL_4, CODE_FOR_rotlv8hi3, "spu_rl_4", B_INTERNAL, _A3(SPU_BTI_UV8HI, SPU_BTI_UV8HI, SPU_BTI_INTHI))
-DEF_BUILTIN (SPU_RL_5, CODE_FOR_rotlv8hi3, "spu_rl_5", B_INTERNAL, _A3(SPU_BTI_V8HI, SPU_BTI_V8HI, SPU_BTI_INTHI))
-DEF_BUILTIN (SPU_RL_6, CODE_FOR_rotlv4si3, "spu_rl_6", B_INTERNAL, _A3(SPU_BTI_UV4SI, SPU_BTI_UV4SI, SPU_BTI_INTSI))
-DEF_BUILTIN (SPU_RL_7, CODE_FOR_rotlv4si3, "spu_rl_7", B_INTERNAL, _A3(SPU_BTI_V4SI, SPU_BTI_V4SI, SPU_BTI_INTSI))
+DEF_BUILTIN (SPU_RL_0, CODE_FOR_vrotlv8hi3, "spu_rl_0", B_INTERNAL, _A3(SPU_BTI_UV8HI, SPU_BTI_UV8HI, SPU_BTI_V8HI))
+DEF_BUILTIN (SPU_RL_1, CODE_FOR_vrotlv8hi3, "spu_rl_1", B_INTERNAL, _A3(SPU_BTI_V8HI, SPU_BTI_V8HI, SPU_BTI_V8HI))
+DEF_BUILTIN (SPU_RL_2, CODE_FOR_vrotlv4si3, "spu_rl_2", B_INTERNAL, _A3(SPU_BTI_UV4SI, SPU_BTI_UV4SI, SPU_BTI_V4SI))
+DEF_BUILTIN (SPU_RL_3, CODE_FOR_vrotlv4si3, "spu_rl_3", B_INTERNAL, _A3(SPU_BTI_V4SI, SPU_BTI_V4SI, SPU_BTI_V4SI))
+DEF_BUILTIN (SPU_RL_4, CODE_FOR_vrotlv8hi3, "spu_rl_4", B_INTERNAL, _A3(SPU_BTI_UV8HI, SPU_BTI_UV8HI, SPU_BTI_INTHI))
+DEF_BUILTIN (SPU_RL_5, CODE_FOR_vrotlv8hi3, "spu_rl_5", B_INTERNAL, _A3(SPU_BTI_V8HI, SPU_BTI_V8HI, SPU_BTI_INTHI))
+DEF_BUILTIN (SPU_RL_6, CODE_FOR_vrotlv4si3, "spu_rl_6", B_INTERNAL, _A3(SPU_BTI_UV4SI, SPU_BTI_UV4SI, SPU_BTI_INTSI))
+DEF_BUILTIN (SPU_RL_7, CODE_FOR_vrotlv4si3, "spu_rl_7", B_INTERNAL, _A3(SPU_BTI_V4SI, SPU_BTI_V4SI, SPU_BTI_INTSI))
DEF_BUILTIN (SPU_RLQW, CODE_FOR_nothing, "spu_rlqw", B_OVERLOAD, _A1(SPU_BTI_VOID))
DEF_BUILTIN (SPU_RLQW_0, CODE_FOR_rotqbi_ti, "spu_rlqw_0", B_INTERNAL, _A3(SPU_BTI_UV16QI, SPU_BTI_UV16QI, SPU_BTI_INTSI))
DEF_BUILTIN (SPU_RLQW_1, CODE_FOR_rotqbi_ti, "spu_rlqw_1", B_INTERNAL, _A3(SPU_BTI_V16QI, SPU_BTI_V16QI, SPU_BTI_INTSI))
DEF_BUILTIN (SPU_RLMASKQWBYTEBC_8, CODE_FOR_rotqmbybi_ti, "spu_rlmaskqwbytebc_8", B_INTERNAL, _A3(SPU_BTI_V4SF, SPU_BTI_V4SF, SPU_BTI_INTSI))
DEF_BUILTIN (SPU_RLMASKQWBYTEBC_9, CODE_FOR_rotqmbybi_ti, "spu_rlmaskqwbytebc_9", B_INTERNAL, _A3(SPU_BTI_V2DF, SPU_BTI_V2DF, SPU_BTI_INTSI))
DEF_BUILTIN (SPU_SL, CODE_FOR_nothing, "spu_sl", B_OVERLOAD, _A1(SPU_BTI_VOID))
-DEF_BUILTIN (SPU_SL_0, CODE_FOR_ashlv8hi3, "spu_sl_0", B_INTERNAL, _A3(SPU_BTI_UV8HI, SPU_BTI_UV8HI, SPU_BTI_UV8HI))
-DEF_BUILTIN (SPU_SL_1, CODE_FOR_ashlv8hi3, "spu_sl_1", B_INTERNAL, _A3(SPU_BTI_V8HI, SPU_BTI_V8HI, SPU_BTI_UV8HI))
-DEF_BUILTIN (SPU_SL_2, CODE_FOR_ashlv4si3, "spu_sl_2", B_INTERNAL, _A3(SPU_BTI_UV4SI, SPU_BTI_UV4SI, SPU_BTI_UV4SI))
-DEF_BUILTIN (SPU_SL_3, CODE_FOR_ashlv4si3, "spu_sl_3", B_INTERNAL, _A3(SPU_BTI_V4SI, SPU_BTI_V4SI, SPU_BTI_UV4SI))
-DEF_BUILTIN (SPU_SL_4, CODE_FOR_ashlv8hi3, "spu_sl_4", B_INTERNAL, _A3(SPU_BTI_UV8HI, SPU_BTI_UV8HI, SPU_BTI_UINTSI))
-DEF_BUILTIN (SPU_SL_5, CODE_FOR_ashlv8hi3, "spu_sl_5", B_INTERNAL, _A3(SPU_BTI_V8HI, SPU_BTI_V8HI, SPU_BTI_UINTSI))
-DEF_BUILTIN (SPU_SL_6, CODE_FOR_ashlv4si3, "spu_sl_6", B_INTERNAL, _A3(SPU_BTI_UV4SI, SPU_BTI_UV4SI, SPU_BTI_UINTSI))
-DEF_BUILTIN (SPU_SL_7, CODE_FOR_ashlv4si3, "spu_sl_7", B_INTERNAL, _A3(SPU_BTI_V4SI, SPU_BTI_V4SI, SPU_BTI_UINTSI))
+DEF_BUILTIN (SPU_SL_0, CODE_FOR_vashlv8hi3, "spu_sl_0", B_INTERNAL, _A3(SPU_BTI_UV8HI, SPU_BTI_UV8HI, SPU_BTI_UV8HI))
+DEF_BUILTIN (SPU_SL_1, CODE_FOR_vashlv8hi3, "spu_sl_1", B_INTERNAL, _A3(SPU_BTI_V8HI, SPU_BTI_V8HI, SPU_BTI_UV8HI))
+DEF_BUILTIN (SPU_SL_2, CODE_FOR_vashlv4si3, "spu_sl_2", B_INTERNAL, _A3(SPU_BTI_UV4SI, SPU_BTI_UV4SI, SPU_BTI_UV4SI))
+DEF_BUILTIN (SPU_SL_3, CODE_FOR_vashlv4si3, "spu_sl_3", B_INTERNAL, _A3(SPU_BTI_V4SI, SPU_BTI_V4SI, SPU_BTI_UV4SI))
+DEF_BUILTIN (SPU_SL_4, CODE_FOR_vashlv8hi3, "spu_sl_4", B_INTERNAL, _A3(SPU_BTI_UV8HI, SPU_BTI_UV8HI, SPU_BTI_UINTSI))
+DEF_BUILTIN (SPU_SL_5, CODE_FOR_vashlv8hi3, "spu_sl_5", B_INTERNAL, _A3(SPU_BTI_V8HI, SPU_BTI_V8HI, SPU_BTI_UINTSI))
+DEF_BUILTIN (SPU_SL_6, CODE_FOR_vashlv4si3, "spu_sl_6", B_INTERNAL, _A3(SPU_BTI_UV4SI, SPU_BTI_UV4SI, SPU_BTI_UINTSI))
+DEF_BUILTIN (SPU_SL_7, CODE_FOR_vashlv4si3, "spu_sl_7", B_INTERNAL, _A3(SPU_BTI_V4SI, SPU_BTI_V4SI, SPU_BTI_UINTSI))
DEF_BUILTIN (SPU_SLQW, CODE_FOR_nothing, "spu_slqw", B_OVERLOAD, _A1(SPU_BTI_VOID))
DEF_BUILTIN (SPU_SLQW_0, CODE_FOR_shlqbi_ti, "spu_slqw_0", B_INTERNAL, _A3(SPU_BTI_V2DI, SPU_BTI_V2DI, SPU_BTI_UINTSI))
DEF_BUILTIN (SPU_SLQW_1, CODE_FOR_shlqbi_ti, "spu_slqw_1", B_INTERNAL, _A3(SPU_BTI_UV2DI, SPU_BTI_UV2DI, SPU_BTI_UINTSI))
insnc = force_reg (V4SImode, array_to_constant (V4SImode, insna));
emit_insn (gen_shufb (shuf, fnaddr, cxt, shufc));
- emit_insn (gen_rotlv4si3 (rotl, shuf, spu_const (V4SImode, 7)));
+ emit_insn (gen_vrotlv4si3 (rotl, shuf, spu_const (V4SImode, 7)));
emit_insn (gen_movv4si (mask, spu_const (V4SImode, 0xffff << 7)));
emit_insn (gen_selb (insn, insnc, rotl, mask));
V8HI
V4SI])
+(define_mode_attr v [(V8HI "v") (V4SI "v")
+ (HI "") (SI "")])
+
(define_mode_attr bh [(QI "b") (V16QI "b")
(HI "h") (V8HI "h")
(SI "") (V4SI "")])
rtx op6_ti = gen_rtx_REG (TImode, REGNO (ops[6]));
emit_insn (gen_clzv4si2 (ops[3],op1_v4si));
emit_move_insn (ops[6], spu_const (V4SImode, 1023+31));
- emit_insn (gen_ashlv4si3 (ops[4],op1_v4si,ops[3]));
+ emit_insn (gen_vashlv4si3 (ops[4],op1_v4si,ops[3]));
emit_insn (gen_ceq_v4si (ops[5],ops[3],spu_const (V4SImode, 32)));
emit_insn (gen_subv4si3 (ops[6],ops[6],ops[3]));
emit_insn (gen_addv4si3 (ops[4],ops[4],ops[4]));
rtx op4_df = gen_rtx_REG (DFmode, REGNO(ops[4]));
rtx op5_df = gen_rtx_REG (DFmode, REGNO(ops[5]));
emit_insn (gen_clzv4si2 (ops[4],op1_v4si));
- emit_insn (gen_ashlv4si3 (ops[5],op1_v4si,ops[4]));
+ emit_insn (gen_vashlv4si3 (ops[5],op1_v4si,ops[4]));
emit_insn (gen_ceq_v4si (ops[6],ops[4],spu_const (V4SImode, 32)));
emit_insn (gen_subv4si3 (ops[4],ops[3],ops[4]));
emit_insn (gen_addv4si3 (ops[5],ops[5],ops[5]));
emit_move_insn (mask, spu_const (V4SImode, 0x0000ffff));
emit_insn (gen_spu_mpyhh (high, operands[1], operands[2]));
emit_insn (gen_spu_mpy (low, operands[1], operands[2]));
- emit_insn (gen_ashlv4si3 (shift, high, spu_const(V4SImode, 16)));
+ emit_insn (gen_vashlv4si3 (shift, high, spu_const(V4SImode, 16)));
emit_insn (gen_selb (result, shift, low, mask));
DONE;
}")
[(set_attr "type" "fxb")])
\f
-;; ashl
+;; ashl, vashl
-(define_insn "ashl<mode>3"
+(define_insn "<v>ashl<mode>3"
[(set (match_operand:VHSI 0 "spu_reg_operand" "=r,r")
(ashift:VHSI (match_operand:VHSI 1 "spu_reg_operand" "r,r")
(match_operand:VHSI 2 "spu_nonmem_operand" "r,W")))]
[(set_attr "type" "shuf,shuf")])
\f
-;; lshr
+;; lshr, vlshr
-(define_insn_and_split "lshr<mode>3"
+(define_insn_and_split "<v>lshr<mode>3"
[(set (match_operand:VHSI 0 "spu_reg_operand" "=r,r")
(lshiftrt:VHSI (match_operand:VHSI 1 "spu_reg_operand" "r,r")
(match_operand:VHSI 2 "spu_nonmem_operand" "r,W")))
[(set_attr "type" "shuf")])
\f
-;; ashr
+;; ashr, vashr
-(define_insn_and_split "ashr<mode>3"
+(define_insn_and_split "<v>ashr<mode>3"
[(set (match_operand:VHSI 0 "spu_reg_operand" "=r,r")
(ashiftrt:VHSI (match_operand:VHSI 1 "spu_reg_operand" "r,r")
(match_operand:VHSI 2 "spu_nonmem_operand" "r,W")))
emit_insn (gen_lshrti3 (op0, op1, GEN_INT (32)));
emit_insn (gen_spu_xswd (op0d, op0v));
if (val > 32)
- emit_insn (gen_ashrv4si3 (op0v, op0v, spu_const (V4SImode, val - 32)));
+ emit_insn (gen_vashrv4si3 (op0v, op0v, spu_const (V4SImode, val - 32)));
}
else
{
rtx op1_v4si = spu_gen_subreg (V4SImode, operands[1]);
rtx t = gen_reg_rtx (TImode);
emit_insn (gen_subsi3 (sign_shift, GEN_INT (128), force_reg (SImode, operands[2])));
- emit_insn (gen_ashrv4si3 (sign_mask_v4si, op1_v4si, spu_const (V4SImode, 31)));
+ emit_insn (gen_vashrv4si3 (sign_mask_v4si, op1_v4si, spu_const (V4SImode, 31)));
emit_insn (gen_fsm_ti (sign_mask, sign_mask));
emit_insn (gen_ashlti3 (sign_mask, sign_mask, sign_shift));
emit_insn (gen_lshrti3 (t, operands[1], operands[2]));
[(set_attr "type" "shuf")])
\f
-;; rotl
+;; vrotl, rotl
-(define_insn "rotl<mode>3"
+(define_insn "<v>rotl<mode>3"
[(set (match_operand:VHSI 0 "spu_reg_operand" "=r,r")
(rotate:VHSI (match_operand:VHSI 1 "spu_reg_operand" "r,r")
(match_operand:VHSI 2 "spu_nonmem_operand" "r,W")))]
emit_insn (gen_iorv4si3 (a_nan, a_nan, b_nan));
}
emit_move_insn (zero, CONST0_RTX (V4SImode));
- emit_insn (gen_ashrv4si3 (asel, ra, spu_const (V4SImode, 31)));
+ emit_insn (gen_vashrv4si3 (asel, ra, spu_const (V4SImode, 31)));
emit_insn (gen_shufb (asel, asel, asel, hi_promote));
emit_insn (gen_bg_v4si (abor, zero, a_abs));
emit_insn (gen_shufb (abor, abor, abor, borrow_shuffle));
emit_insn (gen_sfx_v4si (abor, zero, a_abs, abor));
emit_insn (gen_selb (abor, a_abs, abor, asel));
- emit_insn (gen_ashrv4si3 (bsel, rb, spu_const (V4SImode, 31)));
+ emit_insn (gen_vashrv4si3 (bsel, rb, spu_const (V4SImode, 31)));
emit_insn (gen_shufb (bsel, bsel, bsel, hi_promote));
emit_insn (gen_bg_v4si (bbor, zero, b_abs));
emit_insn (gen_shufb (bbor, bbor, bbor, borrow_shuffle));
emit_insn (gen_shufb (b_nan, b_nan, b_nan, hi_promote));
emit_insn (gen_iorv4si3 (a_nan, a_nan, b_nan));
emit_move_insn (zero, CONST0_RTX (V4SImode));
- emit_insn (gen_ashrv4si3 (asel, ra, spu_const (V4SImode, 31)));
+ emit_insn (gen_vashrv4si3 (asel, ra, spu_const (V4SImode, 31)));
emit_insn (gen_shufb (asel, asel, asel, hi_promote));
emit_insn (gen_bg_v4si (abor, zero, a_abs));
emit_insn (gen_shufb (abor, abor, abor, borrow_shuffle));
emit_insn (gen_sfx_v4si (abor, zero, a_abs, abor));
emit_insn (gen_selb (abor, a_abs, abor, asel));
- emit_insn (gen_ashrv4si3 (bsel, rb, spu_const (V4SImode, 31)));
+ emit_insn (gen_vashrv4si3 (bsel, rb, spu_const (V4SImode, 31)));
emit_insn (gen_shufb (bsel, bsel, bsel, hi_promote));
emit_insn (gen_bg_v4si (bbor, zero, b_abs));
emit_insn (gen_shufb (bbor, bbor, bbor, borrow_shuffle));
0x08090A0B, 0x08090A0B);
emit_move_insn (hi_promote, pat);
- emit_insn (gen_ashrv4si3 (sign, ra, spu_const (V4SImode, 31)));
+ emit_insn (gen_vashrv4si3 (sign, ra, spu_const (V4SImode, 31)));
emit_insn (gen_shufb (sign, sign, sign, hi_promote));
emit_insn (gen_andv4si3 (abs, ra, sign_mask));
instruction pattern, and the compiler will convert the operand to that
mode before generating the instruction. The meaning of out-of-range shift
counts can optionally be specified by @code{TARGET_SHIFT_TRUNCATION_MASK}.
-@xref{TARGET_SHIFT_TRUNCATION_MASK}.
+@xref{TARGET_SHIFT_TRUNCATION_MASK}. Operand 2 is always a scalar type.
@cindex @code{ashr@var{m}3} instruction pattern
@cindex @code{lshr@var{m}3} instruction pattern
@cindex @code{rotr@var{m}3} instruction pattern
@item @samp{ashr@var{m}3}, @samp{lshr@var{m}3}, @samp{rotl@var{m}3}, @samp{rotr@var{m}3}
Other shift and rotate instructions, analogous to the
-@code{ashl@var{m}3} instructions.
+@code{ashl@var{m}3} instructions. Operand 2 is always a scalar type.
+
+@cindex @code{vashl@var{m}3} instruction pattern
+@cindex @code{vashr@var{m}3} instruction pattern
+@cindex @code{vlshr@var{m}3} instruction pattern
+@cindex @code{vrotl@var{m}3} instruction pattern
+@cindex @code{vrotr@var{m}3} instruction pattern
+@item @samp{vashl@var{m}3}, @samp{vashr@var{m}3}, @samp{vlshr@var{m}3}, @samp{vrotl@var{m}3}, @samp{vrotr@var{m}3}
+Vector shift and rotate instructions that take vectors as operand 2
+instead of a scalar type.
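
The scalar-count patterns remain what a loop with a loop-invariant shift
count uses; the new v* names are what the vectorizer needs when the count
varies per element, for example (C sketch of the kind of loop involved, not
taken from the patch):

    void
    shift_each (int *a, const int *b, const int *c, int n)
    {
      int i;
      for (i = 0; i < n; i++)
        a[i] = b[i] << c[i];    /* per-element counts map to vashl<m>3 */
    }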
@cindex @code{neg@var{m}2} instruction pattern
@cindex @code{ssneg@var{m}2} instruction pattern
rtx op1, temp = 0;
int left = (code == LSHIFT_EXPR || code == LROTATE_EXPR);
int rotate = (code == LROTATE_EXPR || code == RROTATE_EXPR);
+ optab lshift_optab = ashl_optab;
+ optab rshift_arith_optab = ashr_optab;
+ optab rshift_uns_optab = lshr_optab;
+ optab lrotate_optab = rotl_optab;
+ optab rrotate_optab = rotr_optab;
+ enum machine_mode op1_mode;
int try;
+ op1 = expand_normal (amount);
+ op1_mode = GET_MODE (op1);
+
+ /* Determine whether the shift/rotate amount is a vector, or scalar. If the
+ shift amount is a vector, use the vector/vector shift patterns. */
+ if (VECTOR_MODE_P (mode) && VECTOR_MODE_P (op1_mode))
+ {
+ lshift_optab = vashl_optab;
+ rshift_arith_optab = vashr_optab;
+ rshift_uns_optab = vlshr_optab;
+ lrotate_optab = vrotl_optab;
+ rrotate_optab = vrotr_optab;
+ }
+
/* Previously detected shift-counts computed by NEGATE_EXPR
and shifted in the other direction; but that does not work
on all machines. */
- op1 = expand_normal (amount);
-
if (SHIFT_COUNT_TRUNCATED)
{
if (GET_CODE (op1) == CONST_INT
}
temp = expand_binop (mode,
- left ? rotl_optab : rotr_optab,
+ left ? lrotate_optab : rrotate_optab,
shifted, op1, target, unsignedp, methods);
}
else if (unsignedp)
temp = expand_binop (mode,
- left ? ashl_optab : lshr_optab,
+ left ? lshift_optab : rshift_uns_optab,
shifted, op1, target, unsignedp, methods);
/* Do arithmetic shifts.
/* Arithmetic shift */
temp = expand_binop (mode,
- left ? ashl_optab : ashr_optab,
+ left ? lshift_optab : rshift_arith_optab,
shifted, op1, target, unsignedp, methods1);
}
if (modifier == EXPAND_STACK_PARM)
target = 0;
temp = expand_unop (mode,
- optab_for_tree_code (NEGATE_EXPR, type),
+ optab_for_tree_code (NEGATE_EXPR, type,
+ optab_default),
op0, target, 0);
gcc_assert (temp);
return REDUCE_BIT_FIELD (temp);
/* First try to do it with a special MIN or MAX instruction.
If that does not win, use a conditional jump to select the proper
value. */
- this_optab = optab_for_tree_code (code, type);
+ this_optab = optab_for_tree_code (code, type, optab_default);
temp = expand_binop (mode, this_optab, op0, op1, target, unsignedp,
OPTAB_WIDEN);
if (temp != 0)
case LROTATE_EXPR:
case RROTATE_EXPR:
- /* The expansion code only handles expansion of mode precision
- rotates. */
- gcc_assert (GET_MODE_PRECISION (TYPE_MODE (type))
- == TYPE_PRECISION (type));
+ gcc_assert (VECTOR_MODE_P (TYPE_MODE (type))
+ || (GET_MODE_PRECISION (TYPE_MODE (type))
+ == TYPE_PRECISION (type)));
+ /* fall through */
- /* Falltrough. */
case LSHIFT_EXPR:
case RSHIFT_EXPR:
/* If this is a fixed-point operation, then we cannot use the code
tree oprnd2 = TREE_OPERAND (exp, 2);
rtx op2;
- this_optab = optab_for_tree_code (code, type);
+ this_optab = optab_for_tree_code (code, type, optab_default);
expand_operands (oprnd0, oprnd1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
op2 = expand_normal (oprnd2);
temp = expand_ternary_op (mode, this_optab, op0, op1, op2,
case REDUC_PLUS_EXPR:
{
op0 = expand_normal (TREE_OPERAND (exp, 0));
- this_optab = optab_for_tree_code (code, type);
+ this_optab = optab_for_tree_code (code, type, optab_default);
temp = expand_unop (mode, this_optab, op0, target, unsignedp);
gcc_assert (temp);
return temp;
{
expand_operands (TREE_OPERAND (exp, 0), TREE_OPERAND (exp, 1),
NULL_RTX, &op0, &op1, 0);
- this_optab = optab_for_tree_code (code, type);
+ this_optab = optab_for_tree_code (code, type, optab_default);
temp = expand_binop (mode, this_optab, op0, op1, target, unsignedp,
OPTAB_WIDEN);
gcc_assert (temp);
{
expand_operands (TREE_OPERAND (exp, 0), TREE_OPERAND (exp, 1),
NULL_RTX, &op0, &op1, 0);
- this_optab = optab_for_tree_code (code, type);
+ this_optab = optab_for_tree_code (code, type, optab_default);
temp = expand_binop (mode, this_optab, op0, op1, target, unsignedp,
OPTAB_WIDEN);
gcc_assert (temp);
case VEC_UNPACK_LO_EXPR:
{
op0 = expand_normal (TREE_OPERAND (exp, 0));
- this_optab = optab_for_tree_code (code, type);
+ this_optab = optab_for_tree_code (code, type, optab_default);
temp = expand_widen_pattern_expr (exp, op0, NULL_RTX, NULL_RTX,
target, unsignedp);
gcc_assert (temp);
op0 = expand_normal (TREE_OPERAND (exp, 0));
/* The signedness is determined from input operand. */
this_optab = optab_for_tree_code (code,
- TREE_TYPE (TREE_OPERAND (exp, 0)));
+ TREE_TYPE (TREE_OPERAND (exp, 0)),
+ optab_default);
temp = expand_widen_pattern_expr
(exp, op0, NULL_RTX, NULL_RTX,
target, TYPE_UNSIGNED (TREE_TYPE (TREE_OPERAND (exp, 0))));
expand_operands (TREE_OPERAND (exp, 0), TREE_OPERAND (exp, 1),
subtarget, &op0, &op1, 0);
binop2:
- this_optab = optab_for_tree_code (code, type);
+ this_optab = optab_for_tree_code (code, type, optab_default);
binop3:
if (modifier == EXPAND_STACK_PARM)
target = 0;
"optab_handler (lshr_optab, $A)->insn_code = CODE_FOR_$(lshr$a3$)",
"optab_handler (rotl_optab, $A)->insn_code = CODE_FOR_$(rotl$a3$)",
"optab_handler (rotr_optab, $A)->insn_code = CODE_FOR_$(rotr$a3$)",
+ "optab_handler (vashr_optab, $A)->insn_code = CODE_FOR_$(vashr$a3$)",
+ "optab_handler (vlshr_optab, $A)->insn_code = CODE_FOR_$(vlshr$a3$)",
+ "optab_handler (vashl_optab, $A)->insn_code = CODE_FOR_$(vashl$a3$)",
+ "optab_handler (vrotl_optab, $A)->insn_code = CODE_FOR_$(vrotl$a3$)",
+ "optab_handler (vrotr_optab, $A)->insn_code = CODE_FOR_$(vrotr$a3$)",
"optab_handler (smin_optab, $A)->insn_code = CODE_FOR_$(smin$a3$)",
"optab_handler (smax_optab, $A)->insn_code = CODE_FOR_$(smax$a3$)",
"optab_handler (umin_optab, $A)->insn_code = CODE_FOR_$(umin$I$a3$)",
return result;
}
\f
-/* Return the optab used for computing the operation given by
- the tree code, CODE. This function is not always usable (for
- example, it cannot give complete results for multiplication
- or division) but probably ought to be relied on more widely
- throughout the expander. */
+/* Return the optab used for computing the operation given by the tree code,
+ CODE, and the type, TYPE.  SUBTYPE selects between the vector/scalar and
+ vector/vector forms of vector shifts and rotates.  This function is not
+ always usable (for example, it cannot give complete results for
+ multiplication or division) but probably ought to be relied on more widely
+ throughout the expander. */
optab
-optab_for_tree_code (enum tree_code code, const_tree type)
+optab_for_tree_code (enum tree_code code, const_tree type,
+ enum optab_subtype subtype)
{
bool trapv;
switch (code)
return TYPE_UNSIGNED (type) ? udiv_optab : sdiv_optab;
case LSHIFT_EXPR:
+ if (VECTOR_MODE_P (TYPE_MODE (type)))
+ {
+ if (subtype == optab_vector)
+ return TYPE_SATURATING (type) ? NULL : vashl_optab;
+
+ gcc_assert (subtype == optab_scalar);
+ }
if (TYPE_SATURATING(type))
return TYPE_UNSIGNED(type) ? usashl_optab : ssashl_optab;
return ashl_optab;
case RSHIFT_EXPR:
+ if (VECTOR_MODE_P (TYPE_MODE (type)))
+ {
+ if (subtype == optab_vector)
+ return TYPE_UNSIGNED (type) ? vlshr_optab : vashr_optab;
+
+ gcc_assert (subtype == optab_scalar);
+ }
return TYPE_UNSIGNED (type) ? lshr_optab : ashr_optab;
case LROTATE_EXPR:
+ if (VECTOR_MODE_P (TYPE_MODE (type)))
+ {
+ if (subtype == optab_vector)
+ return vrotl_optab;
+
+ gcc_assert (subtype == optab_scalar);
+ }
return rotl_optab;
case RROTATE_EXPR:
+ if (VECTOR_MODE_P (TYPE_MODE (type)))
+ {
+ if (subtype == optab_vector)
+ return vrotr_optab;
+
+ gcc_assert (subtype == optab_scalar);
+ }
return rotr_optab;
case MAX_EXPR:
oprnd0 = TREE_OPERAND (exp, 0);
tmode0 = TYPE_MODE (TREE_TYPE (oprnd0));
widen_pattern_optab =
- optab_for_tree_code (TREE_CODE (exp), TREE_TYPE (oprnd0));
+ optab_for_tree_code (TREE_CODE (exp), TREE_TYPE (oprnd0), optab_default);
icode = (int) optab_handler (widen_pattern_optab, tmode0)->insn_code;
gcc_assert (icode != CODE_FOR_nothing);
xmode0 = insn_data[icode].operand[1].mode;
OTI_rotl,
/* Rotate right */
OTI_rotr,
+
+ /* Arithmetic shift left of vector by vector */
+ OTI_vashl,
+ /* Logical shift right of vector by vector */
+ OTI_vlshr,
+ /* Arithmetic shift right of vector by vector */
+ OTI_vashr,
+ /* Rotate left of vector by vector */
+ OTI_vrotl,
+ /* Rotate right of vector by vector */
+ OTI_vrotr,
+
/* Signed and floating-point minimum value */
OTI_smin,
/* Signed and floating-point maximum value */
#define ashr_optab (&optab_table[OTI_ashr])
#define rotl_optab (&optab_table[OTI_rotl])
#define rotr_optab (&optab_table[OTI_rotr])
+#define vashl_optab (&optab_table[OTI_vashl])
+#define vlshr_optab (&optab_table[OTI_vlshr])
+#define vashr_optab (&optab_table[OTI_vashr])
+#define vrotl_optab (&optab_table[OTI_vrotl])
+#define vrotr_optab (&optab_table[OTI_vrotr])
#define smin_optab (&optab_table[OTI_smin])
#define smax_optab (&optab_table[OTI_smax])
#define umin_optab (&optab_table[OTI_umin])
extern void emit_cmp_insn (rtx, rtx, enum rtx_code, rtx, enum machine_mode,
int);
+/* An extra flag to control optab_for_tree_code's behavior.  This is needed to
+ distinguish between machines whose vector shift patterns take a scalar shift
+ amount and machines whose patterns take a vector of shift amounts. */
+enum optab_subtype
+{
+ optab_default,
+ optab_scalar,
+ optab_vector
+};
+
+/* Return the optab used for computing the given operation on the type given by
+ the second argument.  The third argument distinguishes between the
+ vector/scalar and vector/vector forms of vector shifts and rotates. */
+extern optab optab_for_tree_code (enum tree_code, const_tree, enum optab_subtype);
+
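A minimal usage sketch, mirroring the callers changed elsewhere in this patch: try the vector/scalar form first and fall back to the vector/vector form when the target does not provide it (vectype stands for the statement's vector type; this fragment is illustrative only):

    /* Sketch only: prefer a vector-shifted-by-scalar pattern, then fall
       back to a vector-shifted-by-vector pattern if the target lacks one.  */
    optab op = optab_for_tree_code (RSHIFT_EXPR, vectype, optab_scalar);
    if (!op
        || optab_handler (op, TYPE_MODE (vectype))->insn_code == CODE_FOR_nothing)
      op = optab_for_tree_code (RSHIFT_EXPR, vectype, optab_vector);
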
/* The various uses that a comparison can have; used by can_compare_p:
jumps, conditional moves, store flag operations. */
enum can_compare_purpose
ccp_store_flag
};
-/* Return the optab used for computing the given operation on the type
- given by the second argument. */
-extern optab optab_for_tree_code (enum tree_code, const_tree);
-
/* Nonzero if a compare of mode MODE can be done straightforwardly
(without splitting it into pieces). */
extern int can_compare_p (enum rtx_code, enum machine_mode,
+2008-05-14 Michael Meissner <michael.meissner@amd.com>
+ Dwarakanath Rajagopal <dwarak.rajagopal@amd.com>
+
+ * gcc.target/i386/sse5-imul32widen-vector.c: New file to test x86
+ SSE5 optimizations.
+ * gcc.target/i386/sse5-imul64-vector.c: Ditto.
+ * gcc.target/i386/sse5-rotate1-vector.c: Ditto.
+ * gcc.target/i386/sse5-rotate2-vector.c: Ditto.
+ * gcc.target/i386/sse5-rotate3-vector.c: Ditto.
+ * gcc.target/i386/sse5-shift1-vector.c: Ditto.
+ * gcc.target/i386/sse5-shift2-vector.c: Ditto.
+ * gcc.target/i386/sse5-shift3-vector.c: Ditto.
+
2008-05-14 Michael Meissner <michael.meissner@amd.com>
PR target/36224
--- /dev/null
+/* Test that the compiler properly vectorizes the widening 32-bit to 64-bit
+ integer multiply into pmacsdql/pmacsdqh on SSE5 systems. */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2 -msse5 -ftree-vectorize" } */
+
+extern void exit (int);
+
+typedef long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
+
+#define SIZE 10240
+
+union {
+ __m128i i_align;
+ int i32[SIZE];
+ long i64[SIZE];
+} a, b, c, d;
+
+void
+imul32_to_64 (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ a.i64[i] = ((long)b.i32[i]) * ((long)c.i32[i]);
+}
+
+int main ()
+{
+ imul32_to_64 ();
+ exit (0);
+}
+
+/* { dg-final { scan-assembler "pmacsdql" } } */
+/* { dg-final { scan-assembler "pmacsdqh" } } */
--- /dev/null
+/* Test that the compiler properly vectorizes the 64-bit integer multiply
+ into pmacsdd/etc. on SSE5 systems. */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2 -msse5 -ftree-vectorize" } */
+
+extern void exit (int);
+
+typedef long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
+
+#define SIZE 10240
+
+union {
+ __m128i i_align;
+ long i64[SIZE];
+} a, b, c, d;
+
+void
+imul64 (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ a.i64[i] = b.i64[i] * c.i64[i];
+}
+
+int main ()
+{
+ imul64 ();
+ exit (0);
+}
+
+/* { dg-final { scan-assembler "pmacsdd" } } */
+/* { dg-final { scan-assembler "phadddq" } } */
+/* { dg-final { scan-assembler "pmacsdql" } } */
--- /dev/null
+/* Test that the compiler properly vectorizes rotate operations into prot on
+ SSE5 systems. */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2 -msse5 -ftree-vectorize" } */
+
+extern void exit (int);
+
+typedef long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
+
+#define SIZE 10240
+
+union {
+ __m128i i_align;
+ unsigned u32[SIZE];
+} a, b, c;
+
+void
+left_rotate32 (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ a.u32[i] = (b.u32[i] << ((sizeof (int) * 8) - 4)) | (b.u32[i] >> 4);
+}
+
+int
+main ()
+{
+ left_rotate32 ();
+ exit (0);
+}
+
+/* { dg-final { scan-assembler "protd" } } */
--- /dev/null
+/* Test that the compiler properly vectorizes rotate operations into prot on
+ SSE5 systems. */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2 -msse5 -ftree-vectorize" } */
+
+extern void exit (int);
+
+typedef long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
+
+#define SIZE 10240
+
+union {
+ __m128i i_align;
+ unsigned u32[SIZE];
+} a, b, c;
+
+void
+right_rotate32_b (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ a.u32[i] = (b.u32[i] >> ((sizeof (int) * 8) - 4)) | (b.u32[i] << 4);
+}
+
+int
+main ()
+{
+ right_rotate32_b ();
+ exit (0);
+}
+
+/* { dg-final { scan-assembler "prot" } } */
--- /dev/null
+/* Test that the compiler properly vectorizes rotate operations into prot on
+ SSE5 systems. */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2 -msse5 -ftree-vectorize" } */
+
+extern void exit (int);
+
+typedef long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
+
+#define SIZE 10240
+
+union {
+ __m128i i_align;
+ unsigned u32[SIZE];
+} a, b, c;
+
+void
+vector_rotate32 (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ a.u32[i] = (b.u32[i] >> ((sizeof (int) * 8) - c.u32[i])) | (b.u32[i] << c.u32[i]);
+}
+
+int main ()
+{
+ vector_rotate32 ();
+ exit (0);
+}
+
+/* { dg-final { scan-assembler "protd" } } */
--- /dev/null
+/* Test that the compiler properly vectorizes shift operations into psha/pshl
+ on SSE5 systems. */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2 -msse5 -ftree-vectorize" } */
+
+extern void exit (int);
+
+typedef long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
+
+#define SIZE 10240
+
+union {
+ __m128i i_align;
+ int i32[SIZE];
+ unsigned u32[SIZE];
+} a, b, c;
+
+void
+left_shift32 (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ a.i32[i] = b.i32[i] << c.i32[i];
+}
+
+int main ()
+{
+ left_shift32 ();
+ exit (0);
+}
+
+/* { dg-final { scan-assembler "pshad" } } */
--- /dev/null
+/* Test that the compiler properly vectorizes shift operations into psha/pshl
+ on SSE5 systems. */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2 -msse5 -ftree-vectorize" } */
+
+extern void exit (int);
+
+typedef long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
+
+#define SIZE 10240
+
+union {
+ __m128i i_align;
+ int i32[SIZE];
+ unsigned u32[SIZE];
+} a, b, c;
+
+void
+right_sign_shift32 (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ a.i32[i] = b.i32[i] >> c.i32[i];
+}
+
+int main ()
+{
+ right_sign_shift32 ();
+ exit (0);
+}
+
+/* { dg-final { scan-assembler "pshad" } } */
--- /dev/null
+/* Test that the compiler properly vectorizes shift operations into psha/pshl
+ on SSE5 systems. */
+
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2 -msse5 -ftree-vectorize" } */
+
+extern void exit (int);
+
+typedef long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
+
+#define SIZE 10240
+
+union {
+ __m128i i_align;
+ int i32[SIZE];
+ unsigned u32[SIZE];
+} a, b, c;
+
+void
+right_uns_shift32 (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ a.u32[i] = b.u32[i] >> c.i32[i];
+}
+
+int main ()
+{
+ right_uns_shift32 ();
+ exit (0);
+}
+
+/* { dg-final { scan-assembler "pshld" } } */
/* Shift arguments should be equal in all the packed stmts for a
vector shift with scalar shift operand. */
- if (TREE_CODE (rhs) == LSHIFT_EXPR || TREE_CODE (rhs) == RSHIFT_EXPR)
+ if (TREE_CODE (rhs) == LSHIFT_EXPR || TREE_CODE (rhs) == RSHIFT_EXPR
+ || TREE_CODE (rhs) == LROTATE_EXPR
+ || TREE_CODE (rhs) == RROTATE_EXPR)
{
vec_mode = TYPE_MODE (vectype);
- optab = optab_for_tree_code (TREE_CODE (rhs), vectype);
- if (!optab)
- {
- if (vect_print_dump_info (REPORT_SLP))
- fprintf (vect_dump, "Build SLP failed: no optab.");
- return false;
- }
- icode = (int) optab->handlers[(int) vec_mode].insn_code;
- if (icode == CODE_FOR_nothing)
- {
- if (vect_print_dump_info (REPORT_SLP))
- fprintf (vect_dump,
- "Build SLP failed: op not supported by target.");
- return false;
- }
- optab_op2_mode = insn_data[icode].operand[2].mode;
- if (!VECTOR_MODE_P (optab_op2_mode))
+
+ /* First see if we have a vector/vector shift. */
+ optab = optab_for_tree_code (TREE_CODE (rhs), vectype,
+ optab_vector);
+
+ if (!optab
+ || (optab->handlers[(int) vec_mode].insn_code
+ == CODE_FOR_nothing))
{
- need_same_oprnds = true;
- first_op1 = TREE_OPERAND (rhs, 1);
+ /* No vector/vector shift, try for a vector/scalar shift. */
+ optab = optab_for_tree_code (TREE_CODE (rhs), vectype,
+ optab_scalar);
+
+ if (!optab)
+ {
+ if (vect_print_dump_info (REPORT_SLP))
+ fprintf (vect_dump, "Build SLP failed: no optab.");
+ return false;
+ }
+ icode = (int) optab->handlers[(int) vec_mode].insn_code;
+ if (icode == CODE_FOR_nothing)
+ {
+ if (vect_print_dump_info (REPORT_SLP))
+ fprintf (vect_dump,
+ "Build SLP failed: op not supported by target.");
+ return false;
+ }
+ optab_op2_mode = insn_data[icode].operand[2].mode;
+ if (!VECTOR_MODE_P (optab_op2_mode))
+ {
+ need_same_oprnds = true;
+ first_op1 = TREE_OPERAND (rhs, 1);
+ }
}
}
}
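
For illustration (not part of the patch), the SLP path above covers straight-line groups such as the following, where a vector/scalar shift pattern requires all packed statements to use the same shift amount:

    void
    slp_shift_sketch (int *a, const int *b)
    {
      /* Both statements shift by the same invariant amount, so they can be
         packed and expanded with a single vector/scalar shift.  */
      a[0] = b[0] >> 2;
      a[1] = b[1] >> 2;
    }
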
|| code == VEC_UNPACK_FLOAT_LO_EXPR)
type = TREE_TYPE (TREE_OPERAND (rhs, 0));
- op = optab_for_tree_code (code, type);
+ /* Choose between vector shift/rotate by vector and vector shift/rotate by
+ scalar.  */
+ if (code == LSHIFT_EXPR || code == RSHIFT_EXPR || code == LROTATE_EXPR
+ || code == RROTATE_EXPR)
+ {
+ /* If the second argument is a vector, we need a vector/vector shift.  */
+ if (VECTOR_MODE_P (TYPE_MODE (TREE_TYPE (TREE_OPERAND (rhs, 1)))))
+ op = optab_for_tree_code (code, type, optab_vector);
+
+ else
+ {
+ /* Try for a vector/scalar shift, and if we don't have one, see if we
+ have a vector/vector shift.  */
+ op = optab_for_tree_code (code, type, optab_scalar);
+ if (!op
+ || (op->handlers[(int) TYPE_MODE (type)].insn_code
+ == CODE_FOR_nothing))
+ op = optab_for_tree_code (code, type, optab_vector);
+ }
+ }
+ else
+ op = optab_for_tree_code (code, type, optab_default);
/* For widening/narrowing vector operations, the relevant type is of the
arguments, not the widened result. VEC_UNPACK_FLOAT_*_EXPR is
if (op == NULL
&& code == NEGATE_EXPR
&& INTEGRAL_TYPE_P (TREE_TYPE (type)))
- op = optab_for_tree_code (MINUS_EXPR, type);
+ op = optab_for_tree_code (MINUS_EXPR, type, optab_default);
/* For very wide vectors, try using a smaller vector mode. */
compute_type = type;
if (!pattern_vectype)
return;
- optab = optab_for_tree_code (TREE_CODE (pattern_expr), pattern_vectype);
+ optab = optab_for_tree_code (TREE_CODE (pattern_expr), pattern_vectype,
+ optab_default);
vec_mode = TYPE_MODE (pattern_vectype);
if (!optab
|| (icode = optab_handler (optab, vec_mode)->insn_code) ==
int element_bitsize = tree_low_cst (bitsize, 1);
int nelements = vec_size_in_bits / element_bitsize;
- optab = optab_for_tree_code (code, vectype);
+ optab = optab_for_tree_code (code, vectype, optab_default);
/* We have a whole vector shift available. */
if (VECTOR_MODE_P (mode)
have_whole_vector_shift = false;
else
{
- optab optab = optab_for_tree_code (code, vectype);
+ optab optab = optab_for_tree_code (code, vectype, optab_default);
if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
have_whole_vector_shift = false;
}
/* 4. Supportable by target? */
/* 4.1. check support for the operation in the loop */
- optab = optab_for_tree_code (code, vectype);
+ optab = optab_for_tree_code (code, vectype, optab_default);
if (!optab)
{
if (vect_print_dump_info (REPORT_DETAILS))
if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
return false;
- reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype);
+ reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype, optab_default);
if (!reduc_optab)
{
if (vect_print_dump_info (REPORT_DETAILS))
VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
tree vop0, vop1;
unsigned int k;
+ bool shift_p = false;
bool scalar_shift_arg = false;
/* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies
if (code == POINTER_PLUS_EXPR)
code = PLUS_EXPR;
- optab = optab_for_tree_code (code, vectype);
-
/* Support only unary or binary operations. */
op_type = TREE_OPERAND_LENGTH (operation);
if (op_type != unary_op && op_type != binary_op)
}
}
+ /* If this is a shift/rotate, determine whether the shift amount is a vector
+ or a scalar.  If the shift/rotate amount is a vector, use the vector/vector
+ shift optabs. */
+ if (code == LSHIFT_EXPR || code == RSHIFT_EXPR || code == LROTATE_EXPR
+ || code == RROTATE_EXPR)
+ {
+ shift_p = true;
+
+ /* Vector shifted by vector.  */
+ if (dt[1] == vect_loop_def)
+ {
+ optab = optab_for_tree_code (code, vectype, optab_vector);
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "vector/vector shift/rotate found.");
+ }
+
+ /* See if the machine has a vector-shifted-by-scalar insn, and if not,
+ see whether it has a vector-shifted-by-vector insn.  */
+ else if (dt[1] == vect_constant_def || dt[1] == vect_invariant_def)
+ {
+ optab = optab_for_tree_code (code, vectype, optab_scalar);
+ if (optab
+ && (optab_handler (optab, TYPE_MODE (vectype))->insn_code
+ != CODE_FOR_nothing))
+ {
+ scalar_shift_arg = true;
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "vector/scalar shift/rotate found.");
+ }
+ else
+ {
+ optab = optab_for_tree_code (code, vectype, optab_vector);
+ if (vect_print_dump_info (REPORT_DETAILS)
+ && optab
+ && (optab_handler (optab, TYPE_MODE (vectype))->insn_code
+ != CODE_FOR_nothing))
+ fprintf (vect_dump, "vector/vector shift/rotate found.");
+ }
+ }
+
+ else
+ {
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, "operand mode requires invariant argument.");
+ return false;
+ }
+ }
+ else
+ optab = optab_for_tree_code (code, vectype, optab_default);
+
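For illustration (these loops are not part of the patch), the classification above separates a loop-varying shift amount, which requires the vector/vector optab, from a constant or loop-invariant amount, which may use a vector/scalar pattern:

    void
    shift_classification_sketch (int *a, const int *b, const int *c, int n)
    {
      int i;

      /* The shift amount varies per element (a loop definition), so only
         a vector/vector (optab_vector) shift can implement it.  */
      for (i = 0; i < n; i++)
        a[i] = b[i] << c[i];

      /* The shift amount is invariant, so a vector/scalar (optab_scalar)
         shift is preferred when the target provides one.  */
      for (i = 0; i < n; i++)
        a[i] = b[i] << 3;
    }
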
/* Supportable by target? */
if (!optab)
{
return false;
}
- if (code == LSHIFT_EXPR || code == RSHIFT_EXPR)
- {
- /* FORNOW: not yet supported. */
- if (!VECTOR_MODE_P (vec_mode))
- return false;
-
- /* Invariant argument is needed for a vector shift
- by a scalar shift operand. */
- optab_op2_mode = insn_data[icode].operand[2].mode;
- if (!VECTOR_MODE_P (optab_op2_mode))
- {
- if (dt[1] != vect_constant_def && dt[1] != vect_invariant_def)
- {
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "operand mode requires invariant"
- " argument.");
- return false;
- }
-
- scalar_shift_arg = true;
- }
- }
-
if (!vec_stmt) /* transformation not required. */
{
STMT_VINFO_TYPE (stmt_info) = op_vec_info_type;
/* Handle uses. */
if (j == 0)
{
- if (op_type == binary_op
- && (code == LSHIFT_EXPR || code == RSHIFT_EXPR))
+ if (op_type == binary_op && scalar_shift_arg)
{
/* Vector shl and shr insn patterns can be defined with scalar
operand 2 (shift operand). In this case, use constant or loop
/* Check that the operation is supported. */
interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR,
- vectype);
+ vectype, optab_default);
interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR,
- vectype);
+ vectype, optab_default);
if (!interleave_high_optab || !interleave_low_optab)
{
if (vect_print_dump_info (REPORT_DETAILS))
mode = (int) TYPE_MODE (vectype);
- perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype);
+ perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype,
+ optab_default);
if (!perm_even_optab)
{
if (vect_print_dump_info (REPORT_DETAILS))
return false;
}
- perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype);
+ perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype,
+ optab_default);
if (!perm_odd_optab)
{
if (vect_print_dump_info (REPORT_DETAILS))
if (code == FIX_TRUNC_EXPR)
{
/* The signedness is determined from output operand. */
- optab1 = optab_for_tree_code (c1, type);
- optab2 = optab_for_tree_code (c2, type);
+ optab1 = optab_for_tree_code (c1, type, optab_default);
+ optab2 = optab_for_tree_code (c2, type, optab_default);
}
else
{
- optab1 = optab_for_tree_code (c1, vectype);
- optab2 = optab_for_tree_code (c2, vectype);
+ optab1 = optab_for_tree_code (c1, vectype, optab_default);
+ optab2 = optab_for_tree_code (c2, vectype, optab_default);
}
if (!optab1 || !optab2)
if (code == FIX_TRUNC_EXPR)
/* The signedness is determined from output operand. */
- optab1 = optab_for_tree_code (c1, type);
+ optab1 = optab_for_tree_code (c1, type, optab_default);
else
- optab1 = optab_for_tree_code (c1, vectype);
+ optab1 = optab_for_tree_code (c1, vectype, optab_default);
if (!optab1)
return false;