From 71d46ca56cb36baa54ec8a1de7cbc0fda9ae3245 Mon Sep 17 00:00:00 2001 From: Michael Meissner Date: Wed, 14 May 2008 20:07:53 +0000 Subject: [PATCH] Add SSE5 vector shift/rotate; Update SSE5 vector multiply Co-Authored-By: Dwarakanath Rajagopal Co-Authored-By: Paolo Bonzini From-SVN: r135304 --- gcc/ChangeLog | 151 +++++ gcc/config/i386/i386.c | 250 ++++---- gcc/config/i386/i386.md | 22 +- gcc/config/i386/sse.md | 575 +++++++++++++++++- gcc/config/rs6000/altivec.md | 12 +- gcc/config/rs6000/rs6000.c | 18 +- gcc/config/spu/spu-builtins.def | 48 +- gcc/config/spu/spu.c | 2 +- gcc/config/spu/spu.md | 39 +- gcc/doc/md.texi | 13 +- gcc/expmed.c | 28 +- gcc/expr.c | 29 +- gcc/genopinit.c | 5 + gcc/optabs.c | 42 +- gcc/optabs.h | 36 +- gcc/testsuite/ChangeLog | 13 + .../gcc.target/i386/sse5-imul32widen-vector.c | 36 ++ .../gcc.target/i386/sse5-imul64-vector.c | 36 ++ .../gcc.target/i386/sse5-rotate1-vector.c | 35 ++ .../gcc.target/i386/sse5-rotate2-vector.c | 35 ++ .../gcc.target/i386/sse5-rotate3-vector.c | 34 ++ .../gcc.target/i386/sse5-shift1-vector.c | 35 ++ .../gcc.target/i386/sse5-shift2-vector.c | 35 ++ .../gcc.target/i386/sse5-shift3-vector.c | 35 ++ gcc/tree-vect-analyze.c | 55 +- gcc/tree-vect-generic.c | 25 +- gcc/tree-vect-patterns.c | 3 +- gcc/tree-vect-transform.c | 97 +-- gcc/tree-vectorizer.c | 12 +- 29 files changed, 1449 insertions(+), 307 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/sse5-imul32widen-vector.c create mode 100644 gcc/testsuite/gcc.target/i386/sse5-imul64-vector.c create mode 100644 gcc/testsuite/gcc.target/i386/sse5-rotate1-vector.c create mode 100644 gcc/testsuite/gcc.target/i386/sse5-rotate2-vector.c create mode 100644 gcc/testsuite/gcc.target/i386/sse5-rotate3-vector.c create mode 100644 gcc/testsuite/gcc.target/i386/sse5-shift1-vector.c create mode 100644 gcc/testsuite/gcc.target/i386/sse5-shift2-vector.c create mode 100644 gcc/testsuite/gcc.target/i386/sse5-shift3-vector.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 6b7e6d60a66..cfc8b09d5e4 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,154 @@ +2008-05-14 Michael Meissner + Dwarakanath Rajagopal + + * optabs.h (optab_index): Add OTI_vashl, OTI_vlshr, OTI_vashr, + OTI_vrotl, OTI_vrotr to support vector/vector shifts. + (vashl_optab): New optab for vector/vector shifts. + (vashr_optab): Ditto. + (vlshr_optab): Ditto. + (vrotl_optab): Ditto. + (vrotr_optab): Ditto. + (optab_subtype): New enum for optab_for_tree_code call. + (optab_for_tree_code): Add enum optab_subtype argument. + + * optabs.c (optab_for_tree_code): Take an additional argument to + distinguish between a vector shift by a scalar and vector shift by + a vector. Make lshr/ashr/ashl/rotl/rotr optabs just vector + shifted by a scalar. Use vlshr/vashr/vashl/vrotl/vrotr for the + vector shift by a vector. + (expand_widen_pattern_expr): Pass additional argument to + optab_for_tree_code. + + * genopinit.c (optabs): Add vashr_optab, vashl_optab, vlshr_optab, + vrotl_optab, vrotr_optab. + + * expr.c (expand_expr_real_1): Update calls to + optab_for_tree_code to distinguish between vector shifted by a + scalar and vector shifted by a vector. + * tree-vectorizer.c (supportable_widening_operation): Ditto. + (supportable_narrowing_operation): Ditto. + * tree-vect-analyze.c (vect_build_slp_tree): Ditto. + * tree-vect-patterns.c (vect_pattern_recog_1): Ditto. + * tree-vect-transform.c (vect_model_reduction_cost): Ditto. + (vect_create_epilog_for_reduction): Ditto. + (vectorizable_reduction): Ditto. 
+ (vectorizable_operation): Ditto. + (vect_strided_store_supported): Ditto. + (vect_strided_load_supported): Ditto. + * tree-vect-generic.c (expand_vector_operations_1): Ditto. + * expmed.c (expand_shift): Ditto. + + * doc/md.texi (ashl@var{m}3): Document that operand 2 is always a + scalar type. + (ashr@var{m}3): Ditto. + (vashl@var{m}3): Document new vector/vector shift standard name. + (vashr@var{m}3): Ditto. + (vlshr@var{m}3): Ditto. + (vrotl@var{m}3): Ditto. + (vrotr@var{m}3): Ditto. + + * config/i386/i386.md (PPERM_SRC): Move PPERM masks here from + i386.c. + (PPERM_INVERT): Ditto. + (PPERM_REVERSE): Ditto. + (PPERM_REV_INV): Ditto. + (PPERM_ZERO): Ditto. + (PPERM_ONES): Ditto. + (PPERM_SIGN): Ditto. + (PPERM_INV_SIGN): Ditto. + (PPERM_SRC1): Ditto. + (PPERM_SRC2): Ditto. + + * config/i386/sse.md (mulv2di3): Add SSE5 support. + (sse5_pmacsdql_mem): New SSE5 define_and_split that temporarily + allows a memory operand to be the value being added, and split it + to improve vectorization. + (sse5_pmacsdqh_mem): Ditto. + (sse5_mulv2div2di3_low): SSE5 32-bit multiply and extend function. + (sse5_mulv2div2di3_high): Ditto. + (vec_pack_trunc_v8hi): Add SSE5 pperm support. + (vec_pack_trunc_v4si): Ditto. + (vec_pack_trunc_v2di): Ditto. + (sse5_pcmov_): Remove code that tried to use use + andps/andnps instead of pcmov. + (vec_widen_smult_hi_v4si): If we have SSE5, use the pmacsdql and + pmacsdqh instructions. + (vec_widen_smult_lo_v4si): Ditto. + + * config/i386/i386.c (PPERM_SRC): Move PPERM masks to i386.md. + (PPERM_INVERT): Ditto. + (PPERM_REVERSE): Ditto. + (PPERM_REV_INV): Ditto. + (PPERM_ZERO): Ditto. + (PPERM_ONES): Ditto. + (PPERM_SIGN): Ditto. + (PPERM_INV_SIGN): Ditto. + (PPERM_SRC1): Ditto. + (PPERM_SRC2): Ditto. + (ix86_expand_sse_movcc): Move the SSE5 test after the if + true/false tests. + (ix86_expand_int_vcond): If SSE5 generate all possible integer + comparisons. + (ix86_sse5_valid_op_p): Allow num_memory to be negative, which + says ignore whether the last reference is a memory operand. + +2008-05-14 Michael Meissner + Paolo Bonzini + + * config/rs6000/rs6000.c (bdesc_2arg): Change the names of vector + shift patterns. + + * config/rs6000/altivec.md (vashl3): Rename from + ashl3. + (vlshr3): Rename from vlshr3. + (vashr3): Rename from vashr3. + (mulv4sf3): Change the names of vector shift patterns. + (mulv4si3): Ditto. + (negv4sf2): Ditt. + + * config/spu/spu.c (spu_initialize_trampoline): Rename vector + shift insns. + + * config/spu/spu-builtins.def (SI_SHLH): Rename vector shift + insns. + (SI_SHLHI): Ditto. + (SI_SHL): Ditto. + (SI_SHLI): Ditto. + (SI_ROTH): Ditto. + (SI_ROTHI): Ditto. + (SI_ROT): Ditto. + (SI_ROTI): Ditto. + (SPU_RL_0): Ditto. + (SPU_RL_1): Ditto. + (SPU_RL_2): Ditto. + (SPU_RL_3): Ditto. + (SPU_RL_4): Ditto. + (SPU_RL_5): Ditto. + (SPU_RL_6): Ditto. + (SPU_RL_7): Ditto. + (SPU_SL_0): Ditto. + (SPU_SL_1): Ditto. + (SPU_SL_2): Ditto. + (SPU_SL_3): Ditto. + (SPU_SL_4): Ditto. + (SPU_SL_5): Ditto. + (SPU_SL_6): Ditto. + (SPU_SL_7): Ditto. + + * config/spu/spu.md (v): New iterator macro to add v for vector types. + (floatunssidf2_internal): Change vector/vector shift names. + (floatunsdidf2_internal): Ditto. + (mulv8hi3): Ditto. + (ashrdi3): Ditto. + (ashrti3): Ditto. + (cgt_df): Ditto. + (cgt_v2df): Ditto. + (dftsv): Ditto. + (vashl3): Rename from ashl3. + (vashr3): Rename from ashr3. + (vlshr3): Rename from lshr3. + (vrotl3): Rename from rotl3. 
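In short: a vector shifted by a scalar amount keeps the existing ashl/ashr/lshr/rotl/rotr optabs, while a vector shifted element-wise by another vector now uses the new vashl/vashr/vlshr/vrotl/vrotr optabs, and optab_for_tree_code / expand_shift choose between the two families from the mode of the shift amount.  A minimal sketch of that selection, modeled on the expand_shift change in gcc/expmed.c further down; the helper name and signature are illustrative and the usual GCC internal headers (optabs.h, tree.h) are assumed:

static optab
shift_optab_for (enum tree_code code, enum machine_mode mode,
		 enum machine_mode amount_mode, bool unsignedp)
{
  /* The vector/vector optabs apply only when the shift amount is itself
     a vector; a vector shifted by a single scalar keeps the old optabs.  */
  bool by_vector = VECTOR_MODE_P (mode) && VECTOR_MODE_P (amount_mode);

  switch (code)
    {
    case LSHIFT_EXPR:
      return by_vector ? vashl_optab : ashl_optab;
    case RSHIFT_EXPR:
      if (unsignedp)
	return by_vector ? vlshr_optab : lshr_optab;
      return by_vector ? vashr_optab : ashr_optab;
    case LROTATE_EXPR:
      return by_vector ? vrotl_optab : rotl_optab;
    case RROTATE_EXPR:
      return by_vector ? vrotr_optab : rotr_optab;
    default:
      gcc_unreachable ();
    }
}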
+ 2008-05-14 Michael Meissner PR target/36224 diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 945cd1b8445..d8fdc22226c 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -13306,15 +13306,7 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) enum machine_mode mode = GET_MODE (dest); rtx t2, t3, x; - if (TARGET_SSE5) - { - rtx pcmov = gen_rtx_SET (mode, dest, - gen_rtx_IF_THEN_ELSE (mode, cmp, - op_true, - op_false)); - emit_insn (pcmov); - } - else if (op_false == CONST0_RTX (mode)) + if (op_false == CONST0_RTX (mode)) { op_true = force_reg (mode, op_true); x = gen_rtx_AND (mode, cmp, op_true); @@ -13327,6 +13319,14 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) x = gen_rtx_AND (mode, x, op_false); emit_insn (gen_rtx_SET (VOIDmode, dest, x)); } + else if (TARGET_SSE5) + { + rtx pcmov = gen_rtx_SET (mode, dest, + gen_rtx_IF_THEN_ELSE (mode, cmp, + op_true, + op_false)); + emit_insn (pcmov); + } else { op_true = force_reg (mode, op_true); @@ -13472,115 +13472,119 @@ ix86_expand_int_vcond (rtx operands[]) cop0 = operands[4]; cop1 = operands[5]; - /* Canonicalize the comparison to EQ, GT, GTU. */ - switch (code) - { - case EQ: - case GT: - case GTU: - break; - - case NE: - case LE: - case LEU: - code = reverse_condition (code); - negate = true; - break; - - case GE: - case GEU: - code = reverse_condition (code); - negate = true; - /* FALLTHRU */ - - case LT: - case LTU: - code = swap_condition (code); - x = cop0, cop0 = cop1, cop1 = x; - break; - - default: - gcc_unreachable (); - } - - /* Only SSE4.1/SSE4.2 supports V2DImode. */ - if (mode == V2DImode) + /* SSE5 supports all of the comparisons on all vector int types. */ + if (!TARGET_SSE5) { + /* Canonicalize the comparison to EQ, GT, GTU. */ switch (code) { case EQ: - /* SSE4.1 supports EQ. */ - if (!TARGET_SSE4_1) - return false; - break; - case GT: case GTU: - /* SSE4.2 supports GT/GTU. */ - if (!TARGET_SSE4_2) - return false; + break; + + case NE: + case LE: + case LEU: + code = reverse_condition (code); + negate = true; + break; + + case GE: + case GEU: + code = reverse_condition (code); + negate = true; + /* FALLTHRU */ + + case LT: + case LTU: + code = swap_condition (code); + x = cop0, cop0 = cop1, cop1 = x; break; default: gcc_unreachable (); } - } - /* Unsigned parallel compare is not supported by the hardware. Play some - tricks to turn this into a signed comparison against 0. */ - if (code == GTU) - { - cop0 = force_reg (mode, cop0); + /* Only SSE4.1/SSE4.2 supports V2DImode. */ + if (mode == V2DImode) + { + switch (code) + { + case EQ: + /* SSE4.1 supports EQ. */ + if (!TARGET_SSE4_1) + return false; + break; - switch (mode) + case GT: + case GTU: + /* SSE4.2 supports GT/GTU. */ + if (!TARGET_SSE4_2) + return false; + break; + + default: + gcc_unreachable (); + } + } + + /* Unsigned parallel compare is not supported by the hardware. Play some + tricks to turn this into a signed comparison against 0. */ + if (code == GTU) { - case V4SImode: - case V2DImode: - { - rtx t1, t2, mask; - - /* Perform a parallel modulo subtraction. */ - t1 = gen_reg_rtx (mode); - emit_insn ((mode == V4SImode - ? gen_subv4si3 - : gen_subv2di3) (t1, cop0, cop1)); - - /* Extract the original sign bit of op0. */ - mask = ix86_build_signbit_mask (GET_MODE_INNER (mode), - true, false); - t2 = gen_reg_rtx (mode); - emit_insn ((mode == V4SImode - ? gen_andv4si3 - : gen_andv2di3) (t2, cop0, mask)); - - /* XOR it back into the result of the subtraction. 
This results - in the sign bit set iff we saw unsigned underflow. */ - x = gen_reg_rtx (mode); - emit_insn ((mode == V4SImode - ? gen_xorv4si3 - : gen_xorv2di3) (x, t1, t2)); - - code = GT; - } - break; + cop0 = force_reg (mode, cop0); - case V16QImode: - case V8HImode: - /* Perform a parallel unsigned saturating subtraction. */ - x = gen_reg_rtx (mode); - emit_insn (gen_rtx_SET (VOIDmode, x, - gen_rtx_US_MINUS (mode, cop0, cop1))); + switch (mode) + { + case V4SImode: + case V2DImode: + { + rtx t1, t2, mask; + + /* Perform a parallel modulo subtraction. */ + t1 = gen_reg_rtx (mode); + emit_insn ((mode == V4SImode + ? gen_subv4si3 + : gen_subv2di3) (t1, cop0, cop1)); + + /* Extract the original sign bit of op0. */ + mask = ix86_build_signbit_mask (GET_MODE_INNER (mode), + true, false); + t2 = gen_reg_rtx (mode); + emit_insn ((mode == V4SImode + ? gen_andv4si3 + : gen_andv2di3) (t2, cop0, mask)); + + /* XOR it back into the result of the subtraction. This results + in the sign bit set iff we saw unsigned underflow. */ + x = gen_reg_rtx (mode); + emit_insn ((mode == V4SImode + ? gen_xorv4si3 + : gen_xorv2di3) (x, t1, t2)); + + code = GT; + } + break; - code = EQ; - negate = !negate; - break; + case V16QImode: + case V8HImode: + /* Perform a parallel unsigned saturating subtraction. */ + x = gen_reg_rtx (mode); + emit_insn (gen_rtx_SET (VOIDmode, x, + gen_rtx_US_MINUS (mode, cop0, cop1))); - default: - gcc_unreachable (); - } + code = EQ; + negate = !negate; + break; + + default: + gcc_unreachable (); + } - cop0 = x; - cop1 = CONST0_RTX (mode); + cop0 = x; + cop1 = CONST0_RTX (mode); + } } x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1, @@ -13687,19 +13691,7 @@ ix86_expand_sse4_unpack (rtx operands[2], bool unsigned_p, bool high_p) } /* This function performs the same task as ix86_expand_sse_unpack, - but with amdfam15 instructions. */ - -#define PPERM_SRC 0x00 /* copy source */ -#define PPERM_INVERT 0x20 /* invert source */ -#define PPERM_REVERSE 0x40 /* bit reverse source */ -#define PPERM_REV_INV 0x60 /* bit reverse & invert src */ -#define PPERM_ZERO 0x80 /* all 0's */ -#define PPERM_ONES 0xa0 /* all 1's */ -#define PPERM_SIGN 0xc0 /* propagate sign bit */ -#define PPERM_INV_SIGN 0xe0 /* invert & propagate sign */ - -#define PPERM_SRC1 0x00 /* use first source byte */ -#define PPERM_SRC2 0x10 /* use second source byte */ + but with sse5 instructions. 
*/ void ix86_expand_sse5_unpack (rtx operands[2], bool unsigned_p, bool high_p) @@ -18773,14 +18765,14 @@ static const struct builtin_description bdesc_multi_arg[] = { OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_pmacsdqh, "__builtin_ia32_pmacsdqh", IX86_BUILTIN_PMACSDQH, 0, (int)MULTI_ARG_3_SI_DI }, { OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_pmadcsswd, "__builtin_ia32_pmadcsswd", IX86_BUILTIN_PMADCSSWD, 0, (int)MULTI_ARG_3_HI_SI }, { OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_pmadcswd, "__builtin_ia32_pmadcswd", IX86_BUILTIN_PMADCSWD, 0, (int)MULTI_ARG_3_HI_SI }, - { OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_rotlv2di3, "__builtin_ia32_protq", IX86_BUILTIN_PROTQ, 0, (int)MULTI_ARG_2_DI }, - { OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_rotlv4si3, "__builtin_ia32_protd", IX86_BUILTIN_PROTD, 0, (int)MULTI_ARG_2_SI }, - { OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_rotlv8hi3, "__builtin_ia32_protw", IX86_BUILTIN_PROTW, 0, (int)MULTI_ARG_2_HI }, - { OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_rotlv16qi3, "__builtin_ia32_protb", IX86_BUILTIN_PROTB, 0, (int)MULTI_ARG_2_QI }, - { OPTION_MASK_ISA_SSE5, CODE_FOR_rotlv2di3, "__builtin_ia32_protqi", IX86_BUILTIN_PROTQ_IMM, 0, (int)MULTI_ARG_2_DI_IMM }, - { OPTION_MASK_ISA_SSE5, CODE_FOR_rotlv4si3, "__builtin_ia32_protdi", IX86_BUILTIN_PROTD_IMM, 0, (int)MULTI_ARG_2_SI_IMM }, - { OPTION_MASK_ISA_SSE5, CODE_FOR_rotlv8hi3, "__builtin_ia32_protwi", IX86_BUILTIN_PROTW_IMM, 0, (int)MULTI_ARG_2_HI_IMM }, - { OPTION_MASK_ISA_SSE5, CODE_FOR_rotlv16qi3, "__builtin_ia32_protbi", IX86_BUILTIN_PROTB_IMM, 0, (int)MULTI_ARG_2_QI_IMM }, + { OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_vrotlv2di3, "__builtin_ia32_protq", IX86_BUILTIN_PROTQ, 0, (int)MULTI_ARG_2_DI }, + { OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_vrotlv4si3, "__builtin_ia32_protd", IX86_BUILTIN_PROTD, 0, (int)MULTI_ARG_2_SI }, + { OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_vrotlv8hi3, "__builtin_ia32_protw", IX86_BUILTIN_PROTW, 0, (int)MULTI_ARG_2_HI }, + { OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_vrotlv16qi3, "__builtin_ia32_protb", IX86_BUILTIN_PROTB, 0, (int)MULTI_ARG_2_QI }, + { OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_rotlv2di3, "__builtin_ia32_protqi", IX86_BUILTIN_PROTQ_IMM, 0, (int)MULTI_ARG_2_DI_IMM }, + { OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_rotlv4si3, "__builtin_ia32_protdi", IX86_BUILTIN_PROTD_IMM, 0, (int)MULTI_ARG_2_SI_IMM }, + { OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_rotlv8hi3, "__builtin_ia32_protwi", IX86_BUILTIN_PROTW_IMM, 0, (int)MULTI_ARG_2_HI_IMM }, + { OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_rotlv16qi3, "__builtin_ia32_protbi", IX86_BUILTIN_PROTB_IMM, 0, (int)MULTI_ARG_2_QI_IMM }, { OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_ashlv2di3, "__builtin_ia32_pshaq", IX86_BUILTIN_PSHAQ, 0, (int)MULTI_ARG_2_DI }, { OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_ashlv4si3, "__builtin_ia32_pshad", IX86_BUILTIN_PSHAD, 0, (int)MULTI_ARG_2_SI }, { OPTION_MASK_ISA_SSE5, CODE_FOR_sse5_ashlv8hi3, "__builtin_ia32_pshaw", IX86_BUILTIN_PSHAW, 0, (int)MULTI_ARG_2_HI }, @@ -25331,8 +25323,10 @@ ix86_expand_round (rtx operand0, rtx operand1) NUM is the number of operands. USES_OC0 is true if the instruction uses OC0 and provides 4 variants. NUM_MEMORY is the maximum number of memory operands to accept. 
*/ + bool -ix86_sse5_valid_op_p (rtx operands[], rtx insn, int num, bool uses_oc0, int num_memory) +ix86_sse5_valid_op_p (rtx operands[], rtx insn ATTRIBUTE_UNUSED, int num, + bool uses_oc0, int num_memory) { int mem_mask; int mem_count; @@ -25366,6 +25360,18 @@ ix86_sse5_valid_op_p (rtx operands[], rtx insn, int num, bool uses_oc0, int num_ } } + /* Special case pmacsdq{l,h} where we allow the 3rd argument to be + a memory operation. */ + if (num_memory < 0) + { + num_memory = -num_memory; + if ((mem_mask & (1 << (num-1))) != 0) + { + mem_mask &= ~(1 << (num-1)); + mem_count--; + } + } + /* If there were no memory operations, allow the insn */ if (mem_mask == 0) return true; diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index b5e9082ceec..145c373ff75 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -181,11 +181,9 @@ (UNSPEC_SSE5_UNSIGNED_CMP 151) (UNSPEC_SSE5_TRUEFALSE 152) (UNSPEC_SSE5_PERMUTE 153) - (UNSPEC_SSE5_ASHIFT 154) - (UNSPEC_SSE5_LSHIFT 155) - (UNSPEC_FRCZ 156) - (UNSPEC_CVTPH2PS 157) - (UNSPEC_CVTPS2PH 158) + (UNSPEC_FRCZ 154) + (UNSPEC_CVTPH2PS 155) + (UNSPEC_CVTPS2PH 156) ; For AES support (UNSPEC_AESENC 159) @@ -227,6 +225,20 @@ (COM_TRUE_P 5) ]) +;; Constants used in the SSE5 pperm instruction +(define_constants + [(PPERM_SRC 0x00) /* copy source */ + (PPERM_INVERT 0x20) /* invert source */ + (PPERM_REVERSE 0x40) /* bit reverse source */ + (PPERM_REV_INV 0x60) /* bit reverse & invert src */ + (PPERM_ZERO 0x80) /* all 0's */ + (PPERM_ONES 0xa0) /* all 1's */ + (PPERM_SIGN 0xc0) /* propagate sign bit */ + (PPERM_INV_SIGN 0xe0) /* invert & propagate sign */ + (PPERM_SRC1 0x00) /* use first source byte */ + (PPERM_SRC2 0x10) /* use second source byte */ + ]) + ;; Registers by name. (define_constants [(AX_REG 0) diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 88822c5fb31..6ee090a6b4a 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -53,7 +53,14 @@ (define_mode_attr sserotatemax [(V16QI "7") (V8HI "15") (V4SI "31") (V2DI "63")]) ;; Mapping of vector modes back to the scalar modes -(define_mode_attr ssescalarmode [(V4SF "SF") (V2DF "DF")]) +(define_mode_attr ssescalarmode [(V4SF "SF") (V2DF "DF") + (V16QI "QI") (V8HI "HI") + (V4SI "SI") (V2DI "DI")]) + +;; Number of scalar elements in each vector type +(define_mode_attr ssescalarnum [(V4SF "4") (V2DF "2") + (V16QI "16") (V8HI "8") + (V4SI "4") (V2DI "2")]) ;; Mapping of immediate bits for blend instructions (define_mode_attr blendbits [(V4SF "15") (V2DF "3")]) @@ -3154,7 +3161,7 @@ ;; We don't have a straight 32-bit parallel multiply on SSE5, so fake it with a ;; multiply/add. In general, we expect the define_split to occur before ;; register allocation, so we have to handle the corner case where the target -;; is used as the base or index register in operands 1/2. +;; is the same as one of the inputs. 
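For context on the comment above: SSE5 only provides multiply-accumulate forms (pmacsdd, pmacsdql, ...), so a plain packed multiply is synthesised as a * b + 0, and the split first loads the zero accumulator into the destination register (see the mulv2div2di3 splits below).  That is also why the destination is early-clobbered ("=&x") and the split is only taken before reload when the destination is not one of the inputs.  A one-lane scalar model, purely illustrative:

#include <stdint.h>

/* Scalar model of one lane of the multiply-via-multiply/add trick: the
   accumulator (the destination) is cleared first, then the multiply-
   accumulate adds a*b onto it.  If the destination aliased an input,
   the first step would clobber that input.  */
static int32_t
mul_via_macc (int32_t a, int32_t b)
{
  int32_t dst = 0;       /* first insn of the split: zero the accumulator */
  dst = a * b + dst;     /* second insn: pmacsdd-style multiply/add */
  return dst;
}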
(define_insn_and_split "*sse5_mulv4si3" [(set (match_operand:V4SI 0 "register_operand" "=&x") (mult:V4SI (match_operand:V4SI 1 "register_operand" "%x") @@ -3242,6 +3249,42 @@ rtx t1, t2, t3, t4, t5, t6, thirtytwo; rtx op0, op1, op2; + if (TARGET_SSE5) + { + /* op1: A,B,C,D, op2: E,F,G,H */ + op0 = operands[0]; + op1 = gen_lowpart (V4SImode, operands[1]); + op2 = gen_lowpart (V4SImode, operands[2]); + t1 = gen_reg_rtx (V4SImode); + t2 = gen_reg_rtx (V4SImode); + t3 = gen_reg_rtx (V4SImode); + t4 = gen_reg_rtx (V2DImode); + t5 = gen_reg_rtx (V2DImode); + + /* t1: B,A,D,C */ + emit_insn (gen_sse2_pshufd_1 (t1, op1, + GEN_INT (1), + GEN_INT (0), + GEN_INT (3), + GEN_INT (2))); + + /* t2: 0 */ + emit_move_insn (t2, CONST0_RTX (V4SImode)); + + /* t3: (B*E),(A*F),(D*G),(C*H) */ + emit_insn (gen_sse5_pmacsdd (t3, t1, op2, t2)); + + /* t4: (B*E)+(A*F), (D*G)+(C*H) */ + emit_insn (gen_sse5_phadddq (t4, t3)); + + /* t5: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */ + emit_insn (gen_ashlv2di3 (t5, t4, GEN_INT (32))); + + /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */ + emit_insn (gen_sse5_pmacsdql (op0, op1, op2, t5)); + DONE; + } + op0 = operands[0]; op1 = operands[1]; op2 = operands[2]; @@ -3357,6 +3400,57 @@ DONE; }) +(define_expand "vec_widen_smult_hi_v4si" + [(match_operand:V2DI 0 "register_operand" "") + (match_operand:V4SI 1 "register_operand" "") + (match_operand:V4SI 2 "register_operand" "")] + "TARGET_SSE5" +{ + rtx t1, t2; + + t1 = gen_reg_rtx (V4SImode); + t2 = gen_reg_rtx (V4SImode); + + emit_insn (gen_sse2_pshufd_1 (t1, operands[1], + GEN_INT (0), + GEN_INT (2), + GEN_INT (1), + GEN_INT (3))); + emit_insn (gen_sse2_pshufd_1 (t2, operands[2], + GEN_INT (0), + GEN_INT (2), + GEN_INT (1), + GEN_INT (3))); + emit_insn (gen_sse5_mulv2div2di3_high (operands[0], t1, t2)); + DONE; +}) + +(define_expand "vec_widen_smult_lo_v4si" + [(match_operand:V2DI 0 "register_operand" "") + (match_operand:V4SI 1 "register_operand" "") + (match_operand:V4SI 2 "register_operand" "")] + "TARGET_SSE5" +{ + rtx t1, t2; + + t1 = gen_reg_rtx (V4SImode); + t2 = gen_reg_rtx (V4SImode); + + emit_insn (gen_sse2_pshufd_1 (t1, operands[1], + GEN_INT (0), + GEN_INT (2), + GEN_INT (1), + GEN_INT (3))); + emit_insn (gen_sse2_pshufd_1 (t2, operands[2], + GEN_INT (0), + GEN_INT (2), + GEN_INT (1), + GEN_INT (3))); + emit_insn (gen_sse5_mulv2div2di3_low (operands[0], t1, t2)); + DONE; + DONE; +}) + (define_expand "vec_widen_umult_hi_v4si" [(match_operand:V2DI 0 "register_operand" "") (match_operand:V4SI 1 "register_operand" "") @@ -3893,6 +3987,12 @@ { rtx op1, op2, h1, l1, h2, l2, h3, l3; + if (TARGET_SSE5) + { + ix86_expand_sse5_pack (operands); + DONE; + } + op1 = gen_lowpart (V16QImode, operands[1]); op2 = gen_lowpart (V16QImode, operands[2]); h1 = gen_reg_rtx (V16QImode); @@ -3928,6 +4028,12 @@ { rtx op1, op2, h1, l1, h2, l2; + if (TARGET_SSE5) + { + ix86_expand_sse5_pack (operands); + DONE; + } + op1 = gen_lowpart (V8HImode, operands[1]); op2 = gen_lowpart (V8HImode, operands[2]); h1 = gen_reg_rtx (V8HImode); @@ -3957,6 +4063,12 @@ { rtx op1, op2, h1, l1; + if (TARGET_SSE5) + { + ix86_expand_sse5_pack (operands); + DONE; + } + op1 = gen_lowpart (V4SImode, operands[1]); op2 = gen_lowpart (V4SImode, operands[2]); h1 = gen_reg_rtx (V4SImode); @@ -7024,6 +7136,87 @@ [(set_attr "type" "ssemuladd") (set_attr "mode" "TI")]) +(define_insn_and_split "*sse5_pmacsdql_mem" + [(set (match_operand:V2DI 0 "register_operand" "=&x,&x,&x") + (plus:V2DI + (mult:V2DI + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 1 
"nonimmediate_operand" "x,x,m") + (parallel [(const_int 1) + (const_int 3)]))) + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 2 "nonimmediate_operand" "x,m,x") + (parallel [(const_int 1) + (const_int 3)])))) + (match_operand:V2DI 3 "memory_operand" "m,m,m")))] + "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, false, -1)" + "#" + "&& (reload_completed + || (!reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2])))" + [(set (match_dup 0) + (match_dup 3)) + (set (match_dup 0) + (plus:V2DI + (mult:V2DI + (sign_extend:V2DI + (vec_select:V2SI + (match_dup 1) + (parallel [(const_int 1) + (const_int 3)]))) + (sign_extend:V2DI + (vec_select:V2SI + (match_dup 2) + (parallel [(const_int 1) + (const_int 3)])))) + (match_dup 0)))]) + +;; We don't have a straight 32-bit parallel multiply and extend on SSE5, so +;; fake it with a multiply/add. In general, we expect the define_split to +;; occur before register allocation, so we have to handle the corner case where +;; the target is the same as operands 1/2 +(define_insn_and_split "sse5_mulv2div2di3_low" + [(set (match_operand:V2DI 0 "register_operand" "=&x") + (mult:V2DI + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 1 "nonimmediate_operand" "%x") + (parallel [(const_int 1) + (const_int 3)]))) + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 1) + (const_int 3)])))))] + "TARGET_SSE5" + "#" + "&& (reload_completed + || (!reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2])))" + [(set (match_dup 0) + (match_dup 3)) + (set (match_dup 0) + (plus:V2DI + (mult:V2DI + (sign_extend:V2DI + (vec_select:V2SI + (match_dup 1) + (parallel [(const_int 1) + (const_int 3)]))) + (sign_extend:V2DI + (vec_select:V2SI + (match_dup 2) + (parallel [(const_int 1) + (const_int 3)])))) + (match_dup 0)))] +{ + operands[3] = CONST0_RTX (V2DImode); +} + [(set_attr "type" "ssemuladd") + (set_attr "mode" "TI")]) + (define_insn "sse5_pmacsdqh" [(set (match_operand:V2DI 0 "register_operand" "=x,x,x") (plus:V2DI @@ -7047,6 +7240,87 @@ [(set_attr "type" "ssemuladd") (set_attr "mode" "TI")]) +(define_insn_and_split "*sse5_pmacsdqh_mem" + [(set (match_operand:V2DI 0 "register_operand" "=&x,&x,&x") + (plus:V2DI + (mult:V2DI + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 1 "nonimmediate_operand" "x,x,m") + (parallel [(const_int 0) + (const_int 2)]))) + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 2 "nonimmediate_operand" "x,m,x") + (parallel [(const_int 0) + (const_int 2)])))) + (match_operand:V2DI 3 "memory_operand" "m,m,m")))] + "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, false, -1)" + "#" + "&& (reload_completed + || (!reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2])))" + [(set (match_dup 0) + (match_dup 3)) + (set (match_dup 0) + (plus:V2DI + (mult:V2DI + (sign_extend:V2DI + (vec_select:V2SI + (match_dup 1) + (parallel [(const_int 0) + (const_int 2)]))) + (sign_extend:V2DI + (vec_select:V2SI + (match_dup 2) + (parallel [(const_int 0) + (const_int 2)])))) + (match_dup 0)))]) + +;; We don't have a straight 32-bit parallel multiply and extend on SSE5, so +;; fake it with a multiply/add. 
In general, we expect the define_split to +;; occur before register allocation, so we have to handle the corner case where +;; the target is the same as either operands[1] or operands[2] +(define_insn_and_split "sse5_mulv2div2di3_high" + [(set (match_operand:V2DI 0 "register_operand" "=&x") + (mult:V2DI + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 1 "nonimmediate_operand" "%x") + (parallel [(const_int 0) + (const_int 2)]))) + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 2)])))))] + "TARGET_SSE5" + "#" + "&& (reload_completed + || (!reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2])))" + [(set (match_dup 0) + (match_dup 3)) + (set (match_dup 0) + (plus:V2DI + (mult:V2DI + (sign_extend:V2DI + (vec_select:V2SI + (match_dup 1) + (parallel [(const_int 0) + (const_int 2)]))) + (sign_extend:V2DI + (vec_select:V2SI + (match_dup 2) + (parallel [(const_int 0) + (const_int 2)])))) + (match_dup 0)))] +{ + operands[3] = CONST0_RTX (V2DImode); +} + [(set_attr "type" "ssemuladd") + (set_attr "mode" "TI")]) + ;; SSE5 parallel integer multiply/add instructions for the intrinisics (define_insn "sse5_pmacsswd" [(set (match_operand:V4SI 0 "register_operand" "=x,x,x") @@ -7190,19 +7464,17 @@ ;; SSE5 parallel XMM conditional moves (define_insn "sse5_pcmov_" - [(set (match_operand:SSEMODE 0 "register_operand" "=x,x,x,x,x,x") + [(set (match_operand:SSEMODE 0 "register_operand" "=x,x,x,x") (if_then_else:SSEMODE - (match_operand:SSEMODE 3 "nonimmediate_operand" "0,0,xm,x,0,0") - (match_operand:SSEMODE 1 "vector_move_operand" "x,xm,0,0,C,x") - (match_operand:SSEMODE 2 "vector_move_operand" "xm,x,x,xm,x,C")))] + (match_operand:SSEMODE 3 "nonimmediate_operand" "0,0,xm,x") + (match_operand:SSEMODE 1 "vector_move_operand" "x,xm,0,0") + (match_operand:SSEMODE 2 "vector_move_operand" "xm,x,x,xm")))] "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 4, true, 1)" "@ pcmov\t{%3, %2, %1, %0|%0, %1, %2, %3} pcmov\t{%3, %2, %1, %0|%0, %1, %2, %3} pcmov\t{%3, %2, %1, %0|%0, %1, %2, %3} - pcmov\t{%3, %2, %1, %0|%0, %1, %2, %3} - andps\t{%2, %0|%0, %2} - andnps\t{%1, %0|%0, %1}" + pcmov\t{%3, %2, %1, %0|%0, %1, %2, %3}" [(set_attr "type" "sse4arg")]) ;; SSE5 horizontal add/subtract instructions @@ -7801,7 +8073,71 @@ (set_attr "mode" "")]) ;; SSE5 packed rotate instructions -(define_insn "rotl3" +(define_expand "rotl3" + [(set (match_operand:SSEMODE1248 0 "register_operand" "") + (rotate:SSEMODE1248 + (match_operand:SSEMODE1248 1 "nonimmediate_operand" "") + (match_operand:SI 2 "general_operand")))] + "TARGET_SSE5" +{ + /* If we were given a scalar, convert it to parallel */ + if (! const_0_to__operand (operands[2], SImode)) + { + rtvec vs = rtvec_alloc (); + rtx par = gen_rtx_PARALLEL (mode, vs); + rtx reg = gen_reg_rtx (mode); + rtx op2 = operands[2]; + int i; + + if (GET_MODE (op2) != mode) + { + op2 = gen_reg_rtx (mode); + convert_move (op2, operands[2], false); + } + + for (i = 0; i < ; i++) + RTVEC_ELT (vs, i) = op2; + + emit_insn (gen_vec_init (reg, par)); + emit_insn (gen_sse5_vrotl3 (operands[0], operands[1], reg)); + DONE; + } +}) + +(define_expand "rotr3" + [(set (match_operand:SSEMODE1248 0 "register_operand" "") + (rotatert:SSEMODE1248 + (match_operand:SSEMODE1248 1 "nonimmediate_operand" "") + (match_operand:SI 2 "general_operand")))] + "TARGET_SSE5" +{ + /* If we were given a scalar, convert it to parallel */ + if (! 
const_0_to__operand (operands[2], SImode)) + { + rtvec vs = rtvec_alloc (); + rtx par = gen_rtx_PARALLEL (mode, vs); + rtx neg = gen_reg_rtx (mode); + rtx reg = gen_reg_rtx (mode); + rtx op2 = operands[2]; + int i; + + if (GET_MODE (op2) != mode) + { + op2 = gen_reg_rtx (mode); + convert_move (op2, operands[2], false); + } + + for (i = 0; i < ; i++) + RTVEC_ELT (vs, i) = op2; + + emit_insn (gen_vec_init (reg, par)); + emit_insn (gen_neg2 (neg, reg)); + emit_insn (gen_sse5_vrotl3 (operands[0], operands[1], neg)); + DONE; + } +}) + +(define_insn "sse5_rotl3" [(set (match_operand:SSEMODE1248 0 "register_operand" "=x") (rotate:SSEMODE1248 (match_operand:SSEMODE1248 1 "nonimmediate_operand" "xm") @@ -7811,26 +8147,106 @@ [(set_attr "type" "sseishft") (set_attr "mode" "TI")]) -(define_insn "sse5_rotl3" +(define_insn "sse5_rotr3" + [(set (match_operand:SSEMODE1248 0 "register_operand" "=x") + (rotatert:SSEMODE1248 + (match_operand:SSEMODE1248 1 "nonimmediate_operand" "xm") + (match_operand:SI 2 "const_0_to__operand" "n")))] + "TARGET_SSE5" +{ + operands[3] = GEN_INT (( * 8) - INTVAL (operands[2])); + return \"prot\t{%3, %1, %0|%0, %1, %3}\"; +} + [(set_attr "type" "sseishft") + (set_attr "mode" "TI")]) + +(define_expand "vrotr3" + [(match_operand:SSEMODE1248 0 "register_operand" "") + (match_operand:SSEMODE1248 1 "register_operand" "") + (match_operand:SSEMODE1248 2 "register_operand" "")] + "TARGET_SSE5" +{ + rtx reg = gen_reg_rtx (mode); + emit_insn (gen_neg2 (reg, operands[2])); + emit_insn (gen_sse5_vrotl3 (operands[0], operands[1], reg)); + DONE; +}) + +(define_expand "vrotl3" + [(match_operand:SSEMODE1248 0 "register_operand" "") + (match_operand:SSEMODE1248 1 "register_operand" "") + (match_operand:SSEMODE1248 2 "register_operand" "")] + "TARGET_SSE5" +{ + emit_insn (gen_sse5_vrotl3 (operands[0], operands[1], operands[2])); + DONE; +}) + +(define_insn "sse5_vrotl3" [(set (match_operand:SSEMODE1248 0 "register_operand" "=x,x") - (rotate:SSEMODE1248 - (match_operand:SSEMODE1248 1 "nonimmediate_operand" "x,xm") - (match_operand:SSEMODE1248 2 "nonimmediate_operand" "xm,x")))] + (if_then_else:SSEMODE1248 + (ge:SSEMODE1248 + (match_operand:SSEMODE1248 2 "nonimmediate_operand" "xm,x") + (const_int 0)) + (rotate:SSEMODE1248 + (match_operand:SSEMODE1248 1 "nonimmediate_operand" "x,xm") + (match_dup 2)) + (rotatert:SSEMODE1248 + (match_dup 1) + (neg:SSEMODE1248 (match_dup 2)))))] "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 3, true, 1)" "prot\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sseishft") (set_attr "mode" "TI")]) -;; SSE5 packed shift instructions. Note negative values for the shift amount -;; convert this into a right shift instead of left shift. For now, model this -;; with an UNSPEC instead of using ashift/lshift since the rest of the x86 does -;; not have the concept of negating the shift amount. Also, there is no LSHIFT +;; SSE5 packed shift instructions. 
+;; FIXME: add V2DI back in +(define_expand "vlshr3" + [(match_operand:SSEMODE124 0 "register_operand" "") + (match_operand:SSEMODE124 1 "register_operand" "") + (match_operand:SSEMODE124 2 "register_operand" "")] + "TARGET_SSE5" +{ + rtx neg = gen_reg_rtx (mode); + emit_insn (gen_neg2 (neg, operands[2])); + emit_insn (gen_sse5_lshl3 (operands[0], operands[1], neg)); + DONE; +}) + +(define_expand "vashr3" + [(match_operand:SSEMODE124 0 "register_operand" "") + (match_operand:SSEMODE124 1 "register_operand" "") + (match_operand:SSEMODE124 2 "register_operand" "")] + "TARGET_SSE5" +{ + rtx neg = gen_reg_rtx (mode); + emit_insn (gen_neg2 (neg, operands[2])); + emit_insn (gen_sse5_ashl3 (operands[0], operands[1], neg)); + DONE; +}) + +(define_expand "vashl3" + [(match_operand:SSEMODE124 0 "register_operand" "") + (match_operand:SSEMODE124 1 "register_operand" "") + (match_operand:SSEMODE124 2 "register_operand" "")] + "TARGET_SSE5" +{ + emit_insn (gen_sse5_ashl3 (operands[0], operands[1], operands[2])); + DONE; +}) + (define_insn "sse5_ashl3" [(set (match_operand:SSEMODE1248 0 "register_operand" "=x,x") - (unspec:SSEMODE1248 - [(match_operand:SSEMODE1248 1 "nonimmediate_operand" "x,xm") - (match_operand:SSEMODE1248 2 "nonimmediate_operand" "xm,x")] - UNSPEC_SSE5_ASHIFT))] + (if_then_else:SSEMODE1248 + (ge:SSEMODE1248 + (match_operand:SSEMODE1248 2 "nonimmediate_operand" "xm,x") + (const_int 0)) + (ashift:SSEMODE1248 + (match_operand:SSEMODE1248 1 "nonimmediate_operand" "x,xm") + (match_dup 2)) + (ashiftrt:SSEMODE1248 + (match_dup 1) + (neg:SSEMODE1248 (match_dup 2)))))] "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 3, true, 1)" "psha\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sseishft") @@ -7838,15 +8254,122 @@ (define_insn "sse5_lshl3" [(set (match_operand:SSEMODE1248 0 "register_operand" "=x,x") - (unspec:SSEMODE1248 - [(match_operand:SSEMODE1248 1 "nonimmediate_operand" "x,xm") - (match_operand:SSEMODE1248 2 "nonimmediate_operand" "xm,x")] - UNSPEC_SSE5_LSHIFT))] + (if_then_else:SSEMODE1248 + (ge:SSEMODE1248 + (match_operand:SSEMODE1248 2 "nonimmediate_operand" "xm,x") + (const_int 0)) + (ashift:SSEMODE1248 + (match_operand:SSEMODE1248 1 "nonimmediate_operand" "x,xm") + (match_dup 2)) + (lshiftrt:SSEMODE1248 + (match_dup 1) + (neg:SSEMODE1248 (match_dup 2)))))] "TARGET_SSE5 && ix86_sse5_valid_op_p (operands, insn, 3, true, 1)" "pshl\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sseishft") (set_attr "mode" "TI")]) +;; SSE2 doesn't have some shift varients, so define versions for SSE5 +(define_expand "ashlv16qi3" + [(match_operand:V16QI 0 "register_operand" "") + (match_operand:V16QI 1 "register_operand" "") + (match_operand:SI 2 "nonmemory_operand" "")] + "TARGET_SSE5" +{ + rtvec vs = rtvec_alloc (16); + rtx par = gen_rtx_PARALLEL (V16QImode, vs); + rtx reg = gen_reg_rtx (V16QImode); + int i; + for (i = 0; i < 16; i++) + RTVEC_ELT (vs, i) = operands[2]; + + emit_insn (gen_vec_initv16qi (reg, par)); + emit_insn (gen_sse5_ashlv16qi3 (operands[0], operands[1], reg)); + DONE; +}) + +(define_expand "lshlv16qi3" + [(match_operand:V16QI 0 "register_operand" "") + (match_operand:V16QI 1 "register_operand" "") + (match_operand:SI 2 "nonmemory_operand" "")] + "TARGET_SSE5" +{ + rtvec vs = rtvec_alloc (16); + rtx par = gen_rtx_PARALLEL (V16QImode, vs); + rtx reg = gen_reg_rtx (V16QImode); + int i; + for (i = 0; i < 16; i++) + RTVEC_ELT (vs, i) = operands[2]; + + emit_insn (gen_vec_initv16qi (reg, par)); + emit_insn (gen_sse5_lshlv16qi3 (operands[0], operands[1], reg)); + DONE; +}) + 
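The prot/psha/pshl patterns in this hunk encode the SSE5 rule that a negative per-element count shifts or rotates in the opposite direction, which is what lets the vlshr, vashr and vrotr expanders above work by simply negating the count vector and reusing the left-shift or left-rotate instruction.  A rough per-lane C model of the two shift forms (illustrative only; it assumes the count magnitude is smaller than the element width, and takes signed right shift as arithmetic, as GCC does):

#include <stdint.h>

/* One 32-bit lane of the SSE5 variable shifts, as described by the
   if_then_else RTL above: non-negative counts shift left, negative
   counts shift right by the negated count.  */
static int32_t
psha_lane (int32_t x, int32_t count)    /* arithmetic form (psha) */
{
  return count >= 0 ? (int32_t) ((uint32_t) x << count) : x >> -count;
}

static uint32_t
pshl_lane (uint32_t x, int32_t count)   /* logical form (pshl) */
{
  return count >= 0 ? x << count : x >> -count;
}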
+(define_expand "ashrv16qi3" + [(match_operand:V16QI 0 "register_operand" "") + (match_operand:V16QI 1 "register_operand" "") + (match_operand:SI 2 "nonmemory_operand" "")] + "TARGET_SSE5" +{ + rtvec vs = rtvec_alloc (16); + rtx par = gen_rtx_PARALLEL (V16QImode, vs); + rtx reg = gen_reg_rtx (V16QImode); + int i; + rtx ele = ((GET_CODE (operands[2]) == CONST_INT) + ? GEN_INT (- INTVAL (operands[2])) + : operands[2]); + + for (i = 0; i < 16; i++) + RTVEC_ELT (vs, i) = ele; + + emit_insn (gen_vec_initv16qi (reg, par)); + + if (GET_CODE (operands[2]) != CONST_INT) + { + rtx neg = gen_reg_rtx (V16QImode); + emit_insn (gen_negv16qi2 (neg, reg)); + emit_insn (gen_sse5_ashlv16qi3 (operands[0], operands[1], neg)); + } + else + emit_insn (gen_sse5_ashlv16qi3 (operands[0], operands[1], reg)); + + DONE; +}) + +(define_expand "ashrv2di3" + [(match_operand:V2DI 0 "register_operand" "") + (match_operand:V2DI 1 "register_operand" "") + (match_operand:DI 2 "nonmemory_operand" "")] + "TARGET_SSE5" +{ + rtvec vs = rtvec_alloc (2); + rtx par = gen_rtx_PARALLEL (V2DImode, vs); + rtx reg = gen_reg_rtx (V2DImode); + rtx ele; + + if (GET_CODE (operands[2]) == CONST_INT) + ele = GEN_INT (- INTVAL (operands[2])); + else if (GET_MODE (operands[2]) != DImode) + { + rtx move = gen_reg_rtx (DImode); + ele = gen_reg_rtx (DImode); + convert_move (move, operands[2], false); + emit_insn (gen_negdi2 (ele, move)); + } + else + { + ele = gen_reg_rtx (DImode); + emit_insn (gen_negdi2 (ele, operands[2])); + } + + RTVEC_ELT (vs, 0) = ele; + RTVEC_ELT (vs, 1) = ele; + emit_insn (gen_vec_initv2di (reg, par)); + emit_insn (gen_sse5_ashlv2di3 (operands[0], operands[1], reg)); + DONE; +}) + ;; SSE5 FRCZ support ;; parallel insns (define_insn "sse5_frcz2" diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 95c588ca070..f8ed145350e 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -575,7 +575,7 @@ /* Generate [-0.0, -0.0, -0.0, -0.0]. */ neg0 = gen_reg_rtx (V4SImode); emit_insn (gen_altivec_vspltisw (neg0, constm1_rtx)); - emit_insn (gen_ashlv4si3 (neg0, neg0, neg0)); + emit_insn (gen_vashlv4si3 (neg0, neg0, neg0)); /* Use the multiply-add. */ emit_insn (gen_altivec_vmaddfp (operands[0], operands[1], operands[2], @@ -634,7 +634,7 @@ high_product = gen_reg_rtx (V4SImode); emit_insn (gen_altivec_vmsumuhm (high_product, one, small_swap, zero)); - emit_insn (gen_ashlv4si3 (high_product, high_product, sixteen)); + emit_insn (gen_vashlv4si3 (high_product, high_product, sixteen)); emit_insn (gen_addv4si3 (operands[0], high_product, low_product)); @@ -1238,7 +1238,7 @@ "vslo %0,%1,%2" [(set_attr "type" "vecperm")]) -(define_insn "ashl3" +(define_insn "vashl3" [(set (match_operand:VI 0 "register_operand" "=v") (ashift:VI (match_operand:VI 1 "register_operand" "v") (match_operand:VI 2 "register_operand" "v") ))] @@ -1246,7 +1246,7 @@ "vsl %0,%1,%2" [(set_attr "type" "vecsimple")]) -(define_insn "lshr3" +(define_insn "vlshr3" [(set (match_operand:VI 0 "register_operand" "=v") (lshiftrt:VI (match_operand:VI 1 "register_operand" "v") (match_operand:VI 2 "register_operand" "v") ))] @@ -1254,7 +1254,7 @@ "vsr %0,%1,%2" [(set_attr "type" "vecsimple")]) -(define_insn "ashr3" +(define_insn "vashr3" [(set (match_operand:VI 0 "register_operand" "=v") (ashiftrt:VI (match_operand:VI 1 "register_operand" "v") (match_operand:VI 2 "register_operand" "v") ))] @@ -2640,7 +2640,7 @@ /* Generate [-0.0, -0.0, -0.0, -0.0]. 
*/ neg0 = gen_reg_rtx (V4SImode); emit_insn (gen_altivec_vspltisw (neg0, constm1_rtx)); - emit_insn (gen_ashlv4si3 (neg0, neg0, neg0)); + emit_insn (gen_vashlv4si3 (neg0, neg0, neg0)); /* XOR */ emit_insn (gen_xorv4sf3 (operands[0], diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 2e0031a67c1..076cf6cce04 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -7109,20 +7109,20 @@ static struct builtin_description bdesc_2arg[] = { MASK_ALTIVEC, CODE_FOR_altivec_vrlb, "__builtin_altivec_vrlb", ALTIVEC_BUILTIN_VRLB }, { MASK_ALTIVEC, CODE_FOR_altivec_vrlh, "__builtin_altivec_vrlh", ALTIVEC_BUILTIN_VRLH }, { MASK_ALTIVEC, CODE_FOR_altivec_vrlw, "__builtin_altivec_vrlw", ALTIVEC_BUILTIN_VRLW }, - { MASK_ALTIVEC, CODE_FOR_ashlv16qi3, "__builtin_altivec_vslb", ALTIVEC_BUILTIN_VSLB }, - { MASK_ALTIVEC, CODE_FOR_ashlv8hi3, "__builtin_altivec_vslh", ALTIVEC_BUILTIN_VSLH }, - { MASK_ALTIVEC, CODE_FOR_ashlv4si3, "__builtin_altivec_vslw", ALTIVEC_BUILTIN_VSLW }, + { MASK_ALTIVEC, CODE_FOR_vashlv16qi3, "__builtin_altivec_vslb", ALTIVEC_BUILTIN_VSLB }, + { MASK_ALTIVEC, CODE_FOR_vashlv8hi3, "__builtin_altivec_vslh", ALTIVEC_BUILTIN_VSLH }, + { MASK_ALTIVEC, CODE_FOR_vashlv4si3, "__builtin_altivec_vslw", ALTIVEC_BUILTIN_VSLW }, { MASK_ALTIVEC, CODE_FOR_altivec_vsl, "__builtin_altivec_vsl", ALTIVEC_BUILTIN_VSL }, { MASK_ALTIVEC, CODE_FOR_altivec_vslo, "__builtin_altivec_vslo", ALTIVEC_BUILTIN_VSLO }, { MASK_ALTIVEC, CODE_FOR_altivec_vspltb, "__builtin_altivec_vspltb", ALTIVEC_BUILTIN_VSPLTB }, { MASK_ALTIVEC, CODE_FOR_altivec_vsplth, "__builtin_altivec_vsplth", ALTIVEC_BUILTIN_VSPLTH }, { MASK_ALTIVEC, CODE_FOR_altivec_vspltw, "__builtin_altivec_vspltw", ALTIVEC_BUILTIN_VSPLTW }, - { MASK_ALTIVEC, CODE_FOR_lshrv16qi3, "__builtin_altivec_vsrb", ALTIVEC_BUILTIN_VSRB }, - { MASK_ALTIVEC, CODE_FOR_lshrv8hi3, "__builtin_altivec_vsrh", ALTIVEC_BUILTIN_VSRH }, - { MASK_ALTIVEC, CODE_FOR_lshrv4si3, "__builtin_altivec_vsrw", ALTIVEC_BUILTIN_VSRW }, - { MASK_ALTIVEC, CODE_FOR_ashrv16qi3, "__builtin_altivec_vsrab", ALTIVEC_BUILTIN_VSRAB }, - { MASK_ALTIVEC, CODE_FOR_ashrv8hi3, "__builtin_altivec_vsrah", ALTIVEC_BUILTIN_VSRAH }, - { MASK_ALTIVEC, CODE_FOR_ashrv4si3, "__builtin_altivec_vsraw", ALTIVEC_BUILTIN_VSRAW }, + { MASK_ALTIVEC, CODE_FOR_vlshrv16qi3, "__builtin_altivec_vsrb", ALTIVEC_BUILTIN_VSRB }, + { MASK_ALTIVEC, CODE_FOR_vlshrv8hi3, "__builtin_altivec_vsrh", ALTIVEC_BUILTIN_VSRH }, + { MASK_ALTIVEC, CODE_FOR_vlshrv4si3, "__builtin_altivec_vsrw", ALTIVEC_BUILTIN_VSRW }, + { MASK_ALTIVEC, CODE_FOR_vashrv16qi3, "__builtin_altivec_vsrab", ALTIVEC_BUILTIN_VSRAB }, + { MASK_ALTIVEC, CODE_FOR_vashrv8hi3, "__builtin_altivec_vsrah", ALTIVEC_BUILTIN_VSRAH }, + { MASK_ALTIVEC, CODE_FOR_vashrv4si3, "__builtin_altivec_vsraw", ALTIVEC_BUILTIN_VSRAW }, { MASK_ALTIVEC, CODE_FOR_altivec_vsr, "__builtin_altivec_vsr", ALTIVEC_BUILTIN_VSR }, { MASK_ALTIVEC, CODE_FOR_altivec_vsro, "__builtin_altivec_vsro", ALTIVEC_BUILTIN_VSRO }, { MASK_ALTIVEC, CODE_FOR_subv16qi3, "__builtin_altivec_vsububm", ALTIVEC_BUILTIN_VSUBUBM }, diff --git a/gcc/config/spu/spu-builtins.def b/gcc/config/spu/spu-builtins.def index 2073c85daa5..eecd337f73f 100644 --- a/gcc/config/spu/spu-builtins.def +++ b/gcc/config/spu/spu-builtins.def @@ -107,19 +107,19 @@ DEF_BUILTIN (SI_NOR, CODE_FOR_nor_v16qi, "si_nor", B_INSN, DEF_BUILTIN (SI_EQV, CODE_FOR_eqv_v16qi, "si_eqv", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD)) DEF_BUILTIN (SI_SELB, CODE_FOR_selb, "si_selb", B_INSN, _A4(SPU_BTI_QUADWORD, 
SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD)) DEF_BUILTIN (SI_SHUFB, CODE_FOR_shufb, "si_shufb", B_INSN, _A4(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD)) -DEF_BUILTIN (SI_SHLH, CODE_FOR_ashlv8hi3, "si_shlh", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD)) -DEF_BUILTIN (SI_SHLHI, CODE_FOR_ashlv8hi3, "si_shlhi", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_7)) -DEF_BUILTIN (SI_SHL, CODE_FOR_ashlv4si3, "si_shl", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD)) -DEF_BUILTIN (SI_SHLI, CODE_FOR_ashlv4si3, "si_shli", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_7)) +DEF_BUILTIN (SI_SHLH, CODE_FOR_vashlv8hi3, "si_shlh", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD)) +DEF_BUILTIN (SI_SHLHI, CODE_FOR_vashlv8hi3, "si_shlhi", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_7)) +DEF_BUILTIN (SI_SHL, CODE_FOR_vashlv4si3, "si_shl", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD)) +DEF_BUILTIN (SI_SHLI, CODE_FOR_vashlv4si3, "si_shli", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_7)) DEF_BUILTIN (SI_SHLQBI, CODE_FOR_shlqbi_ti, "si_shlqbi", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD)) DEF_BUILTIN (SI_SHLQBII, CODE_FOR_shlqbi_ti, "si_shlqbii", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_7)) DEF_BUILTIN (SI_SHLQBY, CODE_FOR_shlqby_ti, "si_shlqby", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD)) DEF_BUILTIN (SI_SHLQBYI, CODE_FOR_shlqby_ti, "si_shlqbyi", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_7)) DEF_BUILTIN (SI_SHLQBYBI, CODE_FOR_shlqbybi_ti, "si_shlqbybi", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD)) -DEF_BUILTIN (SI_ROTH, CODE_FOR_rotlv8hi3, "si_roth", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD)) -DEF_BUILTIN (SI_ROTHI, CODE_FOR_rotlv8hi3, "si_rothi", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_7)) -DEF_BUILTIN (SI_ROT, CODE_FOR_rotlv4si3, "si_rot", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD)) -DEF_BUILTIN (SI_ROTI, CODE_FOR_rotlv4si3, "si_roti", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_7)) +DEF_BUILTIN (SI_ROTH, CODE_FOR_vrotlv8hi3, "si_roth", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD)) +DEF_BUILTIN (SI_ROTHI, CODE_FOR_vrotlv8hi3, "si_rothi", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_7)) +DEF_BUILTIN (SI_ROT, CODE_FOR_vrotlv4si3, "si_rot", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD)) +DEF_BUILTIN (SI_ROTI, CODE_FOR_vrotlv4si3, "si_roti", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_7)) DEF_BUILTIN (SI_ROTQBY, CODE_FOR_rotqby_ti, "si_rotqby", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD)) DEF_BUILTIN (SI_ROTQBYI, CODE_FOR_rotqby_ti, "si_rotqbyi", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_7)) DEF_BUILTIN (SI_ROTQBYBI, CODE_FOR_rotqbybi_ti, "si_rotqbybi", B_INSN, _A3(SPU_BTI_QUADWORD, SPU_BTI_QUADWORD, SPU_BTI_QUADWORD)) @@ -536,14 +536,14 @@ DEF_BUILTIN (SPU_XOR_13, CODE_FOR_xorv8hi3, "spu_xor_13", DEF_BUILTIN (SPU_XOR_14, CODE_FOR_xorv4si3, "spu_xor_14", B_INTERNAL, _A3(SPU_BTI_UV4SI, SPU_BTI_UV4SI, SPU_BTI_UINTSI)) DEF_BUILTIN (SPU_XOR_15, CODE_FOR_xorv4si3, "spu_xor_15", B_INTERNAL, _A3(SPU_BTI_V4SI, SPU_BTI_V4SI, SPU_BTI_INTSI)) DEF_BUILTIN (SPU_RL, CODE_FOR_nothing, "spu_rl", B_OVERLOAD, _A1(SPU_BTI_VOID)) -DEF_BUILTIN (SPU_RL_0, CODE_FOR_rotlv8hi3, "spu_rl_0", B_INTERNAL, _A3(SPU_BTI_UV8HI, 
SPU_BTI_UV8HI, SPU_BTI_V8HI)) -DEF_BUILTIN (SPU_RL_1, CODE_FOR_rotlv8hi3, "spu_rl_1", B_INTERNAL, _A3(SPU_BTI_V8HI, SPU_BTI_V8HI, SPU_BTI_V8HI)) -DEF_BUILTIN (SPU_RL_2, CODE_FOR_rotlv4si3, "spu_rl_2", B_INTERNAL, _A3(SPU_BTI_UV4SI, SPU_BTI_UV4SI, SPU_BTI_V4SI)) -DEF_BUILTIN (SPU_RL_3, CODE_FOR_rotlv4si3, "spu_rl_3", B_INTERNAL, _A3(SPU_BTI_V4SI, SPU_BTI_V4SI, SPU_BTI_V4SI)) -DEF_BUILTIN (SPU_RL_4, CODE_FOR_rotlv8hi3, "spu_rl_4", B_INTERNAL, _A3(SPU_BTI_UV8HI, SPU_BTI_UV8HI, SPU_BTI_INTHI)) -DEF_BUILTIN (SPU_RL_5, CODE_FOR_rotlv8hi3, "spu_rl_5", B_INTERNAL, _A3(SPU_BTI_V8HI, SPU_BTI_V8HI, SPU_BTI_INTHI)) -DEF_BUILTIN (SPU_RL_6, CODE_FOR_rotlv4si3, "spu_rl_6", B_INTERNAL, _A3(SPU_BTI_UV4SI, SPU_BTI_UV4SI, SPU_BTI_INTSI)) -DEF_BUILTIN (SPU_RL_7, CODE_FOR_rotlv4si3, "spu_rl_7", B_INTERNAL, _A3(SPU_BTI_V4SI, SPU_BTI_V4SI, SPU_BTI_INTSI)) +DEF_BUILTIN (SPU_RL_0, CODE_FOR_vrotlv8hi3, "spu_rl_0", B_INTERNAL, _A3(SPU_BTI_UV8HI, SPU_BTI_UV8HI, SPU_BTI_V8HI)) +DEF_BUILTIN (SPU_RL_1, CODE_FOR_vrotlv8hi3, "spu_rl_1", B_INTERNAL, _A3(SPU_BTI_V8HI, SPU_BTI_V8HI, SPU_BTI_V8HI)) +DEF_BUILTIN (SPU_RL_2, CODE_FOR_vrotlv4si3, "spu_rl_2", B_INTERNAL, _A3(SPU_BTI_UV4SI, SPU_BTI_UV4SI, SPU_BTI_V4SI)) +DEF_BUILTIN (SPU_RL_3, CODE_FOR_vrotlv4si3, "spu_rl_3", B_INTERNAL, _A3(SPU_BTI_V4SI, SPU_BTI_V4SI, SPU_BTI_V4SI)) +DEF_BUILTIN (SPU_RL_4, CODE_FOR_vrotlv8hi3, "spu_rl_4", B_INTERNAL, _A3(SPU_BTI_UV8HI, SPU_BTI_UV8HI, SPU_BTI_INTHI)) +DEF_BUILTIN (SPU_RL_5, CODE_FOR_vrotlv8hi3, "spu_rl_5", B_INTERNAL, _A3(SPU_BTI_V8HI, SPU_BTI_V8HI, SPU_BTI_INTHI)) +DEF_BUILTIN (SPU_RL_6, CODE_FOR_vrotlv4si3, "spu_rl_6", B_INTERNAL, _A3(SPU_BTI_UV4SI, SPU_BTI_UV4SI, SPU_BTI_INTSI)) +DEF_BUILTIN (SPU_RL_7, CODE_FOR_vrotlv4si3, "spu_rl_7", B_INTERNAL, _A3(SPU_BTI_V4SI, SPU_BTI_V4SI, SPU_BTI_INTSI)) DEF_BUILTIN (SPU_RLQW, CODE_FOR_nothing, "spu_rlqw", B_OVERLOAD, _A1(SPU_BTI_VOID)) DEF_BUILTIN (SPU_RLQW_0, CODE_FOR_rotqbi_ti, "spu_rlqw_0", B_INTERNAL, _A3(SPU_BTI_UV16QI, SPU_BTI_UV16QI, SPU_BTI_INTSI)) DEF_BUILTIN (SPU_RLQW_1, CODE_FOR_rotqbi_ti, "spu_rlqw_1", B_INTERNAL, _A3(SPU_BTI_V16QI, SPU_BTI_V16QI, SPU_BTI_INTSI)) @@ -629,14 +629,14 @@ DEF_BUILTIN (SPU_RLMASKQWBYTEBC_7, CODE_FOR_rotqmbybi_ti, "spu_rlmaskqwbytebc_7 DEF_BUILTIN (SPU_RLMASKQWBYTEBC_8, CODE_FOR_rotqmbybi_ti, "spu_rlmaskqwbytebc_8", B_INTERNAL, _A3(SPU_BTI_V4SF, SPU_BTI_V4SF, SPU_BTI_INTSI)) DEF_BUILTIN (SPU_RLMASKQWBYTEBC_9, CODE_FOR_rotqmbybi_ti, "spu_rlmaskqwbytebc_9", B_INTERNAL, _A3(SPU_BTI_V2DF, SPU_BTI_V2DF, SPU_BTI_INTSI)) DEF_BUILTIN (SPU_SL, CODE_FOR_nothing, "spu_sl", B_OVERLOAD, _A1(SPU_BTI_VOID)) -DEF_BUILTIN (SPU_SL_0, CODE_FOR_ashlv8hi3, "spu_sl_0", B_INTERNAL, _A3(SPU_BTI_UV8HI, SPU_BTI_UV8HI, SPU_BTI_UV8HI)) -DEF_BUILTIN (SPU_SL_1, CODE_FOR_ashlv8hi3, "spu_sl_1", B_INTERNAL, _A3(SPU_BTI_V8HI, SPU_BTI_V8HI, SPU_BTI_UV8HI)) -DEF_BUILTIN (SPU_SL_2, CODE_FOR_ashlv4si3, "spu_sl_2", B_INTERNAL, _A3(SPU_BTI_UV4SI, SPU_BTI_UV4SI, SPU_BTI_UV4SI)) -DEF_BUILTIN (SPU_SL_3, CODE_FOR_ashlv4si3, "spu_sl_3", B_INTERNAL, _A3(SPU_BTI_V4SI, SPU_BTI_V4SI, SPU_BTI_UV4SI)) -DEF_BUILTIN (SPU_SL_4, CODE_FOR_ashlv8hi3, "spu_sl_4", B_INTERNAL, _A3(SPU_BTI_UV8HI, SPU_BTI_UV8HI, SPU_BTI_UINTSI)) -DEF_BUILTIN (SPU_SL_5, CODE_FOR_ashlv8hi3, "spu_sl_5", B_INTERNAL, _A3(SPU_BTI_V8HI, SPU_BTI_V8HI, SPU_BTI_UINTSI)) -DEF_BUILTIN (SPU_SL_6, CODE_FOR_ashlv4si3, "spu_sl_6", B_INTERNAL, _A3(SPU_BTI_UV4SI, SPU_BTI_UV4SI, SPU_BTI_UINTSI)) -DEF_BUILTIN (SPU_SL_7, CODE_FOR_ashlv4si3, "spu_sl_7", B_INTERNAL, _A3(SPU_BTI_V4SI, SPU_BTI_V4SI, SPU_BTI_UINTSI)) +DEF_BUILTIN (SPU_SL_0, 
CODE_FOR_vashlv8hi3, "spu_sl_0", B_INTERNAL, _A3(SPU_BTI_UV8HI, SPU_BTI_UV8HI, SPU_BTI_UV8HI)) +DEF_BUILTIN (SPU_SL_1, CODE_FOR_vashlv8hi3, "spu_sl_1", B_INTERNAL, _A3(SPU_BTI_V8HI, SPU_BTI_V8HI, SPU_BTI_UV8HI)) +DEF_BUILTIN (SPU_SL_2, CODE_FOR_vashlv4si3, "spu_sl_2", B_INTERNAL, _A3(SPU_BTI_UV4SI, SPU_BTI_UV4SI, SPU_BTI_UV4SI)) +DEF_BUILTIN (SPU_SL_3, CODE_FOR_vashlv4si3, "spu_sl_3", B_INTERNAL, _A3(SPU_BTI_V4SI, SPU_BTI_V4SI, SPU_BTI_UV4SI)) +DEF_BUILTIN (SPU_SL_4, CODE_FOR_vashlv8hi3, "spu_sl_4", B_INTERNAL, _A3(SPU_BTI_UV8HI, SPU_BTI_UV8HI, SPU_BTI_UINTSI)) +DEF_BUILTIN (SPU_SL_5, CODE_FOR_vashlv8hi3, "spu_sl_5", B_INTERNAL, _A3(SPU_BTI_V8HI, SPU_BTI_V8HI, SPU_BTI_UINTSI)) +DEF_BUILTIN (SPU_SL_6, CODE_FOR_vashlv4si3, "spu_sl_6", B_INTERNAL, _A3(SPU_BTI_UV4SI, SPU_BTI_UV4SI, SPU_BTI_UINTSI)) +DEF_BUILTIN (SPU_SL_7, CODE_FOR_vashlv4si3, "spu_sl_7", B_INTERNAL, _A3(SPU_BTI_V4SI, SPU_BTI_V4SI, SPU_BTI_UINTSI)) DEF_BUILTIN (SPU_SLQW, CODE_FOR_nothing, "spu_slqw", B_OVERLOAD, _A1(SPU_BTI_VOID)) DEF_BUILTIN (SPU_SLQW_0, CODE_FOR_shlqbi_ti, "spu_slqw_0", B_INTERNAL, _A3(SPU_BTI_V2DI, SPU_BTI_V2DI, SPU_BTI_UINTSI)) DEF_BUILTIN (SPU_SLQW_1, CODE_FOR_shlqbi_ti, "spu_slqw_1", B_INTERNAL, _A3(SPU_BTI_UV2DI, SPU_BTI_UV2DI, SPU_BTI_UINTSI)) diff --git a/gcc/config/spu/spu.c b/gcc/config/spu/spu.c index ad41994d5a5..692a8dae34f 100644 --- a/gcc/config/spu/spu.c +++ b/gcc/config/spu/spu.c @@ -4799,7 +4799,7 @@ spu_initialize_trampoline (rtx tramp, rtx fnaddr, rtx cxt) insnc = force_reg (V4SImode, array_to_constant (V4SImode, insna)); emit_insn (gen_shufb (shuf, fnaddr, cxt, shufc)); - emit_insn (gen_rotlv4si3 (rotl, shuf, spu_const (V4SImode, 7))); + emit_insn (gen_vrotlv4si3 (rotl, shuf, spu_const (V4SImode, 7))); emit_insn (gen_movv4si (mask, spu_const (V4SImode, 0xffff << 7))); emit_insn (gen_selb (insn, insnc, rotl, mask)); diff --git a/gcc/config/spu/spu.md b/gcc/config/spu/spu.md index e98fcf3f232..6985a683697 100644 --- a/gcc/config/spu/spu.md +++ b/gcc/config/spu/spu.md @@ -211,6 +211,9 @@ V8HI V4SI]) +(define_mode_attr v [(V8HI "v") (V4SI "v") + (HI "") (SI "")]) + (define_mode_attr bh [(QI "b") (V16QI "b") (HI "h") (V8HI "h") (SI "") (V4SI "")]) @@ -727,7 +730,7 @@ rtx op6_ti = gen_rtx_REG (TImode, REGNO (ops[6])); emit_insn (gen_clzv4si2 (ops[3],op1_v4si)); emit_move_insn (ops[6], spu_const (V4SImode, 1023+31)); - emit_insn (gen_ashlv4si3 (ops[4],op1_v4si,ops[3])); + emit_insn (gen_vashlv4si3 (ops[4],op1_v4si,ops[3])); emit_insn (gen_ceq_v4si (ops[5],ops[3],spu_const (V4SImode, 32))); emit_insn (gen_subv4si3 (ops[6],ops[6],ops[3])); emit_insn (gen_addv4si3 (ops[4],ops[4],ops[4])); @@ -822,7 +825,7 @@ rtx op4_df = gen_rtx_REG (DFmode, REGNO(ops[4])); rtx op5_df = gen_rtx_REG (DFmode, REGNO(ops[5])); emit_insn (gen_clzv4si2 (ops[4],op1_v4si)); - emit_insn (gen_ashlv4si3 (ops[5],op1_v4si,ops[4])); + emit_insn (gen_vashlv4si3 (ops[5],op1_v4si,ops[4])); emit_insn (gen_ceq_v4si (ops[6],ops[4],spu_const (V4SImode, 32))); emit_insn (gen_subv4si3 (ops[4],ops[3],ops[4])); emit_insn (gen_addv4si3 (ops[5],ops[5],ops[5])); @@ -1222,7 +1225,7 @@ emit_move_insn (mask, spu_const (V4SImode, 0x0000ffff)); emit_insn (gen_spu_mpyhh (high, operands[1], operands[2])); emit_insn (gen_spu_mpy (low, operands[1], operands[2])); - emit_insn (gen_ashlv4si3 (shift, high, spu_const(V4SImode, 16))); + emit_insn (gen_vashlv4si3 (shift, high, spu_const(V4SImode, 16))); emit_insn (gen_selb (result, shift, low, mask)); DONE; }") @@ -2100,9 +2103,9 @@ [(set_attr "type" "fxb")]) -;; ashl +;; ashl, vashl -(define_insn "ashl3" 
+(define_insn "ashl3" [(set (match_operand:VHSI 0 "spu_reg_operand" "=r,r") (ashift:VHSI (match_operand:VHSI 1 "spu_reg_operand" "r,r") (match_operand:VHSI 2 "spu_nonmem_operand" "r,W")))] @@ -2234,9 +2237,9 @@ [(set_attr "type" "shuf,shuf")]) -;; lshr +;; lshr, vlshr -(define_insn_and_split "lshr3" +(define_insn_and_split "lshr3" [(set (match_operand:VHSI 0 "spu_reg_operand" "=r,r") (lshiftrt:VHSI (match_operand:VHSI 1 "spu_reg_operand" "r,r") (match_operand:VHSI 2 "spu_nonmem_operand" "r,W"))) @@ -2363,9 +2366,9 @@ [(set_attr "type" "shuf")]) -;; ashr +;; ashr, vashr -(define_insn_and_split "ashr3" +(define_insn_and_split "ashr3" [(set (match_operand:VHSI 0 "spu_reg_operand" "=r,r") (ashiftrt:VHSI (match_operand:VHSI 1 "spu_reg_operand" "r,r") (match_operand:VHSI 2 "spu_nonmem_operand" "r,W"))) @@ -2430,7 +2433,7 @@ emit_insn (gen_lshrti3 (op0, op1, GEN_INT (32))); emit_insn (gen_spu_xswd (op0d, op0v)); if (val > 32) - emit_insn (gen_ashrv4si3 (op0v, op0v, spu_const (V4SImode, val - 32))); + emit_insn (gen_vashrv4si3 (op0v, op0v, spu_const (V4SImode, val - 32))); } else { @@ -2479,7 +2482,7 @@ rtx op1_v4si = spu_gen_subreg (V4SImode, operands[1]); rtx t = gen_reg_rtx (TImode); emit_insn (gen_subsi3 (sign_shift, GEN_INT (128), force_reg (SImode, operands[2]))); - emit_insn (gen_ashrv4si3 (sign_mask_v4si, op1_v4si, spu_const (V4SImode, 31))); + emit_insn (gen_vashrv4si3 (sign_mask_v4si, op1_v4si, spu_const (V4SImode, 31))); emit_insn (gen_fsm_ti (sign_mask, sign_mask)); emit_insn (gen_ashlti3 (sign_mask, sign_mask, sign_shift)); emit_insn (gen_lshrti3 (t, operands[1], operands[2])); @@ -2496,9 +2499,9 @@ [(set_attr "type" "shuf")]) -;; rotl +;; vrotl, rotl -(define_insn "rotl3" +(define_insn "rotl3" [(set (match_operand:VHSI 0 "spu_reg_operand" "=r,r") (rotate:VHSI (match_operand:VHSI 1 "spu_reg_operand" "r,r") (match_operand:VHSI 2 "spu_nonmem_operand" "r,W")))] @@ -3046,14 +3049,14 @@ selb\t%0,%5,%0,%3" emit_insn (gen_iorv4si3 (a_nan, a_nan, b_nan)); } emit_move_insn (zero, CONST0_RTX (V4SImode)); - emit_insn (gen_ashrv4si3 (asel, ra, spu_const (V4SImode, 31))); + emit_insn (gen_vashrv4si3 (asel, ra, spu_const (V4SImode, 31))); emit_insn (gen_shufb (asel, asel, asel, hi_promote)); emit_insn (gen_bg_v4si (abor, zero, a_abs)); emit_insn (gen_shufb (abor, abor, abor, borrow_shuffle)); emit_insn (gen_sfx_v4si (abor, zero, a_abs, abor)); emit_insn (gen_selb (abor, a_abs, abor, asel)); - emit_insn (gen_ashrv4si3 (bsel, rb, spu_const (V4SImode, 31))); + emit_insn (gen_vashrv4si3 (bsel, rb, spu_const (V4SImode, 31))); emit_insn (gen_shufb (bsel, bsel, bsel, hi_promote)); emit_insn (gen_bg_v4si (bbor, zero, b_abs)); emit_insn (gen_shufb (bbor, bbor, bbor, borrow_shuffle)); @@ -3154,13 +3157,13 @@ selb\t%0,%5,%0,%3" emit_insn (gen_shufb (b_nan, b_nan, b_nan, hi_promote)); emit_insn (gen_iorv4si3 (a_nan, a_nan, b_nan)); emit_move_insn (zero, CONST0_RTX (V4SImode)); - emit_insn (gen_ashrv4si3 (asel, ra, spu_const (V4SImode, 31))); + emit_insn (gen_vashrv4si3 (asel, ra, spu_const (V4SImode, 31))); emit_insn (gen_shufb (asel, asel, asel, hi_promote)); emit_insn (gen_bg_v4si (abor, zero, a_abs)); emit_insn (gen_shufb (abor, abor, abor, borrow_shuffle)); emit_insn (gen_sfx_v4si (abor, zero, a_abs, abor)); emit_insn (gen_selb (abor, a_abs, abor, asel)); - emit_insn (gen_ashrv4si3 (bsel, rb, spu_const (V4SImode, 31))); + emit_insn (gen_vashrv4si3 (bsel, rb, spu_const (V4SImode, 31))); emit_insn (gen_shufb (bsel, bsel, bsel, hi_promote)); emit_insn (gen_bg_v4si (bbor, zero, b_abs)); emit_insn (gen_shufb 
(bbor, bbor, bbor, borrow_shuffle)); @@ -3344,7 +3347,7 @@ selb\t%0,%4,%0,%3" 0x08090A0B, 0x08090A0B); emit_move_insn (hi_promote, pat); - emit_insn (gen_ashrv4si3 (sign, ra, spu_const (V4SImode, 31))); + emit_insn (gen_vashrv4si3 (sign, ra, spu_const (V4SImode, 31))); emit_insn (gen_shufb (sign, sign, sign, hi_promote)); emit_insn (gen_andv4si3 (abs, ra, sign_mask)); diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index a4117a34efb..a8e43ead2fd 100644 --- a/gcc/doc/md.texi +++ b/gcc/doc/md.texi @@ -3848,7 +3848,7 @@ operand 0 and operand 1; operand 2's mode is specified by the instruction pattern, and the compiler will convert the operand to that mode before generating the instruction. The meaning of out-of-range shift counts can optionally be specified by @code{TARGET_SHIFT_TRUNCATION_MASK}. -@xref{TARGET_SHIFT_TRUNCATION_MASK}. +@xref{TARGET_SHIFT_TRUNCATION_MASK}. Operand 2 is always a scalar type. @cindex @code{ashr@var{m}3} instruction pattern @cindex @code{lshr@var{m}3} instruction pattern @@ -3856,7 +3856,16 @@ counts can optionally be specified by @code{TARGET_SHIFT_TRUNCATION_MASK}. @cindex @code{rotr@var{m}3} instruction pattern @item @samp{ashr@var{m}3}, @samp{lshr@var{m}3}, @samp{rotl@var{m}3}, @samp{rotr@var{m}3} Other shift and rotate instructions, analogous to the -@code{ashl@var{m}3} instructions. +@code{ashl@var{m}3} instructions. Operand 2 is always a scalar type. + +@cindex @code{vashl@var{m}3} instruction pattern +@cindex @code{vashr@var{m}3} instruction pattern +@cindex @code{vlshr@var{m}3} instruction pattern +@cindex @code{vrotl@var{m}3} instruction pattern +@cindex @code{vrotr@var{m}3} instruction pattern +@item @samp{vashl@var{m}3}, @samp{vashr@var{m}3}, @samp{vlshr@var{m}3}, @samp{vrotl@var{m}3}, @samp{vrotr@var{m}3} +Vector shift and rotate instructions that take vectors as operand 2 +instead of a scalar type. @cindex @code{neg@var{m}2} instruction pattern @cindex @code{ssneg@var{m}2} instruction pattern diff --git a/gcc/expmed.c b/gcc/expmed.c index 5268b318f8d..ab5057a9e8f 100644 --- a/gcc/expmed.c +++ b/gcc/expmed.c @@ -2044,14 +2044,32 @@ expand_shift (enum tree_code code, enum machine_mode mode, rtx shifted, rtx op1, temp = 0; int left = (code == LSHIFT_EXPR || code == LROTATE_EXPR); int rotate = (code == LROTATE_EXPR || code == RROTATE_EXPR); + optab lshift_optab = ashl_optab; + optab rshift_arith_optab = ashr_optab; + optab rshift_uns_optab = lshr_optab; + optab lrotate_optab = rotl_optab; + optab rrotate_optab = rotr_optab; + enum machine_mode op1_mode; int try; + op1 = expand_normal (amount); + op1_mode = GET_MODE (op1); + + /* Determine whether the shift/rotate amount is a vector, or scalar. If the + shift amount is a vector, use the vector/vector shift patterns. */ + if (VECTOR_MODE_P (mode) && VECTOR_MODE_P (op1_mode)) + { + lshift_optab = vashl_optab; + rshift_arith_optab = vashr_optab; + rshift_uns_optab = vlshr_optab; + lrotate_optab = vrotl_optab; + rrotate_optab = vrotr_optab; + } + /* Previously detected shift-counts computed by NEGATE_EXPR and shifted in the other direction; but that does not work on all machines. */ - op1 = expand_normal (amount); - if (SHIFT_COUNT_TRUNCATED) { if (GET_CODE (op1) == CONST_INT @@ -2141,12 +2159,12 @@ expand_shift (enum tree_code code, enum machine_mode mode, rtx shifted, } temp = expand_binop (mode, - left ? rotl_optab : rotr_optab, + left ? lrotate_optab : rrotate_optab, shifted, op1, target, unsignedp, methods); } else if (unsignedp) temp = expand_binop (mode, - left ? ashl_optab : lshr_optab, + left ? 
lshift_optab : rshift_uns_optab, shifted, op1, target, unsignedp, methods); /* Do arithmetic shifts. @@ -2165,7 +2183,7 @@ expand_shift (enum tree_code code, enum machine_mode mode, rtx shifted, /* Arithmetic shift */ temp = expand_binop (mode, - left ? ashl_optab : ashr_optab, + left ? lshift_optab : rshift_arith_optab, shifted, op1, target, unsignedp, methods1); } diff --git a/gcc/expr.c b/gcc/expr.c index 0bf82382400..0a0f401e9c3 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -8691,7 +8691,8 @@ expand_expr_real_1 (tree exp, rtx target, enum machine_mode tmode, if (modifier == EXPAND_STACK_PARM) target = 0; temp = expand_unop (mode, - optab_for_tree_code (NEGATE_EXPR, type), + optab_for_tree_code (NEGATE_EXPR, type, + optab_default), op0, target, 0); gcc_assert (temp); return REDUCE_BIT_FIELD (temp); @@ -8730,7 +8731,7 @@ expand_expr_real_1 (tree exp, rtx target, enum machine_mode tmode, /* First try to do it with a special MIN or MAX instruction. If that does not win, use a conditional jump to select the proper value. */ - this_optab = optab_for_tree_code (code, type); + this_optab = optab_for_tree_code (code, type, optab_default); temp = expand_binop (mode, this_optab, op0, op1, target, unsignedp, OPTAB_WIDEN); if (temp != 0) @@ -8868,12 +8869,11 @@ expand_expr_real_1 (tree exp, rtx target, enum machine_mode tmode, case LROTATE_EXPR: case RROTATE_EXPR: - /* The expansion code only handles expansion of mode precision - rotates. */ - gcc_assert (GET_MODE_PRECISION (TYPE_MODE (type)) - == TYPE_PRECISION (type)); + gcc_assert (VECTOR_MODE_P (TYPE_MODE (type)) + || (GET_MODE_PRECISION (TYPE_MODE (type)) + == TYPE_PRECISION (type))); + /* fall through */ - /* Falltrough. */ case LSHIFT_EXPR: case RSHIFT_EXPR: /* If this is a fixed-point operation, then we cannot use the code @@ -9215,7 +9215,7 @@ expand_expr_real_1 (tree exp, rtx target, enum machine_mode tmode, tree oprnd2 = TREE_OPERAND (exp, 2); rtx op2; - this_optab = optab_for_tree_code (code, type); + this_optab = optab_for_tree_code (code, type, optab_default); expand_operands (oprnd0, oprnd1, NULL_RTX, &op0, &op1, EXPAND_NORMAL); op2 = expand_normal (oprnd2); temp = expand_ternary_op (mode, this_optab, op0, op1, op2, @@ -9254,7 +9254,7 @@ expand_expr_real_1 (tree exp, rtx target, enum machine_mode tmode, case REDUC_PLUS_EXPR: { op0 = expand_normal (TREE_OPERAND (exp, 0)); - this_optab = optab_for_tree_code (code, type); + this_optab = optab_for_tree_code (code, type, optab_default); temp = expand_unop (mode, this_optab, op0, target, unsignedp); gcc_assert (temp); return temp; @@ -9265,7 +9265,7 @@ expand_expr_real_1 (tree exp, rtx target, enum machine_mode tmode, { expand_operands (TREE_OPERAND (exp, 0), TREE_OPERAND (exp, 1), NULL_RTX, &op0, &op1, 0); - this_optab = optab_for_tree_code (code, type); + this_optab = optab_for_tree_code (code, type, optab_default); temp = expand_binop (mode, this_optab, op0, op1, target, unsignedp, OPTAB_WIDEN); gcc_assert (temp); @@ -9277,7 +9277,7 @@ expand_expr_real_1 (tree exp, rtx target, enum machine_mode tmode, { expand_operands (TREE_OPERAND (exp, 0), TREE_OPERAND (exp, 1), NULL_RTX, &op0, &op1, 0); - this_optab = optab_for_tree_code (code, type); + this_optab = optab_for_tree_code (code, type, optab_default); temp = expand_binop (mode, this_optab, op0, op1, target, unsignedp, OPTAB_WIDEN); gcc_assert (temp); @@ -9295,7 +9295,7 @@ expand_expr_real_1 (tree exp, rtx target, enum machine_mode tmode, case VEC_UNPACK_LO_EXPR: { op0 = expand_normal (TREE_OPERAND (exp, 0)); - this_optab = 
optab_for_tree_code (code, type); + this_optab = optab_for_tree_code (code, type, optab_default); temp = expand_widen_pattern_expr (exp, op0, NULL_RTX, NULL_RTX, target, unsignedp); gcc_assert (temp); @@ -9308,7 +9308,8 @@ expand_expr_real_1 (tree exp, rtx target, enum machine_mode tmode, op0 = expand_normal (TREE_OPERAND (exp, 0)); /* The signedness is determined from input operand. */ this_optab = optab_for_tree_code (code, - TREE_TYPE (TREE_OPERAND (exp, 0))); + TREE_TYPE (TREE_OPERAND (exp, 0)), + optab_default); temp = expand_widen_pattern_expr (exp, op0, NULL_RTX, NULL_RTX, target, TYPE_UNSIGNED (TREE_TYPE (TREE_OPERAND (exp, 0)))); @@ -9355,7 +9356,7 @@ expand_expr_real_1 (tree exp, rtx target, enum machine_mode tmode, expand_operands (TREE_OPERAND (exp, 0), TREE_OPERAND (exp, 1), subtarget, &op0, &op1, 0); binop2: - this_optab = optab_for_tree_code (code, type); + this_optab = optab_for_tree_code (code, type, optab_default); binop3: if (modifier == EXPAND_STACK_PARM) target = 0; diff --git a/gcc/genopinit.c b/gcc/genopinit.c index a497eb881b1..f5a18a3a0c0 100644 --- a/gcc/genopinit.c +++ b/gcc/genopinit.c @@ -130,6 +130,11 @@ static const char * const optabs[] = "optab_handler (lshr_optab, $A)->insn_code = CODE_FOR_$(lshr$a3$)", "optab_handler (rotl_optab, $A)->insn_code = CODE_FOR_$(rotl$a3$)", "optab_handler (rotr_optab, $A)->insn_code = CODE_FOR_$(rotr$a3$)", + "optab_handler (vashr_optab, $A)->insn_code = CODE_FOR_$(vashr$a3$)", + "optab_handler (vlshr_optab, $A)->insn_code = CODE_FOR_$(vlshr$a3$)", + "optab_handler (vashl_optab, $A)->insn_code = CODE_FOR_$(vashl$a3$)", + "optab_handler (vrotl_optab, $A)->insn_code = CODE_FOR_$(vrotl$a3$)", + "optab_handler (vrotr_optab, $A)->insn_code = CODE_FOR_$(vrotr$a3$)", "optab_handler (smin_optab, $A)->insn_code = CODE_FOR_$(smin$a3$)", "optab_handler (smax_optab, $A)->insn_code = CODE_FOR_$(smax$a3$)", "optab_handler (umin_optab, $A)->insn_code = CODE_FOR_$(umin$I$a3$)", diff --git a/gcc/optabs.c b/gcc/optabs.c index c13ccaeb460..f10b1705dd6 100644 --- a/gcc/optabs.c +++ b/gcc/optabs.c @@ -334,13 +334,13 @@ widen_operand (rtx op, enum machine_mode mode, enum machine_mode oldmode, return result; } -/* Return the optab used for computing the operation given by - the tree code, CODE. This function is not always usable (for - example, it cannot give complete results for multiplication - or division) but probably ought to be relied on more widely - throughout the expander. */ +/* Return the optab used for computing the operation given by the tree code, + CODE and the tree EXP. This function is not always usable (for example, it + cannot give complete results for multiplication or division) but probably + ought to be relied on more widely throughout the expander. */ optab -optab_for_tree_code (enum tree_code code, const_tree type) +optab_for_tree_code (enum tree_code code, const_tree type, + enum optab_subtype subtype) { bool trapv; switch (code) @@ -374,17 +374,45 @@ optab_for_tree_code (enum tree_code code, const_tree type) return TYPE_UNSIGNED (type) ? udiv_optab : sdiv_optab; case LSHIFT_EXPR: + if (VECTOR_MODE_P (TYPE_MODE (type))) + { + if (subtype == optab_vector) + return TYPE_SATURATING (type) ? NULL : vashl_optab; + + gcc_assert (subtype == optab_scalar); + } if (TYPE_SATURATING(type)) return TYPE_UNSIGNED(type) ? usashl_optab : ssashl_optab; return ashl_optab; case RSHIFT_EXPR: + if (VECTOR_MODE_P (TYPE_MODE (type))) + { + if (subtype == optab_vector) + return TYPE_UNSIGNED (type) ? 
vlshr_optab : vashr_optab; + + gcc_assert (subtype == optab_scalar); + } return TYPE_UNSIGNED (type) ? lshr_optab : ashr_optab; case LROTATE_EXPR: + if (VECTOR_MODE_P (TYPE_MODE (type))) + { + if (subtype == optab_vector) + return vrotl_optab; + + gcc_assert (subtype == optab_scalar); + } return rotl_optab; case RROTATE_EXPR: + if (VECTOR_MODE_P (TYPE_MODE (type))) + { + if (subtype == optab_vector) + return vrotr_optab; + + gcc_assert (subtype == optab_scalar); + } return rotr_optab; case MAX_EXPR: @@ -540,7 +568,7 @@ expand_widen_pattern_expr (tree exp, rtx op0, rtx op1, rtx wide_op, rtx target, oprnd0 = TREE_OPERAND (exp, 0); tmode0 = TYPE_MODE (TREE_TYPE (oprnd0)); widen_pattern_optab = - optab_for_tree_code (TREE_CODE (exp), TREE_TYPE (oprnd0)); + optab_for_tree_code (TREE_CODE (exp), TREE_TYPE (oprnd0), optab_default); icode = (int) optab_handler (widen_pattern_optab, tmode0)->insn_code; gcc_assert (icode != CODE_FOR_nothing); xmode0 = insn_data[icode].operand[1].mode; diff --git a/gcc/optabs.h b/gcc/optabs.h index 8e4f6e769c3..0b55c4fc8cc 100644 --- a/gcc/optabs.h +++ b/gcc/optabs.h @@ -167,6 +167,18 @@ enum optab_index OTI_rotl, /* Rotate right */ OTI_rotr, + + /* Arithmetic shift left of vector by vector */ + OTI_vashl, + /* Logical shift right of vector by vector */ + OTI_vlshr, + /* Arithmetic shift right of vector by vector */ + OTI_vashr, + /* Rotate left of vector by vector */ + OTI_vrotl, + /* Rotate right of vector by vector */ + OTI_vrotr, + /* Signed and floating-point minimum value */ OTI_smin, /* Signed and floating-point maximum value */ @@ -412,6 +424,11 @@ extern struct optab optab_table[OTI_MAX]; #define ashr_optab (&optab_table[OTI_ashr]) #define rotl_optab (&optab_table[OTI_rotl]) #define rotr_optab (&optab_table[OTI_rotr]) +#define vashl_optab (&optab_table[OTI_vashl]) +#define vlshr_optab (&optab_table[OTI_vlshr]) +#define vashr_optab (&optab_table[OTI_vashr]) +#define vrotl_optab (&optab_table[OTI_vrotl]) +#define vrotr_optab (&optab_table[OTI_vrotr]) #define smin_optab (&optab_table[OTI_smin]) #define smax_optab (&optab_table[OTI_smax]) #define umin_optab (&optab_table[OTI_umin]) @@ -714,6 +731,21 @@ extern void maybe_encapsulate_block (rtx, rtx, rtx); extern void emit_cmp_insn (rtx, rtx, enum rtx_code, rtx, enum machine_mode, int); +/* An extra flag to control optab_for_tree_code's behavior. This is needed to + distinguish between machines with a vector shift that takes a scalar for the + shift amount vs. machines that take a vector for the shift amount. */ +enum optab_subtype +{ + optab_default, + optab_scalar, + optab_vector +}; + +/* Return the optab used for computing the given operation on the type given by + the second argument. The third argument distinguishes between the types of + vector shifts and rotates */ +extern optab optab_for_tree_code (enum tree_code, const_tree, enum optab_subtype); + /* The various uses that a comparison can have; used by can_compare_p: jumps, conditional moves, store flag operations. */ enum can_compare_purpose @@ -723,10 +755,6 @@ enum can_compare_purpose ccp_store_flag }; -/* Return the optab used for computing the given operation on the type - given by the second argument. */ -extern optab optab_for_tree_code (enum tree_code, const_tree); - /* Nonzero if a compare of mode MODE can be done straightforwardly (without splitting it into pieces). 
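As a usage sketch (not part of the patch), the new third argument is meant to be probed rather than hard-coded: a caller that wants per-element shift counts asks for optab_vector and falls back to optab_scalar when the target lacks a vector/vector pattern, much like the SLP code updated later in this diff. The helper name below is hypothetical.

static optab
vector_shift_optab (enum tree_code code, tree vectype)
{
  /* Illustrative helper only; not added by this patch.  */
  enum machine_mode mode = TYPE_MODE (vectype);
  optab op;

  /* Prefer a vector shifted by a vector of per-element counts.  */
  op = optab_for_tree_code (code, vectype, optab_vector);
  if (op && optab_handler (op, mode)->insn_code != CODE_FOR_nothing)
    return op;

  /* Fall back to a vector shifted by a single scalar count.  */
  return optab_for_tree_code (code, vectype, optab_scalar);
}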
*/ extern int can_compare_p (enum rtx_code, enum machine_mode, diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 38c6ddd622e..ab790da0667 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,16 @@ +2008-05-14 Michael Meissner + Dwarakanath Rajagopal + + * gcc.target/i386/sse5-imul32widen-vector.c: New file to test x86 + SSE5 optimizations. + * gcc.target/i386/sse5-imul64-vector.c: Ditto. + * gcc.target/i386/sse5-rotate1-vector.c: Ditto. + * gcc.target/i386/sse5-rotate2-vector.c: Ditto. + * gcc.target/i386/sse5-rotate3-vector.c: Ditto. + * gcc.target/i386/sse5-shift1-vector.c: Ditto. + * gcc.target/i386/sse5-shift2-vector.c: Ditto. + * gcc.target/i386/sse5-shift3-vector.c: Ditto. + 2008-05-14 Michael Meissner PR target/36224 diff --git a/gcc/testsuite/gcc.target/i386/sse5-imul32widen-vector.c b/gcc/testsuite/gcc.target/i386/sse5-imul32widen-vector.c new file mode 100644 index 00000000000..ef29d081473 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse5-imul32widen-vector.c @@ -0,0 +1,36 @@ +/* Test that the compiler properly optimizes floating point multiply and add + instructions vector into pmacsdd/etc. on SSE5 systems. */ + +/* { dg-do compile } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-options "-O2 -msse5 -ftree-vectorize" } */ + +extern void exit (int); + +typedef long __m128i __attribute__ ((__vector_size__ (16), __may_alias__)); + +#define SIZE 10240 + +union { + __m128i i_align; + int i32[SIZE]; + long i64[SIZE]; +} a, b, c, d; + +void +imul32_to_64 (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.i64[i] = ((long)b.i32[i]) * ((long)c.i32[i]); +} + +int main () +{ + imul32_to_64 (); + exit (0); +} + +/* { dg-final { scan-assembler "pmacsdql" } } */ +/* { dg-final { scan-assembler "pmacsdqh" } } */ diff --git a/gcc/testsuite/gcc.target/i386/sse5-imul64-vector.c b/gcc/testsuite/gcc.target/i386/sse5-imul64-vector.c new file mode 100644 index 00000000000..06ad1d2e96e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse5-imul64-vector.c @@ -0,0 +1,36 @@ +/* Test that the compiler properly optimizes floating point multiply and add + instructions vector into pmacsdd/etc. on SSE5 systems. */ + +/* { dg-do compile } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-options "-O2 -msse5 -ftree-vectorize" } */ + +extern void exit (int); + +typedef long __m128i __attribute__ ((__vector_size__ (16), __may_alias__)); + +#define SIZE 10240 + +union { + __m128i i_align; + long i64[SIZE]; +} a, b, c, d; + +void +imul64 (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.i64[i] = b.i64[i] * c.i64[i]; +} + +int main () +{ + imul64 (); + exit (0); +} + +/* { dg-final { scan-assembler "pmacsdd" } } */ +/* { dg-final { scan-assembler "phadddq" } } */ +/* { dg-final { scan-assembler "pmacsdql" } } */ diff --git a/gcc/testsuite/gcc.target/i386/sse5-rotate1-vector.c b/gcc/testsuite/gcc.target/i386/sse5-rotate1-vector.c new file mode 100644 index 00000000000..0db9b9f790a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse5-rotate1-vector.c @@ -0,0 +1,35 @@ +/* Test that the compiler properly optimizes vector rotate instructions vector + into prot on SSE5 systems. 
*/ + +/* { dg-do compile } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-options "-O2 -msse5 -ftree-vectorize" } */ + +extern void exit (int); + +typedef long __m128i __attribute__ ((__vector_size__ (16), __may_alias__)); + +#define SIZE 10240 + +union { + __m128i i_align; + unsigned u32[SIZE]; +} a, b, c; + +void +left_rotate32 (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.u32[i] = (b.u32[i] << ((sizeof (int) * 8) - 4)) | (b.u32[i] >> 4); +} + +int +main () +{ + left_rotate32 (); + exit (0); +} + +/* { dg-final { scan-assembler "protd" } } */ diff --git a/gcc/testsuite/gcc.target/i386/sse5-rotate2-vector.c b/gcc/testsuite/gcc.target/i386/sse5-rotate2-vector.c new file mode 100644 index 00000000000..4ea762a208e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse5-rotate2-vector.c @@ -0,0 +1,35 @@ +/* Test that the compiler properly optimizes vector rotate instructions vector + into prot on SSE5 systems. */ + +/* { dg-do compile } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-options "-O2 -msse5 -ftree-vectorize" } */ + +extern void exit (int); + +typedef long __m128i __attribute__ ((__vector_size__ (16), __may_alias__)); + +#define SIZE 10240 + +union { + __m128i i_align; + unsigned u32[SIZE]; +} a, b, c; + +void +right_rotate32_b (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.u32[i] = (b.u32[i] >> ((sizeof (int) * 8) - 4)) | (b.u32[i] << 4); +} + +int +main () +{ + right_rotate (); + exit (0); +} + +/* { dg-final { scan-assembler "prot" } } */ diff --git a/gcc/testsuite/gcc.target/i386/sse5-rotate3-vector.c b/gcc/testsuite/gcc.target/i386/sse5-rotate3-vector.c new file mode 100644 index 00000000000..de7272439ab --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse5-rotate3-vector.c @@ -0,0 +1,34 @@ +/* Test that the compiler properly optimizes vector rotate instructions vector + into prot on SSE5 systems. */ + +/* { dg-do compile } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-options "-O2 -msse5 -ftree-vectorize" } */ + +extern void exit (int); + +typedef long __m128i __attribute__ ((__vector_size__ (16), __may_alias__)); + +#define SIZE 10240 + +union { + __m128i i_align; + unsigned u32[SIZE]; +} a, b, c; + +void +vector_rotate32 (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.u32[i] = (b.u32[i] >> ((sizeof (int) * 8) - c.u32[i])) | (b.u32[i] << c.u32[i]); +} + +int main () +{ + vector_rotate32 (); + exit (0); +} + +/* { dg-final { scan-assembler "protd" } } */ diff --git a/gcc/testsuite/gcc.target/i386/sse5-shift1-vector.c b/gcc/testsuite/gcc.target/i386/sse5-shift1-vector.c new file mode 100644 index 00000000000..c1ce023263c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse5-shift1-vector.c @@ -0,0 +1,35 @@ +/* Test that the compiler properly optimizes vector shift instructions into + psha/pshl on SSE5 systems. 
*/ + +/* { dg-do compile } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-options "-O2 -msse5 -ftree-vectorize" } */ + +extern void exit (int); + +typedef long __m128i __attribute__ ((__vector_size__ (16), __may_alias__)); + +#define SIZE 10240 + +union { + __m128i i_align; + int i32[SIZE]; + unsigned u32[SIZE]; +} a, b, c; + +void +left_shift32 (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.i32[i] = b.i32[i] << c.i32[i]; +} + +int main () +{ + left_shfit32 (); + exit (0); +} + +/* { dg-final { scan-assembler "pshad" } } */ diff --git a/gcc/testsuite/gcc.target/i386/sse5-shift2-vector.c b/gcc/testsuite/gcc.target/i386/sse5-shift2-vector.c new file mode 100644 index 00000000000..c0d97bc3d4a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse5-shift2-vector.c @@ -0,0 +1,35 @@ +/* Test that the compiler properly optimizes vector shift instructions into + psha/pshl on SSE5 systems. */ + +/* { dg-do compile } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-options "-O2 -msse5 -ftree-vectorize" } */ + +extern void exit (int); + +typedef long __m128i __attribute__ ((__vector_size__ (16), __may_alias__)); + +#define SIZE 10240 + +union { + __m128i i_align; + int i32[SIZE]; + unsigned u32[SIZE]; +} a, b, c; + +void +right_sign_shift32 (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.i32[i] = b.i32[i] >> c.i32[i]; +} + +int main () +{ + right_sign_shfit32 (); + exit (0); +} + +/* { dg-final { scan-assembler "pshad" } } */ diff --git a/gcc/testsuite/gcc.target/i386/sse5-shift3-vector.c b/gcc/testsuite/gcc.target/i386/sse5-shift3-vector.c new file mode 100644 index 00000000000..0027457e384 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse5-shift3-vector.c @@ -0,0 +1,35 @@ +/* Test that the compiler properly optimizes vector shift instructions into + psha/pshl on SSE5 systems. */ + +/* { dg-do compile } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-options "-O2 -msse5 -ftree-vectorize" } */ + +extern void exit (int); + +typedef long __m128i __attribute__ ((__vector_size__ (16), __may_alias__)); + +#define SIZE 10240 + +union { + __m128i i_align; + int i32[SIZE]; + unsigned u32[SIZE]; +} a, b, c; + +void +right_uns_shift32 (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.u32[i] = b.u32[i] >> c.i32[i]; +} + +int main () +{ + right_uns_shfit32 (); + exit (0); +} + +/* { dg-final { scan-assembler "pshld" } } */ diff --git a/gcc/tree-vect-analyze.c b/gcc/tree-vect-analyze.c index 1e2012f85d1..66d83a5c328 100644 --- a/gcc/tree-vect-analyze.c +++ b/gcc/tree-vect-analyze.c @@ -2706,29 +2706,44 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node, /* Shift arguments should be equal in all the packed stmts for a vector shift with scalar shift operand. 
*/ - if (TREE_CODE (rhs) == LSHIFT_EXPR || TREE_CODE (rhs) == RSHIFT_EXPR) + if (TREE_CODE (rhs) == LSHIFT_EXPR || TREE_CODE (rhs) == RSHIFT_EXPR + || TREE_CODE (rhs) == LROTATE_EXPR + || TREE_CODE (rhs) == RROTATE_EXPR) { vec_mode = TYPE_MODE (vectype); - optab = optab_for_tree_code (TREE_CODE (rhs), vectype); - if (!optab) - { - if (vect_print_dump_info (REPORT_SLP)) - fprintf (vect_dump, "Build SLP failed: no optab."); - return false; - } - icode = (int) optab->handlers[(int) vec_mode].insn_code; - if (icode == CODE_FOR_nothing) - { - if (vect_print_dump_info (REPORT_SLP)) - fprintf (vect_dump, - "Build SLP failed: op not supported by target."); - return false; - } - optab_op2_mode = insn_data[icode].operand[2].mode; - if (!VECTOR_MODE_P (optab_op2_mode)) + + /* First see if we have a vector/vector shift. */ + optab = optab_for_tree_code (TREE_CODE (rhs), vectype, + optab_vector); + + if (!optab + || (optab->handlers[(int) vec_mode].insn_code + == CODE_FOR_nothing)) { - need_same_oprnds = true; - first_op1 = TREE_OPERAND (rhs, 1); + /* No vector/vector shift, try for a vector/scalar shift. */ + optab = optab_for_tree_code (TREE_CODE (rhs), vectype, + optab_scalar); + + if (!optab) + { + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, "Build SLP failed: no optab."); + return false; + } + icode = (int) optab->handlers[(int) vec_mode].insn_code; + if (icode == CODE_FOR_nothing) + { + if (vect_print_dump_info (REPORT_SLP)) + fprintf (vect_dump, + "Build SLP failed: op not supported by target."); + return false; + } + optab_op2_mode = insn_data[icode].operand[2].mode; + if (!VECTOR_MODE_P (optab_op2_mode)) + { + need_same_oprnds = true; + first_op1 = TREE_OPERAND (rhs, 1); + } } } } diff --git a/gcc/tree-vect-generic.c b/gcc/tree-vect-generic.c index a4026d6acf5..8313e54bdbc 100644 --- a/gcc/tree-vect-generic.c +++ b/gcc/tree-vect-generic.c @@ -437,7 +437,28 @@ expand_vector_operations_1 (block_stmt_iterator *bsi) || code == VEC_UNPACK_FLOAT_LO_EXPR) type = TREE_TYPE (TREE_OPERAND (rhs, 0)); - op = optab_for_tree_code (code, type); + /* Choose between vector shift/rotate by vector and vector shift/rotate by + scalar */ + if (code == LSHIFT_EXPR || code == RSHIFT_EXPR || code == LROTATE_EXPR + || code == RROTATE_EXPR) + { + /* If the 2nd argument is vector, we need a vector/vector shift */ + if (VECTOR_MODE_P (TYPE_MODE (TREE_TYPE (TREE_OPERAND (rhs, 1))))) + op = optab_for_tree_code (code, type, optab_vector); + + else + { + /* Try for a vector/scalar shift, and if we don't have one, see if we + have a vector/vector shift */ + op = optab_for_tree_code (code, type, optab_scalar); + if (!op + || (op->handlers[(int) TYPE_MODE (type)].insn_code + == CODE_FOR_nothing)) + op = optab_for_tree_code (code, type, optab_vector); + } + } + else + op = optab_for_tree_code (code, type, optab_default); /* For widening/narrowing vector operations, the relevant type is of the arguments, not the widened result. VEC_UNPACK_FLOAT_*_EXPR is @@ -458,7 +479,7 @@ expand_vector_operations_1 (block_stmt_iterator *bsi) if (op == NULL && code == NEGATE_EXPR && INTEGRAL_TYPE_P (TREE_TYPE (type))) - op = optab_for_tree_code (MINUS_EXPR, type); + op = optab_for_tree_code (MINUS_EXPR, type, optab_default); /* For very wide vectors, try using a smaller vector mode. 
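A small source-level illustration (not taken from the patch) of why the SLP check above now probes the vector/vector optab first: the packed statements below use different constant shift counts, so they would fail the need_same_oprnds requirement imposed by a vector/scalar shift, but a target providing the new vector/vector pattern can still SLP them by packing the counts into a vector operand. The function is hypothetical.

void
slp_shift_group (int *a, const int *b)
{
  /* Illustrative only: four packed statements with distinct constant
     counts.  With a vector/scalar shift all counts would have to match;
     a vector/vector shift can take the counts {1, 2, 3, 4} as a vector.  */
  a[0] = b[0] << 1;
  a[1] = b[1] << 2;
  a[2] = b[2] << 3;
  a[3] = b[3] << 4;
}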
*/ compute_type = type; diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c index c1ce14c4bbb..b9a302d8430 100644 --- a/gcc/tree-vect-patterns.c +++ b/gcc/tree-vect-patterns.c @@ -685,7 +685,8 @@ vect_pattern_recog_1 ( if (!pattern_vectype) return; - optab = optab_for_tree_code (TREE_CODE (pattern_expr), pattern_vectype); + optab = optab_for_tree_code (TREE_CODE (pattern_expr), pattern_vectype, + optab_default); vec_mode = TYPE_MODE (pattern_vectype); if (!optab || (icode = optab_handler (optab, vec_mode)->insn_code) == diff --git a/gcc/tree-vect-transform.c b/gcc/tree-vect-transform.c index cc489811006..82a4c84893a 100644 --- a/gcc/tree-vect-transform.c +++ b/gcc/tree-vect-transform.c @@ -502,7 +502,7 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code, int element_bitsize = tree_low_cst (bitsize, 1); int nelements = vec_size_in_bits / element_bitsize; - optab = optab_for_tree_code (code, vectype); + optab = optab_for_tree_code (code, vectype, optab_default); /* We have a whole vector shift available. */ if (VECTOR_MODE_P (mode) @@ -2458,7 +2458,7 @@ vect_create_epilog_for_reduction (tree vect_def, tree stmt, have_whole_vector_shift = false; else { - optab optab = optab_for_tree_code (code, vectype); + optab optab = optab_for_tree_code (code, vectype, optab_default); if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing) have_whole_vector_shift = false; } @@ -2818,7 +2818,7 @@ vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) /* 4. Supportable by target? */ /* 4.1. check support for the operation in the loop */ - optab = optab_for_tree_code (code, vectype); + optab = optab_for_tree_code (code, vectype, optab_default); if (!optab) { if (vect_print_dump_info (REPORT_DETAILS)) @@ -2909,7 +2909,7 @@ vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code)) return false; - reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype); + reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype, optab_default); if (!reduc_optab) { if (vect_print_dump_info (REPORT_DETAILS)) @@ -3852,6 +3852,7 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt, VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL; tree vop0, vop1; unsigned int k; + bool shift_p = false; bool scalar_shift_arg = false; /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies @@ -3896,8 +3897,6 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt, if (code == POINTER_PLUS_EXPR) code = PLUS_EXPR; - optab = optab_for_tree_code (code, vectype); - /* Support only unary or binary operations. */ op_type = TREE_OPERAND_LENGTH (operation); if (op_type != unary_op && op_type != binary_op) @@ -3926,6 +3925,56 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt, } } + /* If this is a shift/rotate, determine whether the shift amount is a vector, + or scalar. If the shift/rotate amount is a vector, use the vector/vector + shift optabs. 
*/ + if (code == LSHIFT_EXPR || code == RSHIFT_EXPR || code == LROTATE_EXPR + || code == RROTATE_EXPR) + { + shift_p = true; + + /* vector shifted by vector */ + if (dt[1] == vect_loop_def) + { + optab = optab_for_tree_code (code, vectype, optab_vector); + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "vector/vector shift/rotate found."); + } + + /* See if the machine has a vector shifted by scalar insn and if not + then see if it has a vector shifted by vector insn */ + else if (dt[1] == vect_constant_def || dt[1] == vect_invariant_def) + { + optab = optab_for_tree_code (code, vectype, optab_scalar); + if (optab + && (optab_handler (optab, TYPE_MODE (vectype))->insn_code + != CODE_FOR_nothing)) + { + scalar_shift_arg = true; + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "vector/scalar shift/rotate found."); + } + else + { + optab = optab_for_tree_code (code, vectype, optab_vector); + if (vect_print_dump_info (REPORT_DETAILS) + && optab + && (optab_handler (optab, TYPE_MODE (vectype))->insn_code + != CODE_FOR_nothing)) + fprintf (vect_dump, "vector/vector shift/rotate found."); + } + } + + else + { + if (vect_print_dump_info (REPORT_DETAILS)) + fprintf (vect_dump, "operand mode requires invariant argument."); + return false; + } + } + else + optab = optab_for_tree_code (code, vectype, optab_default); + /* Supportable by target? */ if (!optab) { @@ -3960,29 +4009,6 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt, return false; } - if (code == LSHIFT_EXPR || code == RSHIFT_EXPR) - { - /* FORNOW: not yet supported. */ - if (!VECTOR_MODE_P (vec_mode)) - return false; - - /* Invariant argument is needed for a vector shift - by a scalar shift operand. */ - optab_op2_mode = insn_data[icode].operand[2].mode; - if (!VECTOR_MODE_P (optab_op2_mode)) - { - if (dt[1] != vect_constant_def && dt[1] != vect_invariant_def) - { - if (vect_print_dump_info (REPORT_DETAILS)) - fprintf (vect_dump, "operand mode requires invariant" - " argument."); - return false; - } - - scalar_shift_arg = true; - } - } - if (!vec_stmt) /* transformation not required. */ { STMT_VINFO_TYPE (stmt_info) = op_vec_info_type; @@ -4075,8 +4101,7 @@ vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt, /* Handle uses. */ if (j == 0) { - if (op_type == binary_op - && (code == LSHIFT_EXPR || code == RSHIFT_EXPR)) + if (op_type == binary_op && scalar_shift_arg) { /* Vector shl and shr insn patterns can be defined with scalar operand 2 (shift operand). In this case, use constant or loop @@ -4495,9 +4520,9 @@ vect_strided_store_supported (tree vectype) /* Check that the operation is supported. 
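To make the cases distinguished in vectorizable_operation above concrete, here is an illustrative pair of loops (not part of the patch; compare the new gcc.target/i386/sse5-shift*-vector.c tests). In the first loop the shift amount is loop-invariant, so the vector/scalar optab is tried first; in the second loop each element is shifted by a different amount, which only the new vector/vector optabs can express. The names and array size are arbitrary.

#define N 1024
int dst[N], src[N], cnt[N];

void
shift_by_invariant (int amount)
{
  int i;
  /* The count is invariant (dt[1] == vect_invariant_def): a vector shifted
     by a scalar is tried first, with vector/vector as the fallback.  */
  for (i = 0; i < N; i++)
    dst[i] = src[i] << amount;
}

void
shift_by_varying (void)
{
  int i;
  /* The count varies per element (dt[1] == vect_loop_def): only a
     vector/vector shift pattern can vectorize this loop.  */
  for (i = 0; i < N; i++)
    dst[i] = src[i] << cnt[i];
}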
*/ interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR, - vectype); + vectype, optab_default); interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR, - vectype); + vectype, optab_default); if (!interleave_high_optab || !interleave_low_optab) { if (vect_print_dump_info (REPORT_DETAILS)) @@ -5238,7 +5263,8 @@ vect_strided_load_supported (tree vectype) mode = (int) TYPE_MODE (vectype); - perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype); + perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype, + optab_default); if (!perm_even_optab) { if (vect_print_dump_info (REPORT_DETAILS)) @@ -5253,7 +5279,8 @@ vect_strided_load_supported (tree vectype) return false; } - perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype); + perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype, + optab_default); if (!perm_odd_optab) { if (vect_print_dump_info (REPORT_DETAILS)) diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c index bfb41a67675..79a7461a1a9 100644 --- a/gcc/tree-vectorizer.c +++ b/gcc/tree-vectorizer.c @@ -2211,13 +2211,13 @@ supportable_widening_operation (enum tree_code code, tree stmt, tree vectype, if (code == FIX_TRUNC_EXPR) { /* The signedness is determined from output operand. */ - optab1 = optab_for_tree_code (c1, type); - optab2 = optab_for_tree_code (c2, type); + optab1 = optab_for_tree_code (c1, type, optab_default); + optab2 = optab_for_tree_code (c2, type, optab_default); } else { - optab1 = optab_for_tree_code (c1, vectype); - optab2 = optab_for_tree_code (c2, vectype); + optab1 = optab_for_tree_code (c1, vectype, optab_default); + optab2 = optab_for_tree_code (c2, vectype, optab_default); } if (!optab1 || !optab2) @@ -2285,9 +2285,9 @@ supportable_narrowing_operation (enum tree_code code, if (code == FIX_TRUNC_EXPR) /* The signedness is determined from output operand. */ - optab1 = optab_for_tree_code (c1, type); + optab1 = optab_for_tree_code (c1, type, optab_default); else - optab1 = optab_for_tree_code (c1, vectype); + optab1 = optab_for_tree_code (c1, vectype, optab_default); if (!optab1) return false; -- 2.30.2