return true;
}
+/* Expand a vector operation shift by constant for a V*QImode in terms of the
+ same operation on V*HImode. Return true if success. */
+bool
+ix86_expand_vec_shift_qihi_constant (enum rtx_code code, rtx dest, rtx op1, rtx op2)
+{
+ machine_mode qimode, himode;
+ unsigned int and_constant, xor_constant;
+ HOST_WIDE_INT shift_amount;
+ rtx vec_const_and, vec_const_xor;
+ rtx tmp, op1_subreg;
+ rtx (*gen_shift) (rtx, rtx, rtx);
+ rtx (*gen_and) (rtx, rtx, rtx);
+ rtx (*gen_xor) (rtx, rtx, rtx);
+ rtx (*gen_sub) (rtx, rtx, rtx);
+
+ /* Only optimize shift by constant. */
+ if (!CONST_INT_P (op2))
+ return false;
+
+ qimode = GET_MODE (dest);
+ shift_amount = INTVAL (op2);
+ /* Do nothing when shift amount greater equal 8. */
+ if (shift_amount > 7)
+ return false;
+
+ gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
+ /* Record sign bit. */
+ xor_constant = 1 << (8 - shift_amount - 1);
+
+ /* Zero upper/lower bits shift from left/right element. */
+ and_constant
+ = (code == ASHIFT ? 256 - (1 << shift_amount)
+ : (1 << (8 - shift_amount)) - 1);
+
+ switch (qimode)
+ {
+ case V16QImode:
+ himode = V8HImode;
+ gen_shift =
+ ((code == ASHIFT)
+ ? gen_ashlv8hi3
+ : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
+ gen_and = gen_andv16qi3;
+ gen_xor = gen_xorv16qi3;
+ gen_sub = gen_subv16qi3;
+ break;
+ case V32QImode:
+ himode = V16HImode;
+ gen_shift =
+ ((code == ASHIFT)
+ ? gen_ashlv16hi3
+ : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
+ gen_and = gen_andv32qi3;
+ gen_xor = gen_xorv32qi3;
+ gen_sub = gen_subv32qi3;
+ break;
+ case V64QImode:
+ himode = V32HImode;
+ gen_shift =
+ ((code == ASHIFT)
+ ? gen_ashlv32hi3
+ : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
+ gen_and = gen_andv64qi3;
+ gen_xor = gen_xorv64qi3;
+ gen_sub = gen_subv64qi3;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ tmp = gen_reg_rtx (himode);
+ vec_const_and = gen_reg_rtx (qimode);
+ op1_subreg = lowpart_subreg (himode, op1, qimode);
+
+ /* For ASHIFT and LSHIFTRT, perform operation like
+ vpsllw/vpsrlw $shift_amount, %op1, %dest.
+ vpand %vec_const_and, %dest. */
+ emit_insn (gen_shift (tmp, op1_subreg, op2));
+ emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
+ emit_move_insn (vec_const_and,
+ ix86_build_const_vector (qimode, true,
+ GEN_INT (and_constant)));
+ emit_insn (gen_and (dest, dest, vec_const_and));
+
+ /* For ASHIFTRT, perform extra operation like
+ vpxor %vec_const_xor, %dest, %dest
+ vpsubb %vec_const_xor, %dest, %dest */
+ if (code == ASHIFTRT)
+ {
+ vec_const_xor = gen_reg_rtx (qimode);
+ emit_move_insn (vec_const_xor,
+ ix86_build_const_vector (qimode, true,
+ GEN_INT (xor_constant)));
+ emit_insn (gen_xor (dest, dest, vec_const_xor));
+ emit_insn (gen_sub (dest, dest, vec_const_xor));
+ }
+ return true;
+}
+
/* Expand a vector operation CODE for a V*QImode in terms of the
same operation on V*HImode. */
extern bool ix86_expand_vecmul_qihi (rtx, rtx, rtx);
extern void ix86_expand_vecop_qihi (enum rtx_code, rtx, rtx, rtx);
+extern bool ix86_expand_vec_shift_qihi_constant (enum rtx_code, rtx, rtx, rtx);
extern rtx ix86_split_stack_guard (void);
gen = (<CODE> == LSHIFTRT ? gen_xop_shlv16qi3 : gen_xop_shav16qi3);
emit_insn (gen (operands[0], operands[1], tmp));
}
- else
+ else if (!ix86_expand_vec_shift_qihi_constant (<CODE>, operands[0],
+ operands[1], operands[2]))
ix86_expand_vecop_qihi (<CODE>, operands[0], operands[1], operands[2]);
DONE;
})
--- /dev/null
+/* PR target/95524 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2" } */
+/* { dg-final { scan-assembler-times "vpand\[^\n\]*%ymm" 3 } } */
+typedef char v32qi __attribute__ ((vector_size (32)));
+typedef unsigned char v32uqi __attribute__ ((vector_size (32)));
+
+__attribute__((noipa)) v32qi
+foo_ashiftrt_256 (v32qi a)
+{
+ return a >> 2;
+}
+/* { dg-final { scan-assembler-times "vpsraw\[^\n\]*%ymm" 1 } } */
+/* { dg-final { scan-assembler-times "vpxor\[^\n\]*%ymm" 1 } } */
+/* { dg-final { scan-assembler-times "vpsubb\[^\n\]*%ymm" 1 } } */
+
+__attribute__((noipa)) v32qi
+foo_ashift_256 (v32qi a)
+{
+ return a << 7;
+}
+
+/* { dg-final { scan-assembler-times "vpsllw\[^\n\]*%ymm" 1 } } */
+
+__attribute__((noipa)) v32uqi
+foo_lshiftrt_256 (v32uqi a)
+{
+ return a >> 2;
+}
+
+/* { dg-final { scan-assembler-times "vpsrlw\[^\n\]*%ymm" 1 } } */
--- /dev/null
+/* PR target/95524 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512bw -Wno-shift-count-overflow" } */
+
+#ifndef CHECK
+#define CHECK "avx512bw-check.h"
+#endif
+
+#include CHECK
+
+#ifndef TEST
+#define TEST avx512bw_test
+#endif
+
+typedef char v64qi __attribute__ ((vector_size (64)));
+typedef unsigned char v64uqi __attribute__ ((vector_size (64)));
+
+#define TEST_SHIFT(N) \
+ do \
+ { \
+ int i; \
+ for (i = 0; i < 64; i++) \
+ exp1.a[i] = op1.a[i] << N; \
+ res1.x = (__m512i) (((v64qi) op1.x) << N); \
+ if (check_union512i_b (res1, exp1.a)) \
+ abort (); \
+ \
+ for (i = 0; i < 64; i++) \
+ exp1.a[i] = op1.a[i] >> N; \
+ res1.x = (__m512i) (((v64qi) op1.x) >> N); \
+ if (check_union512i_b (res1, exp1.a)) \
+ abort (); \
+ \
+ for (i = 0; i < 64; i++) \
+ exp2.a[i] = op2.a[i] >> N; \
+ res2.x = (__m512i) (((v64uqi) op2.x >> N)); \
+ if (check_union512i_ub (res2, exp2.a)) \
+ abort (); \
+ } \
+ while (0)
+
+static void
+TEST (void)
+{
+ union512i_b op1, exp1, res1;
+ union512i_ub op2, exp2, res2;
+ for (int i = 0; i != 64; i++)
+ {
+ op2.a[i] = i * i;
+ op1.a[i] = i * i + 200 * i;
+ }
+ TEST_SHIFT (0);
+ TEST_SHIFT (1);
+ TEST_SHIFT (2);
+ TEST_SHIFT (3);
+ TEST_SHIFT (4);
+ TEST_SHIFT (5);
+ TEST_SHIFT (6);
+ TEST_SHIFT (7);
+ TEST_SHIFT (8);
+}
+
--- /dev/null
+/* PR target/95524 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bw" } */
+/* { dg-final { scan-assembler-times "vpand\[^\n\]*%zmm" 3 } } */
+typedef char v64qi __attribute__ ((vector_size (64)));
+typedef unsigned char v64uqi __attribute__ ((vector_size (64)));
+
+__attribute__((noipa)) v64qi
+foo_ashiftrt_512 (v64qi a)
+{
+ return a >> 2;
+}
+/* { dg-final { scan-assembler-times "vpsraw\[^\n\]*%zmm" 1 } } */
+/* { dg-final { scan-assembler-times "vpxor\[^\n\]*%zmm" 1 } } */
+/* { dg-final { scan-assembler-times "vpsubb\[^\n\]*%zmm" 1 } } */
+
+__attribute__((noipa)) v64qi
+foo_ashift_512 (v64qi a)
+{
+ return a << 7;
+}
+
+/* { dg-final { scan-assembler-times "vpsllw\[^\n\]*%zmm" 1 } } */
+
+__attribute__((noipa)) v64uqi
+foo_lshiftrt_512 (v64uqi a)
+{
+ return a >> 2;
+}
+
+/* { dg-final { scan-assembler-times "vpsrlw\[^\n\]*%zmm" 1 } } */
--- /dev/null
+/* PR target/95524 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx2 -Wno-shift-count-overflow" } */
+
+#ifndef CHECK
+#define CHECK "avx2-check.h"
+#endif
+
+#include CHECK
+
+#ifndef TEST
+#define TEST avx2_test
+#endif
+
+typedef char v32qi __attribute__ ((vector_size (32)));
+typedef unsigned char v32uqi __attribute__ ((vector_size (32)));
+
+#define TEST_SHIFT(N) \
+ do \
+ { \
+ int i; \
+ for (i = 0; i < 32; i++) \
+ exp1.a[i] = op1.a[i] << N; \
+ res1.x = (__m256i) (((v32qi) op1.x) << N); \
+ if (check_union256i_b (res1, exp1.a)) \
+ abort (); \
+ \
+ for (i = 0; i < 32; i++) \
+ exp1.a[i] = op1.a[i] >> N; \
+ res1.x = (__m256i) (((v32qi) op1.x) >> N); \
+ if (check_union256i_b (res1, exp1.a)) \
+ abort (); \
+ \
+ for (i = 0; i < 32; i++) \
+ exp2.a[i] = op2.a[i] >> N; \
+ res2.x = (__m256i) (((v32uqi) op2.x >> N)); \
+ if (check_union256i_ub (res2, exp2.a)) \
+ abort (); \
+ } \
+ while (0)
+
+static void
+TEST (void)
+{
+ union256i_b op1, exp1, res1;
+ union256i_ub op2, exp2, res2;
+ for (int i = 0; i != 32; i++)
+ {
+ op2.a[i] = i * i;
+ op1.a[i] = i * i + 200 * i;
+ }
+ TEST_SHIFT (0);
+ TEST_SHIFT (1);
+ TEST_SHIFT (2);
+ TEST_SHIFT (3);
+ TEST_SHIFT (4);
+ TEST_SHIFT (5);
+ TEST_SHIFT (6);
+ TEST_SHIFT (7);
+ TEST_SHIFT (8);
+}
+
--- /dev/null
+/* PR target/95524 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-final { scan-assembler-times "pand\[^\n\]*%xmm" 3 { xfail *-*-* } } } */
+typedef char v16qi __attribute__ ((vector_size (16)));
+typedef unsigned char v16uqi __attribute__ ((vector_size (16)));
+
+__attribute__((noipa)) v16qi
+foo_ashiftrt_128 (v16qi a)
+{
+ return a >> 2;
+}
+/* { dg-final { scan-assembler-times "psraw\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "pxor\[^\n\]*%xmm" 1 } } */
+/* { dg-final { scan-assembler-times "psubb\[^\n\]*%xmm" 1 } } */
+
+__attribute__((noipa)) v16qi
+foo_ashift_128 (v16qi a)
+{
+ return a << 7;
+}
+
+/* { dg-final { scan-assembler-times "psllw\[^\n\]*%xmm" 1 { xfail *-*-* } } } */
+
+__attribute__((noipa)) v16uqi
+foo_lshiftrt_128 (v16uqi a)
+{
+ return a >> 2;
+}
+
+/* { dg-final { scan-assembler-times "psrlw\[^\n\]*%xmm" 1 } } */
--- /dev/null
+/* PR target/95524 */
+/* { dg-do run } */
+/* { dg-options "-O2 -msse2 -Wno-shift-count-overflow" } */
+
+#ifndef CHECK
+#define CHECK "sse2-check.h"
+#endif
+
+#include CHECK
+
+#ifndef TEST
+#define TEST sse2_test
+#endif
+
+typedef char v16qi __attribute__ ((vector_size (16)));
+typedef unsigned char v16uqi __attribute__ ((vector_size (16)));
+
+#define TEST_SHIFT(N) \
+ do \
+ { \
+ int i; \
+ for (i = 0; i < 16; i++) \
+ exp1.a[i] = op1.a[i] << N; \
+ res1.x = (__m128i) (((v16qi) op1.x) << N); \
+ if (check_union128i_b (res1, exp1.a)) \
+ abort (); \
+ \
+ for (i = 0; i < 16; i++) \
+ exp1.a[i] = op1.a[i] >> N; \
+ res1.x = (__m128i) (((v16qi) op1.x) >> N); \
+ if (check_union128i_b (res1, exp1.a)) \
+ abort (); \
+ \
+ for (i = 0; i < 16; i++) \
+ exp2.a[i] = op2.a[i] >> N; \
+ res2.x = (__m128i) (((v16uqi) op2.x >> N)); \
+ if (check_union128i_ub (res2, exp2.a)) \
+ abort (); \
+ } \
+ while (0)
+
+static void
+TEST (void)
+{
+ union128i_b op1, exp1, res1;
+ union128i_ub op2, exp2, res2;
+ for (int i = 0; i != 16; i++)
+ {
+ op2.a[i] = i * i;
+ op1.a[i] = i * i + 200 * i;
+ }
+ TEST_SHIFT (0);
+ TEST_SHIFT (1);
+ TEST_SHIFT (2);
+ TEST_SHIFT (3);
+ TEST_SHIFT (4);
+ TEST_SHIFT (5);
+ TEST_SHIFT (6);
+ TEST_SHIFT (7);
+ TEST_SHIFT (8);
+}
+