From c7199fb6e694d1a0964351200648c24c3ee97973 Mon Sep 17 00:00:00 2001 From: liuhongt Date: Mon, 15 Jun 2020 13:48:45 +0800 Subject: [PATCH] Optimize V16QI/V32QI/V64QI shift by constant. gcc/ChangeLog: PR target/95524 * config/i386/i386-expand.c (ix86_expand_vec_shift_qihi_constant): New function. * config/i386/i386-protos.h (ix86_expand_vec_shift_qihi_constant): Declare. * config/i386/sse.md (3): Optimize shift V*QImode by constant. gcc/testsuite/ChangeLog: * gcc.target/i386/avx2-shiftqihi-constant-1.c: New test. * gcc.target/i386/avx2-shiftqihi-constant-2.c: Ditto. * gcc.target/i386/avx512bw-shiftqihi-constant-1.c: Ditto. * gcc.target/i386/avx512bw-shiftqihi-constant-2.c: Ditto. * gcc.target/i386/sse2-shiftqihi-constant-1.c: Ditto. * gcc.target/i386/sse2-shiftqihi-constant-2.c: Ditto. --- gcc/config/i386/i386-expand.c | 99 +++++++++++++++++++ gcc/config/i386/i386-protos.h | 1 + gcc/config/i386/sse.md | 3 +- .../i386/avx2-shiftqihi-constant-1.c | 31 ++++++ .../i386/avx2-shiftqihi-constant-2.c | 62 ++++++++++++ .../i386/avx512bw-shiftqihi-constant-1.c | 31 ++++++ .../i386/avx512bw-shiftqihi-constant-2.c | 62 ++++++++++++ .../i386/sse2-shiftqihi-constant-1.c | 31 ++++++ .../i386/sse2-shiftqihi-constant-2.c | 62 ++++++++++++ 9 files changed, 381 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/i386/avx2-shiftqihi-constant-1.c create mode 100644 gcc/testsuite/gcc.target/i386/avx2-shiftqihi-constant-2.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-2.c create mode 100644 gcc/testsuite/gcc.target/i386/sse2-shiftqihi-constant-1.c create mode 100644 gcc/testsuite/gcc.target/i386/sse2-shiftqihi-constant-2.c diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 3a414f69b3b..d7077980f9b 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -19532,6 +19532,105 @@ ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2) return true; } +/* Expand a vector operation shift by constant for a V*QImode in terms of the + same operation on V*HImode. Return true if success. */ +bool +ix86_expand_vec_shift_qihi_constant (enum rtx_code code, rtx dest, rtx op1, rtx op2) +{ + machine_mode qimode, himode; + unsigned int and_constant, xor_constant; + HOST_WIDE_INT shift_amount; + rtx vec_const_and, vec_const_xor; + rtx tmp, op1_subreg; + rtx (*gen_shift) (rtx, rtx, rtx); + rtx (*gen_and) (rtx, rtx, rtx); + rtx (*gen_xor) (rtx, rtx, rtx); + rtx (*gen_sub) (rtx, rtx, rtx); + + /* Only optimize shift by constant. */ + if (!CONST_INT_P (op2)) + return false; + + qimode = GET_MODE (dest); + shift_amount = INTVAL (op2); + /* Do nothing when shift amount greater equal 8. */ + if (shift_amount > 7) + return false; + + gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT); + /* Record sign bit. */ + xor_constant = 1 << (8 - shift_amount - 1); + + /* Zero upper/lower bits shift from left/right element. */ + and_constant + = (code == ASHIFT ? 256 - (1 << shift_amount) + : (1 << (8 - shift_amount)) - 1); + + switch (qimode) + { + case V16QImode: + himode = V8HImode; + gen_shift = + ((code == ASHIFT) + ? gen_ashlv8hi3 + : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3); + gen_and = gen_andv16qi3; + gen_xor = gen_xorv16qi3; + gen_sub = gen_subv16qi3; + break; + case V32QImode: + himode = V16HImode; + gen_shift = + ((code == ASHIFT) + ? gen_ashlv16hi3 + : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3); + gen_and = gen_andv32qi3; + gen_xor = gen_xorv32qi3; + gen_sub = gen_subv32qi3; + break; + case V64QImode: + himode = V32HImode; + gen_shift = + ((code == ASHIFT) + ? gen_ashlv32hi3 + : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3); + gen_and = gen_andv64qi3; + gen_xor = gen_xorv64qi3; + gen_sub = gen_subv64qi3; + break; + default: + gcc_unreachable (); + } + + tmp = gen_reg_rtx (himode); + vec_const_and = gen_reg_rtx (qimode); + op1_subreg = lowpart_subreg (himode, op1, qimode); + + /* For ASHIFT and LSHIFTRT, perform operation like + vpsllw/vpsrlw $shift_amount, %op1, %dest. + vpand %vec_const_and, %dest. */ + emit_insn (gen_shift (tmp, op1_subreg, op2)); + emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0)); + emit_move_insn (vec_const_and, + ix86_build_const_vector (qimode, true, + GEN_INT (and_constant))); + emit_insn (gen_and (dest, dest, vec_const_and)); + + /* For ASHIFTRT, perform extra operation like + vpxor %vec_const_xor, %dest, %dest + vpsubb %vec_const_xor, %dest, %dest */ + if (code == ASHIFTRT) + { + vec_const_xor = gen_reg_rtx (qimode); + emit_move_insn (vec_const_xor, + ix86_build_const_vector (qimode, true, + GEN_INT (xor_constant))); + emit_insn (gen_xor (dest, dest, vec_const_xor)); + emit_insn (gen_sub (dest, dest, vec_const_xor)); + } + return true; +} + /* Expand a vector operation CODE for a V*QImode in terms of the same operation on V*HImode. */ diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index f5320494fa1..7c2ce618f3f 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -206,6 +206,7 @@ extern void ix86_expand_round_sse4 (rtx, rtx); extern bool ix86_expand_vecmul_qihi (rtx, rtx, rtx); extern void ix86_expand_vecop_qihi (enum rtx_code, rtx, rtx, rtx); +extern bool ix86_expand_vec_shift_qihi_constant (enum rtx_code, rtx, rtx, rtx); extern rtx ix86_split_stack_guard (void); diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index aa9fdc87c68..431571a4bc1 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -19863,7 +19863,8 @@ gen = ( == LSHIFTRT ? gen_xop_shlv16qi3 : gen_xop_shav16qi3); emit_insn (gen (operands[0], operands[1], tmp)); } - else + else if (!ix86_expand_vec_shift_qihi_constant (, operands[0], + operands[1], operands[2])) ix86_expand_vecop_qihi (, operands[0], operands[1], operands[2]); DONE; }) diff --git a/gcc/testsuite/gcc.target/i386/avx2-shiftqihi-constant-1.c b/gcc/testsuite/gcc.target/i386/avx2-shiftqihi-constant-1.c new file mode 100644 index 00000000000..72065039581 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx2-shiftqihi-constant-1.c @@ -0,0 +1,31 @@ +/* PR target/95524 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx2" } */ +/* { dg-final { scan-assembler-times "vpand\[^\n\]*%ymm" 3 } } */ +typedef char v32qi __attribute__ ((vector_size (32))); +typedef unsigned char v32uqi __attribute__ ((vector_size (32))); + +__attribute__((noipa)) v32qi +foo_ashiftrt_256 (v32qi a) +{ + return a >> 2; +} +/* { dg-final { scan-assembler-times "vpsraw\[^\n\]*%ymm" 1 } } */ +/* { dg-final { scan-assembler-times "vpxor\[^\n\]*%ymm" 1 } } */ +/* { dg-final { scan-assembler-times "vpsubb\[^\n\]*%ymm" 1 } } */ + +__attribute__((noipa)) v32qi +foo_ashift_256 (v32qi a) +{ + return a << 7; +} + +/* { dg-final { scan-assembler-times "vpsllw\[^\n\]*%ymm" 1 } } */ + +__attribute__((noipa)) v32uqi +foo_lshiftrt_256 (v32uqi a) +{ + return a >> 2; +} + +/* { dg-final { scan-assembler-times "vpsrlw\[^\n\]*%ymm" 1 } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx2-shiftqihi-constant-2.c b/gcc/testsuite/gcc.target/i386/avx2-shiftqihi-constant-2.c new file mode 100644 index 00000000000..509d5a8d762 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx2-shiftqihi-constant-2.c @@ -0,0 +1,62 @@ +/* PR target/95524 */ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512bw -Wno-shift-count-overflow" } */ + +#ifndef CHECK +#define CHECK "avx512bw-check.h" +#endif + +#include CHECK + +#ifndef TEST +#define TEST avx512bw_test +#endif + +typedef char v64qi __attribute__ ((vector_size (64))); +typedef unsigned char v64uqi __attribute__ ((vector_size (64))); + +#define TEST_SHIFT(N) \ + do \ + { \ + int i; \ + for (i = 0; i < 64; i++) \ + exp1.a[i] = op1.a[i] << N; \ + res1.x = (__m512i) (((v64qi) op1.x) << N); \ + if (check_union512i_b (res1, exp1.a)) \ + abort (); \ + \ + for (i = 0; i < 64; i++) \ + exp1.a[i] = op1.a[i] >> N; \ + res1.x = (__m512i) (((v64qi) op1.x) >> N); \ + if (check_union512i_b (res1, exp1.a)) \ + abort (); \ + \ + for (i = 0; i < 64; i++) \ + exp2.a[i] = op2.a[i] >> N; \ + res2.x = (__m512i) (((v64uqi) op2.x >> N)); \ + if (check_union512i_ub (res2, exp2.a)) \ + abort (); \ + } \ + while (0) + +static void +TEST (void) +{ + union512i_b op1, exp1, res1; + union512i_ub op2, exp2, res2; + for (int i = 0; i != 64; i++) + { + op2.a[i] = i * i; + op1.a[i] = i * i + 200 * i; + } + TEST_SHIFT (0); + TEST_SHIFT (1); + TEST_SHIFT (2); + TEST_SHIFT (3); + TEST_SHIFT (4); + TEST_SHIFT (5); + TEST_SHIFT (6); + TEST_SHIFT (7); + TEST_SHIFT (8); +} + diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c new file mode 100644 index 00000000000..78bf5d33689 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-1.c @@ -0,0 +1,31 @@ +/* PR target/95524 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx512bw" } */ +/* { dg-final { scan-assembler-times "vpand\[^\n\]*%zmm" 3 } } */ +typedef char v64qi __attribute__ ((vector_size (64))); +typedef unsigned char v64uqi __attribute__ ((vector_size (64))); + +__attribute__((noipa)) v64qi +foo_ashiftrt_512 (v64qi a) +{ + return a >> 2; +} +/* { dg-final { scan-assembler-times "vpsraw\[^\n\]*%zmm" 1 } } */ +/* { dg-final { scan-assembler-times "vpxor\[^\n\]*%zmm" 1 } } */ +/* { dg-final { scan-assembler-times "vpsubb\[^\n\]*%zmm" 1 } } */ + +__attribute__((noipa)) v64qi +foo_ashift_512 (v64qi a) +{ + return a << 7; +} + +/* { dg-final { scan-assembler-times "vpsllw\[^\n\]*%zmm" 1 } } */ + +__attribute__((noipa)) v64uqi +foo_lshiftrt_512 (v64uqi a) +{ + return a >> 2; +} + +/* { dg-final { scan-assembler-times "vpsrlw\[^\n\]*%zmm" 1 } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-2.c b/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-2.c new file mode 100644 index 00000000000..d6f7934f3b4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bw-shiftqihi-constant-2.c @@ -0,0 +1,62 @@ +/* PR target/95524 */ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx2 -Wno-shift-count-overflow" } */ + +#ifndef CHECK +#define CHECK "avx2-check.h" +#endif + +#include CHECK + +#ifndef TEST +#define TEST avx2_test +#endif + +typedef char v32qi __attribute__ ((vector_size (32))); +typedef unsigned char v32uqi __attribute__ ((vector_size (32))); + +#define TEST_SHIFT(N) \ + do \ + { \ + int i; \ + for (i = 0; i < 32; i++) \ + exp1.a[i] = op1.a[i] << N; \ + res1.x = (__m256i) (((v32qi) op1.x) << N); \ + if (check_union256i_b (res1, exp1.a)) \ + abort (); \ + \ + for (i = 0; i < 32; i++) \ + exp1.a[i] = op1.a[i] >> N; \ + res1.x = (__m256i) (((v32qi) op1.x) >> N); \ + if (check_union256i_b (res1, exp1.a)) \ + abort (); \ + \ + for (i = 0; i < 32; i++) \ + exp2.a[i] = op2.a[i] >> N; \ + res2.x = (__m256i) (((v32uqi) op2.x >> N)); \ + if (check_union256i_ub (res2, exp2.a)) \ + abort (); \ + } \ + while (0) + +static void +TEST (void) +{ + union256i_b op1, exp1, res1; + union256i_ub op2, exp2, res2; + for (int i = 0; i != 32; i++) + { + op2.a[i] = i * i; + op1.a[i] = i * i + 200 * i; + } + TEST_SHIFT (0); + TEST_SHIFT (1); + TEST_SHIFT (2); + TEST_SHIFT (3); + TEST_SHIFT (4); + TEST_SHIFT (5); + TEST_SHIFT (6); + TEST_SHIFT (7); + TEST_SHIFT (8); +} + diff --git a/gcc/testsuite/gcc.target/i386/sse2-shiftqihi-constant-1.c b/gcc/testsuite/gcc.target/i386/sse2-shiftqihi-constant-1.c new file mode 100644 index 00000000000..f1c68cb2972 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-shiftqihi-constant-1.c @@ -0,0 +1,31 @@ +/* PR target/95524 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse2" } */ +/* { dg-final { scan-assembler-times "pand\[^\n\]*%xmm" 3 { xfail *-*-* } } } */ +typedef char v16qi __attribute__ ((vector_size (16))); +typedef unsigned char v16uqi __attribute__ ((vector_size (16))); + +__attribute__((noipa)) v16qi +foo_ashiftrt_128 (v16qi a) +{ + return a >> 2; +} +/* { dg-final { scan-assembler-times "psraw\[^\n\]*%xmm" 1 } } */ +/* { dg-final { scan-assembler-times "pxor\[^\n\]*%xmm" 1 } } */ +/* { dg-final { scan-assembler-times "psubb\[^\n\]*%xmm" 1 } } */ + +__attribute__((noipa)) v16qi +foo_ashift_128 (v16qi a) +{ + return a << 7; +} + +/* { dg-final { scan-assembler-times "psllw\[^\n\]*%xmm" 1 { xfail *-*-* } } } */ + +__attribute__((noipa)) v16uqi +foo_lshiftrt_128 (v16uqi a) +{ + return a >> 2; +} + +/* { dg-final { scan-assembler-times "psrlw\[^\n\]*%xmm" 1 } } */ diff --git a/gcc/testsuite/gcc.target/i386/sse2-shiftqihi-constant-2.c b/gcc/testsuite/gcc.target/i386/sse2-shiftqihi-constant-2.c new file mode 100644 index 00000000000..d95171f7a47 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-shiftqihi-constant-2.c @@ -0,0 +1,62 @@ +/* PR target/95524 */ +/* { dg-do run } */ +/* { dg-options "-O2 -msse2 -Wno-shift-count-overflow" } */ + +#ifndef CHECK +#define CHECK "sse2-check.h" +#endif + +#include CHECK + +#ifndef TEST +#define TEST sse2_test +#endif + +typedef char v16qi __attribute__ ((vector_size (16))); +typedef unsigned char v16uqi __attribute__ ((vector_size (16))); + +#define TEST_SHIFT(N) \ + do \ + { \ + int i; \ + for (i = 0; i < 16; i++) \ + exp1.a[i] = op1.a[i] << N; \ + res1.x = (__m128i) (((v16qi) op1.x) << N); \ + if (check_union128i_b (res1, exp1.a)) \ + abort (); \ + \ + for (i = 0; i < 16; i++) \ + exp1.a[i] = op1.a[i] >> N; \ + res1.x = (__m128i) (((v16qi) op1.x) >> N); \ + if (check_union128i_b (res1, exp1.a)) \ + abort (); \ + \ + for (i = 0; i < 16; i++) \ + exp2.a[i] = op2.a[i] >> N; \ + res2.x = (__m128i) (((v16uqi) op2.x >> N)); \ + if (check_union128i_ub (res2, exp2.a)) \ + abort (); \ + } \ + while (0) + +static void +TEST (void) +{ + union128i_b op1, exp1, res1; + union128i_ub op2, exp2, res2; + for (int i = 0; i != 16; i++) + { + op2.a[i] = i * i; + op1.a[i] = i * i + 200 * i; + } + TEST_SHIFT (0); + TEST_SHIFT (1); + TEST_SHIFT (2); + TEST_SHIFT (3); + TEST_SHIFT (4); + TEST_SHIFT (5); + TEST_SHIFT (6); + TEST_SHIFT (7); + TEST_SHIFT (8); +} + -- 2.30.2