From b668a06e37f72fd96bacd6769990ec97dac4ac6d Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Wed, 13 Jan 2021 08:02:54 +0100 Subject: [PATCH] i386: Optimize _mm_unpacklo_epi8 of 0 vector as second argument or similar VEC_PERM_EXPRs into pmovzx [PR95905] The following patch adds patterns (so far 128-bit only) for permutations like { 0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23 } where the second operand is CONST0_RTX CONST_VECTOR to be emitted as pmovzx. 2021-01-13 Jakub Jelinek PR target/95905 * config/i386/predicates.md (pmovzx_parallel): New predicate. * config/i386/sse.md (*sse4_1_zero_extendv8qiv8hi2_3): New define_insn_and_split pattern. (*sse4_1_zero_extendv4hiv4si2_3): Likewise. (*sse4_1_zero_extendv2siv2di2_3): Likewise. * gcc.target/i386/pr95905-1.c: New test. * gcc.target/i386/pr95905-2.c: New test. --- gcc/config/i386/predicates.md | 32 +++++++++ gcc/config/i386/sse.md | 84 +++++++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr95905-1.c | 26 +++++++ gcc/testsuite/gcc.target/i386/pr95905-2.c | 46 +++++++++++++ 4 files changed, 188 insertions(+) create mode 100644 gcc/testsuite/gcc.target/i386/pr95905-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr95905-2.c diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 1b69d7b8d90..0a3ab4dce68 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1600,6 +1600,38 @@ return true; }) +;; Return true if OP is a parallel for a pmovz{bw,wd,dq} vec_select, +;; where one of the two operands of the vec_concat is const0_operand. +(define_predicate "pmovzx_parallel" + (and (match_code "parallel") + (match_code "const_int" "a")) +{ + int nelt = XVECLEN (op, 0); + int elt, i; + + if (nelt < 2) + return false; + + /* Check that the permutation is suitable for pmovz{bw,wd,dq}. + For example { 0 16 1 17 2 18 3 19 4 20 5 21 6 22 7 23 }. 
*/ + elt = INTVAL (XVECEXP (op, 0, 0)); + if (elt == 0) + { + for (i = 1; i < nelt; ++i) + if ((i & 1) != 0) + { + if (INTVAL (XVECEXP (op, 0, i)) < nelt) + return false; + } + else if (INTVAL (XVECEXP (op, 0, i)) != i / 2) + return false; + } + else + return false; + + return true; +}) + ;; Return true if OP is a parallel for a vbroadcast permute. (define_predicate "avx_vbroadcast_operand" (and (match_code "parallel") diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 928eff5e05e..2a260c1cfbd 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -17683,6 +17683,36 @@ (any_extend:V8HI (match_dup 1)))] "operands[1] = adjust_address_nv (operands[1], V8QImode, 0);") +(define_insn_and_split "*sse4_1_zero_extendv8qiv8hi2_3" + [(set (match_operand:V16QI 0 "register_operand" "=Yr,*x,v") + (vec_select:V16QI + (vec_concat:V32QI + (match_operand:V16QI 1 "vector_operand" "Yrm,*xm,vm") + (match_operand:V16QI 2 "const0_operand" "C,C,C")) + (match_parallel 3 "pmovzx_parallel" + [(match_operand 4 "const_int_operand" "n,n,n")])))] + "TARGET_SSE4_1" + "#" + "&& reload_completed" + [(set (match_dup 0) + (zero_extend:V8HI + (vec_select:V8QI + (match_dup 1) + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3) + (const_int 4) (const_int 5) + (const_int 6) (const_int 7)]))))] +{ + operands[0] = lowpart_subreg (V8HImode, operands[0], V16QImode); + if (MEM_P (operands[1])) + { + operands[1] = lowpart_subreg (V8QImode, operands[1], V16QImode); + operands[1] = gen_rtx_ZERO_EXTEND (V8HImode, operands[1]); + emit_insn (gen_rtx_SET (operands[0], operands[1])); + DONE; + } +}) + (define_expand "v8qiv8hi2" [(set (match_operand:V8HI 0 "register_operand") (any_extend:V8HI @@ -17929,6 +17959,34 @@ } }) +(define_insn_and_split "*sse4_1_zero_extendv4hiv4si2_3" + [(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,v") + (vec_select:V8HI + (vec_concat:V16HI + (match_operand:V8HI 1 "vector_operand" "Yrm,*xm,vm") + (match_operand:V8HI 2 "const0_operand" 
"C,C,C")) + (match_parallel 3 "pmovzx_parallel" + [(match_operand 4 "const_int_operand" "n,n,n")])))] + "TARGET_SSE4_1" + "#" + "&& reload_completed" + [(set (match_dup 0) + (zero_extend:V4SI + (vec_select:V4HI + (match_dup 1) + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)]))))] +{ + operands[0] = lowpart_subreg (V4SImode, operands[0], V8HImode); + if (MEM_P (operands[1])) + { + operands[1] = lowpart_subreg (V4HImode, operands[1], V8HImode); + operands[1] = gen_rtx_ZERO_EXTEND (V4SImode, operands[1]); + emit_insn (gen_rtx_SET (operands[0], operands[1])); + DONE; + } +}) + (define_insn "avx512f_v8qiv8di2" [(set (match_operand:V8DI 0 "register_operand" "=v") (any_extend:V8DI @@ -18283,6 +18341,32 @@ (any_extend:V2DI (match_dup 1)))] "operands[1] = adjust_address_nv (operands[1], V2SImode, 0);") +(define_insn_and_split "*sse4_1_zero_extendv2siv2di2_3" + [(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v") + (vec_select:V4SI + (vec_concat:V8SI + (match_operand:V4SI 1 "vector_operand" "Yrm,*xm,vm") + (match_operand:V4SI 2 "const0_operand" "C,C,C")) + (match_parallel 3 "pmovzx_parallel" + [(match_operand 4 "const_int_operand" "n,n,n")])))] + "TARGET_SSE4_1" + "#" + "&& reload_completed" + [(set (match_dup 0) + (zero_extend:V2DI + (vec_select:V2SI (match_dup 1) + (parallel [(const_int 0) (const_int 1)]))))] +{ + operands[0] = lowpart_subreg (V2DImode, operands[0], V4SImode); + if (MEM_P (operands[1])) + { + operands[1] = lowpart_subreg (V2SImode, operands[1], V4SImode); + operands[1] = gen_rtx_ZERO_EXTEND (V2DImode, operands[1]); + emit_insn (gen_rtx_SET (operands[0], operands[1])); + DONE; + } +}) + (define_expand "v2siv2di2" [(set (match_operand:V2DI 0 "register_operand") (any_extend:V2DI diff --git a/gcc/testsuite/gcc.target/i386/pr95905-1.c b/gcc/testsuite/gcc.target/i386/pr95905-1.c new file mode 100644 index 00000000000..8de715e8821 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr95905-1.c @@ -0,0 +1,26 @@ +/* PR target/95905 */ +/* 
{ dg-do compile } */ +/* { dg-options "-O2 -msse4.1 -mno-avx" } */ +/* { dg-final { scan-assembler "\tpmovzxbw\t" } } */ +/* { dg-final { scan-assembler "\tpmovzxwd\t" } } */ +/* { dg-final { scan-assembler "\tpmovzxdq\t" } } */ + +#include <x86intrin.h> + +__m128i +f1 (__m128i a) +{ + return _mm_unpacklo_epi8 (a, _mm_setzero_si128 ()); +} + +__m128i +f2 (__m128i a) +{ + return _mm_unpacklo_epi16 (a, _mm_setzero_si128 ()); +} + +__m128i +f3 (__m128i a) +{ + return _mm_unpacklo_epi32 (a, _mm_setzero_si128 ()); +} diff --git a/gcc/testsuite/gcc.target/i386/pr95905-2.c b/gcc/testsuite/gcc.target/i386/pr95905-2.c new file mode 100644 index 00000000000..7cd20a3654a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr95905-2.c @@ -0,0 +1,46 @@ +/* PR target/95905 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse4.1" } */ +/* { dg-final { scan-assembler "\tv?pmovzxbw\t" } } */ +/* { dg-final { scan-assembler "\tv?pmovzxwd\t" } } */ +/* { dg-final { scan-assembler "\tv?pmovzxdq\t" } } */ + +typedef unsigned char V1 __attribute__((vector_size (16))); +typedef unsigned short V2 __attribute__((vector_size (16))); +typedef unsigned int V3 __attribute__((vector_size (16))); + +V1 +f1 (V1 x) +{ + return __builtin_shuffle (x, (V1) {}, (V1) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 }); +} + +V2 +f2 (V2 x) +{ + return __builtin_shuffle (x, (V2) {}, (V2) { 0, 8, 1, 9, 2, 10, 3, 11 }); +} + +V3 +f3 (V3 x) +{ + return __builtin_shuffle (x, (V3) {}, (V3) { 0, 4, 1, 5 }); +} + +V1 +f4 (V1 *x) +{ + return __builtin_shuffle (*x, (V1) {}, (V1) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 }); +} + +V2 +f5 (V2 *x) +{ + return __builtin_shuffle (*x, (V2) {}, (V2) { 0, 8, 1, 9, 2, 10, 3, 11 }); +} + +V3 +f6 (V3 *x) +{ + return __builtin_shuffle (*x, (V3) {}, (V3) { 0, 4, 1, 5 }); +} -- 2.30.2