From e38341a8e0c7f89eb2146feddea8c2f3bf25a331 Mon Sep 17 00:00:00 2001
From: Sylvia Taylor <sylvia.taylor@arm.com>
Date: Thu, 18 Jul 2019 15:42:13 +0000
Subject: [PATCH] [patch1/2][arm][PR90317]: fix sha1 patterns

This patch fixes:

1) An ICE triggered by the crypto_sha1h intrinsic, caused by an
incompatible mode being used for the zero_extend.  The zero_extend has
been removed: it is not a good choice for vector modes, and using an
equivalent scalar mode such as TI (128 bits) instead of V4SI produces
extra instructions, making it inefficient.  This affects GCC versions 8
and above.

2) Incorrect combine optimizations caused by the vec_select usage in
the sha1 patterns on arm.  The patterns should only combine a
vec_select into a sha1h instruction when the selected lane is 0.  This
affects GCC versions 5 and above.

- Fixed by explicitly requiring the valid const int for such combines.
For cases where the lane is not 0, the vector lane selection now occurs
in a separate instruction (e.g. vmov) prior to the sha1h.

- Updated the sha1h test cases on arm to check for additional cases
with custom vector lane selection.

The intrinsic functions for the sha1 patterns have also been
simplified, which also eliminates redundant instructions such as
"vmov.i32 q8, #0".  (A short usage sketch illustrating the expected
code generation follows the patch.)

2019-07-18  Sylvia Taylor  <sylvia.taylor@arm.com>

	PR target/90317
	* config/arm/arm_neon.h
	(vsha1h_u32): Refactor.
	(vsha1cq_u32): Likewise.
	(vsha1pq_u32): Likewise.
	(vsha1mq_u32): Likewise.
	* config/arm/crypto.md:
	(crypto_sha1h): Remove zero extend, correct vec select.
	(crypto_sha1c): Correct vec select.
	(crypto_sha1m): Likewise.
	(crypto_sha1p): Likewise.
	* gcc.target/arm/crypto-vsha1cq_u32.c (foo): Change return type to
	uint32_t.
	(GET_LANE, TEST_SHA1C_VEC_SELECT): New.
	* gcc.target/arm/crypto-vsha1h_u32.c (foo): Change return type to
	uint32_t.
	(GET_LANE, TEST_SHA1H_VEC_SELECT): New.
	* gcc.target/arm/crypto-vsha1mq_u32.c (foo): Change return type to
	uint32_t.
	(GET_LANE, TEST_SHA1M_VEC_SELECT): New.
	* gcc.target/arm/crypto-vsha1pq_u32.c (foo): Change return type to
	uint32_t.
	(GET_LANE, TEST_SHA1P_VEC_SELECT): New.

From-SVN: r273574
---
 gcc/ChangeLog                                 | 14 +++++++++++
 gcc/config/arm/arm_neon.h                     | 21 +++++++----------
 gcc/config/arm/crypto.md                      | 22 ++++++++++++------
 gcc/testsuite/ChangeLog                       | 16 +++++++++++++
 .../gcc.target/arm/crypto-vsha1cq_u32.c       | 23 ++++++++++++++++---
 .../gcc.target/arm/crypto-vsha1h_u32.c        | 23 ++++++++++++++++---
 .../gcc.target/arm/crypto-vsha1mq_u32.c       | 23 ++++++++++++++++---
 .../gcc.target/arm/crypto-vsha1pq_u32.c       | 23 ++++++++++++++++---
 8 files changed, 133 insertions(+), 32 deletions(-)

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index a4a625e7eb0..668dc40b7fa 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,17 @@
+2019-07-18  Sylvia Taylor  <sylvia.taylor@arm.com>
+
+	PR target/90317
+	* config/arm/arm_neon.h
+	(vsha1h_u32): Refactor.
+	(vsha1cq_u32): Likewise.
+	(vsha1pq_u32): Likewise.
+	(vsha1mq_u32): Likewise.
+	* config/arm/crypto.md:
+	(crypto_sha1h): Remove zero extend, correct vec select.
+	(crypto_sha1c): Correct vec select.
+	(crypto_sha1m): Likewise.
+	(crypto_sha1p): Likewise.
+
 2019-07-18  Richard Earnshaw  <rearnsha@arm.com>
 
 	* config/arm/predicates.md (arm_borrow_operation): New predicate.
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 6b982392ece..1f200d491d1 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -16938,37 +16938,32 @@ __extension__ extern __inline uint32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsha1h_u32 (uint32_t __hash_e)
 {
-  uint32x4_t __t = vdupq_n_u32 (0);
-  __t = vsetq_lane_u32 (__hash_e, __t, 0);
-  __t = __builtin_arm_crypto_sha1h (__t);
-  return vgetq_lane_u32 (__t, 0);
+  return vgetq_lane_u32 (__builtin_arm_crypto_sha1h (vdupq_n_u32 (__hash_e)),
+			 0);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsha1cq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
 {
-  uint32x4_t __t = vdupq_n_u32 (0);
-  __t = vsetq_lane_u32 (__hash_e, __t, 0);
-  return __builtin_arm_crypto_sha1c (__hash_abcd, __t, __wk);
+  return __builtin_arm_crypto_sha1c (__hash_abcd, vdupq_n_u32 (__hash_e),
+				     __wk);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsha1pq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
 {
-  uint32x4_t __t = vdupq_n_u32 (0);
-  __t = vsetq_lane_u32 (__hash_e, __t, 0);
-  return __builtin_arm_crypto_sha1p (__hash_abcd, __t, __wk);
+  return __builtin_arm_crypto_sha1p (__hash_abcd, vdupq_n_u32 (__hash_e),
+				     __wk);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vsha1mq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk)
 {
-  uint32x4_t __t = vdupq_n_u32 (0);
-  __t = vsetq_lane_u32 (__hash_e, __t, 0);
-  return __builtin_arm_crypto_sha1m (__hash_abcd, __t, __wk);
+  return __builtin_arm_crypto_sha1m (__hash_abcd, vdupq_n_u32 (__hash_e),
+				     __wk);
 }
 
 __extension__ extern __inline uint32x4_t
diff --git a/gcc/config/arm/crypto.md b/gcc/config/arm/crypto.md
index bf34f69fc75..115c515ac46 100644
--- a/gcc/config/arm/crypto.md
+++ b/gcc/config/arm/crypto.md
@@ -105,14 +105,18 @@
   [(set_attr "type" "<crypto_type>")]
 )
 
+/* The vec_select operation always selects index 0 from the lower V2SI subreg
+   of the V4SI, adjusted for endianness. Required due to neon_vget_lane and
+   neon_set_lane that change the element ordering in memory for big-endian.  */
+
 (define_insn "crypto_sha1h"
   [(set (match_operand:V4SI 0 "register_operand" "=w")
-	(zero_extend:V4SI
-	  (unspec:SI [(vec_select:SI
-		       (match_operand:V4SI 1 "register_operand" "w")
-		       (parallel [(match_operand:SI 2 "immediate_operand" "i")]))]
-	   UNSPEC_SHA1H)))]
-  "TARGET_CRYPTO"
+	(unspec:V4SI
+	  [(vec_select:SI
+	    (match_operand:V4SI 1 "register_operand" "w")
+	    (parallel [(match_operand:SI 2 "immediate_operand" "i")]))]
+	  UNSPEC_SHA1H))]
+  "TARGET_CRYPTO && INTVAL (operands[2]) == NEON_ENDIAN_LANE_N (V2SImode, 0)"
   "sha1h.32\\t%q0, %q1"
   [(set_attr "type" "crypto_sha1_fast")]
 )
@@ -127,6 +131,10 @@
   [(set_attr "type" "crypto_pmull")]
 )
 
+/* The vec_select operation always selects index 0 from the lower V2SI subreg
+   of the V4SI, adjusted for endianness. Required due to neon_vget_lane and
+   neon_set_lane that change the element ordering in memory for big-endian.  */
+
 (define_insn "crypto_<crypto_pattern>"
   [(set (match_operand:V4SI 0 "register_operand" "=w")
 	(unspec:<crypto_mode>
@@ -136,7 +144,7 @@
 			(parallel [(match_operand:SI 4 "immediate_operand" "i")]))
 		      (match_operand:<crypto_mode> 3 "register_operand" "w")]
 	 CRYPTO_SELECTING))]
-  "TARGET_CRYPTO"
+  "TARGET_CRYPTO && INTVAL (operands[4]) == NEON_ENDIAN_LANE_N (V2SImode, 0)"
   "<crypto_pattern>.<V_sz_elem>\\t%q0, %q2, %q3"
   [(set_attr "type" "<crypto_type>")]
 )
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 0f47604da85..7bf322fc182 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,19 @@
+2019-07-18  Sylvia Taylor  <sylvia.taylor@arm.com>
+
+	PR target/90317
+	* gcc.target/arm/crypto-vsha1cq_u32.c (foo): Change return type to
+	uint32_t.
+	(GET_LANE, TEST_SHA1C_VEC_SELECT): New.
+	* gcc.target/arm/crypto-vsha1h_u32.c (foo): Change return type to
+	uint32_t.
+	(GET_LANE, TEST_SHA1H_VEC_SELECT): New.
+	* gcc.target/arm/crypto-vsha1mq_u32.c (foo): Change return type to
+	uint32_t.
+	(GET_LANE, TEST_SHA1M_VEC_SELECT): New.
+	* gcc.target/arm/crypto-vsha1pq_u32.c (foo): Change return type to
+	uint32_t.
+	(GET_LANE, TEST_SHA1P_VEC_SELECT): New.
+
 2019-07-18  Jan Hubicka  <hubicka@ucw.cz>
 
 	* g++.dg/lto/alias-5_0.C: New testcase.
diff --git a/gcc/testsuite/gcc.target/arm/crypto-vsha1cq_u32.c b/gcc/testsuite/gcc.target/arm/crypto-vsha1cq_u32.c
index 4dc9dee6617..41f97a74d6f 100644
--- a/gcc/testsuite/gcc.target/arm/crypto-vsha1cq_u32.c
+++ b/gcc/testsuite/gcc.target/arm/crypto-vsha1cq_u32.c
@@ -1,11 +1,12 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target arm_crypto_ok } */
 /* { dg-add-options arm_crypto } */
+/* { dg-additional-options "-O3" } */
 
 #include "arm_neon.h"
 
-int
-foo (void)
+uint32_t foo (void)
+
 {
   uint32_t hash = 0xdeadbeef;
   uint32x4_t a = {0, 1, 2, 3};
@@ -15,4 +16,20 @@ foo (void)
   return res[0];
 }
 
-/* { dg-final { scan-assembler "sha1c.32\tq\[0-9\]+, q\[0-9\]+" } } */
+#define GET_LANE(lane) \
+  uint32x4_t foo_lane##lane (uint32x4_t val,uint32x4_t a, uint32x4_t b)\
+  { \
+    return vsha1cq_u32 (a, vgetq_lane_u32 (val, lane), b); \
+  }
+
+#define TEST_SHA1C_VEC_SELECT(FUNC) \
+  FUNC (0) \
+  FUNC (1) \
+  FUNC (2) \
+  FUNC (3) \
+
+TEST_SHA1C_VEC_SELECT (GET_LANE)
+
+/* { dg-final { scan-assembler-times {sha1c.32\tq[0-9]+, q[0-9]+} 5 } } */
+/* { dg-final { scan-assembler-times {vdup.32\tq[0-9]+, r[0-9]+} 3 } } */
+/* { dg-final { scan-assembler-times {vmov.32\tr[0-9]+, d[0-9]+\[[0-9]+\]+} 4 } } */
diff --git a/gcc/testsuite/gcc.target/arm/crypto-vsha1h_u32.c b/gcc/testsuite/gcc.target/arm/crypto-vsha1h_u32.c
index dee27748524..b2846675a27 100644
--- a/gcc/testsuite/gcc.target/arm/crypto-vsha1h_u32.c
+++ b/gcc/testsuite/gcc.target/arm/crypto-vsha1h_u32.c
@@ -1,14 +1,31 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target arm_crypto_ok } */
 /* { dg-add-options arm_crypto } */
+/* { dg-additional-options "-O3" } */
 
 #include "arm_neon.h"
 
-int
-foo (void)
+uint32_t foo (void)
+
 {
   uint32_t val = 0xdeadbeef;
   return vsha1h_u32 (val);
 }
 
-/* { dg-final { scan-assembler "sha1h.32\tq\[0-9\]+, q\[0-9\]+" } } */
+#define GET_LANE(lane) \
+  uint32_t foo_lane##lane (uint32x4_t val) \
+  { \
+    return vsha1h_u32 (vgetq_lane_u32 (val, lane)); \
+  }
+
+#define TEST_SHA1H_VEC_SELECT(FUNC) \
+  FUNC (0) \
+  FUNC (1) \
+  FUNC (2) \
+  FUNC (3) \
+
+TEST_SHA1H_VEC_SELECT (GET_LANE)
+
+/* { dg-final { scan-assembler-times {sha1h.32\tq[0-9]+, q[0-9]+} 5 } } */
+/* { dg-final { scan-assembler-times {vdup.32\tq[0-9]+, r[0-9]+} 3 } } */
+/* { dg-final { scan-assembler-times {vmov.32\tr[0-9]+, d[0-9]+\[[0-9]+\]+} 8 } } */
diff --git a/gcc/testsuite/gcc.target/arm/crypto-vsha1mq_u32.c b/gcc/testsuite/gcc.target/arm/crypto-vsha1mq_u32.c
index 672b93a9747..676e64ce779 100644
--- a/gcc/testsuite/gcc.target/arm/crypto-vsha1mq_u32.c
+++ b/gcc/testsuite/gcc.target/arm/crypto-vsha1mq_u32.c
@@ -1,11 +1,12 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target arm_crypto_ok } */
 /* { dg-add-options arm_crypto } */
+/* { dg-additional-options "-O3" } */
 
 #include "arm_neon.h"
 
-int
-foo (void)
+uint32_t foo (void)
+
 {
   uint32_t hash = 0xdeadbeef;
   uint32x4_t a = {0, 1, 2, 3};
@@ -15,4 +16,20 @@ foo (void)
   return res[0];
 }
 
-/* { dg-final { scan-assembler "sha1m.32\tq\[0-9\]+, q\[0-9\]+" } } */
+#define GET_LANE(lane) \
+  uint32x4_t foo_lane##lane (uint32x4_t val,uint32x4_t a, uint32x4_t b)\
+  { \
+    return vsha1mq_u32 (a, vgetq_lane_u32 (val, lane), b); \
+  }
+
+#define TEST_SHA1M_VEC_SELECT(FUNC) \
+  FUNC (0) \
+  FUNC (1) \
+  FUNC (2) \
+  FUNC (3) \
+
+TEST_SHA1M_VEC_SELECT (GET_LANE)
+
+/* { dg-final { scan-assembler-times {sha1m.32\tq[0-9]+, q[0-9]+} 5 } } */
+/* { dg-final { scan-assembler-times {vdup.32\tq[0-9]+, r[0-9]+} 3 } } */
+/* { dg-final { scan-assembler-times {vmov.32\tr[0-9]+, d[0-9]+\[[0-9]+\]+} 4 } } */
diff --git a/gcc/testsuite/gcc.target/arm/crypto-vsha1pq_u32.c b/gcc/testsuite/gcc.target/arm/crypto-vsha1pq_u32.c
index ff508e0dc7f..ed10fe265ba 100644
--- a/gcc/testsuite/gcc.target/arm/crypto-vsha1pq_u32.c
+++ b/gcc/testsuite/gcc.target/arm/crypto-vsha1pq_u32.c
@@ -1,11 +1,12 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target arm_crypto_ok } */
 /* { dg-add-options arm_crypto } */
+/* { dg-additional-options "-O3" } */
 
 #include "arm_neon.h"
 
-int
-foo (void)
+uint32_t foo (void)
+
 {
   uint32_t hash = 0xdeadbeef;
   uint32x4_t a = {0, 1, 2, 3};
@@ -15,4 +16,20 @@ foo (void)
   return res[0];
 }
 
-/* { dg-final { scan-assembler "sha1p.32\tq\[0-9\]+, q\[0-9\]+" } } */
+#define GET_LANE(lane) \
+  uint32x4_t foo_lane##lane (uint32x4_t val,uint32x4_t a, uint32x4_t b)\
+  { \
+    return vsha1pq_u32 (a, vgetq_lane_u32 (val, lane), b); \
+  }
+
+#define TEST_SHA1P_VEC_SELECT(FUNC) \
+  FUNC (0) \
+  FUNC (1) \
+  FUNC (2) \
+  FUNC (3) \
+
+TEST_SHA1P_VEC_SELECT (GET_LANE)
+
+/* { dg-final { scan-assembler-times {sha1p.32\tq[0-9]+, q[0-9]+} 5 } } */
+/* { dg-final { scan-assembler-times {vdup.32\tq[0-9]+, r[0-9]+} 3 } } */
+/* { dg-final { scan-assembler-times {vmov.32\tr[0-9]+, d[0-9]+\[[0-9]+\]+} 4 } } */
-- 
2.30.2
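
Usage sketch (illustration only, not part of the patch; the function
names are hypothetical).  Compiled with the crypto extensions enabled
(e.g. the options added by dg-add-options arm_crypto) at -O2 or higher,
a lane-0 extract feeding vsha1h_u32 is expected to fold into the
sha1h.32 instruction itself, while a non-zero lane is first moved out
of the vector (e.g. with vmov.32), as the scan-assembler checks in the
updated tests verify:

  #include <arm_neon.h>

  /* Lane 0: the vec_select satisfies the corrected pattern condition,
     so the compiler may emit sha1h.32 directly on the input q
     register.  */
  uint32_t
  sha1h_lane0 (uint32x4_t val)
  {
    return vsha1h_u32 (vgetq_lane_u32 (val, 0));
  }

  /* Lane 2: the lane index is not 0, so the lane value is moved to a
     core register and broadcast back into a q register (vmov.32 and
     vdup.32 in the tests) before the sha1h.32.  */
  uint32_t
  sha1h_lane2 (uint32x4_t val)
  {
    return vsha1h_u32 (vgetq_lane_u32 (val, 2));
  }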