From: Sylvia Taylor Date: Tue, 9 Jul 2019 09:28:09 +0000 (+0000) Subject: [arm]: redefine aes patterns X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=4c12dc0556605578f4f35c4be27cff9fef28877b;p=gcc.git [arm]: redefine aes patterns This patch removes the arch-common aese/aesmc and aesd/aesimc fusions (i.e. aes fusion) implemented in the scheduling phase through the aarch_crypto_can_dual function. The reason is due to observing undesired behaviour in cases such as: - when register allocation goes bad (e.g. extra movs) - aes operations with xor and zeroed keys among interleaved operations A more stable version should be provided by instead doing the aes fusion during the combine pass. As such, new combine patterns have been added to enable this. The second change is the aese and aesd patterns have been rewritten as encapsulating a xor operation. The purpose is to simplify the need of having additional combine patterns for cases like the ones below: For AESE (though it also applies to AESD as both have a xor operation): data = data ^ key; data = vaeseq_u8(data, zero); --- veor q1, q0, q1 aese.8 q1, q9 Should mean and generate the same as: data = vaeseq_u8(data, key); --- aese.8 q1, q0 2019-07-09 Sylvia Taylor * config/arm/crypto.md: (crypto_): Redefine aese/aesd pattern with xor. (crypto_): Remove attribute enabled for aesmc. (crypto_): Split CRYPTO_BINARY into 2 patterns. (*aarch32_crypto_aese_fused, *aarch32_crypto_aesd_fused): New. * config/arm/arm.c (aarch_macro_fusion_pair_p): Remove aes/aesmc fusion check. * config/arm/aarch-common-protos.h (aarch_crypto_can_dual_issue): Remove. * config/arm/aarch-common.c (aarch_crypto_can_dual_issue): Likewise. * config/arm/exynos-m1.md: Remove aese/aesmc fusion. * config/arm/cortex-a53.md: Likewise. * config/arm/cortex-a57.md: Likewise. * config/arm/iterators.md: (CRYPTO_BINARY): Redefine. (CRYPTO_UNARY): Removed. (CRYPTO_AES, CRYPTO_AESMC): New. * gcc.target/arm/aes-fuse-1.c: New. * gcc.target/arm/aes-fuse-2.c: New. * gcc.target/arm/aes_xor_combine.c: New. From-SVN: r273296 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 5072a03e56b..21d7c1c8498 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,24 @@ +2019-07-09 Sylvia Taylor + + * config/arm/crypto.md: + (crypto_): Redefine aese/aesd pattern with xor. + (crypto_): Remove attribute enabled for aesmc. + (crypto_): Split CRYPTO_BINARY into 2 patterns. + (*aarch32_crypto_aese_fused, *aarch32_crypto_aesd_fused): New. + * config/arm/arm.c + (aarch_macro_fusion_pair_p): Remove aes/aesmc fusion check. + * config/arm/aarch-common-protos.h + (aarch_crypto_can_dual_issue): Remove. + * config/arm/aarch-common.c + (aarch_crypto_can_dual_issue): Likewise. + * config/arm/exynos-m1.md: Remove aese/aesmc fusion. + * config/arm/cortex-a53.md: Likewise. + * config/arm/cortex-a57.md: Likewise. + * config/arm/iterators.md: + (CRYPTO_BINARY): Redefine. + (CRYPTO_UNARY): Removed. + (CRYPTO_AES, CRYPTO_AESMC): New. + 2019-07-09 Richard Biener * tree-ssa-sccvn.c (struct vn_walk_cb_data): Add orig_ref member. diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h index 11cd5145bbc..3bf38a104f6 100644 --- a/gcc/config/arm/aarch-common-protos.h +++ b/gcc/config/arm/aarch-common-protos.h @@ -24,7 +24,6 @@ #define GCC_AARCH_COMMON_PROTOS_H extern int aarch_accumulator_forwarding (rtx_insn *, rtx_insn *); -extern int aarch_crypto_can_dual_issue (rtx_insn *, rtx_insn *); extern bool aarch_rev16_p (rtx); extern bool aarch_rev16_shleft_mask_imm_p (rtx, machine_mode); extern bool aarch_rev16_shright_mask_imm_p (rtx, machine_mode); diff --git a/gcc/config/arm/aarch-common.c b/gcc/config/arm/aarch-common.c index c7af12d4cd1..965a07a43e3 100644 --- a/gcc/config/arm/aarch-common.c +++ b/gcc/config/arm/aarch-common.c @@ -31,46 +31,6 @@ #include "rtl-iter.h" #include "memmodel.h" -/* In ARMv8-A there's a general expectation that AESE/AESMC - and AESD/AESIMC sequences of the form: - - AESE Vn, _ - AESMC Vn, Vn - - will issue both instructions in a single cycle on super-scalar - implementations. This function identifies such pairs. */ - -int -aarch_crypto_can_dual_issue (rtx_insn *producer_insn, rtx_insn *consumer_insn) -{ - rtx producer_set, consumer_set; - rtx producer_src, consumer_src; - - producer_set = single_set (producer_insn); - consumer_set = single_set (consumer_insn); - - producer_src = producer_set ? SET_SRC (producer_set) : NULL; - consumer_src = consumer_set ? SET_SRC (consumer_set) : NULL; - - if (producer_src && consumer_src - && GET_CODE (producer_src) == UNSPEC && GET_CODE (consumer_src) == UNSPEC - && ((XINT (producer_src, 1) == UNSPEC_AESE - && XINT (consumer_src, 1) == UNSPEC_AESMC) - || (XINT (producer_src, 1) == UNSPEC_AESD - && XINT (consumer_src, 1) == UNSPEC_AESIMC))) - { - unsigned int regno = REGNO (SET_DEST (producer_set)); - - /* Before reload the registers are virtual, so the destination of - consumer_set doesn't need to match. */ - - return (REGNO (SET_DEST (consumer_set)) == regno || !reload_completed) - && REGNO (XVECEXP (consumer_src, 0, 0)) == regno; - } - - return 0; -} - /* Return TRUE if X is either an arithmetic shift left, or is a multiplication by a power of two. */ bool diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 820502ac8a0..f42a7b1d4ed 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -30606,10 +30606,6 @@ aarch_macro_fusion_pair_p (rtx_insn* prev, rtx_insn* curr) if (!arm_macro_fusion_p ()) return false; - if (current_tune->fusible_ops & tune_params::FUSE_AES_AESMC - && aarch_crypto_can_dual_issue (prev, curr)) - return true; - if (current_tune->fusible_ops & tune_params::FUSE_MOVW_MOVT && arm_sets_movw_movt_fusible_p (prev_set, curr_set)) return true; diff --git a/gcc/config/arm/cortex-a57.md b/gcc/config/arm/cortex-a57.md index a5f0b10d142..2d96a9cdd5a 100644 --- a/gcc/config/arm/cortex-a57.md +++ b/gcc/config/arm/cortex-a57.md @@ -801,9 +801,3 @@ ;; help. (define_bypass 1 "cortex_a57_*" "cortex_a57_call,cortex_a57_branch") - -;; AESE+AESMC and AESD+AESIMC pairs forward with zero latency -(define_bypass 0 "cortex_a57_crypto_simple" - "cortex_a57_crypto_simple" - "aarch_crypto_can_dual_issue") - diff --git a/gcc/config/arm/crypto.md b/gcc/config/arm/crypto.md index 63d9d9ffa42..bf34f69fc75 100644 --- a/gcc/config/arm/crypto.md +++ b/gcc/config/arm/crypto.md @@ -19,33 +19,76 @@ ;; . -;; When AES/AESMC fusion is enabled we want the register allocation to -;; look like: -;; AESE Vn, _ -;; AESMC Vn, Vn -;; So prefer to tie operand 1 to operand 0 when fusing. - (define_insn "crypto_" - [(set (match_operand: 0 "register_operand" "=w,w") - (unspec: [(match_operand: 1 - "register_operand" "0,w")] - CRYPTO_UNARY))] + [(set (match_operand: 0 "register_operand" "=w") + (unspec: + [(match_operand: 1 "register_operand" "w")] + CRYPTO_AESMC))] "TARGET_CRYPTO" ".\\t%q0, %q1" - [(set_attr "type" "") - (set_attr_alternative "enabled" - [(if_then_else (match_test - "arm_fusion_enabled_p (tune_params::FUSE_AES_AESMC)") - (const_string "yes" ) - (const_string "no")) - (const_string "yes")])] + [(set_attr "type" "")] +) + +(define_insn "crypto_" + [(set (match_operand:V16QI 0 "register_operand" "=w") + (unspec:V16QI + [(xor:V16QI + (match_operand:V16QI 1 "register_operand" "%0") + (match_operand:V16QI 2 "register_operand" "w"))] + CRYPTO_AES))] + "TARGET_CRYPTO" + ".\\t%q0, %q2" + [(set_attr "type" "")] +) + +;; When AESE/AESMC fusion is enabled we really want to keep the two together +;; and enforce the register dependency without scheduling or register +;; allocation messing up the order or introducing moves inbetween. +;; Mash the two together during combine. + +(define_insn "*aarch32_crypto_aese_fused" + [(set (match_operand:V16QI 0 "register_operand" "=w") + (unspec:V16QI + [(unspec:V16QI + [(xor:V16QI + (match_operand:V16QI 1 "register_operand" "%0") + (match_operand:V16QI 2 "register_operand" "w"))] + UNSPEC_AESE)] + UNSPEC_AESMC))] + "TARGET_CRYPTO + && arm_fusion_enabled_p (tune_params::FUSE_AES_AESMC)" + "aese.8\\t%q0, %q2\;aesmc.8\\t%q0, %q0" + [(set_attr "type" "crypto_aese") + (set_attr "length" "8")] +) + +;; When AESD/AESIMC fusion is enabled we really want to keep the two together +;; and enforce the register dependency without scheduling or register +;; allocation messing up the order or introducing moves inbetween. +;; Mash the two together during combine. + +(define_insn "*aarch32_crypto_aesd_fused" + [(set (match_operand:V16QI 0 "register_operand" "=w") + (unspec:V16QI + [(unspec:V16QI + [(xor:V16QI + (match_operand:V16QI 1 "register_operand" "%0") + (match_operand:V16QI 2 "register_operand" "w"))] + UNSPEC_AESD)] + UNSPEC_AESIMC))] + "TARGET_CRYPTO + && arm_fusion_enabled_p (tune_params::FUSE_AES_AESMC)" + "aesd.8\\t%q0, %q2\;aesimc.8\\t%q0, %q0" + [(set_attr "type" "crypto_aese") + (set_attr "length" "8")] ) (define_insn "crypto_" [(set (match_operand: 0 "register_operand" "=w") - (unspec: [(match_operand: 1 "register_operand" "0") - (match_operand: 2 "register_operand" "w")] - CRYPTO_BINARY))] + (unspec: + [(match_operand: 1 "register_operand" "0") + (match_operand: 2 "register_operand" "w")] + CRYPTO_BINARY))] "TARGET_CRYPTO" ".\\t%q0, %q2" [(set_attr "type" "")] diff --git a/gcc/config/arm/exynos-m1.md b/gcc/config/arm/exynos-m1.md index 3d04a52ac3d..150ac85ebc7 100644 --- a/gcc/config/arm/exynos-m1.md +++ b/gcc/config/arm/exynos-m1.md @@ -950,11 +950,6 @@ "exynos_m1_crypto_simple, exynos_m1_crypto_complex,\ exynos_m1_crypto_poly*") -;; AES{D,E}/AESMC pairs can feed each other instantly. -(define_bypass 0 "exynos_m1_crypto_simple" - "exynos_m1_crypto_simple" - "aarch_crypto_can_dual_issue") - ;; Predicted branches take no time, but mispredicted ones take forever anyway. (define_bypass 1 "exynos_m1_*" "exynos_m1_call, exynos_m1_branch") diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 2462b8c87ea..e03a7202417 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -413,10 +413,11 @@ (define_int_iterator CRC [UNSPEC_CRC32B UNSPEC_CRC32H UNSPEC_CRC32W UNSPEC_CRC32CB UNSPEC_CRC32CH UNSPEC_CRC32CW]) -(define_int_iterator CRYPTO_UNARY [UNSPEC_AESMC UNSPEC_AESIMC]) +(define_int_iterator CRYPTO_AESMC [UNSPEC_AESMC UNSPEC_AESIMC]) -(define_int_iterator CRYPTO_BINARY [UNSPEC_AESD UNSPEC_AESE - UNSPEC_SHA1SU1 UNSPEC_SHA256SU0]) +(define_int_iterator CRYPTO_AES [UNSPEC_AESD UNSPEC_AESE]) + +(define_int_iterator CRYPTO_BINARY [UNSPEC_SHA1SU1 UNSPEC_SHA256SU0]) (define_int_iterator CRYPTO_TERNARY [UNSPEC_SHA1SU0 UNSPEC_SHA256H UNSPEC_SHA256H2 UNSPEC_SHA256SU1]) diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index ce78f7781f5..995076b34d1 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,9 @@ +2019-07-09 Sylvia Taylor + + * gcc.target/arm/aes-fuse-1.c: New. + * gcc.target/arm/aes-fuse-2.c: New. + * gcc.target/arm/aes_xor_combine.c: New. + 2019-07-09 Martin Liska * gcc.dg/predict-17.c: Test loop optimizer assumption diff --git a/gcc/testsuite/gcc.target/arm/aes-fuse-1.c b/gcc/testsuite/gcc.target/arm/aes-fuse-1.c new file mode 100644 index 00000000000..27b08aeef7b --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/aes-fuse-1.c @@ -0,0 +1,66 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_crypto_ok } */ +/* { dg-add-options arm_crypto } */ +/* { dg-additional-options "-mcpu=cortex-a72 -O3 -dp" } */ + +#include + +#define AESE(r, v, key) (r = vaeseq_u8 ((v), (key))); +#define AESMC(r, i) (r = vaesmcq_u8 (i)) + +const uint8x16_t zero = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + +uint8x16_t dummy; +uint8x16_t a; +uint8x16_t b; +uint8x16_t c; +uint8x16_t d; +uint8x16_t x; +uint8x16_t y; +uint8x16_t k; + +void foo (void) +{ + AESE (a, a, k); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESE (b, b, k); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESE (c, c, k); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESE (d, d, k); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + + x = x ^ k; + AESE (x, x, zero); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + y = y ^ k; + AESE (y, y, zero); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + + AESMC (d, d); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESMC (c, c); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESMC (b, b); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESMC (a, a); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + + AESMC (y, y); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESMC (x, x); +} + +/* { dg-final { scan-assembler-times "crypto_aese_fused" 6 } } */ +/* { dg-final { scan-assembler-not "veor" } } */ diff --git a/gcc/testsuite/gcc.target/arm/aes-fuse-2.c b/gcc/testsuite/gcc.target/arm/aes-fuse-2.c new file mode 100644 index 00000000000..1266a287531 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/aes-fuse-2.c @@ -0,0 +1,66 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_crypto_ok } */ +/* { dg-add-options arm_crypto } */ +/* { dg-additional-options "-mcpu=cortex-a72 -O3 -dp" } */ + +#include + +#define AESD(r, v, key) (r = vaesdq_u8 ((v), (key))); +#define AESIMC(r, i) (r = vaesimcq_u8 (i)) + +const uint8x16_t zero = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + +uint8x16_t dummy; +uint8x16_t a; +uint8x16_t b; +uint8x16_t c; +uint8x16_t d; +uint8x16_t x; +uint8x16_t y; +uint8x16_t k; + +void foo (void) +{ + AESD (a, a, k); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESD (b, b, k); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESD (c, c, k); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESD (d, d, k); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + + x = x ^ k; + AESD (x, x, zero); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + y = y ^ k; + AESD (y, y, zero); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + + AESIMC (d, d); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESIMC (c, c); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESIMC (b, b); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESIMC (a, a); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + + AESIMC (y, y); + dummy = vaddq_u8 (dummy, dummy); + dummy = vaddq_u8 (dummy, dummy); + AESIMC (x, x); +} + +/* { dg-final { scan-assembler-times "crypto_aesd_fused" 6 } } */ +/* { dg-final { scan-assembler-not "veor" } } */ diff --git a/gcc/testsuite/gcc.target/arm/aes_xor_combine.c b/gcc/testsuite/gcc.target/arm/aes_xor_combine.c new file mode 100644 index 00000000000..17ae1c53e4a --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/aes_xor_combine.c @@ -0,0 +1,43 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_crypto_ok } */ +/* { dg-add-options arm_crypto } */ +/* { dg-additional-options "-O3" } */ + +#include + +#define AESE(r, v, key) (r = vaeseq_u8 ((v), (key))); +#define AESD(r, v, key) (r = vaesdq_u8 ((v), (key))); + +const uint8x16_t zero = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + +uint8x16_t foo_key_0 (uint8x16_t dummy, uint8x16_t foo, uint8x16_t bar) +{ + dummy = dummy ^ foo; + AESE(dummy, dummy, zero); + dummy = dummy ^ bar; + AESE(dummy, dummy, zero); + + dummy = dummy ^ foo; + AESD(dummy, dummy, zero); + dummy = dummy ^ bar; + AESD(dummy, dummy, zero); + + return dummy; +} + +uint8x16_t foo_data_0 (uint8x16_t dummy, uint8x16_t foo, uint8x16_t bar) +{ + dummy = dummy ^ foo; + AESE(dummy, zero, dummy); + dummy = dummy ^ bar; + AESE(dummy, zero, dummy); + + dummy = dummy ^ foo; + AESD(dummy, zero, dummy); + dummy = dummy ^ bar; + AESD(dummy, zero, dummy); + + return dummy; +} + +/* { dg-final { scan-assembler-not "veor" } } */