+2019-07-09 Sylvia Taylor <sylvia.taylor@arm.com>
+
+ * config/arm/crypto.md:
+ (crypto_<crypto_pattern>): Redefine aese/aesd pattern with xor.
+ (crypto_<crypto_pattern>): Remove "enabled" attribute for aesmc.
+ (crypto_<crypto_pattern>): Split CRYPTO_BINARY into two patterns.
+ (*aarch32_crypto_aese_fused, *aarch32_crypto_aesd_fused): New.
+ * config/arm/arm.c:
+ (aarch_macro_fusion_pair_p): Remove aes/aesmc fusion check.
+ * config/arm/aarch-common-protos.h:
+ (aarch_crypto_can_dual_issue): Remove.
+ * config/arm/aarch-common.c:
+ (aarch_crypto_can_dual_issue): Likewise.
+ * config/arm/exynos-m1.md: Remove aese/aesmc fusion.
+ * config/arm/cortex-a53.md: Likewise.
+ * config/arm/cortex-a57.md: Likewise.
+ * config/arm/iterators.md:
+ (CRYPTO_BINARY): Redefine.
+ (CRYPTO_UNARY): Remove.
+ (CRYPTO_AES, CRYPTO_AESMC): New.
+
2019-07-09 Richard Biener <rguenther@suse.de>
* tree-ssa-sccvn.c (struct vn_walk_cb_data): Add orig_ref member.
#define GCC_AARCH_COMMON_PROTOS_H
extern int aarch_accumulator_forwarding (rtx_insn *, rtx_insn *);
-extern int aarch_crypto_can_dual_issue (rtx_insn *, rtx_insn *);
extern bool aarch_rev16_p (rtx);
extern bool aarch_rev16_shleft_mask_imm_p (rtx, machine_mode);
extern bool aarch_rev16_shright_mask_imm_p (rtx, machine_mode);
#include "rtl-iter.h"
#include "memmodel.h"
-/* In ARMv8-A there's a general expectation that AESE/AESMC
- and AESD/AESIMC sequences of the form:
-
- AESE Vn, _
- AESMC Vn, Vn
-
- will issue both instructions in a single cycle on super-scalar
- implementations. This function identifies such pairs. */
-
-int
-aarch_crypto_can_dual_issue (rtx_insn *producer_insn, rtx_insn *consumer_insn)
-{
- rtx producer_set, consumer_set;
- rtx producer_src, consumer_src;
-
- producer_set = single_set (producer_insn);
- consumer_set = single_set (consumer_insn);
-
- producer_src = producer_set ? SET_SRC (producer_set) : NULL;
- consumer_src = consumer_set ? SET_SRC (consumer_set) : NULL;
-
- if (producer_src && consumer_src
- && GET_CODE (producer_src) == UNSPEC && GET_CODE (consumer_src) == UNSPEC
- && ((XINT (producer_src, 1) == UNSPEC_AESE
- && XINT (consumer_src, 1) == UNSPEC_AESMC)
- || (XINT (producer_src, 1) == UNSPEC_AESD
- && XINT (consumer_src, 1) == UNSPEC_AESIMC)))
- {
- unsigned int regno = REGNO (SET_DEST (producer_set));
-
- /* Before reload the registers are virtual, so the destination of
- consumer_set doesn't need to match. */
-
- return (REGNO (SET_DEST (consumer_set)) == regno || !reload_completed)
- && REGNO (XVECEXP (consumer_src, 0, 0)) == regno;
- }
-
- return 0;
-}
-
/* Return TRUE if X is either an arithmetic shift left, or
is a multiplication by a power of two. */
bool
if (!arm_macro_fusion_p ())
return false;
- if (current_tune->fusible_ops & tune_params::FUSE_AES_AESMC
- && aarch_crypto_can_dual_issue (prev, curr))
- return true;
-
if (current_tune->fusible_ops & tune_params::FUSE_MOVW_MOVT
&& arm_sets_movw_movt_fusible_p (prev_set, curr_set))
return true;
;; help.
(define_bypass 1 "cortex_a57_*"
"cortex_a57_call,cortex_a57_branch")
-
-;; AESE+AESMC and AESD+AESIMC pairs forward with zero latency
-(define_bypass 0 "cortex_a57_crypto_simple"
- "cortex_a57_crypto_simple"
- "aarch_crypto_can_dual_issue")
-
;; <http://www.gnu.org/licenses/>.
-;; When AES/AESMC fusion is enabled we want the register allocation to
-;; look like:
-;; AESE Vn, _
-;; AESMC Vn, Vn
-;; So prefer to tie operand 1 to operand 0 when fusing.
-
(define_insn "crypto_<crypto_pattern>"
- [(set (match_operand:<crypto_mode> 0 "register_operand" "=w,w")
- (unspec:<crypto_mode> [(match_operand:<crypto_mode> 1
- "register_operand" "0,w")]
- CRYPTO_UNARY))]
+ [(set (match_operand:<crypto_mode> 0 "register_operand" "=w")
+ (unspec:<crypto_mode>
+ [(match_operand:<crypto_mode> 1 "register_operand" "w")]
+ CRYPTO_AESMC))]
"TARGET_CRYPTO"
"<crypto_pattern>.<crypto_size_sfx>\\t%q0, %q1"
- [(set_attr "type" "<crypto_type>")
- (set_attr_alternative "enabled"
- [(if_then_else (match_test
- "arm_fusion_enabled_p (tune_params::FUSE_AES_AESMC)")
- (const_string "yes" )
- (const_string "no"))
- (const_string "yes")])]
+ [(set_attr "type" "<crypto_type>")]
+)
+
+(define_insn "crypto_<crypto_pattern>"
+ [(set (match_operand:V16QI 0 "register_operand" "=w")
+ (unspec:V16QI
+ [(xor:V16QI
+ (match_operand:V16QI 1 "register_operand" "%0")
+ (match_operand:V16QI 2 "register_operand" "w"))]
+ CRYPTO_AES))]
+ "TARGET_CRYPTO"
+ "<crypto_pattern>.<crypto_size_sfx>\\t%q0, %q2"
+ [(set_attr "type" "<crypto_type>")]
+)
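
The xor wrapped inside the unspec above (together with the commutative "%0"
constraint on operand 1) is what lets combine fold an explicit veor of data
and key into the AES instruction itself: AESE and AESD begin by xoring their
two operands, so xoring the key first and passing a zero key is equivalent to
passing the key directly.  A minimal C sketch of the source shape this
targets, mirroring the aes_xor_combine.c test added below (illustrative only,
not part of the diff; the function name is made up for the example):

#include <arm_neon.h>

/* Sketch: the explicit xor below should be absorbed into the aese, so no
   veor remains in the generated code at -O3 with crypto enabled.  */
uint8x16_t
fold_key_into_aese (uint8x16_t data, uint8x16_t key)
{
  const uint8x16_t zero = vdupq_n_u8 (0);
  data = data ^ key;              /* candidate veor ...  */
  return vaeseq_u8 (data, zero);  /* ... folded into the aese.  */
}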
+
+;; When AESE/AESMC fusion is enabled we really want to keep the two together
+;; and enforce the register dependency without scheduling or register
+;; allocation messing up the order or introducing moves in between.
+;; Mash the two together during combine.
+
+(define_insn "*aarch32_crypto_aese_fused"
+ [(set (match_operand:V16QI 0 "register_operand" "=w")
+ (unspec:V16QI
+ [(unspec:V16QI
+ [(xor:V16QI
+ (match_operand:V16QI 1 "register_operand" "%0")
+ (match_operand:V16QI 2 "register_operand" "w"))]
+ UNSPEC_AESE)]
+ UNSPEC_AESMC))]
+ "TARGET_CRYPTO
+ && arm_fusion_enabled_p (tune_params::FUSE_AES_AESMC)"
+ "aese.8\\t%q0, %q2\;aesmc.8\\t%q0, %q0"
+ [(set_attr "type" "crypto_aese")
+ (set_attr "length" "8")]
+)
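
The fused pattern above only matches once combine has merged an aese with the
aesmc that consumes its result, which is exactly the shape produced by
back-to-back intrinsics on the same value, as in the aes-fuse-1.c test added
below.  A minimal sketch (illustrative only, not part of the diff; the
function name is made up for the example):

#include <arm_neon.h>

/* Sketch: aese feeding aesmc on the same value; with a fusion-enabled tuning
   (e.g. -mcpu=cortex-a72) the pair is emitted back to back as a single insn
   by the fused pattern above.  */
uint8x16_t
aes_round_step (uint8x16_t state, uint8x16_t key)
{
  state = vaeseq_u8 (state, key);  /* aese.8 ...                       */
  return vaesmcq_u8 (state);       /* ... then aesmc.8 on the result.  */
}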
+
+;; When AESD/AESIMC fusion is enabled we really want to keep the two together
+;; and enforce the register dependency without scheduling or register
+;; allocation messing up the order or introducing moves in between.
+;; Mash the two together during combine.
+
+(define_insn "*aarch32_crypto_aesd_fused"
+ [(set (match_operand:V16QI 0 "register_operand" "=w")
+ (unspec:V16QI
+ [(unspec:V16QI
+ [(xor:V16QI
+ (match_operand:V16QI 1 "register_operand" "%0")
+ (match_operand:V16QI 2 "register_operand" "w"))]
+ UNSPEC_AESD)]
+ UNSPEC_AESIMC))]
+ "TARGET_CRYPTO
+ && arm_fusion_enabled_p (tune_params::FUSE_AES_AESMC)"
+ "aesd.8\\t%q0, %q2\;aesimc.8\\t%q0, %q0"
+ [(set_attr "type" "crypto_aese")
+ (set_attr "length" "8")]
)
(define_insn "crypto_<crypto_pattern>"
[(set (match_operand:<crypto_mode> 0 "register_operand" "=w")
- (unspec:<crypto_mode> [(match_operand:<crypto_mode> 1 "register_operand" "0")
- (match_operand:<crypto_mode> 2 "register_operand" "w")]
- CRYPTO_BINARY))]
+ (unspec:<crypto_mode>
+ [(match_operand:<crypto_mode> 1 "register_operand" "0")
+ (match_operand:<crypto_mode> 2 "register_operand" "w")]
+ CRYPTO_BINARY))]
"TARGET_CRYPTO"
"<crypto_pattern>.<crypto_size_sfx>\\t%q0, %q2"
[(set_attr "type" "<crypto_type>")]
"exynos_m1_crypto_simple, exynos_m1_crypto_complex,\
exynos_m1_crypto_poly*")
-;; AES{D,E}/AESMC pairs can feed each other instantly.
-(define_bypass 0 "exynos_m1_crypto_simple"
- "exynos_m1_crypto_simple"
- "aarch_crypto_can_dual_issue")
-
;; Predicted branches take no time, but mispredicted ones take forever anyway.
(define_bypass 1 "exynos_m1_*"
"exynos_m1_call, exynos_m1_branch")
(define_int_iterator CRC [UNSPEC_CRC32B UNSPEC_CRC32H UNSPEC_CRC32W
UNSPEC_CRC32CB UNSPEC_CRC32CH UNSPEC_CRC32CW])
-(define_int_iterator CRYPTO_UNARY [UNSPEC_AESMC UNSPEC_AESIMC])
+(define_int_iterator CRYPTO_AESMC [UNSPEC_AESMC UNSPEC_AESIMC])
-(define_int_iterator CRYPTO_BINARY [UNSPEC_AESD UNSPEC_AESE
- UNSPEC_SHA1SU1 UNSPEC_SHA256SU0])
+(define_int_iterator CRYPTO_AES [UNSPEC_AESD UNSPEC_AESE])
+
+(define_int_iterator CRYPTO_BINARY [UNSPEC_SHA1SU1 UNSPEC_SHA256SU0])
(define_int_iterator CRYPTO_TERNARY [UNSPEC_SHA1SU0 UNSPEC_SHA256H
UNSPEC_SHA256H2 UNSPEC_SHA256SU1])
+2019-07-09 Sylvia Taylor <sylvia.taylor@arm.com>
+
+ * gcc.target/arm/aes-fuse-1.c: New.
+ * gcc.target/arm/aes-fuse-2.c: New.
+ * gcc.target/arm/aes_xor_combine.c: New.
+
2019-07-09 Martin Liska <mliska@suse.cz>
* gcc.dg/predict-17.c: Test loop optimizer assumption
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/aes-fuse-1.c
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_crypto_ok } */
+/* { dg-add-options arm_crypto } */
+/* { dg-additional-options "-mcpu=cortex-a72 -O3 -dp" } */
+
+#include <arm_neon.h>
+
+#define AESE(r, v, key) (r = vaeseq_u8 ((v), (key)));
+#define AESMC(r, i) (r = vaesmcq_u8 (i))
+
+const uint8x16_t zero = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+uint8x16_t dummy;
+uint8x16_t a;
+uint8x16_t b;
+uint8x16_t c;
+uint8x16_t d;
+uint8x16_t x;
+uint8x16_t y;
+uint8x16_t k;
+
+void foo (void)
+{
+ AESE (a, a, k);
+ dummy = vaddq_u8 (dummy, dummy);
+ dummy = vaddq_u8 (dummy, dummy);
+ AESE (b, b, k);
+ dummy = vaddq_u8 (dummy, dummy);
+ dummy = vaddq_u8 (dummy, dummy);
+ AESE (c, c, k);
+ dummy = vaddq_u8 (dummy, dummy);
+ dummy = vaddq_u8 (dummy, dummy);
+ AESE (d, d, k);
+ dummy = vaddq_u8 (dummy, dummy);
+ dummy = vaddq_u8 (dummy, dummy);
+
+ x = x ^ k;
+ AESE (x, x, zero);
+ dummy = vaddq_u8 (dummy, dummy);
+ dummy = vaddq_u8 (dummy, dummy);
+ y = y ^ k;
+ AESE (y, y, zero);
+ dummy = vaddq_u8 (dummy, dummy);
+ dummy = vaddq_u8 (dummy, dummy);
+
+ AESMC (d, d);
+ dummy = vaddq_u8 (dummy, dummy);
+ dummy = vaddq_u8 (dummy, dummy);
+ AESMC (c, c);
+ dummy = vaddq_u8 (dummy, dummy);
+ dummy = vaddq_u8 (dummy, dummy);
+ AESMC (b, b);
+ dummy = vaddq_u8 (dummy, dummy);
+ dummy = vaddq_u8 (dummy, dummy);
+ AESMC (a, a);
+ dummy = vaddq_u8 (dummy, dummy);
+ dummy = vaddq_u8 (dummy, dummy);
+
+ AESMC (y, y);
+ dummy = vaddq_u8 (dummy, dummy);
+ dummy = vaddq_u8 (dummy, dummy);
+ AESMC (x, x);
+}
+
+/* { dg-final { scan-assembler-times "crypto_aese_fused" 6 } } */
+/* { dg-final { scan-assembler-not "veor" } } */
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/aes-fuse-2.c
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_crypto_ok } */
+/* { dg-add-options arm_crypto } */
+/* { dg-additional-options "-mcpu=cortex-a72 -O3 -dp" } */
+
+#include <arm_neon.h>
+
+#define AESD(r, v, key) (r = vaesdq_u8 ((v), (key)));
+#define AESIMC(r, i) (r = vaesimcq_u8 (i))
+
+const uint8x16_t zero = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+uint8x16_t dummy;
+uint8x16_t a;
+uint8x16_t b;
+uint8x16_t c;
+uint8x16_t d;
+uint8x16_t x;
+uint8x16_t y;
+uint8x16_t k;
+
+void foo (void)
+{
+ AESD (a, a, k);
+ dummy = vaddq_u8 (dummy, dummy);
+ dummy = vaddq_u8 (dummy, dummy);
+ AESD (b, b, k);
+ dummy = vaddq_u8 (dummy, dummy);
+ dummy = vaddq_u8 (dummy, dummy);
+ AESD (c, c, k);
+ dummy = vaddq_u8 (dummy, dummy);
+ dummy = vaddq_u8 (dummy, dummy);
+ AESD (d, d, k);
+ dummy = vaddq_u8 (dummy, dummy);
+ dummy = vaddq_u8 (dummy, dummy);
+
+ x = x ^ k;
+ AESD (x, x, zero);
+ dummy = vaddq_u8 (dummy, dummy);
+ dummy = vaddq_u8 (dummy, dummy);
+ y = y ^ k;
+ AESD (y, y, zero);
+ dummy = vaddq_u8 (dummy, dummy);
+ dummy = vaddq_u8 (dummy, dummy);
+
+ AESIMC (d, d);
+ dummy = vaddq_u8 (dummy, dummy);
+ dummy = vaddq_u8 (dummy, dummy);
+ AESIMC (c, c);
+ dummy = vaddq_u8 (dummy, dummy);
+ dummy = vaddq_u8 (dummy, dummy);
+ AESIMC (b, b);
+ dummy = vaddq_u8 (dummy, dummy);
+ dummy = vaddq_u8 (dummy, dummy);
+ AESIMC (a, a);
+ dummy = vaddq_u8 (dummy, dummy);
+ dummy = vaddq_u8 (dummy, dummy);
+
+ AESIMC (y, y);
+ dummy = vaddq_u8 (dummy, dummy);
+ dummy = vaddq_u8 (dummy, dummy);
+ AESIMC (x, x);
+}
+
+/* { dg-final { scan-assembler-times "crypto_aesd_fused" 6 } } */
+/* { dg-final { scan-assembler-not "veor" } } */
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/aes_xor_combine.c
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_crypto_ok } */
+/* { dg-add-options arm_crypto } */
+/* { dg-additional-options "-O3" } */
+
+#include <arm_neon.h>
+
+#define AESE(r, v, key) (r = vaeseq_u8 ((v), (key)));
+#define AESD(r, v, key) (r = vaesdq_u8 ((v), (key)));
+
+const uint8x16_t zero = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+uint8x16_t foo_key_0 (uint8x16_t dummy, uint8x16_t foo, uint8x16_t bar)
+{
+ dummy = dummy ^ foo;
+ AESE(dummy, dummy, zero);
+ dummy = dummy ^ bar;
+ AESE(dummy, dummy, zero);
+
+ dummy = dummy ^ foo;
+ AESD(dummy, dummy, zero);
+ dummy = dummy ^ bar;
+ AESD(dummy, dummy, zero);
+
+ return dummy;
+}
+
+uint8x16_t foo_data_0 (uint8x16_t dummy, uint8x16_t foo, uint8x16_t bar)
+{
+ dummy = dummy ^ foo;
+ AESE(dummy, zero, dummy);
+ dummy = dummy ^ bar;
+ AESE(dummy, zero, dummy);
+
+ dummy = dummy ^ foo;
+ AESD(dummy, zero, dummy);
+ dummy = dummy ^ bar;
+ AESD(dummy, zero, dummy);
+
+ return dummy;
+}
+
+/* { dg-final { scan-assembler-not "veor" } } */