[arm]: redefine aes patterns
authorSylvia Taylor <sylvia.taylor@arm.com>
Tue, 9 Jul 2019 09:28:09 +0000 (09:28 +0000)
committerKyrylo Tkachov <ktkachov@gcc.gnu.org>
Tue, 9 Jul 2019 09:28:09 +0000 (09:28 +0000)
This patch removes the arch-common aese/aesmc and aesd/aesimc fusions
(i.e. aes fusion) implemented in the scheduling phase through the
aarch_crypto_can_dual function. The reason is due to observing
undesired behaviour in cases such as:
- when register allocation goes bad (e.g. extra movs)
- aes operations with xor and zeroed keys among interleaved operations

A more stable version should be provided by instead doing the aes fusion
during the combine pass. As such, new combine patterns have been added to
enable this.

The second change is the aese and aesd patterns have been rewritten as
encapsulating a xor operation. The purpose is to simplify the need of
having additional combine patterns for cases like the ones below:

For AESE (though it also applies to AESD as both have a xor operation):

    data = data ^ key;
    data = vaeseq_u8(data, zero);
    ---
    veor        q1, q0, q1
    aese.8      q1, q9

Should mean and generate the same as:

    data = vaeseq_u8(data, key);
    ---
    aese.8       q1, q0

2019-07-09  Sylvia Taylor  <sylvia.taylor@arm.com>

* config/arm/crypto.md:
(crypto_<crypto_pattern>): Redefine aese/aesd pattern with xor.
(crypto_<crypto_pattern>): Remove attribute enabled for aesmc.
(crypto_<crypto_pattern>): Split CRYPTO_BINARY into 2 patterns.
(*aarch32_crypto_aese_fused, *aarch32_crypto_aesd_fused): New.
* config/arm/arm.c
(aarch_macro_fusion_pair_p): Remove aes/aesmc fusion check.
* config/arm/aarch-common-protos.h
(aarch_crypto_can_dual_issue): Remove.
* config/arm/aarch-common.c
(aarch_crypto_can_dual_issue): Likewise.
* config/arm/exynos-m1.md: Remove aese/aesmc fusion.
* config/arm/cortex-a53.md: Likewise.
* config/arm/cortex-a57.md: Likewise.
* config/arm/iterators.md:
(CRYPTO_BINARY): Redefine.
(CRYPTO_UNARY): Removed.
(CRYPTO_AES, CRYPTO_AESMC): New.

* gcc.target/arm/aes-fuse-1.c: New.
* gcc.target/arm/aes-fuse-2.c: New.
* gcc.target/arm/aes_xor_combine.c: New.

From-SVN: r273296

12 files changed:
gcc/ChangeLog
gcc/config/arm/aarch-common-protos.h
gcc/config/arm/aarch-common.c
gcc/config/arm/arm.c
gcc/config/arm/cortex-a57.md
gcc/config/arm/crypto.md
gcc/config/arm/exynos-m1.md
gcc/config/arm/iterators.md
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.target/arm/aes-fuse-1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/arm/aes-fuse-2.c [new file with mode: 0644]
gcc/testsuite/gcc.target/arm/aes_xor_combine.c [new file with mode: 0644]

index 5072a03e56bf06e5196f4d6f1af24209c9dc3331..21d7c1c8498fd9d82265ced71f9ea38ee3c2ec59 100644 (file)
@@ -1,3 +1,24 @@
+2019-07-09  Sylvia Taylor  <sylvia.taylor@arm.com>
+
+       * config/arm/crypto.md:
+       (crypto_<crypto_pattern>): Redefine aese/aesd pattern with xor.
+       (crypto_<crypto_pattern>): Remove attribute enabled for aesmc.
+       (crypto_<crypto_pattern>): Split CRYPTO_BINARY into 2 patterns.
+       (*aarch32_crypto_aese_fused, *aarch32_crypto_aesd_fused): New.
+       * config/arm/arm.c
+       (aarch_macro_fusion_pair_p): Remove aes/aesmc fusion check.
+       * config/arm/aarch-common-protos.h
+       (aarch_crypto_can_dual_issue): Remove.
+       * config/arm/aarch-common.c
+       (aarch_crypto_can_dual_issue): Likewise.
+       * config/arm/exynos-m1.md: Remove aese/aesmc fusion.
+       * config/arm/cortex-a53.md: Likewise.
+       * config/arm/cortex-a57.md: Likewise.
+       * config/arm/iterators.md:
+       (CRYPTO_BINARY): Redefine.
+       (CRYPTO_UNARY): Removed.
+       (CRYPTO_AES, CRYPTO_AESMC): New.
+
 2019-07-09  Richard Biener  <rguenther@suse.de>
 
        * tree-ssa-sccvn.c (struct vn_walk_cb_data): Add orig_ref member.
index 11cd5145bbc77ab35e7874a75a93ec0e7bb0ea28..3bf38a104f6941eec1ce88db7d6b6ceb7da0af92 100644 (file)
@@ -24,7 +24,6 @@
 #define GCC_AARCH_COMMON_PROTOS_H
 
 extern int aarch_accumulator_forwarding (rtx_insn *, rtx_insn *);
-extern int aarch_crypto_can_dual_issue (rtx_insn *, rtx_insn *);
 extern bool aarch_rev16_p (rtx);
 extern bool aarch_rev16_shleft_mask_imm_p (rtx, machine_mode);
 extern bool aarch_rev16_shright_mask_imm_p (rtx, machine_mode);
index c7af12d4cd1714c70ebc6d6c7d4454606d15f864..965a07a43e3129dd1743d4a79813a597feca0b71 100644 (file)
 #include "rtl-iter.h"
 #include "memmodel.h"
 
-/* In ARMv8-A there's a general expectation that AESE/AESMC
-   and AESD/AESIMC sequences of the form:
-
-   AESE Vn, _
-   AESMC Vn, Vn
-
-   will issue both instructions in a single cycle on super-scalar
-   implementations.  This function identifies such pairs.  */
-
-int
-aarch_crypto_can_dual_issue (rtx_insn *producer_insn, rtx_insn *consumer_insn)
-{
-  rtx producer_set, consumer_set;
-  rtx producer_src, consumer_src;
-
-  producer_set = single_set (producer_insn);
-  consumer_set = single_set (consumer_insn);
-
-  producer_src = producer_set ? SET_SRC (producer_set) : NULL;
-  consumer_src = consumer_set ? SET_SRC (consumer_set) : NULL;
-
-  if (producer_src && consumer_src
-      && GET_CODE (producer_src) == UNSPEC && GET_CODE (consumer_src) == UNSPEC
-      && ((XINT (producer_src, 1) == UNSPEC_AESE
-           && XINT (consumer_src, 1) == UNSPEC_AESMC)
-          || (XINT (producer_src, 1) == UNSPEC_AESD
-              && XINT (consumer_src, 1) == UNSPEC_AESIMC)))
-  {
-    unsigned int regno = REGNO (SET_DEST (producer_set));
-
-    /* Before reload the registers are virtual, so the destination of
-       consumer_set doesn't need to match.  */
-
-    return (REGNO (SET_DEST (consumer_set)) == regno || !reload_completed)
-           && REGNO (XVECEXP (consumer_src, 0, 0)) == regno;
-  }
-
-  return 0;
-}
-
 /* Return TRUE if X is either an arithmetic shift left, or
    is a multiplication by a power of two.  */
 bool
index 820502ac8a0a23274d5ea48ab19794959c811d90..f42a7b1d4edb04f4a26617f9cda1730dcd7c393a 100644 (file)
@@ -30606,10 +30606,6 @@ aarch_macro_fusion_pair_p (rtx_insn* prev, rtx_insn* curr)
   if (!arm_macro_fusion_p ())
     return false;
 
-  if (current_tune->fusible_ops & tune_params::FUSE_AES_AESMC
-      && aarch_crypto_can_dual_issue (prev, curr))
-    return true;
-
   if (current_tune->fusible_ops & tune_params::FUSE_MOVW_MOVT
       && arm_sets_movw_movt_fusible_p (prev_set, curr_set))
     return true;
index a5f0b10d142160b68570c4cf00a641b07dc742f1..2d96a9cdd5a0d8a4533d0265e557e71fb2eb9427 100644 (file)
 ;; help.
 (define_bypass 1 "cortex_a57_*"
                 "cortex_a57_call,cortex_a57_branch")
-
-;; AESE+AESMC and AESD+AESIMC pairs forward with zero latency
-(define_bypass 0 "cortex_a57_crypto_simple"
-                "cortex_a57_crypto_simple"
-                "aarch_crypto_can_dual_issue")
-
index 63d9d9ffa424fa51b05ebee5138b2c7c0f304745..bf34f69fc75c4b16dd947a7fe48728a97a319abf 100644 (file)
 ;; <http://www.gnu.org/licenses/>.
 
 
-;; When AES/AESMC fusion is enabled we want the register allocation to
-;; look like:
-;;    AESE Vn, _
-;;    AESMC Vn, Vn
-;; So prefer to tie operand 1 to operand 0 when fusing.
-
 (define_insn "crypto_<crypto_pattern>"
-  [(set (match_operand:<crypto_mode> 0 "register_operand" "=w,w")
-        (unspec:<crypto_mode> [(match_operand:<crypto_mode> 1
-                       "register_operand" "0,w")]
-         CRYPTO_UNARY))]
+  [(set (match_operand:<crypto_mode> 0 "register_operand" "=w")
+       (unspec:<crypto_mode>
+               [(match_operand:<crypto_mode> 1 "register_operand" "w")]
+        CRYPTO_AESMC))]
   "TARGET_CRYPTO"
   "<crypto_pattern>.<crypto_size_sfx>\\t%q0, %q1"
-  [(set_attr "type" "<crypto_type>")
-   (set_attr_alternative "enabled"
-     [(if_then_else (match_test
-                      "arm_fusion_enabled_p (tune_params::FUSE_AES_AESMC)")
-                    (const_string "yes" )
-                    (const_string "no"))
-      (const_string "yes")])]
+  [(set_attr "type" "<crypto_type>")]
+)
+
+(define_insn "crypto_<crypto_pattern>"
+  [(set (match_operand:V16QI 0 "register_operand" "=w")
+       (unspec:V16QI
+               [(xor:V16QI
+                    (match_operand:V16QI 1 "register_operand" "%0")
+                    (match_operand:V16QI 2 "register_operand" "w"))]
+       CRYPTO_AES))]
+  "TARGET_CRYPTO"
+  "<crypto_pattern>.<crypto_size_sfx>\\t%q0, %q2"
+  [(set_attr "type" "<crypto_type>")]
+)
+
+;; When AESE/AESMC fusion is enabled we really want to keep the two together
+;; and enforce the register dependency without scheduling or register
+;; allocation messing up the order or introducing moves inbetween.
+;;  Mash the two together during combine.
+
+(define_insn "*aarch32_crypto_aese_fused"
+  [(set (match_operand:V16QI 0 "register_operand" "=w")
+       (unspec:V16QI
+               [(unspec:V16QI
+                   [(xor:V16QI
+                       (match_operand:V16QI 1 "register_operand" "%0")
+                       (match_operand:V16QI 2 "register_operand" "w"))]
+               UNSPEC_AESE)]
+       UNSPEC_AESMC))]
+  "TARGET_CRYPTO
+   && arm_fusion_enabled_p (tune_params::FUSE_AES_AESMC)"
+  "aese.8\\t%q0, %q2\;aesmc.8\\t%q0, %q0"
+  [(set_attr "type" "crypto_aese")
+   (set_attr "length" "8")]
+)
+
+;; When AESD/AESIMC fusion is enabled we really want to keep the two together
+;; and enforce the register dependency without scheduling or register
+;; allocation messing up the order or introducing moves inbetween.
+;;  Mash the two together during combine.
+
+(define_insn "*aarch32_crypto_aesd_fused"
+  [(set (match_operand:V16QI 0 "register_operand" "=w")
+       (unspec:V16QI
+               [(unspec:V16QI
+                   [(xor:V16QI
+                       (match_operand:V16QI 1 "register_operand" "%0")
+                       (match_operand:V16QI 2 "register_operand" "w"))]
+               UNSPEC_AESD)]
+       UNSPEC_AESIMC))]
+  "TARGET_CRYPTO
+   && arm_fusion_enabled_p (tune_params::FUSE_AES_AESMC)"
+  "aesd.8\\t%q0, %q2\;aesimc.8\\t%q0, %q0"
+  [(set_attr "type" "crypto_aese")
+   (set_attr "length" "8")]
 )
 
 (define_insn "crypto_<crypto_pattern>"
   [(set (match_operand:<crypto_mode> 0 "register_operand" "=w")
-        (unspec:<crypto_mode> [(match_operand:<crypto_mode> 1 "register_operand" "0")
-                      (match_operand:<crypto_mode> 2 "register_operand" "w")]
-         CRYPTO_BINARY))]
+       (unspec:<crypto_mode>
+               [(match_operand:<crypto_mode> 1 "register_operand" "0")
+               (match_operand:<crypto_mode> 2 "register_operand" "w")]
+       CRYPTO_BINARY))]
   "TARGET_CRYPTO"
   "<crypto_pattern>.<crypto_size_sfx>\\t%q0, %q2"
   [(set_attr "type" "<crypto_type>")]
index 3d04a52ac3d550ca97497cd8ebf058456614881c..150ac85ebc7dad0c0028a35a8851bef66a997642 100644 (file)
                 "exynos_m1_crypto_simple, exynos_m1_crypto_complex,\
                  exynos_m1_crypto_poly*")
 
-;; AES{D,E}/AESMC pairs can feed each other instantly.
-(define_bypass 0 "exynos_m1_crypto_simple"
-                "exynos_m1_crypto_simple"
-                "aarch_crypto_can_dual_issue")
-
 ;; Predicted branches take no time, but mispredicted ones take forever anyway.
 (define_bypass 1 "exynos_m1_*"
                 "exynos_m1_call, exynos_m1_branch")
index 2462b8c87ea7dbe60ba50d22b1e494bb4fe905c2..e03a7202417586808f6033864432bc77e63d746f 100644 (file)
 (define_int_iterator CRC [UNSPEC_CRC32B UNSPEC_CRC32H UNSPEC_CRC32W
                           UNSPEC_CRC32CB UNSPEC_CRC32CH UNSPEC_CRC32CW])
 
-(define_int_iterator CRYPTO_UNARY [UNSPEC_AESMC UNSPEC_AESIMC])
+(define_int_iterator CRYPTO_AESMC [UNSPEC_AESMC UNSPEC_AESIMC])
 
-(define_int_iterator CRYPTO_BINARY [UNSPEC_AESD UNSPEC_AESE
-                                    UNSPEC_SHA1SU1 UNSPEC_SHA256SU0])
+(define_int_iterator CRYPTO_AES [UNSPEC_AESD UNSPEC_AESE])
+
+(define_int_iterator CRYPTO_BINARY [UNSPEC_SHA1SU1 UNSPEC_SHA256SU0])
 
 (define_int_iterator CRYPTO_TERNARY [UNSPEC_SHA1SU0 UNSPEC_SHA256H
                                      UNSPEC_SHA256H2 UNSPEC_SHA256SU1])
index ce78f7781f514e4e6fb22f5a8cbe5d271a28d655..995076b34d1abf77329d6a82cfbbcab0e89e9423 100644 (file)
@@ -1,3 +1,9 @@
+2019-07-09  Sylvia Taylor  <sylvia.taylor@arm.com>
+
+       * gcc.target/arm/aes-fuse-1.c: New.
+       * gcc.target/arm/aes-fuse-2.c: New.
+       * gcc.target/arm/aes_xor_combine.c: New.
+
 2019-07-09  Martin Liska  <mliska@suse.cz>
 
        * gcc.dg/predict-17.c: Test loop optimizer assumption
diff --git a/gcc/testsuite/gcc.target/arm/aes-fuse-1.c b/gcc/testsuite/gcc.target/arm/aes-fuse-1.c
new file mode 100644 (file)
index 0000000..27b08ae
--- /dev/null
@@ -0,0 +1,66 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_crypto_ok } */
+/* { dg-add-options arm_crypto } */
+/* { dg-additional-options "-mcpu=cortex-a72 -O3 -dp" } */
+
+#include <arm_neon.h>
+
+#define AESE(r, v, key) (r = vaeseq_u8 ((v), (key)));
+#define AESMC(r, i) (r = vaesmcq_u8 (i))
+
+const uint8x16_t zero = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+uint8x16_t dummy;
+uint8x16_t a;
+uint8x16_t b;
+uint8x16_t c;
+uint8x16_t d;
+uint8x16_t x;
+uint8x16_t y;
+uint8x16_t k;
+
+void foo (void)
+{
+  AESE (a, a, k);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+  AESE (b, b, k);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+  AESE (c, c, k);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+  AESE (d, d, k);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+
+  x = x ^ k;
+  AESE (x, x, zero);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+  y = y ^ k;
+  AESE (y, y, zero);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+
+  AESMC (d, d);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+  AESMC (c, c);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+  AESMC (b, b);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+  AESMC (a, a);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+
+  AESMC (y, y);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+  AESMC (x, x);
+}
+
+/* { dg-final { scan-assembler-times "crypto_aese_fused" 6 } } */
+/* { dg-final { scan-assembler-not "veor" } } */
diff --git a/gcc/testsuite/gcc.target/arm/aes-fuse-2.c b/gcc/testsuite/gcc.target/arm/aes-fuse-2.c
new file mode 100644 (file)
index 0000000..1266a28
--- /dev/null
@@ -0,0 +1,66 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_crypto_ok } */
+/* { dg-add-options arm_crypto } */
+/* { dg-additional-options "-mcpu=cortex-a72 -O3 -dp" } */
+
+#include <arm_neon.h>
+
+#define AESD(r, v, key) (r = vaesdq_u8 ((v), (key)));
+#define AESIMC(r, i) (r = vaesimcq_u8 (i))
+
+const uint8x16_t zero = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+uint8x16_t dummy;
+uint8x16_t a;
+uint8x16_t b;
+uint8x16_t c;
+uint8x16_t d;
+uint8x16_t x;
+uint8x16_t y;
+uint8x16_t k;
+
+void foo (void)
+{
+  AESD (a, a, k);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+  AESD (b, b, k);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+  AESD (c, c, k);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+  AESD (d, d, k);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+
+  x = x ^ k;
+  AESD (x, x, zero);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+  y = y ^ k;
+  AESD (y, y, zero);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+
+  AESIMC (d, d);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+  AESIMC (c, c);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+  AESIMC (b, b);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+  AESIMC (a, a);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+
+  AESIMC (y, y);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+  AESIMC (x, x);
+}
+
+/* { dg-final { scan-assembler-times "crypto_aesd_fused" 6 } } */
+/* { dg-final { scan-assembler-not "veor" } } */
diff --git a/gcc/testsuite/gcc.target/arm/aes_xor_combine.c b/gcc/testsuite/gcc.target/arm/aes_xor_combine.c
new file mode 100644 (file)
index 0000000..17ae1c5
--- /dev/null
@@ -0,0 +1,43 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_crypto_ok } */
+/* { dg-add-options arm_crypto } */
+/* { dg-additional-options "-O3" } */
+
+#include <arm_neon.h>
+
+#define AESE(r, v, key) (r = vaeseq_u8 ((v), (key)));
+#define AESD(r, v, key) (r = vaesdq_u8 ((v), (key)));
+
+const uint8x16_t zero = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+uint8x16_t foo_key_0 (uint8x16_t dummy, uint8x16_t foo, uint8x16_t bar)
+{
+    dummy = dummy ^ foo;
+    AESE(dummy, dummy, zero);
+    dummy = dummy ^ bar;
+    AESE(dummy, dummy, zero);
+
+    dummy = dummy ^ foo;
+    AESD(dummy, dummy, zero);
+    dummy = dummy ^ bar;
+    AESD(dummy, dummy, zero);
+
+    return dummy;
+}
+
+uint8x16_t foo_data_0 (uint8x16_t dummy, uint8x16_t foo, uint8x16_t bar)
+{
+    dummy = dummy ^ foo;
+    AESE(dummy, zero, dummy);
+    dummy = dummy ^ bar;
+    AESE(dummy, zero, dummy);
+
+    dummy = dummy ^ foo;
+    AESD(dummy, zero, dummy);
+    dummy = dummy ^ bar;
+    AESD(dummy, zero, dummy);
+
+    return dummy;
+}
+
+/* { dg-final { scan-assembler-not "veor" } } */