From 26004f51f9cfb83d88d445903933fb8c39a841f1 Mon Sep 17 00:00:00 2001
From: Richard Sandiford <richard.sandiford@arm.com>
Date: Fri, 7 Dec 2018 15:03:15 +0000
Subject: [PATCH] [AArch64][SVE] Remove unnecessary PTRUEs from integer
 arithmetic

When using the unpredicated immediate forms of MUL, LSL, LSR and ASR,
the rtl patterns would still have the predicate operand we created for
the other forms.  This patch splits the patterns after reload in order
to get rid of the predicate, like we already do for WHILE.

2018-12-07  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
	* config/aarch64/aarch64-sve.md (*mul<mode>3, *v<optab><mode>3):
	Split the patterns after reload if we don't need the predicate
	operand.
	(*post_ra_mul<mode>3, *post_ra_v<optab><mode>3): New patterns.

gcc/testsuite/
	* gcc.target/aarch64/sve/pred_elim_2.c: New test.

From-SVN: r266892
---
 gcc/ChangeLog                                 |  7 ++++
 gcc/config/aarch64/aarch64-sve.md             | 42 +++++++++++++++++--
 gcc/testsuite/ChangeLog                       |  4 ++
 .../gcc.target/aarch64/sve/pred_elim_2.c      | 31 ++++++++++++++
 4 files changed, 80 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/pred_elim_2.c
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index d1a40254232..95c1a4a8029 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@
+2018-12-07  Richard Sandiford  <richard.sandiford@arm.com>
+
+	* config/aarch64/aarch64-sve.md (*mul<mode>3, *v<optab><mode>3):
+	Split the patterns after reload if we don't need the predicate
+	operand.
+	(*post_ra_mul<mode>3, *post_ra_v<optab><mode>3): New patterns.
+
 2018-12-07  Richard Sandiford  <richard.sandiford@arm.com>
 
 	* config/aarch64/iterators.md (SVE_UNPRED_FP_BINARY): New code
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index edc6cff8fbd..8569a8e1ea7 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -936,7 +936,7 @@
 ;; predicate for the first alternative, but using Upa or X isn't likely
 ;; to gain much and would make the instruction seem less uniform to the
 ;; register allocator.
-(define_insn "*mul<mode>3"
+(define_insn_and_split "*mul<mode>3"
   [(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w")
 	(unspec:SVE_I
 	  [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl")
@@ -946,12 +946,30 @@
 	  UNSPEC_MERGE_PTRUE))]
   "TARGET_SVE"
   "@
-   mul\t%0.<Vetype>, %0.<Vetype>, #%3
+   #
    mul\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
    movprfx\t%0, %2\;mul\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>"
+  ; Split the unpredicated form after reload, so that we don't have
+  ; the unnecessary PTRUE.
+  "&& reload_completed
+   && !register_operand (operands[3], <MODE>mode)"
+  [(set (match_dup 0) (mult:SVE_I (match_dup 2) (match_dup 3)))]
+  ""
   [(set_attr "movprfx" "*,*,yes")]
 )
 
+;; Unpredicated MUL by an immediate (post-RA only); operand 1 is tied
+;; to the destination.  These are generated by splitting a predicated
+;; *mul<mode>3 whose predicate is unused.
+(define_insn "*post_ra_mul<mode>3"
+  [(set (match_operand:SVE_I 0 "register_operand" "=w")
+	(mult:SVE_I
+	  (match_operand:SVE_I 1 "register_operand" "0")
+	  (match_operand:SVE_I 2 "aarch64_sve_mul_immediate")))]
+  "TARGET_SVE && reload_completed"
+  "mul\t%0.<Vetype>, %0.<Vetype>, #%2"
+)
+
 (define_insn "*madd<mode>"
   [(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w")
 	(plus:SVE_I
@@ -1232,7 +1250,7 @@
 ;; actually need the predicate for the first alternative, but using Upa
 ;; or X isn't likely to gain much and would make the instruction seem
 ;; less uniform to the register allocator.
-(define_insn "*v<optab><mode>3"
+(define_insn_and_split "*v<optab><mode>3"
   [(set (match_operand:SVE_I 0 "register_operand" "=w, w, ?&w")
 	(unspec:SVE_I
 	  [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl")
@@ -1242,12 +1260,28 @@
 	  UNSPEC_MERGE_PTRUE))]
   "TARGET_SVE"
   "@
-   <shift>\t%0.<Vetype>, %2.<Vetype>, #%3
+   #
    <shift>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
    movprfx\t%0, %2\;<shift>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>"
+  "&& reload_completed
+   && !register_operand (operands[3], <MODE>mode)"
+  [(set (match_dup 0) (ASHIFT:SVE_I (match_dup 2) (match_dup 3)))]
+  ""
   [(set_attr "movprfx" "*,*,yes")]
 )
 
+;; Unpredicated shifts by an immediate (post-RA only); the source
+;; register need not match the destination.  These are generated by
+;; splitting a predicated *v<optab><mode>3 whose predicate is unused.
+(define_insn "*post_ra_v<optab><mode>3"
+  [(set (match_operand:SVE_I 0 "register_operand" "=w")
+	(ASHIFT:SVE_I
+	  (match_operand:SVE_I 1 "register_operand" "w")
+	  (match_operand:SVE_I 2 "aarch64_simd_<lr>shift_imm")))]
+  "TARGET_SVE && reload_completed"
+  "<shift>\t%0.<Vetype>, %1.<Vetype>, #%2"
+)
+
 ;; LSL, LSR and ASR by a scalar, which expands into one of the vector
 ;; shifts above.
 (define_expand "<ASHIFT:optab><mode>3"
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 996cacda1cb..231275454bd 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2018-12-07  Richard Sandiford  <richard.sandiford@arm.com>
+
+	* gcc.target/aarch64/sve/pred_elim_2.c: New test.
+
 2018-12-07  Richard Sandiford  <richard.sandiford@arm.com>
 
 	* gcc.target/aarch64/sve/pred_elim_1.c: New test.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred_elim_2.c b/gcc/testsuite/gcc.target/aarch64/sve/pred_elim_2.c
new file mode 100644
index 00000000000..ed9c7007d2e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pred_elim_2.c
@@ -0,0 +1,31 @@
+/* { dg-options "-O2 -ftree-vectorize" } */
+/* Check that shifts and multiplications by an immediate need no PTRUE.  */
+#include <stdint.h>
+
+#define TEST_OP(NAME, TYPE, OP)					\
+  void								\
+  NAME##_##TYPE (TYPE *restrict a, TYPE *restrict b, int n)	\
+  {								\
+    for (int i = 0; i < n; ++i)					\
+      a[i] = b[i] OP;						\
+  }
+
+#define TEST_TYPE(TYPE) \
+  TEST_OP (shl, TYPE, << 6) \
+  TEST_OP (shr, TYPE, >> 6) \
+  TEST_OP (mult, TYPE, * 0x2b)
+
+TEST_TYPE (int8_t)
+TEST_TYPE (int16_t)
+TEST_TYPE (int32_t)
+TEST_TYPE (int64_t)
+TEST_TYPE (uint8_t)
+TEST_TYPE (uint16_t)
+TEST_TYPE (uint32_t)
+TEST_TYPE (uint64_t)
+/* >> 6 should be ASR for the 4 signed types and LSR for the 4 unsigned.  */
+/* { dg-final { scan-assembler-times {\tlsl\t} 8 } } */
+/* { dg-final { scan-assembler-times {\tlsr\t} 4 } } */
+/* { dg-final { scan-assembler-times {\tasr\t} 4 } } */
+/* { dg-final { scan-assembler-times {\tmul\t} 8 } } */
+/* { dg-final { scan-assembler-not {\tptrue\t} } } */
-- 
2.30.2