From e811f10b61aa2d26416f3d6a79f024dcd21e895c Mon Sep 17 00:00:00 2001 From: Kyrylo Tkachov <kyrylo.tkachov@arm.com> Date: Thu, 7 Jan 2021 16:03:08 +0000 Subject: [PATCH] aarch64: Reimplement most vpadal intrinsics using builtins This patch reimplements most of the vpadal intrinsics to use RTL builtins in the normal way. The ones that aren't converted are the int32x2_t -> int64x1_t ones as the RTL pattern doesn't currently handle these modes. We don't have a V1DI mode so it would need to return a DImode value or a V2DI one with the first lane being the result. It's not hard to do, but it would require a bit more refactoring so we can do it separately later. This patch hopefully improves the status quo. The new Vwhalf mode attribute is created because the existing Vwtype attribute maps V8QI wrongly (for this pattern) to "8h" as the suffix rather than "4h" as needed. gcc/ * config/aarch64/iterators.md (Vwhalf): New iterator. * config/aarch64/aarch64-simd.md (aarch64_<sur>adalp<mode>_3): Rename to... (aarch64_<sur>adalp<mode>): ... This. Make more builtin-friendly. (<sur>sadv16qi): Adjust callsite of the above. * config/aarch64/aarch64-simd-builtins.def (sadalp, uadalp): New builtins. * config/aarch64/arm_neon.h (vpadal_s8): Reimplement using builtins. (vpadal_s16): Likewise. (vpadal_u8): Likewise. (vpadal_u16): Likewise. (vpadalq_s8): Likewise. (vpadalq_s16): Likewise. (vpadalq_s32): Likewise. (vpadalq_u8): Likewise. (vpadalq_u16): Likewise. (vpadalq_u32): Likewise. 
--- gcc/config/aarch64/aarch64-simd-builtins.def | 3 + gcc/config/aarch64/aarch64-simd.md | 11 ++- gcc/config/aarch64/arm_neon.h | 70 +++----------------- gcc/config/aarch64/iterators.md | 5 ++ 4 files changed, 23 insertions(+), 66 deletions(-) diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index ca44b65f24b..0d611878c9d 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -157,6 +157,9 @@ BUILTIN_VDQ_BHSI (TERNOP, saba, 0, NONE) BUILTIN_VDQ_BHSI (TERNOPU, uaba, 0, NONE) + BUILTIN_VDQV_S (BINOP, sadalp, 0, NONE) + BUILTIN_VDQV_S (BINOPU, uadalp, 0, NONE) + /* Implemented by aarch64_<sur><addsub>hn<mode>. */ BUILTIN_VQN (BINOP, addhn, 0, NONE) BUILTIN_VQN (BINOP, subhn, 0, NONE) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index f52cd7c41a5..b6629bfa93e 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -801,13 +801,13 @@ [(set_attr "type" "neon_arith_acc")] ) -(define_insn "aarch64_<sur>adalp<mode>_3" +(define_insn "aarch64_<sur>adalp<mode>" [(set (match_operand:<VDBLW> 0 "register_operand" "=w") - (unspec:<VDBLW> [(match_operand:VDQV_S 1 "register_operand" "w") - (match_operand:<VDBLW> 2 "register_operand" "0")] + (unspec:<VDBLW> [(match_operand:VDQV_S 2 "register_operand" "w") + (match_operand:<VDBLW> 1 "register_operand" "0")] ADALP))] "TARGET_SIMD" - "<sur>adalp\t%0.<Vwhalf>, %1.<Vtype>" + "<sur>adalp\t%0.<Vwhalf>, %2.<Vtype>" 
[(set_attr "type" "neon_reduc_add<q>")] ) @@ -852,8 +852,7 @@ operands[2])); emit_insn (gen_aarch64_<sur>abalv16qi_4 (reduc, operands[1], operands[2], reduc)); - emit_insn (gen_aarch64_<sur>adalpv8hi_3 (operands[3], reduc, - operands[3])); + emit_insn (gen_aarch64_<sur>adalpv8hi (operands[3], operands[3], reduc)); emit_move_insn (operands[0], operands[3]); DONE; } diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 77f917143e8..608e582d258 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -9289,24 +9289,14 @@ __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadal_s8 (int16x4_t __a, int8x8_t __b) { - int16x4_t __result; - __asm__ ("sadalp %0.4h,%2.8b" - : "=w"(__result) - : "0"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sadalpv8qi (__a, __b); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadal_s16 (int32x2_t __a, int16x4_t __b) { - int32x2_t __result; - __asm__ ("sadalp %0.2s,%2.4h" - : "=w"(__result) - : "0"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sadalpv4hi (__a, __b); } __extension__ extern __inline int64x1_t @@ -9325,24 +9315,14 @@ __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadal_u8 (uint16x4_t __a, uint8x8_t __b) { - uint16x4_t __result; - __asm__ ("uadalp %0.4h,%2.8b" - : "=w"(__result) - : "0"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uadalpv8qi_uuu (__a, __b); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadal_u16 (uint32x2_t __a, uint16x4_t __b) { - uint32x2_t __result; - __asm__ ("uadalp %0.2s,%2.4h" - : "=w"(__result) - : "0"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uadalpv4hi_uuu (__a, 
__b); } __extension__ extern __inline uint64x1_t @@ -9361,72 +9341,42 @@ __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadalq_s8 (int16x8_t __a, int8x16_t __b) { - int16x8_t __result; - __asm__ ("sadalp %0.8h,%2.16b" - : "=w"(__result) - : "0"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sadalpv16qi (__a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadalq_s16 (int32x4_t __a, int16x8_t __b) { - int32x4_t __result; - __asm__ ("sadalp %0.4s,%2.8h" - : "=w"(__result) - : "0"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sadalpv8hi (__a, __b); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadalq_s32 (int64x2_t __a, int32x4_t __b) { - int64x2_t __result; - __asm__ ("sadalp %0.2d,%2.4s" - : "=w"(__result) - : "0"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sadalpv4si (__a, __b); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadalq_u8 (uint16x8_t __a, uint8x16_t __b) { - uint16x8_t __result; - __asm__ ("uadalp %0.8h,%2.16b" - : "=w"(__result) - : "0"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uadalpv16qi_uuu (__a, __b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadalq_u16 (uint32x4_t __a, uint16x8_t __b) { - uint32x4_t __result; - __asm__ ("uadalp %0.4s,%2.8h" - : "=w"(__result) - : "0"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uadalpv8hi_uuu (__a, __b); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadalq_u32 (uint64x2_t __a, uint32x4_t __b) { - uint64x2_t __result; - __asm__ 
("uadalp %0.2d,%2.4s" - : "=w"(__result) - : "0"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uadalpv4si_uuu (__a, __b); } __extension__ extern __inline int16x4_t diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 5775ce43e3d..5a82d9395f9 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -1291,6 +1291,11 @@ (V8HI "4s") (V4SI "2d") (V8HF "4s") (V4SF "2d")]) +;; Widened mode with half the element register suffixes for VD_BHSI/VQW/VQ_HSF. +(define_mode_attr Vwhalf [(V8QI "4h") (V4HI "2s") + (V2SI "1d") (V16QI "8h") + (V8HI "4s") (V4SI "2d")]) + ;; SVE vector after narrowing. (define_mode_attr Ventype [(VNx8HI "b") (VNx4SI "h") (VNx4SF "h") -- 2.30.2