From 58a3bd25ba668138bcc9ea314736736e08fa41a1 Mon Sep 17 00:00:00 2001 From: Felix Yang Date: Mon, 8 Dec 2014 14:19:44 +0000 Subject: [PATCH] arm_neon.h (vrecpe_u32, [...]): Rewrite using builtin functions. * config/aarch64/arm_neon.h (vrecpe_u32, vrecpeq_u32): Rewrite using builtin functions. (vfma_f32, vfmaq_f32, vfmaq_f64, vfma_n_f32, vfmaq_n_f32, vfmaq_n_f64, vfms_f32, vfmsq_f32, vfmsq_f64): Likewise. (vhsub_s8, vhsub_u8, vhsub_s16, vhsub_u16, vhsub_s32, vhsub_u32, vhsubq_s8, vhsubq_u8, vhsubq_s16, vhsubq_u16, vhsubq_s32, vhsubq_u32, vsubhn_s16, vsubhn_u16, vsubhn_s32, vsubhn_u32, vsubhn_s64, vsubhn_u64, vrsubhn_s16, vrsubhn_u16, vrsubhn_s32, vrsubhn_u32, vrsubhn_s64, vrsubhn_u64, vsubhn_high_s16, vsubhn_high_u16, vsubhn_high_s32, vsubhn_high_u32, vsubhn_high_s64, vsubhn_high_u64, vrsubhn_high_s16, vrsubhn_high_u16, vrsubhn_high_s32, vrsubhn_high_u32, vrsubhn_high_s64, vrsubhn_high_u64): Likewise. * config/aarch64/iterators.md (VDQ_SI): New mode iterator. * config/aarch64/aarch64.md (define_c_enum "unspec"): Add UNSPEC_URECPE. * config/aarch64/aarch64-simd.md (aarch64_urecpe<mode>): New pattern. * config/aarch64/aarch64-simd-builtins.def (shsub, uhsub, subhn, rsubhn, subhn2, rsubhn2, urecpe): New builtins. Co-Authored-By: Haijian Zhang Co-Authored-By: Jiji Jiang Co-Authored-By: Pengfei Sui From-SVN: r218484 --- gcc/ChangeLog | 24 +- gcc/config/aarch64/aarch64-simd-builtins.def | 8 + gcc/config/aarch64/aarch64-simd.md | 8 + gcc/config/aarch64/aarch64.md | 1 + gcc/config/aarch64/arm_neon.h | 841 +++++++----------- gcc/config/aarch64/iterators.md | 3 + gcc/testsuite/ChangeLog | 11 + .../aarch64/advsimd-intrinsics/vfma.c | 67 ++ .../aarch64/advsimd-intrinsics/vfma_n.c | 69 ++ .../aarch64/advsimd-intrinsics/vfms.c | 67 ++ .../aarch64/narrow_high-intrinsics.c | 4 +- 11 files changed, 575 insertions(+), 528 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma.c create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma_n.c create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index f22bba83a86..f01a99fd2d5 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,26 @@ +2014-12-08 Felix Yang + Haijian Zhang + Jiji Jiang + Pengfei Sui + + * config/aarch64/arm_neon.h (vrecpe_u32, vrecpeq_u32): Rewrite using + builtin functions. + (vfma_f32, vfmaq_f32, vfmaq_f64, vfma_n_f32, vfmaq_n_f32, vfmaq_n_f64, + vfms_f32, vfmsq_f32, vfmsq_f64): Likewise. + (vhsub_s8, vhsub_u8, vhsub_s16, vhsub_u16, vhsub_s32, vhsub_u32, + vhsubq_s8, vhsubq_u8, vhsubq_s16, vhsubq_u16, vhsubq_s32, vhsubq_u32, + vsubhn_s16, vsubhn_u16, vsubhn_s32, vsubhn_u32, vsubhn_s64, vsubhn_u64, + vrsubhn_s16, vrsubhn_u16, vrsubhn_s32, vrsubhn_u32, vrsubhn_s64, + vrsubhn_u64, vsubhn_high_s16, vsubhn_high_u16, vsubhn_high_s32, + vsubhn_high_u32, vsubhn_high_s64, vsubhn_high_u64, vrsubhn_high_s16, + vrsubhn_high_u16, vrsubhn_high_s32, vrsubhn_high_u32, vrsubhn_high_s64, + vrsubhn_high_u64): Likewise. + * config/aarch64/iterators.md (VDQ_SI): New mode iterator. + * config/aarch64/aarch64.md (define_c_enum "unspec"): Add UNSPEC_URECPE. + * config/aarch64/aarch64-simd.md (aarch64_urecpe<mode>): New pattern. + * config/aarch64/aarch64-simd-builtins.def (shsub, uhsub, subhn, rsubhn, + subhn2, rsubhn2, urecpe): New builtins. + 2014-12-08 Ilya Tocar * config/i386/i386.c (ix86_expand_vec_perm_vpermi2): Handle v64qi. @@ -5997,7 +6020,6 @@ * config/aarch64/aarch64-simd.md (*aarch64_simd_ld1r): Use VALL mode iterator instead of VALLDI. 
- 2014-11-14 Jan Hubicka * optc-save-gen.awk: Output cl_target_option_eq, diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 953eb53c217..745f1079156 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -127,15 +127,21 @@ BUILTIN_VD_BHSI (BINOP, usubw, 0) /* Implemented by aarch64_<su>h<addsub><mode>. */ BUILTIN_VDQ_BHSI (BINOP, shadd, 0) + BUILTIN_VDQ_BHSI (BINOP, shsub, 0) BUILTIN_VDQ_BHSI (BINOP, uhadd, 0) + BUILTIN_VDQ_BHSI (BINOP, uhsub, 0) BUILTIN_VDQ_BHSI (BINOP, srhadd, 0) BUILTIN_VDQ_BHSI (BINOP, urhadd, 0) /* Implemented by aarch64_<sur><addsub>hn<mode>. */ BUILTIN_VQN (BINOP, addhn, 0) + BUILTIN_VQN (BINOP, subhn, 0) BUILTIN_VQN (BINOP, raddhn, 0) + BUILTIN_VQN (BINOP, rsubhn, 0) /* Implemented by aarch64_<sur><addsub>hn2<mode>. */ BUILTIN_VQN (TERNOP, addhn2, 0) + BUILTIN_VQN (TERNOP, subhn2, 0) BUILTIN_VQN (TERNOP, raddhn2, 0) + BUILTIN_VQN (TERNOP, rsubhn2, 0) BUILTIN_VSQN_HSDI (UNOP, sqmovun, 0) /* Implemented by aarch64_<sur>qmovn<mode>. */ @@ -338,6 +344,8 @@ BUILTIN_GPF (BINOP, frecps, 0) BUILTIN_GPF (UNOP, frecpx, 0) + BUILTIN_VDQ_SI (UNOP, urecpe, 0) + BUILTIN_VDQF (UNOP, frecpe, 0) BUILTIN_VDQF (BINOP, frecps, 0) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index d44d774e6b8..733512c427d 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -4840,6 +4840,14 @@ [(set_attr "type" "neon_fp_recps_<Vetype><q>")] ) +(define_insn "aarch64_urecpe<mode>" + [(set (match_operand:VDQ_SI 0 "register_operand" "=w") + (unspec:VDQ_SI [(match_operand:VDQ_SI 1 "register_operand" "w")] + UNSPEC_URECPE))] + "TARGET_SIMD" + "urecpe\\t%0.<Vtype>, %1.<Vtype>" + [(set_attr "type" "neon_fp_recpe_<Vetype><q>")]) + ;; Standard pattern name vec_extract<mode>. (define_expand "vec_extract<mode>" diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 46be23999ef..97c1dff2ed6 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -75,6 +75,7 @@ UNSPEC_CRC32H UNSPEC_CRC32W UNSPEC_CRC32X + UNSPEC_URECPE UNSPEC_FRECPE UNSPEC_FRECPS UNSPEC_FRECPX diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index f3a87310dd5..0435f89c728 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -2287,6 +2287,246 @@ vqadd_u8 (uint8x8_t __a, uint8x8_t __b) return __builtin_aarch64_uqaddv8qi_uuu (__a, __b); } +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vhsub_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int8x8_t) __builtin_aarch64_shsubv8qi (__a, __b); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vhsub_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int16x4_t) __builtin_aarch64_shsubv4hi (__a, __b); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vhsub_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int32x2_t) __builtin_aarch64_shsubv2si (__a, __b); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vhsub_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t) __builtin_aarch64_uhsubv8qi ((int8x8_t) __a, + (int8x8_t) __b); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vhsub_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t) __builtin_aarch64_uhsubv4hi ((int16x4_t) __a, + (int16x4_t) __b); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vhsub_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t) 
__builtin_aarch64_uhsubv2si ((int32x2_t) __a, + (int32x2_t) __b); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vhsubq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (int8x16_t) __builtin_aarch64_shsubv16qi (__a, __b); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vhsubq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int16x8_t) __builtin_aarch64_shsubv8hi (__a, __b); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vhsubq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int32x4_t) __builtin_aarch64_shsubv4si (__a, __b); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vhsubq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return (uint8x16_t) __builtin_aarch64_uhsubv16qi ((int8x16_t) __a, + (int8x16_t) __b); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vhsubq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint16x8_t) __builtin_aarch64_uhsubv8hi ((int16x8_t) __a, + (int16x8_t) __b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vhsubq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint32x4_t) __builtin_aarch64_uhsubv4si ((int32x4_t) __a, + (int32x4_t) __b); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vsubhn_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int8x8_t) __builtin_aarch64_subhnv8hi (__a, __b); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vsubhn_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int16x4_t) __builtin_aarch64_subhnv4si (__a, __b); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vsubhn_s64 (int64x2_t __a, int64x2_t __b) +{ + return (int32x2_t) __builtin_aarch64_subhnv2di (__a, __b); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vsubhn_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint8x8_t) __builtin_aarch64_subhnv8hi ((int16x8_t) __a, + (int16x8_t) __b); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vsubhn_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint16x4_t) __builtin_aarch64_subhnv4si ((int32x4_t) __a, + (int32x4_t) __b); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vsubhn_u64 (uint64x2_t __a, uint64x2_t __b) +{ + return (uint32x2_t) __builtin_aarch64_subhnv2di ((int64x2_t) __a, + (int64x2_t) __b); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vrsubhn_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int8x8_t) __builtin_aarch64_rsubhnv8hi (__a, __b); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vrsubhn_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int16x4_t) __builtin_aarch64_rsubhnv4si (__a, __b); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vrsubhn_s64 (int64x2_t __a, int64x2_t __b) +{ + return (int32x2_t) __builtin_aarch64_rsubhnv2di (__a, __b); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vrsubhn_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint8x8_t) __builtin_aarch64_rsubhnv8hi ((int16x8_t) __a, + (int16x8_t) __b); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vrsubhn_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint16x4_t) __builtin_aarch64_rsubhnv4si ((int32x4_t) __a, + (int32x4_t) __b); +} + +__extension__ static 
__inline uint32x2_t __attribute__ ((__always_inline__)) +vrsubhn_u64 (uint64x2_t __a, uint64x2_t __b) +{ + return (uint32x2_t) __builtin_aarch64_rsubhnv2di ((int64x2_t) __a, + (int64x2_t) __b); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vrsubhn_high_s16 (int8x8_t __a, int16x8_t __b, int16x8_t __c) +{ + return (int8x16_t) __builtin_aarch64_rsubhn2v8hi (__a, __b, __c); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vrsubhn_high_s32 (int16x4_t __a, int32x4_t __b, int32x4_t __c) +{ + return (int16x8_t) __builtin_aarch64_rsubhn2v4si (__a, __b, __c); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vrsubhn_high_s64 (int32x2_t __a, int64x2_t __b, int64x2_t __c) +{ + return (int32x4_t) __builtin_aarch64_rsubhn2v2di (__a, __b, __c); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vrsubhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c) +{ + return (uint8x16_t) __builtin_aarch64_rsubhn2v8hi ((int8x8_t) __a, + (int16x8_t) __b, + (int16x8_t) __c); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vrsubhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c) +{ + return (uint16x8_t) __builtin_aarch64_rsubhn2v4si ((int16x4_t) __a, + (int32x4_t) __b, + (int32x4_t) __c); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vrsubhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c) +{ + return (uint32x4_t) __builtin_aarch64_rsubhn2v2di ((int32x2_t) __a, + (int64x2_t) __b, + (int64x2_t) __c); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vsubhn_high_s16 (int8x8_t __a, int16x8_t __b, int16x8_t __c) +{ + return (int8x16_t) __builtin_aarch64_subhn2v8hi (__a, __b, __c); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vsubhn_high_s32 (int16x4_t __a, int32x4_t __b, int32x4_t __c) +{ + return (int16x8_t) __builtin_aarch64_subhn2v4si (__a, __b, __c); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vsubhn_high_s64 (int32x2_t __a, int64x2_t __b, int64x2_t __c) +{ + return (int32x4_t) __builtin_aarch64_subhn2v2di (__a, __b, __c); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vsubhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c) +{ + return (uint8x16_t) __builtin_aarch64_subhn2v8hi ((int8x8_t) __a, + (int16x8_t) __b, + (int16x8_t) __c); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vsubhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c) +{ + return (uint16x8_t) __builtin_aarch64_subhn2v4si ((int16x4_t) __a, + (int32x4_t) __b, + (int32x4_t) __c); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vsubhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c) +{ + return (uint32x4_t) __builtin_aarch64_subhn2v2di ((int32x2_t) __a, + (int64x2_t) __b, + (int64x2_t) __c); +} + __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) vqadd_u16 (uint16x4_t __a, uint16x4_t __b) { @@ -5756,237 +5996,6 @@ vcvtxd_f32_f64 (float64_t a) return result; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vfma_f32 (float32x2_t a, float32x2_t b, float32x2_t c) -{ - float32x2_t result; - __asm__ ("fmla %0.2s,%2.2s,%3.2s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ 
static __inline float32x4_t __attribute__ ((__always_inline__)) -vfmaq_f32 (float32x4_t a, float32x4_t b, float32x4_t c) -{ - float32x4_t result; - __asm__ ("fmla %0.4s,%2.4s,%3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vfmaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c) -{ - float64x2_t result; - __asm__ ("fmla %0.2d,%2.2d,%3.2d" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vfma_n_f32 (float32x2_t a, float32x2_t b, float32_t c) -{ - float32x2_t result; - __asm__ ("fmla %0.2s, %2.2s, %3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vfmaq_n_f32 (float32x4_t a, float32x4_t b, float32_t c) -{ - float32x4_t result; - __asm__ ("fmla %0.4s, %2.4s, %3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vfmaq_n_f64 (float64x2_t a, float64x2_t b, float64_t c) -{ - float64x2_t result; - __asm__ ("fmla %0.2d, %2.2d, %3.d[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vfms_f32 (float32x2_t a, float32x2_t b, float32x2_t c) -{ - float32x2_t result; - __asm__ ("fmls %0.2s,%2.2s,%3.2s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vfmsq_f32 (float32x4_t a, float32x4_t b, float32x4_t c) -{ - float32x4_t result; - __asm__ ("fmls %0.4s,%2.4s,%3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vfmsq_f64 (float64x2_t a, float64x2_t b, float64x2_t c) -{ - float64x2_t result; - __asm__ ("fmls %0.2d,%2.2d,%3.2d" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vhsub_s8 (int8x8_t a, int8x8_t b) -{ - int8x8_t result; - __asm__ ("shsub %0.8b, %1.8b, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vhsub_s16 (int16x4_t a, int16x4_t b) -{ - int16x4_t result; - __asm__ ("shsub %0.4h, %1.4h, %2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vhsub_s32 (int32x2_t a, int32x2_t b) -{ - int32x2_t result; - __asm__ ("shsub %0.2s, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vhsub_u8 (uint8x8_t a, uint8x8_t b) -{ - uint8x8_t result; - __asm__ ("uhsub %0.8b, %1.8b, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vhsub_u16 (uint16x4_t a, uint16x4_t b) -{ - uint16x4_t result; - __asm__ ("uhsub %0.4h, %1.4h, %2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No 
clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vhsub_u32 (uint32x2_t a, uint32x2_t b) -{ - uint32x2_t result; - __asm__ ("uhsub %0.2s, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vhsubq_s8 (int8x16_t a, int8x16_t b) -{ - int8x16_t result; - __asm__ ("shsub %0.16b, %1.16b, %2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vhsubq_s16 (int16x8_t a, int16x8_t b) -{ - int16x8_t result; - __asm__ ("shsub %0.8h, %1.8h, %2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vhsubq_s32 (int32x4_t a, int32x4_t b) -{ - int32x4_t result; - __asm__ ("shsub %0.4s, %1.4s, %2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vhsubq_u8 (uint8x16_t a, uint8x16_t b) -{ - uint8x16_t result; - __asm__ ("uhsub %0.16b, %1.16b, %2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vhsubq_u16 (uint16x8_t a, uint16x8_t b) -{ - uint16x8_t result; - __asm__ ("uhsub %0.8h, %1.8h, %2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vhsubq_u32 (uint32x4_t a, uint32x4_t b) -{ - uint32x4_t result; - __asm__ ("uhsub %0.4s, %1.4s, %2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vmla_n_f32 (float32x2_t a, float32x2_t b, float32_t c) { @@ -9774,37 +9783,15 @@ vqrdmulhq_n_s32 (int32x4_t a, int32_t b) ({ \ int64x2_t b_ = (b); \ uint32x2_t a_ = (a); \ - uint32x4_t result = vcombine_u32 \ - (a_, vcreate_u32 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("sqshrun2 %0.4s, %1.2d, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vrecpe_u32 (uint32x2_t a) -{ - uint32x2_t result; - __asm__ ("urecpe %0.2s,%1.2s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vrecpeq_u32 (uint32x4_t a) -{ - uint32x4_t result; - __asm__ ("urecpe %0.4s,%1.4s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} + uint32x4_t result = vcombine_u32 \ + (a_, vcreate_u32 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("sqshrun2 %0.4s, %1.2d, #%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) #define vrshrn_high_n_s16(a, b, c) \ __extension__ \ @@ -10111,138 +10098,6 @@ vrsqrtss_f32 (float32_t a, float32_t b) return result; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vrsubhn_high_s16 (int8x8_t a, int16x8_t b, int16x8_t c) -{ - int8x16_t result = vcombine_s8 (a, vcreate_s8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("rsubhn2 %0.16b, %1.8h, %2.8h" - : "+w"(result) - : "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x8_t 
__attribute__ ((__always_inline__)) -vrsubhn_high_s32 (int16x4_t a, int32x4_t b, int32x4_t c) -{ - int16x8_t result = vcombine_s16 (a, vcreate_s16 (__AARCH64_UINT64_C (0x0))); - __asm__ ("rsubhn2 %0.8h, %1.4s, %2.4s" - : "+w"(result) - : "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vrsubhn_high_s64 (int32x2_t a, int64x2_t b, int64x2_t c) -{ - int32x4_t result = vcombine_s32 (a, vcreate_s32 (__AARCH64_UINT64_C (0x0))); - __asm__ ("rsubhn2 %0.4s, %1.2d, %2.2d" - : "+w"(result) - : "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vrsubhn_high_u16 (uint8x8_t a, uint16x8_t b, uint16x8_t c) -{ - uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("rsubhn2 %0.16b, %1.8h, %2.8h" - : "+w"(result) - : "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vrsubhn_high_u32 (uint16x4_t a, uint32x4_t b, uint32x4_t c) -{ - uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0))); - __asm__ ("rsubhn2 %0.8h, %1.4s, %2.4s" - : "+w"(result) - : "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vrsubhn_high_u64 (uint32x2_t a, uint64x2_t b, uint64x2_t c) -{ - uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0))); - __asm__ ("rsubhn2 %0.4s, %1.2d, %2.2d" - : "+w"(result) - : "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vrsubhn_s16 (int16x8_t a, int16x8_t b) -{ - int8x8_t result; - __asm__ ("rsubhn %0.8b, %1.8h, %2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vrsubhn_s32 (int32x4_t a, int32x4_t b) -{ - int16x4_t result; - __asm__ ("rsubhn %0.4h, %1.4s, %2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vrsubhn_s64 (int64x2_t a, int64x2_t b) -{ - int32x2_t result; - __asm__ ("rsubhn %0.2s, %1.2d, %2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vrsubhn_u16 (uint16x8_t a, uint16x8_t b) -{ - uint8x8_t result; - __asm__ ("rsubhn %0.8b, %1.8h, %2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vrsubhn_u32 (uint32x4_t a, uint32x4_t b) -{ - uint16x4_t result; - __asm__ ("rsubhn %0.4h, %1.4s, %2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vrsubhn_u64 (uint64x2_t a, uint64x2_t b) -{ - uint32x2_t result; - __asm__ ("rsubhn %0.2s, %1.2d, %2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - #define vshrn_high_n_s16(a, b, c) \ __extension__ \ ({ \ @@ -10774,137 +10629,6 @@ vrsubhn_u64 (uint64x2_t a, uint64x2_t b) : "memory"); \ }) -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vsubhn_high_s16 (int8x8_t a, int16x8_t b, int16x8_t c) -{ - int8x16_t result = 
vcombine_s8 (a, vcreate_s8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("subhn2 %0.16b, %1.8h, %2.8h" - : "+w"(result) - : "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vsubhn_high_s32 (int16x4_t a, int32x4_t b, int32x4_t c) -{ - int16x8_t result = vcombine_s16 (a, vcreate_s16 (__AARCH64_UINT64_C (0x0))); - __asm__ ("subhn2 %0.8h, %1.4s, %2.4s" - : "+w"(result) - : "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vsubhn_high_s64 (int32x2_t a, int64x2_t b, int64x2_t c) -{ - int32x4_t result = vcombine_s32 (a, vcreate_s32 (__AARCH64_UINT64_C (0x0))); - __asm__ ("subhn2 %0.4s, %1.2d, %2.2d" - : "+w"(result) - : "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vsubhn_high_u16 (uint8x8_t a, uint16x8_t b, uint16x8_t c) -{ - uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("subhn2 %0.16b, %1.8h, %2.8h" - : "+w"(result) - : "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vsubhn_high_u32 (uint16x4_t a, uint32x4_t b, uint32x4_t c) -{ - uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0))); - __asm__ ("subhn2 %0.8h, %1.4s, %2.4s" - : "+w"(result) - : "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vsubhn_high_u64 (uint32x2_t a, uint64x2_t b, uint64x2_t c) -{ - uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0))); - __asm__ ("subhn2 %0.4s, %1.2d, %2.2d" - : "+w"(result) - : "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vsubhn_s16 (int16x8_t a, int16x8_t b) -{ - int8x8_t result; - __asm__ ("subhn %0.8b, %1.8h, %2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vsubhn_s32 (int32x4_t a, int32x4_t b) -{ - int16x4_t result; - __asm__ ("subhn %0.4h, %1.4s, %2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vsubhn_s64 (int64x2_t a, int64x2_t b) -{ - int32x2_t result; - __asm__ ("subhn %0.2s, %1.2d, %2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vsubhn_u16 (uint16x8_t a, uint16x8_t b) -{ - uint8x8_t result; - __asm__ ("subhn %0.8b, %1.8h, %2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vsubhn_u32 (uint32x4_t a, uint32x4_t b) -{ - uint16x4_t result; - __asm__ ("subhn %0.4h, %1.4s, %2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vsubhn_u64 (uint64x2_t a, uint64x2_t b) -{ - uint32x2_t result; - __asm__ ("subhn %0.2s, %1.2d, %2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) vtst_p8 (poly8x8_t a, 
poly8x8_t b) @@ -15425,6 +15149,42 @@ vfma_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c) return (float64x1_t) {__builtin_fma (__b[0], __c[0], __a[0])}; } +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vfma_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c) +{ + return __builtin_aarch64_fmav2sf (__b, __c, __a); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vfmaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c) +{ + return __builtin_aarch64_fmav4sf (__b, __c, __a); +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vfmaq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c) +{ + return __builtin_aarch64_fmav2df (__b, __c, __a); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) +{ + return __builtin_aarch64_fmav2sf (__b, vdup_n_f32 (__c), __a); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vfmaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) +{ + return __builtin_aarch64_fmav4sf (__b, vdupq_n_f32 (__c), __a); +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vfmaq_n_f64 (float64x2_t __a, float64x2_t __b, float64_t __c) +{ + return __builtin_aarch64_fmav2df (__b, vdupq_n_f64 (__c), __a); +} + /* vfma_lane */ __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) @@ -15536,6 +15296,25 @@ vfms_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c) return (float64x1_t) {__builtin_fma (-__b[0], __c[0], __a[0])}; } +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vfms_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c) +{ + return __builtin_aarch64_fmav2sf (-__b, __c, __a); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vfmsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c) +{ + return __builtin_aarch64_fmav4sf (-__b, __c, __a); +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vfmsq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c) +{ + return __builtin_aarch64_fmav2df (-__b, __c, __a); +} + + /* vfms_lane */ __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) @@ -20966,6 +20745,18 @@ vrbitq_u8 (uint8x16_t __a) /* vrecpe */ +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vrecpe_u32 (uint32x2_t __a) +{ + return (uint32x2_t) __builtin_aarch64_urecpev2si ((int32x2_t) __a); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vrecpeq_u32 (uint32x4_t __a) +{ + return (uint32x4_t) __builtin_aarch64_urecpev4si ((int32x4_t) __a); +} + __extension__ static __inline float32_t __attribute__ ((__always_inline__)) vrecpes_f32 (float32_t __a) { diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 76be6927eb2..16a2647cc60 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -128,6 +128,9 @@ ;; Vector modes except double int. (define_mode_iterator VDQIF [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF V2DF]) +;; Vector modes for S type. +(define_mode_iterator VDQ_SI [V2SI V4SI]) + ;; Vector modes for Q and H types. 
(define_mode_iterator VDQQH [V8QI V16QI V4HI V8HI]) diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 05342ed6fbd..11bfcb7580a 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,14 @@ +2014-12-08 Felix Yang + Haijian Zhang + Jiji Jiang + Pengfei Sui + + * gcc.target/aarch64/advsimd-intrinsics/vfma.c: New test. + * gcc.target/aarch64/advsimd-intrinsics/vfma_n.c: New test. + * gcc.target/aarch64/advsimd-intrinsics/vfms.c: New test. + * gcc.target/aarch64/narrow_high-intrinsics.c: Fix expected assembler + for rsubhn2 & subhn2. + 2014-12-08 Ilya Enkovich * gcc.target/i386/chkp-bndret.c: New. diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma.c new file mode 100644 index 00000000000..7ff482ce812 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma.c @@ -0,0 +1,67 @@ +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +/* Expected results. */ +VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x4438ca3d, 0x44390a3d }; +VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x44869eb8, 0x4486beb8, 0x4486deb8, 0x4486feb8 }; +VECT_VAR_DECL(expected,hfloat,64,2) [] = { 0x408906e1532b8520, 0x40890ee1532b8520 }; + +#define TEST_MSG "VFMA/VFMAQ" +void exec_vfma (void) +{ + /* Basic test: v4=vfma(v1,v2), then store the result. */ +#define TEST_VFMA(Q, T1, T2, W, N) \ + VECT_VAR(vector_res, T1, W, N) = \ + vfma##Q##_##T2##W(VECT_VAR(vector1, T1, W, N), \ + VECT_VAR(vector2, T1, W, N), \ + VECT_VAR(vector3, T1, W, N)); \ + vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vector_res, T1, W, N)) + +#define CHECK_VFMA_RESULTS(test_name,comment) \ + { \ + CHECK_FP(test_name, float, 32, 2, PRIx32, expected, comment); \ + CHECK_FP(test_name, float, 32, 4, PRIx32, expected, comment); \ + CHECK_FP(test_name, float, 64, 2, PRIx64, expected, comment); \ + } + +#define DECL_VABD_VAR(VAR) \ + DECL_VARIABLE(VAR, float, 32, 2); \ + DECL_VARIABLE(VAR, float, 32, 4); \ + DECL_VARIABLE(VAR, float, 64, 2); + + DECL_VABD_VAR(vector1); + DECL_VABD_VAR(vector2); + DECL_VABD_VAR(vector3); + DECL_VABD_VAR(vector_res); + + clean_results (); + + /* Initialize input "vector1" from "buffer". */ + VLOAD(vector1, buffer, , float, f, 32, 2); + VLOAD(vector1, buffer, q, float, f, 32, 4); + VLOAD(vector1, buffer, q, float, f, 64, 2); + + /* Choose init value arbitrarily. */ + VDUP(vector2, , float, f, 32, 2, 9.3f); + VDUP(vector2, q, float, f, 32, 4, 29.7f); + VDUP(vector2, q, float, f, 64, 2, 15.8f); + + /* Choose init value arbitrarily. */ + VDUP(vector3, , float, f, 32, 2, 81.2f); + VDUP(vector3, q, float, f, 32, 4, 36.8f); + VDUP(vector3, q, float, f, 64, 2, 51.7f); + + /* Execute the tests. */ + TEST_VFMA(, float, f, 32, 2); + TEST_VFMA(q, float, f, 32, 4); + TEST_VFMA(q, float, f, 64, 2); + + CHECK_VFMA_RESULTS (TEST_MSG, ""); +} + +int main (void) +{ + exec_vfma (); + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma_n.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma_n.c new file mode 100644 index 00000000000..d773f8b3076 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma_n.c @@ -0,0 +1,69 @@ +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +/* Expected results. 
*/ +VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x4438ca3d, 0x44390a3d }; +VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x44869eb8, 0x4486beb8, 0x4486deb8, 0x4486feb8 }; +VECT_VAR_DECL(expected,hfloat,64,2) [] = { 0x408906e1532b8520, 0x40890ee1532b8520 }; + +#define VECT_VAR_ASSIGN(S,Q,T1,W) S##Q##_##T1##W +#define ASSIGN(S, Q, T, W, V) T##W##_t S##Q##_##T##W = V +#define TEST_MSG "VFMA_N/VFMAQ_N" +void exec_vfma_n (void) +{ + /* Basic test: v4=vfma_n(v1,v2), then store the result. */ +#define TEST_VFMA(Q, T1, T2, W, N) \ + VECT_VAR(vector_res, T1, W, N) = \ + vfma##Q##_n_##T2##W(VECT_VAR(vector1, T1, W, N), \ + VECT_VAR(vector2, T1, W, N), \ + VECT_VAR_ASSIGN(Scalar, Q, T1, W)); \ + vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vector_res, T1, W, N)) + +#define CHECK_VFMA_RESULTS(test_name,comment) \ + { \ + CHECK_FP(test_name, float, 32, 2, PRIx32, expected, comment); \ + CHECK_FP(test_name, float, 32, 4, PRIx32, expected, comment); \ + CHECK_FP(test_name, float, 64, 2, PRIx64, expected, comment); \ + } + +#define DECL_VABD_VAR(VAR) \ + DECL_VARIABLE(VAR, float, 32, 2); \ + DECL_VARIABLE(VAR, float, 32, 4); \ + DECL_VARIABLE(VAR, float, 64, 2); + + DECL_VABD_VAR(vector1); + DECL_VABD_VAR(vector2); + DECL_VABD_VAR(vector3); + DECL_VABD_VAR(vector_res); + + clean_results (); + + /* Initialize input "vector1" from "buffer". */ + VLOAD(vector1, buffer, , float, f, 32, 2); + VLOAD(vector1, buffer, q, float, f, 32, 4); + VLOAD(vector1, buffer, q, float, f, 64, 2); + + /* Choose init value arbitrarily. */ + VDUP(vector2, , float, f, 32, 2, 9.3f); + VDUP(vector2, q, float, f, 32, 4, 29.7f); + VDUP(vector2, q, float, f, 64, 2, 15.8f); + + /* Choose init value arbitrarily. */ + ASSIGN(Scalar, , float, 32, 81.2f); + ASSIGN(Scalar, q, float, 32, 36.8f); + ASSIGN(Scalar, q, float, 64, 51.7f); + + /* Execute the tests. */ + TEST_VFMA(, float, f, 32, 2); + TEST_VFMA(q, float, f, 32, 4); + TEST_VFMA(q, float, f, 64, 2); + + CHECK_VFMA_RESULTS (TEST_MSG, ""); +} + +int main (void) +{ + exec_vfma_n (); + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms.c new file mode 100644 index 00000000000..f70e56a04b5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms.c @@ -0,0 +1,67 @@ +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +/* Expected results. */ +VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc440ca3d, 0xc4408a3d }; +VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc48a9eb8, 0xc48a7eb8, 0xc48a5eb8, 0xc48a3eb8 }; +VECT_VAR_DECL(expected,hfloat,64,2) [] = { 0xc08a06e1532b8520, 0xc089fee1532b8520 }; + +#define TEST_MSG "VFMS/VFMSQ" +void exec_vfms (void) +{ + /* Basic test: v4=vfms(v1,v2), then store the result. 
*/ +#define TEST_VFMA(Q, T1, T2, W, N) \ + VECT_VAR(vector_res, T1, W, N) = \ + vfms##Q##_##T2##W(VECT_VAR(vector1, T1, W, N), \ + VECT_VAR(vector2, T1, W, N), \ + VECT_VAR(vector3, T1, W, N)); \ + vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vector_res, T1, W, N)) + +#define CHECK_VFMA_RESULTS(test_name,comment) \ + { \ + CHECK_FP(test_name, float, 32, 2, PRIx32, expected, comment); \ + CHECK_FP(test_name, float, 32, 4, PRIx32, expected, comment); \ + CHECK_FP(test_name, float, 64, 2, PRIx64, expected, comment); \ + } + +#define DECL_VABD_VAR(VAR) \ + DECL_VARIABLE(VAR, float, 32, 2); \ + DECL_VARIABLE(VAR, float, 32, 4); \ + DECL_VARIABLE(VAR, float, 64, 2); + + DECL_VABD_VAR(vector1); + DECL_VABD_VAR(vector2); + DECL_VABD_VAR(vector3); + DECL_VABD_VAR(vector_res); + + clean_results (); + + /* Initialize input "vector1" from "buffer". */ + VLOAD(vector1, buffer, , float, f, 32, 2); + VLOAD(vector1, buffer, q, float, f, 32, 4); + VLOAD(vector1, buffer, q, float, f, 64, 2); + + /* Choose init value arbitrarily. */ + VDUP(vector2, , float, f, 32, 2, 9.3f); + VDUP(vector2, q, float, f, 32, 4, 29.7f); + VDUP(vector2, q, float, f, 64, 2, 15.8f); + + /* Choose init value arbitrarily. */ + VDUP(vector3, , float, f, 32, 2, 81.2f); + VDUP(vector3, q, float, f, 32, 4, 36.8f); + VDUP(vector3, q, float, f, 64, 2, 51.7f); + + /* Execute the tests. */ + TEST_VFMA(, float, f, 32, 2); + TEST_VFMA(q, float, f, 32, 4); + TEST_VFMA(q, float, f, 64, 2); + + CHECK_VFMA_RESULTS (TEST_MSG, ""); +} + +int main (void) +{ + exec_vfms (); + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/narrow_high-intrinsics.c b/gcc/testsuite/gcc.target/aarch64/narrow_high-intrinsics.c index 0f23cc9c7b5..8b8a6302692 100644 --- a/gcc/testsuite/gcc.target/aarch64/narrow_high-intrinsics.c +++ b/gcc/testsuite/gcc.target/aarch64/narrow_high-intrinsics.c @@ -107,9 +107,9 @@ ONE (vmovn_high, uint16x8_t, uint16x4_t, uint32x4_t, u32) ONE (vmovn_high, uint32x4_t, uint32x2_t, uint64x2_t, u64) -/* { dg-final { scan-assembler-times "\\tsubhn2 v" 6} } */ +/* { dg-final { scan-assembler-times "\\tsubhn2\\tv" 6} } */ /* { dg-final { scan-assembler-times "\\taddhn2\\tv" 6} } */ -/* { dg-final { scan-assembler-times "rsubhn2 v" 6} } */ +/* { dg-final { scan-assembler-times "rsubhn2\\tv" 6} } */ /* { dg-final { scan-assembler-times "raddhn2\\tv" 6} } */ /* { dg-final { scan-assembler-times "\\trshrn2 v" 6} } */ /* { dg-final { scan-assembler-times "\\tshrn2 v" 6} } */ -- 2.30.2
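For reference, here is a minimal standalone check of the semantics the rewritten intrinsics must preserve. This is a sketch, not part of the committed patch: it assumes an aarch64 target and the ACLE definitions of these intrinsics, and its plain abort-on-mismatch driver is invented for illustration, unlike the advsimd-intrinsics harness the new tests use. The expected lane values are worked out in the comments.

#include <arm_neon.h>
#include <stdio.h>
#include <stdlib.h>

int
main (void)
{
  /* vhsub_s8: halving subtract, (a - b) >> 1 with an arithmetic shift.  */
  int8x8_t h = vhsub_s8 (vdup_n_s8 (1), vdup_n_s8 (2));
  if (vget_lane_s8 (h, 0) != -1)	/* (1 - 2) >> 1 == -1.  */
    abort ();

  /* vsubhn_s16: high half of the wide difference, narrowed to 8 bits.  */
  int8x8_t n = vsubhn_s16 (vdupq_n_s16 (0x7000), vdupq_n_s16 (0x0100));
  if (vget_lane_s8 (n, 0) != 0x6F)	/* 0x6F00 >> 8 == 0x6F.  */
    abort ();

  /* vrsubhn_s16: as above but rounded, ((a - b) + 0x80) >> 8.  */
  int8x8_t r = vrsubhn_s16 (vdupq_n_s16 (0x00FF), vdupq_n_s16 (0));
  if (vget_lane_s8 (r, 0) != 1)		/* (0xFF + 0x80) >> 8 == 1.  */
    abort ();

  /* vfma_f32: fused multiply-accumulate, a + b * c in each lane.  */
  float32x2_t f = vfma_f32 (vdup_n_f32 (1.0f), vdup_n_f32 (2.0f),
			    vdup_n_f32 (3.0f));
  if (vget_lane_f32 (f, 0) != 7.0f)	/* 1 + 2 * 3 == 7.  */
    abort ();

  puts ("ok");
  return 0;
}

Going through builtins instead of inline asm also lets the compiler constant-fold, schedule, and register-allocate these operations like ordinary RTL; the asm blocks they replace were opaque to optimization, which is the usual motivation for rewrites of this kind.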