[AArch64] Improve arm_neon.h vml<as>_lane handling.
author		James Greenhalgh <james.greenhalgh@arm.com>
		Mon, 16 Sep 2013 09:53:11 +0000 (09:53 +0000)
committer	James Greenhalgh <jgreenhalgh@gcc.gnu.org>
		Mon, 16 Sep 2013 09:53:11 +0000 (09:53 +0000)
gcc/
* config/aarch64/aarch64-simd-builtins.def (fma): New.
* config/aarch64/aarch64-simd.md
(aarch64_mla_elt<mode>): New.
(aarch64_mla_elt_<vswap_width_name><mode>): Likewise.
(aarch64_mls_elt<mode>): Likewise.
(aarch64_mls_elt_<vswap_width_name><mode>): Likewise.
(aarch64_fma4_elt<mode>): Likewise.
(aarch64_fma4_elt_<vswap_width_name><mode>): Likewise.
(aarch64_fma4_elt_to_128df): Likewise.
(aarch64_fma4_elt_to_64v2df): Likewise.
(fnma<mode>4): Likewise.
(aarch64_fnma4_elt<mode>): Likewise.
(aarch64_fnma4_elt_<vswap_width_name><mode>): Likewise.
(aarch64_fnma4_elt_to_128df): Likewise.
(aarch64_fnma4_elt_to_64v2df): Likewise.
* config/aarch64/iterators.md (VDQSF): New.
* config/aarch64/arm_neon.h
(vfm<as><sdq>_lane<q>_f<32, 64>): Convert to C implementation.
(vml<sa><q>_lane<q>_<fsu><16, 32, 64>): Likewise.

gcc/testsuite/
* gcc.target/aarch64/fmla_intrinsic_1.c: New.
* gcc.target/aarch64/mla_intrinsic_1.c: Likewise.
* gcc.target/aarch64/fmls_intrinsic_1.c: Likewise.
* gcc.target/aarch64/mls_intrinsic_1.c: Likewise.

From-SVN: r202625

gcc/ChangeLog
gcc/config/aarch64/aarch64-simd-builtins.def
gcc/config/aarch64/aarch64-simd.md
gcc/config/aarch64/arm_neon.h
gcc/config/aarch64/iterators.md
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/mla_intrinsic_1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/mls_intrinsic_1.c [new file with mode: 0644]
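
In outline: the existing vfma*/vmla*-style lane intrinsics in arm_neon.h
were macros wrapping inline asm, which the optimizers cannot see through.
The patch rewrites them as plain C on top of compiler builtins and adds
combiner patterns so the resulting duplicate/multiply/accumulate trees
still collapse to single lane-indexed instructions.  A before/after
sketch for vfma_lane_f32, with both forms taken from the diff below:

    /* Before: opaque inline asm.  */
    #define vfma_lane_f32(a, b, c, d)                                  \
      __extension__                                                    \
        ({                                                             \
           float32x2_t c_ = (c);                                       \
           float32x2_t b_ = (b);                                       \
           float32x2_t a_ = (a);                                       \
           float32x2_t result;                                         \
           __asm__ ("fmla %0.2s,%2.2s,%3.s[%4]"                        \
                    : "=w"(result)                                     \
                    : "0"(a_), "w"(b_), "w"(c_), "i"(d)                \
                    : /* No clobbers */);                              \
           result;                                                     \
         })

    /* After: ordinary C on a builtin; the new *aarch64_fma4_elt<mode>
       pattern folds the vec_duplicate back into a lane-indexed fmla.  */
    __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
    vfma_lane_f32 (float32x2_t __a, float32x2_t __b,
                   float32x2_t __c, const int __lane)
    {
      return __builtin_aarch64_fmav2sf (__b,
                                        __aarch64_vdup_lane_f32 (__c, __lane),
                                        __a);
    }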

gcc/ChangeLog
index aaab5ece33576b6266f3ca185b6acb9c4d1e44e2..68091bbcbbd1c46c13bfeb5e0b54b3464a39c1d2 100644
@@ -1,3 +1,25 @@
+2013-09-16  James Greenhalgh  <james.greenhalgh@arm.com>
+
+       * config/aarch64/aarch64-simd-builtins.def (fma): New.
+       * config/aarch64/aarch64-simd.md
+       (aarch64_mla_elt<mode>): New.
+       (aarch64_mla_elt_<vswap_width_name><mode>): Likewise.
+       (aarch64_mls_elt<mode>): Likewise.
+       (aarch64_mls_elt_<vswap_width_name><mode>): Likewise.
+       (aarch64_fma4_elt<mode>): Likewise.
+       (aarch64_fma4_elt_<vswap_width_name><mode>): Likewise.
+       (aarch64_fma4_elt_to_128df): Likewise.
+       (aarch64_fma4_elt_to_64v2df): Likewise.
+       (fnma<mode>4): Likewise.
+       (aarch64_fnma4_elt<mode>): Likewise.
+       (aarch64_fnma4_elt_<vswap_width_name><mode>): Likewise.
+       (aarch64_fnma4_elt_to_128df): Likewise.
+       (aarch64_fnma4_elt_to_64v2df): Likewise.
+       * config/aarch64/iterators.md (VDQSF): New.
+       * config/aarch64/arm_neon.h
+       (vfm<as><sdq>_lane<q>_f<32, 64>): Convert to C implementation.
+       (vml<sa><q>_lane<q>_<fsu><16, 32, 64>): Likewise.
+
 2013-09-16  James Greenhalgh  <james.greenhalgh@arm.com>
 
        * config/aarch64/aarch64-simd.md (aarch64_mul3_elt<mode>): New.
gcc/config/aarch64/aarch64-simd-builtins.def
index 4046d7a7001c1eb81e79ae59a8a639312e1e7e6b..35897f3939556d7bb804d4b4ae692a300b103681 100644
   /* Implemented by aarch64_st1<VALL:mode>.  */
   BUILTIN_VALL (STORE1, st1, 0)
 
+  /* Implemented by fma<mode>4.  */
+  BUILTIN_VDQF (TERNOP, fma, 4)
+
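
The TERNOP entry above registers a three-operand fma builtin for each
VDQF mode, expanding through the corresponding fma<mode>4 pattern.  A
minimal usage sketch for the V2SF instance (illustrative function name;
this is the builtin the new arm_neon.h implementations call, with
operand order multiplicand, multiplier, accumuland):

    float32x2_t
    fma_v2sf_example (float32x2_t x, float32x2_t y, float32x2_t acc)
    {
      /* Computes x * y + acc; expands through fmav2sf4 to fmla.  */
      return __builtin_aarch64_fmav2sf (x, y, acc);
    }
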
gcc/config/aarch64/aarch64-simd.md
index 04d5794ffcae73a8b33844f3147e4315747deb69..f13cd5b7cdbdff95bbc378a76a6dd05de031487d 100644
    (set_attr "simd_mode" "<MODE>")]
 )
 
+(define_insn "*aarch64_mla_elt<mode>"
+ [(set (match_operand:VDQHS 0 "register_operand" "=w")
+       (plus:VDQHS
+        (mult:VDQHS
+          (vec_duplicate:VDQHS
+             (vec_select:<VEL>
+               (match_operand:VDQHS 1 "register_operand" "<h_con>")
+                 (parallel [(match_operand:SI 2 "immediate_operand")])))
+          (match_operand:VDQHS 3 "register_operand" "w"))
+        (match_operand:VDQHS 4 "register_operand" "0")))]
+ "TARGET_SIMD"
+ "mla\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
+  [(set_attr "simd_type" "simd_mla")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
+(define_insn "*aarch64_mla_elt_<vswap_width_name><mode>"
+ [(set (match_operand:VDQHS 0 "register_operand" "=w")
+       (plus:VDQHS
+        (mult:VDQHS
+          (vec_duplicate:VDQHS
+             (vec_select:<VEL>
+               (match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
+                 (parallel [(match_operand:SI 2 "immediate_operand")])))
+          (match_operand:VDQHS 3 "register_operand" "w"))
+        (match_operand:VDQHS 4 "register_operand" "0")))]
+ "TARGET_SIMD"
+ "mla\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
+  [(set_attr "simd_type" "simd_mla")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
 (define_insn "aarch64_mls<mode>"
  [(set (match_operand:VQ_S 0 "register_operand" "=w")
        (minus:VQ_S (match_operand:VQ_S 1 "register_operand" "0")
    (set_attr "simd_mode" "<MODE>")]
 )
 
+(define_insn "*aarch64_mls_elt<mode>"
+ [(set (match_operand:VDQHS 0 "register_operand" "=w")
+       (minus:VDQHS
+        (match_operand:VDQHS 4 "register_operand" "0")
+        (mult:VDQHS
+          (vec_duplicate:VDQHS
+             (vec_select:<VEL>
+               (match_operand:VDQHS 1 "register_operand" "<h_con>")
+                 (parallel [(match_operand:SI 2 "immediate_operand")])))
+          (match_operand:VDQHS 3 "register_operand" "w"))))]
+ "TARGET_SIMD"
+ "mls\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
+  [(set_attr "simd_type" "simd_mla")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
+(define_insn "*aarch64_mls_elt_<vswap_width_name><mode>"
+ [(set (match_operand:VDQHS 0 "register_operand" "=w")
+       (minus:VDQHS
+        (match_operand:VDQHS 4 "register_operand" "0")
+        (mult:VDQHS
+          (vec_duplicate:VDQHS
+             (vec_select:<VEL>
+               (match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
+                 (parallel [(match_operand:SI 2 "immediate_operand")])))
+          (match_operand:VDQHS 3 "register_operand" "w"))))]
+ "TARGET_SIMD"
+ "mls\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
+  [(set_attr "simd_type" "simd_mla")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
 ;; Max/Min operations.
 (define_insn "<su><maxmin><mode>3"
  [(set (match_operand:VQ_S 0 "register_operand" "=w")
    (set_attr "simd_mode" "<MODE>")]
 )
 
+(define_insn "*aarch64_fma4_elt<mode>"
+  [(set (match_operand:VDQF 0 "register_operand" "=w")
+    (fma:VDQF
+      (vec_duplicate:VDQF
+       (vec_select:<VEL>
+         (match_operand:VDQF 1 "register_operand" "<h_con>")
+         (parallel [(match_operand:SI 2 "immediate_operand")])))
+      (match_operand:VDQF 3 "register_operand" "w")
+      (match_operand:VDQF 4 "register_operand" "0")))]
+  "TARGET_SIMD"
+  "fmla\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
+  [(set_attr "simd_type" "simd_fmla_elt")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
+(define_insn "*aarch64_fma4_elt_<vswap_width_name><mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+    (fma:VDQSF
+      (vec_duplicate:VDQSF
+       (vec_select:<VEL>
+         (match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
+         (parallel [(match_operand:SI 2 "immediate_operand")])))
+      (match_operand:VDQSF 3 "register_operand" "w")
+      (match_operand:VDQSF 4 "register_operand" "0")))]
+  "TARGET_SIMD"
+  "fmla\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
+  [(set_attr "simd_type" "simd_fmla_elt")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
+(define_insn "*aarch64_fma4_elt_to_128df"
+  [(set (match_operand:V2DF 0 "register_operand" "=w")
+    (fma:V2DF
+      (vec_duplicate:V2DF
+         (match_operand:DF 1 "register_operand" "w"))
+      (match_operand:V2DF 2 "register_operand" "w")
+      (match_operand:V2DF 3 "register_operand" "0")))]
+  "TARGET_SIMD"
+  "fmla\\t%0.2d, %2.2d, %1.2d[0]"
+  [(set_attr "simd_type" "simd_fmla_elt")
+   (set_attr "simd_mode" "V2DF")]
+)
+
+(define_insn "*aarch64_fma4_elt_to_64v2df"
+  [(set (match_operand:DF 0 "register_operand" "=w")
+    (fma:DF
+       (vec_select:DF
+         (match_operand:V2DF 1 "register_operand" "w")
+         (parallel [(match_operand:SI 2 "immediate_operand")]))
+      (match_operand:DF 3 "register_operand" "w")
+      (match_operand:DF 4 "register_operand" "0")))]
+  "TARGET_SIMD"
+  "fmla\\t%0.2d, %3.2d, %1.2d[%2]"
+  [(set_attr "simd_type" "simd_fmla_elt")
+   (set_attr "simd_mode" "V2DF")]
+)
+
+(define_insn "fnma<mode>4"
+  [(set (match_operand:VDQF 0 "register_operand" "=w")
+       (fma:VDQF
+         (match_operand:VDQF 1 "register_operand" "w")
+          (neg:VDQF
+           (match_operand:VDQF 2 "register_operand" "w"))
+         (match_operand:VDQF 3 "register_operand" "0")))]
+  "TARGET_SIMD"
+ "fmls\\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
+  [(set_attr "simd_type" "simd_fmla")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
+(define_insn "*aarch64_fnma4_elt<mode>"
+  [(set (match_operand:VDQF 0 "register_operand" "=w")
+    (fma:VDQF
+      (neg:VDQF
+        (match_operand:VDQF 3 "register_operand" "w"))
+      (vec_duplicate:VDQF
+       (vec_select:<VEL>
+         (match_operand:VDQF 1 "register_operand" "<h_con>")
+         (parallel [(match_operand:SI 2 "immediate_operand")])))
+      (match_operand:VDQF 4 "register_operand" "0")))]
+  "TARGET_SIMD"
+  "fmls\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
+  [(set_attr "simd_type" "simd_fmla_elt")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
+(define_insn "*aarch64_fnma4_elt_<vswap_width_name><mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+    (fma:VDQSF
+      (neg:VDQSF
+        (match_operand:VDQSF 3 "register_operand" "w"))
+      (vec_duplicate:VDQSF
+       (vec_select:<VEL>
+         (match_operand:<VSWAP_WIDTH> 1 "register_operand" "<h_con>")
+         (parallel [(match_operand:SI 2 "immediate_operand")])))
+      (match_operand:VDQSF 4 "register_operand" "0")))]
+  "TARGET_SIMD"
+  "fmls\\t%0.<Vtype>, %3.<Vtype>, %1.<Vtype>[%2]"
+  [(set_attr "simd_type" "simd_fmla_elt")
+   (set_attr "simd_mode" "<MODE>")]
+)
+
+(define_insn "*aarch64_fnma4_elt_to_128df"
+  [(set (match_operand:V2DF 0 "register_operand" "=w")
+    (fma:V2DF
+      (neg:V2DF
+        (match_operand:V2DF 2 "register_operand" "w"))
+      (vec_duplicate:V2DF
+       (match_operand:DF 1 "register_operand" "w"))
+      (match_operand:V2DF 3 "register_operand" "0")))]
+  "TARGET_SIMD"
+  "fmls\\t%0.2d, %2.2d, %1.2d[0]"
+  [(set_attr "simd_type" "simd_fmla_elt")
+   (set_attr "simd_mode" "V2DF")]
+)
+
+(define_insn "*aarch64_fnma4_elt_to_64v2df"
+  [(set (match_operand:DF 0 "register_operand" "=w")
+    (fma:DF
+      (vec_select:DF
+       (match_operand:V2DF 1 "register_operand" "w")
+       (parallel [(match_operand:SI 2 "immediate_operand")]))
+      (neg:DF
+        (match_operand:DF 3 "register_operand" "w"))
+      (match_operand:DF 4 "register_operand" "0")))]
+  "TARGET_SIMD"
+  "fmls\\t%0.2d, %3.2d, %1.2d[%2]"
+  [(set_attr "simd_type" "simd_fmla_elt")
+   (set_attr "simd_mode" "V2DF")]
+)
+
 ;; Vector versions of the floating-point frint patterns.
 ;; Expands to btrunc, ceil, floor, nearbyint, rint, round.
 (define_insn "<frint_pattern><mode>2"
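
With the intrinsics expressed in C, combine now sees the explicit
dup/mul/accumulate RTL and matches the patterns above to recover a
single lane-indexed instruction.  A sketch of the intended effect
(illustrative function name; assumes the converted vmla_lane_s32 from
the arm_neon.h changes below):

    int32x2_t
    mla_lane_example (int32x2_t acc, int32x2_t x, int32x2_t v)
    {
      /* The (plus (mult (vec_duplicate (vec_select ...)) ...) ...)
         tree matches *aarch64_mla_elt<mode> and emits:
             mla  v0.2s, v1.2s, v2.s[1]  */
      return vmla_lane_s32 (acc, x, v, 1);
    }

The *aarch64_mls_elt* patterns do the same for mls, while fnma<mode>4
and the *aarch64_fnma4_elt* patterns cover the fused negated forms,
emitting fmls.
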
gcc/config/aarch64/arm_neon.h
index 6c9dd79a69508139bfb09631941396aa042a05cc..cb5860206a1812f347a77d4a6e06519f8c3a696f 100644
@@ -6100,33 +6100,6 @@ vfma_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
   return result;
 }
 
-#define vfma_lane_f32(a, b, c, d)                                       \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x2_t c_ = (c);                                            \
-       float32x2_t b_ = (b);                                            \
-       float32x2_t a_ = (a);                                            \
-       float32x2_t result;                                              \
-       __asm__ ("fmla %0.2s,%2.2s,%3.s[%4]"                             \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vfmad_lane_f64(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       float64x2_t b_ = (b);                                            \
-       float64_t a_ = (a);                                              \
-       float64_t result;                                                \
-       __asm__ ("fmla %d0,%d1,%2.d[%3]"                                 \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vfmaq_f32 (float32x4_t a, float32x4_t b, float32x4_t c)
 {
@@ -6149,47 +6122,6 @@ vfmaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
   return result;
 }
 
-#define vfmaq_lane_f32(a, b, c, d)                                      \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t c_ = (c);                                            \
-       float32x4_t b_ = (b);                                            \
-       float32x4_t a_ = (a);                                            \
-       float32x4_t result;                                              \
-       __asm__ ("fmla %0.4s,%2.4s,%3.s[%4]"                             \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vfmaq_lane_f64(a, b, c, d)                                      \
-  __extension__                                                         \
-    ({                                                                  \
-       float64x2_t c_ = (c);                                            \
-       float64x2_t b_ = (b);                                            \
-       float64x2_t a_ = (a);                                            \
-       float64x2_t result;                                              \
-       __asm__ ("fmla %0.2d,%2.2d,%3.d[%4]"                             \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vfmas_lane_f32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t b_ = (b);                                            \
-       float32_t a_ = (a);                                              \
-       float32_t result;                                                \
-       __asm__ ("fmla %s0,%s1,%2.s[%3]"                                 \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vfma_n_f32 (float32x2_t a, float32x2_t b, float32_t c)
 {
@@ -6234,19 +6166,6 @@ vfms_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
   return result;
 }
 
-#define vfmsd_lane_f64(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       float64x2_t b_ = (b);                                            \
-       float64_t a_ = (a);                                              \
-       float64_t result;                                                \
-       __asm__ ("fmls %d0,%d1,%2.d[%3]"                                 \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vfmsq_f32 (float32x4_t a, float32x4_t b, float32x4_t c)
 {
@@ -6269,19 +6188,6 @@ vfmsq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
   return result;
 }
 
-#define vfmss_lane_f32(a, b, c)                                         \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t b_ = (b);                                            \
-       float32_t a_ = (a);                                              \
-       float32_t result;                                                \
-       __asm__ ("fmls %s0,%s1,%2.s[%3]"                                 \
-                : "=w"(result)                                          \
-                : "w"(a_), "w"(b_), "i"(c)                              \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vget_high_f32 (float32x4_t a)
 {
@@ -7122,133 +7028,6 @@ vld1q_dup_u64 (const uint64_t * a)
        result;                                                          \
      })
 
-#define vmla_lane_f32(a, b, c, d)                                       \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x2_t c_ = (c);                                            \
-       float32x2_t b_ = (b);                                            \
-       float32x2_t a_ = (a);                                            \
-       float32x2_t result;                                              \
-       float32x2_t t1;                                                  \
-       __asm__ ("fmul %1.2s, %3.2s, %4.s[%5]; fadd %0.2s, %0.2s, %1.2s" \
-                : "=w"(result), "=w"(t1)                                \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmla_lane_s16(a, b, c, d)                                       \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x4_t c_ = (c);                                              \
-       int16x4_t b_ = (b);                                              \
-       int16x4_t a_ = (a);                                              \
-       int16x4_t result;                                                \
-       __asm__ ("mla %0.4h, %2.4h, %3.h[%4]"                            \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmla_lane_s32(a, b, c, d)                                       \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x2_t c_ = (c);                                              \
-       int32x2_t b_ = (b);                                              \
-       int32x2_t a_ = (a);                                              \
-       int32x2_t result;                                                \
-       __asm__ ("mla %0.2s, %2.2s, %3.s[%4]"                            \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmla_lane_u16(a, b, c, d)                                       \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x4_t c_ = (c);                                             \
-       uint16x4_t b_ = (b);                                             \
-       uint16x4_t a_ = (a);                                             \
-       uint16x4_t result;                                               \
-       __asm__ ("mla %0.4h, %2.4h, %3.h[%4]"                            \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmla_lane_u32(a, b, c, d)                                       \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x2_t c_ = (c);                                             \
-       uint32x2_t b_ = (b);                                             \
-       uint32x2_t a_ = (a);                                             \
-       uint32x2_t result;                                               \
-       __asm__ ("mla %0.2s, %2.2s, %3.s[%4]"                            \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmla_laneq_s16(a, b, c, d)                                      \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x8_t c_ = (c);                                              \
-       int16x4_t b_ = (b);                                              \
-       int16x4_t a_ = (a);                                              \
-       int16x4_t result;                                                \
-       __asm__ ("mla %0.4h, %2.4h, %3.h[%4]"                            \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmla_laneq_s32(a, b, c, d)                                      \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x4_t c_ = (c);                                              \
-       int32x2_t b_ = (b);                                              \
-       int32x2_t a_ = (a);                                              \
-       int32x2_t result;                                                \
-       __asm__ ("mla %0.2s, %2.2s, %3.s[%4]"                            \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmla_laneq_u16(a, b, c, d)                                      \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x8_t c_ = (c);                                             \
-       uint16x4_t b_ = (b);                                             \
-       uint16x4_t a_ = (a);                                             \
-       uint16x4_t result;                                               \
-       __asm__ ("mla %0.4h, %2.4h, %3.h[%4]"                            \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmla_laneq_u32(a, b, c, d)                                      \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x4_t c_ = (c);                                             \
-       uint32x2_t b_ = (b);                                             \
-       uint32x2_t a_ = (a);                                             \
-       uint32x2_t result;                                               \
-       __asm__ ("mla %0.2s, %2.2s, %3.s[%4]"                            \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vmla_n_f32 (float32x2_t a, float32x2_t b, float32_t c)
 {
@@ -7815,133 +7594,6 @@ vmlal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
   return result;
 }
 
-#define vmlaq_lane_f32(a, b, c, d)                                      \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t c_ = (c);                                            \
-       float32x4_t b_ = (b);                                            \
-       float32x4_t a_ = (a);                                            \
-       float32x4_t result;                                              \
-       float32x4_t t1;                                                  \
-       __asm__ ("fmul %1.4s, %3.4s, %4.s[%5]; fadd %0.4s, %0.4s, %1.4s" \
-                : "=w"(result), "=w"(t1)                                \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmlaq_lane_s16(a, b, c, d)                                      \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x8_t c_ = (c);                                              \
-       int16x8_t b_ = (b);                                              \
-       int16x8_t a_ = (a);                                              \
-       int16x8_t result;                                                \
-       __asm__ ("mla %0.8h, %2.8h, %3.h[%4]"                            \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmlaq_lane_s32(a, b, c, d)                                      \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x4_t c_ = (c);                                              \
-       int32x4_t b_ = (b);                                              \
-       int32x4_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("mla %0.4s, %2.4s, %3.s[%4]"                            \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmlaq_lane_u16(a, b, c, d)                                      \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x8_t c_ = (c);                                             \
-       uint16x8_t b_ = (b);                                             \
-       uint16x8_t a_ = (a);                                             \
-       uint16x8_t result;                                               \
-       __asm__ ("mla %0.8h, %2.8h, %3.h[%4]"                            \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmlaq_lane_u32(a, b, c, d)                                      \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x4_t c_ = (c);                                             \
-       uint32x4_t b_ = (b);                                             \
-       uint32x4_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("mla %0.4s, %2.4s, %3.s[%4]"                            \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmlaq_laneq_s16(a, b, c, d)                                     \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x8_t c_ = (c);                                              \
-       int16x8_t b_ = (b);                                              \
-       int16x8_t a_ = (a);                                              \
-       int16x8_t result;                                                \
-       __asm__ ("mla %0.8h, %2.8h, %3.h[%4]"                            \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmlaq_laneq_s32(a, b, c, d)                                     \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x4_t c_ = (c);                                              \
-       int32x4_t b_ = (b);                                              \
-       int32x4_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("mla %0.4s, %2.4s, %3.s[%4]"                            \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmlaq_laneq_u16(a, b, c, d)                                     \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x8_t c_ = (c);                                             \
-       uint16x8_t b_ = (b);                                             \
-       uint16x8_t a_ = (a);                                             \
-       uint16x8_t result;                                               \
-       __asm__ ("mla %0.8h, %2.8h, %3.h[%4]"                            \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmlaq_laneq_u32(a, b, c, d)                                     \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x4_t c_ = (c);                                             \
-       uint32x4_t b_ = (b);                                             \
-       uint32x4_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("mla %0.4s, %2.4s, %3.s[%4]"                            \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vmlaq_n_f32 (float32x4_t a, float32x4_t b, float32_t c)
 {
@@ -8046,106 +7698,35 @@ vmlaq_s32 (int32x4_t a, int32x4_t b, int32x4_t c)
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
 vmlaq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c)
 {
-  uint8x16_t result;
-  __asm__ ("mla %0.16b, %2.16b, %3.16b"
-           : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmlaq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c)
-{
-  uint16x8_t result;
-  __asm__ ("mla %0.8h, %2.8h, %3.8h"
-           : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmlaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c)
-{
-  uint32x4_t result;
-  __asm__ ("mla %0.4s, %2.4s, %3.4s"
-           : "=w"(result)
-           : "0"(a), "w"(b), "w"(c)
-           : /* No clobbers */);
-  return result;
-}
-
-#define vmls_lane_f32(a, b, c, d)                                       \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x2_t c_ = (c);                                            \
-       float32x2_t b_ = (b);                                            \
-       float32x2_t a_ = (a);                                            \
-       float32x2_t result;                                              \
-       float32x2_t t1;                                                  \
-       __asm__ ("fmul %1.2s, %3.2s, %4.s[%5]; fsub %0.2s, %0.2s, %1.2s" \
-                : "=w"(result), "=w"(t1)                                \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmls_lane_s16(a, b, c, d)                                       \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x4_t c_ = (c);                                              \
-       int16x4_t b_ = (b);                                              \
-       int16x4_t a_ = (a);                                              \
-       int16x4_t result;                                                \
-       __asm__ ("mls %0.4h,%2.4h,%3.h[%4]"                              \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmls_lane_s32(a, b, c, d)                                       \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x2_t c_ = (c);                                              \
-       int32x2_t b_ = (b);                                              \
-       int32x2_t a_ = (a);                                              \
-       int32x2_t result;                                                \
-       __asm__ ("mls %0.2s,%2.2s,%3.s[%4]"                              \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmls_lane_u16(a, b, c, d)                                       \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x4_t c_ = (c);                                             \
-       uint16x4_t b_ = (b);                                             \
-       uint16x4_t a_ = (a);                                             \
-       uint16x4_t result;                                               \
-       __asm__ ("mls %0.4h,%2.4h,%3.h[%4]"                              \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
+  uint8x16_t result;
+  __asm__ ("mla %0.16b, %2.16b, %3.16b"
+           : "=w"(result)
+           : "0"(a), "w"(b), "w"(c)
+           : /* No clobbers */);
+  return result;
+}
 
-#define vmls_lane_u32(a, b, c, d)                                       \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x2_t c_ = (c);                                             \
-       uint32x2_t b_ = (b);                                             \
-       uint32x2_t a_ = (a);                                             \
-       uint32x2_t result;                                               \
-       __asm__ ("mls %0.2s,%2.2s,%3.s[%4]"                              \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmlaq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c)
+{
+  uint16x8_t result;
+  __asm__ ("mla %0.8h, %2.8h, %3.8h"
+           : "=w"(result)
+           : "0"(a), "w"(b), "w"(c)
+           : /* No clobbers */);
+  return result;
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c)
+{
+  uint32x4_t result;
+  __asm__ ("mla %0.4s, %2.4s, %3.4s"
+           : "=w"(result)
+           : "0"(a), "w"(b), "w"(c)
+           : /* No clobbers */);
+  return result;
+}
 
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vmls_n_f32 (float32x2_t a, float32x2_t b, float32_t c)
@@ -8713,148 +8294,6 @@ vmlsl_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
   return result;
 }
 
-#define vmlsq_lane_f32(a, b, c, d)                                      \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t c_ = (c);                                            \
-       float32x4_t b_ = (b);                                            \
-       float32x4_t a_ = (a);                                            \
-       float32x4_t result;                                              \
-       float32x4_t t1;                                                  \
-       __asm__ ("fmul %1.4s, %3.4s, %4.s[%5]; fsub %0.4s, %0.4s, %1.4s" \
-                : "=w"(result), "=w"(t1)                                \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmlsq_lane_s16(a, b, c, d)                                      \
-  __extension__                                                         \
-    ({                                                                  \
-       int16x8_t c_ = (c);                                              \
-       int16x8_t b_ = (b);                                              \
-       int16x8_t a_ = (a);                                              \
-       int16x8_t result;                                                \
-       __asm__ ("mls %0.8h,%2.8h,%3.h[%4]"                              \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmlsq_lane_s32(a, b, c, d)                                      \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x4_t c_ = (c);                                              \
-       int32x4_t b_ = (b);                                              \
-       int32x4_t a_ = (a);                                              \
-       int32x4_t result;                                                \
-       __asm__ ("mls %0.4s,%2.4s,%3.s[%4]"                              \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmlsq_lane_u16(a, b, c, d)                                      \
-  __extension__                                                         \
-    ({                                                                  \
-       uint16x8_t c_ = (c);                                             \
-       uint16x8_t b_ = (b);                                             \
-       uint16x8_t a_ = (a);                                             \
-       uint16x8_t result;                                               \
-       __asm__ ("mls %0.8h,%2.8h,%3.h[%4]"                              \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "x"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmlsq_lane_u32(a, b, c, d)                                      \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x4_t c_ = (c);                                             \
-       uint32x4_t b_ = (b);                                             \
-       uint32x4_t a_ = (a);                                             \
-       uint32x4_t result;                                               \
-       __asm__ ("mls %0.4s,%2.4s,%3.s[%4]"                              \
-                : "=w"(result)                                          \
-                : "0"(a_), "w"(b_), "w"(c_), "i"(d)                     \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vmlsq_laneq_f32(__a, __b, __c, __d)                            \
-  __extension__                                                                \
-    ({                                                                 \
-       float32x4_t __c_ = (__c);                                       \
-       float32x4_t __b_ = (__b);                                       \
-       float32x4_t __a_ = (__a);                                       \
-       float32x4_t __result;                                           \
-       float32x4_t __t1;                                               \
-       __asm__ ("fmul %1.4s, %3.4s, %4.s[%5]; fsub %0.4s, %0.4s, %1.4s"        \
-                : "=w"(__result), "=w"(__t1)                           \
-                : "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d)            \
-                : /* No clobbers */);                                  \
-       __result;                                                       \
-     })
-
-#define vmlsq_laneq_s16(__a, __b, __c, __d)                            \
-  __extension__                                                                \
-    ({                                                                 \
-       int16x8_t __c_ = (__c);                                         \
-       int16x8_t __b_ = (__b);                                         \
-       int16x8_t __a_ = (__a);                                         \
-       int16x8_t __result;                                             \
-       __asm__ ("mls %0.8h, %2.8h, %3.h[%4]"                           \
-                : "=w"(__result)                                       \
-                : "0"(__a_), "w"(__b_), "x"(__c_), "i"(__d)            \
-                : /* No clobbers */);                                  \
-       __result;                                                       \
-     })
-
-#define vmlsq_laneq_s32(__a, __b, __c, __d)                            \
-  __extension__                                                                \
-    ({                                                                 \
-       int32x4_t __c_ = (__c);                                         \
-       int32x4_t __b_ = (__b);                                         \
-       int32x4_t __a_ = (__a);                                         \
-       int32x4_t __result;                                             \
-       __asm__ ("mls %0.4s, %2.4s, %3.s[%4]"                           \
-                : "=w"(__result)                                       \
-                : "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d)            \
-                : /* No clobbers */);                                  \
-       __result;                                                       \
-     })
-
-#define vmlsq_laneq_u16(__a, __b, __c, __d)                            \
-  __extension__                                                                \
-    ({                                                                 \
-       uint16x8_t __c_ = (__c);                                                \
-       uint16x8_t __b_ = (__b);                                                \
-       uint16x8_t __a_ = (__a);                                                \
-       uint16x8_t __result;                                            \
-       __asm__ ("mls %0.8h, %2.8h, %3.h[%4]"                           \
-                : "=w"(__result)                                       \
-                : "0"(__a_), "w"(__b_), "x"(__c_), "i"(__d)            \
-                : /* No clobbers */);                                  \
-       __result;                                                       \
-     })
-
-#define vmlsq_laneq_u32(__a, __b, __c, __d)                            \
-  __extension__                                                                \
-    ({                                                                 \
-       uint32x4_t __c_ = (__c);                                                \
-       uint32x4_t __b_ = (__b);                                                \
-       uint32x4_t __a_ = (__a);                                                \
-       uint32x4_t __result;                                            \
-       __asm__ ("mls %0.4s, %2.4s, %3.s[%4]"                           \
-                : "=w"(__result)                                       \
-                : "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d)            \
-                : /* No clobbers */);                                  \
-       __result;                                                       \
-     })
-
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vmlsq_n_f32 (float32x4_t a, float32x4_t b, float32_t c)
 {
@@ -19488,130 +18927,334 @@ vduph_lane_p16 (poly16x4_t __a, const int __b)
   return __aarch64_vget_lane_p16 (__a, __b);
 }
 
-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-vduph_lane_s16 (int16x4_t __a, const int __b)
+__extension__ static __inline int16_t __attribute__ ((__always_inline__))
+vduph_lane_s16 (int16x4_t __a, const int __b)
+{
+  return __aarch64_vget_lane_s16 (__a, __b);
+}
+
+__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
+vduph_lane_u16 (uint16x4_t __a, const int __b)
+{
+  return __aarch64_vget_lane_u16 (__a, __b);
+}
+
+/* vdups_lane  */
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vdups_lane_f32 (float32x2_t __a, const int __b)
+{
+  return __aarch64_vget_lane_f32 (__a, __b);
+}
+
+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+vdups_lane_s32 (int32x2_t __a, const int __b)
+{
+  return __aarch64_vget_lane_s32 (__a, __b);
+}
+
+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+vdups_lane_u32 (uint32x2_t __a, const int __b)
+{
+  return __aarch64_vget_lane_u32 (__a, __b);
+}
+
+/* vdupd_lane  */
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vdupd_lane_f64 (float64x1_t __a, const int __attribute__ ((unused)) __b)
+{
+  return __a;
+}
+
+__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+vdupd_lane_s64 (int64x1_t __a, const int __attribute__ ((unused)) __b)
+{
+  return __a;
+}
+
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+vdupd_lane_u64 (uint64x1_t __a, const int __attribute__ ((unused)) __b)
+{
+  return __a;
+}
+
+/* vdupb_laneq  */
+__extension__ static __inline poly8_t __attribute__ ((__always_inline__))
+vdupb_laneq_p8 (poly8x16_t __a, const int __b)
+{
+  return __aarch64_vgetq_lane_p8 (__a, __b);
+}
+
+__extension__ static __inline int8_t __attribute__ ((__always_inline__))
+vdupb_laneq_s8 (int8x16_t __a, const int __b)
+{
+  return __aarch64_vgetq_lane_s8 (__a, __b);
+}
+
+__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
+vdupb_laneq_u8 (uint8x16_t __a, const int __b)
+{
+  return __aarch64_vgetq_lane_u8 (__a, __b);
+}
+
+/* vduph_laneq  */
+__extension__ static __inline poly16_t __attribute__ ((__always_inline__))
+vduph_laneq_p16 (poly16x8_t __a, const int __b)
+{
+  return __aarch64_vgetq_lane_p16 (__a, __b);
+}
+
+__extension__ static __inline int16_t __attribute__ ((__always_inline__))
+vduph_laneq_s16 (int16x8_t __a, const int __b)
+{
+  return __aarch64_vgetq_lane_s16 (__a, __b);
+}
+
+__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
+vduph_laneq_u16 (uint16x8_t __a, const int __b)
+{
+  return __aarch64_vgetq_lane_u16 (__a, __b);
+}
+
+/* vdups_laneq  */
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vdups_laneq_f32 (float32x4_t __a, const int __b)
+{
+  return __aarch64_vgetq_lane_f32 (__a, __b);
+}
+
+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+vdups_laneq_s32 (int32x4_t __a, const int __b)
+{
+  return __aarch64_vgetq_lane_s32 (__a, __b);
+}
+
+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+vdups_laneq_u32 (uint32x4_t __a, const int __b)
+{
+  return __aarch64_vgetq_lane_u32 (__a, __b);
+}
+
+/* vdupd_laneq  */
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vdupd_laneq_f64 (float64x2_t __a, const int __b)
+{
+  return __aarch64_vgetq_lane_f64 (__a, __b);
+}
+
+__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+vdupd_laneq_s64 (int64x2_t __a, const int __b)
+{
+  return __aarch64_vgetq_lane_s64 (__a, __b);
+}
+
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+vdupd_laneq_u64 (uint64x2_t __a, const int __b)
+{
+  return __aarch64_vgetq_lane_u64 (__a, __b);
+}
+
+/* vfma_lane  */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vfma_lane_f32 (float32x2_t __a, float32x2_t __b,
+              float32x2_t __c, const int __lane)
+{
+  return __builtin_aarch64_fmav2sf (__b,
+                                   __aarch64_vdup_lane_f32 (__c, __lane),
+                                   __a);
+}
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vfma_lane_f64 (float64_t __a, float64_t __b,
+              float64_t __c, const int __lane)
+{
+  return __builtin_fma (__b, __c, __a);
+}
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vfmad_lane_f64 (float64_t __a, float64_t __b,
+               float64_t __c, const int __lane)
+{
+  return __builtin_fma (__b, __c, __a);
+}
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vfmas_lane_f32 (float32_t __a, float32_t __b,
+               float32x2_t __c, const int __lane)
+{
+  return __builtin_fmaf (__b, __aarch64_vget_lane_f32 (__c, __lane), __a);
+}
+
+/* vfma_laneq  */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vfma_laneq_f32 (float32x2_t __a, float32x2_t __b,
+               float32x4_t __c, const int __lane)
+{
+  return __builtin_aarch64_fmav2sf (__b,
+                                   __aarch64_vdup_laneq_f32 (__c, __lane),
+                                   __a);
+}
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vfma_laneq_f64 (float64_t __a, float64_t __b,
+               float64x2_t __c, const int __lane)
 {
-  return __aarch64_vget_lane_s16 (__a, __b);
+  return __builtin_fma (__b, __aarch64_vgetq_lane_f64 (__c, __lane), __a);
 }
 
-__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-vduph_lane_u16 (uint16x4_t __a, const int __b)
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vfmad_laneq_f64 (float64_t __a, float64_t __b,
+                float64x2_t __c, const int __lane)
 {
-  return __aarch64_vget_lane_u16 (__a, __b);
+  return __builtin_fma (__b, __aarch64_vgetq_lane_f64 (__c, __lane), __a);
 }
 
-/* vdups_lane  */
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vdups_lane_f32 (float32x2_t __a, const int __b)
+vfmas_laneq_f32 (float32_t __a, float32_t __b,
+                float32x4_t __c, const int __lane)
 {
-  return __aarch64_vget_lane_f32 (__a, __b);
+  return __builtin_fmaf (__b, __aarch64_vgetq_lane_f32 (__c, __lane), __a);
 }
 
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vdups_lane_s32 (int32x2_t __a, const int __b)
+/* vfmaq_lane  */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vfmaq_lane_f32 (float32x4_t __a, float32x4_t __b,
+               float32x2_t __c, const int __lane)
 {
-  return __aarch64_vget_lane_s32 (__a, __b);
+  return __builtin_aarch64_fmav4sf (__b,
+                                   __aarch64_vdupq_lane_f32 (__c, __lane),
+                                   __a);
 }
 
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vdups_lane_u32 (uint32x2_t __a, const int __b)
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vfmaq_lane_f64 (float64x2_t __a, float64x2_t __b,
+               float64_t __c, const int __lane)
 {
-  return __aarch64_vget_lane_u32 (__a, __b);
+  return __builtin_aarch64_fmav2df (__b, vdupq_n_f64 (__c), __a);
 }
 
-/* vdupd_lane  */
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vdupd_lane_f64 (float64x1_t __a, const int __attribute__ ((unused)) __b)
+/* vfmaq_laneq  */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vfmaq_laneq_f32 (float32x4_t __a, float32x4_t __b,
+                float32x4_t __c, const int __lane)
 {
-  return __a;
+  return __builtin_aarch64_fmav4sf (__b,
+                                   __aarch64_vdupq_laneq_f32 (__c, __lane),
+                                   __a);
 }
 
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vdupd_lane_s64 (int64x1_t __a, const int __attribute__ ((unused)) __b)
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vfmaq_laneq_f64 (float64x2_t __a, float64x2_t __b,
+                float64x2_t __c, const int __lane)
 {
-  return __a;
+  return __builtin_aarch64_fmav2df (__b,
+                                   __aarch64_vdupq_laneq_f64 (__c, __lane),
+                                   __a);
 }
 
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vdupd_lane_u64 (uint64x1_t __a, const int __attribute__ ((unused)) __b)
+/* vfms_lane  */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vfms_lane_f32 (float32x2_t __a, float32x2_t __b,
+              float32x2_t __c, const int __lane)
 {
-  return __a;
+  return __builtin_aarch64_fmav2sf (-__b,
+                                   __aarch64_vdup_lane_f32 (__c, __lane),
+                                   __a);
 }
 
-/* vdupb_laneq  */
-__extension__ static __inline poly8_t __attribute__ ((__always_inline__))
-vdupb_laneq_p8 (poly8x16_t __a, const int __b)
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vfms_lane_f64 (float64_t __a, float64_t __b,
+              float64_t __c, const int __lane)
 {
-  return __aarch64_vgetq_lane_p8 (__a, __b);
+  return __builtin_fma (-__b, __c, __a);
 }
 
-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
-vdupb_laneq_s8 (int8x16_t __a, const int __attribute__ ((unused)) __b)
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vfmsd_lane_f64 (float64_t __a, float64_t __b,
+               float64_t __c, const int __lane)
 {
-  return __aarch64_vgetq_lane_s8 (__a, __b);
+  return __builtin_fma (-__b, __c, __a);
 }
 
-__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
-vdupb_laneq_u8 (uint8x16_t __a, const int __b)
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vfmss_lane_f32 (float32_t __a, float32_t __b,
+               float32x2_t __c, const int __lane)
 {
-  return __aarch64_vgetq_lane_u8 (__a, __b);
+  return __builtin_fmaf (-__b, __aarch64_vget_lane_f32 (__c, __lane), __a);
 }
 
-/* vduph_laneq  */
-__extension__ static __inline poly16_t __attribute__ ((__always_inline__))
-vduph_laneq_p16 (poly16x8_t __a, const int __b)
+/* vfms_laneq  */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vfms_laneq_f32 (float32x2_t __a, float32x2_t __b,
+               float32x4_t __c, const int __lane)
 {
-  return __aarch64_vgetq_lane_p16 (__a, __b);
+  return __builtin_aarch64_fmav2sf (-__b,
+                                   __aarch64_vdup_laneq_f32 (__c, __lane),
+                                   __a);
 }
 
-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-vduph_laneq_s16 (int16x8_t __a, const int __b)
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vfms_laneq_f64 (float64_t __a, float64_t __b,
+               float64x2_t __c, const int __lane)
 {
-  return __aarch64_vgetq_lane_s16 (__a, __b);
+  return __builtin_fma (-__b, __aarch64_vgetq_lane_f64 (__c, __lane), __a);
 }
 
-__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-vduph_laneq_u16 (uint16x8_t __a, const int __b)
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vfmsd_laneq_f64 (float64_t __a, float64_t __b,
+                float64x2_t __c, const int __lane)
 {
-  return __aarch64_vgetq_lane_u16 (__a, __b);
+  return __builtin_fma (-__b, __aarch64_vgetq_lane_f64 (__c, __lane), __a);
 }
 
-/* vdups_laneq  */
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vdups_laneq_f32 (float32x4_t __a, const int __b)
+vfmss_laneq_f32 (float32_t __a, float32_t __b,
+                float32x4_t __c, const int __lane)
 {
-  return __aarch64_vgetq_lane_f32 (__a, __b);
+  return __builtin_fmaf (-__b, __aarch64_vgetq_lane_f32 (__c, __lane), __a);
 }
 
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vdups_laneq_s32 (int32x4_t __a, const int __b)
-{
-  return __aarch64_vgetq_lane_s32 (__a, __b);
-}
+/* vfmsq_lane  */
 
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vdups_laneq_u32 (uint32x4_t __a, const int __b)
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vfmsq_lane_f32 (float32x4_t __a, float32x4_t __b,
+               float32x2_t __c, const int __lane)
 {
-  return __aarch64_vgetq_lane_u32 (__a, __b);
+  return __builtin_aarch64_fmav4sf (-__b,
+                                   __aarch64_vdupq_lane_f32 (__c, __lane),
+                                   __a);
 }
 
-/* vdupd_laneq  */
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vdupd_laneq_f64 (float64x2_t __a, const int __b)
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vfmsq_lane_f64 (float64x2_t __a, float64x2_t __b,
+               float64_t __c, const int __lane)
 {
-  return __aarch64_vgetq_lane_f64 (__a, __b);
+  return __builtin_aarch64_fmav2df (-__b, vdupq_n_f64 (__c), __a);
 }
 
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vdupd_laneq_s64 (int64x2_t __a, const int __b)
+/* vfmsq_laneq  */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vfmsq_laneq_f32 (float32x4_t __a, float32x4_t __b,
+                float32x4_t __c, const int __lane)
 {
-  return __aarch64_vgetq_lane_s64 (__a, __b);
+  return __builtin_aarch64_fmav4sf (-__b,
+                                   __aarch64_vdupq_laneq_f32 (__c, __lane),
+                                   __a);
 }
 
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vdupd_laneq_u64 (uint64x2_t __a, const int __b)
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vfmsq_laneq_f64 (float64x2_t __a, float64x2_t __b,
+                float64x2_t __c, const int __lane)
 {
-  return __aarch64_vgetq_lane_u64 (__a, __b);
+  return __builtin_aarch64_fmav2df (-__b,
+                                   __aarch64_vdupq_laneq_f64 (__c, __lane),
+                                   __a);
 }
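The vfms family reuses the same fma builtins with the first multiplicand negated, keeping the operation fused rather than splitting it into a separate multiply and subtract. A one-line sketch of the identity (illustrative):

#include <math.h>

/* a - b * c with a single rounding: fms(a, b, c) == fma(-b, c, a).  */
static double
fms_ref (double a, double b, double c)
{
  return fma (-b, c, a);
}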
 
 /* vld1 */
@@ -21131,6 +20774,156 @@ vmlaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
   return a + b * c;
 }
 
+/* vmla_lane  */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmla_lane_f32 (float32x2_t __a, float32x2_t __b,
+              float32x2_t __c, const int __lane)
+{
+  return (__a + (__b * __aarch64_vget_lane_f32 (__c, __lane)));
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmla_lane_s16 (int16x4_t __a, int16x4_t __b,
+               int16x4_t __c, const int __lane)
+{
+  return (__a + (__b * __aarch64_vget_lane_s16 (__c, __lane)));
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmla_lane_s32 (int32x2_t __a, int32x2_t __b,
+               int32x2_t __c, const int __lane)
+{
+  return (__a + (__b * __aarch64_vget_lane_s32 (__c, __lane)));
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmla_lane_u16 (uint16x4_t __a, uint16x4_t __b,
+               uint16x4_t __c, const int __lane)
+{
+  return (__a + (__b * __aarch64_vget_lane_u16 (__c, __lane)));
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmla_lane_u32 (uint32x2_t __a, uint32x2_t __b,
+              uint32x2_t __c, const int __lane)
+{
+  return (__a + (__b * __aarch64_vget_lane_u32 (__c, __lane)));
+}
+
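Unlike the vfma intrinsics, these lower to plain C arithmetic, which the new aarch64_mla_elt<mode> patterns can then match as a multiply-accumulate. A scalar reference model of vmla_lane_s16 (the helper name is illustrative):

#include <stdint.h>

/* res[i] = a[i] + b[i] * c[lane] for each of the four elements.  */
static void
mla_lane_ref (int16_t res[4], const int16_t a[4], const int16_t b[4],
              const int16_t c[4], int lane)
{
  for (int i = 0; i < 4; i++)
    res[i] = a[i] + b[i] * c[lane];
}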
+/* vmla_laneq  */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmla_laneq_f32 (float32x2_t __a, float32x2_t __b,
+               float32x4_t __c, const int __lane)
+{
+  return (__a + (__b * __aarch64_vgetq_lane_f32 (__c, __lane)));
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmla_laneq_s16 (int16x4_t __a, int16x4_t __b,
+               int16x8_t __c, const int __lane)
+{
+  return (__a + (__b * __aarch64_vgetq_lane_s16 (__c, __lane)));
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmla_laneq_s32 (int32x2_t __a, int32x2_t __b,
+               int32x4_t __c, const int __lane)
+{
+  return (__a + (__b * __aarch64_vgetq_lane_s32 (__c, __lane)));
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmla_laneq_u16 (uint16x4_t __a, uint16x4_t __b,
+               uint16x8_t __c, const int __lane)
+{
+  return (__a + (__b * __aarch64_vgetq_lane_u16 (__c, __lane)));
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmla_laneq_u32 (uint32x2_t __a, uint32x2_t __b,
+               uint32x4_t __c, const int __lane)
+{
+  return (__a + (__b * __aarch64_vgetq_lane_u32 (__c, __lane)));
+}
+
+/* vmlaq_lane  */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmlaq_lane_f32 (float32x4_t __a, float32x4_t __b,
+               float32x2_t __c, const int __lane)
+{
+  return (__a + (__b * __aarch64_vget_lane_f32 (__c, __lane)));
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmlaq_lane_s16 (int16x8_t __a, int16x8_t __b,
+               int16x4_t __c, const int __lane)
+{
+  return (__a + (__b * __aarch64_vget_lane_s16 (__c, __lane)));
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlaq_lane_s32 (int32x4_t __a, int32x4_t __b,
+               int32x2_t __c, const int __lane)
+{
+  return (__a + (__b * __aarch64_vget_lane_s32 (__c, __lane)));
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmlaq_lane_u16 (uint16x8_t __a, uint16x8_t __b,
+               uint16x4_t __c, const int __lane)
+{
+  return (__a + (__b * __aarch64_vget_lane_u16 (__c, __lane)));
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlaq_lane_u32 (uint32x4_t __a, uint32x4_t __b,
+               uint32x2_t __c, const int __lane)
+{
+  return (__a + (__b * __aarch64_vget_lane_u32 (__c, __lane)));
+}
+
+/* vmlaq_laneq  */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmlaq_laneq_f32 (float32x4_t __a, float32x4_t __b,
+                float32x4_t __c, const int __lane)
+{
+  return (__a + (__b * __aarch64_vgetq_lane_f32 (__c, __lane)));
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmlaq_laneq_s16 (int16x8_t __a, int16x8_t __b,
+               int16x8_t __c, const int __lane)
+{
+  return (__a + (__b * __aarch64_vgetq_lane_s16 (__c, __lane)));
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlaq_laneq_s32 (int32x4_t __a, int32x4_t __b,
+               int32x4_t __c, const int __lane)
+{
+  return (__a + (__b * __aarch64_vgetq_lane_s32 (__c, __lane)));
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmlaq_laneq_u16 (uint16x8_t __a, uint16x8_t __b,
+               uint16x8_t __c, const int __lane)
+{
+  return (__a + (__b * __aarch64_vgetq_lane_u16 (__c, __lane)));
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlaq_laneq_u32 (uint32x4_t __a, uint32x4_t __b,
+               uint32x4_t __c, const int __lane)
+{
+  return (__a + (__b * __aarch64_vgetq_lane_u32 (__c, __lane)));
+}
+
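The naming follows the usual Advanced SIMD convention: a `q' before `_lane' means a 128-bit accumulator and result, while `laneq' means the lane comes from a 128-bit vector. The four float32 variants side by side (illustrative calls; lane indices must be in range for the source vector):

#include <arm_neon.h>

void
mla_variants (float32x2_t d, float32x4_t q)
{
  float32x2_t r0 = vmla_lane_f32 (d, d, d, 1);   /* 64-bit acc, lane from 64-bit   */
  float32x2_t r1 = vmla_laneq_f32 (d, d, q, 3);  /* 64-bit acc, lane from 128-bit  */
  float32x4_t r2 = vmlaq_lane_f32 (q, q, d, 1);  /* 128-bit acc, lane from 64-bit  */
  float32x4_t r3 = vmlaq_laneq_f32 (q, q, q, 3); /* 128-bit acc, lane from 128-bit */
  (void) r0; (void) r1; (void) r2; (void) r3;
}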
+/* vmls  */
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vmls_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
 {
@@ -21149,6 +20942,153 @@ vmlsq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
   return a - b * c;
 }
 
+/* vmls_lane  */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmls_lane_f32 (float32x2_t __a, float32x2_t __b,
+              float32x2_t __c, const int __lane)
+{
+  return (__a - (__b * __aarch64_vget_lane_f32 (__c, __lane)));
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmls_lane_s16 (int16x4_t __a, int16x4_t __b,
+               int16x4_t __c, const int __lane)
+{
+  return (__a - (__b * __aarch64_vget_lane_s16 (__c, __lane)));
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmls_lane_s32 (int32x2_t __a, int32x2_t __b,
+               int32x2_t __c, const int __lane)
+{
+  return (__a - (__b * __aarch64_vget_lane_s32 (__c, __lane)));
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmls_lane_u16 (uint16x4_t __a, uint16x4_t __b,
+               uint16x4_t __c, const int __lane)
+{
+  return (__a - (__b * __aarch64_vget_lane_u16 (__c, __lane)));
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmls_lane_u32 (uint32x2_t __a, uint32x2_t __b,
+              uint32x2_t __c, const int __lane)
+{
+  return (__a - (__b * __aarch64_vget_lane_u32 (__c, __lane)));
+}
+
+/* vmls_laneq  */
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmls_laneq_f32 (float32x2_t __a, float32x2_t __b,
+              float32x4_t __c, const int __lane)
+{
+  return (__a - (__b * __aarch64_vgetq_lane_f32 (__c, __lane)));
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmls_laneq_s16 (int16x4_t __a, int16x4_t __b,
+               int16x8_t __c, const int __lane)
+{
+  return (__a - (__b * __aarch64_vgetq_lane_s16 (__c, __lane)));
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmls_laneq_s32 (int32x2_t __a, int32x2_t __b,
+               int32x4_t __c, const int __lane)
+{
+  return (__a - (__b * __aarch64_vgetq_lane_s32 (__c, __lane)));
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmls_laneq_u16 (uint16x4_t __a, uint16x4_t __b,
+               uint16x8_t __c, const int __lane)
+{
+  return (__a - (__b * __aarch64_vgetq_lane_u16 (__c, __lane)));
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmls_laneq_u32 (uint32x2_t __a, uint32x2_t __b,
+               uint32x4_t __c, const int __lane)
+{
+  return (__a - (__b * __aarch64_vgetq_lane_u32 (__c, __lane)));
+}
+
+/* vmlsq_lane  */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmlsq_lane_f32 (float32x4_t __a, float32x4_t __b,
+               float32x2_t __c, const int __lane)
+{
+  return (__a - (__b * __aarch64_vget_lane_f32 (__c, __lane)));
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmlsq_lane_s16 (int16x8_t __a, int16x8_t __b,
+               int16x4_t __c, const int __lane)
+{
+  return (__a - (__b * __aarch64_vget_lane_s16 (__c, __lane)));
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlsq_lane_s32 (int32x4_t __a, int32x4_t __b,
+               int32x2_t __c, const int __lane)
+{
+  return (__a - (__b * __aarch64_vget_lane_s32 (__c, __lane)));
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmlsq_lane_u16 (uint16x8_t __a, uint16x8_t __b,
+               uint16x4_t __c, const int __lane)
+{
+  return (__a - (__b * __aarch64_vget_lane_u16 (__c, __lane)));
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlsq_lane_u32 (uint32x4_t __a, uint32x4_t __b,
+               uint32x2_t __c, const int __lane)
+{
+  return (__a - (__b * __aarch64_vget_lane_u32 (__c, __lane)));
+}
+
+/* vmlsq_laneq  */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmlsq_laneq_f32 (float32x4_t __a, float32x4_t __b,
+               float32x4_t __c, const int __lane)
+{
+  return (__a - (__b * __aarch64_vgetq_lane_f32 (__c, __lane)));
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmlsq_laneq_s16 (int16x8_t __a, int16x8_t __b,
+               int16x8_t __c, const int __lane)
+{
+  return (__a - (__b * __aarch64_vgetq_lane_s16 (__c, __lane)));
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlsq_laneq_s32 (int32x4_t __a, int32x4_t __b,
+               int32x4_t __c, const int __lane)
+{
+  return (__a - (__b * __aarch64_vgetq_lane_s32 (__c, __lane)));
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmlsq_laneq_u16 (uint16x8_t __a, uint16x8_t __b,
+               uint16x8_t __c, const int __lane)
+{
+  return (__a - (__b * __aarch64_vgetq_lane_u16 (__c, __lane)));
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlsq_laneq_u32 (uint32x4_t __a, uint32x4_t __b,
+               uint32x4_t __c, const int __lane)
+{
+  return (__a - (__b * __aarch64_vgetq_lane_u32 (__c, __lane)));
+}
+
 /* vmul_lane  */
 
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
index a6b3117c8a278a205a6e9e6ca1aaf56a227c3837..ec8d813fa3f53ad822d94ccf42ac0619380d7e3b 100644 (file)
@@ -89,6 +89,9 @@
 ;; Vector Float modes.
 (define_mode_iterator VDQF [V2SF V4SF V2DF])
 
+;; Vector single Float modes.
+(define_mode_iterator VDQSF [V2SF V4SF])
+
 ;; Modes suitable to use as the return type of a vcond expression.
 (define_mode_iterator VDQF_COND [V2SF V2SI V4SF V4SI V2DF V2DI])
 
index 17ae8ee15503d5eaa4b3e1c0e3b892df065f15df..4f9b8a79191454b72cedff83ccaf328f6a855cce 100644 (file)
@@ -1,3 +1,10 @@
+2013-09-16  James Greenhalgh  <james.greenhalgh@arm.com>
+
+       * gcc.target/aarch64/fmla_intrinsic_1.c: New.
+       * gcc.target/aarch64/mla_intrinsic_1.c: Likewise.
+       * gcc.target/aarch64/fmls_intrinsic_1.c: Likewise.
+       * gcc.target/aarch64/mls_intrinsic_1.c: Likewise.
+
 2013-09-16  James Greenhalgh  <james.greenhalgh@arm.com>
 
        * gcc.target/aarch64/mul_intrinsic_1.c: New.
diff --git a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c
new file mode 100644 (file)
index 0000000..0bf1b86
--- /dev/null
@@ -0,0 +1,116 @@
+/* { dg-do run } */
+/* { dg-options "-O3 --save-temps" } */
+
+#include <arm_neon.h>
+
+#define DELTA 0.0001
+
+extern double fabs (double);
+
+extern void abort (void);
+
+#define TEST_VMLA(q1, q2, size, in1_lanes, in2_lanes)                  \
+static void                                                            \
+test_vfma##q1##_lane##q2##_f##size (float##size##_t * res,             \
+                                  const float##size##_t *in1,          \
+                                  const float##size##_t *in2)          \
+{                                                                      \
+  float##size##x##in1_lanes##_t a = vld1##q1##_f##size (res);          \
+  float##size##x##in1_lanes##_t b = vld1##q1##_f##size (in1);          \
+  float##size##x##in2_lanes##_t c;                                     \
+  if (in2_lanes > 1)                                                   \
+    {                                                                  \
+      c = vld1##q2##_f##size (in2);                                    \
+      a = vfma##q1##_lane##q2##_f##size (a, b, c, 1);                  \
+    }                                                                  \
+  else                                                                 \
+    {                                                                  \
+      c = vld1##q2##_f##size (in2 + 1);                                        \
+      a = vfma##q1##_lane##q2##_f##size (a, b, c, 0);                  \
+    }                                                                  \
+  vst1##q1##_f##size (res, a);                                         \
+}
+
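Each TEST_VMLA instantiation token-pastes a concrete test; for example TEST_VMLA (q, , 32, 4, 2) expands to roughly the following (the in2_lanes comparison is a compile-time constant, so the untaken branch folds away):

static void
test_vfmaq_lane_f32 (float32_t *res, const float32_t *in1,
                     const float32_t *in2)
{
  float32x4_t a = vld1q_f32 (res);
  float32x4_t b = vld1q_f32 (in1);
  float32x2_t c = vld1_f32 (in2);
  a = vfmaq_lane_f32 (a, b, c, 1);
  vst1q_f32 (res, a);
}

Since the selected lane of the pool is always 1.0 and res starts zeroed, the expected result in the driver below is res[i] == pool[i].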
+#define BUILD_VARS(width, n_lanes, n_half_lanes)               \
+TEST_VMLA ( ,  , width, n_half_lanes, n_half_lanes)            \
+TEST_VMLA (q,  , width, n_lanes, n_half_lanes)                 \
+TEST_VMLA ( , q, width, n_half_lanes, n_lanes)                 \
+TEST_VMLA (q, q, width, n_lanes, n_lanes)                      \
+
+BUILD_VARS (32, 4, 2)
+BUILD_VARS (64, 2, 1)
+
+#define POOL2 {0.0, 1.0}
+#define POOL4 {0.0, 1.0, 2.0, 3.0}
+#define EMPTY2 {0.0, 0.0}
+#define EMPTY4 {0.0, 0.0, 0.0, 0.0}
+
+#define BUILD_TEST(size, lanes)                                        \
+static void                                                    \
+test_f##size (void)                                            \
+{                                                              \
+  int i;                                                       \
+  float##size##_t pool[lanes] = POOL##lanes;                   \
+  float##size##_t res[lanes] = EMPTY##lanes;                   \
+  float##size##_t res2[lanes] = EMPTY##lanes;                  \
+  float##size##_t res3[lanes] = EMPTY##lanes;                  \
+  float##size##_t res4[lanes] = EMPTY##lanes;                  \
+                                                               \
+  /* Forcefully avoid optimization.  */                                \
+  asm volatile ("" : : : "memory");                            \
+  test_vfma_lane_f##size (res, pool, pool);                    \
+  for (i = 0; i < lanes / 2; i++)                              \
+    if (fabs (res[i] - pool[i]) > DELTA)                       \
+      abort ();                                                        \
+                                                               \
+  /* Forcefully avoid optimization.  */                                \
+  asm volatile ("" : : : "memory");                            \
+  test_vfmaq_lane_f##size (res2, pool, pool);                  \
+  for (i = 0; i < lanes; i++)                                  \
+    if (fabs (res2[i] - pool[i]) > DELTA)                      \
+      abort ();                                                        \
+                                                               \
+  /* Forcefully avoid optimization.  */                                \
+  asm volatile ("" : : : "memory");                            \
+  test_vfma_laneq_f##size (res3, pool, pool);                  \
+  for (i = 0; i < lanes / 2; i++)                              \
+    if (fabs (res3[i] - pool[i]) > DELTA)                      \
+      abort ();                                                        \
+                                                               \
+  /* Forcefully avoid optimization.  */                                \
+  asm volatile ("" : : : "memory");                            \
+  test_vfmaq_laneq_f##size (res4, pool, pool);                 \
+  for (i = 0; i < lanes; i++)                                  \
+    if (fabs (res4[i] - pool[i]) > DELTA)                      \
+      abort ();                                                        \
+}
+
+BUILD_TEST (32, 4)
+BUILD_TEST (64, 2)
+
+int
+main (int argc, char **argv)
+{
+  test_f32 ();
+  test_f64 ();
+  return 0;
+}
+
+/* vfma_laneq_f32.
+   vfma_lane_f32.  */
+/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, v\[0-9\]+\.2s\\\[\[0-9\]+\\\]" 2 } } */
+
+/* vfmaq_lane_f32.
+   vfmaq_laneq_f32.  */
+/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.4s\\\[\[0-9\]+\\\]" 2 } } */
+
+/* vfma_lane_f64.  */
+/* { dg-final { scan-assembler-times "fmadd\\td\[0-9\]+\, d\[0-9\]+\, d\[0-9\]+\, d\[0-9\]+" 1 } } */
+
+/* vfmaq_lane_f64.
+   vfma_laneq_f64.
+   vfmaq_laneq_f64.  */
+/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */
+
+/* { dg-final { cleanup-saved-temps } } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c
new file mode 100644 (file)
index 0000000..8cc2942
--- /dev/null
@@ -0,0 +1,117 @@
+/* { dg-do run } */
+/* { dg-options "-O3 --save-temps" } */
+
+#include <arm_neon.h>
+
+#define DELTA 0.0001
+
+extern double fabs (double);
+
+extern void abort (void);
+
+#define TEST_VMLS(q1, q2, size, in1_lanes, in2_lanes)                  \
+static void                                                            \
+test_vfms##q1##_lane##q2##_f##size (float##size##_t * res,             \
+                                  const float##size##_t *in1,          \
+                                  const float##size##_t *in2)          \
+{                                                                      \
+  float##size##x##in1_lanes##_t a = vld1##q1##_f##size (res);          \
+  float##size##x##in1_lanes##_t b = vld1##q1##_f##size (in1);          \
+  float##size##x##in2_lanes##_t c;                                     \
+  if (in2_lanes > 1)                                                   \
+    {                                                                  \
+      c = vld1##q2##_f##size (in2);                                    \
+      a = vfms##q1##_lane##q2##_f##size (a, b, c, 1);                  \
+    }                                                                  \
+  else                                                                 \
+    {                                                                  \
+      c = vld1##q2##_f##size (in2 + 1);                                        \
+      a = vfms##q1##_lane##q2##_f##size (a, b, c, 0);                  \
+    }                                                                  \
+  vst1##q1##_f##size (res, a);                                         \
+}
+
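The sign check in the driver below relies on the chosen inputs: res starts at 0.0 and the selected multiplier lane is always 1.0 (pool lane 1, or in2[1] in the single-lane f64 case), so vfms leaves res[i] = -pool[i] and `fabs (res[i] + pool[i])' must be approximately zero. A worked instance for one f32 element (illustrative):

#include <math.h>

static int
check_one (float pool_i)
{
  float res_i = fmaf (-pool_i, 1.0f, 0.0f); /* what the fused subtract computes */
  return fabsf (res_i + pool_i) <= 0.0001f; /* the DELTA comparison below */
}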
+#define BUILD_VARS(width, n_lanes, n_half_lanes)               \
+TEST_VMLS ( ,  , width, n_half_lanes, n_half_lanes)            \
+TEST_VMLS (q,  , width, n_lanes, n_half_lanes)                 \
+TEST_VMLS ( , q, width, n_half_lanes, n_lanes)                 \
+TEST_VMLS (q, q, width, n_lanes, n_lanes)                      \
+
+BUILD_VARS (32, 4, 2)
+BUILD_VARS (64, 2, 1)
+
+#define POOL2 {0.0, 1.0}
+#define POOL4 {0.0, 1.0, 2.0, 3.0}
+#define EMPTY2 {0.0, 0.0}
+#define EMPTY4 {0.0, 0.0, 0.0, 0.0}
+
+#define BUILD_TEST(size, lanes)                                        \
+static void                                                    \
+test_f##size (void)                                            \
+{                                                              \
+  int i;                                                       \
+  float##size##_t pool[lanes] = POOL##lanes;                   \
+  float##size##_t res[lanes] = EMPTY##lanes;                   \
+  float##size##_t res2[lanes] = EMPTY##lanes;                  \
+  float##size##_t res3[lanes] = EMPTY##lanes;                  \
+  float##size##_t res4[lanes] = EMPTY##lanes;                  \
+                                                               \
+  /* Forcefully avoid optimization.  */                                \
+  asm volatile ("" : : : "memory");                            \
+  test_vfms_lane_f##size (res, pool, pool);                    \
+  asm volatile ("" : :"Q" (res) : "memory");                   \
+  for (i = 0; i < lanes / 2; i++)                              \
+    if (fabs (res[i] + pool[i]) > DELTA)                       \
+      abort ();                                                        \
+                                                               \
+  /* Forcefully avoid optimization.  */                                \
+  test_vfmsq_lane_f##size (res2, pool, pool);                  \
+  asm volatile ("" : :"Q" (res2) : "memory");                  \
+  for (i = 0; i < lanes; i++)                                  \
+    if (fabs (res2[i] + pool[i]) > DELTA)                      \
+      abort ();                                                        \
+                                                               \
+  /* Forcefully avoid optimization.  */                                \
+  test_vfms_laneq_f##size (res3, pool, pool);                  \
+  asm volatile ("" : :"Q" (res3) : "memory");                  \
+  for (i = 0; i < lanes / 2; i++)                              \
+    if (fabs (res3[i] + pool[i]) > DELTA)                      \
+      abort ();                                                        \
+                                                               \
+  /* Forcefully avoid optimization.  */                                \
+  test_vfmsq_laneq_f##size (res4, pool, pool);                 \
+  asm volatile ("" : :"Q" (res4) : "memory");                  \
+  for (i = 0; i < lanes; i++)                                  \
+    if (fabs (res4[i] + pool[i]) > DELTA)                      \
+      abort ();                                                        \
+}
+
+BUILD_TEST (32, 4)
+BUILD_TEST (64, 2)
+
+int
+main (int argc, char **argv)
+{
+  test_f32 ();
+  test_f64 ();
+  return 0;
+}
+
+/* vfms_laneq_f32.
+   vfms_lane_f32.  */
+/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, v\[0-9\]+\.2s\\\[\[0-9\]+\\\]" 2 } } */
+
+/* vfmsq_lane_f32.
+   vfmsq_laneq_f32.  */
+/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.4s\\\[\[0-9\]+\\\]" 2 } } */
+
+/* vfms_lane_f64.  */
+/* { dg-final { scan-assembler-times "fmsub\\td\[0-9\]+\, d\[0-9\]+\, d\[0-9\]+\, d\[0-9\]+" 1 } } */
+
+/* vfmsq_lane_f64.
+   vfms_laneq_f64.
+   vfmsq_laneq_f64.  */
+/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */
+
+/* { dg-final { cleanup-saved-temps } } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/mla_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/mla_intrinsic_1.c
new file mode 100644 (file)
index 0000000..fce4138
--- /dev/null
@@ -0,0 +1,84 @@
+/* { dg-do run } */
+/* { dg-options "-O3 --save-temps" } */
+
+#include <arm_neon.h>
+
+extern void abort (void);
+
+#define MAPs(size, xx) int##size##xx##_t
+#define MAPu(size, xx) uint##size##xx##_t
+
+
+#define TEST_VMLA(q, su, size, in1_lanes, in2_lanes)           \
+static void                                                    \
+test_vmlaq_lane##q##_##su##size (MAP##su (size, ) * res,       \
+                                const MAP##su(size, ) *in1,    \
+                                const MAP##su(size, ) *in2)    \
+{                                                              \
+  MAP##su (size, x##in1_lanes) a = vld1q_##su##size (res);     \
+  MAP##su (size, x##in1_lanes) b = vld1q_##su##size (in1);     \
+  MAP##su (size, x##in2_lanes) c = vld1##q##_##su##size (in2); \
+  a = vmlaq_lane##q##_##su##size (a, b, c, 1);                 \
+  vst1q_##su##size (res, a);                                   \
+}
+
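As in the floating-point tests, each instantiation expands into a concrete function; TEST_VMLA (, s, 32, 4, 2) becomes roughly:

static void
test_vmlaq_lane_s32 (int32_t *res, const int32_t *in1, const int32_t *in2)
{
  int32x4_t a = vld1q_s32 (res);
  int32x4_t b = vld1q_s32 (in1);
  int32x2_t c = vld1_s32 (in2);
  a = vmlaq_lane_s32 (a, b, c, 1);
  vst1q_s32 (res, a);
}

Lane 1 of the pool is 1, so with res zeroed the multiply-accumulate must reproduce pool element-wise, which is what the driver below checks.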
+#define BUILD_VARS(width, n_lanes, n_half_lanes)               \
+TEST_VMLA (, s, width, n_lanes, n_half_lanes)                  \
+TEST_VMLA (q, s, width, n_lanes, n_lanes)                      \
+TEST_VMLA (, u, width, n_lanes, n_half_lanes)                  \
+TEST_VMLA (q, u, width, n_lanes, n_lanes)                      \
+
+BUILD_VARS (32, 4, 2)
+BUILD_VARS (16, 8, 4)
+
+#define POOL4 {0, 1, 2, 3}
+#define POOL8 {0, 1, 2, 3, 4, 5, 6, 7}
+#define EMPTY4 {0, 0, 0, 0}
+#define EMPTY8 {0, 0, 0, 0, 0, 0, 0, 0}
+
+#define BUILD_TEST(su, size, lanes)                            \
+static void                                                    \
+test_##su##size (void)                                         \
+{                                                              \
+  int i;                                                       \
+  MAP##su (size,) pool[lanes] = POOL##lanes;                   \
+  MAP##su (size,) res[lanes] = EMPTY##lanes;                   \
+  MAP##su (size,) res2[lanes] = EMPTY##lanes;                  \
+                                                               \
+  /* Forcefully avoid optimization.  */                                \
+  asm volatile ("" : : : "memory");                            \
+  test_vmlaq_lane_##su##size (res, pool, pool);                        \
+  for (i = 0; i < lanes; i++)                                  \
+    if (res[i] != pool[i])                                     \
+      abort ();                                                        \
+                                                               \
+  /* Forcefully avoid optimization.  */                                \
+  asm volatile ("" : : : "memory");                            \
+  test_vmlaq_laneq_##su##size (res2, pool, pool);              \
+  for (i = 0; i < lanes; i++)                                  \
+    if (res2[i] != pool[i])                                    \
+      abort ();                                                        \
+}
+
+#undef BUILD_VARS
+#define BUILD_VARS(size, lanes)                                        \
+BUILD_TEST (s, size, lanes)                                    \
+BUILD_TEST (u, size, lanes)
+
+BUILD_VARS (32, 4)
+BUILD_VARS (16, 8)
+
+int
+main (int argc, char **argv)
+{
+  test_s32 ();
+  test_u32 ();
+  test_s16 ();
+  test_u16 ();
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "mla\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.4s\\\[\[0-9\]+\\\]" 4 } } */
+/* { dg-final { scan-assembler-times "mla\\tv\[0-9\]+\.8h, v\[0-9\]+\.8h, v\[0-9\]+\.8h\\\[\[0-9\]+\\\]" 4 } } */
+/* { dg-final { cleanup-saved-temps } } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/mls_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/mls_intrinsic_1.c
new file mode 100644 (file)
index 0000000..8bf95b6
--- /dev/null
@@ -0,0 +1,89 @@
+/* { dg-do run } */
+/* { dg-options "-O3 --save-temps" } */
+
+#include <arm_neon.h>
+
+extern void abort (void);
+
+#define MAPs(size, xx) int##size##xx##_t
+#define MAPu(size, xx) uint##size##xx##_t
+
+
+#define TEST_VMLS(q, su, size, in1_lanes, in2_lanes)           \
+static void                                                    \
+test_vmlsq_lane##q##_##su##size (MAP##su (size, ) * res,       \
+                                const MAP##su(size, ) *in1,    \
+                                const MAP##su(size, ) *in2)    \
+{                                                              \
+  MAP##su (size, x##in1_lanes) a = vld1q_##su##size (res);     \
+  MAP##su (size, x##in1_lanes) b = vld1q_##su##size (in1);     \
+  MAP##su (size, x##in2_lanes) c = vld1##q##_##su##size (in2); \
+  a = vmlsq_lane##q##_##su##size (a, b, c, 1);                 \
+  vst1q_##su##size (res, a);                                   \
+}
+
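The mls test seeds the unsigned result arrays with 2*pool (EMPTY4u/EMPTY8u below) so the unsigned subtraction never wraps, and checks `res[i] - pool[i] != 0'; the signed arrays start at zero and are checked with `res[i] + pool[i] != 0'. A worked instance for one u32 element (illustrative):

#include <stdint.h>

static int
check_u32 (uint32_t pool_i)
{
  uint32_t res_i = 2 * pool_i - pool_i * 1; /* what vmls computes: stays >= 0 */
  return res_i - pool_i == 0;               /* the test's pass condition */
}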
+#define BUILD_VARS(width, n_lanes, n_half_lanes)               \
+TEST_VMLS (, s, width, n_lanes, n_half_lanes)                  \
+TEST_VMLS (q, s, width, n_lanes, n_lanes)                      \
+TEST_VMLS (, u, width, n_lanes, n_half_lanes)                  \
+TEST_VMLS (q, u, width, n_lanes, n_lanes)                      \
+
+BUILD_VARS (32, 4, 2)
+BUILD_VARS (16, 8, 4)
+
+#define MAP_OPs +
+#define MAP_OPu -
+
+#define POOL4 {0, 1, 2, 3}
+#define POOL8 {0, 1, 2, 3, 4, 5, 6, 7}
+#define EMPTY4s {0, 0, 0, 0}
+#define EMPTY8s {0, 0, 0, 0, 0, 0, 0, 0}
+#define EMPTY4u {0, 2, 4, 6}
+#define EMPTY8u {0, 2, 4, 6, 8, 10, 12, 14}
+
+#define BUILD_TEST(su, size, lanes)                            \
+static void                                                    \
+test_##su##size (void)                                         \
+{                                                              \
+  int i;                                                       \
+  MAP##su (size,) pool[lanes] = POOL##lanes;                   \
+  MAP##su (size,) res[lanes] = EMPTY##lanes##su;               \
+  MAP##su (size,) res2[lanes] = EMPTY##lanes##su;              \
+                                                               \
+  /* Forcefully avoid optimization.  */                                \
+  asm volatile ("" : : : "memory");                            \
+  test_vmlsq_lane_##su##size (res, pool, pool);                        \
+  for (i = 0; i < lanes; i++)                                  \
+    if (res[i] MAP_OP##su pool[i] != 0)                                \
+      abort ();                                                        \
+                                                               \
+  /* Forcefully avoid optimization.  */                                \
+  asm volatile ("" : : : "memory");                            \
+  test_vmlsq_laneq_##su##size (res2, pool, pool);              \
+  for (i = 0; i < lanes; i++)                                  \
+    if (res2[i] MAP_OP##su pool[i] != 0)                       \
+      abort ();                                                        \
+}
+
+#undef BUILD_VARS
+#define BUILD_VARS(size, lanes)                                        \
+BUILD_TEST (s, size, lanes)                                    \
+BUILD_TEST (u, size, lanes)
+
+BUILD_VARS (32, 4)
+BUILD_VARS (16, 8)
+
+int
+main (int argc, char **argv)
+{
+  test_s32 ();
+  test_u32 ();
+  test_s16 ();
+  test_u16 ();
+  return 0;
+}
+
+/* { dg-final { scan-assembler-times "mls\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.4s\\\[\[0-9\]+\\\]" 4 } } */
+/* { dg-final { scan-assembler-times "mls\\tv\[0-9\]+\.8h, v\[0-9\]+\.8h, v\[0-9\]+\.8h\\\[\[0-9\]+\\\]" 4 } } */
+/* { dg-final { cleanup-saved-temps } } */
+