From 828e70c1d7bb5c849a2df44aa832793c71833058 Mon Sep 17 00:00:00 2001 From: James Greenhalgh Date: Mon, 16 Sep 2013 09:53:11 +0000 Subject: [PATCH] [AArch64] Improve arm_neon.h vml_lane handling. gcc/ * config/aarch64/aarch64-simd-builtins.def (fma): New. * config/aarch64/aarch64-simd.md (aarch64_mla_elt): New. (aarch64_mla_elt_): Likewise. (aarch64_mls_elt): Likewise. (aarch64_mls_elt_): Likewise. (aarch64_fma4_elt): Likewise. (aarch64_fma4_elt_): Likewise. (aarch64_fma4_elt_to_128v2df): Likewise. (aarch64_fma4_elt_to_64df): Likewise. (fnma4): Likewise. (aarch64_fnma4_elt): Likewise. (aarch64_fnma4_elt_): Likewise. (aarch64_fnma4_elt_to_128v2df): Likewise. (aarch64_fnma4_elt_to_64df): Likewise. * config/aarch64/iterators.md (VDQSF): New. * config/aarch64/arm_neon.h (vfm_lane_f<32, 64>): Convert to C implementation. (vml_lane_<16, 32, 64>): Likewise. gcc/testsuite/ * gcc.target/aarch64/fmla-intrinsic.c: New. * gcc.target/aarch64/mla-intrinsic.c: Likewise. * gcc.target/aarch64/fmls-intrinsic.c: Likewise. * gcc.target/aarch64/mls-intrinsic.c: Likewise. From-SVN: r202625 --- gcc/ChangeLog | 22 + gcc/config/aarch64/aarch64-simd-builtins.def | 3 + gcc/config/aarch64/aarch64-simd.md | 195 +++ gcc/config/aarch64/arm_neon.h | 1250 ++++++++--------- gcc/config/aarch64/iterators.md | 3 + gcc/testsuite/ChangeLog | 7 + .../gcc.target/aarch64/fmla_intrinsic_1.c | 116 ++ .../gcc.target/aarch64/fmls_intrinsic_1.c | 117 ++ .../gcc.target/aarch64/mla_intrinsic_1.c | 84 ++ .../gcc.target/aarch64/mls_intrinsic_1.c | 89 ++ 10 files changed, 1231 insertions(+), 655 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/mla_intrinsic_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/mls_intrinsic_1.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index aaab5ece335..68091bbcbbd 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,25 @@ +2013-09-16 James Greenhalgh + + * config/aarch64/aarch64-simd-builtins.def (fma): New. + * config/aarch64/aarch64-simd.md + (aarch64_mla_elt): New. + (aarch64_mla_elt_): Likewise. + (aarch64_mls_elt): Likewise. + (aarch64_mls_elt_): Likewise. + (aarch64_fma4_elt): Likewise. + (aarch64_fma4_elt_): Likewise. + (aarch64_fma4_elt_to_128v2df): Likewise. + (aarch64_fma4_elt_to_64df): Likewise. + (fnma4): Likewise. + (aarch64_fnma4_elt): Likewise. + (aarch64_fnma4_elt_): Likewise. + (aarch64_fnma4_elt_to_128v2df): Likewise. + (aarch64_fnma4_elt_to_64df): Likewise. + * config/aarch64/iterators.md (VDQSF): New. + * config/aarch64/arm_neon.h + (vfm_lane_f<32, 64>): Convert to C implementation. + (vml_lane_<16, 32, 64>): Likewise. + 2013-09-16 James Greenhalgh * config/aarch64/aarch64-simd.md (aarch64_mul3_elt): New. diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 4046d7a7001..35897f39395 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -359,3 +359,6 @@ /* Implemented by aarch64_st1. */ BUILTIN_VALL (STORE1, st1, 0) + /* Implemented by fma4. 
*/ + BUILTIN_VDQF (TERNOP, fma, 4) + diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 04d5794ffca..f13cd5b7cdb 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -1070,6 +1070,38 @@ (set_attr "simd_mode" "")] ) +(define_insn "*aarch64_mla_elt" + [(set (match_operand:VDQHS 0 "register_operand" "=w") + (plus:VDQHS + (mult:VDQHS + (vec_duplicate:VDQHS + (vec_select: + (match_operand:VDQHS 1 "register_operand" "") + (parallel [(match_operand:SI 2 "immediate_operand")]))) + (match_operand:VDQHS 3 "register_operand" "w")) + (match_operand:VDQHS 4 "register_operand" "0")))] + "TARGET_SIMD" + "mla\t%0., %3., %1.[%2]" + [(set_attr "simd_type" "simd_mla") + (set_attr "simd_mode" "")] +) + +(define_insn "*aarch64_mla_elt_" + [(set (match_operand:VDQHS 0 "register_operand" "=w") + (plus:VDQHS + (mult:VDQHS + (vec_duplicate:VDQHS + (vec_select: + (match_operand: 1 "register_operand" "") + (parallel [(match_operand:SI 2 "immediate_operand")]))) + (match_operand:VDQHS 3 "register_operand" "w")) + (match_operand:VDQHS 4 "register_operand" "0")))] + "TARGET_SIMD" + "mla\t%0., %3., %1.[%2]" + [(set_attr "simd_type" "simd_mla") + (set_attr "simd_mode" "")] +) + (define_insn "aarch64_mls" [(set (match_operand:VQ_S 0 "register_operand" "=w") (minus:VQ_S (match_operand:VQ_S 1 "register_operand" "0") @@ -1081,6 +1113,38 @@ (set_attr "simd_mode" "")] ) +(define_insn "*aarch64_mls_elt" + [(set (match_operand:VDQHS 0 "register_operand" "=w") + (minus:VDQHS + (match_operand:VDQHS 4 "register_operand" "0") + (mult:VDQHS + (vec_duplicate:VDQHS + (vec_select: + (match_operand:VDQHS 1 "register_operand" "") + (parallel [(match_operand:SI 2 "immediate_operand")]))) + (match_operand:VDQHS 3 "register_operand" "w"))))] + "TARGET_SIMD" + "mls\t%0., %3., %1.[%2]" + [(set_attr "simd_type" "simd_mla") + (set_attr "simd_mode" "")] +) + +(define_insn "*aarch64_mls_elt_" + [(set (match_operand:VDQHS 0 "register_operand" "=w") + (minus:VDQHS + (match_operand:VDQHS 4 "register_operand" "0") + (mult:VDQHS + (vec_duplicate:VDQHS + (vec_select: + (match_operand: 1 "register_operand" "") + (parallel [(match_operand:SI 2 "immediate_operand")]))) + (match_operand:VDQHS 3 "register_operand" "w"))))] + "TARGET_SIMD" + "mls\t%0., %3., %1.[%2]" + [(set_attr "simd_type" "simd_mla") + (set_attr "simd_mode" "")] +) + ;; Max/Min operations. 
(define_insn "3" [(set (match_operand:VQ_S 0 "register_operand" "=w") @@ -1483,6 +1547,137 @@ (set_attr "simd_mode" "")] ) +(define_insn "*aarch64_fma4_elt" + [(set (match_operand:VDQF 0 "register_operand" "=w") + (fma:VDQF + (vec_duplicate:VDQF + (vec_select: + (match_operand:VDQF 1 "register_operand" "") + (parallel [(match_operand:SI 2 "immediate_operand")]))) + (match_operand:VDQF 3 "register_operand" "w") + (match_operand:VDQF 4 "register_operand" "0")))] + "TARGET_SIMD" + "fmla\\t%0., %3., %1.[%2]" + [(set_attr "simd_type" "simd_fmla_elt") + (set_attr "simd_mode" "")] +) + +(define_insn "*aarch64_fma4_elt_" + [(set (match_operand:VDQSF 0 "register_operand" "=w") + (fma:VDQSF + (vec_duplicate:VDQSF + (vec_select: + (match_operand: 1 "register_operand" "") + (parallel [(match_operand:SI 2 "immediate_operand")]))) + (match_operand:VDQSF 3 "register_operand" "w") + (match_operand:VDQSF 4 "register_operand" "0")))] + "TARGET_SIMD" + "fmla\\t%0., %3., %1.[%2]" + [(set_attr "simd_type" "simd_fmla_elt") + (set_attr "simd_mode" "")] +) + +(define_insn "*aarch64_fma4_elt_to_128df" + [(set (match_operand:V2DF 0 "register_operand" "=w") + (fma:V2DF + (vec_duplicate:V2DF + (match_operand:DF 1 "register_operand" "w")) + (match_operand:V2DF 2 "register_operand" "w") + (match_operand:V2DF 3 "register_operand" "0")))] + "TARGET_SIMD" + "fmla\\t%0.2d, %2.2d, %1.2d[0]" + [(set_attr "simd_type" "simd_fmla_elt") + (set_attr "simd_mode" "V2DF")] +) + +(define_insn "*aarch64_fma4_elt_to_64v2df" + [(set (match_operand:DF 0 "register_operand" "=w") + (fma:DF + (vec_select:DF + (match_operand:V2DF 1 "register_operand" "w") + (parallel [(match_operand:SI 2 "immediate_operand")])) + (match_operand:DF 3 "register_operand" "w") + (match_operand:DF 4 "register_operand" "0")))] + "TARGET_SIMD" + "fmla\\t%0.2d, %3.2d, %1.2d[%2]" + [(set_attr "simd_type" "simd_fmla_elt") + (set_attr "simd_mode" "V2DF")] +) + +(define_insn "fnma4" + [(set (match_operand:VDQF 0 "register_operand" "=w") + (fma:VDQF + (match_operand:VDQF 1 "register_operand" "w") + (neg:VDQF + (match_operand:VDQF 2 "register_operand" "w")) + (match_operand:VDQF 3 "register_operand" "0")))] + "TARGET_SIMD" + "fmls\\t%0., %1., %2." 
+ [(set_attr "simd_type" "simd_fmla") + (set_attr "simd_mode" "")] +) + +(define_insn "*aarch64_fnma4_elt" + [(set (match_operand:VDQF 0 "register_operand" "=w") + (fma:VDQF + (neg:VDQF + (match_operand:VDQF 3 "register_operand" "w")) + (vec_duplicate:VDQF + (vec_select: + (match_operand:VDQF 1 "register_operand" "") + (parallel [(match_operand:SI 2 "immediate_operand")]))) + (match_operand:VDQF 4 "register_operand" "0")))] + "TARGET_SIMD" + "fmls\\t%0., %3., %1.[%2]" + [(set_attr "simd_type" "simd_fmla_elt") + (set_attr "simd_mode" "")] +) + +(define_insn "*aarch64_fnma4_elt_" + [(set (match_operand:VDQSF 0 "register_operand" "=w") + (fma:VDQSF + (neg:VDQSF + (match_operand:VDQSF 3 "register_operand" "w")) + (vec_duplicate:VDQSF + (vec_select: + (match_operand: 1 "register_operand" "") + (parallel [(match_operand:SI 2 "immediate_operand")]))) + (match_operand:VDQSF 4 "register_operand" "0")))] + "TARGET_SIMD" + "fmls\\t%0., %3., %1.[%2]" + [(set_attr "simd_type" "simd_fmla_elt") + (set_attr "simd_mode" "")] +) + +(define_insn "*aarch64_fnma4_elt_to_128df" + [(set (match_operand:V2DF 0 "register_operand" "=w") + (fma:V2DF + (neg:V2DF + (match_operand:V2DF 2 "register_operand" "w")) + (vec_duplicate:V2DF + (match_operand:DF 1 "register_operand" "w")) + (match_operand:V2DF 3 "register_operand" "0")))] + "TARGET_SIMD" + "fmls\\t%0.2d, %2.2d, %1.2d[0]" + [(set_attr "simd_type" "simd_fmla_elt") + (set_attr "simd_mode" "V2DF")] +) + +(define_insn "*aarch64_fnma4_elt_to_64v2df" + [(set (match_operand:DF 0 "register_operand" "=w") + (fma:DF + (vec_select:DF + (match_operand:V2DF 1 "register_operand" "w") + (parallel [(match_operand:SI 2 "immediate_operand")])) + (neg:DF + (match_operand:DF 3 "register_operand" "w")) + (match_operand:DF 4 "register_operand" "0")))] + "TARGET_SIMD" + "fmls\\t%0.2d, %3.2d, %1.2d[%2]" + [(set_attr "simd_type" "simd_fmla_elt") + (set_attr "simd_mode" "V2DF")] +) + ;; Vector versions of the floating-point frint patterns. ;; Expands to btrunc, ceil, floor, nearbyint, rint, round. 
(define_insn "2" diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 6c9dd79a695..cb5860206a1 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -6100,33 +6100,6 @@ vfma_f32 (float32x2_t a, float32x2_t b, float32x2_t c) return result; } -#define vfma_lane_f32(a, b, c, d) \ - __extension__ \ - ({ \ - float32x2_t c_ = (c); \ - float32x2_t b_ = (b); \ - float32x2_t a_ = (a); \ - float32x2_t result; \ - __asm__ ("fmla %0.2s,%2.2s,%3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vfmad_lane_f64(a, b, c) \ - __extension__ \ - ({ \ - float64x2_t b_ = (b); \ - float64_t a_ = (a); \ - float64_t result; \ - __asm__ ("fmla %d0,%d1,%2.d[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) vfmaq_f32 (float32x4_t a, float32x4_t b, float32x4_t c) { @@ -6149,47 +6122,6 @@ vfmaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c) return result; } -#define vfmaq_lane_f32(a, b, c, d) \ - __extension__ \ - ({ \ - float32x4_t c_ = (c); \ - float32x4_t b_ = (b); \ - float32x4_t a_ = (a); \ - float32x4_t result; \ - __asm__ ("fmla %0.4s,%2.4s,%3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vfmaq_lane_f64(a, b, c, d) \ - __extension__ \ - ({ \ - float64x2_t c_ = (c); \ - float64x2_t b_ = (b); \ - float64x2_t a_ = (a); \ - float64x2_t result; \ - __asm__ ("fmla %0.2d,%2.2d,%3.d[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vfmas_lane_f32(a, b, c) \ - __extension__ \ - ({ \ - float32x4_t b_ = (b); \ - float32_t a_ = (a); \ - float32_t result; \ - __asm__ ("fmla %s0,%s1,%2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vfma_n_f32 (float32x2_t a, float32x2_t b, float32_t c) { @@ -6234,19 +6166,6 @@ vfms_f32 (float32x2_t a, float32x2_t b, float32x2_t c) return result; } -#define vfmsd_lane_f64(a, b, c) \ - __extension__ \ - ({ \ - float64x2_t b_ = (b); \ - float64_t a_ = (a); \ - float64_t result; \ - __asm__ ("fmls %d0,%d1,%2.d[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) vfmsq_f32 (float32x4_t a, float32x4_t b, float32x4_t c) { @@ -6269,19 +6188,6 @@ vfmsq_f64 (float64x2_t a, float64x2_t b, float64x2_t c) return result; } -#define vfmss_lane_f32(a, b, c) \ - __extension__ \ - ({ \ - float32x4_t b_ = (b); \ - float32_t a_ = (a); \ - float32_t result; \ - __asm__ ("fmls %s0,%s1,%2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vget_high_f32 (float32x4_t a) { @@ -7122,133 +7028,6 @@ vld1q_dup_u64 (const uint64_t * a) result; \ }) -#define vmla_lane_f32(a, b, c, d) \ - __extension__ \ - ({ \ - float32x2_t c_ = (c); \ - float32x2_t b_ = (b); \ - float32x2_t a_ = (a); \ - float32x2_t result; \ - float32x2_t t1; \ - __asm__ ("fmul %1.2s, %3.2s, %4.s[%5]; fadd %0.2s, %0.2s, %1.2s" \ - : "=w"(result), "=w"(t1) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmla_lane_s16(a, b, 
c, d) \ - __extension__ \ - ({ \ - int16x4_t c_ = (c); \ - int16x4_t b_ = (b); \ - int16x4_t a_ = (a); \ - int16x4_t result; \ - __asm__ ("mla %0.4h, %2.4h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmla_lane_s32(a, b, c, d) \ - __extension__ \ - ({ \ - int32x2_t c_ = (c); \ - int32x2_t b_ = (b); \ - int32x2_t a_ = (a); \ - int32x2_t result; \ - __asm__ ("mla %0.2s, %2.2s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmla_lane_u16(a, b, c, d) \ - __extension__ \ - ({ \ - uint16x4_t c_ = (c); \ - uint16x4_t b_ = (b); \ - uint16x4_t a_ = (a); \ - uint16x4_t result; \ - __asm__ ("mla %0.4h, %2.4h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmla_lane_u32(a, b, c, d) \ - __extension__ \ - ({ \ - uint32x2_t c_ = (c); \ - uint32x2_t b_ = (b); \ - uint32x2_t a_ = (a); \ - uint32x2_t result; \ - __asm__ ("mla %0.2s, %2.2s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmla_laneq_s16(a, b, c, d) \ - __extension__ \ - ({ \ - int16x8_t c_ = (c); \ - int16x4_t b_ = (b); \ - int16x4_t a_ = (a); \ - int16x4_t result; \ - __asm__ ("mla %0.4h, %2.4h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmla_laneq_s32(a, b, c, d) \ - __extension__ \ - ({ \ - int32x4_t c_ = (c); \ - int32x2_t b_ = (b); \ - int32x2_t a_ = (a); \ - int32x2_t result; \ - __asm__ ("mla %0.2s, %2.2s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmla_laneq_u16(a, b, c, d) \ - __extension__ \ - ({ \ - uint16x8_t c_ = (c); \ - uint16x4_t b_ = (b); \ - uint16x4_t a_ = (a); \ - uint16x4_t result; \ - __asm__ ("mla %0.4h, %2.4h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmla_laneq_u32(a, b, c, d) \ - __extension__ \ - ({ \ - uint32x4_t c_ = (c); \ - uint32x2_t b_ = (b); \ - uint32x2_t a_ = (a); \ - uint32x2_t result; \ - __asm__ ("mla %0.2s, %2.2s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vmla_n_f32 (float32x2_t a, float32x2_t b, float32_t c) { @@ -7815,133 +7594,6 @@ vmlal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c) return result; } -#define vmlaq_lane_f32(a, b, c, d) \ - __extension__ \ - ({ \ - float32x4_t c_ = (c); \ - float32x4_t b_ = (b); \ - float32x4_t a_ = (a); \ - float32x4_t result; \ - float32x4_t t1; \ - __asm__ ("fmul %1.4s, %3.4s, %4.s[%5]; fadd %0.4s, %0.4s, %1.4s" \ - : "=w"(result), "=w"(t1) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlaq_lane_s16(a, b, c, d) \ - __extension__ \ - ({ \ - int16x8_t c_ = (c); \ - int16x8_t b_ = (b); \ - int16x8_t a_ = (a); \ - int16x8_t result; \ - __asm__ ("mla %0.8h, %2.8h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlaq_lane_s32(a, b, c, d) \ - __extension__ \ - ({ \ - int32x4_t c_ = (c); \ - int32x4_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("mla %0.4s, %2.4s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), 
"w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlaq_lane_u16(a, b, c, d) \ - __extension__ \ - ({ \ - uint16x8_t c_ = (c); \ - uint16x8_t b_ = (b); \ - uint16x8_t a_ = (a); \ - uint16x8_t result; \ - __asm__ ("mla %0.8h, %2.8h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlaq_lane_u32(a, b, c, d) \ - __extension__ \ - ({ \ - uint32x4_t c_ = (c); \ - uint32x4_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("mla %0.4s, %2.4s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlaq_laneq_s16(a, b, c, d) \ - __extension__ \ - ({ \ - int16x8_t c_ = (c); \ - int16x8_t b_ = (b); \ - int16x8_t a_ = (a); \ - int16x8_t result; \ - __asm__ ("mla %0.8h, %2.8h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlaq_laneq_s32(a, b, c, d) \ - __extension__ \ - ({ \ - int32x4_t c_ = (c); \ - int32x4_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("mla %0.4s, %2.4s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlaq_laneq_u16(a, b, c, d) \ - __extension__ \ - ({ \ - uint16x8_t c_ = (c); \ - uint16x8_t b_ = (b); \ - uint16x8_t a_ = (a); \ - uint16x8_t result; \ - __asm__ ("mla %0.8h, %2.8h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlaq_laneq_u32(a, b, c, d) \ - __extension__ \ - ({ \ - uint32x4_t c_ = (c); \ - uint32x4_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("mla %0.4s, %2.4s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) vmlaq_n_f32 (float32x4_t a, float32x4_t b, float32_t c) { @@ -8046,106 +7698,35 @@ vmlaq_s32 (int32x4_t a, int32x4_t b, int32x4_t c) __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) vmlaq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c) { - uint8x16_t result; - __asm__ ("mla %0.16b, %2.16b, %3.16b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmlaq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c) -{ - uint16x8_t result; - __asm__ ("mla %0.8h, %2.8h, %3.8h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmlaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c) -{ - uint32x4_t result; - __asm__ ("mla %0.4s, %2.4s, %3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -#define vmls_lane_f32(a, b, c, d) \ - __extension__ \ - ({ \ - float32x2_t c_ = (c); \ - float32x2_t b_ = (b); \ - float32x2_t a_ = (a); \ - float32x2_t result; \ - float32x2_t t1; \ - __asm__ ("fmul %1.2s, %3.2s, %4.s[%5]; fsub %0.2s, %0.2s, %1.2s" \ - : "=w"(result), "=w"(t1) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmls_lane_s16(a, b, c, d) \ - __extension__ \ - ({ \ - int16x4_t c_ = (c); \ - int16x4_t b_ = (b); \ - int16x4_t a_ = (a); \ - int16x4_t result; \ - __asm__ ("mls %0.4h,%2.4h,%3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), 
"w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmls_lane_s32(a, b, c, d) \ - __extension__ \ - ({ \ - int32x2_t c_ = (c); \ - int32x2_t b_ = (b); \ - int32x2_t a_ = (a); \ - int32x2_t result; \ - __asm__ ("mls %0.2s,%2.2s,%3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmls_lane_u16(a, b, c, d) \ - __extension__ \ - ({ \ - uint16x4_t c_ = (c); \ - uint16x4_t b_ = (b); \ - uint16x4_t a_ = (a); \ - uint16x4_t result; \ - __asm__ ("mls %0.4h,%2.4h,%3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) + uint8x16_t result; + __asm__ ("mla %0.16b, %2.16b, %3.16b" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} -#define vmls_lane_u32(a, b, c, d) \ - __extension__ \ - ({ \ - uint32x2_t c_ = (c); \ - uint32x2_t b_ = (b); \ - uint32x2_t a_ = (a); \ - uint32x2_t result; \ - __asm__ ("mls %0.2s,%2.2s,%3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmlaq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c) +{ + uint16x8_t result; + __asm__ ("mla %0.8h, %2.8h, %3.8h" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmlaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c) +{ + uint32x4_t result; + __asm__ ("mla %0.4s, %2.4s, %3.4s" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vmls_n_f32 (float32x2_t a, float32x2_t b, float32_t c) @@ -8713,148 +8294,6 @@ vmlsl_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c) return result; } -#define vmlsq_lane_f32(a, b, c, d) \ - __extension__ \ - ({ \ - float32x4_t c_ = (c); \ - float32x4_t b_ = (b); \ - float32x4_t a_ = (a); \ - float32x4_t result; \ - float32x4_t t1; \ - __asm__ ("fmul %1.4s, %3.4s, %4.s[%5]; fsub %0.4s, %0.4s, %1.4s" \ - : "=w"(result), "=w"(t1) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsq_lane_s16(a, b, c, d) \ - __extension__ \ - ({ \ - int16x8_t c_ = (c); \ - int16x8_t b_ = (b); \ - int16x8_t a_ = (a); \ - int16x8_t result; \ - __asm__ ("mls %0.8h,%2.8h,%3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsq_lane_s32(a, b, c, d) \ - __extension__ \ - ({ \ - int32x4_t c_ = (c); \ - int32x4_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("mls %0.4s,%2.4s,%3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsq_lane_u16(a, b, c, d) \ - __extension__ \ - ({ \ - uint16x8_t c_ = (c); \ - uint16x8_t b_ = (b); \ - uint16x8_t a_ = (a); \ - uint16x8_t result; \ - __asm__ ("mls %0.8h,%2.8h,%3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsq_lane_u32(a, b, c, d) \ - __extension__ \ - ({ \ - uint32x4_t c_ = (c); \ - uint32x4_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("mls %0.4s,%2.4s,%3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsq_laneq_f32(__a, __b, __c, __d) \ - 
__extension__ \ - ({ \ - float32x4_t __c_ = (__c); \ - float32x4_t __b_ = (__b); \ - float32x4_t __a_ = (__a); \ - float32x4_t __result; \ - float32x4_t __t1; \ - __asm__ ("fmul %1.4s, %3.4s, %4.s[%5]; fsub %0.4s, %0.4s, %1.4s" \ - : "=w"(__result), "=w"(__t1) \ - : "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d) \ - : /* No clobbers */); \ - __result; \ - }) - -#define vmlsq_laneq_s16(__a, __b, __c, __d) \ - __extension__ \ - ({ \ - int16x8_t __c_ = (__c); \ - int16x8_t __b_ = (__b); \ - int16x8_t __a_ = (__a); \ - int16x8_t __result; \ - __asm__ ("mls %0.8h, %2.8h, %3.h[%4]" \ - : "=w"(__result) \ - : "0"(__a_), "w"(__b_), "x"(__c_), "i"(__d) \ - : /* No clobbers */); \ - __result; \ - }) - -#define vmlsq_laneq_s32(__a, __b, __c, __d) \ - __extension__ \ - ({ \ - int32x4_t __c_ = (__c); \ - int32x4_t __b_ = (__b); \ - int32x4_t __a_ = (__a); \ - int32x4_t __result; \ - __asm__ ("mls %0.4s, %2.4s, %3.s[%4]" \ - : "=w"(__result) \ - : "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d) \ - : /* No clobbers */); \ - __result; \ - }) - -#define vmlsq_laneq_u16(__a, __b, __c, __d) \ - __extension__ \ - ({ \ - uint16x8_t __c_ = (__c); \ - uint16x8_t __b_ = (__b); \ - uint16x8_t __a_ = (__a); \ - uint16x8_t __result; \ - __asm__ ("mls %0.8h, %2.8h, %3.h[%4]" \ - : "=w"(__result) \ - : "0"(__a_), "w"(__b_), "x"(__c_), "i"(__d) \ - : /* No clobbers */); \ - __result; \ - }) - -#define vmlsq_laneq_u32(__a, __b, __c, __d) \ - __extension__ \ - ({ \ - uint32x4_t __c_ = (__c); \ - uint32x4_t __b_ = (__b); \ - uint32x4_t __a_ = (__a); \ - uint32x4_t __result; \ - __asm__ ("mls %0.4s, %2.4s, %3.s[%4]" \ - : "=w"(__result) \ - : "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d) \ - : /* No clobbers */); \ - __result; \ - }) - __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) vmlsq_n_f32 (float32x4_t a, float32x4_t b, float32_t c) { @@ -19488,130 +18927,334 @@ vduph_lane_p16 (poly16x4_t __a, const int __b) return __aarch64_vget_lane_p16 (__a, __b); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vduph_lane_s16 (int16x4_t __a, const int __b) +__extension__ static __inline int16_t __attribute__ ((__always_inline__)) +vduph_lane_s16 (int16x4_t __a, const int __b) +{ + return __aarch64_vget_lane_s16 (__a, __b); +} + +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vduph_lane_u16 (uint16x4_t __a, const int __b) +{ + return __aarch64_vget_lane_u16 (__a, __b); +} + +/* vdups_lane */ +__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +vdups_lane_f32 (float32x2_t __a, const int __b) +{ + return __aarch64_vget_lane_f32 (__a, __b); +} + +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vdups_lane_s32 (int32x2_t __a, const int __b) +{ + return __aarch64_vget_lane_s32 (__a, __b); +} + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vdups_lane_u32 (uint32x2_t __a, const int __b) +{ + return __aarch64_vget_lane_u32 (__a, __b); +} + +/* vdupd_lane */ +__extension__ static __inline float64_t __attribute__ ((__always_inline__)) +vdupd_lane_f64 (float64x1_t __a, const int __attribute__ ((unused)) __b) +{ + return __a; +} + +__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +vdupd_lane_s64 (int64x1_t __a, const int __attribute__ ((unused)) __b) +{ + return __a; +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vdupd_lane_u64 (uint64x1_t __a, const int __attribute__ ((unused)) __b) +{ + return __a; +} + +/* vdupb_laneq */ 
+__extension__ static __inline poly8_t __attribute__ ((__always_inline__)) +vdupb_laneq_p8 (poly8x16_t __a, const int __b) +{ + return __aarch64_vgetq_lane_p8 (__a, __b); +} + +__extension__ static __inline int8_t __attribute__ ((__always_inline__)) +vdupb_laneq_s8 (int8x16_t __a, const int __attribute__ ((unused)) __b) +{ + return __aarch64_vgetq_lane_s8 (__a, __b); +} + +__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) +vdupb_laneq_u8 (uint8x16_t __a, const int __b) +{ + return __aarch64_vgetq_lane_u8 (__a, __b); +} + +/* vduph_laneq */ +__extension__ static __inline poly16_t __attribute__ ((__always_inline__)) +vduph_laneq_p16 (poly16x8_t __a, const int __b) +{ + return __aarch64_vgetq_lane_p16 (__a, __b); +} + +__extension__ static __inline int16_t __attribute__ ((__always_inline__)) +vduph_laneq_s16 (int16x8_t __a, const int __b) +{ + return __aarch64_vgetq_lane_s16 (__a, __b); +} + +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vduph_laneq_u16 (uint16x8_t __a, const int __b) +{ + return __aarch64_vgetq_lane_u16 (__a, __b); +} + +/* vdups_laneq */ +__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +vdups_laneq_f32 (float32x4_t __a, const int __b) +{ + return __aarch64_vgetq_lane_f32 (__a, __b); +} + +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vdups_laneq_s32 (int32x4_t __a, const int __b) +{ + return __aarch64_vgetq_lane_s32 (__a, __b); +} + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vdups_laneq_u32 (uint32x4_t __a, const int __b) +{ + return __aarch64_vgetq_lane_u32 (__a, __b); +} + +/* vdupd_laneq */ +__extension__ static __inline float64_t __attribute__ ((__always_inline__)) +vdupd_laneq_f64 (float64x2_t __a, const int __b) +{ + return __aarch64_vgetq_lane_f64 (__a, __b); +} + +__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +vdupd_laneq_s64 (int64x2_t __a, const int __b) +{ + return __aarch64_vgetq_lane_s64 (__a, __b); +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vdupd_laneq_u64 (uint64x2_t __a, const int __b) +{ + return __aarch64_vgetq_lane_u64 (__a, __b); +} + +/* vfma_lane */ + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vfma_lane_f32 (float32x2_t __a, float32x2_t __b, + float32x2_t __c, const int __lane) +{ + return __builtin_aarch64_fmav2sf (__b, + __aarch64_vdup_lane_f32 (__c, __lane), + __a); +} + +__extension__ static __inline float64_t __attribute__ ((__always_inline__)) +vfma_lane_f64 (float64_t __a, float64_t __b, + float64_t __c, const int __lane) +{ + return __builtin_fma (__b, __c, __a); +} + +__extension__ static __inline float64_t __attribute__ ((__always_inline__)) +vfmad_lane_f64 (float64_t __a, float64_t __b, + float64_t __c, const int __lane) +{ + return __builtin_fma (__b, __c, __a); +} + +__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +vfmas_lane_f32 (float32_t __a, float32_t __b, + float32x2_t __c, const int __lane) +{ + return __builtin_fmaf (__b, __aarch64_vget_lane_f32 (__c, __lane), __a); +} + +/* vfma_laneq */ + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vfma_laneq_f32 (float32x2_t __a, float32x2_t __b, + float32x4_t __c, const int __lane) +{ + return __builtin_aarch64_fmav2sf (__b, + __aarch64_vdup_laneq_f32 (__c, __lane), + __a); +} + +__extension__ static __inline float64_t __attribute__ ((__always_inline__)) +vfma_laneq_f64 (float64_t __a, 
float64_t __b, + float64x2_t __c, const int __lane) { - return __aarch64_vget_lane_s16 (__a, __b); + return __builtin_fma (__b, __aarch64_vgetq_lane_f64 (__c, __lane), __a); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) -vduph_lane_u16 (uint16x4_t __a, const int __b) +__extension__ static __inline float64_t __attribute__ ((__always_inline__)) +vfmad_laneq_f64 (float64_t __a, float64_t __b, + float64x2_t __c, const int __lane) { - return __aarch64_vget_lane_u16 (__a, __b); + return __builtin_fma (__b, __aarch64_vgetq_lane_f64 (__c, __lane), __a); } -/* vdups_lane */ __extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vdups_lane_f32 (float32x2_t __a, const int __b) +vfmas_laneq_f32 (float32_t __a, float32_t __b, + float32x4_t __c, const int __lane) { - return __aarch64_vget_lane_f32 (__a, __b); + return __builtin_fmaf (__b, __aarch64_vgetq_lane_f32 (__c, __lane), __a); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vdups_lane_s32 (int32x2_t __a, const int __b) +/* vfmaq_lane */ + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vfmaq_lane_f32 (float32x4_t __a, float32x4_t __b, + float32x2_t __c, const int __lane) { - return __aarch64_vget_lane_s32 (__a, __b); + return __builtin_aarch64_fmav4sf (__b, + __aarch64_vdupq_lane_f32 (__c, __lane), + __a); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vdups_lane_u32 (uint32x2_t __a, const int __b) +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vfmaq_lane_f64 (float64x2_t __a, float64x2_t __b, + float64_t __c, const int __lane) { - return __aarch64_vget_lane_u32 (__a, __b); + return __builtin_aarch64_fmav2df (__b, vdupq_n_f64 (__c), __a); } -/* vdupd_lane */ -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vdupd_lane_f64 (float64x1_t __a, const int __attribute__ ((unused)) __b) +/* vfmaq_laneq */ + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vfmaq_laneq_f32 (float32x4_t __a, float32x4_t __b, + float32x4_t __c, const int __lane) { - return __a; + return __builtin_aarch64_fmav4sf (__b, + __aarch64_vdupq_laneq_f32 (__c, __lane), + __a); } -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vdupd_lane_s64 (int64x1_t __a, const int __attribute__ ((unused)) __b) +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vfmaq_laneq_f64 (float64x2_t __a, float64x2_t __b, + float64x2_t __c, const int __lane) { - return __a; + return __builtin_aarch64_fmav2df (__b, + __aarch64_vdupq_laneq_f64 (__c, __lane), + __a); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vdupd_lane_u64 (uint64x1_t __a, const int __attribute__ ((unused)) __b) +/* vfms_lane */ + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vfms_lane_f32 (float32x2_t __a, float32x2_t __b, + float32x2_t __c, const int __lane) { - return __a; + return __builtin_aarch64_fmav2sf (-__b, + __aarch64_vdup_lane_f32 (__c, __lane), + __a); } -/* vdupb_laneq */ -__extension__ static __inline poly8_t __attribute__ ((__always_inline__)) -vdupb_laneq_p8 (poly8x16_t __a, const int __b) +__extension__ static __inline float64_t __attribute__ ((__always_inline__)) +vfms_lane_f64 (float64_t __a, float64_t __b, + float64_t __c, const int __lane) { - return __aarch64_vgetq_lane_p8 (__a, __b); + return __builtin_fma (-__b, __c, __a); } -__extension__ static __inline 
int8_t __attribute__ ((__always_inline__)) -vdupb_laneq_s8 (int8x16_t __a, const int __attribute__ ((unused)) __b) +__extension__ static __inline float64_t __attribute__ ((__always_inline__)) +vfmsd_lane_f64 (float64_t __a, float64_t __b, + float64_t __c, const int __lane) { - return __aarch64_vgetq_lane_s8 (__a, __b); + return __builtin_fma (-__b, __c, __a); } -__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) -vdupb_laneq_u8 (uint8x16_t __a, const int __b) +__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +vfmss_lane_f32 (float32_t __a, float32_t __b, + float32x2_t __c, const int __lane) { - return __aarch64_vgetq_lane_u8 (__a, __b); + return __builtin_fmaf (-__b, __aarch64_vget_lane_f32 (__c, __lane), __a); } -/* vduph_laneq */ -__extension__ static __inline poly16_t __attribute__ ((__always_inline__)) -vduph_laneq_p16 (poly16x8_t __a, const int __b) +/* vfms_laneq */ + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vfms_laneq_f32 (float32x2_t __a, float32x2_t __b, + float32x4_t __c, const int __lane) { - return __aarch64_vgetq_lane_p16 (__a, __b); + return __builtin_aarch64_fmav2sf (-__b, + __aarch64_vdup_laneq_f32 (__c, __lane), + __a); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vduph_laneq_s16 (int16x8_t __a, const int __b) +__extension__ static __inline float64_t __attribute__ ((__always_inline__)) +vfms_laneq_f64 (float64_t __a, float64_t __b, + float64x2_t __c, const int __lane) { - return __aarch64_vgetq_lane_s16 (__a, __b); + return __builtin_fma (-__b, __aarch64_vgetq_lane_f64 (__c, __lane), __a); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) -vduph_laneq_u16 (uint16x8_t __a, const int __b) +__extension__ static __inline float64_t __attribute__ ((__always_inline__)) +vfmsd_laneq_f64 (float64_t __a, float64_t __b, + float64x2_t __c, const int __lane) { - return __aarch64_vgetq_lane_u16 (__a, __b); + return __builtin_fma (-__b, __aarch64_vgetq_lane_f64 (__c, __lane), __a); } -/* vdups_laneq */ __extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vdups_laneq_f32 (float32x4_t __a, const int __b) +vfmss_laneq_f32 (float32_t __a, float32_t __b, + float32x4_t __c, const int __lane) { - return __aarch64_vgetq_lane_f32 (__a, __b); + return __builtin_fmaf (-__b, __aarch64_vgetq_lane_f32 (__c, __lane), __a); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vdups_laneq_s32 (int32x4_t __a, const int __b) -{ - return __aarch64_vgetq_lane_s32 (__a, __b); -} +/* vfmsq_lane */ -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vdups_laneq_u32 (uint32x4_t __a, const int __b) +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vfmsq_lane_f32 (float32x4_t __a, float32x4_t __b, + float32x2_t __c, const int __lane) { - return __aarch64_vgetq_lane_u32 (__a, __b); + return __builtin_aarch64_fmav4sf (-__b, + __aarch64_vdupq_lane_f32 (__c, __lane), + __a); } -/* vdupd_laneq */ -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vdupd_laneq_f64 (float64x2_t __a, const int __b) +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vfmsq_lane_f64 (float64x2_t __a, float64x2_t __b, + float64_t __c, const int __lane) { - return __aarch64_vgetq_lane_f64 (__a, __b); + return __builtin_aarch64_fmav2df (-__b, vdupq_n_f64 (__c), __a); } -__extension__ static __inline int64_t __attribute__ 
((__always_inline__)) -vdupd_laneq_s64 (int64x2_t __a, const int __b) +/* vfmsq_laneq */ + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vfmsq_laneq_f32 (float32x4_t __a, float32x4_t __b, + float32x4_t __c, const int __lane) { - return __aarch64_vgetq_lane_s64 (__a, __b); + return __builtin_aarch64_fmav4sf (-__b, + __aarch64_vdupq_laneq_f32 (__c, __lane), + __a); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vdupd_laneq_u64 (uint64x2_t __a, const int __b) +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vfmsq_laneq_f64 (float64x2_t __a, float64x2_t __b, + float64x2_t __c, const int __lane) { - return __aarch64_vgetq_lane_u64 (__a, __b); + return __builtin_aarch64_fmav2df (-__b, + __aarch64_vdupq_laneq_f64 (__c, __lane), + __a); } /* vld1 */ @@ -21131,6 +20774,156 @@ vmlaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c) return a + b * c; } +/* vmla_lane */ + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vmla_lane_f32 (float32x2_t __a, float32x2_t __b, + float32x2_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vget_lane_f32 (__c, __lane))); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vmla_lane_s16 (int16x4_t __a, int16x4_t __b, + int16x4_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vget_lane_s16 (__c, __lane))); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vmla_lane_s32 (int32x2_t __a, int32x2_t __b, + int32x2_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vget_lane_s32 (__c, __lane))); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vmla_lane_u16 (uint16x4_t __a, uint16x4_t __b, + uint16x4_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vget_lane_u16 (__c, __lane))); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vmla_lane_u32 (uint32x2_t __a, uint32x2_t __b, + uint32x2_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vget_lane_u32 (__c, __lane))); +} + +/* vmla_laneq */ + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vmla_laneq_f32 (float32x2_t __a, float32x2_t __b, + float32x4_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vgetq_lane_f32 (__c, __lane))); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vmla_laneq_s16 (int16x4_t __a, int16x4_t __b, + int16x8_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vgetq_lane_s16 (__c, __lane))); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vmla_laneq_s32 (int32x2_t __a, int32x2_t __b, + int32x4_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vgetq_lane_s32 (__c, __lane))); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vmla_laneq_u16 (uint16x4_t __a, uint16x4_t __b, + uint16x8_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vgetq_lane_u16 (__c, __lane))); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vmla_laneq_u32 (uint32x2_t __a, uint32x2_t __b, + uint32x4_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vgetq_lane_u32 (__c, __lane))); +} + +/* vmlaq_lane */ + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vmlaq_lane_f32 (float32x4_t __a, float32x4_t __b, + float32x2_t __c, const int __lane) +{ + return (__a + (__b * 
__aarch64_vget_lane_f32 (__c, __lane))); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmlaq_lane_s16 (int16x8_t __a, int16x8_t __b, + int16x4_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vget_lane_s16 (__c, __lane))); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmlaq_lane_s32 (int32x4_t __a, int32x4_t __b, + int32x2_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vget_lane_s32 (__c, __lane))); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmlaq_lane_u16 (uint16x8_t __a, uint16x8_t __b, + uint16x4_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vget_lane_u16 (__c, __lane))); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmlaq_lane_u32 (uint32x4_t __a, uint32x4_t __b, + uint32x2_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vget_lane_u32 (__c, __lane))); +} + + /* vmlaq_laneq */ + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vmlaq_laneq_f32 (float32x4_t __a, float32x4_t __b, + float32x4_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vgetq_lane_f32 (__c, __lane))); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmlaq_laneq_s16 (int16x8_t __a, int16x8_t __b, + int16x8_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vgetq_lane_s16 (__c, __lane))); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmlaq_laneq_s32 (int32x4_t __a, int32x4_t __b, + int32x4_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vgetq_lane_s32 (__c, __lane))); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmlaq_laneq_u16 (uint16x8_t __a, uint16x8_t __b, + uint16x8_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vgetq_lane_u16 (__c, __lane))); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmlaq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, + uint32x4_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vgetq_lane_u32 (__c, __lane))); +} + +/* vmls */ + __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vmls_f32 (float32x2_t a, float32x2_t b, float32x2_t c) { @@ -21149,6 +20942,153 @@ vmlsq_f64 (float64x2_t a, float64x2_t b, float64x2_t c) return a - b * c; } +/* vmls_lane */ + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vmls_lane_f32 (float32x2_t __a, float32x2_t __b, + float32x2_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vget_lane_f32 (__c, __lane))); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vmls_lane_s16 (int16x4_t __a, int16x4_t __b, + int16x4_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vget_lane_s16 (__c, __lane))); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vmls_lane_s32 (int32x2_t __a, int32x2_t __b, + int32x2_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vget_lane_s32 (__c, __lane))); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vmls_lane_u16 (uint16x4_t __a, uint16x4_t __b, + uint16x4_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vget_lane_u16 (__c, __lane))); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vmls_lane_u32 (uint32x2_t __a, uint32x2_t __b, + uint32x2_t __c, const int __lane) +{ + 
return (__a - (__b * __aarch64_vget_lane_u32 (__c, __lane))); +} + +/* vmls_laneq */ + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vmls_laneq_f32 (float32x2_t __a, float32x2_t __b, + float32x4_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vgetq_lane_f32 (__c, __lane))); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vmls_laneq_s16 (int16x4_t __a, int16x4_t __b, + int16x8_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vgetq_lane_s16 (__c, __lane))); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vmls_laneq_s32 (int32x2_t __a, int32x2_t __b, + int32x4_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vgetq_lane_s32 (__c, __lane))); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vmls_laneq_u16 (uint16x4_t __a, uint16x4_t __b, + uint16x8_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vgetq_lane_u16 (__c, __lane))); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vmls_laneq_u32 (uint32x2_t __a, uint32x2_t __b, + uint32x4_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vgetq_lane_u32 (__c, __lane))); +} + +/* vmlsq_lane */ + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vmlsq_lane_f32 (float32x4_t __a, float32x4_t __b, + float32x2_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vget_lane_f32 (__c, __lane))); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmlsq_lane_s16 (int16x8_t __a, int16x8_t __b, + int16x4_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vget_lane_s16 (__c, __lane))); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmlsq_lane_s32 (int32x4_t __a, int32x4_t __b, + int32x2_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vget_lane_s32 (__c, __lane))); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmlsq_lane_u16 (uint16x8_t __a, uint16x8_t __b, + uint16x4_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vget_lane_u16 (__c, __lane))); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmlsq_lane_u32 (uint32x4_t __a, uint32x4_t __b, + uint32x2_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vget_lane_u32 (__c, __lane))); +} + + /* vmlsq_laneq */ + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vmlsq_laneq_f32 (float32x4_t __a, float32x4_t __b, + float32x4_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vgetq_lane_f32 (__c, __lane))); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmlsq_laneq_s16 (int16x8_t __a, int16x8_t __b, + int16x8_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vgetq_lane_s16 (__c, __lane))); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmlsq_laneq_s32 (int32x4_t __a, int32x4_t __b, + int32x4_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vgetq_lane_s32 (__c, __lane))); +} +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmlsq_laneq_u16 (uint16x8_t __a, uint16x8_t __b, + uint16x8_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vgetq_lane_u16 (__c, __lane))); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmlsq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, + 
uint32x4_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vgetq_lane_u32 (__c, __lane))); +} + /* vmul_lane */ __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index a6b3117c8a2..ec8d813fa3f 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -89,6 +89,9 @@ ;; Vector Float modes. (define_mode_iterator VDQF [V2SF V4SF V2DF]) +;; Vector single Float modes. +(define_mode_iterator VDQSF [V2SF V4SF]) + ;; Modes suitable to use as the return type of a vcond expression. (define_mode_iterator VDQF_COND [V2SF V2SI V4SF V4SI V2DF V2DI]) diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 17ae8ee1550..4f9b8a79191 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,10 @@ +2013-09-16 James Greenhalgh + + * gcc.target/aarch64/fmla-intrinsic.c: New. + * gcc.target/aarch64/mla-intrinsic.c: Likewise. + * gcc.target/aarch64/fmls-intrinsic.c: Likewise. + * gcc.target/aarch64/mls-intrinsic.c: Likewise. + 2013-09-16 James Greenhalgh * gcc.target/aarch64/mul_intrinsic_1.c: New. diff --git a/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c new file mode 100644 index 00000000000..0bf1b86b79e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/fmla_intrinsic_1.c @@ -0,0 +1,116 @@ +/* { dg-do run } */ +/* { dg-options "-O3 --save-temps" } */ + +#include + +#define DELTA 0.0001 + +extern double fabs (double); + +extern void abort (void); + +#define TEST_VMLA(q1, q2, size, in1_lanes, in2_lanes) \ +static void \ +test_vfma##q1##_lane##q2##_f##size (float##size##_t * res, \ + const float##size##_t *in1, \ + const float##size##_t *in2) \ +{ \ + float##size##x##in1_lanes##_t a = vld1##q1##_f##size (res); \ + float##size##x##in1_lanes##_t b = vld1##q1##_f##size (in1); \ + float##size##x##in2_lanes##_t c; \ + if (in2_lanes > 1) \ + { \ + c = vld1##q2##_f##size (in2); \ + a = vfma##q1##_lane##q2##_f##size (a, b, c, 1); \ + } \ + else \ + { \ + c = vld1##q2##_f##size (in2 + 1); \ + a = vfma##q1##_lane##q2##_f##size (a, b, c, 0); \ + } \ + vst1##q1##_f##size (res, a); \ +} + +#define BUILD_VARS(width, n_lanes, n_half_lanes) \ +TEST_VMLA ( , , width, n_half_lanes, n_half_lanes) \ +TEST_VMLA (q, , width, n_lanes, n_half_lanes) \ +TEST_VMLA ( , q, width, n_half_lanes, n_lanes) \ +TEST_VMLA (q, q, width, n_lanes, n_lanes) \ + +BUILD_VARS (32, 4, 2) +BUILD_VARS (64, 2, 1) + +#define POOL2 {0.0, 1.0} +#define POOL4 {0.0, 1.0, 2.0, 3.0} +#define EMPTY2 {0.0, 0.0} +#define EMPTY4 {0.0, 0.0, 0.0, 0.0} + +#define BUILD_TEST(size, lanes) \ +static void \ +test_f##size (void) \ +{ \ + int i; \ + float##size##_t pool[lanes] = POOL##lanes; \ + float##size##_t res[lanes] = EMPTY##lanes; \ + float##size##_t res2[lanes] = EMPTY##lanes; \ + float##size##_t res3[lanes] = EMPTY##lanes; \ + float##size##_t res4[lanes] = EMPTY##lanes; \ + \ + /* Forecfully avoid optimization. */ \ + asm volatile ("" : : : "memory"); \ + test_vfma_lane_f##size (res, pool, pool); \ + for (i = 0; i < lanes / 2; i++) \ + if (fabs (res[i] - pool[i]) > DELTA) \ + abort (); \ + \ + /* Forecfully avoid optimization. */ \ + asm volatile ("" : : : "memory"); \ + test_vfmaq_lane_f##size (res2, pool, pool); \ + for (i = 0; i < lanes; i++) \ + if (fabs (res2[i] - pool[i]) > DELTA) \ + abort (); \ + \ + /* Forecfully avoid optimization. 
*/ \ + asm volatile ("" : : : "memory"); \ + test_vfma_laneq_f##size (res3, pool, pool); \ + for (i = 0; i < lanes / 2; i++) \ + if (fabs (res3[i] - pool[i]) > DELTA) \ + abort (); \ + \ + /* Forecfully avoid optimization. */ \ + asm volatile ("" : : : "memory"); \ + test_vfmaq_laneq_f##size (res4, pool, pool); \ + for (i = 0; i < lanes; i++) \ + if (fabs (res4[i] - pool[i]) > DELTA) \ + abort (); \ +} + +BUILD_TEST (32, 4) +BUILD_TEST (64, 2) + +int +main (int argc, char **argv) +{ + test_f32 (); + test_f64 (); + return 0; +} + +/* vfma_laneq_f32. + vfma_lane_f32. */ +/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, v\[0-9\]+\.2s\\\[\[0-9\]+\\\]" 2 } } */ + +/* vfmaq_lane_f32. + vfmaq_laneq_f32. */ +/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.4s\\\[\[0-9\]+\\\]" 2 } } */ + +/* vfma_lane_f64. */ +/* { dg-final { scan-assembler-times "fmadd\\td\[0-9\]+\, d\[0-9\]+\, d\[0-9\]+\, d\[0-9\]+" 1 } } */ + +/* vfmaq_lane_f64. + vfma_laneq_f64. + vfmaq_laneq_f64. */ +/* { dg-final { scan-assembler-times "fmla\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */ + +/* { dg-final { cleanup-saved-temps } } */ + diff --git a/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c new file mode 100644 index 00000000000..8cc2942f8f1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/fmls_intrinsic_1.c @@ -0,0 +1,117 @@ +/* { dg-do run } */ +/* { dg-options "-O3 --save-temps" } */ + +#include + +#define DELTA 0.0001 + +extern double fabs (double); + +extern void abort (void); + +#define TEST_VMLS(q1, q2, size, in1_lanes, in2_lanes) \ +static void \ +test_vfms##q1##_lane##q2##_f##size (float##size##_t * res, \ + const float##size##_t *in1, \ + const float##size##_t *in2) \ +{ \ + float##size##x##in1_lanes##_t a = vld1##q1##_f##size (res); \ + float##size##x##in1_lanes##_t b = vld1##q1##_f##size (in1); \ + float##size##x##in2_lanes##_t c; \ + if (in2_lanes > 1) \ + { \ + c = vld1##q2##_f##size (in2); \ + a = vfms##q1##_lane##q2##_f##size (a, b, c, 1); \ + } \ + else \ + { \ + c = vld1##q2##_f##size (in2 + 1); \ + a = vfms##q1##_lane##q2##_f##size (a, b, c, 0); \ + } \ + vst1##q1##_f##size (res, a); \ +} + +#define BUILD_VARS(width, n_lanes, n_half_lanes) \ +TEST_VMLS ( , , width, n_half_lanes, n_half_lanes) \ +TEST_VMLS (q, , width, n_lanes, n_half_lanes) \ +TEST_VMLS ( , q, width, n_half_lanes, n_lanes) \ +TEST_VMLS (q, q, width, n_lanes, n_lanes) \ + +BUILD_VARS (32, 4, 2) +BUILD_VARS (64, 2, 1) + +#define POOL2 {0.0, 1.0} +#define POOL4 {0.0, 1.0, 2.0, 3.0} +#define EMPTY2 {0.0, 0.0} +#define EMPTY4 {0.0, 0.0, 0.0, 0.0} + +#define BUILD_TEST(size, lanes) \ +static void \ +test_f##size (void) \ +{ \ + int i; \ + float##size##_t pool[lanes] = POOL##lanes; \ + float##size##_t res[lanes] = EMPTY##lanes; \ + float##size##_t res2[lanes] = EMPTY##lanes; \ + float##size##_t res3[lanes] = EMPTY##lanes; \ + float##size##_t res4[lanes] = EMPTY##lanes; \ + \ + /* Forecfully avoid optimization. */ \ + asm volatile ("" : : : "memory"); \ + test_vfms_lane_f##size (res, pool, pool); \ + asm volatile ("" : :"Q" (res) : "memory"); \ + for (i = 0; i < lanes / 2; i++) \ + if (fabs (res[i] + pool[i]) > DELTA) \ + abort (); \ + \ + /* Forecfully avoid optimization. */ \ + test_vfmsq_lane_f##size (res2, pool, pool); \ + asm volatile ("" : :"Q" (res2) : "memory"); \ + for (i = 0; i < lanes; i++) \ + if (fabs (res2[i] + pool[i]) > DELTA) \ + abort (); \ + \ + /* Forecfully avoid optimization. 
*/ \ + test_vfms_laneq_f##size (res3, pool, pool); \ + asm volatile ("" : :"Q" (res3) : "memory"); \ + for (i = 0; i < lanes / 2; i++) \ + if (fabs (res3[i] + pool[i]) > DELTA) \ + abort (); \ + \ + /* Forecfully avoid optimization. */ \ + test_vfmsq_laneq_f##size (res4, pool, pool); \ + asm volatile ("" : :"Q" (res4) : "memory"); \ + for (i = 0; i < lanes; i++) \ + if (fabs (res4[i] + pool[i]) > DELTA) \ + abort (); \ +} + +BUILD_TEST (32, 4) +BUILD_TEST (64, 2) + +int +main (int argc, char **argv) +{ + test_f32 (); + test_f64 (); + return 0; +} + +/* vfms_laneq_f32. + vfms_lane_f32. */ +/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s, v\[0-9\]+\.2s\\\[\[0-9\]+\\\]" 2 } } */ + +/* vfmsq_lane_f32. + vfmsq_laneq_f32. */ +/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.4s\\\[\[0-9\]+\\\]" 2 } } */ + +/* vfms_lane_f64. */ +/* { dg-final { scan-assembler-times "fmsub\\td\[0-9\]+\, d\[0-9\]+\, d\[0-9\]+\, d\[0-9\]+" 1 } } */ + +/* vfmsq_lane_f64. + vfms_laneq_f64. + vfmsq_laneq_f64. */ +/* { dg-final { scan-assembler-times "fmls\\tv\[0-9\]+\.2d, v\[0-9\]+\.2d, v\[0-9\]+\.2d\\\[\[0-9\]+\\\]" 3 } } */ + +/* { dg-final { cleanup-saved-temps } } */ + diff --git a/gcc/testsuite/gcc.target/aarch64/mla_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/mla_intrinsic_1.c new file mode 100644 index 00000000000..fce41387354 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/mla_intrinsic_1.c @@ -0,0 +1,84 @@ +/* { dg-do run } */ +/* { dg-options "-O3 --save-temps" } */ + +#include + +extern void abort (void); + +#define MAPs(size, xx) int##size##xx##_t +#define MAPu(size, xx) uint##size##xx##_t + + +#define TEST_VMLA(q, su, size, in1_lanes, in2_lanes) \ +static void \ +test_vmlaq_lane##q##_##su##size (MAP##su (size, ) * res, \ + const MAP##su(size, ) *in1, \ + const MAP##su(size, ) *in2) \ +{ \ + MAP##su (size, x##in1_lanes) a = vld1q_##su##size (res); \ + MAP##su (size, x##in1_lanes) b = vld1q_##su##size (in1); \ + MAP##su (size, x##in2_lanes) c = vld1##q##_##su##size (in2); \ + a = vmlaq_lane##q##_##su##size (a, b, c, 1); \ + vst1q_##su##size (res, a); \ +} + +#define BUILD_VARS(width, n_lanes, n_half_lanes) \ +TEST_VMLA (, s, width, n_lanes, n_half_lanes) \ +TEST_VMLA (q, s, width, n_lanes, n_lanes) \ +TEST_VMLA (, u, width, n_lanes, n_half_lanes) \ +TEST_VMLA (q, u, width, n_lanes, n_lanes) \ + +BUILD_VARS (32, 4, 2) +BUILD_VARS (16, 8, 4) + +#define POOL4 {0, 1, 2, 3} +#define POOL8 {0, 1, 2, 3, 4, 5, 6, 7} +#define EMPTY4 {0, 0, 0, 0} +#define EMPTY8 {0, 0, 0, 0, 0, 0, 0, 0} + +#define BUILD_TEST(su, size, lanes) \ +static void \ +test_##su##size (void) \ +{ \ + int i; \ + MAP##su (size,) pool[lanes] = POOL##lanes; \ + MAP##su (size,) res[lanes] = EMPTY##lanes; \ + MAP##su (size,) res2[lanes] = EMPTY##lanes; \ + \ + /* Forecfully avoid optimization. */ \ + asm volatile ("" : : : "memory"); \ + test_vmlaq_lane_##su##size (res, pool, pool); \ + for (i = 0; i < lanes; i++) \ + if (res[i] != pool[i]) \ + abort (); \ + \ + /* Forecfully avoid optimization. 
*/ \ + asm volatile ("" : : : "memory"); \ + test_vmlaq_laneq_##su##size (res2, pool, pool); \ + for (i = 0; i < lanes; i++) \ + if (res2[i] != pool[i]) \ + abort (); \ +} + +#undef BUILD_VARS +#define BUILD_VARS(size, lanes) \ +BUILD_TEST (s, size, lanes) \ +BUILD_TEST (u, size, lanes) + +BUILD_VARS (32, 4) +BUILD_VARS (16, 8) + +int +main (int argc, char **argv) +{ + test_s32 (); + test_u32 (); + test_s16 (); + test_u16 (); + return 0; +} + +/* { dg-final { scan-assembler-times "mla\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.4s\\\[\[0-9\]+\\\]" 4 } } */ +/* { dg-final { scan-assembler-times "mla\\tv\[0-9\]+\.8h, v\[0-9\]+\.8h, v\[0-9\]+\.8h\\\[\[0-9\]+\\\]" 4 } } */ +/* { dg-final { cleanup-saved-temps } } */ + diff --git a/gcc/testsuite/gcc.target/aarch64/mls_intrinsic_1.c b/gcc/testsuite/gcc.target/aarch64/mls_intrinsic_1.c new file mode 100644 index 00000000000..8bf95b641c8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/mls_intrinsic_1.c @@ -0,0 +1,89 @@ +/* { dg-do run } */ +/* { dg-options "-O3 --save-temps" } */ + +#include + +extern void abort (void); + +#define MAPs(size, xx) int##size##xx##_t +#define MAPu(size, xx) uint##size##xx##_t + + +#define TEST_VMLS(q, su, size, in1_lanes, in2_lanes) \ +static void \ +test_vmlsq_lane##q##_##su##size (MAP##su (size, ) * res, \ + const MAP##su(size, ) *in1, \ + const MAP##su(size, ) *in2) \ +{ \ + MAP##su (size, x##in1_lanes) a = vld1q_##su##size (res); \ + MAP##su (size, x##in1_lanes) b = vld1q_##su##size (in1); \ + MAP##su (size, x##in2_lanes) c = vld1##q##_##su##size (in2); \ + a = vmlsq_lane##q##_##su##size (a, b, c, 1); \ + vst1q_##su##size (res, a); \ +} + +#define BUILD_VARS(width, n_lanes, n_half_lanes) \ +TEST_VMLS (, s, width, n_lanes, n_half_lanes) \ +TEST_VMLS (q, s, width, n_lanes, n_lanes) \ +TEST_VMLS (, u, width, n_lanes, n_half_lanes) \ +TEST_VMLS (q, u, width, n_lanes, n_lanes) \ + +BUILD_VARS (32, 4, 2) +BUILD_VARS (16, 8, 4) + +#define MAP_OPs + +#define MAP_OPu - + +#define POOL4 {0, 1, 2, 3} +#define POOL8 {0, 1, 2, 3, 4, 5, 6, 7} +#define EMPTY4s {0, 0, 0, 0} +#define EMPTY8s {0, 0, 0, 0, 0, 0, 0, 0} +#define EMPTY4u {0, 2, 4, 6} +#define EMPTY8u {0, 2, 4, 6, 8, 10, 12, 14} + +#define BUILD_TEST(su, size, lanes) \ +static void \ +test_##su##size (void) \ +{ \ + int i; \ + MAP##su (size,) pool[lanes] = POOL##lanes; \ + MAP##su (size,) res[lanes] = EMPTY##lanes##su; \ + MAP##su (size,) res2[lanes] = EMPTY##lanes##su; \ + \ + /* Forecfully avoid optimization. */ \ + asm volatile ("" : : : "memory"); \ + test_vmlsq_lane_##su##size (res, pool, pool); \ + for (i = 0; i < lanes; i++) \ + if (res[i] MAP_OP##su pool[i] != 0) \ + abort (); \ + \ + /* Forecfully avoid optimization. */ \ + asm volatile ("" : : : "memory"); \ + test_vmlsq_laneq_##su##size (res2, pool, pool); \ + for (i = 0; i < lanes; i++) \ + if (res2[i] MAP_OP##su pool[i] != 0) \ + abort (); \ +} + +#undef BUILD_VARS +#define BUILD_VARS(size, lanes) \ +BUILD_TEST (s, size, lanes) \ +BUILD_TEST (u, size, lanes) + +BUILD_VARS (32, 4) +BUILD_VARS (16, 8) + +int +main (int argc, char **argv) +{ + test_s32 (); + test_u32 (); + test_s16 (); + test_u16 (); + return 0; +} + +/* { dg-final { scan-assembler-times "mls\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s, v\[0-9\]+\.4s\\\[\[0-9\]+\\\]" 4 } } */ +/* { dg-final { scan-assembler-times "mls\\tv\[0-9\]+\.8h, v\[0-9\]+\.8h, v\[0-9\]+\.8h\\\[\[0-9\]+\\\]" 4 } } */ +/* { dg-final { cleanup-saved-temps } } */ + -- 2.30.2
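Editor's note (not part of the patch): the conversion above replaces inline-asm macros such as vfma_lane_f32 and vmla_lane_s16 with plain C implementations that the new RTL patterns can match as fmla/mla by-element instructions. The minimal standalone sketch below illustrates the intended semantics — each result lane i is acc[i] + x[i] * y[lane] — and is an assumption-labelled example, not material from the commit; it assumes an AArch64 toolchain with arm_neon.h and would typically be built with something like -O2 -march=armv8-a.

/* Minimal sketch of the lane-wise multiply-accumulate semantics covered
   by this patch.  Hypothetical example, not taken from the commit.  */
#include <arm_neon.h>
#include <stdio.h>

int
main (void)
{
  /* Floating-point fused multiply-accumulate by lane:
     res[i] = acc[i] + x[i] * y[1]  ->  {41.0, 61.0}.  */
  float32x2_t acc = vdup_n_f32 (1.0f);
  float32x2_t x = (float32x2_t) {2.0f, 3.0f};
  float32x2_t y = (float32x2_t) {10.0f, 20.0f};
  float32x2_t res = vfma_lane_f32 (acc, x, y, 1);

  /* Integer multiply-accumulate by lane:
     ires[i] = iacc[i] + ix[i] * iy[0]  ->  {6, 11, 16, 21}.  */
  int16x4_t iacc = vdup_n_s16 (1);
  int16x4_t ix = (int16x4_t) {1, 2, 3, 4};
  int16x4_t iy = (int16x4_t) {5, 6, 7, 8};
  int16x4_t ires = vmla_lane_s16 (iacc, ix, iy, 0);

  printf ("%f %f\n", vget_lane_f32 (res, 0), vget_lane_f32 (res, 1));
  printf ("%d %d %d %d\n", vget_lane_s16 (ires, 0), vget_lane_s16 (ires, 1),
	  vget_lane_s16 (ires, 2), vget_lane_s16 (ires, 3));
  return 0;
}

With the patch applied, the float case is expected to compile to a single fmla-by-element instruction (matched by *aarch64_fma4_elt) and the integer case to an mla-by-element instruction (matched by *aarch64_mla_elt), rather than going through the old hand-written asm statements.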