From 91bd4114a73ca69e5ce3d904282b402b9f2128d3 Mon Sep 17 00:00:00 2001
From: James Greenhalgh
Date: Mon, 2 Sep 2013 16:22:10 +0000
Subject: [PATCH] [AArch64] Rewrite the vdup_lane intrinsics in C

gcc/
	* config/aarch64/aarch64-simd-builtins.def
	(dup_lane_scalar): Remove.
	* config/aarch64/aarch64-simd.md
	(aarch64_simd_dup): Add 'w->w' alternative.
	(aarch64_dup_lane): Allow for VALL.
	(aarch64_dup_lane_scalar): Remove.
	(aarch64_dup_lane_): New.
	(aarch64_get_lane_signed): Add w->w alternative.
	(aarch64_get_lane_unsigned): Likewise.
	(aarch64_get_lane): Likewise.
	* config/aarch64/aarch64.c (aarch64_evpc_dup): New.
	(aarch64_expand_vec_perm_const_1): Use aarch64_evpc_dup.
	* config/aarch64/iterators.md (VSWAP_WIDTH): New.
	(VCON): Change container of V2SF.
	(vswap_width_name): Likewise.
	* config/aarch64/arm_neon.h
	(__aarch64_vdup_lane_any): New.
	(__aarch64_vdup_lane_<8,16,32,64>): Likewise.
	(vdup_n_<8,16,32,64>): Convert to C implementation.
	(vdup_lane_<8,16,32,64>): Likewise.

gcc/testsuite/
	* gcc.target/aarch64/scalar_intrinsics.c
	(vdup_lane<8,16,32,64>): Force values to SIMD registers.

From-SVN: r202180
---
 gcc/ChangeLog                                    |   23 +
 gcc/config/aarch64/aarch64-simd.md               |   39 +-
 gcc/config/aarch64/aarch64.c                     |   51 +
 gcc/config/aarch64/arm_neon.h                    | 9009 +++++++++--------
 gcc/config/aarch64/iterators.md                  |   16 +-
 gcc/testsuite/ChangeLog                          |    5 +
 .../gcc.target/aarch64/scalar_intrinsics.c       |   48 +-
 7 files changed, 4738 insertions(+), 4453 deletions(-)

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 918e667c02e..90096122489 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,26 @@
+2013-09-02  James Greenhalgh
+
+	* config/aarch64/aarch64-simd-builtins.def
+	(dup_lane_scalar): Remove.
+	* config/aarch64/aarch64-simd.md
+	(aarch64_simd_dup): Add 'w->w' alternative.
+	(aarch64_dup_lane): Allow for VALL.
+	(aarch64_dup_lane_scalar): Remove.
+	(aarch64_dup_lane_): New.
+	(aarch64_get_lane_signed): Add w->w alternative.
+	(aarch64_get_lane_unsigned): Likewise.
+	(aarch64_get_lane): Likewise.
+	* config/aarch64/aarch64.c (aarch64_evpc_dup): New.
+	(aarch64_expand_vec_perm_const_1): Use aarch64_evpc_dup.
+	* config/aarch64/iterators.md (VSWAP_WIDTH): New.
+	(VCON): Change container of V2SF.
+	(vswap_width_name): Likewise.
+	* config/aarch64/arm_neon.h
+	(__aarch64_vdup_lane_any): New.
+	(__aarch64_vdup_lane_<8,16,32,64>): Likewise.
+	(vdup_n_<8,16,32,64>): Convert to C implementation.
+	(vdup_lane_<8,16,32,64>): Likewise.
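
The core of the rewrite is visible in the new __aarch64_vdup_lane_any macro added to arm_neon.h further down: a lane duplicate is expressed in C as a vget_lane followed by a vdup_n, and the new aarch64_evpc_dup permute path lets the compiler fold the resulting vec_duplicate back into a single DUP instruction. A minimal sketch of the resulting shape, hand-expanded for the s32 case (the identifier vdup_lane_s32_sketch and the caller below are illustrative only, not part of the header):

    #include <arm_neon.h>

    /* Illustrative expansion of __aarch64_vdup_lane_any (s32, , , __a, __b):
       read lane __b of __a, then broadcast that scalar to every lane.  */
    __extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
    vdup_lane_s32_sketch (int32x2_t __a, const int __b)
    {
      return vdup_n_s32 (vget_lane_s32 (__a, __b));
    }

    /* With a constant lane number this should compile to a single
       "dup v0.2s, v1.s[1]" rather than the old hard-coded inline asm.  */
    int32x2_t
    broadcast_lane1 (int32x2_t __v)
    {
      return vdup_lane_s32_sketch (__v, 1);
    }

Leaving the combination visible to the compiler, instead of an opaque asm block, is what allows the permute and lane-extract optimisations described below.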
+ 2013-09-02 Eric Botcazou PR middle-end/56382 diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 982373099f7..f4b929edf44 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -336,32 +336,47 @@ }) (define_insn "aarch64_simd_dup" - [(set (match_operand:VDQ 0 "register_operand" "=w") - (vec_duplicate:VDQ (match_operand: 1 "register_operand" "r")))] + [(set (match_operand:VDQ 0 "register_operand" "=w, w") + (vec_duplicate:VDQ (match_operand: 1 "register_operand" "r, w")))] + "TARGET_SIMD" + "@ + dup\\t%0., %1 + dup\\t%0., %1.[0]" + [(set_attr "simd_type" "simd_dupgp, simd_dup") + (set_attr "simd_mode" "")] +) + +(define_insn "aarch64_simd_dup" + [(set (match_operand:VDQF 0 "register_operand" "=w") + (vec_duplicate:VDQF (match_operand: 1 "register_operand" "w")))] "TARGET_SIMD" - "dup\\t%0., %1" - [(set_attr "simd_type" "simd_dupgp") + "dup\\t%0., %1.[0]" + [(set_attr "simd_type" "simd_dup") (set_attr "simd_mode" "")] ) (define_insn "aarch64_dup_lane" - [(set (match_operand:VDQ_I 0 "register_operand" "=w") - (vec_duplicate:VDQ_I + [(set (match_operand:VALL 0 "register_operand" "=w") + (vec_duplicate:VALL (vec_select: - (match_operand: 1 "register_operand" "w") + (match_operand:VALL 1 "register_operand" "w") (parallel [(match_operand:SI 2 "immediate_operand" "i")]) )))] "TARGET_SIMD" - "dup\\t%0, %1.[%2]" + "dup\\t%0., %1.[%2]" [(set_attr "simd_type" "simd_dup") (set_attr "simd_mode" "")] ) -(define_insn "aarch64_simd_dup" - [(set (match_operand:VDQF 0 "register_operand" "=w") - (vec_duplicate:VDQF (match_operand: 1 "register_operand" "w")))] +(define_insn "aarch64_dup_lane_" + [(set (match_operand:VALL 0 "register_operand" "=w") + (vec_duplicate:VALL + (vec_select: + (match_operand: 1 "register_operand" "w") + (parallel [(match_operand:SI 2 "immediate_operand" "i")]) + )))] "TARGET_SIMD" - "dup\\t%0., %1.[0]" + "dup\\t%0., %1.[%2]" [(set_attr "simd_type" "simd_dup") (set_attr "simd_mode" "")] ) diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index aed035a434e..7635e1e2679 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -7931,6 +7931,55 @@ aarch64_evpc_zip (struct expand_vec_perm_d *d) return true; } +static bool +aarch64_evpc_dup (struct expand_vec_perm_d *d) +{ + rtx (*gen) (rtx, rtx, rtx); + rtx out = d->target; + rtx in0; + enum machine_mode vmode = d->vmode; + unsigned int i, elt, nelt = d->nelt; + rtx lane; + + /* TODO: This may not be big-endian safe. */ + if (BYTES_BIG_ENDIAN) + return false; + + elt = d->perm[0]; + for (i = 1; i < nelt; i++) + { + if (elt != d->perm[i]) + return false; + } + + /* The generic preparation in aarch64_expand_vec_perm_const_1 + swaps the operand order and the permute indices if it finds + d->perm[0] to be in the second operand. Thus, we can always + use d->op0 and need not do any extra arithmetic to get the + correct lane number. 
*/ + in0 = d->op0; + lane = GEN_INT (elt); + + switch (vmode) + { + case V16QImode: gen = gen_aarch64_dup_lanev16qi; break; + case V8QImode: gen = gen_aarch64_dup_lanev8qi; break; + case V8HImode: gen = gen_aarch64_dup_lanev8hi; break; + case V4HImode: gen = gen_aarch64_dup_lanev4hi; break; + case V4SImode: gen = gen_aarch64_dup_lanev4si; break; + case V2SImode: gen = gen_aarch64_dup_lanev2si; break; + case V2DImode: gen = gen_aarch64_dup_lanev2di; break; + case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break; + case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break; + case V2DFmode: gen = gen_aarch64_dup_lanev2df; break; + default: + return false; + } + + emit_insn (gen (out, in0, lane)); + return true; +} + static bool aarch64_evpc_tbl (struct expand_vec_perm_d *d) { @@ -7988,6 +8037,8 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) return true; else if (aarch64_evpc_trn (d)) return true; + else if (aarch64_evpc_dup (d)) + return true; return aarch64_evpc_tbl (d); } return false; diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index af097ad9a0d..e289a0dd459 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -508,6 +508,107 @@ typedef struct poly16x8x4_t #define __aarch64_vgetq_lane_u64(__a, __b) \ __aarch64_vget_lane_any (v2di, (uint64_t), (int64x2_t), __a, __b) +/* __aarch64_vdup_lane internal macros. */ +#define __aarch64_vdup_lane_any(__size, __q1, __q2, __a, __b) \ + vdup##__q1##_n_##__size (__aarch64_vget##__q2##_lane_##__size (__a, __b)) + +#define __aarch64_vdup_lane_f32(__a, __b) \ + __aarch64_vdup_lane_any (f32, , , __a, __b) +#define __aarch64_vdup_lane_f64(__a, __b) (__a) +#define __aarch64_vdup_lane_p8(__a, __b) \ + __aarch64_vdup_lane_any (p8, , , __a, __b) +#define __aarch64_vdup_lane_p16(__a, __b) \ + __aarch64_vdup_lane_any (p16, , , __a, __b) +#define __aarch64_vdup_lane_s8(__a, __b) \ + __aarch64_vdup_lane_any (s8, , , __a, __b) +#define __aarch64_vdup_lane_s16(__a, __b) \ + __aarch64_vdup_lane_any (s16, , , __a, __b) +#define __aarch64_vdup_lane_s32(__a, __b) \ + __aarch64_vdup_lane_any (s32, , , __a, __b) +#define __aarch64_vdup_lane_s64(__a, __b) (__a) +#define __aarch64_vdup_lane_u8(__a, __b) \ + __aarch64_vdup_lane_any (u8, , , __a, __b) +#define __aarch64_vdup_lane_u16(__a, __b) \ + __aarch64_vdup_lane_any (u16, , , __a, __b) +#define __aarch64_vdup_lane_u32(__a, __b) \ + __aarch64_vdup_lane_any (u32, , , __a, __b) +#define __aarch64_vdup_lane_u64(__a, __b) (__a) + +/* __aarch64_vdup_laneq internal macros. 
*/ +#define __aarch64_vdup_laneq_f32(__a, __b) \ + __aarch64_vdup_lane_any (f32, , q, __a, __b) +#define __aarch64_vdup_laneq_f64(__a, __b) \ + __aarch64_vdup_lane_any (f64, , q, __a, __b) +#define __aarch64_vdup_laneq_p8(__a, __b) \ + __aarch64_vdup_lane_any (p8, , q, __a, __b) +#define __aarch64_vdup_laneq_p16(__a, __b) \ + __aarch64_vdup_lane_any (p16, , q, __a, __b) +#define __aarch64_vdup_laneq_s8(__a, __b) \ + __aarch64_vdup_lane_any (s8, , q, __a, __b) +#define __aarch64_vdup_laneq_s16(__a, __b) \ + __aarch64_vdup_lane_any (s16, , q, __a, __b) +#define __aarch64_vdup_laneq_s32(__a, __b) \ + __aarch64_vdup_lane_any (s32, , q, __a, __b) +#define __aarch64_vdup_laneq_s64(__a, __b) \ + __aarch64_vdup_lane_any (s64, , q, __a, __b) +#define __aarch64_vdup_laneq_u8(__a, __b) \ + __aarch64_vdup_lane_any (u8, , q, __a, __b) +#define __aarch64_vdup_laneq_u16(__a, __b) \ + __aarch64_vdup_lane_any (u16, , q, __a, __b) +#define __aarch64_vdup_laneq_u32(__a, __b) \ + __aarch64_vdup_lane_any (u32, , q, __a, __b) +#define __aarch64_vdup_laneq_u64(__a, __b) \ + __aarch64_vdup_lane_any (u64, , q, __a, __b) + +/* __aarch64_vdupq_lane internal macros. */ +#define __aarch64_vdupq_lane_f32(__a, __b) \ + __aarch64_vdup_lane_any (f32, q, , __a, __b) +#define __aarch64_vdupq_lane_f64(__a, __b) (vdupq_n_f64 (__a)) +#define __aarch64_vdupq_lane_p8(__a, __b) \ + __aarch64_vdup_lane_any (p8, q, , __a, __b) +#define __aarch64_vdupq_lane_p16(__a, __b) \ + __aarch64_vdup_lane_any (p16, q, , __a, __b) +#define __aarch64_vdupq_lane_s8(__a, __b) \ + __aarch64_vdup_lane_any (s8, q, , __a, __b) +#define __aarch64_vdupq_lane_s16(__a, __b) \ + __aarch64_vdup_lane_any (s16, q, , __a, __b) +#define __aarch64_vdupq_lane_s32(__a, __b) \ + __aarch64_vdup_lane_any (s32, q, , __a, __b) +#define __aarch64_vdupq_lane_s64(__a, __b) (vdupq_n_s64 (__a)) +#define __aarch64_vdupq_lane_u8(__a, __b) \ + __aarch64_vdup_lane_any (u8, q, , __a, __b) +#define __aarch64_vdupq_lane_u16(__a, __b) \ + __aarch64_vdup_lane_any (u16, q, , __a, __b) +#define __aarch64_vdupq_lane_u32(__a, __b) \ + __aarch64_vdup_lane_any (u32, q, , __a, __b) +#define __aarch64_vdupq_lane_u64(__a, __b) (vdupq_n_u64 (__a)) + +/* __aarch64_vdupq_laneq internal macros. 
*/ +#define __aarch64_vdupq_laneq_f32(__a, __b) \ + __aarch64_vdup_lane_any (f32, q, q, __a, __b) +#define __aarch64_vdupq_laneq_f64(__a, __b) \ + __aarch64_vdup_lane_any (f64, q, q, __a, __b) +#define __aarch64_vdupq_laneq_p8(__a, __b) \ + __aarch64_vdup_lane_any (p8, q, q, __a, __b) +#define __aarch64_vdupq_laneq_p16(__a, __b) \ + __aarch64_vdup_lane_any (p16, q, q, __a, __b) +#define __aarch64_vdupq_laneq_s8(__a, __b) \ + __aarch64_vdup_lane_any (s8, q, q, __a, __b) +#define __aarch64_vdupq_laneq_s16(__a, __b) \ + __aarch64_vdup_lane_any (s16, q, q, __a, __b) +#define __aarch64_vdupq_laneq_s32(__a, __b) \ + __aarch64_vdup_lane_any (s32, q, q, __a, __b) +#define __aarch64_vdupq_laneq_s64(__a, __b) \ + __aarch64_vdup_lane_any (s64, q, q, __a, __b) +#define __aarch64_vdupq_laneq_u8(__a, __b) \ + __aarch64_vdup_lane_any (u8, q, q, __a, __b) +#define __aarch64_vdupq_laneq_u16(__a, __b) \ + __aarch64_vdup_lane_any (u16, q, q, __a, __b) +#define __aarch64_vdupq_laneq_u32(__a, __b) \ + __aarch64_vdup_lane_any (u32, q, q, __a, __b) +#define __aarch64_vdupq_laneq_u64(__a, __b) \ + __aarch64_vdup_lane_any (u64, q, q, __a, __b) + /* vadd */ __extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) vadd_s8 (int8x8_t __a, int8x8_t __b) @@ -5676,2775 +5777,2626 @@ vcvtxd_f32_f64 (float64_t a) return result; } -#define vdup_lane_f32(a, b) \ +#define vext_f32(a, b, c) \ __extension__ \ ({ \ + float32x2_t b_ = (b); \ float32x2_t a_ = (a); \ float32x2_t result; \ - __asm__ ("dup %0.2s,%1.s[%2]" \ + __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*4" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +#define vext_f64(a, b, c) \ + __extension__ \ + ({ \ + float64x1_t b_ = (b); \ + float64x1_t a_ = (a); \ + float64x1_t result; \ + __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*8" \ + : "=w"(result) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdup_lane_p8(a, b) \ +#define vext_p8(a, b, c) \ __extension__ \ ({ \ + poly8x8_t b_ = (b); \ poly8x8_t a_ = (a); \ poly8x8_t result; \ - __asm__ ("dup %0.8b,%1.b[%2]" \ + __asm__ ("ext %0.8b,%1.8b,%2.8b,%3" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdup_lane_p16(a, b) \ +#define vext_p16(a, b, c) \ __extension__ \ ({ \ + poly16x4_t b_ = (b); \ poly16x4_t a_ = (a); \ poly16x4_t result; \ - __asm__ ("dup %0.4h,%1.h[%2]" \ + __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*2" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdup_lane_s8(a, b) \ +#define vext_s8(a, b, c) \ __extension__ \ ({ \ + int8x8_t b_ = (b); \ int8x8_t a_ = (a); \ int8x8_t result; \ - __asm__ ("dup %0.8b,%1.b[%2]" \ + __asm__ ("ext %0.8b,%1.8b,%2.8b,%3" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdup_lane_s16(a, b) \ +#define vext_s16(a, b, c) \ __extension__ \ ({ \ + int16x4_t b_ = (b); \ int16x4_t a_ = (a); \ int16x4_t result; \ - __asm__ ("dup %0.4h,%1.h[%2]" \ + __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*2" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdup_lane_s32(a, b) \ +#define vext_s32(a, b, c) \ __extension__ \ ({ \ + int32x2_t b_ = (b); \ int32x2_t a_ = (a); \ int32x2_t result; \ - __asm__ ("dup %0.2s,%1.s[%2]" \ + __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*4" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), 
"i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdup_lane_s64(a, b) \ +#define vext_s64(a, b, c) \ __extension__ \ ({ \ + int64x1_t b_ = (b); \ int64x1_t a_ = (a); \ int64x1_t result; \ - __asm__ ("ins %0.d[0],%1.d[%2]" \ + __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*8" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdup_lane_u8(a, b) \ +#define vext_u8(a, b, c) \ __extension__ \ ({ \ + uint8x8_t b_ = (b); \ uint8x8_t a_ = (a); \ uint8x8_t result; \ - __asm__ ("dup %0.8b,%1.b[%2]" \ + __asm__ ("ext %0.8b,%1.8b,%2.8b,%3" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdup_lane_u16(a, b) \ +#define vext_u16(a, b, c) \ __extension__ \ ({ \ + uint16x4_t b_ = (b); \ uint16x4_t a_ = (a); \ uint16x4_t result; \ - __asm__ ("dup %0.4h,%1.h[%2]" \ + __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*2" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdup_lane_u32(a, b) \ +#define vext_u32(a, b, c) \ __extension__ \ ({ \ + uint32x2_t b_ = (b); \ uint32x2_t a_ = (a); \ uint32x2_t result; \ - __asm__ ("dup %0.2s,%1.s[%2]" \ + __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*4" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdup_lane_u64(a, b) \ +#define vext_u64(a, b, c) \ __extension__ \ ({ \ + uint64x1_t b_ = (b); \ uint64x1_t a_ = (a); \ uint64x1_t result; \ - __asm__ ("ins %0.d[0],%1.d[%2]" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vdup_n_f32 (float32_t a) -{ - float32x2_t result; - __asm__ ("dup %0.2s, %w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vdup_n_p8 (uint32_t a) -{ - poly8x8_t result; - __asm__ ("dup %0.8b,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) -vdup_n_p16 (uint32_t a) -{ - poly16x4_t result; - __asm__ ("dup %0.4h,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vdup_n_s8 (int32_t a) -{ - int8x8_t result; - __asm__ ("dup %0.8b,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vdup_n_s16 (int32_t a) -{ - int16x4_t result; - __asm__ ("dup %0.4h,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vdup_n_s32 (int32_t a) -{ - int32x2_t result; - __asm__ ("dup %0.2s,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vdup_n_s64 (int64_t a) -{ - int64x1_t result; - __asm__ ("ins %0.d[0],%x1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vdup_n_u8 (uint32_t a) -{ - uint8x8_t result; - __asm__ ("dup %0.8b,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vdup_n_u16 
(uint32_t a) -{ - uint16x4_t result; - __asm__ ("dup %0.4h,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vdup_n_u32 (uint32_t a) -{ - uint32x2_t result; - __asm__ ("dup %0.2s,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vdup_n_u64 (uint64_t a) -{ - uint64x1_t result; - __asm__ ("ins %0.d[0],%x1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -#define vdupd_lane_f64(a, b) \ - __extension__ \ - ({ \ - float64x2_t a_ = (a); \ - float64_t result; \ - __asm__ ("dup %d0, %1.d[%2]" \ + __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*8" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdupq_lane_f32(a, b) \ +#define vextq_f32(a, b, c) \ __extension__ \ ({ \ - float32x2_t a_ = (a); \ + float32x4_t b_ = (b); \ + float32x4_t a_ = (a); \ float32x4_t result; \ - __asm__ ("dup %0.4s,%1.s[%2]" \ + __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*4" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdupq_lane_f64(a, b) \ +#define vextq_f64(a, b, c) \ __extension__ \ ({ \ - float64x1_t a_ = (a); \ + float64x2_t b_ = (b); \ + float64x2_t a_ = (a); \ float64x2_t result; \ - __asm__ ("dup %0.2d,%1.d[%2]" \ + __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*8" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdupq_lane_p8(a, b) \ +#define vextq_p8(a, b, c) \ __extension__ \ ({ \ - poly8x8_t a_ = (a); \ + poly8x16_t b_ = (b); \ + poly8x16_t a_ = (a); \ poly8x16_t result; \ - __asm__ ("dup %0.16b,%1.b[%2]" \ + __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdupq_lane_p16(a, b) \ +#define vextq_p16(a, b, c) \ __extension__ \ ({ \ - poly16x4_t a_ = (a); \ + poly16x8_t b_ = (b); \ + poly16x8_t a_ = (a); \ poly16x8_t result; \ - __asm__ ("dup %0.8h,%1.h[%2]" \ + __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*2" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdupq_lane_s8(a, b) \ +#define vextq_s8(a, b, c) \ __extension__ \ ({ \ - int8x8_t a_ = (a); \ + int8x16_t b_ = (b); \ + int8x16_t a_ = (a); \ int8x16_t result; \ - __asm__ ("dup %0.16b,%1.b[%2]" \ + __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdupq_lane_s16(a, b) \ +#define vextq_s16(a, b, c) \ __extension__ \ ({ \ - int16x4_t a_ = (a); \ + int16x8_t b_ = (b); \ + int16x8_t a_ = (a); \ int16x8_t result; \ - __asm__ ("dup %0.8h,%1.h[%2]" \ + __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*2" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdupq_lane_s32(a, b) \ +#define vextq_s32(a, b, c) \ __extension__ \ ({ \ - int32x2_t a_ = (a); \ + int32x4_t b_ = (b); \ + int32x4_t a_ = (a); \ int32x4_t result; \ - __asm__ ("dup %0.4s,%1.s[%2]" \ + __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*4" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdupq_lane_s64(a, b) \ +#define vextq_s64(a, b, c) \ __extension__ \ ({ \ - int64x1_t 
a_ = (a); \ + int64x2_t b_ = (b); \ + int64x2_t a_ = (a); \ int64x2_t result; \ - __asm__ ("dup %0.2d,%1.d[%2]" \ + __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*8" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdupq_lane_u8(a, b) \ +#define vextq_u8(a, b, c) \ __extension__ \ ({ \ - uint8x8_t a_ = (a); \ + uint8x16_t b_ = (b); \ + uint8x16_t a_ = (a); \ uint8x16_t result; \ - __asm__ ("dup %0.16b,%1.b[%2]" \ + __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdupq_lane_u16(a, b) \ +#define vextq_u16(a, b, c) \ __extension__ \ ({ \ - uint16x4_t a_ = (a); \ + uint16x8_t b_ = (b); \ + uint16x8_t a_ = (a); \ uint16x8_t result; \ - __asm__ ("dup %0.8h,%1.h[%2]" \ + __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*2" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdupq_lane_u32(a, b) \ +#define vextq_u32(a, b, c) \ __extension__ \ ({ \ - uint32x2_t a_ = (a); \ + uint32x4_t b_ = (b); \ + uint32x4_t a_ = (a); \ uint32x4_t result; \ - __asm__ ("dup %0.4s,%1.s[%2]" \ + __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*4" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vdupq_lane_u64(a, b) \ +#define vextq_u64(a, b, c) \ __extension__ \ ({ \ - uint64x1_t a_ = (a); \ + uint64x2_t b_ = (b); \ + uint64x2_t a_ = (a); \ uint64x2_t result; \ - __asm__ ("dup %0.2d,%1.d[%2]" \ + __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*8" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "w"(a_), "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vfma_f32 (float32x2_t a, float32x2_t b, float32x2_t c) +{ + float32x2_t result; + __asm__ ("fmla %0.2s,%2.2s,%3.2s" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +#define vfma_lane_f32(a, b, c, d) \ + __extension__ \ + ({ \ + float32x2_t c_ = (c); \ + float32x2_t b_ = (b); \ + float32x2_t a_ = (a); \ + float32x2_t result; \ + __asm__ ("fmla %0.2s,%2.2s,%3.s[%4]" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ + : /* No clobbers */); \ + result; \ + }) + +#define vfmad_lane_f64(a, b, c) \ + __extension__ \ + ({ \ + float64x2_t b_ = (b); \ + float64_t a_ = (a); \ + float64_t result; \ + __asm__ ("fmla %d0,%d1,%2.d[%3]" \ + : "=w"(result) \ + : "w"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vdupq_n_f32 (float32_t a) +vfmaq_f32 (float32x4_t a, float32x4_t b, float32x4_t c) { float32x4_t result; - __asm__ ("dup %0.4s, %w1" + __asm__ ("fmla %0.4s,%2.4s,%3.4s" : "=w"(result) - : "r"(a) + : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } __extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vdupq_n_f64 (float64_t a) +vfmaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c) { float64x2_t result; - __asm__ ("dup %0.2d, %x1" + __asm__ ("fmla %0.2d,%2.2d,%3.2d" : "=w"(result) - : "r"(a) + : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vdupq_n_p8 (uint32_t a) +#define vfmaq_lane_f32(a, b, c, d) \ + __extension__ \ + ({ \ + float32x4_t c_ = (c); \ + float32x4_t b_ = (b); \ + float32x4_t a_ = (a); \ + float32x4_t result; \ + 
__asm__ ("fmla %0.4s,%2.4s,%3.s[%4]" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ + : /* No clobbers */); \ + result; \ + }) + +#define vfmaq_lane_f64(a, b, c, d) \ + __extension__ \ + ({ \ + float64x2_t c_ = (c); \ + float64x2_t b_ = (b); \ + float64x2_t a_ = (a); \ + float64x2_t result; \ + __asm__ ("fmla %0.2d,%2.2d,%3.d[%4]" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ + : /* No clobbers */); \ + result; \ + }) + +#define vfmas_lane_f32(a, b, c) \ + __extension__ \ + ({ \ + float32x4_t b_ = (b); \ + float32_t a_ = (a); \ + float32_t result; \ + __asm__ ("fmla %s0,%s1,%2.s[%3]" \ + : "=w"(result) \ + : "w"(a_), "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vfma_n_f32 (float32x2_t a, float32x2_t b, float32_t c) { - poly8x16_t result; - __asm__ ("dup %0.16b,%w1" + float32x2_t result; + __asm__ ("fmla %0.2s, %2.2s, %3.s[0]" : "=w"(result) - : "r"(a) + : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) -vdupq_n_p16 (uint32_t a) +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vfmaq_n_f32 (float32x4_t a, float32x4_t b, float32_t c) { - poly16x8_t result; - __asm__ ("dup %0.8h,%w1" + float32x4_t result; + __asm__ ("fmla %0.4s, %2.4s, %3.s[0]" : "=w"(result) - : "r"(a) + : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vdupq_n_s8 (int32_t a) +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vfmaq_n_f64 (float64x2_t a, float64x2_t b, float64_t c) { - int8x16_t result; - __asm__ ("dup %0.16b,%w1" + float64x2_t result; + __asm__ ("fmla %0.2d, %2.2d, %3.d[0]" : "=w"(result) - : "r"(a) + : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vdupq_n_s16 (int32_t a) +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vfms_f32 (float32x2_t a, float32x2_t b, float32x2_t c) { - int16x8_t result; - __asm__ ("dup %0.8h,%w1" + float32x2_t result; + __asm__ ("fmls %0.2s,%2.2s,%3.2s" : "=w"(result) - : "r"(a) + : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vdupq_n_s32 (int32_t a) +#define vfmsd_lane_f64(a, b, c) \ + __extension__ \ + ({ \ + float64x2_t b_ = (b); \ + float64_t a_ = (a); \ + float64_t result; \ + __asm__ ("fmls %d0,%d1,%2.d[%3]" \ + : "=w"(result) \ + : "w"(a_), "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vfmsq_f32 (float32x4_t a, float32x4_t b, float32x4_t c) { - int32x4_t result; - __asm__ ("dup %0.4s,%w1" + float32x4_t result; + __asm__ ("fmls %0.4s,%2.4s,%3.4s" : "=w"(result) - : "r"(a) + : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vdupq_n_s64 (int64_t a) +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vfmsq_f64 (float64x2_t a, float64x2_t b, float64x2_t c) { - int64x2_t result; - __asm__ ("dup %0.2d,%x1" + float64x2_t result; + __asm__ ("fmls %0.2d,%2.2d,%3.2d" : "=w"(result) - : "r"(a) + : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline 
uint8x16_t __attribute__ ((__always_inline__)) -vdupq_n_u8 (uint32_t a) +#define vfmss_lane_f32(a, b, c) \ + __extension__ \ + ({ \ + float32x4_t b_ = (b); \ + float32_t a_ = (a); \ + float32_t result; \ + __asm__ ("fmls %s0,%s1,%2.s[%3]" \ + : "=w"(result) \ + : "w"(a_), "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vget_high_f32 (float32x4_t a) { - uint8x16_t result; - __asm__ ("dup %0.16b,%w1" + float32x2_t result; + __asm__ ("ins %0.d[0], %1.d[1]" : "=w"(result) - : "r"(a) + : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vdupq_n_u16 (uint32_t a) +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) +vget_high_f64 (float64x2_t a) { - uint16x8_t result; - __asm__ ("dup %0.8h,%w1" + float64x1_t result; + __asm__ ("ins %0.d[0], %1.d[1]" : "=w"(result) - : "r"(a) + : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vdupq_n_u32 (uint32_t a) +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vget_high_p8 (poly8x16_t a) { - uint32x4_t result; - __asm__ ("dup %0.4s,%w1" + poly8x8_t result; + __asm__ ("ins %0.d[0], %1.d[1]" : "=w"(result) - : "r"(a) + : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vdupq_n_u64 (uint64_t a) +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vget_high_p16 (poly16x8_t a) { - uint64x2_t result; - __asm__ ("dup %0.2d,%x1" + poly16x4_t result; + __asm__ ("ins %0.d[0], %1.d[1]" : "=w"(result) - : "r"(a) + : "w"(a) : /* No clobbers */); return result; } -#define vdups_lane_f32(a, b) \ - __extension__ \ - ({ \ - float32x4_t a_ = (a); \ - float32_t result; \ - __asm__ ("dup %s0, %1.s[%2]" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -#define vext_f32(a, b, c) \ - __extension__ \ - ({ \ - float32x2_t b_ = (b); \ - float32x2_t a_ = (a); \ - float32x2_t result; \ - __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*4" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vget_high_s8 (int8x16_t a) +{ + int8x8_t result; + __asm__ ("ins %0.d[0], %1.d[1]" + : "=w"(result) + : "w"(a) + : /* No clobbers */); + return result; +} -#define vext_f64(a, b, c) \ - __extension__ \ - ({ \ - float64x1_t b_ = (b); \ - float64x1_t a_ = (a); \ - float64x1_t result; \ - __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*8" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vext_p8(a, b, c) \ - __extension__ \ - ({ \ - poly8x8_t b_ = (b); \ - poly8x8_t a_ = (a); \ - poly8x8_t result; \ - __asm__ ("ext %0.8b,%1.8b,%2.8b,%3" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vget_high_s16 (int16x8_t a) +{ + int16x4_t result; + __asm__ ("ins %0.d[0], %1.d[1]" + : "=w"(result) + : "w"(a) + : /* No clobbers */); + return result; +} -#define vext_p16(a, b, c) \ - __extension__ \ - ({ \ - poly16x4_t b_ = (b); \ - poly16x4_t a_ = (a); \ - poly16x4_t result; \ - __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*2" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; 
\ - }) +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vget_high_s32 (int32x4_t a) +{ + int32x2_t result; + __asm__ ("ins %0.d[0], %1.d[1]" + : "=w"(result) + : "w"(a) + : /* No clobbers */); + return result; +} -#define vext_s8(a, b, c) \ - __extension__ \ - ({ \ - int8x8_t b_ = (b); \ - int8x8_t a_ = (a); \ - int8x8_t result; \ - __asm__ ("ext %0.8b,%1.8b,%2.8b,%3" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vget_high_s64 (int64x2_t a) +{ + int64x1_t result; + __asm__ ("ins %0.d[0], %1.d[1]" + : "=w"(result) + : "w"(a) + : /* No clobbers */); + return result; +} -#define vext_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x4_t b_ = (b); \ - int16x4_t a_ = (a); \ - int16x4_t result; \ - __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*2" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vget_high_u8 (uint8x16_t a) +{ + uint8x8_t result; + __asm__ ("ins %0.d[0], %1.d[1]" + : "=w"(result) + : "w"(a) + : /* No clobbers */); + return result; +} -#define vext_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x2_t b_ = (b); \ - int32x2_t a_ = (a); \ - int32x2_t result; \ - __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*4" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vget_high_u16 (uint16x8_t a) +{ + uint16x4_t result; + __asm__ ("ins %0.d[0], %1.d[1]" + : "=w"(result) + : "w"(a) + : /* No clobbers */); + return result; +} -#define vext_s64(a, b, c) \ - __extension__ \ - ({ \ - int64x1_t b_ = (b); \ - int64x1_t a_ = (a); \ - int64x1_t result; \ - __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*8" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vget_high_u32 (uint32x4_t a) +{ + uint32x2_t result; + __asm__ ("ins %0.d[0], %1.d[1]" + : "=w"(result) + : "w"(a) + : /* No clobbers */); + return result; +} -#define vext_u8(a, b, c) \ - __extension__ \ - ({ \ - uint8x8_t b_ = (b); \ - uint8x8_t a_ = (a); \ - uint8x8_t result; \ - __asm__ ("ext %0.8b,%1.8b,%2.8b,%3" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vget_high_u64 (uint64x2_t a) +{ + uint64x1_t result; + __asm__ ("ins %0.d[0], %1.d[1]" + : "=w"(result) + : "w"(a) + : /* No clobbers */); + return result; +} -#define vext_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x4_t b_ = (b); \ - uint16x4_t a_ = (a); \ - uint16x4_t result; \ - __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*2" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vhsub_s8 (int8x8_t a, int8x8_t b) +{ + int8x8_t result; + __asm__ ("shsub %0.8b, %1.8b, %2.8b" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} -#define vext_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x2_t b_ = (b); \ - uint32x2_t a_ = (a); \ - uint32x2_t result; \ - __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*4" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ static __inline int16x4_t 
__attribute__ ((__always_inline__)) +vhsub_s16 (int16x4_t a, int16x4_t b) +{ + int16x4_t result; + __asm__ ("shsub %0.4h, %1.4h, %2.4h" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} -#define vext_u64(a, b, c) \ - __extension__ \ - ({ \ - uint64x1_t b_ = (b); \ - uint64x1_t a_ = (a); \ - uint64x1_t result; \ - __asm__ ("ext %0.8b, %1.8b, %2.8b, #%3*8" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vhsub_s32 (int32x2_t a, int32x2_t b) +{ + int32x2_t result; + __asm__ ("shsub %0.2s, %1.2s, %2.2s" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} -#define vextq_f32(a, b, c) \ - __extension__ \ - ({ \ - float32x4_t b_ = (b); \ - float32x4_t a_ = (a); \ - float32x4_t result; \ - __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*4" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vhsub_u8 (uint8x8_t a, uint8x8_t b) +{ + uint8x8_t result; + __asm__ ("uhsub %0.8b, %1.8b, %2.8b" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} -#define vextq_f64(a, b, c) \ - __extension__ \ - ({ \ - float64x2_t b_ = (b); \ - float64x2_t a_ = (a); \ - float64x2_t result; \ - __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*8" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vextq_p8(a, b, c) \ - __extension__ \ - ({ \ - poly8x16_t b_ = (b); \ - poly8x16_t a_ = (a); \ - poly8x16_t result; \ - __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vextq_p16(a, b, c) \ - __extension__ \ - ({ \ - poly16x8_t b_ = (b); \ - poly16x8_t a_ = (a); \ - poly16x8_t result; \ - __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*2" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vextq_s8(a, b, c) \ - __extension__ \ - ({ \ - int8x16_t b_ = (b); \ - int8x16_t a_ = (a); \ - int8x16_t result; \ - __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vextq_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x8_t b_ = (b); \ - int16x8_t a_ = (a); \ - int16x8_t result; \ - __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*2" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vextq_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x4_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*4" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vextq_s64(a, b, c) \ - __extension__ \ - ({ \ - int64x2_t b_ = (b); \ - int64x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*8" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vextq_u8(a, b, c) \ - __extension__ \ - ({ \ - uint8x16_t b_ = (b); \ - uint8x16_t a_ = (a); \ - uint8x16_t result; \ - __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vextq_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x8_t b_ = (b); \ - uint16x8_t a_ = (a); \ - uint16x8_t result; \ 
- __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*2" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vextq_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x4_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*4" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vextq_u64(a, b, c) \ - __extension__ \ - ({ \ - uint64x2_t b_ = (b); \ - uint64x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("ext %0.16b, %1.16b, %2.16b, #%3*8" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vfma_f32 (float32x2_t a, float32x2_t b, float32x2_t c) +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vhsub_u16 (uint16x4_t a, uint16x4_t b) { - float32x2_t result; - __asm__ ("fmla %0.2s,%2.2s,%3.2s" + uint16x4_t result; + __asm__ ("uhsub %0.4h, %1.4h, %2.4h" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a), "w"(b) : /* No clobbers */); return result; } -#define vfma_lane_f32(a, b, c, d) \ - __extension__ \ - ({ \ - float32x2_t c_ = (c); \ - float32x2_t b_ = (b); \ - float32x2_t a_ = (a); \ - float32x2_t result; \ - __asm__ ("fmla %0.2s,%2.2s,%3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vfmad_lane_f64(a, b, c) \ - __extension__ \ - ({ \ - float64x2_t b_ = (b); \ - float64_t a_ = (a); \ - float64_t result; \ - __asm__ ("fmla %d0,%d1,%2.d[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vfmaq_f32 (float32x4_t a, float32x4_t b, float32x4_t c) +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vhsub_u32 (uint32x2_t a, uint32x2_t b) { - float32x4_t result; - __asm__ ("fmla %0.4s,%2.4s,%3.4s" + uint32x2_t result; + __asm__ ("uhsub %0.2s, %1.2s, %2.2s" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vfmaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c) +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vhsubq_s8 (int8x16_t a, int8x16_t b) { - float64x2_t result; - __asm__ ("fmla %0.2d,%2.2d,%3.2d" + int8x16_t result; + __asm__ ("shsub %0.16b, %1.16b, %2.16b" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a), "w"(b) : /* No clobbers */); return result; } -#define vfmaq_lane_f32(a, b, c, d) \ - __extension__ \ - ({ \ - float32x4_t c_ = (c); \ - float32x4_t b_ = (b); \ - float32x4_t a_ = (a); \ - float32x4_t result; \ - __asm__ ("fmla %0.4s,%2.4s,%3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vfmaq_lane_f64(a, b, c, d) \ - __extension__ \ - ({ \ - float64x2_t c_ = (c); \ - float64x2_t b_ = (b); \ - float64x2_t a_ = (a); \ - float64x2_t result; \ - __asm__ ("fmla %0.2d,%2.2d,%3.d[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vfmas_lane_f32(a, b, c) \ - __extension__ \ - ({ \ - float32x4_t b_ = (b); \ - float32_t a_ = (a); \ - float32_t result; \ - __asm__ ("fmla %s0,%s1,%2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - 
result; \ - }) - -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vfma_n_f32 (float32x2_t a, float32x2_t b, float32_t c) +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vhsubq_s16 (int16x8_t a, int16x8_t b) { - float32x2_t result; - __asm__ ("fmla %0.2s, %2.2s, %3.s[0]" + int16x8_t result; + __asm__ ("shsub %0.8h, %1.8h, %2.8h" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vfmaq_n_f32 (float32x4_t a, float32x4_t b, float32_t c) +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vhsubq_s32 (int32x4_t a, int32x4_t b) { - float32x4_t result; - __asm__ ("fmla %0.4s, %2.4s, %3.s[0]" + int32x4_t result; + __asm__ ("shsub %0.4s, %1.4s, %2.4s" : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vfmaq_n_f64 (float64x2_t a, float64x2_t b, float64_t c) -{ - float64x2_t result; - __asm__ ("fmla %0.2d, %2.2d, %3.d[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vfms_f32 (float32x2_t a, float32x2_t b, float32x2_t c) +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vhsubq_u8 (uint8x16_t a, uint8x16_t b) { - float32x2_t result; - __asm__ ("fmls %0.2s,%2.2s,%3.2s" + uint8x16_t result; + __asm__ ("uhsub %0.16b, %1.16b, %2.16b" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a), "w"(b) : /* No clobbers */); return result; } -#define vfmsd_lane_f64(a, b, c) \ - __extension__ \ - ({ \ - float64x2_t b_ = (b); \ - float64_t a_ = (a); \ - float64_t result; \ - __asm__ ("fmls %d0,%d1,%2.d[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vfmsq_f32 (float32x4_t a, float32x4_t b, float32x4_t c) +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vhsubq_u16 (uint16x8_t a, uint16x8_t b) { - float32x4_t result; - __asm__ ("fmls %0.4s,%2.4s,%3.4s" + uint16x8_t result; + __asm__ ("uhsub %0.8h, %1.8h, %2.8h" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a), "w"(b) : /* No clobbers */); return result; } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vfmsq_f64 (float64x2_t a, float64x2_t b, float64x2_t c) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vhsubq_u32 (uint32x4_t a, uint32x4_t b) { - float64x2_t result; - __asm__ ("fmls %0.2d,%2.2d,%3.2d" + uint32x4_t result; + __asm__ ("uhsub %0.4s, %1.4s, %2.4s" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a), "w"(b) : /* No clobbers */); return result; } -#define vfmss_lane_f32(a, b, c) \ - __extension__ \ - ({ \ - float32x4_t b_ = (b); \ - float32_t a_ = (a); \ - float32_t result; \ - __asm__ ("fmls %s0,%s1,%2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vget_high_f32 (float32x4_t a) +vld1_dup_f32 (const float32_t * a) { float32x2_t result; - __asm__ ("ins %0.d[0], %1.d[1]" - : "=w"(result) - : "w"(a) - : /* No clobbers */); + __asm__ ("ld1r {%0.2s}, %1" + : "=w"(result) + : "Utv"(*a) + : /* No clobbers */); 
return result; } __extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) -vget_high_f64 (float64x2_t a) +vld1_dup_f64 (const float64_t * a) { float64x1_t result; - __asm__ ("ins %0.d[0], %1.d[1]" - : "=w"(result) - : "w"(a) - : /* No clobbers */); + __asm__ ("ld1r {%0.1d}, %1" + : "=w"(result) + : "Utv"(*a) + : /* No clobbers */); return result; } __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vget_high_p8 (poly8x16_t a) +vld1_dup_p8 (const poly8_t * a) { poly8x8_t result; - __asm__ ("ins %0.d[0], %1.d[1]" - : "=w"(result) - : "w"(a) - : /* No clobbers */); + __asm__ ("ld1r {%0.8b}, %1" + : "=w"(result) + : "Utv"(*a) + : /* No clobbers */); return result; } __extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) -vget_high_p16 (poly16x8_t a) +vld1_dup_p16 (const poly16_t * a) { poly16x4_t result; - __asm__ ("ins %0.d[0], %1.d[1]" - : "=w"(result) - : "w"(a) - : /* No clobbers */); + __asm__ ("ld1r {%0.4h}, %1" + : "=w"(result) + : "Utv"(*a) + : /* No clobbers */); return result; } __extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vget_high_s8 (int8x16_t a) +vld1_dup_s8 (const int8_t * a) { int8x8_t result; - __asm__ ("ins %0.d[0], %1.d[1]" - : "=w"(result) - : "w"(a) - : /* No clobbers */); + __asm__ ("ld1r {%0.8b}, %1" + : "=w"(result) + : "Utv"(*a) + : /* No clobbers */); return result; } __extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vget_high_s16 (int16x8_t a) +vld1_dup_s16 (const int16_t * a) { int16x4_t result; - __asm__ ("ins %0.d[0], %1.d[1]" - : "=w"(result) - : "w"(a) - : /* No clobbers */); + __asm__ ("ld1r {%0.4h}, %1" + : "=w"(result) + : "Utv"(*a) + : /* No clobbers */); return result; } __extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vget_high_s32 (int32x4_t a) +vld1_dup_s32 (const int32_t * a) { int32x2_t result; - __asm__ ("ins %0.d[0], %1.d[1]" - : "=w"(result) - : "w"(a) - : /* No clobbers */); + __asm__ ("ld1r {%0.2s}, %1" + : "=w"(result) + : "Utv"(*a) + : /* No clobbers */); return result; } __extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vget_high_s64 (int64x2_t a) +vld1_dup_s64 (const int64_t * a) { int64x1_t result; - __asm__ ("ins %0.d[0], %1.d[1]" - : "=w"(result) - : "w"(a) - : /* No clobbers */); + __asm__ ("ld1r {%0.1d}, %1" + : "=w"(result) + : "Utv"(*a) + : /* No clobbers */); return result; } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vget_high_u8 (uint8x16_t a) +vld1_dup_u8 (const uint8_t * a) { uint8x8_t result; - __asm__ ("ins %0.d[0], %1.d[1]" - : "=w"(result) - : "w"(a) - : /* No clobbers */); + __asm__ ("ld1r {%0.8b}, %1" + : "=w"(result) + : "Utv"(*a) + : /* No clobbers */); return result; } __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vget_high_u16 (uint16x8_t a) +vld1_dup_u16 (const uint16_t * a) { uint16x4_t result; - __asm__ ("ins %0.d[0], %1.d[1]" - : "=w"(result) - : "w"(a) - : /* No clobbers */); + __asm__ ("ld1r {%0.4h}, %1" + : "=w"(result) + : "Utv"(*a) + : /* No clobbers */); return result; } __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vget_high_u32 (uint32x4_t a) +vld1_dup_u32 (const uint32_t * a) { uint32x2_t result; - __asm__ ("ins %0.d[0], %1.d[1]" - : "=w"(result) - : "w"(a) - : /* No clobbers */); + __asm__ ("ld1r {%0.2s}, %1" + : "=w"(result) + : "Utv"(*a) + : /* No clobbers */); return result; } __extension__ static __inline uint64x1_t __attribute__ 
((__always_inline__)) -vget_high_u64 (uint64x2_t a) +vld1_dup_u64 (const uint64_t * a) { uint64x1_t result; - __asm__ ("ins %0.d[0], %1.d[1]" - : "=w"(result) - : "w"(a) - : /* No clobbers */); + __asm__ ("ld1r {%0.1d}, %1" + : "=w"(result) + : "Utv"(*a) + : /* No clobbers */); return result; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vhsub_s8 (int8x8_t a, int8x8_t b) -{ - int8x8_t result; - __asm__ ("shsub %0.8b, %1.8b, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} +#define vld1_lane_f32(a, b, c) \ + __extension__ \ + ({ \ + float32x2_t b_ = (b); \ + const float32_t * a_ = (a); \ + float32x2_t result; \ + __asm__ ("ld1 {%0.s}[%1], %2" \ + : "=w"(result) \ + : "i" (c), "Utv"(*a_), "0"(b_) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vhsub_s16 (int16x4_t a, int16x4_t b) -{ - int16x4_t result; - __asm__ ("shsub %0.4h, %1.4h, %2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} +#define vld1_lane_f64(a, b, c) \ + __extension__ \ + ({ \ + float64x1_t b_ = (b); \ + const float64_t * a_ = (a); \ + float64x1_t result; \ + __asm__ ("ld1 {%0.d}[%1], %2" \ + : "=w"(result) \ + : "i" (c), "Utv"(*a_), "0"(b_) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vhsub_s32 (int32x2_t a, int32x2_t b) -{ - int32x2_t result; - __asm__ ("shsub %0.2s, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} +#define vld1_lane_p8(a, b, c) \ + __extension__ \ + ({ \ + poly8x8_t b_ = (b); \ + const poly8_t * a_ = (a); \ + poly8x8_t result; \ + __asm__ ("ld1 {%0.b}[%1], %2" \ + : "=w"(result) \ + : "i" (c), "Utv"(*a_), "0"(b_) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vhsub_u8 (uint8x8_t a, uint8x8_t b) -{ - uint8x8_t result; - __asm__ ("uhsub %0.8b, %1.8b, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} +#define vld1_lane_p16(a, b, c) \ + __extension__ \ + ({ \ + poly16x4_t b_ = (b); \ + const poly16_t * a_ = (a); \ + poly16x4_t result; \ + __asm__ ("ld1 {%0.h}[%1], %2" \ + : "=w"(result) \ + : "i" (c), "Utv"(*a_), "0"(b_) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vhsub_u16 (uint16x4_t a, uint16x4_t b) -{ - uint16x4_t result; - __asm__ ("uhsub %0.4h, %1.4h, %2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} +#define vld1_lane_s8(a, b, c) \ + __extension__ \ + ({ \ + int8x8_t b_ = (b); \ + const int8_t * a_ = (a); \ + int8x8_t result; \ + __asm__ ("ld1 {%0.b}[%1], %2" \ + : "=w"(result) \ + : "i" (c), "Utv"(*a_), "0"(b_) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vhsub_u32 (uint32x2_t a, uint32x2_t b) -{ - uint32x2_t result; - __asm__ ("uhsub %0.2s, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} +#define vld1_lane_s16(a, b, c) \ + __extension__ \ + ({ \ + int16x4_t b_ = (b); \ + const int16_t * a_ = (a); \ + int16x4_t result; \ + __asm__ ("ld1 {%0.h}[%1], %2" \ + : "=w"(result) \ + : "i" (c), "Utv"(*a_), "0"(b_) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vhsubq_s8 (int8x16_t a, 
int8x16_t b) -{ - int8x16_t result; - __asm__ ("shsub %0.16b, %1.16b, %2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} +#define vld1_lane_s32(a, b, c) \ + __extension__ \ + ({ \ + int32x2_t b_ = (b); \ + const int32_t * a_ = (a); \ + int32x2_t result; \ + __asm__ ("ld1 {%0.s}[%1], %2" \ + : "=w"(result) \ + : "i" (c), "Utv"(*a_), "0"(b_) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vhsubq_s16 (int16x8_t a, int16x8_t b) -{ - int16x8_t result; - __asm__ ("shsub %0.8h, %1.8h, %2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} +#define vld1_lane_s64(a, b, c) \ + __extension__ \ + ({ \ + int64x1_t b_ = (b); \ + const int64_t * a_ = (a); \ + int64x1_t result; \ + __asm__ ("ld1 {%0.d}[%1], %2" \ + : "=w"(result) \ + : "i" (c), "Utv"(*a_), "0"(b_) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vhsubq_s32 (int32x4_t a, int32x4_t b) -{ - int32x4_t result; - __asm__ ("shsub %0.4s, %1.4s, %2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} +#define vld1_lane_u8(a, b, c) \ + __extension__ \ + ({ \ + uint8x8_t b_ = (b); \ + const uint8_t * a_ = (a); \ + uint8x8_t result; \ + __asm__ ("ld1 {%0.b}[%1], %2" \ + : "=w"(result) \ + : "i" (c), "Utv"(*a_), "0"(b_) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vhsubq_u8 (uint8x16_t a, uint8x16_t b) -{ - uint8x16_t result; - __asm__ ("uhsub %0.16b, %1.16b, %2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} +#define vld1_lane_u16(a, b, c) \ + __extension__ \ + ({ \ + uint16x4_t b_ = (b); \ + const uint16_t * a_ = (a); \ + uint16x4_t result; \ + __asm__ ("ld1 {%0.h}[%1], %2" \ + : "=w"(result) \ + : "i" (c), "Utv"(*a_), "0"(b_) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vhsubq_u16 (uint16x8_t a, uint16x8_t b) -{ - uint16x8_t result; - __asm__ ("uhsub %0.8h, %1.8h, %2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} +#define vld1_lane_u32(a, b, c) \ + __extension__ \ + ({ \ + uint32x2_t b_ = (b); \ + const uint32_t * a_ = (a); \ + uint32x2_t result; \ + __asm__ ("ld1 {%0.s}[%1], %2" \ + : "=w"(result) \ + : "i" (c), "Utv"(*a_), "0"(b_) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vhsubq_u32 (uint32x4_t a, uint32x4_t b) -{ - uint32x4_t result; - __asm__ ("uhsub %0.4s, %1.4s, %2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} +#define vld1_lane_u64(a, b, c) \ + __extension__ \ + ({ \ + uint64x1_t b_ = (b); \ + const uint64_t * a_ = (a); \ + uint64x1_t result; \ + __asm__ ("ld1 {%0.d}[%1], %2" \ + : "=w"(result) \ + : "i" (c), "Utv"(*a_), "0"(b_) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vld1_dup_f32 (const float32_t * a) +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vld1q_dup_f32 (const float32_t * a) { - float32x2_t result; - __asm__ ("ld1r {%0.2s}, %1" + float32x4_t result; + __asm__ ("ld1r {%0.4s}, %1" : "=w"(result) : "Utv"(*a) : /* No clobbers */); return result; } -__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) 
-vld1_dup_f64 (const float64_t * a) +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vld1q_dup_f64 (const float64_t * a) { - float64x1_t result; - __asm__ ("ld1r {%0.1d}, %1" + float64x2_t result; + __asm__ ("ld1r {%0.2d}, %1" : "=w"(result) : "Utv"(*a) : /* No clobbers */); return result; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vld1_dup_p8 (const poly8_t * a) +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vld1q_dup_p8 (const poly8_t * a) { - poly8x8_t result; - __asm__ ("ld1r {%0.8b}, %1" + poly8x16_t result; + __asm__ ("ld1r {%0.16b}, %1" : "=w"(result) : "Utv"(*a) : /* No clobbers */); return result; } -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) -vld1_dup_p16 (const poly16_t * a) +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vld1q_dup_p16 (const poly16_t * a) { - poly16x4_t result; - __asm__ ("ld1r {%0.4h}, %1" + poly16x8_t result; + __asm__ ("ld1r {%0.8h}, %1" : "=w"(result) : "Utv"(*a) : /* No clobbers */); return result; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vld1_dup_s8 (const int8_t * a) +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vld1q_dup_s8 (const int8_t * a) { - int8x8_t result; - __asm__ ("ld1r {%0.8b}, %1" + int8x16_t result; + __asm__ ("ld1r {%0.16b}, %1" : "=w"(result) : "Utv"(*a) : /* No clobbers */); return result; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vld1_dup_s16 (const int16_t * a) +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vld1q_dup_s16 (const int16_t * a) { - int16x4_t result; - __asm__ ("ld1r {%0.4h}, %1" + int16x8_t result; + __asm__ ("ld1r {%0.8h}, %1" : "=w"(result) : "Utv"(*a) : /* No clobbers */); return result; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vld1_dup_s32 (const int32_t * a) +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vld1q_dup_s32 (const int32_t * a) { - int32x2_t result; - __asm__ ("ld1r {%0.2s}, %1" + int32x4_t result; + __asm__ ("ld1r {%0.4s}, %1" : "=w"(result) : "Utv"(*a) : /* No clobbers */); return result; } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vld1_dup_s64 (const int64_t * a) +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vld1q_dup_s64 (const int64_t * a) { - int64x1_t result; - __asm__ ("ld1r {%0.1d}, %1" + int64x2_t result; + __asm__ ("ld1r {%0.2d}, %1" : "=w"(result) : "Utv"(*a) : /* No clobbers */); return result; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vld1_dup_u8 (const uint8_t * a) +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vld1q_dup_u8 (const uint8_t * a) { - uint8x8_t result; - __asm__ ("ld1r {%0.8b}, %1" + uint8x16_t result; + __asm__ ("ld1r {%0.16b}, %1" : "=w"(result) : "Utv"(*a) : /* No clobbers */); return result; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vld1_dup_u16 (const uint16_t * a) +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vld1q_dup_u16 (const uint16_t * a) { - uint16x4_t result; - __asm__ ("ld1r {%0.4h}, %1" + uint16x8_t result; + __asm__ ("ld1r {%0.8h}, %1" : "=w"(result) : "Utv"(*a) : /* No clobbers */); return result; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vld1_dup_u32 
(const uint32_t * a) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vld1q_dup_u32 (const uint32_t * a) { - uint32x2_t result; - __asm__ ("ld1r {%0.2s}, %1" + uint32x4_t result; + __asm__ ("ld1r {%0.4s}, %1" : "=w"(result) : "Utv"(*a) : /* No clobbers */); return result; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vld1_dup_u64 (const uint64_t * a) +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vld1q_dup_u64 (const uint64_t * a) { - uint64x1_t result; - __asm__ ("ld1r {%0.1d}, %1" + uint64x2_t result; + __asm__ ("ld1r {%0.2d}, %1" : "=w"(result) : "Utv"(*a) : /* No clobbers */); return result; } -#define vld1_lane_f32(a, b, c) \ +#define vld1q_lane_f32(a, b, c) \ __extension__ \ ({ \ - float32x2_t b_ = (b); \ + float32x4_t b_ = (b); \ const float32_t * a_ = (a); \ - float32x2_t result; \ + float32x4_t result; \ __asm__ ("ld1 {%0.s}[%1], %2" \ : "=w"(result) \ - : "i" (c), "Utv"(*a_), "0"(b_) \ + : "i"(c), "Utv"(*a_), "0"(b_) \ : /* No clobbers */); \ result; \ }) -#define vld1_lane_f64(a, b, c) \ +#define vld1q_lane_f64(a, b, c) \ __extension__ \ ({ \ - float64x1_t b_ = (b); \ + float64x2_t b_ = (b); \ const float64_t * a_ = (a); \ - float64x1_t result; \ + float64x2_t result; \ __asm__ ("ld1 {%0.d}[%1], %2" \ : "=w"(result) \ - : "i" (c), "Utv"(*a_), "0"(b_) \ + : "i"(c), "Utv"(*a_), "0"(b_) \ : /* No clobbers */); \ result; \ }) -#define vld1_lane_p8(a, b, c) \ +#define vld1q_lane_p8(a, b, c) \ __extension__ \ ({ \ - poly8x8_t b_ = (b); \ + poly8x16_t b_ = (b); \ const poly8_t * a_ = (a); \ - poly8x8_t result; \ + poly8x16_t result; \ __asm__ ("ld1 {%0.b}[%1], %2" \ : "=w"(result) \ - : "i" (c), "Utv"(*a_), "0"(b_) \ + : "i"(c), "Utv"(*a_), "0"(b_) \ : /* No clobbers */); \ result; \ }) -#define vld1_lane_p16(a, b, c) \ +#define vld1q_lane_p16(a, b, c) \ __extension__ \ ({ \ - poly16x4_t b_ = (b); \ + poly16x8_t b_ = (b); \ const poly16_t * a_ = (a); \ - poly16x4_t result; \ + poly16x8_t result; \ __asm__ ("ld1 {%0.h}[%1], %2" \ : "=w"(result) \ - : "i" (c), "Utv"(*a_), "0"(b_) \ + : "i"(c), "Utv"(*a_), "0"(b_) \ : /* No clobbers */); \ result; \ }) -#define vld1_lane_s8(a, b, c) \ +#define vld1q_lane_s8(a, b, c) \ __extension__ \ ({ \ - int8x8_t b_ = (b); \ + int8x16_t b_ = (b); \ const int8_t * a_ = (a); \ - int8x8_t result; \ + int8x16_t result; \ __asm__ ("ld1 {%0.b}[%1], %2" \ : "=w"(result) \ - : "i" (c), "Utv"(*a_), "0"(b_) \ + : "i"(c), "Utv"(*a_), "0"(b_) \ : /* No clobbers */); \ result; \ }) -#define vld1_lane_s16(a, b, c) \ +#define vld1q_lane_s16(a, b, c) \ __extension__ \ ({ \ - int16x4_t b_ = (b); \ + int16x8_t b_ = (b); \ const int16_t * a_ = (a); \ - int16x4_t result; \ + int16x8_t result; \ __asm__ ("ld1 {%0.h}[%1], %2" \ : "=w"(result) \ - : "i" (c), "Utv"(*a_), "0"(b_) \ + : "i"(c), "Utv"(*a_), "0"(b_) \ : /* No clobbers */); \ result; \ }) -#define vld1_lane_s32(a, b, c) \ +#define vld1q_lane_s32(a, b, c) \ __extension__ \ ({ \ - int32x2_t b_ = (b); \ + int32x4_t b_ = (b); \ const int32_t * a_ = (a); \ - int32x2_t result; \ + int32x4_t result; \ __asm__ ("ld1 {%0.s}[%1], %2" \ : "=w"(result) \ - : "i" (c), "Utv"(*a_), "0"(b_) \ + : "i"(c), "Utv"(*a_), "0"(b_) \ : /* No clobbers */); \ result; \ }) -#define vld1_lane_s64(a, b, c) \ +#define vld1q_lane_s64(a, b, c) \ __extension__ \ ({ \ - int64x1_t b_ = (b); \ + int64x2_t b_ = (b); \ const int64_t * a_ = (a); \ - int64x1_t result; \ + int64x2_t result; \ __asm__ ("ld1 {%0.d}[%1], %2" \ : "=w"(result) \ - : "i" (c), 
"Utv"(*a_), "0"(b_) \ + : "i"(c), "Utv"(*a_), "0"(b_) \ : /* No clobbers */); \ result; \ }) -#define vld1_lane_u8(a, b, c) \ +#define vld1q_lane_u8(a, b, c) \ __extension__ \ ({ \ - uint8x8_t b_ = (b); \ + uint8x16_t b_ = (b); \ const uint8_t * a_ = (a); \ - uint8x8_t result; \ + uint8x16_t result; \ __asm__ ("ld1 {%0.b}[%1], %2" \ : "=w"(result) \ - : "i" (c), "Utv"(*a_), "0"(b_) \ + : "i"(c), "Utv"(*a_), "0"(b_) \ : /* No clobbers */); \ result; \ }) -#define vld1_lane_u16(a, b, c) \ +#define vld1q_lane_u16(a, b, c) \ __extension__ \ ({ \ - uint16x4_t b_ = (b); \ + uint16x8_t b_ = (b); \ const uint16_t * a_ = (a); \ - uint16x4_t result; \ + uint16x8_t result; \ __asm__ ("ld1 {%0.h}[%1], %2" \ : "=w"(result) \ - : "i" (c), "Utv"(*a_), "0"(b_) \ + : "i"(c), "Utv"(*a_), "0"(b_) \ : /* No clobbers */); \ result; \ }) -#define vld1_lane_u32(a, b, c) \ +#define vld1q_lane_u32(a, b, c) \ __extension__ \ ({ \ - uint32x2_t b_ = (b); \ + uint32x4_t b_ = (b); \ const uint32_t * a_ = (a); \ - uint32x2_t result; \ + uint32x4_t result; \ __asm__ ("ld1 {%0.s}[%1], %2" \ : "=w"(result) \ - : "i" (c), "Utv"(*a_), "0"(b_) \ + : "i"(c), "Utv"(*a_), "0"(b_) \ : /* No clobbers */); \ result; \ }) -#define vld1_lane_u64(a, b, c) \ +#define vld1q_lane_u64(a, b, c) \ __extension__ \ ({ \ - uint64x1_t b_ = (b); \ + uint64x2_t b_ = (b); \ const uint64_t * a_ = (a); \ - uint64x1_t result; \ + uint64x2_t result; \ __asm__ ("ld1 {%0.d}[%1], %2" \ : "=w"(result) \ - : "i" (c), "Utv"(*a_), "0"(b_) \ + : "i"(c), "Utv"(*a_), "0"(b_) \ : /* No clobbers */); \ result; \ }) -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vld1q_dup_f32 (const float32_t * a) -{ - float32x4_t result; - __asm__ ("ld1r {%0.4s}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); - return result; -} +#define vmla_lane_f32(a, b, c, d) \ + __extension__ \ + ({ \ + float32x2_t c_ = (c); \ + float32x2_t b_ = (b); \ + float32x2_t a_ = (a); \ + float32x2_t result; \ + float32x2_t t1; \ + __asm__ ("fmul %1.2s, %3.2s, %4.s[%5]; fadd %0.2s, %0.2s, %1.2s" \ + : "=w"(result), "=w"(t1) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vld1q_dup_f64 (const float64_t * a) -{ - float64x2_t result; - __asm__ ("ld1r {%0.2d}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); - return result; -} +#define vmla_lane_s16(a, b, c, d) \ + __extension__ \ + ({ \ + int16x4_t c_ = (c); \ + int16x4_t b_ = (b); \ + int16x4_t a_ = (a); \ + int16x4_t result; \ + __asm__ ("mla %0.4h, %2.4h, %3.h[%4]" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vld1q_dup_p8 (const poly8_t * a) -{ - poly8x16_t result; - __asm__ ("ld1r {%0.16b}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); - return result; -} +#define vmla_lane_s32(a, b, c, d) \ + __extension__ \ + ({ \ + int32x2_t c_ = (c); \ + int32x2_t b_ = (b); \ + int32x2_t a_ = (a); \ + int32x2_t result; \ + __asm__ ("mla %0.2s, %2.2s, %3.s[%4]" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) -vld1q_dup_p16 (const poly16_t * a) -{ - poly16x8_t result; - __asm__ ("ld1r {%0.8h}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); - return result; -} +#define vmla_lane_u16(a, b, c, d) \ + 
__extension__ \ + ({ \ + uint16x4_t c_ = (c); \ + uint16x4_t b_ = (b); \ + uint16x4_t a_ = (a); \ + uint16x4_t result; \ + __asm__ ("mla %0.4h, %2.4h, %3.h[%4]" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vld1q_dup_s8 (const int8_t * a) -{ - int8x16_t result; - __asm__ ("ld1r {%0.16b}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); - return result; -} +#define vmla_lane_u32(a, b, c, d) \ + __extension__ \ + ({ \ + uint32x2_t c_ = (c); \ + uint32x2_t b_ = (b); \ + uint32x2_t a_ = (a); \ + uint32x2_t result; \ + __asm__ ("mla %0.2s, %2.2s, %3.s[%4]" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ + : /* No clobbers */); \ + result; \ + }) -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vld1q_dup_s16 (const int16_t * a) -{ - int16x8_t result; - __asm__ ("ld1r {%0.8h}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); +#define vmla_laneq_s16(a, b, c, d) \ + __extension__ \ + ({ \ + int16x8_t c_ = (c); \ + int16x4_t b_ = (b); \ + int16x4_t a_ = (a); \ + int16x4_t result; \ + __asm__ ("mla %0.4h, %2.4h, %3.h[%4]" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ + : /* No clobbers */); \ + result; \ + }) + +#define vmla_laneq_s32(a, b, c, d) \ + __extension__ \ + ({ \ + int32x4_t c_ = (c); \ + int32x2_t b_ = (b); \ + int32x2_t a_ = (a); \ + int32x2_t result; \ + __asm__ ("mla %0.2s, %2.2s, %3.s[%4]" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ + : /* No clobbers */); \ + result; \ + }) + +#define vmla_laneq_u16(a, b, c, d) \ + __extension__ \ + ({ \ + uint16x8_t c_ = (c); \ + uint16x4_t b_ = (b); \ + uint16x4_t a_ = (a); \ + uint16x4_t result; \ + __asm__ ("mla %0.4h, %2.4h, %3.h[%4]" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ + : /* No clobbers */); \ + result; \ + }) + +#define vmla_laneq_u32(a, b, c, d) \ + __extension__ \ + ({ \ + uint32x4_t c_ = (c); \ + uint32x2_t b_ = (b); \ + uint32x2_t a_ = (a); \ + uint32x2_t result; \ + __asm__ ("mla %0.2s, %2.2s, %3.s[%4]" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ + : /* No clobbers */); \ + result; \ + }) + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vmla_n_f32 (float32x2_t a, float32x2_t b, float32_t c) +{ + float32x2_t result; + float32x2_t t1; + __asm__ ("fmul %1.2s, %3.2s, %4.s[0]; fadd %0.2s, %0.2s, %1.2s" + : "=w"(result), "=w"(t1) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); return result; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vld1q_dup_s32 (const int32_t * a) +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vmla_n_s16 (int16x4_t a, int16x4_t b, int16_t c) { - int32x4_t result; - __asm__ ("ld1r {%0.4s}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); + int16x4_t result; + __asm__ ("mla %0.4h,%2.4h,%3.h[0]" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); return result; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vld1q_dup_s64 (const int64_t * a) +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vmla_n_s32 (int32x2_t a, int32x2_t b, int32_t c) { - int64x2_t result; - __asm__ ("ld1r {%0.2d}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); + int32x2_t result; + __asm__ ("mla %0.2s,%2.2s,%3.s[0]" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); return 
result; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vld1q_dup_u8 (const uint8_t * a) +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vmla_n_u16 (uint16x4_t a, uint16x4_t b, uint16_t c) { - uint8x16_t result; - __asm__ ("ld1r {%0.16b}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); + uint16x4_t result; + __asm__ ("mla %0.4h,%2.4h,%3.h[0]" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); return result; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vld1q_dup_u16 (const uint16_t * a) +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vmla_n_u32 (uint32x2_t a, uint32x2_t b, uint32_t c) { - uint16x8_t result; - __asm__ ("ld1r {%0.8h}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); + uint32x2_t result; + __asm__ ("mla %0.2s,%2.2s,%3.s[0]" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); return result; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vld1q_dup_u32 (const uint32_t * a) +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vmla_s8 (int8x8_t a, int8x8_t b, int8x8_t c) { - uint32x4_t result; - __asm__ ("ld1r {%0.4s}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); + int8x8_t result; + __asm__ ("mla %0.8b, %2.8b, %3.8b" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); return result; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vld1q_dup_u64 (const uint64_t * a) +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vmla_s16 (int16x4_t a, int16x4_t b, int16x4_t c) { - uint64x2_t result; - __asm__ ("ld1r {%0.2d}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); + int16x4_t result; + __asm__ ("mla %0.4h, %2.4h, %3.4h" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); return result; } -#define vld1q_lane_f32(a, b, c) \ +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vmla_s32 (int32x2_t a, int32x2_t b, int32x2_t c) +{ + int32x2_t result; + __asm__ ("mla %0.2s, %2.2s, %3.2s" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vmla_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c) +{ + uint8x8_t result; + __asm__ ("mla %0.8b, %2.8b, %3.8b" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vmla_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c) +{ + uint16x4_t result; + __asm__ ("mla %0.4h, %2.4h, %3.4h" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vmla_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c) +{ + uint32x2_t result; + __asm__ ("mla %0.2s, %2.2s, %3.2s" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +#define vmlal_high_lane_s16(a, b, c, d) \ __extension__ \ ({ \ - float32x4_t b_ = (b); \ - const float32_t * a_ = (a); \ - float32x4_t result; \ - __asm__ ("ld1 {%0.s}[%1], %2" \ + int16x8_t c_ = (c); \ + int16x8_t b_ = (b); \ + int32x4_t a_ = (a); \ + int32x4_t result; \ + __asm__ ("smlal2 %0.4s, %2.8h, %3.h[%4]" \ : "=w"(result) \ - : "i"(c), "Utv"(*a_), "0"(b_) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No 
clobbers */); \ result; \ }) -#define vld1q_lane_f64(a, b, c) \ +#define vmlal_high_lane_s32(a, b, c, d) \ __extension__ \ ({ \ - float64x2_t b_ = (b); \ - const float64_t * a_ = (a); \ - float64x2_t result; \ - __asm__ ("ld1 {%0.d}[%1], %2" \ + int32x4_t c_ = (c); \ + int32x4_t b_ = (b); \ + int64x2_t a_ = (a); \ + int64x2_t result; \ + __asm__ ("smlal2 %0.2d, %2.4s, %3.s[%4]" \ : "=w"(result) \ - : "i"(c), "Utv"(*a_), "0"(b_) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vld1q_lane_p8(a, b, c) \ +#define vmlal_high_lane_u16(a, b, c, d) \ __extension__ \ ({ \ - poly8x16_t b_ = (b); \ - const poly8_t * a_ = (a); \ - poly8x16_t result; \ - __asm__ ("ld1 {%0.b}[%1], %2" \ + uint16x8_t c_ = (c); \ + uint16x8_t b_ = (b); \ + uint32x4_t a_ = (a); \ + uint32x4_t result; \ + __asm__ ("umlal2 %0.4s, %2.8h, %3.h[%4]" \ : "=w"(result) \ - : "i"(c), "Utv"(*a_), "0"(b_) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vld1q_lane_p16(a, b, c) \ +#define vmlal_high_lane_u32(a, b, c, d) \ __extension__ \ ({ \ - poly16x8_t b_ = (b); \ - const poly16_t * a_ = (a); \ - poly16x8_t result; \ - __asm__ ("ld1 {%0.h}[%1], %2" \ + uint32x4_t c_ = (c); \ + uint32x4_t b_ = (b); \ + uint64x2_t a_ = (a); \ + uint64x2_t result; \ + __asm__ ("umlal2 %0.2d, %2.4s, %3.s[%4]" \ : "=w"(result) \ - : "i"(c), "Utv"(*a_), "0"(b_) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vld1q_lane_s8(a, b, c) \ +#define vmlal_high_laneq_s16(a, b, c, d) \ __extension__ \ ({ \ - int8x16_t b_ = (b); \ - const int8_t * a_ = (a); \ - int8x16_t result; \ - __asm__ ("ld1 {%0.b}[%1], %2" \ + int16x8_t c_ = (c); \ + int16x8_t b_ = (b); \ + int32x4_t a_ = (a); \ + int32x4_t result; \ + __asm__ ("smlal2 %0.4s, %2.8h, %3.h[%4]" \ : "=w"(result) \ - : "i"(c), "Utv"(*a_), "0"(b_) \ - : /* No clobbers */); \ - result; \ - }) - -#define vld1q_lane_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x8_t b_ = (b); \ - const int16_t * a_ = (a); \ - int16x8_t result; \ - __asm__ ("ld1 {%0.h}[%1], %2" \ - : "=w"(result) \ - : "i"(c), "Utv"(*a_), "0"(b_) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vld1q_lane_s32(a, b, c) \ +#define vmlal_high_laneq_s32(a, b, c, d) \ __extension__ \ ({ \ + int32x4_t c_ = (c); \ int32x4_t b_ = (b); \ - const int32_t * a_ = (a); \ - int32x4_t result; \ - __asm__ ("ld1 {%0.s}[%1], %2" \ - : "=w"(result) \ - : "i"(c), "Utv"(*a_), "0"(b_) \ - : /* No clobbers */); \ - result; \ - }) - -#define vld1q_lane_s64(a, b, c) \ - __extension__ \ - ({ \ - int64x2_t b_ = (b); \ - const int64_t * a_ = (a); \ + int64x2_t a_ = (a); \ int64x2_t result; \ - __asm__ ("ld1 {%0.d}[%1], %2" \ - : "=w"(result) \ - : "i"(c), "Utv"(*a_), "0"(b_) \ - : /* No clobbers */); \ - result; \ - }) - -#define vld1q_lane_u8(a, b, c) \ - __extension__ \ - ({ \ - uint8x16_t b_ = (b); \ - const uint8_t * a_ = (a); \ - uint8x16_t result; \ - __asm__ ("ld1 {%0.b}[%1], %2" \ + __asm__ ("smlal2 %0.2d, %2.4s, %3.s[%4]" \ : "=w"(result) \ - : "i"(c), "Utv"(*a_), "0"(b_) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vld1q_lane_u16(a, b, c) \ +#define vmlal_high_laneq_u16(a, b, c, d) \ __extension__ \ ({ \ + uint16x8_t c_ = (c); \ uint16x8_t b_ = (b); \ - const uint16_t * a_ = (a); \ - uint16x8_t result; \ - __asm__ ("ld1 {%0.h}[%1], %2" \ - : "=w"(result) \ - : "i"(c), "Utv"(*a_), "0"(b_) \ - : /* No clobbers */); \ - result; \ - }) - -#define vld1q_lane_u32(a, b, c) \ - 
__extension__ \ - ({ \ - uint32x4_t b_ = (b); \ - const uint32_t * a_ = (a); \ + uint32x4_t a_ = (a); \ uint32x4_t result; \ - __asm__ ("ld1 {%0.s}[%1], %2" \ + __asm__ ("umlal2 %0.4s, %2.8h, %3.h[%4]" \ : "=w"(result) \ - : "i"(c), "Utv"(*a_), "0"(b_) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vld1q_lane_u64(a, b, c) \ +#define vmlal_high_laneq_u32(a, b, c, d) \ __extension__ \ ({ \ - uint64x2_t b_ = (b); \ - const uint64_t * a_ = (a); \ + uint32x4_t c_ = (c); \ + uint32x4_t b_ = (b); \ + uint64x2_t a_ = (a); \ uint64x2_t result; \ - __asm__ ("ld1 {%0.d}[%1], %2" \ + __asm__ ("umlal2 %0.2d, %2.4s, %3.s[%4]" \ : "=w"(result) \ - : "i"(c), "Utv"(*a_), "0"(b_) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmla_lane_f32(a, b, c, d) \ - __extension__ \ - ({ \ - float32x2_t c_ = (c); \ - float32x2_t b_ = (b); \ - float32x2_t a_ = (a); \ - float32x2_t result; \ - float32x2_t t1; \ - __asm__ ("fmul %1.2s, %3.2s, %4.s[%5]; fadd %0.2s, %0.2s, %1.2s" \ - : "=w"(result), "=w"(t1) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmla_lane_s16(a, b, c, d) \ +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmlal_high_n_s16 (int32x4_t a, int16x8_t b, int16_t c) +{ + int32x4_t result; + __asm__ ("smlal2 %0.4s,%2.8h,%3.h[0]" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmlal_high_n_s32 (int64x2_t a, int32x4_t b, int32_t c) +{ + int64x2_t result; + __asm__ ("smlal2 %0.2d,%2.4s,%3.s[0]" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmlal_high_n_u16 (uint32x4_t a, uint16x8_t b, uint16_t c) +{ + uint32x4_t result; + __asm__ ("umlal2 %0.4s,%2.8h,%3.h[0]" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmlal_high_n_u32 (uint64x2_t a, uint32x4_t b, uint32_t c) +{ + uint64x2_t result; + __asm__ ("umlal2 %0.2d,%2.4s,%3.s[0]" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmlal_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c) +{ + int16x8_t result; + __asm__ ("smlal2 %0.8h,%2.16b,%3.16b" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmlal_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c) +{ + int32x4_t result; + __asm__ ("smlal2 %0.4s,%2.8h,%3.8h" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmlal_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c) +{ + int64x2_t result; + __asm__ ("smlal2 %0.2d,%2.4s,%3.4s" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmlal_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c) +{ + uint16x8_t result; + __asm__ ("umlal2 %0.8h,%2.16b,%3.16b" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmlal_high_u16 
(uint32x4_t a, uint16x8_t b, uint16x8_t c) +{ + uint32x4_t result; + __asm__ ("umlal2 %0.4s,%2.8h,%3.8h" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmlal_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c) +{ + uint64x2_t result; + __asm__ ("umlal2 %0.2d,%2.4s,%3.4s" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +#define vmlal_lane_s16(a, b, c, d) \ __extension__ \ ({ \ int16x4_t c_ = (c); \ int16x4_t b_ = (b); \ - int16x4_t a_ = (a); \ - int16x4_t result; \ - __asm__ ("mla %0.4h, %2.4h, %3.h[%4]" \ + int32x4_t a_ = (a); \ + int32x4_t result; \ + __asm__ ("smlal %0.4s,%2.4h,%3.h[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmla_lane_s32(a, b, c, d) \ +#define vmlal_lane_s32(a, b, c, d) \ __extension__ \ ({ \ int32x2_t c_ = (c); \ int32x2_t b_ = (b); \ - int32x2_t a_ = (a); \ - int32x2_t result; \ - __asm__ ("mla %0.2s, %2.2s, %3.s[%4]" \ + int64x2_t a_ = (a); \ + int64x2_t result; \ + __asm__ ("smlal %0.2d,%2.2s,%3.s[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmla_lane_u16(a, b, c, d) \ +#define vmlal_lane_u16(a, b, c, d) \ __extension__ \ ({ \ uint16x4_t c_ = (c); \ uint16x4_t b_ = (b); \ - uint16x4_t a_ = (a); \ - uint16x4_t result; \ - __asm__ ("mla %0.4h, %2.4h, %3.h[%4]" \ + uint32x4_t a_ = (a); \ + uint32x4_t result; \ + __asm__ ("umlal %0.4s,%2.4h,%3.h[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmla_lane_u32(a, b, c, d) \ +#define vmlal_lane_u32(a, b, c, d) \ __extension__ \ ({ \ uint32x2_t c_ = (c); \ uint32x2_t b_ = (b); \ - uint32x2_t a_ = (a); \ - uint32x2_t result; \ - __asm__ ("mla %0.2s, %2.2s, %3.s[%4]" \ + uint64x2_t a_ = (a); \ + uint64x2_t result; \ + __asm__ ("umlal %0.2d, %2.2s, %3.s[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmla_laneq_s16(a, b, c, d) \ +#define vmlal_laneq_s16(a, b, c, d) \ __extension__ \ ({ \ int16x8_t c_ = (c); \ int16x4_t b_ = (b); \ - int16x4_t a_ = (a); \ - int16x4_t result; \ - __asm__ ("mla %0.4h, %2.4h, %3.h[%4]" \ + int32x4_t a_ = (a); \ + int32x4_t result; \ + __asm__ ("smlal %0.4s, %2.4h, %3.h[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmla_laneq_s32(a, b, c, d) \ +#define vmlal_laneq_s32(a, b, c, d) \ __extension__ \ ({ \ int32x4_t c_ = (c); \ int32x2_t b_ = (b); \ - int32x2_t a_ = (a); \ - int32x2_t result; \ - __asm__ ("mla %0.2s, %2.2s, %3.s[%4]" \ + int64x2_t a_ = (a); \ + int64x2_t result; \ + __asm__ ("smlal %0.2d, %2.2s, %3.s[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmla_laneq_u16(a, b, c, d) \ +#define vmlal_laneq_u16(a, b, c, d) \ __extension__ \ ({ \ uint16x8_t c_ = (c); \ uint16x4_t b_ = (b); \ - uint16x4_t a_ = (a); \ - uint16x4_t result; \ - __asm__ ("mla %0.4h, %2.4h, %3.h[%4]" \ + uint32x4_t a_ = (a); \ + uint32x4_t result; \ + __asm__ ("umlal %0.4s, %2.4h, %3.h[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmla_laneq_u32(a, b, c, d) \ +#define vmlal_laneq_u32(a, b, c, d) \ __extension__ \ ({ \ uint32x4_t c_ = (c); \ uint32x2_t b_ = (b); \ - uint32x2_t a_ = (a); \ - uint32x2_t result; \ - __asm__ ("mla %0.2s, 
%2.2s, %3.s[%4]" \ + uint64x2_t a_ = (a); \ + uint64x2_t result; \ + __asm__ ("umlal %0.2d, %2.2s, %3.s[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vmla_n_f32 (float32x2_t a, float32x2_t b, float32_t c) -{ - float32x2_t result; - float32x2_t t1; - __asm__ ("fmul %1.2s, %3.2s, %4.s[0]; fadd %0.2s, %0.2s, %1.2s" - : "=w"(result), "=w"(t1) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vmla_n_s16 (int16x4_t a, int16x4_t b, int16_t c) +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmlal_n_s16 (int32x4_t a, int16x4_t b, int16_t c) { - int16x4_t result; - __asm__ ("mla %0.4h,%2.4h,%3.h[0]" + int32x4_t result; + __asm__ ("smlal %0.4s,%2.4h,%3.h[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vmla_n_s32 (int32x2_t a, int32x2_t b, int32_t c) +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmlal_n_s32 (int64x2_t a, int32x2_t b, int32_t c) { - int32x2_t result; - __asm__ ("mla %0.2s,%2.2s,%3.s[0]" + int64x2_t result; + __asm__ ("smlal %0.2d,%2.2s,%3.s[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vmla_n_u16 (uint16x4_t a, uint16x4_t b, uint16_t c) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmlal_n_u16 (uint32x4_t a, uint16x4_t b, uint16_t c) { - uint16x4_t result; - __asm__ ("mla %0.4h,%2.4h,%3.h[0]" + uint32x4_t result; + __asm__ ("umlal %0.4s,%2.4h,%3.h[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vmla_n_u32 (uint32x2_t a, uint32x2_t b, uint32_t c) +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmlal_n_u32 (uint64x2_t a, uint32x2_t b, uint32_t c) { - uint32x2_t result; - __asm__ ("mla %0.2s,%2.2s,%3.s[0]" + uint64x2_t result; + __asm__ ("umlal %0.2d,%2.2s,%3.s[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vmla_s8 (int8x8_t a, int8x8_t b, int8x8_t c) +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmlal_s8 (int16x8_t a, int8x8_t b, int8x8_t c) { - int8x8_t result; - __asm__ ("mla %0.8b, %2.8b, %3.8b" + int16x8_t result; + __asm__ ("smlal %0.8h,%2.8b,%3.8b" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vmla_s16 (int16x4_t a, int16x4_t b, int16x4_t c) +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmlal_s16 (int32x4_t a, int16x4_t b, int16x4_t c) { - int16x4_t result; - __asm__ ("mla %0.4h, %2.4h, %3.4h" + int32x4_t result; + __asm__ ("smlal %0.4s,%2.4h,%3.4h" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vmla_s32 (int32x2_t a, int32x2_t b, int32x2_t c) +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmlal_s32 (int64x2_t a, int32x2_t b, int32x2_t c) { - int32x2_t result; - __asm__ 
("mla %0.2s, %2.2s, %3.2s" + int64x2_t result; + __asm__ ("smlal %0.2d,%2.2s,%3.2s" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vmla_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c) +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmlal_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c) { - uint8x8_t result; - __asm__ ("mla %0.8b, %2.8b, %3.8b" + uint16x8_t result; + __asm__ ("umlal %0.8h,%2.8b,%3.8b" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vmla_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmlal_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c) { - uint16x4_t result; - __asm__ ("mla %0.4h, %2.4h, %3.4h" + uint32x4_t result; + __asm__ ("umlal %0.4s,%2.4h,%3.4h" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vmla_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c) +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmlal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c) { - uint32x2_t result; - __asm__ ("mla %0.2s, %2.2s, %3.2s" + uint64x2_t result; + __asm__ ("umlal %0.2d,%2.2s,%3.2s" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -#define vmlal_high_lane_s16(a, b, c, d) \ +#define vmlaq_lane_f32(a, b, c, d) \ + __extension__ \ + ({ \ + float32x4_t c_ = (c); \ + float32x4_t b_ = (b); \ + float32x4_t a_ = (a); \ + float32x4_t result; \ + float32x4_t t1; \ + __asm__ ("fmul %1.4s, %3.4s, %4.s[%5]; fadd %0.4s, %0.4s, %1.4s" \ + : "=w"(result), "=w"(t1) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ + : /* No clobbers */); \ + result; \ + }) + +#define vmlaq_lane_s16(a, b, c, d) \ __extension__ \ ({ \ int16x8_t c_ = (c); \ int16x8_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smlal2 %0.4s, %2.8h, %3.h[%4]" \ + int16x8_t a_ = (a); \ + int16x8_t result; \ + __asm__ ("mla %0.8h, %2.8h, %3.h[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlal_high_lane_s32(a, b, c, d) \ +#define vmlaq_lane_s32(a, b, c, d) \ __extension__ \ ({ \ int32x4_t c_ = (c); \ int32x4_t b_ = (b); \ - int64x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smlal2 %0.2d, %2.4s, %3.s[%4]" \ + int32x4_t a_ = (a); \ + int32x4_t result; \ + __asm__ ("mla %0.4s, %2.4s, %3.s[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlal_high_lane_u16(a, b, c, d) \ +#define vmlaq_lane_u16(a, b, c, d) \ __extension__ \ ({ \ uint16x8_t c_ = (c); \ uint16x8_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umlal2 %0.4s, %2.8h, %3.h[%4]" \ + uint16x8_t a_ = (a); \ + uint16x8_t result; \ + __asm__ ("mla %0.8h, %2.8h, %3.h[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlal_high_lane_u32(a, b, c, d) \ +#define vmlaq_lane_u32(a, b, c, d) \ __extension__ \ ({ \ uint32x4_t c_ = (c); \ uint32x4_t b_ = (b); \ - uint64x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umlal2 %0.2d, %2.4s, %3.s[%4]" \ + uint32x4_t a_ = (a); \ + uint32x4_t result; \ + __asm__ ("mla %0.4s, %2.4s, %3.s[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No 
clobbers */); \ result; \ }) -#define vmlal_high_laneq_s16(a, b, c, d) \ +#define vmlaq_laneq_s16(a, b, c, d) \ __extension__ \ ({ \ int16x8_t c_ = (c); \ int16x8_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smlal2 %0.4s, %2.8h, %3.h[%4]" \ + int16x8_t a_ = (a); \ + int16x8_t result; \ + __asm__ ("mla %0.8h, %2.8h, %3.h[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlal_high_laneq_s32(a, b, c, d) \ +#define vmlaq_laneq_s32(a, b, c, d) \ __extension__ \ ({ \ int32x4_t c_ = (c); \ int32x4_t b_ = (b); \ - int64x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smlal2 %0.2d, %2.4s, %3.s[%4]" \ + int32x4_t a_ = (a); \ + int32x4_t result; \ + __asm__ ("mla %0.4s, %2.4s, %3.s[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlal_high_laneq_u16(a, b, c, d) \ +#define vmlaq_laneq_u16(a, b, c, d) \ __extension__ \ ({ \ uint16x8_t c_ = (c); \ uint16x8_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umlal2 %0.4s, %2.8h, %3.h[%4]" \ + uint16x8_t a_ = (a); \ + uint16x8_t result; \ + __asm__ ("mla %0.8h, %2.8h, %3.h[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlal_high_laneq_u32(a, b, c, d) \ +#define vmlaq_laneq_u32(a, b, c, d) \ __extension__ \ ({ \ uint32x4_t c_ = (c); \ uint32x4_t b_ = (b); \ - uint64x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umlal2 %0.2d, %2.4s, %3.s[%4]" \ + uint32x4_t a_ = (a); \ + uint32x4_t result; \ + __asm__ ("mla %0.4s, %2.4s, %3.s[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmlal_high_n_s16 (int32x4_t a, int16x8_t b, int16_t c) +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vmlaq_n_f32 (float32x4_t a, float32x4_t b, float32_t c) { - int32x4_t result; - __asm__ ("smlal2 %0.4s,%2.8h,%3.h[0]" - : "=w"(result) + float32x4_t result; + float32x4_t t1; + __asm__ ("fmul %1.4s, %3.4s, %4.s[0]; fadd %0.4s, %0.4s, %1.4s" + : "=w"(result), "=w"(t1) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmlal_high_n_s32 (int64x2_t a, int32x4_t b, int32_t c) +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vmlaq_n_f64 (float64x2_t a, float64x2_t b, float64_t c) { - int64x2_t result; - __asm__ ("smlal2 %0.2d,%2.4s,%3.s[0]" - : "=w"(result) + float64x2_t result; + float64x2_t t1; + __asm__ ("fmul %1.2d, %3.2d, %4.d[0]; fadd %0.2d, %0.2d, %1.2d" + : "=w"(result), "=w"(t1) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmlal_high_n_u16 (uint32x4_t a, uint16x8_t b, uint16_t c) +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmlaq_n_s16 (int16x8_t a, int16x8_t b, int16_t c) { - uint32x4_t result; - __asm__ ("umlal2 %0.4s,%2.8h,%3.h[0]" + int16x8_t result; + __asm__ ("mla %0.8h,%2.8h,%3.h[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmlal_high_n_u32 (uint64x2_t a, uint32x4_t b, uint32_t c) +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmlaq_n_s32 (int32x4_t a, int32x4_t b, int32_t c) { - uint64x2_t 
result; - __asm__ ("umlal2 %0.2d,%2.4s,%3.s[0]" + int32x4_t result; + __asm__ ("mla %0.4s,%2.4s,%3.s[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmlal_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c) +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmlaq_n_u16 (uint16x8_t a, uint16x8_t b, uint16_t c) { - int16x8_t result; - __asm__ ("smlal2 %0.8h,%2.16b,%3.16b" + uint16x8_t result; + __asm__ ("mla %0.8h,%2.8h,%3.h[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmlal_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmlaq_n_u32 (uint32x4_t a, uint32x4_t b, uint32_t c) { - int32x4_t result; - __asm__ ("smlal2 %0.4s,%2.8h,%3.8h" + uint32x4_t result; + __asm__ ("mla %0.4s,%2.4s,%3.s[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmlal_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c) +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vmlaq_s8 (int8x16_t a, int8x16_t b, int8x16_t c) { - int64x2_t result; - __asm__ ("smlal2 %0.2d,%2.4s,%3.4s" + int8x16_t result; + __asm__ ("mla %0.16b, %2.16b, %3.16b" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmlal_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c) +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmlaq_s16 (int16x8_t a, int16x8_t b, int16x8_t c) { - uint16x8_t result; - __asm__ ("umlal2 %0.8h,%2.16b,%3.16b" + int16x8_t result; + __asm__ ("mla %0.8h, %2.8h, %3.8h" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmlal_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c) +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmlaq_s32 (int32x4_t a, int32x4_t b, int32x4_t c) { - uint32x4_t result; - __asm__ ("umlal2 %0.4s,%2.8h,%3.8h" + int32x4_t result; + __asm__ ("mla %0.4s, %2.4s, %3.4s" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmlal_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c) +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vmlaq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c) { - uint64x2_t result; - __asm__ ("umlal2 %0.2d,%2.4s,%3.4s" + uint8x16_t result; + __asm__ ("mla %0.16b, %2.16b, %3.16b" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -#define vmlal_lane_s16(a, b, c, d) \ - __extension__ \ - ({ \ - int16x4_t c_ = (c); \ - int16x4_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smlal %0.4s,%2.4h,%3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlal_lane_s32(a, b, c, d) \ - __extension__ \ - ({ \ - int32x2_t c_ = (c); \ - int32x2_t b_ = (b); \ - int64x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smlal %0.2d,%2.2s,%3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - 
result; \ - }) +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmlaq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c) +{ + uint16x8_t result; + __asm__ ("mla %0.8h, %2.8h, %3.8h" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} -#define vmlal_lane_u16(a, b, c, d) \ - __extension__ \ - ({ \ - uint16x4_t c_ = (c); \ - uint16x4_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umlal %0.4s,%2.4h,%3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmlaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c) +{ + uint32x4_t result; + __asm__ ("mla %0.4s, %2.4s, %3.4s" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} -#define vmlal_lane_u32(a, b, c, d) \ +#define vmls_lane_f32(a, b, c, d) \ __extension__ \ ({ \ - uint32x2_t c_ = (c); \ - uint32x2_t b_ = (b); \ - uint64x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umlal %0.2d, %2.2s, %3.s[%4]" \ - : "=w"(result) \ + float32x2_t c_ = (c); \ + float32x2_t b_ = (b); \ + float32x2_t a_ = (a); \ + float32x2_t result; \ + float32x2_t t1; \ + __asm__ ("fmul %1.2s, %3.2s, %4.s[%5]; fsub %0.2s, %0.2s, %1.2s" \ + : "=w"(result), "=w"(t1) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlal_laneq_s16(a, b, c, d) \ +#define vmls_lane_s16(a, b, c, d) \ __extension__ \ ({ \ - int16x8_t c_ = (c); \ + int16x4_t c_ = (c); \ int16x4_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smlal %0.4s, %2.4h, %3.h[%4]" \ + int16x4_t a_ = (a); \ + int16x4_t result; \ + __asm__ ("mls %0.4h,%2.4h,%3.h[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlal_laneq_s32(a, b, c, d) \ +#define vmls_lane_s32(a, b, c, d) \ __extension__ \ ({ \ - int32x4_t c_ = (c); \ + int32x2_t c_ = (c); \ int32x2_t b_ = (b); \ - int64x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smlal %0.2d, %2.2s, %3.s[%4]" \ + int32x2_t a_ = (a); \ + int32x2_t result; \ + __asm__ ("mls %0.2s,%2.2s,%3.s[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlal_laneq_u16(a, b, c, d) \ +#define vmls_lane_u16(a, b, c, d) \ __extension__ \ ({ \ - uint16x8_t c_ = (c); \ + uint16x4_t c_ = (c); \ uint16x4_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umlal %0.4s, %2.4h, %3.h[%4]" \ + uint16x4_t a_ = (a); \ + uint16x4_t result; \ + __asm__ ("mls %0.4h,%2.4h,%3.h[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlal_laneq_u32(a, b, c, d) \ +#define vmls_lane_u32(a, b, c, d) \ __extension__ \ ({ \ - uint32x4_t c_ = (c); \ + uint32x2_t c_ = (c); \ uint32x2_t b_ = (b); \ - uint64x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umlal %0.2d, %2.2s, %3.s[%4]" \ + uint32x2_t a_ = (a); \ + uint32x2_t result; \ + __asm__ ("mls %0.2s,%2.2s,%3.s[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmlal_n_s16 (int32x4_t a, int16x4_t b, int16_t c) +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vmls_n_f32 (float32x2_t a, float32x2_t b, float32_t c) { - int32x4_t result; - __asm__ ("smlal %0.4s,%2.4h,%3.h[0]" - : "=w"(result) + float32x2_t result; + 
float32x2_t t1; + __asm__ ("fmul %1.2s, %3.2s, %4.s[0]; fsub %0.2s, %0.2s, %1.2s" + : "=w"(result), "=w"(t1) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmlal_n_s32 (int64x2_t a, int32x2_t b, int32_t c) +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vmls_n_s16 (int16x4_t a, int16x4_t b, int16_t c) { - int64x2_t result; - __asm__ ("smlal %0.2d,%2.2s,%3.s[0]" + int16x4_t result; + __asm__ ("mls %0.4h, %2.4h, %3.h[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmlal_n_u16 (uint32x4_t a, uint16x4_t b, uint16_t c) +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vmls_n_s32 (int32x2_t a, int32x2_t b, int32_t c) { - uint32x4_t result; - __asm__ ("umlal %0.4s,%2.4h,%3.h[0]" + int32x2_t result; + __asm__ ("mls %0.2s, %2.2s, %3.s[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmlal_n_u32 (uint64x2_t a, uint32x2_t b, uint32_t c) +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vmls_n_u16 (uint16x4_t a, uint16x4_t b, uint16_t c) { - uint64x2_t result; - __asm__ ("umlal %0.2d,%2.2s,%3.s[0]" + uint16x4_t result; + __asm__ ("mls %0.4h, %2.4h, %3.h[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmlal_s8 (int16x8_t a, int8x8_t b, int8x8_t c) +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vmls_n_u32 (uint32x2_t a, uint32x2_t b, uint32_t c) { - int16x8_t result; - __asm__ ("smlal %0.8h,%2.8b,%3.8b" + uint32x2_t result; + __asm__ ("mls %0.2s, %2.2s, %3.s[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmlal_s16 (int32x4_t a, int16x4_t b, int16x4_t c) +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vmls_s8 (int8x8_t a, int8x8_t b, int8x8_t c) { - int32x4_t result; - __asm__ ("smlal %0.4s,%2.4h,%3.4h" + int8x8_t result; + __asm__ ("mls %0.8b,%2.8b,%3.8b" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmlal_s32 (int64x2_t a, int32x2_t b, int32x2_t c) +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vmls_s16 (int16x4_t a, int16x4_t b, int16x4_t c) { - int64x2_t result; - __asm__ ("smlal %0.2d,%2.2s,%3.2s" + int16x4_t result; + __asm__ ("mls %0.4h,%2.4h,%3.4h" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmlal_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c) +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vmls_s32 (int32x2_t a, int32x2_t b, int32x2_t c) { - uint16x8_t result; - __asm__ ("umlal %0.8h,%2.8b,%3.8b" + int32x2_t result; + __asm__ ("mls %0.2s,%2.2s,%3.2s" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmlal_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c) +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) 
+vmls_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c) { - uint32x4_t result; - __asm__ ("umlal %0.4s,%2.4h,%3.4h" + uint8x8_t result; + __asm__ ("mls %0.8b,%2.8b,%3.8b" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmlal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c) +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vmls_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c) { - uint64x2_t result; - __asm__ ("umlal %0.2d,%2.2s,%3.2s" + uint16x4_t result; + __asm__ ("mls %0.4h,%2.4h,%3.4h" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -#define vmlaq_lane_f32(a, b, c, d) \ - __extension__ \ - ({ \ - float32x4_t c_ = (c); \ - float32x4_t b_ = (b); \ - float32x4_t a_ = (a); \ - float32x4_t result; \ - float32x4_t t1; \ - __asm__ ("fmul %1.4s, %3.4s, %4.s[%5]; fadd %0.4s, %0.4s, %1.4s" \ - : "=w"(result), "=w"(t1) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vmls_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c) +{ + uint32x2_t result; + __asm__ ("mls %0.2s,%2.2s,%3.2s" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} -#define vmlaq_lane_s16(a, b, c, d) \ +#define vmlsl_high_lane_s16(a, b, c, d) \ __extension__ \ ({ \ int16x8_t c_ = (c); \ int16x8_t b_ = (b); \ - int16x8_t a_ = (a); \ - int16x8_t result; \ - __asm__ ("mla %0.8h, %2.8h, %3.h[%4]" \ + int32x4_t a_ = (a); \ + int32x4_t result; \ + __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlaq_lane_s32(a, b, c, d) \ +#define vmlsl_high_lane_s32(a, b, c, d) \ __extension__ \ ({ \ int32x4_t c_ = (c); \ int32x4_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("mla %0.4s, %2.4s, %3.s[%4]" \ + int64x2_t a_ = (a); \ + int64x2_t result; \ + __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlaq_lane_u16(a, b, c, d) \ +#define vmlsl_high_lane_u16(a, b, c, d) \ __extension__ \ ({ \ uint16x8_t c_ = (c); \ uint16x8_t b_ = (b); \ - uint16x8_t a_ = (a); \ - uint16x8_t result; \ - __asm__ ("mla %0.8h, %2.8h, %3.h[%4]" \ + uint32x4_t a_ = (a); \ + uint32x4_t result; \ + __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlaq_lane_u32(a, b, c, d) \ +#define vmlsl_high_lane_u32(a, b, c, d) \ __extension__ \ ({ \ uint32x4_t c_ = (c); \ uint32x4_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("mla %0.4s, %2.4s, %3.s[%4]" \ + uint64x2_t a_ = (a); \ + uint64x2_t result; \ + __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlaq_laneq_s16(a, b, c, d) \ +#define vmlsl_high_laneq_s16(a, b, c, d) \ __extension__ \ ({ \ int16x8_t c_ = (c); \ int16x8_t b_ = (b); \ - int16x8_t a_ = (a); \ - int16x8_t result; \ - __asm__ ("mla %0.8h, %2.8h, %3.h[%4]" \ + int32x4_t a_ = (a); \ + int32x4_t result; \ + __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlaq_laneq_s32(a, b, c, d) \ +#define vmlsl_high_laneq_s32(a, b, c, d) \ __extension__ \ ({ \ 
int32x4_t c_ = (c); \ int32x4_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("mla %0.4s, %2.4s, %3.s[%4]" \ + int64x2_t a_ = (a); \ + int64x2_t result; \ + __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlaq_laneq_u16(a, b, c, d) \ +#define vmlsl_high_laneq_u16(a, b, c, d) \ __extension__ \ ({ \ uint16x8_t c_ = (c); \ uint16x8_t b_ = (b); \ - uint16x8_t a_ = (a); \ - uint16x8_t result; \ - __asm__ ("mla %0.8h, %2.8h, %3.h[%4]" \ + uint32x4_t a_ = (a); \ + uint32x4_t result; \ + __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlaq_laneq_u32(a, b, c, d) \ +#define vmlsl_high_laneq_u32(a, b, c, d) \ __extension__ \ ({ \ uint32x4_t c_ = (c); \ uint32x4_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("mla %0.4s, %2.4s, %3.s[%4]" \ + uint64x2_t a_ = (a); \ + uint64x2_t result; \ + __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vmlaq_n_f32 (float32x4_t a, float32x4_t b, float32_t c) -{ - float32x4_t result; - float32x4_t t1; - __asm__ ("fmul %1.4s, %3.4s, %4.s[0]; fadd %0.4s, %0.4s, %1.4s" - : "=w"(result), "=w"(t1) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vmlaq_n_f64 (float64x2_t a, float64x2_t b, float64_t c) -{ - float64x2_t result; - float64x2_t t1; - __asm__ ("fmul %1.2d, %3.2d, %4.d[0]; fadd %0.2d, %0.2d, %1.2d" - : "=w"(result), "=w"(t1) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmlaq_n_s16 (int16x8_t a, int16x8_t b, int16_t c) -{ - int16x8_t result; - __asm__ ("mla %0.8h,%2.8h,%3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - __extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmlaq_n_s32 (int32x4_t a, int32x4_t b, int32_t c) +vmlsl_high_n_s16 (int32x4_t a, int16x8_t b, int16_t c) { int32x4_t result; - __asm__ ("mla %0.4s,%2.4s,%3.s[0]" + __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmlaq_n_u16 (uint16x8_t a, uint16x8_t b, uint16_t c) +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmlsl_high_n_s32 (int64x2_t a, int32x4_t b, int32_t c) { - uint16x8_t result; - __asm__ ("mla %0.8h,%2.8h,%3.h[0]" + int64x2_t result; + __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); @@ -8452,21 +8404,21 @@ vmlaq_n_u16 (uint16x8_t a, uint16x8_t b, uint16_t c) } __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmlaq_n_u32 (uint32x4_t a, uint32x4_t b, uint32_t c) +vmlsl_high_n_u16 (uint32x4_t a, uint16x8_t b, uint16_t c) { uint32x4_t result; - __asm__ ("mla %0.4s,%2.4s,%3.s[0]" + __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vmlaq_s8 (int8x16_t a, int8x16_t b, int8x16_t c) +__extension__ static __inline 
uint64x2_t __attribute__ ((__always_inline__)) +vmlsl_high_n_u32 (uint64x2_t a, uint32x4_t b, uint32_t c) { - int8x16_t result; - __asm__ ("mla %0.16b, %2.16b, %3.16b" + uint64x2_t result; + __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); @@ -8474,10 +8426,10 @@ vmlaq_s8 (int8x16_t a, int8x16_t b, int8x16_t c) } __extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmlaq_s16 (int16x8_t a, int16x8_t b, int16x8_t c) +vmlsl_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c) { int16x8_t result; - __asm__ ("mla %0.8h, %2.8h, %3.8h" + __asm__ ("smlsl2 %0.8h,%2.16b,%3.16b" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); @@ -8485,21 +8437,21 @@ vmlaq_s16 (int16x8_t a, int16x8_t b, int16x8_t c) } __extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmlaq_s32 (int32x4_t a, int32x4_t b, int32x4_t c) +vmlsl_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c) { int32x4_t result; - __asm__ ("mla %0.4s, %2.4s, %3.4s" + __asm__ ("smlsl2 %0.4s,%2.8h,%3.8h" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vmlaq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c) +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmlsl_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c) { - uint8x16_t result; - __asm__ ("mla %0.16b, %2.16b, %3.16b" + int64x2_t result; + __asm__ ("smlsl2 %0.2d,%2.4s,%3.4s" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); @@ -8507,10 +8459,10 @@ vmlaq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c) } __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmlaq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c) +vmlsl_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c) { uint16x8_t result; - __asm__ ("mla %0.8h, %2.8h, %3.8h" + __asm__ ("umlsl2 %0.8h,%2.16b,%3.16b" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); @@ -8518,559 +8470,508 @@ vmlaq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c) } __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmlaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c) +vmlsl_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c) { uint32x4_t result; - __asm__ ("mla %0.4s, %2.4s, %3.4s" + __asm__ ("umlsl2 %0.4s,%2.8h,%3.8h" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -#define vmls_lane_f32(a, b, c, d) \ +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmlsl_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c) +{ + uint64x2_t result; + __asm__ ("umlsl2 %0.2d,%2.4s,%3.4s" + : "=w"(result) + : "0"(a), "w"(b), "w"(c) + : /* No clobbers */); + return result; +} + +#define vmlsl_lane_s16(a, b, c, d) \ __extension__ \ ({ \ - float32x2_t c_ = (c); \ - float32x2_t b_ = (b); \ - float32x2_t a_ = (a); \ - float32x2_t result; \ - float32x2_t t1; \ - __asm__ ("fmul %1.2s, %3.2s, %4.s[%5]; fsub %0.2s, %0.2s, %1.2s" \ - : "=w"(result), "=w"(t1) \ + int16x4_t c_ = (c); \ + int16x4_t b_ = (b); \ + int32x4_t a_ = (a); \ + int32x4_t result; \ + __asm__ ("smlsl %0.4s, %2.4h, %3.h[%4]" \ + : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmls_lane_s16(a, b, c, d) \ +#define vmlsl_lane_s32(a, b, c, d) \ __extension__ \ ({ \ - int16x4_t c_ = (c); \ + int32x2_t c_ = (c); \ + int32x2_t b_ = (b); \ + int64x2_t a_ = (a); \ + int64x2_t result; \ + __asm__ ("smlsl %0.2d, %2.2s, %3.s[%4]" 
\ + : "=w"(result) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ + : /* No clobbers */); \ + result; \ + }) + +#define vmlsl_lane_u16(a, b, c, d) \ + __extension__ \ + ({ \ + uint16x4_t c_ = (c); \ + uint16x4_t b_ = (b); \ + uint32x4_t a_ = (a); \ + uint32x4_t result; \ + __asm__ ("umlsl %0.4s, %2.4h, %3.h[%4]" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ + : /* No clobbers */); \ + result; \ + }) + +#define vmlsl_lane_u32(a, b, c, d) \ + __extension__ \ + ({ \ + uint32x2_t c_ = (c); \ + uint32x2_t b_ = (b); \ + uint64x2_t a_ = (a); \ + uint64x2_t result; \ + __asm__ ("umlsl %0.2d, %2.2s, %3.s[%4]" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ + : /* No clobbers */); \ + result; \ + }) + +#define vmlsl_laneq_s16(a, b, c, d) \ + __extension__ \ + ({ \ + int16x8_t c_ = (c); \ int16x4_t b_ = (b); \ - int16x4_t a_ = (a); \ - int16x4_t result; \ - __asm__ ("mls %0.4h,%2.4h,%3.h[%4]" \ + int32x4_t a_ = (a); \ + int32x4_t result; \ + __asm__ ("smlsl %0.4s, %2.4h, %3.h[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmls_lane_s32(a, b, c, d) \ +#define vmlsl_laneq_s32(a, b, c, d) \ __extension__ \ ({ \ - int32x2_t c_ = (c); \ + int32x4_t c_ = (c); \ int32x2_t b_ = (b); \ - int32x2_t a_ = (a); \ - int32x2_t result; \ - __asm__ ("mls %0.2s,%2.2s,%3.s[%4]" \ + int64x2_t a_ = (a); \ + int64x2_t result; \ + __asm__ ("smlsl %0.2d, %2.2s, %3.s[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmls_lane_u16(a, b, c, d) \ +#define vmlsl_laneq_u16(a, b, c, d) \ __extension__ \ ({ \ - uint16x4_t c_ = (c); \ + uint16x8_t c_ = (c); \ uint16x4_t b_ = (b); \ - uint16x4_t a_ = (a); \ - uint16x4_t result; \ - __asm__ ("mls %0.4h,%2.4h,%3.h[%4]" \ + uint32x4_t a_ = (a); \ + uint32x4_t result; \ + __asm__ ("umlsl %0.4s, %2.4h, %3.h[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmls_lane_u32(a, b, c, d) \ +#define vmlsl_laneq_u32(a, b, c, d) \ __extension__ \ ({ \ - uint32x2_t c_ = (c); \ + uint32x4_t c_ = (c); \ uint32x2_t b_ = (b); \ - uint32x2_t a_ = (a); \ - uint32x2_t result; \ - __asm__ ("mls %0.2s,%2.2s,%3.s[%4]" \ + uint64x2_t a_ = (a); \ + uint64x2_t result; \ + __asm__ ("umlsl %0.2d, %2.2s, %3.s[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vmls_n_f32 (float32x2_t a, float32x2_t b, float32_t c) +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmlsl_n_s16 (int32x4_t a, int16x4_t b, int16_t c) { - float32x2_t result; - float32x2_t t1; - __asm__ ("fmul %1.2s, %3.2s, %4.s[0]; fsub %0.2s, %0.2s, %1.2s" - : "=w"(result), "=w"(t1) + int32x4_t result; + __asm__ ("smlsl %0.4s, %2.4h, %3.h[0]" + : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vmls_n_s16 (int16x4_t a, int16x4_t b, int16_t c) +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmlsl_n_s32 (int64x2_t a, int32x2_t b, int32_t c) { - int16x4_t result; - __asm__ ("mls %0.4h, %2.4h, %3.h[0]" + int64x2_t result; + __asm__ ("smlsl %0.2d, %2.2s, %3.s[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vmls_n_s32 (int32x2_t a, int32x2_t b, int32_t c) 
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmlsl_n_u16 (uint32x4_t a, uint16x4_t b, uint16_t c) { - int32x2_t result; - __asm__ ("mls %0.2s, %2.2s, %3.s[0]" + uint32x4_t result; + __asm__ ("umlsl %0.4s, %2.4h, %3.h[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vmls_n_u16 (uint16x4_t a, uint16x4_t b, uint16_t c) +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmlsl_n_u32 (uint64x2_t a, uint32x2_t b, uint32_t c) { - uint16x4_t result; - __asm__ ("mls %0.4h, %2.4h, %3.h[0]" + uint64x2_t result; + __asm__ ("umlsl %0.2d, %2.2s, %3.s[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vmls_n_u32 (uint32x2_t a, uint32x2_t b, uint32_t c) +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmlsl_s8 (int16x8_t a, int8x8_t b, int8x8_t c) { - uint32x2_t result; - __asm__ ("mls %0.2s, %2.2s, %3.s[0]" + int16x8_t result; + __asm__ ("smlsl %0.8h, %2.8b, %3.8b" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vmls_s8 (int8x8_t a, int8x8_t b, int8x8_t c) +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmlsl_s16 (int32x4_t a, int16x4_t b, int16x4_t c) { - int8x8_t result; - __asm__ ("mls %0.8b,%2.8b,%3.8b" + int32x4_t result; + __asm__ ("smlsl %0.4s, %2.4h, %3.4h" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vmls_s16 (int16x4_t a, int16x4_t b, int16x4_t c) +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmlsl_s32 (int64x2_t a, int32x2_t b, int32x2_t c) { - int16x4_t result; - __asm__ ("mls %0.4h,%2.4h,%3.4h" + int64x2_t result; + __asm__ ("smlsl %0.2d, %2.2s, %3.2s" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vmls_s32 (int32x2_t a, int32x2_t b, int32x2_t c) +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmlsl_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c) { - int32x2_t result; - __asm__ ("mls %0.2s,%2.2s,%3.2s" + uint16x8_t result; + __asm__ ("umlsl %0.8h, %2.8b, %3.8b" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vmls_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmlsl_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c) { - uint8x8_t result; - __asm__ ("mls %0.8b,%2.8b,%3.8b" + uint32x4_t result; + __asm__ ("umlsl %0.4s, %2.4h, %3.4h" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vmls_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c) +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmlsl_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c) { - uint16x4_t result; - __asm__ ("mls %0.4h,%2.4h,%3.4h" + uint64x2_t result; + __asm__ ("umlsl %0.2d, %2.2s, %3.2s" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline 
uint32x2_t __attribute__ ((__always_inline__)) -vmls_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c) -{ - uint32x2_t result; - __asm__ ("mls %0.2s,%2.2s,%3.2s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} +#define vmlsq_lane_f32(a, b, c, d) \ + __extension__ \ + ({ \ + float32x4_t c_ = (c); \ + float32x4_t b_ = (b); \ + float32x4_t a_ = (a); \ + float32x4_t result; \ + float32x4_t t1; \ + __asm__ ("fmul %1.4s, %3.4s, %4.s[%5]; fsub %0.4s, %0.4s, %1.4s" \ + : "=w"(result), "=w"(t1) \ + : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ + : /* No clobbers */); \ + result; \ + }) -#define vmlsl_high_lane_s16(a, b, c, d) \ +#define vmlsq_lane_s16(a, b, c, d) \ __extension__ \ ({ \ int16x8_t c_ = (c); \ int16x8_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[%4]" \ + int16x8_t a_ = (a); \ + int16x8_t result; \ + __asm__ ("mls %0.8h,%2.8h,%3.h[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlsl_high_lane_s32(a, b, c, d) \ +#define vmlsq_lane_s32(a, b, c, d) \ __extension__ \ ({ \ int32x4_t c_ = (c); \ int32x4_t b_ = (b); \ - int64x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[%4]" \ + int32x4_t a_ = (a); \ + int32x4_t result; \ + __asm__ ("mls %0.4s,%2.4s,%3.s[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlsl_high_lane_u16(a, b, c, d) \ +#define vmlsq_lane_u16(a, b, c, d) \ __extension__ \ ({ \ uint16x8_t c_ = (c); \ uint16x8_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[%4]" \ + uint16x8_t a_ = (a); \ + uint16x8_t result; \ + __asm__ ("mls %0.8h,%2.8h,%3.h[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlsl_high_lane_u32(a, b, c, d) \ +#define vmlsq_lane_u32(a, b, c, d) \ __extension__ \ ({ \ uint32x4_t c_ = (c); \ uint32x4_t b_ = (b); \ - uint64x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[%4]" \ + uint32x4_t a_ = (a); \ + uint32x4_t result; \ + __asm__ ("mls %0.4s,%2.4s,%3.s[%4]" \ : "=w"(result) \ : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ : /* No clobbers */); \ result; \ }) -#define vmlsl_high_laneq_s16(a, b, c, d) \ - __extension__ \ - ({ \ - int16x8_t c_ = (c); \ - int16x8_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ +#define vmlsq_laneq_f32(__a, __b, __c, __d) \ + __extension__ \ + ({ \ + float32x4_t __c_ = (__c); \ + float32x4_t __b_ = (__b); \ + float32x4_t __a_ = (__a); \ + float32x4_t __result; \ + float32x4_t __t1; \ + __asm__ ("fmul %1.4s, %3.4s, %4.s[%5]; fsub %0.4s, %0.4s, %1.4s" \ + : "=w"(__result), "=w"(__t1) \ + : "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d) \ + : /* No clobbers */); \ + __result; \ }) -#define vmlsl_high_laneq_s32(a, b, c, d) \ - __extension__ \ - ({ \ - int32x4_t c_ = (c); \ - int32x4_t b_ = (b); \ - int64x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ +#define vmlsq_laneq_s16(__a, __b, __c, __d) \ + __extension__ \ + ({ \ + int16x8_t __c_ = (__c); \ + int16x8_t __b_ = (__b); \ + int16x8_t __a_ = (__a); \ + int16x8_t __result; \ + __asm__ ("mls %0.8h, %2.8h, %3.h[%4]" \ + : "=w"(__result) \ + : 
"0"(__a_), "w"(__b_), "w"(__c_), "i"(__d) \ + : /* No clobbers */); \ + __result; \ }) -#define vmlsl_high_laneq_u16(a, b, c, d) \ - __extension__ \ - ({ \ - uint16x8_t c_ = (c); \ - uint16x8_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ +#define vmlsq_laneq_s32(__a, __b, __c, __d) \ + __extension__ \ + ({ \ + int32x4_t __c_ = (__c); \ + int32x4_t __b_ = (__b); \ + int32x4_t __a_ = (__a); \ + int32x4_t __result; \ + __asm__ ("mls %0.4s, %2.4s, %3.s[%4]" \ + : "=w"(__result) \ + : "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d) \ + : /* No clobbers */); \ + __result; \ }) -#define vmlsl_high_laneq_u32(a, b, c, d) \ - __extension__ \ - ({ \ - uint32x4_t c_ = (c); \ - uint32x4_t b_ = (b); \ - uint64x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ +#define vmlsq_laneq_u16(__a, __b, __c, __d) \ + __extension__ \ + ({ \ + uint16x8_t __c_ = (__c); \ + uint16x8_t __b_ = (__b); \ + uint16x8_t __a_ = (__a); \ + uint16x8_t __result; \ + __asm__ ("mls %0.8h, %2.8h, %3.h[%4]" \ + : "=w"(__result) \ + : "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d) \ + : /* No clobbers */); \ + __result; \ }) -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmlsl_high_n_s16 (int32x4_t a, int16x8_t b, int16_t c) +#define vmlsq_laneq_u32(__a, __b, __c, __d) \ + __extension__ \ + ({ \ + uint32x4_t __c_ = (__c); \ + uint32x4_t __b_ = (__b); \ + uint32x4_t __a_ = (__a); \ + uint32x4_t __result; \ + __asm__ ("mls %0.4s, %2.4s, %3.s[%4]" \ + : "=w"(__result) \ + : "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d) \ + : /* No clobbers */); \ + __result; \ + }) + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vmlsq_n_f32 (float32x4_t a, float32x4_t b, float32_t c) { - int32x4_t result; - __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[0]" - : "=w"(result) + float32x4_t result; + float32x4_t t1; + __asm__ ("fmul %1.4s, %3.4s, %4.s[0]; fsub %0.4s, %0.4s, %1.4s" + : "=w"(result), "=w"(t1) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmlsl_high_n_s32 (int64x2_t a, int32x4_t b, int32_t c) +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vmlsq_n_f64 (float64x2_t a, float64x2_t b, float64_t c) { - int64x2_t result; - __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[0]" - : "=w"(result) + float64x2_t result; + float64x2_t t1; + __asm__ ("fmul %1.2d, %3.2d, %4.d[0]; fsub %0.2d, %0.2d, %1.2d" + : "=w"(result), "=w"(t1) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmlsl_high_n_u16 (uint32x4_t a, uint16x8_t b, uint16_t c) +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmlsq_n_s16 (int16x8_t a, int16x8_t b, int16_t c) { - uint32x4_t result; - __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmlsl_high_n_u32 (uint64x2_t a, uint32x4_t b, uint32_t c) -{ - uint64x2_t result; - __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[0]" + int16x8_t result; + __asm__ ("mls %0.8h, %2.8h, %3.h[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers 
*/); return result; } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmlsl_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c) +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmlsq_n_s32 (int32x4_t a, int32x4_t b, int32_t c) { - int16x8_t result; - __asm__ ("smlsl2 %0.8h,%2.16b,%3.16b" + int32x4_t result; + __asm__ ("mls %0.4s, %2.4s, %3.s[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmlsl_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c) +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmlsq_n_u16 (uint16x8_t a, uint16x8_t b, uint16_t c) { - int32x4_t result; - __asm__ ("smlsl2 %0.4s,%2.8h,%3.8h" + uint16x8_t result; + __asm__ ("mls %0.8h, %2.8h, %3.h[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmlsl_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vmlsq_n_u32 (uint32x4_t a, uint32x4_t b, uint32_t c) { - int64x2_t result; - __asm__ ("smlsl2 %0.2d,%2.4s,%3.4s" + uint32x4_t result; + __asm__ ("mls %0.4s, %2.4s, %3.s[0]" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmlsl_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c) +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vmlsq_s8 (int8x16_t a, int8x16_t b, int8x16_t c) { - uint16x8_t result; - __asm__ ("umlsl2 %0.8h,%2.16b,%3.16b" + int8x16_t result; + __asm__ ("mls %0.16b,%2.16b,%3.16b" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmlsl_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c) +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vmlsq_s16 (int16x8_t a, int16x8_t b, int16x8_t c) { - uint32x4_t result; - __asm__ ("umlsl2 %0.4s,%2.8h,%3.8h" + int16x8_t result; + __asm__ ("mls %0.8h,%2.8h,%3.8h" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmlsl_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c) +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vmlsq_s32 (int32x4_t a, int32x4_t b, int32x4_t c) { - uint64x2_t result; - __asm__ ("umlsl2 %0.2d,%2.4s,%3.4s" + int32x4_t result; + __asm__ ("mls %0.4s,%2.4s,%3.4s" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -#define vmlsl_lane_s16(a, b, c, d) \ - __extension__ \ - ({ \ - int16x4_t c_ = (c); \ - int16x4_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smlsl %0.4s, %2.4h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsl_lane_s32(a, b, c, d) \ - __extension__ \ - ({ \ - int32x2_t c_ = (c); \ - int32x2_t b_ = (b); \ - int64x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smlsl %0.2d, %2.2s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsl_lane_u16(a, b, c, d) \ - __extension__ \ - ({ \ - uint16x4_t c_ = (c); \ - uint16x4_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t 
result; \ - __asm__ ("umlsl %0.4s, %2.4h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsl_lane_u32(a, b, c, d) \ - __extension__ \ - ({ \ - uint32x2_t c_ = (c); \ - uint32x2_t b_ = (b); \ - uint64x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umlsl %0.2d, %2.2s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsl_laneq_s16(a, b, c, d) \ - __extension__ \ - ({ \ - int16x8_t c_ = (c); \ - int16x4_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smlsl %0.4s, %2.4h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsl_laneq_s32(a, b, c, d) \ - __extension__ \ - ({ \ - int32x4_t c_ = (c); \ - int32x2_t b_ = (b); \ - int64x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smlsl %0.2d, %2.2s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsl_laneq_u16(a, b, c, d) \ - __extension__ \ - ({ \ - uint16x8_t c_ = (c); \ - uint16x4_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umlsl %0.4s, %2.4h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsl_laneq_u32(a, b, c, d) \ - __extension__ \ - ({ \ - uint32x4_t c_ = (c); \ - uint32x2_t b_ = (b); \ - uint64x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umlsl %0.2d, %2.2s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmlsl_n_s16 (int32x4_t a, int16x4_t b, int16_t c) +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vmlsq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c) { - int32x4_t result; - __asm__ ("smlsl %0.4s, %2.4h, %3.h[0]" + uint8x16_t result; + __asm__ ("mls %0.16b,%2.16b,%3.16b" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmlsl_n_s32 (int64x2_t a, int32x2_t b, int32_t c) +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vmlsq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c) { - int64x2_t result; - __asm__ ("smlsl %0.2d, %2.2s, %3.s[0]" + uint16x8_t result; + __asm__ ("mls %0.8h,%2.8h,%3.8h" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); @@ -9078,580 +8979,227 @@ vmlsl_n_s32 (int64x2_t a, int32x2_t b, int32_t c) } __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmlsl_n_u16 (uint32x4_t a, uint16x4_t b, uint16_t c) +vmlsq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c) { uint32x4_t result; - __asm__ ("umlsl %0.4s, %2.4h, %3.h[0]" + __asm__ ("mls %0.4s,%2.4s,%3.4s" : "=w"(result) : "0"(a), "w"(b), "w"(c) : /* No clobbers */); return result; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmlsl_n_u32 (uint64x2_t a, uint32x2_t b, uint32_t c) +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vmov_n_f32 (float32_t a) { - uint64x2_t result; - __asm__ ("umlsl %0.2d, %2.2s, %3.s[0]" + float32x2_t result; + __asm__ ("dup %0.2s, %w1" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "r"(a) : /* No clobbers */); return result; } -__extension__ static __inline int16x8_t __attribute__ 
((__always_inline__)) -vmlsl_s8 (int16x8_t a, int8x8_t b, int8x8_t c) +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vmov_n_p8 (uint32_t a) { - int16x8_t result; - __asm__ ("smlsl %0.8h, %2.8b, %3.8b" + poly8x8_t result; + __asm__ ("dup %0.8b,%w1" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "r"(a) : /* No clobbers */); return result; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmlsl_s16 (int32x4_t a, int16x4_t b, int16x4_t c) +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vmov_n_p16 (uint32_t a) { - int32x4_t result; - __asm__ ("smlsl %0.4s, %2.4h, %3.4h" + poly16x4_t result; + __asm__ ("dup %0.4h,%w1" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "r"(a) : /* No clobbers */); return result; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmlsl_s32 (int64x2_t a, int32x2_t b, int32x2_t c) +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vmov_n_s8 (int32_t a) { - int64x2_t result; - __asm__ ("smlsl %0.2d, %2.2s, %3.2s" + int8x8_t result; + __asm__ ("dup %0.8b,%w1" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "r"(a) : /* No clobbers */); return result; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmlsl_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c) +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vmov_n_s16 (int32_t a) { - uint16x8_t result; - __asm__ ("umlsl %0.8h, %2.8b, %3.8b" + int16x4_t result; + __asm__ ("dup %0.4h,%w1" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "r"(a) : /* No clobbers */); return result; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmlsl_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c) +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vmov_n_s32 (int32_t a) { - uint32x4_t result; - __asm__ ("umlsl %0.4s, %2.4h, %3.4h" + int32x2_t result; + __asm__ ("dup %0.2s,%w1" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "r"(a) : /* No clobbers */); return result; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmlsl_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c) +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vmov_n_s64 (int64_t a) { - uint64x2_t result; - __asm__ ("umlsl %0.2d, %2.2s, %3.2s" + int64x1_t result; + __asm__ ("ins %0.d[0],%x1" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "r"(a) : /* No clobbers */); return result; } -#define vmlsq_lane_f32(a, b, c, d) \ - __extension__ \ - ({ \ - float32x4_t c_ = (c); \ - float32x4_t b_ = (b); \ - float32x4_t a_ = (a); \ - float32x4_t result; \ - float32x4_t t1; \ - __asm__ ("fmul %1.4s, %3.4s, %4.s[%5]; fsub %0.4s, %0.4s, %1.4s" \ - : "=w"(result), "=w"(t1) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsq_lane_s16(a, b, c, d) \ - __extension__ \ - ({ \ - int16x8_t c_ = (c); \ - int16x8_t b_ = (b); \ - int16x8_t a_ = (a); \ - int16x8_t result; \ - __asm__ ("mls %0.8h,%2.8h,%3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsq_lane_s32(a, b, c, d) \ - __extension__ \ - ({ \ - int32x4_t c_ = (c); \ - int32x4_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("mls %0.4s,%2.4s,%3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsq_lane_u16(a, b, c, d) \ - __extension__ 
\ - ({ \ - uint16x8_t c_ = (c); \ - uint16x8_t b_ = (b); \ - uint16x8_t a_ = (a); \ - uint16x8_t result; \ - __asm__ ("mls %0.8h,%2.8h,%3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsq_lane_u32(a, b, c, d) \ - __extension__ \ - ({ \ - uint32x4_t c_ = (c); \ - uint32x4_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("mls %0.4s,%2.4s,%3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsq_laneq_f32(__a, __b, __c, __d) \ - __extension__ \ - ({ \ - float32x4_t __c_ = (__c); \ - float32x4_t __b_ = (__b); \ - float32x4_t __a_ = (__a); \ - float32x4_t __result; \ - float32x4_t __t1; \ - __asm__ ("fmul %1.4s, %3.4s, %4.s[%5]; fsub %0.4s, %0.4s, %1.4s" \ - : "=w"(__result), "=w"(__t1) \ - : "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d) \ - : /* No clobbers */); \ - __result; \ - }) - -#define vmlsq_laneq_s16(__a, __b, __c, __d) \ - __extension__ \ - ({ \ - int16x8_t __c_ = (__c); \ - int16x8_t __b_ = (__b); \ - int16x8_t __a_ = (__a); \ - int16x8_t __result; \ - __asm__ ("mls %0.8h, %2.8h, %3.h[%4]" \ - : "=w"(__result) \ - : "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d) \ - : /* No clobbers */); \ - __result; \ - }) - -#define vmlsq_laneq_s32(__a, __b, __c, __d) \ - __extension__ \ - ({ \ - int32x4_t __c_ = (__c); \ - int32x4_t __b_ = (__b); \ - int32x4_t __a_ = (__a); \ - int32x4_t __result; \ - __asm__ ("mls %0.4s, %2.4s, %3.s[%4]" \ - : "=w"(__result) \ - : "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d) \ - : /* No clobbers */); \ - __result; \ - }) - -#define vmlsq_laneq_u16(__a, __b, __c, __d) \ - __extension__ \ - ({ \ - uint16x8_t __c_ = (__c); \ - uint16x8_t __b_ = (__b); \ - uint16x8_t __a_ = (__a); \ - uint16x8_t __result; \ - __asm__ ("mls %0.8h, %2.8h, %3.h[%4]" \ - : "=w"(__result) \ - : "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d) \ - : /* No clobbers */); \ - __result; \ - }) +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vmov_n_u8 (uint32_t a) +{ + uint8x8_t result; + __asm__ ("dup %0.8b,%w1" + : "=w"(result) + : "r"(a) + : /* No clobbers */); + return result; +} -#define vmlsq_laneq_u32(__a, __b, __c, __d) \ - __extension__ \ - ({ \ - uint32x4_t __c_ = (__c); \ - uint32x4_t __b_ = (__b); \ - uint32x4_t __a_ = (__a); \ - uint32x4_t __result; \ - __asm__ ("mls %0.4s, %2.4s, %3.s[%4]" \ - : "=w"(__result) \ - : "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d) \ - : /* No clobbers */); \ - __result; \ - }) +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vmov_n_u16 (uint32_t a) +{ + uint16x4_t result; + __asm__ ("dup %0.4h,%w1" + : "=w"(result) + : "r"(a) + : /* No clobbers */); + return result; +} -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vmlsq_n_f32 (float32x4_t a, float32x4_t b, float32_t c) +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vmov_n_u32 (uint32_t a) { - float32x4_t result; - float32x4_t t1; - __asm__ ("fmul %1.4s, %3.4s, %4.s[0]; fsub %0.4s, %0.4s, %1.4s" - : "=w"(result), "=w"(t1) - : "0"(a), "w"(b), "w"(c) + uint32x2_t result; + __asm__ ("dup %0.2s,%w1" + : "=w"(result) + : "r"(a) : /* No clobbers */); return result; } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vmlsq_n_f64 (float64x2_t a, float64x2_t b, float64_t c) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vmov_n_u64 (uint64_t a) { - float64x2_t result; - 
float64x2_t t1; - __asm__ ("fmul %1.2d, %3.2d, %4.d[0]; fsub %0.2d, %0.2d, %1.2d" - : "=w"(result), "=w"(t1) - : "0"(a), "w"(b), "w"(c) + uint64x1_t result; + __asm__ ("ins %0.d[0],%x1" + : "=w"(result) + : "r"(a) : /* No clobbers */); return result; } __extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmlsq_n_s16 (int16x8_t a, int16x8_t b, int16_t c) +vmovl_high_s8 (int8x16_t a) { int16x8_t result; - __asm__ ("mls %0.8h, %2.8h, %3.h[0]" + __asm__ ("sshll2 %0.8h,%1.16b,#0" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a) : /* No clobbers */); return result; } __extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmlsq_n_s32 (int32x4_t a, int32x4_t b, int32_t c) +vmovl_high_s16 (int16x8_t a) { int32x4_t result; - __asm__ ("mls %0.4s, %2.4s, %3.s[0]" + __asm__ ("sshll2 %0.4s,%1.8h,#0" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a) + : /* No clobbers */); + return result; +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmovl_high_s32 (int32x4_t a) +{ + int64x2_t result; + __asm__ ("sshll2 %0.2d,%1.4s,#0" + : "=w"(result) + : "w"(a) : /* No clobbers */); return result; } __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmlsq_n_u16 (uint16x8_t a, uint16x8_t b, uint16_t c) +vmovl_high_u8 (uint8x16_t a) { uint16x8_t result; - __asm__ ("mls %0.8h, %2.8h, %3.h[0]" + __asm__ ("ushll2 %0.8h,%1.16b,#0" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a) : /* No clobbers */); return result; } __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmlsq_n_u32 (uint32x4_t a, uint32x4_t b, uint32_t c) +vmovl_high_u16 (uint16x8_t a) { uint32x4_t result; - __asm__ ("mls %0.4s, %2.4s, %3.s[0]" + __asm__ ("ushll2 %0.4s,%1.8h,#0" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vmlsq_s8 (int8x16_t a, int8x16_t b, int8x16_t c) +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vmovl_high_u32 (uint32x4_t a) { - int8x16_t result; - __asm__ ("mls %0.16b,%2.16b,%3.16b" + uint64x2_t result; + __asm__ ("ushll2 %0.2d,%1.4s,#0" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a) : /* No clobbers */); return result; } __extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmlsq_s16 (int16x8_t a, int16x8_t b, int16x8_t c) +vmovl_s8 (int8x8_t a) { int16x8_t result; - __asm__ ("mls %0.8h,%2.8h,%3.8h" + __asm__ ("sshll %0.8h,%1.8b,#0" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a) : /* No clobbers */); return result; } __extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmlsq_s32 (int32x4_t a, int32x4_t b, int32x4_t c) +vmovl_s16 (int16x4_t a) { int32x4_t result; - __asm__ ("mls %0.4s,%2.4s,%3.4s" + __asm__ ("sshll %0.4s,%1.4h,#0" : "=w"(result) - : "0"(a), "w"(b), "w"(c) + : "w"(a) : /* No clobbers */); return result; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vmlsq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c) -{ - uint8x16_t result; - __asm__ ("mls %0.16b,%2.16b,%3.16b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vmlsq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c) -{ - uint16x8_t result; - __asm__ ("mls %0.8h,%2.8h,%3.8h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmlsq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c) -{ - uint32x4_t result; - __asm__ ("mls %0.4s,%2.4s,%3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vmov_n_f32 (float32_t a) -{ - float32x2_t result; - __asm__ ("dup %0.2s, %w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vmov_n_p8 (uint32_t a) -{ - poly8x8_t result; - __asm__ ("dup %0.8b,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) -vmov_n_p16 (uint32_t a) -{ - poly16x4_t result; - __asm__ ("dup %0.4h,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vmov_n_s8 (int32_t a) -{ - int8x8_t result; - __asm__ ("dup %0.8b,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vmov_n_s16 (int32_t a) -{ - int16x4_t result; - __asm__ ("dup %0.4h,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vmov_n_s32 (int32_t a) -{ - int32x2_t result; - __asm__ ("dup %0.2s,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vmov_n_s64 (int64_t a) -{ - int64x1_t result; - __asm__ ("ins %0.d[0],%x1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vmov_n_u8 (uint32_t a) -{ - uint8x8_t result; - __asm__ ("dup %0.8b,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vmov_n_u16 (uint32_t a) -{ - uint16x4_t result; - __asm__ ("dup %0.4h,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vmov_n_u32 (uint32_t a) -{ - uint32x2_t result; - __asm__ ("dup %0.2s,%w1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vmov_n_u64 (uint64_t a) -{ - uint64x1_t result; - __asm__ ("ins %0.d[0],%x1" - : "=w"(result) - : "r"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmovl_high_s8 (int8x16_t a) -{ - int16x8_t result; - __asm__ ("sshll2 %0.8h,%1.16b,#0" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmovl_high_s16 (int16x8_t a) -{ - int32x4_t result; - __asm__ ("sshll2 %0.4s,%1.8h,#0" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmovl_high_s32 (int32x4_t a) -{ - int64x2_t result; - __asm__ ("sshll2 %0.2d,%1.4s,#0" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x8_t __attribute__ 
((__always_inline__)) -vmovl_high_u8 (uint8x16_t a) -{ - uint16x8_t result; - __asm__ ("ushll2 %0.8h,%1.16b,#0" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vmovl_high_u16 (uint16x8_t a) -{ - uint32x4_t result; - __asm__ ("ushll2 %0.4s,%1.8h,#0" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vmovl_high_u32 (uint32x4_t a) -{ - uint64x2_t result; - __asm__ ("ushll2 %0.2d,%1.4s,#0" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vmovl_s8 (int8x8_t a) -{ - int16x8_t result; - __asm__ ("sshll %0.8h,%1.8b,#0" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vmovl_s16 (int16x4_t a) -{ - int32x4_t result; - __asm__ ("sshll %0.4s,%1.4h,#0" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vmovl_s32 (int32x2_t a) +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vmovl_s32 (int32x2_t a) { int64x2_t result; __asm__ ("sshll %0.2d,%1.2s,#0" @@ -16182,3627 +15730,4174 @@ __LD4R_FUNC (uint16x8x4_t, uint16x4_t, uint16_t, 8h, u16, q) __LD4R_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, 4s, u32, q) __LD4R_FUNC (uint64x2x4_t, uint64x4_t, uint64_t, 2d, u64, q) -#define __LD4_LANE_FUNC(rettype, ptrtype, regsuffix, \ - lnsuffix, funcsuffix, Q) \ - __extension__ static __inline rettype \ - __attribute__ ((__always_inline__)) \ - vld4 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \ - rettype b, const int c) \ - { \ - rettype result; \ - __asm__ ("ld1 {v16." #regsuffix " - v19." #regsuffix "}, %1\n\t" \ - "ld4 {v16." #lnsuffix " - v19." #lnsuffix "}[%3], %2\n\t" \ - "st1 {v16." #regsuffix " - v19." #regsuffix "}, %0\n\t" \ - : "=Q"(result) \ - : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c) \ - : "memory", "v16", "v17", "v18", "v19"); \ - return result; \ - } +#define __LD4_LANE_FUNC(rettype, ptrtype, regsuffix, \ + lnsuffix, funcsuffix, Q) \ + __extension__ static __inline rettype \ + __attribute__ ((__always_inline__)) \ + vld4 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \ + rettype b, const int c) \ + { \ + rettype result; \ + __asm__ ("ld1 {v16." #regsuffix " - v19." #regsuffix "}, %1\n\t" \ + "ld4 {v16." #lnsuffix " - v19." #lnsuffix "}[%3], %2\n\t" \ + "st1 {v16." #regsuffix " - v19." 
#regsuffix "}, %0\n\t" \ + : "=Q"(result) \ + : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c) \ + : "memory", "v16", "v17", "v18", "v19"); \ + return result; \ + } + +__LD4_LANE_FUNC (int8x8x4_t, uint8_t, 8b, b, s8,) +__LD4_LANE_FUNC (float32x2x4_t, float32_t, 2s, s, f32,) +__LD4_LANE_FUNC (float64x1x4_t, float64_t, 1d, d, f64,) +__LD4_LANE_FUNC (poly8x8x4_t, poly8_t, 8b, b, p8,) +__LD4_LANE_FUNC (poly16x4x4_t, poly16_t, 4h, h, p16,) +__LD4_LANE_FUNC (int16x4x4_t, int16_t, 4h, h, s16,) +__LD4_LANE_FUNC (int32x2x4_t, int32_t, 2s, s, s32,) +__LD4_LANE_FUNC (int64x1x4_t, int64_t, 1d, d, s64,) +__LD4_LANE_FUNC (uint8x8x4_t, uint8_t, 8b, b, u8,) +__LD4_LANE_FUNC (uint16x4x4_t, uint16_t, 4h, h, u16,) +__LD4_LANE_FUNC (uint32x2x4_t, uint32_t, 2s, s, u32,) +__LD4_LANE_FUNC (uint64x1x4_t, uint64_t, 1d, d, u64,) +__LD4_LANE_FUNC (float32x4x4_t, float32_t, 4s, s, f32, q) +__LD4_LANE_FUNC (float64x2x4_t, float64_t, 2d, d, f64, q) +__LD4_LANE_FUNC (poly8x16x4_t, poly8_t, 16b, b, p8, q) +__LD4_LANE_FUNC (poly16x8x4_t, poly16_t, 8h, h, p16, q) +__LD4_LANE_FUNC (int8x16x4_t, int8_t, 16b, b, s8, q) +__LD4_LANE_FUNC (int16x8x4_t, int16_t, 8h, h, s16, q) +__LD4_LANE_FUNC (int32x4x4_t, int32_t, 4s, s, s32, q) +__LD4_LANE_FUNC (int64x2x4_t, int64_t, 2d, d, s64, q) +__LD4_LANE_FUNC (uint8x16x4_t, uint8_t, 16b, b, u8, q) +__LD4_LANE_FUNC (uint16x8x4_t, uint16_t, 8h, h, u16, q) +__LD4_LANE_FUNC (uint32x4x4_t, uint32_t, 4s, s, u32, q) +__LD4_LANE_FUNC (uint64x2x4_t, uint64_t, 2d, d, u64, q) + +#define __ST2_LANE_FUNC(intype, ptrtype, regsuffix, \ + lnsuffix, funcsuffix, Q) \ + __extension__ static __inline void \ + __attribute__ ((__always_inline__)) \ + vst2 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \ + intype b, const int c) \ + { \ + __asm__ ("ld1 {v16." #regsuffix ", v17." #regsuffix "}, %1\n\t" \ + "st2 {v16." #lnsuffix ", v17." 
#lnsuffix "}[%2], %0\n\t" \ + : "=Q"(*(intype *) ptr) \ + : "Q"(b), "i"(c) \ + : "memory", "v16", "v17"); \ + } + +__ST2_LANE_FUNC (int8x8x2_t, int8_t, 8b, b, s8,) +__ST2_LANE_FUNC (float32x2x2_t, float32_t, 2s, s, f32,) +__ST2_LANE_FUNC (float64x1x2_t, float64_t, 1d, d, f64,) +__ST2_LANE_FUNC (poly8x8x2_t, poly8_t, 8b, b, p8,) +__ST2_LANE_FUNC (poly16x4x2_t, poly16_t, 4h, h, p16,) +__ST2_LANE_FUNC (int16x4x2_t, int16_t, 4h, h, s16,) +__ST2_LANE_FUNC (int32x2x2_t, int32_t, 2s, s, s32,) +__ST2_LANE_FUNC (int64x1x2_t, int64_t, 1d, d, s64,) +__ST2_LANE_FUNC (uint8x8x2_t, uint8_t, 8b, b, u8,) +__ST2_LANE_FUNC (uint16x4x2_t, uint16_t, 4h, h, u16,) +__ST2_LANE_FUNC (uint32x2x2_t, uint32_t, 2s, s, u32,) +__ST2_LANE_FUNC (uint64x1x2_t, uint64_t, 1d, d, u64,) +__ST2_LANE_FUNC (float32x4x2_t, float32_t, 4s, s, f32, q) +__ST2_LANE_FUNC (float64x2x2_t, float64_t, 2d, d, f64, q) +__ST2_LANE_FUNC (poly8x16x2_t, poly8_t, 16b, b, p8, q) +__ST2_LANE_FUNC (poly16x8x2_t, poly16_t, 8h, h, p16, q) +__ST2_LANE_FUNC (int8x16x2_t, int8_t, 16b, b, s8, q) +__ST2_LANE_FUNC (int16x8x2_t, int16_t, 8h, h, s16, q) +__ST2_LANE_FUNC (int32x4x2_t, int32_t, 4s, s, s32, q) +__ST2_LANE_FUNC (int64x2x2_t, int64_t, 2d, d, s64, q) +__ST2_LANE_FUNC (uint8x16x2_t, uint8_t, 16b, b, u8, q) +__ST2_LANE_FUNC (uint16x8x2_t, uint16_t, 8h, h, u16, q) +__ST2_LANE_FUNC (uint32x4x2_t, uint32_t, 4s, s, u32, q) +__ST2_LANE_FUNC (uint64x2x2_t, uint64_t, 2d, d, u64, q) + +#define __ST3_LANE_FUNC(intype, ptrtype, regsuffix, \ + lnsuffix, funcsuffix, Q) \ + __extension__ static __inline void \ + __attribute__ ((__always_inline__)) \ + vst3 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \ + intype b, const int c) \ + { \ + __asm__ ("ld1 {v16." #regsuffix " - v18." #regsuffix "}, %1\n\t" \ + "st3 {v16." #lnsuffix " - v18." #lnsuffix "}[%2], %0\n\t" \ + : "=Q"(*(intype *) ptr) \ + : "Q"(b), "i"(c) \ + : "memory", "v16", "v17", "v18"); \ + } + +__ST3_LANE_FUNC (int8x8x3_t, int8_t, 8b, b, s8,) +__ST3_LANE_FUNC (float32x2x3_t, float32_t, 2s, s, f32,) +__ST3_LANE_FUNC (float64x1x3_t, float64_t, 1d, d, f64,) +__ST3_LANE_FUNC (poly8x8x3_t, poly8_t, 8b, b, p8,) +__ST3_LANE_FUNC (poly16x4x3_t, poly16_t, 4h, h, p16,) +__ST3_LANE_FUNC (int16x4x3_t, int16_t, 4h, h, s16,) +__ST3_LANE_FUNC (int32x2x3_t, int32_t, 2s, s, s32,) +__ST3_LANE_FUNC (int64x1x3_t, int64_t, 1d, d, s64,) +__ST3_LANE_FUNC (uint8x8x3_t, uint8_t, 8b, b, u8,) +__ST3_LANE_FUNC (uint16x4x3_t, uint16_t, 4h, h, u16,) +__ST3_LANE_FUNC (uint32x2x3_t, uint32_t, 2s, s, u32,) +__ST3_LANE_FUNC (uint64x1x3_t, uint64_t, 1d, d, u64,) +__ST3_LANE_FUNC (float32x4x3_t, float32_t, 4s, s, f32, q) +__ST3_LANE_FUNC (float64x2x3_t, float64_t, 2d, d, f64, q) +__ST3_LANE_FUNC (poly8x16x3_t, poly8_t, 16b, b, p8, q) +__ST3_LANE_FUNC (poly16x8x3_t, poly16_t, 8h, h, p16, q) +__ST3_LANE_FUNC (int8x16x3_t, int8_t, 16b, b, s8, q) +__ST3_LANE_FUNC (int16x8x3_t, int16_t, 8h, h, s16, q) +__ST3_LANE_FUNC (int32x4x3_t, int32_t, 4s, s, s32, q) +__ST3_LANE_FUNC (int64x2x3_t, int64_t, 2d, d, s64, q) +__ST3_LANE_FUNC (uint8x16x3_t, uint8_t, 16b, b, u8, q) +__ST3_LANE_FUNC (uint16x8x3_t, uint16_t, 8h, h, u16, q) +__ST3_LANE_FUNC (uint32x4x3_t, uint32_t, 4s, s, u32, q) +__ST3_LANE_FUNC (uint64x2x3_t, uint64_t, 2d, d, u64, q) + +#define __ST4_LANE_FUNC(intype, ptrtype, regsuffix, \ + lnsuffix, funcsuffix, Q) \ + __extension__ static __inline void \ + __attribute__ ((__always_inline__)) \ + vst4 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \ + intype b, const int c) \ + { \ + __asm__ ("ld1 {v16." #regsuffix " - v19." 
#regsuffix "}, %1\n\t" \ + "st4 {v16." #lnsuffix " - v19." #lnsuffix "}[%2], %0\n\t" \ + : "=Q"(*(intype *) ptr) \ + : "Q"(b), "i"(c) \ + : "memory", "v16", "v17", "v18", "v19"); \ + } + +__ST4_LANE_FUNC (int8x8x4_t, int8_t, 8b, b, s8,) +__ST4_LANE_FUNC (float32x2x4_t, float32_t, 2s, s, f32,) +__ST4_LANE_FUNC (float64x1x4_t, float64_t, 1d, d, f64,) +__ST4_LANE_FUNC (poly8x8x4_t, poly8_t, 8b, b, p8,) +__ST4_LANE_FUNC (poly16x4x4_t, poly16_t, 4h, h, p16,) +__ST4_LANE_FUNC (int16x4x4_t, int16_t, 4h, h, s16,) +__ST4_LANE_FUNC (int32x2x4_t, int32_t, 2s, s, s32,) +__ST4_LANE_FUNC (int64x1x4_t, int64_t, 1d, d, s64,) +__ST4_LANE_FUNC (uint8x8x4_t, uint8_t, 8b, b, u8,) +__ST4_LANE_FUNC (uint16x4x4_t, uint16_t, 4h, h, u16,) +__ST4_LANE_FUNC (uint32x2x4_t, uint32_t, 2s, s, u32,) +__ST4_LANE_FUNC (uint64x1x4_t, uint64_t, 1d, d, u64,) +__ST4_LANE_FUNC (float32x4x4_t, float32_t, 4s, s, f32, q) +__ST4_LANE_FUNC (float64x2x4_t, float64_t, 2d, d, f64, q) +__ST4_LANE_FUNC (poly8x16x4_t, poly8_t, 16b, b, p8, q) +__ST4_LANE_FUNC (poly16x8x4_t, poly16_t, 8h, h, p16, q) +__ST4_LANE_FUNC (int8x16x4_t, int8_t, 16b, b, s8, q) +__ST4_LANE_FUNC (int16x8x4_t, int16_t, 8h, h, s16, q) +__ST4_LANE_FUNC (int32x4x4_t, int32_t, 4s, s, s32, q) +__ST4_LANE_FUNC (int64x2x4_t, int64_t, 2d, d, s64, q) +__ST4_LANE_FUNC (uint8x16x4_t, uint8_t, 16b, b, u8, q) +__ST4_LANE_FUNC (uint16x8x4_t, uint16_t, 8h, h, u16, q) +__ST4_LANE_FUNC (uint32x4x4_t, uint32_t, 4s, s, u32, q) +__ST4_LANE_FUNC (uint64x2x4_t, uint64_t, 2d, d, u64, q) + +__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +vaddlv_s32 (int32x2_t a) +{ + int64_t result; + __asm__ ("saddlp %0.1d, %1.2s" : "=w"(result) : "w"(a) : ); + return result; +} + +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vaddlv_u32 (uint32x2_t a) +{ + uint64_t result; + __asm__ ("uaddlp %0.1d, %1.2s" : "=w"(result) : "w"(a) : ); + return result; +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vpaddd_s64 (int64x2_t __a) +{ + return __builtin_aarch64_addpdi (__a); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vqdmulh_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_sqdmulh_laneqv4hi (__a, __b, __c); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vqdmulh_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c) +{ + return __builtin_aarch64_sqdmulh_laneqv2si (__a, __b, __c); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vqdmulhq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_sqdmulh_laneqv8hi (__a, __b, __c); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vqdmulhq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c) +{ + return __builtin_aarch64_sqdmulh_laneqv4si (__a, __b, __c); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vqrdmulh_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_sqrdmulh_laneqv4hi (__a, __b, __c); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vqrdmulh_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c) +{ + return __builtin_aarch64_sqrdmulh_laneqv2si (__a, __b, __c); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vqrdmulhq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_sqrdmulh_laneqv8hi 
(__a, __b, __c); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vqrdmulhq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c) +{ + return __builtin_aarch64_sqrdmulh_laneqv4si (__a, __b, __c); +} + +/* Table intrinsics. */ + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vqtbl1_p8 (poly8x16_t a, uint8x8_t b) +{ + poly8x8_t result; + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vqtbl1_s8 (int8x16_t a, int8x8_t b) +{ + int8x8_t result; + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vqtbl1_u8 (uint8x16_t a, uint8x8_t b) +{ + uint8x8_t result; + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vqtbl1q_p8 (poly8x16_t a, uint8x16_t b) +{ + poly8x16_t result; + __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vqtbl1q_s8 (int8x16_t a, int8x16_t b) +{ + int8x16_t result; + __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vqtbl1q_u8 (uint8x16_t a, uint8x16_t b) +{ + uint8x16_t result; + __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" + : "=w"(result) + : "w"(a), "w"(b) + : /* No clobbers */); + return result; +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vqtbl2_s8 (int8x16x2_t tab, int8x8_t idx) +{ + int8x8_t result; + __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" + "tbl %0.8b, {v16.16b, v17.16b}, %2.8b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17"); + return result; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vqtbl2_u8 (uint8x16x2_t tab, uint8x8_t idx) +{ + uint8x8_t result; + __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" + "tbl %0.8b, {v16.16b, v17.16b}, %2.8b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17"); + return result; +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vqtbl2_p8 (poly8x16x2_t tab, uint8x8_t idx) +{ + poly8x8_t result; + __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" + "tbl %0.8b, {v16.16b, v17.16b}, %2.8b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17"); + return result; +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vqtbl2q_s8 (int8x16x2_t tab, int8x16_t idx) +{ + int8x16_t result; + __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" + "tbl %0.16b, {v16.16b, v17.16b}, %2.16b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17"); + return result; +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vqtbl2q_u8 (uint8x16x2_t tab, uint8x16_t idx) +{ + uint8x16_t result; + __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" + "tbl %0.16b, {v16.16b, v17.16b}, %2.16b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17"); + return result; +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vqtbl2q_p8 (poly8x16x2_t 
tab, uint8x16_t idx) +{ + poly8x16_t result; + __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" + "tbl %0.16b, {v16.16b, v17.16b}, %2.16b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17"); + return result; +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vqtbl3_s8 (int8x16x3_t tab, int8x8_t idx) +{ + int8x8_t result; + __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" + "tbl %0.8b, {v16.16b - v18.16b}, %2.8b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18"); + return result; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vqtbl3_u8 (uint8x16x3_t tab, uint8x8_t idx) +{ + uint8x8_t result; + __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" + "tbl %0.8b, {v16.16b - v18.16b}, %2.8b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18"); + return result; +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vqtbl3_p8 (poly8x16x3_t tab, uint8x8_t idx) +{ + poly8x8_t result; + __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" + "tbl %0.8b, {v16.16b - v18.16b}, %2.8b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18"); + return result; +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vqtbl3q_s8 (int8x16x3_t tab, int8x16_t idx) +{ + int8x16_t result; + __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" + "tbl %0.16b, {v16.16b - v18.16b}, %2.16b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18"); + return result; +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vqtbl3q_u8 (uint8x16x3_t tab, uint8x16_t idx) +{ + uint8x16_t result; + __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" + "tbl %0.16b, {v16.16b - v18.16b}, %2.16b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18"); + return result; +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vqtbl3q_p8 (poly8x16x3_t tab, uint8x16_t idx) +{ + poly8x16_t result; + __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" + "tbl %0.16b, {v16.16b - v18.16b}, %2.16b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18"); + return result; +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vqtbl4_s8 (int8x16x4_t tab, int8x8_t idx) +{ + int8x8_t result; + __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" + "tbl %0.8b, {v16.16b - v19.16b}, %2.8b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18", "v19"); + return result; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vqtbl4_u8 (uint8x16x4_t tab, uint8x8_t idx) +{ + uint8x8_t result; + __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" + "tbl %0.8b, {v16.16b - v19.16b}, %2.8b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18", "v19"); + return result; +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vqtbl4_p8 (poly8x16x4_t tab, uint8x8_t idx) +{ + poly8x8_t result; + __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" + "tbl %0.8b, {v16.16b - v19.16b}, %2.8b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18", "v19"); + return result; +} + + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vqtbl4q_s8 (int8x16x4_t tab, int8x16_t idx) +{ + int8x16_t result; + __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" + "tbl %0.16b, {v16.16b - v19.16b}, %2.16b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18", "v19"); + return 
result; +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vqtbl4q_u8 (uint8x16x4_t tab, uint8x16_t idx) +{ + uint8x16_t result; + __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" + "tbl %0.16b, {v16.16b - v19.16b}, %2.16b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18", "v19"); + return result; +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vqtbl4q_p8 (poly8x16x4_t tab, uint8x16_t idx) +{ + poly8x16_t result; + __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" + "tbl %0.16b, {v16.16b - v19.16b}, %2.16b\n\t" + :"=w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18", "v19"); + return result; +} + + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vqtbx1_s8 (int8x8_t r, int8x16_t tab, int8x8_t idx) +{ + int8x8_t result = r; + __asm__ ("tbx %0.8b,{%1.16b},%2.8b" + : "+w"(result) + : "w"(tab), "w"(idx) + : /* No clobbers */); + return result; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vqtbx1_u8 (uint8x8_t r, uint8x16_t tab, uint8x8_t idx) +{ + uint8x8_t result = r; + __asm__ ("tbx %0.8b,{%1.16b},%2.8b" + : "+w"(result) + : "w"(tab), "w"(idx) + : /* No clobbers */); + return result; +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vqtbx1_p8 (poly8x8_t r, poly8x16_t tab, uint8x8_t idx) +{ + poly8x8_t result = r; + __asm__ ("tbx %0.8b,{%1.16b},%2.8b" + : "+w"(result) + : "w"(tab), "w"(idx) + : /* No clobbers */); + return result; +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vqtbx1q_s8 (int8x16_t r, int8x16_t tab, int8x16_t idx) +{ + int8x16_t result = r; + __asm__ ("tbx %0.16b,{%1.16b},%2.16b" + : "+w"(result) + : "w"(tab), "w"(idx) + : /* No clobbers */); + return result; +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vqtbx1q_u8 (uint8x16_t r, uint8x16_t tab, uint8x16_t idx) +{ + uint8x16_t result = r; + __asm__ ("tbx %0.16b,{%1.16b},%2.16b" + : "+w"(result) + : "w"(tab), "w"(idx) + : /* No clobbers */); + return result; +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vqtbx1q_p8 (poly8x16_t r, poly8x16_t tab, uint8x16_t idx) +{ + poly8x16_t result = r; + __asm__ ("tbx %0.16b,{%1.16b},%2.16b" + : "+w"(result) + : "w"(tab), "w"(idx) + : /* No clobbers */); + return result; +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vqtbx2_s8 (int8x8_t r, int8x16x2_t tab, int8x8_t idx) +{ + int8x8_t result = r; + __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" + "tbx %0.8b, {v16.16b, v17.16b}, %2.8b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17"); + return result; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vqtbx2_u8 (uint8x8_t r, uint8x16x2_t tab, uint8x8_t idx) +{ + uint8x8_t result = r; + __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" + "tbx %0.8b, {v16.16b, v17.16b}, %2.8b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17"); + return result; +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vqtbx2_p8 (poly8x8_t r, poly8x16x2_t tab, uint8x8_t idx) +{ + poly8x8_t result = r; + __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" + "tbx %0.8b, {v16.16b, v17.16b}, %2.8b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17"); + return result; +} + + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vqtbx2q_s8 (int8x16_t r, int8x16x2_t tab, 
int8x16_t idx) +{ + int8x16_t result = r; + __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" + "tbx %0.16b, {v16.16b, v17.16b}, %2.16b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17"); + return result; +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vqtbx2q_u8 (uint8x16_t r, uint8x16x2_t tab, uint8x16_t idx) +{ + uint8x16_t result = r; + __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" + "tbx %0.16b, {v16.16b, v17.16b}, %2.16b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17"); + return result; +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vqtbx2q_p8 (poly8x16_t r, poly8x16x2_t tab, uint8x16_t idx) +{ + poly8x16_t result = r; + __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" + "tbx %0.16b, {v16.16b, v17.16b}, %2.16b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17"); + return result; +} + + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vqtbx3_s8 (int8x8_t r, int8x16x3_t tab, int8x8_t idx) +{ + int8x8_t result = r; + __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" + "tbx %0.8b, {v16.16b - v18.16b}, %2.8b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18"); + return result; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vqtbx3_u8 (uint8x8_t r, uint8x16x3_t tab, uint8x8_t idx) +{ + uint8x8_t result = r; + __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" + "tbx %0.8b, {v16.16b - v18.16b}, %2.8b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18"); + return result; +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vqtbx3_p8 (poly8x8_t r, poly8x16x3_t tab, uint8x8_t idx) +{ + poly8x8_t result = r; + __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" + "tbx %0.8b, {v16.16b - v18.16b}, %2.8b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18"); + return result; +} + + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vqtbx3q_s8 (int8x16_t r, int8x16x3_t tab, int8x16_t idx) +{ + int8x16_t result = r; + __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" + "tbx %0.16b, {v16.16b - v18.16b}, %2.16b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18"); + return result; +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vqtbx3q_u8 (uint8x16_t r, uint8x16x3_t tab, uint8x16_t idx) +{ + uint8x16_t result = r; + __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" + "tbx %0.16b, {v16.16b - v18.16b}, %2.16b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18"); + return result; +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vqtbx3q_p8 (poly8x16_t r, poly8x16x3_t tab, uint8x16_t idx) +{ + poly8x16_t result = r; + __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" + "tbx %0.16b, {v16.16b - v18.16b}, %2.16b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18"); + return result; +} + + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vqtbx4_s8 (int8x8_t r, int8x16x4_t tab, int8x8_t idx) +{ + int8x8_t result = r; + __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" + "tbx %0.8b, {v16.16b - v19.16b}, %2.8b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18", "v19"); + return result; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vqtbx4_u8 (uint8x8_t r, uint8x16x4_t tab, uint8x8_t idx) +{ + uint8x8_t result = r; + __asm__ ("ld1 {v16.16b - 
v19.16b}, %1\n\t" + "tbx %0.8b, {v16.16b - v19.16b}, %2.8b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18", "v19"); + return result; +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vqtbx4_p8 (poly8x8_t r, poly8x16x4_t tab, uint8x8_t idx) +{ + poly8x8_t result = r; + __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" + "tbx %0.8b, {v16.16b - v19.16b}, %2.8b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18", "v19"); + return result; +} + + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vqtbx4q_s8 (int8x16_t r, int8x16x4_t tab, int8x16_t idx) +{ + int8x16_t result = r; + __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" + "tbx %0.16b, {v16.16b - v19.16b}, %2.16b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18", "v19"); + return result; +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vqtbx4q_u8 (uint8x16_t r, uint8x16x4_t tab, uint8x16_t idx) +{ + uint8x16_t result = r; + __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" + "tbx %0.16b, {v16.16b - v19.16b}, %2.16b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18", "v19"); + return result; +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vqtbx4q_p8 (poly8x16_t r, poly8x16x4_t tab, uint8x16_t idx) +{ + poly8x16_t result = r; + __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" + "tbx %0.16b, {v16.16b - v19.16b}, %2.16b\n\t" + :"+w"(result) + :"Q"(tab),"w"(idx) + :"memory", "v16", "v17", "v18", "v19"); + return result; +} + +/* V7 legacy table intrinsics. */ + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vtbl1_s8 (int8x8_t tab, int8x8_t idx) +{ + int8x8_t result; + int8x16_t temp = vcombine_s8 (tab, vcreate_s8 (__AARCH64_UINT64_C (0x0))); + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" + : "=w"(result) + : "w"(temp), "w"(idx) + : /* No clobbers */); + return result; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtbl1_u8 (uint8x8_t tab, uint8x8_t idx) +{ + uint8x8_t result; + uint8x16_t temp = vcombine_u8 (tab, vcreate_u8 (__AARCH64_UINT64_C (0x0))); + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" + : "=w"(result) + : "w"(temp), "w"(idx) + : /* No clobbers */); + return result; +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vtbl1_p8 (poly8x8_t tab, uint8x8_t idx) +{ + poly8x8_t result; + poly8x16_t temp = vcombine_p8 (tab, vcreate_p8 (__AARCH64_UINT64_C (0x0))); + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" + : "=w"(result) + : "w"(temp), "w"(idx) + : /* No clobbers */); + return result; +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vtbl2_s8 (int8x8x2_t tab, int8x8_t idx) +{ + int8x8_t result; + int8x16_t temp = vcombine_s8 (tab.val[0], tab.val[1]); + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" + : "=w"(result) + : "w"(temp), "w"(idx) + : /* No clobbers */); + return result; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtbl2_u8 (uint8x8x2_t tab, uint8x8_t idx) +{ + uint8x8_t result; + uint8x16_t temp = vcombine_u8 (tab.val[0], tab.val[1]); + __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" + : "=w"(result) + : "w"(temp), "w"(idx) + : /* No clobbers */); + return result; +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vtbl2_p8 (poly8x8x2_t tab, uint8x8_t idx) +{ + poly8x8_t result; + poly8x16_t temp = vcombine_p8 (tab.val[0], tab.val[1]); + __asm__ ("tbl %0.8b, {%1.16b}, 
%2.8b" + : "=w"(result) + : "w"(temp), "w"(idx) + : /* No clobbers */); + return result; +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vtbl3_s8 (int8x8x3_t tab, int8x8_t idx) +{ + int8x8_t result; + int8x16x2_t temp; + temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]); + temp.val[1] = vcombine_s8 (tab.val[2], vcreate_s8 (__AARCH64_UINT64_C (0x0))); + __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" + "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" + : "=w"(result) + : "Q"(temp), "w"(idx) + : "v16", "v17", "memory"); + return result; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtbl3_u8 (uint8x8x3_t tab, uint8x8_t idx) +{ + uint8x8_t result; + uint8x16x2_t temp; + temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); + temp.val[1] = vcombine_u8 (tab.val[2], vcreate_u8 (__AARCH64_UINT64_C (0x0))); + __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" + "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" + : "=w"(result) + : "Q"(temp), "w"(idx) + : "v16", "v17", "memory"); + return result; +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vtbl3_p8 (poly8x8x3_t tab, uint8x8_t idx) +{ + poly8x8_t result; + poly8x16x2_t temp; + temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); + temp.val[1] = vcombine_p8 (tab.val[2], vcreate_p8 (__AARCH64_UINT64_C (0x0))); + __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" + "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" + : "=w"(result) + : "Q"(temp), "w"(idx) + : "v16", "v17", "memory"); + return result; +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vtbl4_s8 (int8x8x4_t tab, int8x8_t idx) +{ + int8x8_t result; + int8x16x2_t temp; + temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]); + temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]); + __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" + "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" + : "=w"(result) + : "Q"(temp), "w"(idx) + : "v16", "v17", "memory"); + return result; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtbl4_u8 (uint8x8x4_t tab, uint8x8_t idx) +{ + uint8x8_t result; + uint8x16x2_t temp; + temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); + temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]); + __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" + "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" + : "=w"(result) + : "Q"(temp), "w"(idx) + : "v16", "v17", "memory"); + return result; +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vtbl4_p8 (poly8x8x4_t tab, uint8x8_t idx) +{ + poly8x8_t result; + poly8x16x2_t temp; + temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); + temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]); + __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" + "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" + : "=w"(result) + : "Q"(temp), "w"(idx) + : "v16", "v17", "memory"); + return result; +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vtbx1_s8 (int8x8_t r, int8x8_t tab, int8x8_t idx) +{ + int8x8_t result; + int8x8_t tmp1; + int8x16_t temp = vcombine_s8 (tab, vcreate_s8 (__AARCH64_UINT64_C (0x0))); + __asm__ ("movi %0.8b, 8\n\t" + "cmhs %0.8b, %3.8b, %0.8b\n\t" + "tbl %1.8b, {%2.16b}, %3.8b\n\t" + "bsl %0.8b, %4.8b, %1.8b\n\t" + : "+w"(result), "=w"(tmp1) + : "w"(temp), "w"(idx), "w"(r) + : /* No clobbers */); + return result; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtbx1_u8 (uint8x8_t r, uint8x8_t tab, uint8x8_t idx) +{ + uint8x8_t result; + uint8x8_t tmp1; 
+ uint8x16_t temp = vcombine_u8 (tab, vcreate_u8 (__AARCH64_UINT64_C (0x0))); + __asm__ ("movi %0.8b, 8\n\t" + "cmhs %0.8b, %3.8b, %0.8b\n\t" + "tbl %1.8b, {%2.16b}, %3.8b\n\t" + "bsl %0.8b, %4.8b, %1.8b\n\t" + : "+w"(result), "=w"(tmp1) + : "w"(temp), "w"(idx), "w"(r) + : /* No clobbers */); + return result; +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vtbx1_p8 (poly8x8_t r, poly8x8_t tab, uint8x8_t idx) +{ + poly8x8_t result; + poly8x8_t tmp1; + poly8x16_t temp = vcombine_p8 (tab, vcreate_p8 (__AARCH64_UINT64_C (0x0))); + __asm__ ("movi %0.8b, 8\n\t" + "cmhs %0.8b, %3.8b, %0.8b\n\t" + "tbl %1.8b, {%2.16b}, %3.8b\n\t" + "bsl %0.8b, %4.8b, %1.8b\n\t" + : "+w"(result), "=w"(tmp1) + : "w"(temp), "w"(idx), "w"(r) + : /* No clobbers */); + return result; +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vtbx2_s8 (int8x8_t r, int8x8x2_t tab, int8x8_t idx) +{ + int8x8_t result = r; + int8x16_t temp = vcombine_s8 (tab.val[0], tab.val[1]); + __asm__ ("tbx %0.8b, {%1.16b}, %2.8b" + : "+w"(result) + : "w"(temp), "w"(idx) + : /* No clobbers */); + return result; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtbx2_u8 (uint8x8_t r, uint8x8x2_t tab, uint8x8_t idx) +{ + uint8x8_t result = r; + uint8x16_t temp = vcombine_u8 (tab.val[0], tab.val[1]); + __asm__ ("tbx %0.8b, {%1.16b}, %2.8b" + : "+w"(result) + : "w"(temp), "w"(idx) + : /* No clobbers */); + return result; +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vtbx2_p8 (poly8x8_t r, poly8x8x2_t tab, uint8x8_t idx) +{ + poly8x8_t result = r; + poly8x16_t temp = vcombine_p8 (tab.val[0], tab.val[1]); + __asm__ ("tbx %0.8b, {%1.16b}, %2.8b" + : "+w"(result) + : "w"(temp), "w"(idx) + : /* No clobbers */); + return result; +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vtbx3_s8 (int8x8_t r, int8x8x3_t tab, int8x8_t idx) +{ + int8x8_t result; + int8x8_t tmp1; + int8x16x2_t temp; + temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]); + temp.val[1] = vcombine_s8 (tab.val[2], vcreate_s8 (__AARCH64_UINT64_C (0x0))); + __asm__ ("ld1 {v16.16b - v17.16b}, %2\n\t" + "movi %0.8b, 24\n\t" + "cmhs %0.8b, %3.8b, %0.8b\n\t" + "tbl %1.8b, {v16.16b - v17.16b}, %3.8b\n\t" + "bsl %0.8b, %4.8b, %1.8b\n\t" + : "+w"(result), "=w"(tmp1) + : "Q"(temp), "w"(idx), "w"(r) + : "v16", "v17", "memory"); + return result; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtbx3_u8 (uint8x8_t r, uint8x8x3_t tab, uint8x8_t idx) +{ + uint8x8_t result; + uint8x8_t tmp1; + uint8x16x2_t temp; + temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); + temp.val[1] = vcombine_u8 (tab.val[2], vcreate_u8 (__AARCH64_UINT64_C (0x0))); + __asm__ ("ld1 {v16.16b - v17.16b}, %2\n\t" + "movi %0.8b, 24\n\t" + "cmhs %0.8b, %3.8b, %0.8b\n\t" + "tbl %1.8b, {v16.16b - v17.16b}, %3.8b\n\t" + "bsl %0.8b, %4.8b, %1.8b\n\t" + : "+w"(result), "=w"(tmp1) + : "Q"(temp), "w"(idx), "w"(r) + : "v16", "v17", "memory"); + return result; +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vtbx3_p8 (poly8x8_t r, poly8x8x3_t tab, uint8x8_t idx) +{ + poly8x8_t result; + poly8x8_t tmp1; + poly8x16x2_t temp; + temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); + temp.val[1] = vcombine_p8 (tab.val[2], vcreate_p8 (__AARCH64_UINT64_C (0x0))); + __asm__ ("ld1 {v16.16b - v17.16b}, %2\n\t" + "movi %0.8b, 24\n\t" + "cmhs %0.8b, %3.8b, %0.8b\n\t" + "tbl %1.8b, {v16.16b - v17.16b}, %3.8b\n\t" + "bsl 
%0.8b, %4.8b, %1.8b\n\t" + : "+w"(result), "=w"(tmp1) + : "Q"(temp), "w"(idx), "w"(r) + : "v16", "v17", "memory"); + return result; +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vtbx4_s8 (int8x8_t r, int8x8x4_t tab, int8x8_t idx) +{ + int8x8_t result = r; + int8x16x2_t temp; + temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]); + temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]); + __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" + "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" + : "+w"(result) + : "Q"(temp), "w"(idx) + : "v16", "v17", "memory"); + return result; +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vtbx4_u8 (uint8x8_t r, uint8x8x4_t tab, uint8x8_t idx) +{ + uint8x8_t result = r; + uint8x16x2_t temp; + temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); + temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]); + __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" + "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" + : "+w"(result) + : "Q"(temp), "w"(idx) + : "v16", "v17", "memory"); + return result; +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vtbx4_p8 (poly8x8_t r, poly8x8x4_t tab, uint8x8_t idx) +{ + poly8x8_t result = r; + poly8x16x2_t temp; + temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); + temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]); + __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" + "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" + : "+w"(result) + : "Q"(temp), "w"(idx) + : "v16", "v17", "memory"); + return result; +} + +/* End of temporary inline asm. */ -__LD4_LANE_FUNC (int8x8x4_t, uint8_t, 8b, b, s8,) -__LD4_LANE_FUNC (float32x2x4_t, float32_t, 2s, s, f32,) -__LD4_LANE_FUNC (float64x1x4_t, float64_t, 1d, d, f64,) -__LD4_LANE_FUNC (poly8x8x4_t, poly8_t, 8b, b, p8,) -__LD4_LANE_FUNC (poly16x4x4_t, poly16_t, 4h, h, p16,) -__LD4_LANE_FUNC (int16x4x4_t, int16_t, 4h, h, s16,) -__LD4_LANE_FUNC (int32x2x4_t, int32_t, 2s, s, s32,) -__LD4_LANE_FUNC (int64x1x4_t, int64_t, 1d, d, s64,) -__LD4_LANE_FUNC (uint8x8x4_t, uint8_t, 8b, b, u8,) -__LD4_LANE_FUNC (uint16x4x4_t, uint16_t, 4h, h, u16,) -__LD4_LANE_FUNC (uint32x2x4_t, uint32_t, 2s, s, u32,) -__LD4_LANE_FUNC (uint64x1x4_t, uint64_t, 1d, d, u64,) -__LD4_LANE_FUNC (float32x4x4_t, float32_t, 4s, s, f32, q) -__LD4_LANE_FUNC (float64x2x4_t, float64_t, 2d, d, f64, q) -__LD4_LANE_FUNC (poly8x16x4_t, poly8_t, 16b, b, p8, q) -__LD4_LANE_FUNC (poly16x8x4_t, poly16_t, 8h, h, p16, q) -__LD4_LANE_FUNC (int8x16x4_t, int8_t, 16b, b, s8, q) -__LD4_LANE_FUNC (int16x8x4_t, int16_t, 8h, h, s16, q) -__LD4_LANE_FUNC (int32x4x4_t, int32_t, 4s, s, s32, q) -__LD4_LANE_FUNC (int64x2x4_t, int64_t, 2d, d, s64, q) -__LD4_LANE_FUNC (uint8x16x4_t, uint8_t, 16b, b, u8, q) -__LD4_LANE_FUNC (uint16x8x4_t, uint16_t, 8h, h, u16, q) -__LD4_LANE_FUNC (uint32x4x4_t, uint32_t, 4s, s, u32, q) -__LD4_LANE_FUNC (uint64x2x4_t, uint64_t, 2d, d, u64, q) +/* Start of optimal implementations in approved order. */ -#define __ST2_LANE_FUNC(intype, ptrtype, regsuffix, \ - lnsuffix, funcsuffix, Q) \ - __extension__ static __inline void \ - __attribute__ ((__always_inline__)) \ - vst2 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \ - intype b, const int c) \ - { \ - __asm__ ("ld1 {v16." #regsuffix ", v17." #regsuffix "}, %1\n\t" \ - "st2 {v16." #lnsuffix ", v17." 
#lnsuffix "}[%2], %0\n\t" \ - : "=Q"(*(intype *) ptr) \ - : "Q"(b), "i"(c) \ - : "memory", "v16", "v17"); \ - } +/* vabs */ -__ST2_LANE_FUNC (int8x8x2_t, int8_t, 8b, b, s8,) -__ST2_LANE_FUNC (float32x2x2_t, float32_t, 2s, s, f32,) -__ST2_LANE_FUNC (float64x1x2_t, float64_t, 1d, d, f64,) -__ST2_LANE_FUNC (poly8x8x2_t, poly8_t, 8b, b, p8,) -__ST2_LANE_FUNC (poly16x4x2_t, poly16_t, 4h, h, p16,) -__ST2_LANE_FUNC (int16x4x2_t, int16_t, 4h, h, s16,) -__ST2_LANE_FUNC (int32x2x2_t, int32_t, 2s, s, s32,) -__ST2_LANE_FUNC (int64x1x2_t, int64_t, 1d, d, s64,) -__ST2_LANE_FUNC (uint8x8x2_t, uint8_t, 8b, b, u8,) -__ST2_LANE_FUNC (uint16x4x2_t, uint16_t, 4h, h, u16,) -__ST2_LANE_FUNC (uint32x2x2_t, uint32_t, 2s, s, u32,) -__ST2_LANE_FUNC (uint64x1x2_t, uint64_t, 1d, d, u64,) -__ST2_LANE_FUNC (float32x4x2_t, float32_t, 4s, s, f32, q) -__ST2_LANE_FUNC (float64x2x2_t, float64_t, 2d, d, f64, q) -__ST2_LANE_FUNC (poly8x16x2_t, poly8_t, 16b, b, p8, q) -__ST2_LANE_FUNC (poly16x8x2_t, poly16_t, 8h, h, p16, q) -__ST2_LANE_FUNC (int8x16x2_t, int8_t, 16b, b, s8, q) -__ST2_LANE_FUNC (int16x8x2_t, int16_t, 8h, h, s16, q) -__ST2_LANE_FUNC (int32x4x2_t, int32_t, 4s, s, s32, q) -__ST2_LANE_FUNC (int64x2x2_t, int64_t, 2d, d, s64, q) -__ST2_LANE_FUNC (uint8x16x2_t, uint8_t, 16b, b, u8, q) -__ST2_LANE_FUNC (uint16x8x2_t, uint16_t, 8h, h, u16, q) -__ST2_LANE_FUNC (uint32x4x2_t, uint32_t, 4s, s, u32, q) -__ST2_LANE_FUNC (uint64x2x2_t, uint64_t, 2d, d, u64, q) +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vabs_f32 (float32x2_t __a) +{ + return __builtin_aarch64_absv2sf (__a); +} -#define __ST3_LANE_FUNC(intype, ptrtype, regsuffix, \ - lnsuffix, funcsuffix, Q) \ - __extension__ static __inline void \ - __attribute__ ((__always_inline__)) \ - vst3 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \ - intype b, const int c) \ - { \ - __asm__ ("ld1 {v16." #regsuffix " - v18." #regsuffix "}, %1\n\t" \ - "st3 {v16." #lnsuffix " - v18." 
#lnsuffix "}[%2], %0\n\t" \ - : "=Q"(*(intype *) ptr) \ - : "Q"(b), "i"(c) \ - : "memory", "v16", "v17", "v18"); \ - } +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) +vabs_f64 (float64x1_t __a) +{ + return __builtin_fabs (__a); +} -__ST3_LANE_FUNC (int8x8x3_t, int8_t, 8b, b, s8,) -__ST3_LANE_FUNC (float32x2x3_t, float32_t, 2s, s, f32,) -__ST3_LANE_FUNC (float64x1x3_t, float64_t, 1d, d, f64,) -__ST3_LANE_FUNC (poly8x8x3_t, poly8_t, 8b, b, p8,) -__ST3_LANE_FUNC (poly16x4x3_t, poly16_t, 4h, h, p16,) -__ST3_LANE_FUNC (int16x4x3_t, int16_t, 4h, h, s16,) -__ST3_LANE_FUNC (int32x2x3_t, int32_t, 2s, s, s32,) -__ST3_LANE_FUNC (int64x1x3_t, int64_t, 1d, d, s64,) -__ST3_LANE_FUNC (uint8x8x3_t, uint8_t, 8b, b, u8,) -__ST3_LANE_FUNC (uint16x4x3_t, uint16_t, 4h, h, u16,) -__ST3_LANE_FUNC (uint32x2x3_t, uint32_t, 2s, s, u32,) -__ST3_LANE_FUNC (uint64x1x3_t, uint64_t, 1d, d, u64,) -__ST3_LANE_FUNC (float32x4x3_t, float32_t, 4s, s, f32, q) -__ST3_LANE_FUNC (float64x2x3_t, float64_t, 2d, d, f64, q) -__ST3_LANE_FUNC (poly8x16x3_t, poly8_t, 16b, b, p8, q) -__ST3_LANE_FUNC (poly16x8x3_t, poly16_t, 8h, h, p16, q) -__ST3_LANE_FUNC (int8x16x3_t, int8_t, 16b, b, s8, q) -__ST3_LANE_FUNC (int16x8x3_t, int16_t, 8h, h, s16, q) -__ST3_LANE_FUNC (int32x4x3_t, int32_t, 4s, s, s32, q) -__ST3_LANE_FUNC (int64x2x3_t, int64_t, 2d, d, s64, q) -__ST3_LANE_FUNC (uint8x16x3_t, uint8_t, 16b, b, u8, q) -__ST3_LANE_FUNC (uint16x8x3_t, uint16_t, 8h, h, u16, q) -__ST3_LANE_FUNC (uint32x4x3_t, uint32_t, 4s, s, u32, q) -__ST3_LANE_FUNC (uint64x2x3_t, uint64_t, 2d, d, u64, q) +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vabs_s8 (int8x8_t __a) +{ + return __builtin_aarch64_absv8qi (__a); +} -#define __ST4_LANE_FUNC(intype, ptrtype, regsuffix, \ - lnsuffix, funcsuffix, Q) \ - __extension__ static __inline void \ - __attribute__ ((__always_inline__)) \ - vst4 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \ - intype b, const int c) \ - { \ - __asm__ ("ld1 {v16." #regsuffix " - v19." #regsuffix "}, %1\n\t" \ - "st4 {v16." #lnsuffix " - v19." 
#lnsuffix "}[%2], %0\n\t" \ - : "=Q"(*(intype *) ptr) \ - : "Q"(b), "i"(c) \ - : "memory", "v16", "v17", "v18", "v19"); \ - } +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vabs_s16 (int16x4_t __a) +{ + return __builtin_aarch64_absv4hi (__a); +} -__ST4_LANE_FUNC (int8x8x4_t, int8_t, 8b, b, s8,) -__ST4_LANE_FUNC (float32x2x4_t, float32_t, 2s, s, f32,) -__ST4_LANE_FUNC (float64x1x4_t, float64_t, 1d, d, f64,) -__ST4_LANE_FUNC (poly8x8x4_t, poly8_t, 8b, b, p8,) -__ST4_LANE_FUNC (poly16x4x4_t, poly16_t, 4h, h, p16,) -__ST4_LANE_FUNC (int16x4x4_t, int16_t, 4h, h, s16,) -__ST4_LANE_FUNC (int32x2x4_t, int32_t, 2s, s, s32,) -__ST4_LANE_FUNC (int64x1x4_t, int64_t, 1d, d, s64,) -__ST4_LANE_FUNC (uint8x8x4_t, uint8_t, 8b, b, u8,) -__ST4_LANE_FUNC (uint16x4x4_t, uint16_t, 4h, h, u16,) -__ST4_LANE_FUNC (uint32x2x4_t, uint32_t, 2s, s, u32,) -__ST4_LANE_FUNC (uint64x1x4_t, uint64_t, 1d, d, u64,) -__ST4_LANE_FUNC (float32x4x4_t, float32_t, 4s, s, f32, q) -__ST4_LANE_FUNC (float64x2x4_t, float64_t, 2d, d, f64, q) -__ST4_LANE_FUNC (poly8x16x4_t, poly8_t, 16b, b, p8, q) -__ST4_LANE_FUNC (poly16x8x4_t, poly16_t, 8h, h, p16, q) -__ST4_LANE_FUNC (int8x16x4_t, int8_t, 16b, b, s8, q) -__ST4_LANE_FUNC (int16x8x4_t, int16_t, 8h, h, s16, q) -__ST4_LANE_FUNC (int32x4x4_t, int32_t, 4s, s, s32, q) -__ST4_LANE_FUNC (int64x2x4_t, int64_t, 2d, d, s64, q) -__ST4_LANE_FUNC (uint8x16x4_t, uint8_t, 16b, b, u8, q) -__ST4_LANE_FUNC (uint16x8x4_t, uint16_t, 8h, h, u16, q) -__ST4_LANE_FUNC (uint32x4x4_t, uint32_t, 4s, s, u32, q) -__ST4_LANE_FUNC (uint64x2x4_t, uint64_t, 2d, d, u64, q) +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vabs_s32 (int32x2_t __a) +{ + return __builtin_aarch64_absv2si (__a); +} -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vaddlv_s32 (int32x2_t a) +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vabs_s64 (int64x1_t __a) { - int64_t result; - __asm__ ("saddlp %0.1d, %1.2s" : "=w"(result) : "w"(a) : ); - return result; + return __builtin_llabs (__a); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vaddlv_u32 (uint32x2_t a) +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vabsq_f32 (float32x4_t __a) { - uint64_t result; - __asm__ ("uaddlp %0.1d, %1.2s" : "=w"(result) : "w"(a) : ); - return result; + return __builtin_aarch64_absv4sf (__a); +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vabsq_f64 (float64x2_t __a) +{ + return __builtin_aarch64_absv2df (__a); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vabsq_s8 (int8x16_t __a) +{ + return __builtin_aarch64_absv16qi (__a); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vabsq_s16 (int16x8_t __a) +{ + return __builtin_aarch64_absv8hi (__a); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vabsq_s32 (int32x4_t __a) +{ + return __builtin_aarch64_absv4si (__a); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vabsq_s64 (int64x2_t __a) +{ + return __builtin_aarch64_absv2di (__a); } +/* vadd */ + __extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vpaddd_s64 (int64x2_t __a) +vaddd_s64 (int64x1_t __a, int64x1_t __b) { - return __builtin_aarch64_addpdi (__a); + return __a + __b; } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vqdmulh_laneq_s16 (int16x4_t __a, 
int16x8_t __b, const int __c) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vaddd_u64 (uint64x1_t __a, uint64x1_t __b) { - return __builtin_aarch64_sqdmulh_laneqv4hi (__a, __b, __c); + return __a + __b; } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vqdmulh_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c) +/* vaddv */ + +__extension__ static __inline int8_t __attribute__ ((__always_inline__)) +vaddv_s8 (int8x8_t __a) +{ + return vget_lane_s8 (__builtin_aarch64_reduc_splus_v8qi (__a), 0); +} + +__extension__ static __inline int16_t __attribute__ ((__always_inline__)) +vaddv_s16 (int16x4_t __a) +{ + return vget_lane_s16 (__builtin_aarch64_reduc_splus_v4hi (__a), 0); +} + +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vaddv_s32 (int32x2_t __a) { - return __builtin_aarch64_sqdmulh_laneqv2si (__a, __b, __c); + return vget_lane_s32 (__builtin_aarch64_reduc_splus_v2si (__a), 0); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vqdmulhq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c) +__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) +vaddv_u8 (uint8x8_t __a) { - return __builtin_aarch64_sqdmulh_laneqv8hi (__a, __b, __c); + return vget_lane_u8 ((uint8x8_t) + __builtin_aarch64_reduc_uplus_v8qi ((int8x8_t) __a), 0); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqdmulhq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c) +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vaddv_u16 (uint16x4_t __a) { - return __builtin_aarch64_sqdmulh_laneqv4si (__a, __b, __c); + return vget_lane_u16 ((uint16x4_t) + __builtin_aarch64_reduc_uplus_v4hi ((int16x4_t) __a), 0); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vqrdmulh_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c) +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vaddv_u32 (uint32x2_t __a) { - return __builtin_aarch64_sqrdmulh_laneqv4hi (__a, __b, __c); + return vget_lane_u32 ((uint32x2_t) + __builtin_aarch64_reduc_uplus_v2si ((int32x2_t) __a), 0); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vqrdmulh_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c) +__extension__ static __inline int8_t __attribute__ ((__always_inline__)) +vaddvq_s8 (int8x16_t __a) { - return __builtin_aarch64_sqrdmulh_laneqv2si (__a, __b, __c); + return vgetq_lane_s8 (__builtin_aarch64_reduc_splus_v16qi (__a), 0); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vqrdmulhq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __c) +__extension__ static __inline int16_t __attribute__ ((__always_inline__)) +vaddvq_s16 (int16x8_t __a) { - return __builtin_aarch64_sqrdmulh_laneqv8hi (__a, __b, __c); + return vgetq_lane_s16 (__builtin_aarch64_reduc_splus_v8hi (__a), 0); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vqrdmulhq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c) +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vaddvq_s32 (int32x4_t __a) { - return __builtin_aarch64_sqrdmulh_laneqv4si (__a, __b, __c); + return vgetq_lane_s32 (__builtin_aarch64_reduc_splus_v4si (__a), 0); } -/* Table intrinsics. 
*/ - -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vqtbl1_p8 (poly8x16_t a, uint8x8_t b) +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vaddvq_s64 (int64x2_t __a) { - poly8x8_t result; - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return vgetq_lane_s64 (__builtin_aarch64_reduc_splus_v2di (__a), 0); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vqtbl1_s8 (int8x16_t a, int8x8_t b) +__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) +vaddvq_u8 (uint8x16_t __a) { - int8x8_t result; - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return vgetq_lane_u8 ((uint8x16_t) + __builtin_aarch64_reduc_uplus_v16qi ((int8x16_t) __a), 0); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqtbl1_u8 (uint8x16_t a, uint8x8_t b) +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vaddvq_u16 (uint16x8_t __a) { - uint8x8_t result; - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return vgetq_lane_u16 ((uint16x8_t) + __builtin_aarch64_reduc_uplus_v8hi ((int16x8_t) __a), 0); } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vqtbl1q_p8 (poly8x16_t a, uint8x16_t b) +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vaddvq_u32 (uint32x4_t __a) { - poly8x16_t result; - __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return vgetq_lane_u32 ((uint32x4_t) + __builtin_aarch64_reduc_uplus_v4si ((int32x4_t) __a), 0); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vqtbl1q_s8 (int8x16_t a, int8x16_t b) +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vaddvq_u64 (uint64x2_t __a) { - int8x16_t result; - __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return vgetq_lane_u64 ((uint64x2_t) + __builtin_aarch64_reduc_uplus_v2di ((int64x2_t) __a), 0); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vqtbl1q_u8 (uint8x16_t a, uint8x16_t b) +__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +vaddv_f32 (float32x2_t __a) { - uint8x16_t result; - __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + float32x2_t t = __builtin_aarch64_reduc_splus_v2sf (__a); + return vget_lane_f32 (t, 0); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vqtbl2_s8 (int8x16x2_t tab, int8x8_t idx) +__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +vaddvq_f32 (float32x4_t __a) { - int8x8_t result; - __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" - "tbl %0.8b, {v16.16b, v17.16b}, %2.8b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17"); - return result; + float32x4_t t = __builtin_aarch64_reduc_splus_v4sf (__a); + return vgetq_lane_f32 (t, 0); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqtbl2_u8 (uint8x16x2_t tab, uint8x8_t idx) +__extension__ static __inline float64_t __attribute__ ((__always_inline__)) +vaddvq_f64 (float64x2_t __a) { - uint8x8_t result; - __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" - "tbl 
%0.8b, {v16.16b, v17.16b}, %2.8b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17"); - return result; + float64x2_t t = __builtin_aarch64_reduc_splus_v2df (__a); + return vgetq_lane_f64 (t, 0); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vqtbl2_p8 (poly8x16x2_t tab, uint8x8_t idx) +/* vcage */ + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcages_f32 (float32_t __a, float32_t __b) { - poly8x8_t result; - __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" - "tbl %0.8b, {v16.16b, v17.16b}, %2.8b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17"); - return result; + return __builtin_fabsf (__a) >= __builtin_fabsf (__b) ? -1 : 0; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vqtbl2q_s8 (int8x16x2_t tab, int8x16_t idx) +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcage_f32 (float32x2_t __a, float32x2_t __b) { - int8x16_t result; - __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" - "tbl %0.16b, {v16.16b, v17.16b}, %2.16b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17"); - return result; + return vabs_f32 (__a) >= vabs_f32 (__b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vqtbl2q_u8 (uint8x16x2_t tab, uint8x16_t idx) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcageq_f32 (float32x4_t __a, float32x4_t __b) { - uint8x16_t result; - __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" - "tbl %0.16b, {v16.16b, v17.16b}, %2.16b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17"); - return result; + return vabsq_f32 (__a) >= vabsq_f32 (__b); } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vqtbl2q_p8 (poly8x16x2_t tab, uint8x16_t idx) +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vcaged_f64 (float64_t __a, float64_t __b) { - poly8x16_t result; - __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" - "tbl %0.16b, {v16.16b, v17.16b}, %2.16b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17"); - return result; + return __builtin_fabs (__a) >= __builtin_fabs (__b) ? -1 : 0; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vqtbl3_s8 (int8x16x3_t tab, int8x8_t idx) +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcageq_f64 (float64x2_t __a, float64x2_t __b) { - int8x8_t result; - __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" - "tbl %0.8b, {v16.16b - v18.16b}, %2.8b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18"); - return result; + return vabsq_f64 (__a) >= vabsq_f64 (__b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqtbl3_u8 (uint8x16x3_t tab, uint8x8_t idx) +/* vcagt */ + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcagts_f32 (float32_t __a, float32_t __b) { - uint8x8_t result; - __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" - "tbl %0.8b, {v16.16b - v18.16b}, %2.8b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18"); - return result; + return __builtin_fabsf (__a) > __builtin_fabsf (__b) ? 
-1 : 0; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vqtbl3_p8 (poly8x16x3_t tab, uint8x8_t idx) +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcagt_f32 (float32x2_t __a, float32x2_t __b) { - poly8x8_t result; - __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" - "tbl %0.8b, {v16.16b - v18.16b}, %2.8b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18"); - return result; + return vabs_f32 (__a) > vabs_f32 (__b); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vqtbl3q_s8 (int8x16x3_t tab, int8x16_t idx) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcagtq_f32 (float32x4_t __a, float32x4_t __b) { - int8x16_t result; - __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" - "tbl %0.16b, {v16.16b - v18.16b}, %2.16b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18"); - return result; + return vabsq_f32 (__a) > vabsq_f32 (__b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vqtbl3q_u8 (uint8x16x3_t tab, uint8x16_t idx) +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vcagtd_f64 (float64_t __a, float64_t __b) { - uint8x16_t result; - __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" - "tbl %0.16b, {v16.16b - v18.16b}, %2.16b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18"); - return result; + return __builtin_fabs (__a) > __builtin_fabs (__b) ? -1 : 0; } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vqtbl3q_p8 (poly8x16x3_t tab, uint8x16_t idx) +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcagtq_f64 (float64x2_t __a, float64x2_t __b) { - poly8x16_t result; - __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" - "tbl %0.16b, {v16.16b - v18.16b}, %2.16b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18"); - return result; + return vabsq_f64 (__a) > vabsq_f64 (__b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vqtbl4_s8 (int8x16x4_t tab, int8x8_t idx) +/* vcale */ + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcale_f32 (float32x2_t __a, float32x2_t __b) { - int8x8_t result; - __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" - "tbl %0.8b, {v16.16b - v19.16b}, %2.8b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18", "v19"); - return result; + return vabs_f32 (__a) <= vabs_f32 (__b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqtbl4_u8 (uint8x16x4_t tab, uint8x8_t idx) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcaleq_f32 (float32x4_t __a, float32x4_t __b) { - uint8x8_t result; - __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" - "tbl %0.8b, {v16.16b - v19.16b}, %2.8b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18", "v19"); - return result; + return vabsq_f32 (__a) <= vabsq_f32 (__b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vqtbl4_p8 (poly8x16x4_t tab, uint8x8_t idx) +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcaleq_f64 (float64x2_t __a, float64x2_t __b) { - poly8x8_t result; - __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" - "tbl %0.8b, {v16.16b - v19.16b}, %2.8b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18", "v19"); - return result; + return vabsq_f64 (__a) <= vabsq_f64 (__b); } +/* vcalt */ 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vqtbl4q_s8 (int8x16x4_t tab, int8x16_t idx) +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcalt_f32 (float32x2_t __a, float32x2_t __b) { - int8x16_t result; - __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" - "tbl %0.16b, {v16.16b - v19.16b}, %2.16b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18", "v19"); - return result; + return vabs_f32 (__a) < vabs_f32 (__b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vqtbl4q_u8 (uint8x16x4_t tab, uint8x16_t idx) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcaltq_f32 (float32x4_t __a, float32x4_t __b) { - uint8x16_t result; - __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" - "tbl %0.16b, {v16.16b - v19.16b}, %2.16b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18", "v19"); - return result; + return vabsq_f32 (__a) < vabsq_f32 (__b); } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vqtbl4q_p8 (poly8x16x4_t tab, uint8x16_t idx) +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcaltq_f64 (float64x2_t __a, float64x2_t __b) { - poly8x16_t result; - __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" - "tbl %0.16b, {v16.16b - v19.16b}, %2.16b\n\t" - :"=w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18", "v19"); - return result; + return vabsq_f64 (__a) < vabsq_f64 (__b); } +/* vceq - vector. */ -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vqtbx1_s8 (int8x8_t r, int8x16_t tab, int8x8_t idx) +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vceq_f32 (float32x2_t __a, float32x2_t __b) { - int8x8_t result = r; - __asm__ ("tbx %0.8b,{%1.16b},%2.8b" - : "+w"(result) - : "w"(tab), "w"(idx) - : /* No clobbers */); - return result; + return (uint32x2_t) __builtin_aarch64_cmeqv2sf (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqtbx1_u8 (uint8x8_t r, uint8x16_t tab, uint8x8_t idx) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vceq_f64 (float64x1_t __a, float64x1_t __b) { - uint8x8_t result = r; - __asm__ ("tbx %0.8b,{%1.16b},%2.8b" - : "+w"(result) - : "w"(tab), "w"(idx) - : /* No clobbers */); - return result; + return __a == __b ? 
-1ll : 0ll; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vqtbx1_p8 (poly8x8_t r, poly8x16_t tab, uint8x8_t idx) +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vceq_p8 (poly8x8_t __a, poly8x8_t __b) { - poly8x8_t result = r; - __asm__ ("tbx %0.8b,{%1.16b},%2.8b" - : "+w"(result) - : "w"(tab), "w"(idx) - : /* No clobbers */); - return result; + return (uint8x8_t) __builtin_aarch64_cmeqv8qi ((int8x8_t) __a, + (int8x8_t) __b); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vqtbx1q_s8 (int8x16_t r, int8x16_t tab, int8x16_t idx) +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vceq_s8 (int8x8_t __a, int8x8_t __b) { - int8x16_t result = r; - __asm__ ("tbx %0.16b,{%1.16b},%2.16b" - : "+w"(result) - : "w"(tab), "w"(idx) - : /* No clobbers */); - return result; + return (uint8x8_t) __builtin_aarch64_cmeqv8qi (__a, __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vqtbx1q_u8 (uint8x16_t r, uint8x16_t tab, uint8x16_t idx) +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vceq_s16 (int16x4_t __a, int16x4_t __b) { - uint8x16_t result = r; - __asm__ ("tbx %0.16b,{%1.16b},%2.16b" - : "+w"(result) - : "w"(tab), "w"(idx) - : /* No clobbers */); - return result; + return (uint16x4_t) __builtin_aarch64_cmeqv4hi (__a, __b); } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vqtbx1q_p8 (poly8x16_t r, poly8x16_t tab, uint8x16_t idx) +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vceq_s32 (int32x2_t __a, int32x2_t __b) { - poly8x16_t result = r; - __asm__ ("tbx %0.16b,{%1.16b},%2.16b" - : "+w"(result) - : "w"(tab), "w"(idx) - : /* No clobbers */); - return result; + return (uint32x2_t) __builtin_aarch64_cmeqv2si (__a, __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vqtbx2_s8 (int8x8_t r, int8x16x2_t tab, int8x8_t idx) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vceq_s64 (int64x1_t __a, int64x1_t __b) { - int8x8_t result = r; - __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" - "tbx %0.8b, {v16.16b, v17.16b}, %2.8b\n\t" - :"+w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17"); - return result; + return __a == __b ? 
-1ll : 0ll; } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqtbx2_u8 (uint8x8_t r, uint8x16x2_t tab, uint8x8_t idx) +vceq_u8 (uint8x8_t __a, uint8x8_t __b) { - uint8x8_t result = r; - __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" - "tbx %0.8b, {v16.16b, v17.16b}, %2.8b\n\t" - :"+w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17"); - return result; + return (uint8x8_t) __builtin_aarch64_cmeqv8qi ((int8x8_t) __a, + (int8x8_t) __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vqtbx2_p8 (poly8x8_t r, poly8x16x2_t tab, uint8x8_t idx) +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vceq_u16 (uint16x4_t __a, uint16x4_t __b) { - poly8x8_t result = r; - __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" - "tbx %0.8b, {v16.16b, v17.16b}, %2.8b\n\t" - :"+w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17"); - return result; + return (uint16x4_t) __builtin_aarch64_cmeqv4hi ((int16x4_t) __a, + (int16x4_t) __b); } - -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vqtbx2q_s8 (int8x16_t r, int8x16x2_t tab, int8x16_t idx) +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vceq_u32 (uint32x2_t __a, uint32x2_t __b) { - int8x16_t result = r; - __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" - "tbx %0.16b, {v16.16b, v17.16b}, %2.16b\n\t" - :"+w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17"); - return result; + return (uint32x2_t) __builtin_aarch64_cmeqv2si ((int32x2_t) __a, + (int32x2_t) __b); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vqtbx2q_u8 (uint8x16_t r, uint8x16x2_t tab, uint8x16_t idx) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vceq_u64 (uint64x1_t __a, uint64x1_t __b) { - uint8x16_t result = r; - __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" - "tbx %0.16b, {v16.16b, v17.16b}, %2.16b\n\t" - :"+w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17"); - return result; + return __a == __b ? 
-1ll : 0ll; } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vqtbx2q_p8 (poly8x16_t r, poly8x16x2_t tab, uint8x16_t idx) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vceqq_f32 (float32x4_t __a, float32x4_t __b) { - poly8x16_t result = r; - __asm__ ("ld1 {v16.16b, v17.16b}, %1\n\t" - "tbx %0.16b, {v16.16b, v17.16b}, %2.16b\n\t" - :"+w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17"); - return result; + return (uint32x4_t) __builtin_aarch64_cmeqv4sf (__a, __b); } +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vceqq_f64 (float64x2_t __a, float64x2_t __b) +{ + return (uint64x2_t) __builtin_aarch64_cmeqv2df (__a, __b); +} -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vqtbx3_s8 (int8x8_t r, int8x16x3_t tab, int8x8_t idx) +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vceqq_p8 (poly8x16_t __a, poly8x16_t __b) { - int8x8_t result = r; - __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" - "tbx %0.8b, {v16.16b - v18.16b}, %2.8b\n\t" - :"+w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18"); - return result; + return (uint8x16_t) __builtin_aarch64_cmeqv16qi ((int8x16_t) __a, + (int8x16_t) __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqtbx3_u8 (uint8x8_t r, uint8x16x3_t tab, uint8x8_t idx) +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vceqq_s8 (int8x16_t __a, int8x16_t __b) { - uint8x8_t result = r; - __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" - "tbx %0.8b, {v16.16b - v18.16b}, %2.8b\n\t" - :"+w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18"); - return result; + return (uint8x16_t) __builtin_aarch64_cmeqv16qi (__a, __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vqtbx3_p8 (poly8x8_t r, poly8x16x3_t tab, uint8x8_t idx) +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vceqq_s16 (int16x8_t __a, int16x8_t __b) { - poly8x8_t result = r; - __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" - "tbx %0.8b, {v16.16b - v18.16b}, %2.8b\n\t" - :"+w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18"); - return result; + return (uint16x8_t) __builtin_aarch64_cmeqv8hi (__a, __b); } +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vceqq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (uint32x4_t) __builtin_aarch64_cmeqv4si (__a, __b); +} -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vqtbx3q_s8 (int8x16_t r, int8x16x3_t tab, int8x16_t idx) +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vceqq_s64 (int64x2_t __a, int64x2_t __b) { - int8x16_t result = r; - __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" - "tbx %0.16b, {v16.16b - v18.16b}, %2.16b\n\t" - :"+w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18"); - return result; + return (uint64x2_t) __builtin_aarch64_cmeqv2di (__a, __b); } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vqtbx3q_u8 (uint8x16_t r, uint8x16x3_t tab, uint8x16_t idx) +vceqq_u8 (uint8x16_t __a, uint8x16_t __b) { - uint8x16_t result = r; - __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" - "tbx %0.16b, {v16.16b - v18.16b}, %2.16b\n\t" - :"+w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18"); - return result; + return (uint8x16_t) __builtin_aarch64_cmeqv16qi ((int8x16_t) __a, + (int8x16_t) __b); } -__extension__ static __inline 
poly8x16_t __attribute__ ((__always_inline__)) -vqtbx3q_p8 (poly8x16_t r, poly8x16x3_t tab, uint8x16_t idx) +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vceqq_u16 (uint16x8_t __a, uint16x8_t __b) { - poly8x16_t result = r; - __asm__ ("ld1 {v16.16b - v18.16b}, %1\n\t" - "tbx %0.16b, {v16.16b - v18.16b}, %2.16b\n\t" - :"+w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18"); - return result; + return (uint16x8_t) __builtin_aarch64_cmeqv8hi ((int16x8_t) __a, + (int16x8_t) __b); } - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vqtbx4_s8 (int8x8_t r, int8x16x4_t tab, int8x8_t idx) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vceqq_u32 (uint32x4_t __a, uint32x4_t __b) { - int8x8_t result = r; - __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" - "tbx %0.8b, {v16.16b - v19.16b}, %2.8b\n\t" - :"+w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18", "v19"); - return result; + return (uint32x4_t) __builtin_aarch64_cmeqv4si ((int32x4_t) __a, + (int32x4_t) __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vqtbx4_u8 (uint8x8_t r, uint8x16x4_t tab, uint8x8_t idx) +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vceqq_u64 (uint64x2_t __a, uint64x2_t __b) { - uint8x8_t result = r; - __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" - "tbx %0.8b, {v16.16b - v19.16b}, %2.8b\n\t" - :"+w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18", "v19"); - return result; + return (uint64x2_t) __builtin_aarch64_cmeqv2di ((int64x2_t) __a, + (int64x2_t) __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vqtbx4_p8 (poly8x8_t r, poly8x16x4_t tab, uint8x8_t idx) +/* vceq - scalar. */ + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vceqs_f32 (float32_t __a, float32_t __b) { - poly8x8_t result = r; - __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" - "tbx %0.8b, {v16.16b - v19.16b}, %2.8b\n\t" - :"+w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18", "v19"); - return result; + return __a == __b ? -1 : 0; } - -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vqtbx4q_s8 (int8x16_t r, int8x16x4_t tab, int8x16_t idx) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vceqd_s64 (int64x1_t __a, int64x1_t __b) { - int8x16_t result = r; - __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" - "tbx %0.16b, {v16.16b - v19.16b}, %2.16b\n\t" - :"+w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18", "v19"); - return result; + return __a == __b ? -1ll : 0ll; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vqtbx4q_u8 (uint8x16_t r, uint8x16x4_t tab, uint8x16_t idx) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vceqd_u64 (uint64x1_t __a, uint64x1_t __b) { - uint8x16_t result = r; - __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" - "tbx %0.16b, {v16.16b - v19.16b}, %2.16b\n\t" - :"+w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18", "v19"); - return result; + return __a == __b ? 
-1ll : 0ll; } -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vqtbx4q_p8 (poly8x16_t r, poly8x16x4_t tab, uint8x16_t idx) +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vceqd_f64 (float64_t __a, float64_t __b) { - poly8x16_t result = r; - __asm__ ("ld1 {v16.16b - v19.16b}, %1\n\t" - "tbx %0.16b, {v16.16b - v19.16b}, %2.16b\n\t" - :"+w"(result) - :"Q"(tab),"w"(idx) - :"memory", "v16", "v17", "v18", "v19"); - return result; + return __a == __b ? -1ll : 0ll; } -/* V7 legacy table intrinsics. */ +/* vceqz - vector. */ -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vtbl1_s8 (int8x8_t tab, int8x8_t idx) +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vceqz_f32 (float32x2_t __a) { - int8x8_t result; - int8x16_t temp = vcombine_s8 (tab, vcreate_s8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(temp), "w"(idx) - : /* No clobbers */); - return result; + float32x2_t __b = {0.0f, 0.0f}; + return (uint32x2_t) __builtin_aarch64_cmeqv2sf (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtbl1_u8 (uint8x8_t tab, uint8x8_t idx) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vceqz_f64 (float64x1_t __a) { - uint8x8_t result; - uint8x16_t temp = vcombine_u8 (tab, vcreate_u8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(temp), "w"(idx) - : /* No clobbers */); - return result; + return __a == 0.0 ? -1ll : 0ll; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtbl1_p8 (poly8x8_t tab, uint8x8_t idx) +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vceqz_p8 (poly8x8_t __a) { - poly8x8_t result; - poly8x16_t temp = vcombine_p8 (tab, vcreate_p8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(temp), "w"(idx) - : /* No clobbers */); - return result; + poly8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x8_t) __builtin_aarch64_cmeqv8qi ((int8x8_t) __a, + (int8x8_t) __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vtbl2_s8 (int8x8x2_t tab, int8x8_t idx) +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vceqz_s8 (int8x8_t __a) { - int8x8_t result; - int8x16_t temp = vcombine_s8 (tab.val[0], tab.val[1]); - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(temp), "w"(idx) - : /* No clobbers */); - return result; + int8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x8_t) __builtin_aarch64_cmeqv8qi (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtbl2_u8 (uint8x8x2_t tab, uint8x8_t idx) +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vceqz_s16 (int16x4_t __a) { - uint8x8_t result; - uint8x16_t temp = vcombine_u8 (tab.val[0], tab.val[1]); - __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(temp), "w"(idx) - : /* No clobbers */); - return result; + int16x4_t __b = {0, 0, 0, 0}; + return (uint16x4_t) __builtin_aarch64_cmeqv4hi (__a, __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtbl2_p8 (poly8x8x2_t tab, uint8x8_t idx) +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vceqz_s32 (int32x2_t __a) { - poly8x8_t result; - poly8x16_t temp = vcombine_p8 (tab.val[0], tab.val[1]); - __asm__ ("tbl %0.8b, 
{%1.16b}, %2.8b" - : "=w"(result) - : "w"(temp), "w"(idx) - : /* No clobbers */); - return result; + int32x2_t __b = {0, 0}; + return (uint32x2_t) __builtin_aarch64_cmeqv2si (__a, __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vtbl3_s8 (int8x8x3_t tab, int8x8_t idx) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vceqz_s64 (int64x1_t __a) { - int8x8_t result; - int8x16x2_t temp; - temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_s8 (tab.val[2], vcreate_s8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "=w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); - return result; + return __a == 0ll ? -1ll : 0ll; } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtbl3_u8 (uint8x8x3_t tab, uint8x8_t idx) +vceqz_u8 (uint8x8_t __a) { - uint8x8_t result; - uint8x16x2_t temp; - temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_u8 (tab.val[2], vcreate_u8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "=w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); - return result; + uint8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x8_t) __builtin_aarch64_cmeqv8qi ((int8x8_t) __a, + (int8x8_t) __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtbl3_p8 (poly8x8x3_t tab, uint8x8_t idx) +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vceqz_u16 (uint16x4_t __a) { - poly8x8_t result; - poly8x16x2_t temp; - temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_p8 (tab.val[2], vcreate_p8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "=w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); - return result; + uint16x4_t __b = {0, 0, 0, 0}; + return (uint16x4_t) __builtin_aarch64_cmeqv4hi ((int16x4_t) __a, + (int16x4_t) __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vtbl4_s8 (int8x8x4_t tab, int8x8_t idx) +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vceqz_u32 (uint32x2_t __a) { - int8x8_t result; - int8x16x2_t temp; - temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "=w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); - return result; + uint32x2_t __b = {0, 0}; + return (uint32x2_t) __builtin_aarch64_cmeqv2si ((int32x2_t) __a, + (int32x2_t) __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtbl4_u8 (uint8x8x4_t tab, uint8x8_t idx) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vceqz_u64 (uint64x1_t __a) { - uint8x8_t result; - uint8x16x2_t temp; - temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "=w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); - return result; + return __a == 0ll ? 
-1ll : 0ll; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtbl4_p8 (poly8x8x4_t tab, uint8x8_t idx) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vceqzq_f32 (float32x4_t __a) { - poly8x8_t result; - poly8x16x2_t temp; - temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbl %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "=w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); - return result; + float32x4_t __b = {0.0f, 0.0f, 0.0f, 0.0f}; + return (uint32x4_t) __builtin_aarch64_cmeqv4sf (__a, __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vtbx1_s8 (int8x8_t r, int8x8_t tab, int8x8_t idx) +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vceqzq_f64 (float64x2_t __a) { - int8x8_t result; - int8x8_t tmp1; - int8x16_t temp = vcombine_s8 (tab, vcreate_s8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("movi %0.8b, 8\n\t" - "cmhs %0.8b, %3.8b, %0.8b\n\t" - "tbl %1.8b, {%2.16b}, %3.8b\n\t" - "bsl %0.8b, %4.8b, %1.8b\n\t" - : "+w"(result), "=w"(tmp1) - : "w"(temp), "w"(idx), "w"(r) - : /* No clobbers */); - return result; + float64x2_t __b = {0.0, 0.0}; + return (uint64x2_t) __builtin_aarch64_cmeqv2df (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtbx1_u8 (uint8x8_t r, uint8x8_t tab, uint8x8_t idx) +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vceqzq_p8 (poly8x16_t __a) { - uint8x8_t result; - uint8x8_t tmp1; - uint8x16_t temp = vcombine_u8 (tab, vcreate_u8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("movi %0.8b, 8\n\t" - "cmhs %0.8b, %3.8b, %0.8b\n\t" - "tbl %1.8b, {%2.16b}, %3.8b\n\t" - "bsl %0.8b, %4.8b, %1.8b\n\t" - : "+w"(result), "=w"(tmp1) - : "w"(temp), "w"(idx), "w"(r) - : /* No clobbers */); - return result; + poly8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x16_t) __builtin_aarch64_cmeqv16qi ((int8x16_t) __a, + (int8x16_t) __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtbx1_p8 (poly8x8_t r, poly8x8_t tab, uint8x8_t idx) +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vceqzq_s8 (int8x16_t __a) { - poly8x8_t result; - poly8x8_t tmp1; - poly8x16_t temp = vcombine_p8 (tab, vcreate_p8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("movi %0.8b, 8\n\t" - "cmhs %0.8b, %3.8b, %0.8b\n\t" - "tbl %1.8b, {%2.16b}, %3.8b\n\t" - "bsl %0.8b, %4.8b, %1.8b\n\t" - : "+w"(result), "=w"(tmp1) - : "w"(temp), "w"(idx), "w"(r) - : /* No clobbers */); - return result; + int8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x16_t) __builtin_aarch64_cmeqv16qi (__a, __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vtbx2_s8 (int8x8_t r, int8x8x2_t tab, int8x8_t idx) +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vceqzq_s16 (int16x8_t __a) { - int8x8_t result = r; - int8x16_t temp = vcombine_s8 (tab.val[0], tab.val[1]); - __asm__ ("tbx %0.8b, {%1.16b}, %2.8b" - : "+w"(result) - : "w"(temp), "w"(idx) - : /* No clobbers */); - return result; + int16x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint16x8_t) __builtin_aarch64_cmeqv8hi (__a, __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtbx2_u8 (uint8x8_t r, uint8x8x2_t tab, uint8x8_t idx) +__extension__ static __inline uint32x4_t 
__attribute__ ((__always_inline__)) +vceqzq_s32 (int32x4_t __a) { - uint8x8_t result = r; - uint8x16_t temp = vcombine_u8 (tab.val[0], tab.val[1]); - __asm__ ("tbx %0.8b, {%1.16b}, %2.8b" - : "+w"(result) - : "w"(temp), "w"(idx) - : /* No clobbers */); - return result; + int32x4_t __b = {0, 0, 0, 0}; + return (uint32x4_t) __builtin_aarch64_cmeqv4si (__a, __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtbx2_p8 (poly8x8_t r, poly8x8x2_t tab, uint8x8_t idx) +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vceqzq_s64 (int64x2_t __a) { - poly8x8_t result = r; - poly8x16_t temp = vcombine_p8 (tab.val[0], tab.val[1]); - __asm__ ("tbx %0.8b, {%1.16b}, %2.8b" - : "+w"(result) - : "w"(temp), "w"(idx) - : /* No clobbers */); - return result; + int64x2_t __b = {0, 0}; + return (uint64x2_t) __builtin_aarch64_cmeqv2di (__a, __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vtbx3_s8 (int8x8_t r, int8x8x3_t tab, int8x8_t idx) +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vceqzq_u8 (uint8x16_t __a) { - int8x8_t result; - int8x8_t tmp1; - int8x16x2_t temp; - temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_s8 (tab.val[2], vcreate_s8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("ld1 {v16.16b - v17.16b}, %2\n\t" - "movi %0.8b, 24\n\t" - "cmhs %0.8b, %3.8b, %0.8b\n\t" - "tbl %1.8b, {v16.16b - v17.16b}, %3.8b\n\t" - "bsl %0.8b, %4.8b, %1.8b\n\t" - : "+w"(result), "=w"(tmp1) - : "Q"(temp), "w"(idx), "w"(r) - : "v16", "v17", "memory"); - return result; + uint8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x16_t) __builtin_aarch64_cmeqv16qi ((int8x16_t) __a, + (int8x16_t) __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtbx3_u8 (uint8x8_t r, uint8x8x3_t tab, uint8x8_t idx) +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vceqzq_u16 (uint16x8_t __a) { - uint8x8_t result; - uint8x8_t tmp1; - uint8x16x2_t temp; - temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_u8 (tab.val[2], vcreate_u8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("ld1 {v16.16b - v17.16b}, %2\n\t" - "movi %0.8b, 24\n\t" - "cmhs %0.8b, %3.8b, %0.8b\n\t" - "tbl %1.8b, {v16.16b - v17.16b}, %3.8b\n\t" - "bsl %0.8b, %4.8b, %1.8b\n\t" - : "+w"(result), "=w"(tmp1) - : "Q"(temp), "w"(idx), "w"(r) - : "v16", "v17", "memory"); - return result; + uint16x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint16x8_t) __builtin_aarch64_cmeqv8hi ((int16x8_t) __a, + (int16x8_t) __b); } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtbx3_p8 (poly8x8_t r, poly8x8x3_t tab, uint8x8_t idx) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vceqzq_u32 (uint32x4_t __a) { - poly8x8_t result; - poly8x8_t tmp1; - poly8x16x2_t temp; - temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_p8 (tab.val[2], vcreate_p8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("ld1 {v16.16b - v17.16b}, %2\n\t" - "movi %0.8b, 24\n\t" - "cmhs %0.8b, %3.8b, %0.8b\n\t" - "tbl %1.8b, {v16.16b - v17.16b}, %3.8b\n\t" - "bsl %0.8b, %4.8b, %1.8b\n\t" - : "+w"(result), "=w"(tmp1) - : "Q"(temp), "w"(idx), "w"(r) - : "v16", "v17", "memory"); - return result; + uint32x4_t __b = {0, 0, 0, 0}; + return (uint32x4_t) __builtin_aarch64_cmeqv4si ((int32x4_t) __a, + (int32x4_t) __b); } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) 
-vtbx4_s8 (int8x8_t r, int8x8x4_t tab, int8x8_t idx) +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vceqzq_u64 (uint64x2_t __a) { - int8x8_t result = r; - int8x16x2_t temp; - temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "+w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); - return result; + uint64x2_t __b = {0, 0}; + return (uint64x2_t) __builtin_aarch64_cmeqv2di ((int64x2_t) __a, + (int64x2_t) __b); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vtbx4_u8 (uint8x8_t r, uint8x8x4_t tab, uint8x8_t idx) +/* vceqz - scalar. */ + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vceqzs_f32 (float32_t __a) { - uint8x8_t result = r; - uint8x16x2_t temp; - temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "+w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); - return result; + return __a == 0.0f ? -1 : 0; } -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vtbx4_p8 (poly8x8_t r, poly8x8x4_t tab, uint8x8_t idx) -{ - poly8x8_t result = r; - poly8x16x2_t temp; - temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]); - __asm__ ("ld1 {v16.16b - v17.16b }, %1\n\t" - "tbx %0.8b, {v16.16b - v17.16b}, %2.8b\n\t" - : "+w"(result) - : "Q"(temp), "w"(idx) - : "v16", "v17", "memory"); - return result; +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vceqzd_s64 (int64x1_t __a) +{ + return __a == 0 ? -1ll : 0ll; } -/* End of temporary inline asm. */ +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vceqzd_u64 (int64x1_t __a) +{ + return __a == 0 ? -1ll : 0ll; +} -/* Start of optimal implementations in approved order. */ +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vceqzd_f64 (float64_t __a) +{ + return __a == 0.0 ? -1ll : 0ll; +} -/* vabs */ +/* vcge - vector. */ -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vabs_f32 (float32x2_t __a) +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcge_f32 (float32x2_t __a, float32x2_t __b) { - return __builtin_aarch64_absv2sf (__a); + return (uint32x2_t) __builtin_aarch64_cmgev2sf (__a, __b); } -__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) -vabs_f64 (float64x1_t __a) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcge_f64 (float64x1_t __a, float64x1_t __b) { - return __builtin_fabs (__a); + return __a >= __b ? 
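/* Editorial aside -- illustrative only, not part of the patch: the vceqz_*
   intrinsics added above compare each lane against a zero vector through the
   __builtin_aarch64_cmeq* builtins (or a plain C ternary for the 64-bit
   scalars), producing an all-ones lane where the element equals zero and an
   all-zeros lane otherwise.  A minimal usage sketch, assuming an AArch64
   toolchain that ships this header:

       #include <arm_neon.h>

       int32x2_t v = {0, 5};
       uint32x2_t m = vceqz_s32 (v);   // m = {0xffffffff, 0x00000000}
*/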
-1ll : 0ll; } -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vabs_s8 (int8x8_t __a) +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcge_p8 (poly8x8_t __a, poly8x8_t __b) { - return __builtin_aarch64_absv8qi (__a); + return (uint8x8_t) __builtin_aarch64_cmgev8qi ((int8x8_t) __a, + (int8x8_t) __b); } -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vabs_s16 (int16x4_t __a) +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcge_s8 (int8x8_t __a, int8x8_t __b) { - return __builtin_aarch64_absv4hi (__a); + return (uint8x8_t) __builtin_aarch64_cmgev8qi (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vabs_s32 (int32x2_t __a) +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vcge_s16 (int16x4_t __a, int16x4_t __b) { - return __builtin_aarch64_absv2si (__a); + return (uint16x4_t) __builtin_aarch64_cmgev4hi (__a, __b); } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vabs_s64 (int64x1_t __a) +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcge_s32 (int32x2_t __a, int32x2_t __b) { - return __builtin_llabs (__a); + return (uint32x2_t) __builtin_aarch64_cmgev2si (__a, __b); } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vabsq_f32 (float32x4_t __a) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcge_s64 (int64x1_t __a, int64x1_t __b) { - return __builtin_aarch64_absv4sf (__a); + return __a >= __b ? -1ll : 0ll; } -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vabsq_f64 (float64x2_t __a) +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcge_u8 (uint8x8_t __a, uint8x8_t __b) { - return __builtin_aarch64_absv2df (__a); + return (uint8x8_t) __builtin_aarch64_cmgeuv8qi ((int8x8_t) __a, + (int8x8_t) __b); } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vabsq_s8 (int8x16_t __a) +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vcge_u16 (uint16x4_t __a, uint16x4_t __b) { - return __builtin_aarch64_absv16qi (__a); + return (uint16x4_t) __builtin_aarch64_cmgeuv4hi ((int16x4_t) __a, + (int16x4_t) __b); } -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vabsq_s16 (int16x8_t __a) +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcge_u32 (uint32x2_t __a, uint32x2_t __b) { - return __builtin_aarch64_absv8hi (__a); + return (uint32x2_t) __builtin_aarch64_cmgeuv2si ((int32x2_t) __a, + (int32x2_t) __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vabsq_s32 (int32x4_t __a) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcge_u64 (uint64x1_t __a, uint64x1_t __b) { - return __builtin_aarch64_absv4si (__a); + return __a >= __b ? 
-1ll : 0ll; } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vabsq_s64 (int64x2_t __a) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcgeq_f32 (float32x4_t __a, float32x4_t __b) { - return __builtin_aarch64_absv2di (__a); + return (uint32x4_t) __builtin_aarch64_cmgev4sf (__a, __b); } -/* vadd */ +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcgeq_f64 (float64x2_t __a, float64x2_t __b) +{ + return (uint64x2_t) __builtin_aarch64_cmgev2df (__a, __b); +} -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vaddd_s64 (int64x1_t __a, int64x1_t __b) +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcgeq_p8 (poly8x16_t __a, poly8x16_t __b) { - return __a + __b; + return (uint8x16_t) __builtin_aarch64_cmgev16qi ((int8x16_t) __a, + (int8x16_t) __b); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vaddd_u64 (uint64x1_t __a, uint64x1_t __b) +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcgeq_s8 (int8x16_t __a, int8x16_t __b) { - return __a + __b; + return (uint8x16_t) __builtin_aarch64_cmgev16qi (__a, __b); } -/* vaddv */ +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vcgeq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (uint16x8_t) __builtin_aarch64_cmgev8hi (__a, __b); +} -__extension__ static __inline int8_t __attribute__ ((__always_inline__)) -vaddv_s8 (int8x8_t __a) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcgeq_s32 (int32x4_t __a, int32x4_t __b) { - return vget_lane_s8 (__builtin_aarch64_reduc_splus_v8qi (__a), 0); + return (uint32x4_t) __builtin_aarch64_cmgev4si (__a, __b); } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vaddv_s16 (int16x4_t __a) +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcgeq_s64 (int64x2_t __a, int64x2_t __b) { - return vget_lane_s16 (__builtin_aarch64_reduc_splus_v4hi (__a), 0); + return (uint64x2_t) __builtin_aarch64_cmgev2di (__a, __b); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vaddv_s32 (int32x2_t __a) +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcgeq_u8 (uint8x16_t __a, uint8x16_t __b) { - return vget_lane_s32 (__builtin_aarch64_reduc_splus_v2si (__a), 0); + return (uint8x16_t) __builtin_aarch64_cmgeuv16qi ((int8x16_t) __a, + (int8x16_t) __b); } -__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) -vaddv_u8 (uint8x8_t __a) +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vcgeq_u16 (uint16x8_t __a, uint16x8_t __b) { - return vget_lane_u8 ((uint8x8_t) - __builtin_aarch64_reduc_uplus_v8qi ((int8x8_t) __a), 0); + return (uint16x8_t) __builtin_aarch64_cmgeuv8hi ((int16x8_t) __a, + (int16x8_t) __b); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) -vaddv_u16 (uint16x4_t __a) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcgeq_u32 (uint32x4_t __a, uint32x4_t __b) { - return vget_lane_u16 ((uint16x4_t) - __builtin_aarch64_reduc_uplus_v4hi ((int16x4_t) __a), 0); + return (uint32x4_t) __builtin_aarch64_cmgeuv4si ((int32x4_t) __a, + (int32x4_t) __b); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vaddv_u32 (uint32x2_t __a) +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcgeq_u64 
(uint64x2_t __a, uint64x2_t __b) { - return vget_lane_u32 ((uint32x2_t) - __builtin_aarch64_reduc_uplus_v2si ((int32x2_t) __a), 0); + return (uint64x2_t) __builtin_aarch64_cmgeuv2di ((int64x2_t) __a, + (int64x2_t) __b); } -__extension__ static __inline int8_t __attribute__ ((__always_inline__)) -vaddvq_s8 (int8x16_t __a) +/* vcge - scalar. */ + +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcges_f32 (float32_t __a, float32_t __b) { - return vgetq_lane_s8 (__builtin_aarch64_reduc_splus_v16qi (__a), 0); + return __a >= __b ? -1 : 0; } -__extension__ static __inline int16_t __attribute__ ((__always_inline__)) -vaddvq_s16 (int16x8_t __a) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcged_s64 (int64x1_t __a, int64x1_t __b) { - return vgetq_lane_s16 (__builtin_aarch64_reduc_splus_v8hi (__a), 0); + return __a >= __b ? -1ll : 0ll; } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vaddvq_s32 (int32x4_t __a) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcged_u64 (uint64x1_t __a, uint64x1_t __b) { - return vgetq_lane_s32 (__builtin_aarch64_reduc_splus_v4si (__a), 0); + return __a >= __b ? -1ll : 0ll; } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vaddvq_s64 (int64x2_t __a) +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vcged_f64 (float64_t __a, float64_t __b) { - return vgetq_lane_s64 (__builtin_aarch64_reduc_splus_v2di (__a), 0); + return __a >= __b ? -1ll : 0ll; } -__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) -vaddvq_u8 (uint8x16_t __a) +/* vcgez - vector. */ + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcgez_f32 (float32x2_t __a) { - return vgetq_lane_u8 ((uint8x16_t) - __builtin_aarch64_reduc_uplus_v16qi ((int8x16_t) __a), 0); + float32x2_t __b = {0.0f, 0.0f}; + return (uint32x2_t) __builtin_aarch64_cmgev2sf (__a, __b); } -__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) -vaddvq_u16 (uint16x8_t __a) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcgez_f64 (float64x1_t __a) { - return vgetq_lane_u16 ((uint16x8_t) - __builtin_aarch64_reduc_uplus_v8hi ((int16x8_t) __a), 0); + return __a >= 0.0 ? 
-1ll : 0ll; } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vaddvq_u32 (uint32x4_t __a) +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcgez_p8 (poly8x8_t __a) { - return vgetq_lane_u32 ((uint32x4_t) - __builtin_aarch64_reduc_uplus_v4si ((int32x4_t) __a), 0); + poly8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x8_t) __builtin_aarch64_cmgev8qi ((int8x8_t) __a, + (int8x8_t) __b); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vaddvq_u64 (uint64x2_t __a) +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcgez_s8 (int8x8_t __a) { - return vgetq_lane_u64 ((uint64x2_t) - __builtin_aarch64_reduc_uplus_v2di ((int64x2_t) __a), 0); + int8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x8_t) __builtin_aarch64_cmgev8qi (__a, __b); } -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vaddv_f32 (float32x2_t __a) +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vcgez_s16 (int16x4_t __a) { - float32x2_t t = __builtin_aarch64_reduc_splus_v2sf (__a); - return vget_lane_f32 (t, 0); + int16x4_t __b = {0, 0, 0, 0}; + return (uint16x4_t) __builtin_aarch64_cmgev4hi (__a, __b); } -__extension__ static __inline float32_t __attribute__ ((__always_inline__)) -vaddvq_f32 (float32x4_t __a) +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcgez_s32 (int32x2_t __a) { - float32x4_t t = __builtin_aarch64_reduc_splus_v4sf (__a); - return vgetq_lane_f32 (t, 0); + int32x2_t __b = {0, 0}; + return (uint32x2_t) __builtin_aarch64_cmgev2si (__a, __b); } -__extension__ static __inline float64_t __attribute__ ((__always_inline__)) -vaddvq_f64 (float64x2_t __a) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcgez_s64 (int64x1_t __a) { - float64x2_t t = __builtin_aarch64_reduc_splus_v2df (__a); - return vgetq_lane_f64 (t, 0); + return __a >= 0ll ? -1ll : 0ll; } -/* vcage */ +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcgez_u8 (uint8x8_t __a) +{ + uint8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x8_t) __builtin_aarch64_cmgeuv8qi ((int8x8_t) __a, + (int8x8_t) __b); +} -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vcages_f32 (float32_t __a, float32_t __b) +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vcgez_u16 (uint16x4_t __a) { - return __builtin_fabsf (__a) >= __builtin_fabsf (__b) ? -1 : 0; + uint16x4_t __b = {0, 0, 0, 0}; + return (uint16x4_t) __builtin_aarch64_cmgeuv4hi ((int16x4_t) __a, + (int16x4_t) __b); } __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcage_f32 (float32x2_t __a, float32x2_t __b) +vcgez_u32 (uint32x2_t __a) { - return vabs_f32 (__a) >= vabs_f32 (__b); + uint32x2_t __b = {0, 0}; + return (uint32x2_t) __builtin_aarch64_cmgeuv2si ((int32x2_t) __a, + (int32x2_t) __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcageq_f32 (float32x4_t __a, float32x4_t __b) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcgez_u64 (uint64x1_t __a) { - return vabsq_f32 (__a) >= vabsq_f32 (__b); + return __a >= 0ll ? 
-1ll : 0ll; } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vcaged_f64 (float64_t __a, float64_t __b) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcgezq_f32 (float32x4_t __a) { - return __builtin_fabs (__a) >= __builtin_fabs (__b) ? -1 : 0; + float32x4_t __b = {0.0f, 0.0f, 0.0f, 0.0f}; + return (uint32x4_t) __builtin_aarch64_cmgev4sf (__a, __b); } __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcageq_f64 (float64x2_t __a, float64x2_t __b) +vcgezq_f64 (float64x2_t __a) { - return vabsq_f64 (__a) >= vabsq_f64 (__b); + float64x2_t __b = {0.0, 0.0}; + return (uint64x2_t) __builtin_aarch64_cmgev2df (__a, __b); } -/* vcagt */ - -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vcagts_f32 (float32_t __a, float32_t __b) +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcgezq_p8 (poly8x16_t __a) { - return __builtin_fabsf (__a) > __builtin_fabsf (__b) ? -1 : 0; + poly8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x16_t) __builtin_aarch64_cmgev16qi ((int8x16_t) __a, + (int8x16_t) __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcagt_f32 (float32x2_t __a, float32x2_t __b) +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcgezq_s8 (int8x16_t __a) { - return vabs_f32 (__a) > vabs_f32 (__b); + int8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x16_t) __builtin_aarch64_cmgev16qi (__a, __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcagtq_f32 (float32x4_t __a, float32x4_t __b) +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vcgezq_s16 (int16x8_t __a) { - return vabsq_f32 (__a) > vabsq_f32 (__b); + int16x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint16x8_t) __builtin_aarch64_cmgev8hi (__a, __b); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vcagtd_f64 (float64_t __a, float64_t __b) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcgezq_s32 (int32x4_t __a) { - return __builtin_fabs (__a) > __builtin_fabs (__b) ? 
-1 : 0; + int32x4_t __b = {0, 0, 0, 0}; + return (uint32x4_t) __builtin_aarch64_cmgev4si (__a, __b); } __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcagtq_f64 (float64x2_t __a, float64x2_t __b) +vcgezq_s64 (int64x2_t __a) { - return vabsq_f64 (__a) > vabsq_f64 (__b); + int64x2_t __b = {0, 0}; + return (uint64x2_t) __builtin_aarch64_cmgev2di (__a, __b); } -/* vcale */ +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vcgezq_u8 (uint8x16_t __a) +{ + uint8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x16_t) __builtin_aarch64_cmgeuv16qi ((int8x16_t) __a, + (int8x16_t) __b); +} -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcale_f32 (float32x2_t __a, float32x2_t __b) +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vcgezq_u16 (uint16x8_t __a) { - return vabs_f32 (__a) <= vabs_f32 (__b); + uint16x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint16x8_t) __builtin_aarch64_cmgeuv8hi ((int16x8_t) __a, + (int16x8_t) __b); } __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcaleq_f32 (float32x4_t __a, float32x4_t __b) +vcgezq_u32 (uint32x4_t __a) { - return vabsq_f32 (__a) <= vabsq_f32 (__b); + uint32x4_t __b = {0, 0, 0, 0}; + return (uint32x4_t) __builtin_aarch64_cmgeuv4si ((int32x4_t) __a, + (int32x4_t) __b); } __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcaleq_f64 (float64x2_t __a, float64x2_t __b) +vcgezq_u64 (uint64x2_t __a) { - return vabsq_f64 (__a) <= vabsq_f64 (__b); + uint64x2_t __b = {0, 0}; + return (uint64x2_t) __builtin_aarch64_cmgeuv2di ((int64x2_t) __a, + (int64x2_t) __b); } -/* vcalt */ +/* vcgez - scalar. */ -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcalt_f32 (float32x2_t __a, float32x2_t __b) +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcgezs_f32 (float32_t __a) { - return vabs_f32 (__a) < vabs_f32 (__b); + return __a >= 0.0f ? -1 : 0; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcaltq_f32 (float32x4_t __a, float32x4_t __b) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcgezd_s64 (int64x1_t __a) { - return vabsq_f32 (__a) < vabsq_f32 (__b); + return __a >= 0 ? -1ll : 0ll; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcaltq_f64 (float64x2_t __a, float64x2_t __b) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vcgezd_u64 (int64x1_t __a) { - return vabsq_f64 (__a) < vabsq_f64 (__b); + return __a >= 0 ? -1ll : 0ll; } -/* vceq - vector. */ +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vcgezd_f64 (float64_t __a) +{ + return __a >= 0.0 ? -1ll : 0ll; +} + +/* vcgt - vector. */ __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vceq_f32 (float32x2_t __a, float32x2_t __b) +vcgt_f32 (float32x2_t __a, float32x2_t __b) { - return (uint32x2_t) __builtin_aarch64_cmeqv2sf (__a, __b); + return (uint32x2_t) __builtin_aarch64_cmgtv2sf (__a, __b); } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vceq_f64 (float64x1_t __a, float64x1_t __b) +vcgt_f64 (float64x1_t __a, float64x1_t __b) { - return __a == __b ? -1ll : 0ll; + return __a > __b ? 
-1ll : 0ll; } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vceq_p8 (poly8x8_t __a, poly8x8_t __b) +vcgt_p8 (poly8x8_t __a, poly8x8_t __b) { - return (uint8x8_t) __builtin_aarch64_cmeqv8qi ((int8x8_t) __a, + return (uint8x8_t) __builtin_aarch64_cmgtv8qi ((int8x8_t) __a, (int8x8_t) __b); } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vceq_s8 (int8x8_t __a, int8x8_t __b) +vcgt_s8 (int8x8_t __a, int8x8_t __b) { - return (uint8x8_t) __builtin_aarch64_cmeqv8qi (__a, __b); + return (uint8x8_t) __builtin_aarch64_cmgtv8qi (__a, __b); } __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vceq_s16 (int16x4_t __a, int16x4_t __b) +vcgt_s16 (int16x4_t __a, int16x4_t __b) { - return (uint16x4_t) __builtin_aarch64_cmeqv4hi (__a, __b); + return (uint16x4_t) __builtin_aarch64_cmgtv4hi (__a, __b); } __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vceq_s32 (int32x2_t __a, int32x2_t __b) +vcgt_s32 (int32x2_t __a, int32x2_t __b) { - return (uint32x2_t) __builtin_aarch64_cmeqv2si (__a, __b); + return (uint32x2_t) __builtin_aarch64_cmgtv2si (__a, __b); } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vceq_s64 (int64x1_t __a, int64x1_t __b) +vcgt_s64 (int64x1_t __a, int64x1_t __b) { - return __a == __b ? -1ll : 0ll; + return __a > __b ? -1ll : 0ll; } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vceq_u8 (uint8x8_t __a, uint8x8_t __b) +vcgt_u8 (uint8x8_t __a, uint8x8_t __b) { - return (uint8x8_t) __builtin_aarch64_cmeqv8qi ((int8x8_t) __a, + return (uint8x8_t) __builtin_aarch64_cmgtuv8qi ((int8x8_t) __a, (int8x8_t) __b); } __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vceq_u16 (uint16x4_t __a, uint16x4_t __b) +vcgt_u16 (uint16x4_t __a, uint16x4_t __b) { - return (uint16x4_t) __builtin_aarch64_cmeqv4hi ((int16x4_t) __a, + return (uint16x4_t) __builtin_aarch64_cmgtuv4hi ((int16x4_t) __a, (int16x4_t) __b); } __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vceq_u32 (uint32x2_t __a, uint32x2_t __b) +vcgt_u32 (uint32x2_t __a, uint32x2_t __b) { - return (uint32x2_t) __builtin_aarch64_cmeqv2si ((int32x2_t) __a, + return (uint32x2_t) __builtin_aarch64_cmgtuv2si ((int32x2_t) __a, (int32x2_t) __b); } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vceq_u64 (uint64x1_t __a, uint64x1_t __b) +vcgt_u64 (uint64x1_t __a, uint64x1_t __b) { - return __a == __b ? -1ll : 0ll; + return __a > __b ? 
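/* Editorial aside -- illustrative only, not part of the patch: the vcgt_*
   variants above pick the builtin by signedness -- signed and poly inputs go
   through the __builtin_aarch64_cmgt* builtins, while the unsigned forms use
   the __builtin_aarch64_cmgtu* builtins, e.g.
   vcgt_u8 (__a, __b) -> __builtin_aarch64_cmgtuv8qi.  */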
-1ll : 0ll; } __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vceqq_f32 (float32x4_t __a, float32x4_t __b) +vcgtq_f32 (float32x4_t __a, float32x4_t __b) { - return (uint32x4_t) __builtin_aarch64_cmeqv4sf (__a, __b); + return (uint32x4_t) __builtin_aarch64_cmgtv4sf (__a, __b); } __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vceqq_f64 (float64x2_t __a, float64x2_t __b) +vcgtq_f64 (float64x2_t __a, float64x2_t __b) { - return (uint64x2_t) __builtin_aarch64_cmeqv2df (__a, __b); + return (uint64x2_t) __builtin_aarch64_cmgtv2df (__a, __b); } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vceqq_p8 (poly8x16_t __a, poly8x16_t __b) +vcgtq_p8 (poly8x16_t __a, poly8x16_t __b) { - return (uint8x16_t) __builtin_aarch64_cmeqv16qi ((int8x16_t) __a, + return (uint8x16_t) __builtin_aarch64_cmgtv16qi ((int8x16_t) __a, (int8x16_t) __b); } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vceqq_s8 (int8x16_t __a, int8x16_t __b) +vcgtq_s8 (int8x16_t __a, int8x16_t __b) { - return (uint8x16_t) __builtin_aarch64_cmeqv16qi (__a, __b); + return (uint8x16_t) __builtin_aarch64_cmgtv16qi (__a, __b); } __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vceqq_s16 (int16x8_t __a, int16x8_t __b) +vcgtq_s16 (int16x8_t __a, int16x8_t __b) { - return (uint16x8_t) __builtin_aarch64_cmeqv8hi (__a, __b); + return (uint16x8_t) __builtin_aarch64_cmgtv8hi (__a, __b); } __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vceqq_s32 (int32x4_t __a, int32x4_t __b) +vcgtq_s32 (int32x4_t __a, int32x4_t __b) { - return (uint32x4_t) __builtin_aarch64_cmeqv4si (__a, __b); + return (uint32x4_t) __builtin_aarch64_cmgtv4si (__a, __b); } __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vceqq_s64 (int64x2_t __a, int64x2_t __b) +vcgtq_s64 (int64x2_t __a, int64x2_t __b) { - return (uint64x2_t) __builtin_aarch64_cmeqv2di (__a, __b); + return (uint64x2_t) __builtin_aarch64_cmgtv2di (__a, __b); } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vceqq_u8 (uint8x16_t __a, uint8x16_t __b) +vcgtq_u8 (uint8x16_t __a, uint8x16_t __b) { - return (uint8x16_t) __builtin_aarch64_cmeqv16qi ((int8x16_t) __a, + return (uint8x16_t) __builtin_aarch64_cmgtuv16qi ((int8x16_t) __a, (int8x16_t) __b); } __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vceqq_u16 (uint16x8_t __a, uint16x8_t __b) +vcgtq_u16 (uint16x8_t __a, uint16x8_t __b) { - return (uint16x8_t) __builtin_aarch64_cmeqv8hi ((int16x8_t) __a, + return (uint16x8_t) __builtin_aarch64_cmgtuv8hi ((int16x8_t) __a, (int16x8_t) __b); } __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vceqq_u32 (uint32x4_t __a, uint32x4_t __b) +vcgtq_u32 (uint32x4_t __a, uint32x4_t __b) { - return (uint32x4_t) __builtin_aarch64_cmeqv4si ((int32x4_t) __a, + return (uint32x4_t) __builtin_aarch64_cmgtuv4si ((int32x4_t) __a, (int32x4_t) __b); } __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vceqq_u64 (uint64x2_t __a, uint64x2_t __b) +vcgtq_u64 (uint64x2_t __a, uint64x2_t __b) { - return (uint64x2_t) __builtin_aarch64_cmeqv2di ((int64x2_t) __a, + return (uint64x2_t) __builtin_aarch64_cmgtuv2di ((int64x2_t) __a, (int64x2_t) __b); } -/* vceq - scalar. */ +/* vcgt - scalar. 
*/ __extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vceqs_f32 (float32_t __a, float32_t __b) +vcgts_f32 (float32_t __a, float32_t __b) { - return __a == __b ? -1 : 0; + return __a > __b ? -1 : 0; } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vceqd_s64 (int64x1_t __a, int64x1_t __b) +vcgtd_s64 (int64x1_t __a, int64x1_t __b) { - return __a == __b ? -1ll : 0ll; + return __a > __b ? -1ll : 0ll; } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vceqd_u64 (uint64x1_t __a, uint64x1_t __b) +vcgtd_u64 (uint64x1_t __a, uint64x1_t __b) { - return __a == __b ? -1ll : 0ll; + return __a > __b ? -1ll : 0ll; } __extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vceqd_f64 (float64_t __a, float64_t __b) +vcgtd_f64 (float64_t __a, float64_t __b) { - return __a == __b ? -1ll : 0ll; + return __a > __b ? -1ll : 0ll; } -/* vceqz - vector. */ +/* vcgtz - vector. */ __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vceqz_f32 (float32x2_t __a) +vcgtz_f32 (float32x2_t __a) { float32x2_t __b = {0.0f, 0.0f}; - return (uint32x2_t) __builtin_aarch64_cmeqv2sf (__a, __b); + return (uint32x2_t) __builtin_aarch64_cmgtv2sf (__a, __b); } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vceqz_f64 (float64x1_t __a) +vcgtz_f64 (float64x1_t __a) { - return __a == 0.0 ? -1ll : 0ll; + return __a > 0.0 ? -1ll : 0ll; } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vceqz_p8 (poly8x8_t __a) +vcgtz_p8 (poly8x8_t __a) { poly8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; - return (uint8x8_t) __builtin_aarch64_cmeqv8qi ((int8x8_t) __a, + return (uint8x8_t) __builtin_aarch64_cmgtv8qi ((int8x8_t) __a, (int8x8_t) __b); } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vceqz_s8 (int8x8_t __a) +vcgtz_s8 (int8x8_t __a) { int8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; - return (uint8x8_t) __builtin_aarch64_cmeqv8qi (__a, __b); + return (uint8x8_t) __builtin_aarch64_cmgtv8qi (__a, __b); } __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vceqz_s16 (int16x4_t __a) +vcgtz_s16 (int16x4_t __a) { int16x4_t __b = {0, 0, 0, 0}; - return (uint16x4_t) __builtin_aarch64_cmeqv4hi (__a, __b); + return (uint16x4_t) __builtin_aarch64_cmgtv4hi (__a, __b); } __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vceqz_s32 (int32x2_t __a) +vcgtz_s32 (int32x2_t __a) { int32x2_t __b = {0, 0}; - return (uint32x2_t) __builtin_aarch64_cmeqv2si (__a, __b); + return (uint32x2_t) __builtin_aarch64_cmgtv2si (__a, __b); } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vceqz_s64 (int64x1_t __a) +vcgtz_s64 (int64x1_t __a) { - return __a == 0ll ? -1ll : 0ll; + return __a > 0ll ? 
-1ll : 0ll; } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vceqz_u8 (uint8x8_t __a) +vcgtz_u8 (uint8x8_t __a) { uint8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; - return (uint8x8_t) __builtin_aarch64_cmeqv8qi ((int8x8_t) __a, + return (uint8x8_t) __builtin_aarch64_cmgtuv8qi ((int8x8_t) __a, (int8x8_t) __b); } __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vceqz_u16 (uint16x4_t __a) +vcgtz_u16 (uint16x4_t __a) { uint16x4_t __b = {0, 0, 0, 0}; - return (uint16x4_t) __builtin_aarch64_cmeqv4hi ((int16x4_t) __a, + return (uint16x4_t) __builtin_aarch64_cmgtuv4hi ((int16x4_t) __a, (int16x4_t) __b); } __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vceqz_u32 (uint32x2_t __a) +vcgtz_u32 (uint32x2_t __a) { uint32x2_t __b = {0, 0}; - return (uint32x2_t) __builtin_aarch64_cmeqv2si ((int32x2_t) __a, + return (uint32x2_t) __builtin_aarch64_cmgtuv2si ((int32x2_t) __a, (int32x2_t) __b); } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vceqz_u64 (uint64x1_t __a) +vcgtz_u64 (uint64x1_t __a) { - return __a == 0ll ? -1ll : 0ll; + return __a > 0ll ? -1ll : 0ll; } __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vceqzq_f32 (float32x4_t __a) +vcgtzq_f32 (float32x4_t __a) { float32x4_t __b = {0.0f, 0.0f, 0.0f, 0.0f}; - return (uint32x4_t) __builtin_aarch64_cmeqv4sf (__a, __b); + return (uint32x4_t) __builtin_aarch64_cmgtv4sf (__a, __b); } __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vceqzq_f64 (float64x2_t __a) +vcgtzq_f64 (float64x2_t __a) { float64x2_t __b = {0.0, 0.0}; - return (uint64x2_t) __builtin_aarch64_cmeqv2df (__a, __b); + return (uint64x2_t) __builtin_aarch64_cmgtv2df (__a, __b); } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vceqzq_p8 (poly8x16_t __a) +vcgtzq_p8 (poly8x16_t __a) { poly8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - return (uint8x16_t) __builtin_aarch64_cmeqv16qi ((int8x16_t) __a, + return (uint8x16_t) __builtin_aarch64_cmgtv16qi ((int8x16_t) __a, (int8x16_t) __b); } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vceqzq_s8 (int8x16_t __a) +vcgtzq_s8 (int8x16_t __a) { int8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - return (uint8x16_t) __builtin_aarch64_cmeqv16qi (__a, __b); + return (uint8x16_t) __builtin_aarch64_cmgtv16qi (__a, __b); } __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vceqzq_s16 (int16x8_t __a) +vcgtzq_s16 (int16x8_t __a) { int16x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; - return (uint16x8_t) __builtin_aarch64_cmeqv8hi (__a, __b); + return (uint16x8_t) __builtin_aarch64_cmgtv8hi (__a, __b); } __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vceqzq_s32 (int32x4_t __a) +vcgtzq_s32 (int32x4_t __a) { int32x4_t __b = {0, 0, 0, 0}; - return (uint32x4_t) __builtin_aarch64_cmeqv4si (__a, __b); + return (uint32x4_t) __builtin_aarch64_cmgtv4si (__a, __b); } __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vceqzq_s64 (int64x2_t __a) +vcgtzq_s64 (int64x2_t __a) { int64x2_t __b = {0, 0}; - return (uint64x2_t) __builtin_aarch64_cmeqv2di (__a, __b); + return (uint64x2_t) __builtin_aarch64_cmgtv2di (__a, __b); } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vceqzq_u8 (uint8x16_t __a) +vcgtzq_u8 (uint8x16_t __a) { uint8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - return 
(uint8x16_t) __builtin_aarch64_cmeqv16qi ((int8x16_t) __a, + return (uint8x16_t) __builtin_aarch64_cmgtuv16qi ((int8x16_t) __a, (int8x16_t) __b); } __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vceqzq_u16 (uint16x8_t __a) +vcgtzq_u16 (uint16x8_t __a) { uint16x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; - return (uint16x8_t) __builtin_aarch64_cmeqv8hi ((int16x8_t) __a, + return (uint16x8_t) __builtin_aarch64_cmgtuv8hi ((int16x8_t) __a, (int16x8_t) __b); } __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vceqzq_u32 (uint32x4_t __a) +vcgtzq_u32 (uint32x4_t __a) { uint32x4_t __b = {0, 0, 0, 0}; - return (uint32x4_t) __builtin_aarch64_cmeqv4si ((int32x4_t) __a, + return (uint32x4_t) __builtin_aarch64_cmgtuv4si ((int32x4_t) __a, (int32x4_t) __b); } __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vceqzq_u64 (uint64x2_t __a) +vcgtzq_u64 (uint64x2_t __a) { uint64x2_t __b = {0, 0}; - return (uint64x2_t) __builtin_aarch64_cmeqv2di ((int64x2_t) __a, + return (uint64x2_t) __builtin_aarch64_cmgtuv2di ((int64x2_t) __a, (int64x2_t) __b); } -/* vceqz - scalar. */ +/* vcgtz - scalar. */ __extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vceqzs_f32 (float32_t __a) +vcgtzs_f32 (float32_t __a) { - return __a == 0.0f ? -1 : 0; + return __a > 0.0f ? -1 : 0; } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vceqzd_s64 (int64x1_t __a) +vcgtzd_s64 (int64x1_t __a) { - return __a == 0 ? -1ll : 0ll; + return __a > 0 ? -1ll : 0ll; } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vceqzd_u64 (int64x1_t __a) +vcgtzd_u64 (int64x1_t __a) { - return __a == 0 ? -1ll : 0ll; + return __a > 0 ? -1ll : 0ll; } __extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vceqzd_f64 (float64_t __a) +vcgtzd_f64 (float64_t __a) { - return __a == 0.0 ? -1ll : 0ll; + return __a > 0.0 ? -1ll : 0ll; } -/* vcge - vector. */ +/* vcle - vector. */ __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcge_f32 (float32x2_t __a, float32x2_t __b) +vcle_f32 (float32x2_t __a, float32x2_t __b) { - return (uint32x2_t) __builtin_aarch64_cmgev2sf (__a, __b); + return (uint32x2_t) __builtin_aarch64_cmgev2sf (__b, __a); } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcge_f64 (float64x1_t __a, float64x1_t __b) +vcle_f64 (float64x1_t __a, float64x1_t __b) { - return __a >= __b ? -1ll : 0ll; + return __a <= __b ? 
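/* Editorial aside -- illustrative only, not part of the patch: no separate
   "less-or-equal" builtin is used here; the vcle_* vector variants above call
   the >= builtins with the operands swapped, so
   vcle_s32 (__a, __b) is __builtin_aarch64_cmgev2si (__b, __a).  A quick
   equivalence sketch under that reading:

       int32x2_t a = {1, 7}, b = {2, 7};
       // vcle_s32 (a, b) and vcge_s32 (b, a) produce the same mask:
       // {0xffffffff, 0xffffffff}
*/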
-1ll : 0ll; } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vcge_p8 (poly8x8_t __a, poly8x8_t __b) +vcle_p8 (poly8x8_t __a, poly8x8_t __b) { - return (uint8x8_t) __builtin_aarch64_cmgev8qi ((int8x8_t) __a, - (int8x8_t) __b); + return (uint8x8_t) __builtin_aarch64_cmgev8qi ((int8x8_t) __b, + (int8x8_t) __a); } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vcge_s8 (int8x8_t __a, int8x8_t __b) +vcle_s8 (int8x8_t __a, int8x8_t __b) { - return (uint8x8_t) __builtin_aarch64_cmgev8qi (__a, __b); + return (uint8x8_t) __builtin_aarch64_cmgev8qi (__b, __a); } __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vcge_s16 (int16x4_t __a, int16x4_t __b) +vcle_s16 (int16x4_t __a, int16x4_t __b) { - return (uint16x4_t) __builtin_aarch64_cmgev4hi (__a, __b); + return (uint16x4_t) __builtin_aarch64_cmgev4hi (__b, __a); } __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcge_s32 (int32x2_t __a, int32x2_t __b) +vcle_s32 (int32x2_t __a, int32x2_t __b) { - return (uint32x2_t) __builtin_aarch64_cmgev2si (__a, __b); + return (uint32x2_t) __builtin_aarch64_cmgev2si (__b, __a); } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcge_s64 (int64x1_t __a, int64x1_t __b) +vcle_s64 (int64x1_t __a, int64x1_t __b) { - return __a >= __b ? -1ll : 0ll; + return __a <= __b ? -1ll : 0ll; } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vcge_u8 (uint8x8_t __a, uint8x8_t __b) +vcle_u8 (uint8x8_t __a, uint8x8_t __b) { - return (uint8x8_t) __builtin_aarch64_cmgeuv8qi ((int8x8_t) __a, - (int8x8_t) __b); + return (uint8x8_t) __builtin_aarch64_cmgeuv8qi ((int8x8_t) __b, + (int8x8_t) __a); } __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vcge_u16 (uint16x4_t __a, uint16x4_t __b) +vcle_u16 (uint16x4_t __a, uint16x4_t __b) { - return (uint16x4_t) __builtin_aarch64_cmgeuv4hi ((int16x4_t) __a, - (int16x4_t) __b); + return (uint16x4_t) __builtin_aarch64_cmgeuv4hi ((int16x4_t) __b, + (int16x4_t) __a); } __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcge_u32 (uint32x2_t __a, uint32x2_t __b) +vcle_u32 (uint32x2_t __a, uint32x2_t __b) { - return (uint32x2_t) __builtin_aarch64_cmgeuv2si ((int32x2_t) __a, - (int32x2_t) __b); + return (uint32x2_t) __builtin_aarch64_cmgeuv2si ((int32x2_t) __b, + (int32x2_t) __a); } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcge_u64 (uint64x1_t __a, uint64x1_t __b) +vcle_u64 (uint64x1_t __a, uint64x1_t __b) { - return __a >= __b ? -1ll : 0ll; + return __a <= __b ? 
-1ll : 0ll; } __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcgeq_f32 (float32x4_t __a, float32x4_t __b) +vcleq_f32 (float32x4_t __a, float32x4_t __b) { - return (uint32x4_t) __builtin_aarch64_cmgev4sf (__a, __b); + return (uint32x4_t) __builtin_aarch64_cmgev4sf (__b, __a); } __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcgeq_f64 (float64x2_t __a, float64x2_t __b) +vcleq_f64 (float64x2_t __a, float64x2_t __b) { - return (uint64x2_t) __builtin_aarch64_cmgev2df (__a, __b); + return (uint64x2_t) __builtin_aarch64_cmgev2df (__b, __a); } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vcgeq_p8 (poly8x16_t __a, poly8x16_t __b) +vcleq_p8 (poly8x16_t __a, poly8x16_t __b) { - return (uint8x16_t) __builtin_aarch64_cmgev16qi ((int8x16_t) __a, - (int8x16_t) __b); + return (uint8x16_t) __builtin_aarch64_cmgev16qi ((int8x16_t) __b, + (int8x16_t) __a); } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vcgeq_s8 (int8x16_t __a, int8x16_t __b) +vcleq_s8 (int8x16_t __a, int8x16_t __b) { - return (uint8x16_t) __builtin_aarch64_cmgev16qi (__a, __b); + return (uint8x16_t) __builtin_aarch64_cmgev16qi (__b, __a); } __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vcgeq_s16 (int16x8_t __a, int16x8_t __b) +vcleq_s16 (int16x8_t __a, int16x8_t __b) { - return (uint16x8_t) __builtin_aarch64_cmgev8hi (__a, __b); + return (uint16x8_t) __builtin_aarch64_cmgev8hi (__b, __a); } __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcgeq_s32 (int32x4_t __a, int32x4_t __b) +vcleq_s32 (int32x4_t __a, int32x4_t __b) { - return (uint32x4_t) __builtin_aarch64_cmgev4si (__a, __b); + return (uint32x4_t) __builtin_aarch64_cmgev4si (__b, __a); } __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcgeq_s64 (int64x2_t __a, int64x2_t __b) +vcleq_s64 (int64x2_t __a, int64x2_t __b) { - return (uint64x2_t) __builtin_aarch64_cmgev2di (__a, __b); + return (uint64x2_t) __builtin_aarch64_cmgev2di (__b, __a); } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vcgeq_u8 (uint8x16_t __a, uint8x16_t __b) +vcleq_u8 (uint8x16_t __a, uint8x16_t __b) { - return (uint8x16_t) __builtin_aarch64_cmgeuv16qi ((int8x16_t) __a, - (int8x16_t) __b); + return (uint8x16_t) __builtin_aarch64_cmgeuv16qi ((int8x16_t) __b, + (int8x16_t) __a); } __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vcgeq_u16 (uint16x8_t __a, uint16x8_t __b) +vcleq_u16 (uint16x8_t __a, uint16x8_t __b) { - return (uint16x8_t) __builtin_aarch64_cmgeuv8hi ((int16x8_t) __a, - (int16x8_t) __b); + return (uint16x8_t) __builtin_aarch64_cmgeuv8hi ((int16x8_t) __b, + (int16x8_t) __a); } __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcgeq_u32 (uint32x4_t __a, uint32x4_t __b) +vcleq_u32 (uint32x4_t __a, uint32x4_t __b) { - return (uint32x4_t) __builtin_aarch64_cmgeuv4si ((int32x4_t) __a, - (int32x4_t) __b); + return (uint32x4_t) __builtin_aarch64_cmgeuv4si ((int32x4_t) __b, + (int32x4_t) __a); } __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcgeq_u64 (uint64x2_t __a, uint64x2_t __b) +vcleq_u64 (uint64x2_t __a, uint64x2_t __b) { - return (uint64x2_t) __builtin_aarch64_cmgeuv2di ((int64x2_t) __a, - (int64x2_t) __b); + return (uint64x2_t) __builtin_aarch64_cmgeuv2di ((int64x2_t) __b, + (int64x2_t) __a); } -/* vcge - scalar. */ +/* vcle - scalar. 
*/ __extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vcges_f32 (float32_t __a, float32_t __b) +vcles_f32 (float32_t __a, float32_t __b) { - return __a >= __b ? -1 : 0; + return __a <= __b ? -1 : 0; } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcged_s64 (int64x1_t __a, int64x1_t __b) +vcled_s64 (int64x1_t __a, int64x1_t __b) { - return __a >= __b ? -1ll : 0ll; + return __a <= __b ? -1ll : 0ll; } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcged_u64 (uint64x1_t __a, uint64x1_t __b) +vcled_u64 (uint64x1_t __a, uint64x1_t __b) { - return __a >= __b ? -1ll : 0ll; + return __a <= __b ? -1ll : 0ll; } __extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vcged_f64 (float64_t __a, float64_t __b) +vcled_f64 (float64_t __a, float64_t __b) { - return __a >= __b ? -1ll : 0ll; + return __a <= __b ? -1ll : 0ll; } -/* vcgez - vector. */ +/* vclez - vector. */ __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcgez_f32 (float32x2_t __a) +vclez_f32 (float32x2_t __a) { float32x2_t __b = {0.0f, 0.0f}; - return (uint32x2_t) __builtin_aarch64_cmgev2sf (__a, __b); + return (uint32x2_t) __builtin_aarch64_cmlev2sf (__a, __b); } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcgez_f64 (float64x1_t __a) +vclez_f64 (float64x1_t __a) { - return __a >= 0.0 ? -1ll : 0ll; + return __a <= 0.0 ? -1ll : 0ll; } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vcgez_p8 (poly8x8_t __a) +vclez_p8 (poly8x8_t __a) { poly8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; - return (uint8x8_t) __builtin_aarch64_cmgev8qi ((int8x8_t) __a, + return (uint8x8_t) __builtin_aarch64_cmlev8qi ((int8x8_t) __a, (int8x8_t) __b); } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vcgez_s8 (int8x8_t __a) +vclez_s8 (int8x8_t __a) { int8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; - return (uint8x8_t) __builtin_aarch64_cmgev8qi (__a, __b); + return (uint8x8_t) __builtin_aarch64_cmlev8qi (__a, __b); } __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vcgez_s16 (int16x4_t __a) +vclez_s16 (int16x4_t __a) { int16x4_t __b = {0, 0, 0, 0}; - return (uint16x4_t) __builtin_aarch64_cmgev4hi (__a, __b); + return (uint16x4_t) __builtin_aarch64_cmlev4hi (__a, __b); } __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcgez_s32 (int32x2_t __a) +vclez_s32 (int32x2_t __a) { int32x2_t __b = {0, 0}; - return (uint32x2_t) __builtin_aarch64_cmgev2si (__a, __b); -} - -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcgez_s64 (int64x1_t __a) -{ - return __a >= 0ll ? 
-1ll : 0ll; -} - -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vcgez_u8 (uint8x8_t __a) -{ - uint8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; - return (uint8x8_t) __builtin_aarch64_cmgeuv8qi ((int8x8_t) __a, - (int8x8_t) __b); -} - -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vcgez_u16 (uint16x4_t __a) -{ - uint16x4_t __b = {0, 0, 0, 0}; - return (uint16x4_t) __builtin_aarch64_cmgeuv4hi ((int16x4_t) __a, - (int16x4_t) __b); + return (uint32x2_t) __builtin_aarch64_cmlev2si (__a, __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcgez_u32 (uint32x2_t __a) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vclez_s64 (int64x1_t __a) { - uint32x2_t __b = {0, 0}; - return (uint32x2_t) __builtin_aarch64_cmgeuv2si ((int32x2_t) __a, - (int32x2_t) __b); + return __a <= 0ll ? -1ll : 0ll; } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcgez_u64 (uint64x1_t __a) +vclez_u64 (uint64x1_t __a) { - return __a >= 0ll ? -1ll : 0ll; + return __a <= 0ll ? -1ll : 0ll; } __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcgezq_f32 (float32x4_t __a) +vclezq_f32 (float32x4_t __a) { float32x4_t __b = {0.0f, 0.0f, 0.0f, 0.0f}; - return (uint32x4_t) __builtin_aarch64_cmgev4sf (__a, __b); + return (uint32x4_t) __builtin_aarch64_cmlev4sf (__a, __b); } __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcgezq_f64 (float64x2_t __a) +vclezq_f64 (float64x2_t __a) { float64x2_t __b = {0.0, 0.0}; - return (uint64x2_t) __builtin_aarch64_cmgev2df (__a, __b); + return (uint64x2_t) __builtin_aarch64_cmlev2df (__a, __b); } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vcgezq_p8 (poly8x16_t __a) +vclezq_p8 (poly8x16_t __a) { poly8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - return (uint8x16_t) __builtin_aarch64_cmgev16qi ((int8x16_t) __a, + return (uint8x16_t) __builtin_aarch64_cmlev16qi ((int8x16_t) __a, (int8x16_t) __b); } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vcgezq_s8 (int8x16_t __a) +vclezq_s8 (int8x16_t __a) { int8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - return (uint8x16_t) __builtin_aarch64_cmgev16qi (__a, __b); + return (uint8x16_t) __builtin_aarch64_cmlev16qi (__a, __b); } __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vcgezq_s16 (int16x8_t __a) +vclezq_s16 (int16x8_t __a) { int16x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; - return (uint16x8_t) __builtin_aarch64_cmgev8hi (__a, __b); + return (uint16x8_t) __builtin_aarch64_cmlev8hi (__a, __b); } __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcgezq_s32 (int32x4_t __a) +vclezq_s32 (int32x4_t __a) { int32x4_t __b = {0, 0, 0, 0}; - return (uint32x4_t) __builtin_aarch64_cmgev4si (__a, __b); + return (uint32x4_t) __builtin_aarch64_cmlev4si (__a, __b); } __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcgezq_s64 (int64x2_t __a) +vclezq_s64 (int64x2_t __a) { int64x2_t __b = {0, 0}; - return (uint64x2_t) __builtin_aarch64_cmgev2di (__a, __b); -} - -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vcgezq_u8 (uint8x16_t __a) -{ - uint8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0}; - return (uint8x16_t) __builtin_aarch64_cmgeuv16qi ((int8x16_t) __a, - (int8x16_t) __b); -} - -__extension__ static __inline uint16x8_t __attribute__ 
((__always_inline__)) -vcgezq_u16 (uint16x8_t __a) -{ - uint16x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; - return (uint16x8_t) __builtin_aarch64_cmgeuv8hi ((int16x8_t) __a, - (int16x8_t) __b); -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcgezq_u32 (uint32x4_t __a) -{ - uint32x4_t __b = {0, 0, 0, 0}; - return (uint32x4_t) __builtin_aarch64_cmgeuv4si ((int32x4_t) __a, - (int32x4_t) __b); -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcgezq_u64 (uint64x2_t __a) -{ - uint64x2_t __b = {0, 0}; - return (uint64x2_t) __builtin_aarch64_cmgeuv2di ((int64x2_t) __a, - (int64x2_t) __b); + return (uint64x2_t) __builtin_aarch64_cmlev2di (__a, __b); } -/* vcgez - scalar. */ +/* vclez - scalar. */ __extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vcgezs_f32 (float32_t __a) +vclezs_f32 (float32_t __a) { - return __a >= 0.0f ? -1 : 0; + return __a <= 0.0f ? -1 : 0; } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcgezd_s64 (int64x1_t __a) +vclezd_s64 (int64x1_t __a) { - return __a >= 0 ? -1ll : 0ll; + return __a <= 0 ? -1ll : 0ll; } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcgezd_u64 (int64x1_t __a) +vclezd_u64 (int64x1_t __a) { - return __a >= 0 ? -1ll : 0ll; + return __a <= 0 ? -1ll : 0ll; } __extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vcgezd_f64 (float64_t __a) +vclezd_f64 (float64_t __a) { - return __a >= 0.0 ? -1ll : 0ll; + return __a <= 0.0 ? -1ll : 0ll; } -/* vcgt - vector. */ +/* vclt - vector. */ __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcgt_f32 (float32x2_t __a, float32x2_t __b) +vclt_f32 (float32x2_t __a, float32x2_t __b) { - return (uint32x2_t) __builtin_aarch64_cmgtv2sf (__a, __b); + return (uint32x2_t) __builtin_aarch64_cmgtv2sf (__b, __a); } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcgt_f64 (float64x1_t __a, float64x1_t __b) +vclt_f64 (float64x1_t __a, float64x1_t __b) { - return __a > __b ? -1ll : 0ll; + return __a < __b ? -1ll : 0ll; } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vcgt_p8 (poly8x8_t __a, poly8x8_t __b) +vclt_p8 (poly8x8_t __a, poly8x8_t __b) { - return (uint8x8_t) __builtin_aarch64_cmgtv8qi ((int8x8_t) __a, - (int8x8_t) __b); + return (uint8x8_t) __builtin_aarch64_cmgtv8qi ((int8x8_t) __b, + (int8x8_t) __a); } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vcgt_s8 (int8x8_t __a, int8x8_t __b) +vclt_s8 (int8x8_t __a, int8x8_t __b) { - return (uint8x8_t) __builtin_aarch64_cmgtv8qi (__a, __b); + return (uint8x8_t) __builtin_aarch64_cmgtv8qi (__b, __a); } __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vcgt_s16 (int16x4_t __a, int16x4_t __b) +vclt_s16 (int16x4_t __a, int16x4_t __b) { - return (uint16x4_t) __builtin_aarch64_cmgtv4hi (__a, __b); + return (uint16x4_t) __builtin_aarch64_cmgtv4hi (__b, __a); } __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcgt_s32 (int32x2_t __a, int32x2_t __b) +vclt_s32 (int32x2_t __a, int32x2_t __b) { - return (uint32x2_t) __builtin_aarch64_cmgtv2si (__a, __b); + return (uint32x2_t) __builtin_aarch64_cmgtv2si (__b, __a); } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcgt_s64 (int64x1_t __a, int64x1_t __b) +vclt_s64 (int64x1_t __a, int64x1_t __b) { - return __a > __b ? -1ll : 0ll; + return __a < __b ? 
-1ll : 0ll; } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vcgt_u8 (uint8x8_t __a, uint8x8_t __b) +vclt_u8 (uint8x8_t __a, uint8x8_t __b) { - return (uint8x8_t) __builtin_aarch64_cmgtuv8qi ((int8x8_t) __a, - (int8x8_t) __b); + return (uint8x8_t) __builtin_aarch64_cmgtuv8qi ((int8x8_t) __b, + (int8x8_t) __a); } __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vcgt_u16 (uint16x4_t __a, uint16x4_t __b) +vclt_u16 (uint16x4_t __a, uint16x4_t __b) { - return (uint16x4_t) __builtin_aarch64_cmgtuv4hi ((int16x4_t) __a, - (int16x4_t) __b); + return (uint16x4_t) __builtin_aarch64_cmgtuv4hi ((int16x4_t) __b, + (int16x4_t) __a); } __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcgt_u32 (uint32x2_t __a, uint32x2_t __b) +vclt_u32 (uint32x2_t __a, uint32x2_t __b) { - return (uint32x2_t) __builtin_aarch64_cmgtuv2si ((int32x2_t) __a, - (int32x2_t) __b); + return (uint32x2_t) __builtin_aarch64_cmgtuv2si ((int32x2_t) __b, + (int32x2_t) __a); } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcgt_u64 (uint64x1_t __a, uint64x1_t __b) +vclt_u64 (uint64x1_t __a, uint64x1_t __b) { - return __a > __b ? -1ll : 0ll; + return __a < __b ? -1ll : 0ll; } __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcgtq_f32 (float32x4_t __a, float32x4_t __b) +vcltq_f32 (float32x4_t __a, float32x4_t __b) { - return (uint32x4_t) __builtin_aarch64_cmgtv4sf (__a, __b); + return (uint32x4_t) __builtin_aarch64_cmgtv4sf (__b, __a); } __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcgtq_f64 (float64x2_t __a, float64x2_t __b) +vcltq_f64 (float64x2_t __a, float64x2_t __b) { - return (uint64x2_t) __builtin_aarch64_cmgtv2df (__a, __b); + return (uint64x2_t) __builtin_aarch64_cmgtv2df (__b, __a); } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vcgtq_p8 (poly8x16_t __a, poly8x16_t __b) +vcltq_p8 (poly8x16_t __a, poly8x16_t __b) { - return (uint8x16_t) __builtin_aarch64_cmgtv16qi ((int8x16_t) __a, - (int8x16_t) __b); + return (uint8x16_t) __builtin_aarch64_cmgtv16qi ((int8x16_t) __b, + (int8x16_t) __a); } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vcgtq_s8 (int8x16_t __a, int8x16_t __b) +vcltq_s8 (int8x16_t __a, int8x16_t __b) { - return (uint8x16_t) __builtin_aarch64_cmgtv16qi (__a, __b); + return (uint8x16_t) __builtin_aarch64_cmgtv16qi (__b, __a); } __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vcgtq_s16 (int16x8_t __a, int16x8_t __b) +vcltq_s16 (int16x8_t __a, int16x8_t __b) { - return (uint16x8_t) __builtin_aarch64_cmgtv8hi (__a, __b); + return (uint16x8_t) __builtin_aarch64_cmgtv8hi (__b, __a); } __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcgtq_s32 (int32x4_t __a, int32x4_t __b) +vcltq_s32 (int32x4_t __a, int32x4_t __b) { - return (uint32x4_t) __builtin_aarch64_cmgtv4si (__a, __b); + return (uint32x4_t) __builtin_aarch64_cmgtv4si (__b, __a); } __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcgtq_s64 (int64x2_t __a, int64x2_t __b) +vcltq_s64 (int64x2_t __a, int64x2_t __b) { - return (uint64x2_t) __builtin_aarch64_cmgtv2di (__a, __b); + return (uint64x2_t) __builtin_aarch64_cmgtv2di (__b, __a); } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vcgtq_u8 (uint8x16_t __a, uint8x16_t __b) +vcltq_u8 (uint8x16_t __a, uint8x16_t __b) { - return (uint8x16_t) 
__builtin_aarch64_cmgtuv16qi ((int8x16_t) __a, - (int8x16_t) __b); + return (uint8x16_t) __builtin_aarch64_cmgtuv16qi ((int8x16_t) __b, + (int8x16_t) __a); } __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vcgtq_u16 (uint16x8_t __a, uint16x8_t __b) +vcltq_u16 (uint16x8_t __a, uint16x8_t __b) { - return (uint16x8_t) __builtin_aarch64_cmgtuv8hi ((int16x8_t) __a, - (int16x8_t) __b); + return (uint16x8_t) __builtin_aarch64_cmgtuv8hi ((int16x8_t) __b, + (int16x8_t) __a); } __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcgtq_u32 (uint32x4_t __a, uint32x4_t __b) +vcltq_u32 (uint32x4_t __a, uint32x4_t __b) { - return (uint32x4_t) __builtin_aarch64_cmgtuv4si ((int32x4_t) __a, - (int32x4_t) __b); + return (uint32x4_t) __builtin_aarch64_cmgtuv4si ((int32x4_t) __b, + (int32x4_t) __a); } __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcgtq_u64 (uint64x2_t __a, uint64x2_t __b) +vcltq_u64 (uint64x2_t __a, uint64x2_t __b) { - return (uint64x2_t) __builtin_aarch64_cmgtuv2di ((int64x2_t) __a, - (int64x2_t) __b); + return (uint64x2_t) __builtin_aarch64_cmgtuv2di ((int64x2_t) __b, + (int64x2_t) __a); } -/* vcgt - scalar. */ +/* vclt - scalar. */ __extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vcgts_f32 (float32_t __a, float32_t __b) +vclts_f32 (float32_t __a, float32_t __b) { - return __a > __b ? -1 : 0; + return __a < __b ? -1 : 0; } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcgtd_s64 (int64x1_t __a, int64x1_t __b) +vcltd_s64 (int64x1_t __a, int64x1_t __b) { - return __a > __b ? -1ll : 0ll; + return __a < __b ? -1ll : 0ll; } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcgtd_u64 (uint64x1_t __a, uint64x1_t __b) +vcltd_u64 (uint64x1_t __a, uint64x1_t __b) { - return __a > __b ? -1ll : 0ll; + return __a < __b ? -1ll : 0ll; } __extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vcgtd_f64 (float64_t __a, float64_t __b) +vcltd_f64 (float64_t __a, float64_t __b) { - return __a > __b ? -1ll : 0ll; + return __a < __b ? -1ll : 0ll; } -/* vcgtz - vector. */ +/* vcltz - vector. */ __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcgtz_f32 (float32x2_t __a) +vcltz_f32 (float32x2_t __a) { float32x2_t __b = {0.0f, 0.0f}; - return (uint32x2_t) __builtin_aarch64_cmgtv2sf (__a, __b); -} - -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcgtz_f64 (float64x1_t __a) -{ - return __a > 0.0 ? 
-1ll : 0ll; -} - -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vcgtz_p8 (poly8x8_t __a) -{ - poly8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; - return (uint8x8_t) __builtin_aarch64_cmgtv8qi ((int8x8_t) __a, - (int8x8_t) __b); -} - -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vcgtz_s8 (int8x8_t __a) -{ - int8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; - return (uint8x8_t) __builtin_aarch64_cmgtv8qi (__a, __b); -} - -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vcgtz_s16 (int16x4_t __a) -{ - int16x4_t __b = {0, 0, 0, 0}; - return (uint16x4_t) __builtin_aarch64_cmgtv4hi (__a, __b); -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcgtz_s32 (int32x2_t __a) -{ - int32x2_t __b = {0, 0}; - return (uint32x2_t) __builtin_aarch64_cmgtv2si (__a, __b); + return (uint32x2_t) __builtin_aarch64_cmltv2sf (__a, __b); } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcgtz_s64 (int64x1_t __a) +vcltz_f64 (float64x1_t __a) { - return __a > 0ll ? -1ll : 0ll; + return __a < 0.0 ? -1ll : 0ll; } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vcgtz_u8 (uint8x8_t __a) +vcltz_p8 (poly8x8_t __a) { - uint8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; - return (uint8x8_t) __builtin_aarch64_cmgtuv8qi ((int8x8_t) __a, + poly8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x8_t) __builtin_aarch64_cmltv8qi ((int8x8_t) __a, (int8x8_t) __b); } +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vcltz_s8 (int8x8_t __a) +{ + int8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; + return (uint8x8_t) __builtin_aarch64_cmltv8qi (__a, __b); +} + __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vcgtz_u16 (uint16x4_t __a) +vcltz_s16 (int16x4_t __a) { - uint16x4_t __b = {0, 0, 0, 0}; - return (uint16x4_t) __builtin_aarch64_cmgtuv4hi ((int16x4_t) __a, - (int16x4_t) __b); + int16x4_t __b = {0, 0, 0, 0}; + return (uint16x4_t) __builtin_aarch64_cmltv4hi (__a, __b); } __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcgtz_u32 (uint32x2_t __a) +vcltz_s32 (int32x2_t __a) { - uint32x2_t __b = {0, 0}; - return (uint32x2_t) __builtin_aarch64_cmgtuv2si ((int32x2_t) __a, - (int32x2_t) __b); + int32x2_t __b = {0, 0}; + return (uint32x2_t) __builtin_aarch64_cmltv2si (__a, __b); } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcgtz_u64 (uint64x1_t __a) +vcltz_s64 (int64x1_t __a) { - return __a > 0ll ? -1ll : 0ll; + return __a < 0ll ? 
-1ll : 0ll; } __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcgtzq_f32 (float32x4_t __a) +vcltzq_f32 (float32x4_t __a) { float32x4_t __b = {0.0f, 0.0f, 0.0f, 0.0f}; - return (uint32x4_t) __builtin_aarch64_cmgtv4sf (__a, __b); + return (uint32x4_t) __builtin_aarch64_cmltv4sf (__a, __b); } __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcgtzq_f64 (float64x2_t __a) +vcltzq_f64 (float64x2_t __a) { float64x2_t __b = {0.0, 0.0}; - return (uint64x2_t) __builtin_aarch64_cmgtv2df (__a, __b); + return (uint64x2_t) __builtin_aarch64_cmltv2df (__a, __b); } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vcgtzq_p8 (poly8x16_t __a) +vcltzq_p8 (poly8x16_t __a) { poly8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - return (uint8x16_t) __builtin_aarch64_cmgtv16qi ((int8x16_t) __a, + return (uint8x16_t) __builtin_aarch64_cmltv16qi ((int8x16_t) __a, (int8x16_t) __b); } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vcgtzq_s8 (int8x16_t __a) +vcltzq_s8 (int8x16_t __a) { int8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - return (uint8x16_t) __builtin_aarch64_cmgtv16qi (__a, __b); + return (uint8x16_t) __builtin_aarch64_cmltv16qi (__a, __b); } __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vcgtzq_s16 (int16x8_t __a) +vcltzq_s16 (int16x8_t __a) { int16x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; - return (uint16x8_t) __builtin_aarch64_cmgtv8hi (__a, __b); + return (uint16x8_t) __builtin_aarch64_cmltv8hi (__a, __b); } __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcgtzq_s32 (int32x4_t __a) +vcltzq_s32 (int32x4_t __a) { int32x4_t __b = {0, 0, 0, 0}; - return (uint32x4_t) __builtin_aarch64_cmgtv4si (__a, __b); + return (uint32x4_t) __builtin_aarch64_cmltv4si (__a, __b); } __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcgtzq_s64 (int64x2_t __a) +vcltzq_s64 (int64x2_t __a) { int64x2_t __b = {0, 0}; - return (uint64x2_t) __builtin_aarch64_cmgtv2di (__a, __b); -} - -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vcgtzq_u8 (uint8x16_t __a) -{ - uint8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0}; - return (uint8x16_t) __builtin_aarch64_cmgtuv16qi ((int8x16_t) __a, - (int8x16_t) __b); -} - -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vcgtzq_u16 (uint16x8_t __a) -{ - uint16x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; - return (uint16x8_t) __builtin_aarch64_cmgtuv8hi ((int16x8_t) __a, - (int16x8_t) __b); -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcgtzq_u32 (uint32x4_t __a) -{ - uint32x4_t __b = {0, 0, 0, 0}; - return (uint32x4_t) __builtin_aarch64_cmgtuv4si ((int32x4_t) __a, - (int32x4_t) __b); -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcgtzq_u64 (uint64x2_t __a) -{ - uint64x2_t __b = {0, 0}; - return (uint64x2_t) __builtin_aarch64_cmgtuv2di ((int64x2_t) __a, - (int64x2_t) __b); + return (uint64x2_t) __builtin_aarch64_cmltv2di (__a, __b); } -/* vcgtz - scalar. */ +/* vcltz - scalar. */ __extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vcgtzs_f32 (float32_t __a) +vcltzs_f32 (float32_t __a) { - return __a > 0.0f ? -1 : 0; + return __a < 0.0f ? -1 : 0; } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcgtzd_s64 (int64x1_t __a) +vcltzd_s64 (int64x1_t __a) { - return __a > 0 ? 
-1ll : 0ll; + return __a < 0 ? -1ll : 0ll; } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcgtzd_u64 (int64x1_t __a) +vcltzd_u64 (int64x1_t __a) { - return __a > 0 ? -1ll : 0ll; + return __a < 0 ? -1ll : 0ll; } __extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vcgtzd_f64 (float64_t __a) -{ - return __a > 0.0 ? -1ll : 0ll; -} - -/* vcle - vector. */ - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcle_f32 (float32x2_t __a, float32x2_t __b) -{ - return (uint32x2_t) __builtin_aarch64_cmgev2sf (__b, __a); -} - -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcle_f64 (float64x1_t __a, float64x1_t __b) +vcltzd_f64 (float64_t __a) { - return __a <= __b ? -1ll : 0ll; + return __a < 0.0 ? -1ll : 0ll; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vcle_p8 (poly8x8_t __a, poly8x8_t __b) -{ - return (uint8x8_t) __builtin_aarch64_cmgev8qi ((int8x8_t) __b, - (int8x8_t) __a); -} +/* vcvt (double -> float). */ -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vcle_s8 (int8x8_t __a, int8x8_t __b) +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vcvt_f32_f64 (float64x2_t __a) { - return (uint8x8_t) __builtin_aarch64_cmgev8qi (__b, __a); + return __builtin_aarch64_float_truncate_lo_v2sf (__a); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vcle_s16 (int16x4_t __a, int16x4_t __b) +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vcvt_high_f32_f64 (float32x2_t __a, float64x2_t __b) { - return (uint16x4_t) __builtin_aarch64_cmgev4hi (__b, __a); + return __builtin_aarch64_float_truncate_hi_v4sf (__a, __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcle_s32 (int32x2_t __a, int32x2_t __b) -{ - return (uint32x2_t) __builtin_aarch64_cmgev2si (__b, __a); -} +/* vcvt (float -> double). */ -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcle_s64 (int64x1_t __a, int64x1_t __b) +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vcvt_f64_f32 (float32x2_t __a) { - return __a <= __b ? -1ll : 0ll; -} -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vcle_u8 (uint8x8_t __a, uint8x8_t __b) -{ - return (uint8x8_t) __builtin_aarch64_cmgeuv8qi ((int8x8_t) __b, - (int8x8_t) __a); + return __builtin_aarch64_float_extend_lo_v2df (__a); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vcle_u16 (uint16x4_t __a, uint16x4_t __b) +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vcvt_high_f64_f32 (float32x4_t __a) { - return (uint16x4_t) __builtin_aarch64_cmgeuv4hi ((int16x4_t) __b, - (int16x4_t) __a); + return __builtin_aarch64_vec_unpacks_hi_v4sf (__a); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcle_u32 (uint32x2_t __a, uint32x2_t __b) -{ - return (uint32x2_t) __builtin_aarch64_cmgeuv2si ((int32x2_t) __b, - (int32x2_t) __a); -} +/* vcvt (int -> float) */ -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcle_u64 (uint64x1_t __a, uint64x1_t __b) +__extension__ static __inline float64_t __attribute__ ((__always_inline__)) +vcvtd_f64_s64 (int64_t __a) { - return __a <= __b ? 
-1ll : 0ll; + return (float64_t) __a; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcleq_f32 (float32x4_t __a, float32x4_t __b) +__extension__ static __inline float64_t __attribute__ ((__always_inline__)) +vcvtd_f64_u64 (uint64_t __a) { - return (uint32x4_t) __builtin_aarch64_cmgev4sf (__b, __a); + return (float64_t) __a; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcleq_f64 (float64x2_t __a, float64x2_t __b) +__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +vcvts_f32_s32 (int32_t __a) { - return (uint64x2_t) __builtin_aarch64_cmgev2df (__b, __a); + return (float32_t) __a; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vcleq_p8 (poly8x16_t __a, poly8x16_t __b) +__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +vcvts_f32_u32 (uint32_t __a) { - return (uint8x16_t) __builtin_aarch64_cmgev16qi ((int8x16_t) __b, - (int8x16_t) __a); + return (float32_t) __a; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vcleq_s8 (int8x16_t __a, int8x16_t __b) +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vcvt_f32_s32 (int32x2_t __a) { - return (uint8x16_t) __builtin_aarch64_cmgev16qi (__b, __a); + return __builtin_aarch64_floatv2siv2sf (__a); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vcleq_s16 (int16x8_t __a, int16x8_t __b) +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vcvt_f32_u32 (uint32x2_t __a) { - return (uint16x8_t) __builtin_aarch64_cmgev8hi (__b, __a); + return __builtin_aarch64_floatunsv2siv2sf ((int32x2_t) __a); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcleq_s32 (int32x4_t __a, int32x4_t __b) +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vcvtq_f32_s32 (int32x4_t __a) { - return (uint32x4_t) __builtin_aarch64_cmgev4si (__b, __a); + return __builtin_aarch64_floatv4siv4sf (__a); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcleq_s64 (int64x2_t __a, int64x2_t __b) +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vcvtq_f32_u32 (uint32x4_t __a) { - return (uint64x2_t) __builtin_aarch64_cmgev2di (__b, __a); + return __builtin_aarch64_floatunsv4siv4sf ((int32x4_t) __a); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vcleq_u8 (uint8x16_t __a, uint8x16_t __b) +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vcvtq_f64_s64 (int64x2_t __a) { - return (uint8x16_t) __builtin_aarch64_cmgeuv16qi ((int8x16_t) __b, - (int8x16_t) __a); + return __builtin_aarch64_floatv2div2df (__a); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vcleq_u16 (uint16x8_t __a, uint16x8_t __b) +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vcvtq_f64_u64 (uint64x2_t __a) { - return (uint16x8_t) __builtin_aarch64_cmgeuv8hi ((int16x8_t) __b, - (int16x8_t) __a); + return __builtin_aarch64_floatunsv2div2df ((int64x2_t) __a); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcleq_u32 (uint32x4_t __a, uint32x4_t __b) -{ - return (uint32x4_t) __builtin_aarch64_cmgeuv4si ((int32x4_t) __b, - (int32x4_t) __a); -} +/* vcvt (float -> int) */ -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcleq_u64 (uint64x2_t __a, uint64x2_t 
__b) +__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +vcvtd_s64_f64 (float64_t __a) { - return (uint64x2_t) __builtin_aarch64_cmgeuv2di ((int64x2_t) __b, - (int64x2_t) __a); + return (int64_t) __a; } -/* vcle - scalar. */ - -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vcles_f32 (float32_t __a, float32_t __b) +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vcvtd_u64_f64 (float64_t __a) { - return __a <= __b ? -1 : 0; + return (uint64_t) __a; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcled_s64 (int64x1_t __a, int64x1_t __b) +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vcvts_s32_f32 (float32_t __a) { - return __a <= __b ? -1ll : 0ll; + return (int32_t) __a; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcled_u64 (uint64x1_t __a, uint64x1_t __b) +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcvts_u32_f32 (float32_t __a) { - return __a <= __b ? -1ll : 0ll; + return (uint32_t) __a; } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vcled_f64 (float64_t __a, float64_t __b) +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vcvt_s32_f32 (float32x2_t __a) { - return __a <= __b ? -1ll : 0ll; + return __builtin_aarch64_lbtruncv2sfv2si (__a); } -/* vclez - vector. */ - __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vclez_f32 (float32x2_t __a) +vcvt_u32_f32 (float32x2_t __a) { - float32x2_t __b = {0.0f, 0.0f}; - return (uint32x2_t) __builtin_aarch64_cmlev2sf (__a, __b); + /* TODO: This cast should go away when builtins have + their correct types. */ + return (uint32x2_t) __builtin_aarch64_lbtruncuv2sfv2si (__a); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vclez_f64 (float64x1_t __a) +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vcvtq_s32_f32 (float32x4_t __a) { - return __a <= 0.0 ? -1ll : 0ll; + return __builtin_aarch64_lbtruncv4sfv4si (__a); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vclez_p8 (poly8x8_t __a) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcvtq_u32_f32 (float32x4_t __a) { - poly8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; - return (uint8x8_t) __builtin_aarch64_cmlev8qi ((int8x8_t) __a, - (int8x8_t) __b); + /* TODO: This cast should go away when builtins have + their correct types. */ + return (uint32x4_t) __builtin_aarch64_lbtruncuv4sfv4si (__a); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vclez_s8 (int8x8_t __a) +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vcvtq_s64_f64 (float64x2_t __a) { - int8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; - return (uint8x8_t) __builtin_aarch64_cmlev8qi (__a, __b); + return __builtin_aarch64_lbtruncv2dfv2di (__a); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vclez_s16 (int16x4_t __a) +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcvtq_u64_f64 (float64x2_t __a) { - int16x4_t __b = {0, 0, 0, 0}; - return (uint16x4_t) __builtin_aarch64_cmlev4hi (__a, __b); + /* TODO: This cast should go away when builtins have + their correct types. 
*/ + return (uint64x2_t) __builtin_aarch64_lbtruncuv2dfv2di (__a); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vclez_s32 (int32x2_t __a) +/* vcvta */ + +__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +vcvtad_s64_f64 (float64_t __a) { - int32x2_t __b = {0, 0}; - return (uint32x2_t) __builtin_aarch64_cmlev2si (__a, __b); + return __builtin_aarch64_lrounddfdi (__a); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vclez_s64 (int64x1_t __a) +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vcvtad_u64_f64 (float64_t __a) { - return __a <= 0ll ? -1ll : 0ll; + return __builtin_aarch64_lroundudfdi (__a); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vclez_u64 (uint64x1_t __a) +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vcvtas_s32_f32 (float32_t __a) { - return __a <= 0ll ? -1ll : 0ll; + return __builtin_aarch64_lroundsfsi (__a); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vclezq_f32 (float32x4_t __a) +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcvtas_u32_f32 (float32_t __a) { - float32x4_t __b = {0.0f, 0.0f, 0.0f, 0.0f}; - return (uint32x4_t) __builtin_aarch64_cmlev4sf (__a, __b); + return __builtin_aarch64_lroundusfsi (__a); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vclezq_f64 (float64x2_t __a) +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vcvta_s32_f32 (float32x2_t __a) { - float64x2_t __b = {0.0, 0.0}; - return (uint64x2_t) __builtin_aarch64_cmlev2df (__a, __b); + return __builtin_aarch64_lroundv2sfv2si (__a); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vclezq_p8 (poly8x16_t __a) +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcvta_u32_f32 (float32x2_t __a) { - poly8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0}; - return (uint8x16_t) __builtin_aarch64_cmlev16qi ((int8x16_t) __a, - (int8x16_t) __b); + /* TODO: This cast should go away when builtins have + their correct types. */ + return (uint32x2_t) __builtin_aarch64_lrounduv2sfv2si (__a); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vclezq_s8 (int8x16_t __a) +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vcvtaq_s32_f32 (float32x4_t __a) { - int8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0}; - return (uint8x16_t) __builtin_aarch64_cmlev16qi (__a, __b); + return __builtin_aarch64_lroundv4sfv4si (__a); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vclezq_s16 (int16x8_t __a) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcvtaq_u32_f32 (float32x4_t __a) { - int16x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; - return (uint16x8_t) __builtin_aarch64_cmlev8hi (__a, __b); + /* TODO: This cast should go away when builtins have + their correct types. 
*/ + return (uint32x4_t) __builtin_aarch64_lrounduv4sfv4si (__a); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vclezq_s32 (int32x4_t __a) +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vcvtaq_s64_f64 (float64x2_t __a) { - int32x4_t __b = {0, 0, 0, 0}; - return (uint32x4_t) __builtin_aarch64_cmlev4si (__a, __b); + return __builtin_aarch64_lroundv2dfv2di (__a); } __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vclezq_s64 (int64x2_t __a) +vcvtaq_u64_f64 (float64x2_t __a) { - int64x2_t __b = {0, 0}; - return (uint64x2_t) __builtin_aarch64_cmlev2di (__a, __b); + /* TODO: This cast should go away when builtins have + their correct types. */ + return (uint64x2_t) __builtin_aarch64_lrounduv2dfv2di (__a); } -/* vclez - scalar. */ +/* vcvtm */ -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vclezs_f32 (float32_t __a) +__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +vcvtmd_s64_f64 (float64_t __a) { - return __a <= 0.0f ? -1 : 0; + return __builtin_lfloor (__a); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vclezd_s64 (int64x1_t __a) +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vcvtmd_u64_f64 (float64_t __a) { - return __a <= 0 ? -1ll : 0ll; + return __builtin_aarch64_lfloorudfdi (__a); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vclezd_u64 (int64x1_t __a) +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vcvtms_s32_f32 (float32_t __a) { - return __a <= 0 ? -1ll : 0ll; + return __builtin_ifloorf (__a); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vclezd_f64 (float64_t __a) +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcvtms_u32_f32 (float32_t __a) { - return __a <= 0.0 ? -1ll : 0ll; + return __builtin_aarch64_lfloorusfsi (__a); } -/* vclt - vector. */ +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vcvtm_s32_f32 (float32x2_t __a) +{ + return __builtin_aarch64_lfloorv2sfv2si (__a); +} __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vclt_f32 (float32x2_t __a, float32x2_t __b) +vcvtm_u32_f32 (float32x2_t __a) { - return (uint32x2_t) __builtin_aarch64_cmgtv2sf (__b, __a); + /* TODO: This cast should go away when builtins have + their correct types. */ + return (uint32x2_t) __builtin_aarch64_lflooruv2sfv2si (__a); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vclt_f64 (float64x1_t __a, float64x1_t __b) +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vcvtmq_s32_f32 (float32x4_t __a) { - return __a < __b ? -1ll : 0ll; + return __builtin_aarch64_lfloorv4sfv4si (__a); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vclt_p8 (poly8x8_t __a, poly8x8_t __b) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcvtmq_u32_f32 (float32x4_t __a) { - return (uint8x8_t) __builtin_aarch64_cmgtv8qi ((int8x8_t) __b, - (int8x8_t) __a); + /* TODO: This cast should go away when builtins have + their correct types. 
*/ + return (uint32x4_t) __builtin_aarch64_lflooruv4sfv4si (__a); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vclt_s8 (int8x8_t __a, int8x8_t __b) +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vcvtmq_s64_f64 (float64x2_t __a) { - return (uint8x8_t) __builtin_aarch64_cmgtv8qi (__b, __a); + return __builtin_aarch64_lfloorv2dfv2di (__a); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vclt_s16 (int16x4_t __a, int16x4_t __b) +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vcvtmq_u64_f64 (float64x2_t __a) { - return (uint16x4_t) __builtin_aarch64_cmgtv4hi (__b, __a); + /* TODO: This cast should go away when builtins have + their correct types. */ + return (uint64x2_t) __builtin_aarch64_lflooruv2dfv2di (__a); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vclt_s32 (int32x2_t __a, int32x2_t __b) +/* vcvtn */ + +__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +vcvtnd_s64_f64 (float64_t __a) { - return (uint32x2_t) __builtin_aarch64_cmgtv2si (__b, __a); + return __builtin_aarch64_lfrintndfdi (__a); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vclt_s64 (int64x1_t __a, int64x1_t __b) +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vcvtnd_u64_f64 (float64_t __a) { - return __a < __b ? -1ll : 0ll; + return __builtin_aarch64_lfrintnudfdi (__a); } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vclt_u8 (uint8x8_t __a, uint8x8_t __b) +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vcvtns_s32_f32 (float32_t __a) { - return (uint8x8_t) __builtin_aarch64_cmgtuv8qi ((int8x8_t) __b, - (int8x8_t) __a); + return __builtin_aarch64_lfrintnsfsi (__a); } -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vclt_u16 (uint16x4_t __a, uint16x4_t __b) +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcvtns_u32_f32 (float32_t __a) { - return (uint16x4_t) __builtin_aarch64_cmgtuv4hi ((int16x4_t) __b, - (int16x4_t) __a); + return __builtin_aarch64_lfrintnusfsi (__a); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vcvtn_s32_f32 (float32x2_t __a) +{ + return __builtin_aarch64_lfrintnv2sfv2si (__a); } __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vclt_u32 (uint32x2_t __a, uint32x2_t __b) +vcvtn_u32_f32 (float32x2_t __a) { - return (uint32x2_t) __builtin_aarch64_cmgtuv2si ((int32x2_t) __b, - (int32x2_t) __a); + /* TODO: This cast should go away when builtins have + their correct types. */ + return (uint32x2_t) __builtin_aarch64_lfrintnuv2sfv2si (__a); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vcvtnq_s32_f32 (float32x4_t __a) +{ + return __builtin_aarch64_lfrintnv4sfv4si (__a); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vclt_u64 (uint64x1_t __a, uint64x1_t __b) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vcvtnq_u32_f32 (float32x4_t __a) { - return __a < __b ? -1ll : 0ll; + /* TODO: This cast should go away when builtins have + their correct types. 
*/ + return (uint32x4_t) __builtin_aarch64_lfrintnuv4sfv4si (__a); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcltq_f32 (float32x4_t __a, float32x4_t __b) +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vcvtnq_s64_f64 (float64x2_t __a) { - return (uint32x4_t) __builtin_aarch64_cmgtv4sf (__b, __a); + return __builtin_aarch64_lfrintnv2dfv2di (__a); } __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcltq_f64 (float64x2_t __a, float64x2_t __b) +vcvtnq_u64_f64 (float64x2_t __a) { - return (uint64x2_t) __builtin_aarch64_cmgtv2df (__b, __a); + /* TODO: This cast should go away when builtins have + their correct types. */ + return (uint64x2_t) __builtin_aarch64_lfrintnuv2dfv2di (__a); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vcltq_p8 (poly8x16_t __a, poly8x16_t __b) +/* vcvtp */ + +__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +vcvtpd_s64_f64 (float64_t __a) { - return (uint8x16_t) __builtin_aarch64_cmgtv16qi ((int8x16_t) __b, - (int8x16_t) __a); + return __builtin_lceil (__a); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vcltq_s8 (int8x16_t __a, int8x16_t __b) +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vcvtpd_u64_f64 (float64_t __a) { - return (uint8x16_t) __builtin_aarch64_cmgtv16qi (__b, __a); + return __builtin_aarch64_lceiludfdi (__a); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vcltq_s16 (int16x8_t __a, int16x8_t __b) +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vcvtps_s32_f32 (float32_t __a) { - return (uint16x8_t) __builtin_aarch64_cmgtv8hi (__b, __a); + return __builtin_iceilf (__a); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcltq_s32 (int32x4_t __a, int32x4_t __b) +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vcvtps_u32_f32 (float32_t __a) { - return (uint32x4_t) __builtin_aarch64_cmgtv4si (__b, __a); + return __builtin_aarch64_lceilusfsi (__a); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcltq_s64 (int64x2_t __a, int64x2_t __b) +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vcvtp_s32_f32 (float32x2_t __a) { - return (uint64x2_t) __builtin_aarch64_cmgtv2di (__b, __a); + return __builtin_aarch64_lceilv2sfv2si (__a); } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vcltq_u8 (uint8x16_t __a, uint8x16_t __b) +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vcvtp_u32_f32 (float32x2_t __a) { - return (uint8x16_t) __builtin_aarch64_cmgtuv16qi ((int8x16_t) __b, - (int8x16_t) __a); + /* TODO: This cast should go away when builtins have + their correct types. 
*/ + return (uint32x2_t) __builtin_aarch64_lceiluv2sfv2si (__a); } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vcltq_u16 (uint16x8_t __a, uint16x8_t __b) +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vcvtpq_s32_f32 (float32x4_t __a) { - return (uint16x8_t) __builtin_aarch64_cmgtuv8hi ((int16x8_t) __b, - (int16x8_t) __a); + return __builtin_aarch64_lceilv4sfv4si (__a); } __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcltq_u32 (uint32x4_t __a, uint32x4_t __b) +vcvtpq_u32_f32 (float32x4_t __a) { - return (uint32x4_t) __builtin_aarch64_cmgtuv4si ((int32x4_t) __b, - (int32x4_t) __a); + /* TODO: This cast should go away when builtins have + their correct types. */ + return (uint32x4_t) __builtin_aarch64_lceiluv4sfv4si (__a); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vcvtpq_s64_f64 (float64x2_t __a) +{ + return __builtin_aarch64_lceilv2dfv2di (__a); } __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcltq_u64 (uint64x2_t __a, uint64x2_t __b) +vcvtpq_u64_f64 (float64x2_t __a) { - return (uint64x2_t) __builtin_aarch64_cmgtuv2di ((int64x2_t) __b, - (int64x2_t) __a); + /* TODO: This cast should go away when builtins have + their correct types. */ + return (uint64x2_t) __builtin_aarch64_lceiluv2dfv2di (__a); } -/* vclt - scalar. */ +/* vdup_n */ -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vclts_f32 (float32_t __a, float32_t __b) +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vdup_n_f32 (float32_t __a) { - return __a < __b ? -1 : 0; + return (float32x2_t) {__a, __a}; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcltd_s64 (int64x1_t __a, int64x1_t __b) +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) +vdup_n_f64 (float64_t __a) { - return __a < __b ? -1ll : 0ll; + return __a; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcltd_u64 (uint64x1_t __a, uint64x1_t __b) +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vdup_n_p8 (poly8_t __a) { - return __a < __b ? -1ll : 0ll; + return (poly8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vcltd_f64 (float64_t __a, float64_t __b) +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vdup_n_p16 (poly16_t __a) { - return __a < __b ? -1ll : 0ll; + return (poly16x4_t) {__a, __a, __a, __a}; } -/* vcltz - vector. */ +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vdup_n_s8 (int8_t __a) +{ + return (int8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; +} -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcltz_f32 (float32x2_t __a) +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vdup_n_s16 (int16_t __a) { - float32x2_t __b = {0.0f, 0.0f}; - return (uint32x2_t) __builtin_aarch64_cmltv2sf (__a, __b); + return (int16x4_t) {__a, __a, __a, __a}; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcltz_f64 (float64x1_t __a) +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vdup_n_s32 (int32_t __a) { - return __a < 0.0 ? 
-1ll : 0ll; + return (int32x2_t) {__a, __a}; } -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vcltz_p8 (poly8x8_t __a) +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vdup_n_s64 (int64_t __a) { - poly8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; - return (uint8x8_t) __builtin_aarch64_cmltv8qi ((int8x8_t) __a, - (int8x8_t) __b); + return __a; } __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vcltz_s8 (int8x8_t __a) +vdup_n_u8 (uint8_t __a) { - int8x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; - return (uint8x8_t) __builtin_aarch64_cmltv8qi (__a, __b); + return (uint8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vcltz_s16 (int16x4_t __a) +vdup_n_u16 (uint16_t __a) { - int16x4_t __b = {0, 0, 0, 0}; - return (uint16x4_t) __builtin_aarch64_cmltv4hi (__a, __b); + return (uint16x4_t) {__a, __a, __a, __a}; } __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcltz_s32 (int32x2_t __a) +vdup_n_u32 (uint32_t __a) { - int32x2_t __b = {0, 0}; - return (uint32x2_t) __builtin_aarch64_cmltv2si (__a, __b); + return (uint32x2_t) {__a, __a}; } __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcltz_s64 (int64x1_t __a) +vdup_n_u64 (uint64_t __a) { - return __a < 0ll ? -1ll : 0ll; + return __a; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcltzq_f32 (float32x4_t __a) -{ - float32x4_t __b = {0.0f, 0.0f, 0.0f, 0.0f}; - return (uint32x4_t) __builtin_aarch64_cmltv4sf (__a, __b); -} +/* vdupq_n */ -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcltzq_f64 (float64x2_t __a) +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vdupq_n_f32 (float32_t __a) { - float64x2_t __b = {0.0, 0.0}; - return (uint64x2_t) __builtin_aarch64_cmltv2df (__a, __b); + return (float32x4_t) {__a, __a, __a, __a}; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vcltzq_p8 (poly8x16_t __a) +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vdupq_n_f64 (float64_t __a) { - poly8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0}; - return (uint8x16_t) __builtin_aarch64_cmltv16qi ((int8x16_t) __a, - (int8x16_t) __b); + return (float64x2_t) {__a, __a}; } -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vcltzq_s8 (int8x16_t __a) +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vdupq_n_p8 (uint32_t __a) { - int8x16_t __b = {0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0}; - return (uint8x16_t) __builtin_aarch64_cmltv16qi (__a, __b); + return (poly8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a, + __a, __a, __a, __a, __a, __a, __a, __a}; } -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vcltzq_s16 (int16x8_t __a) +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vdupq_n_p16 (uint32_t __a) { - int16x8_t __b = {0, 0, 0, 0, 0, 0, 0, 0}; - return (uint16x8_t) __builtin_aarch64_cmltv8hi (__a, __b); + return (poly16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcltzq_s32 (int32x4_t __a) +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vdupq_n_s8 (int32_t __a) { - int32x4_t __b = {0, 0, 0, 0}; - return (uint32x4_t) __builtin_aarch64_cmltv4si (__a, __b); + 
return (int8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a, + __a, __a, __a, __a, __a, __a, __a, __a}; } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcltzq_s64 (int64x2_t __a) +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vdupq_n_s16 (int32_t __a) { - int64x2_t __b = {0, 0}; - return (uint64x2_t) __builtin_aarch64_cmltv2di (__a, __b); + return (int16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } -/* vcltz - scalar. */ - -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vcltzs_f32 (float32_t __a) +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vdupq_n_s32 (int32_t __a) { - return __a < 0.0f ? -1 : 0; + return (int32x4_t) {__a, __a, __a, __a}; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcltzd_s64 (int64x1_t __a) +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vdupq_n_s64 (int64_t __a) { - return __a < 0 ? -1ll : 0ll; + return (int64x2_t) {__a, __a}; } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vcltzd_u64 (int64x1_t __a) +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vdupq_n_u8 (uint32_t __a) { - return __a < 0 ? -1ll : 0ll; + return (uint8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a, + __a, __a, __a, __a, __a, __a, __a, __a}; } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vcltzd_f64 (float64_t __a) +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vdupq_n_u16 (uint32_t __a) { - return __a < 0.0 ? -1ll : 0ll; + return (uint16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } -/* vcvt (double -> float). */ - -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vcvt_f32_f64 (float64x2_t __a) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vdupq_n_u32 (uint32_t __a) { - return __builtin_aarch64_float_truncate_lo_v2sf (__a); + return (uint32x4_t) {__a, __a, __a, __a}; } -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vcvt_high_f32_f64 (float32x2_t __a, float64x2_t __b) +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vdupq_n_u64 (uint64_t __a) { - return __builtin_aarch64_float_truncate_hi_v4sf (__a, __b); + return (uint64x2_t) {__a, __a}; } -/* vcvt (float -> double). 
*/
+/* vdup_lane */
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vcvt_f64_f32 (float32x2_t __a)
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vdup_lane_f32 (float32x2_t __a, const int __b)
 {
-
-  return __builtin_aarch64_float_extend_lo_v2df (__a);
+  return __aarch64_vdup_lane_f32 (__a, __b);
 }
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vcvt_high_f64_f32 (float32x4_t __a)
+__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+vdup_lane_f64 (float64x1_t __a, const int __b)
 {
-  return __builtin_aarch64_vec_unpacks_hi_v4sf (__a);
+  return __aarch64_vdup_lane_f64 (__a, __b);
 }
-/* vcvt (int -> float) */
-
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vcvtd_f64_s64 (int64_t __a)
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vdup_lane_p8 (poly8x8_t __a, const int __b)
 {
-  return (float64_t) __a;
+  return __aarch64_vdup_lane_p8 (__a, __b);
 }
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vcvtd_f64_u64 (uint64_t __a)
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vdup_lane_p16 (poly16x4_t __a, const int __b)
 {
-  return (float64_t) __a;
+  return __aarch64_vdup_lane_p16 (__a, __b);
 }
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vcvts_f32_s32 (int32_t __a)
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vdup_lane_s8 (int8x8_t __a, const int __b)
 {
-  return (float32_t) __a;
+  return __aarch64_vdup_lane_s8 (__a, __b);
 }
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vcvts_f32_u32 (uint32_t __a)
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vdup_lane_s16 (int16x4_t __a, const int __b)
 {
-  return (float32_t) __a;
+  return __aarch64_vdup_lane_s16 (__a, __b);
 }
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vcvt_f32_s32 (int32x2_t __a)
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vdup_lane_s32 (int32x2_t __a, const int __b)
 {
-  return __builtin_aarch64_floatv2siv2sf (__a);
+  return __aarch64_vdup_lane_s32 (__a, __b);
 }
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vcvt_f32_u32 (uint32x2_t __a)
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vdup_lane_s64 (int64x1_t __a, const int __b)
 {
-  return __builtin_aarch64_floatunsv2siv2sf ((int32x2_t) __a);
+  return __aarch64_vdup_lane_s64 (__a, __b);
 }
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vcvtq_f32_s32 (int32x4_t __a)
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vdup_lane_u8 (uint8x8_t __a, const int __b)
 {
-  return __builtin_aarch64_floatv4siv4sf (__a);
+  return __aarch64_vdup_lane_u8 (__a, __b);
 }
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vcvtq_f32_u32 (uint32x4_t __a)
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vdup_lane_u16 (uint16x4_t __a, const int __b)
 {
-  return __builtin_aarch64_floatunsv4siv4sf ((int32x4_t) __a);
+  return __aarch64_vdup_lane_u16 (__a, __b);
 }
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vcvtq_f64_s64 (int64x2_t __a)
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vdup_lane_u32 (uint32x2_t __a, const int __b)
 {
-  return __builtin_aarch64_floatv2div2df (__a);
+  return __aarch64_vdup_lane_u32 (__a, __b);
 }
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vcvtq_f64_u64 (uint64x2_t __a)
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vdup_lane_u64 (uint64x1_t __a, const int __b)
 {
-  return __builtin_aarch64_floatunsv2div2df ((int64x2_t) __a);
+  return __aarch64_vdup_lane_u64 (__a, __b);
 }
-/* vcvt (float -> int) */
+/* vdup_laneq */
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vcvtd_s64_f64 (float64_t __a)
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vdup_laneq_f32 (float32x4_t __a, const int __b)
 {
-  return (int64_t) __a;
+  return __aarch64_vdup_laneq_f32 (__a, __b);
 }
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vcvtd_u64_f64 (float64_t __a)
+__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+vdup_laneq_f64 (float64x2_t __a, const int __b)
 {
-  return (uint64_t) __a;
+  return __aarch64_vdup_laneq_f64 (__a, __b);
 }
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vcvts_s32_f32 (float32_t __a)
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vdup_laneq_p8 (poly8x16_t __a, const int __b)
 {
-  return (int32_t) __a;
+  return __aarch64_vdup_laneq_p8 (__a, __b);
 }
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vcvts_u32_f32 (float32_t __a)
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vdup_laneq_p16 (poly16x8_t __a, const int __b)
 {
-  return (uint32_t) __a;
+  return __aarch64_vdup_laneq_p16 (__a, __b);
 }
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vcvt_s32_f32 (float32x2_t __a)
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vdup_laneq_s8 (int8x16_t __a, const int __b)
 {
-  return __builtin_aarch64_lbtruncv2sfv2si (__a);
+  return __aarch64_vdup_laneq_s8 (__a, __b);
 }
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcvt_u32_f32 (float32x2_t __a)
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vdup_laneq_s16 (int16x8_t __a, const int __b)
 {
-  /* TODO: This cast should go away when builtins have
-     their correct types. */
-  return (uint32x2_t) __builtin_aarch64_lbtruncuv2sfv2si (__a);
+  return __aarch64_vdup_laneq_s16 (__a, __b);
 }
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vcvtq_s32_f32 (float32x4_t __a)
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vdup_laneq_s32 (int32x4_t __a, const int __b)
 {
-  return __builtin_aarch64_lbtruncv4sfv4si (__a);
+  return __aarch64_vdup_laneq_s32 (__a, __b);
 }
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcvtq_u32_f32 (float32x4_t __a)
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vdup_laneq_s64 (int64x2_t __a, const int __b)
 {
-  /* TODO: This cast should go away when builtins have
-     their correct types.
*/ - return (uint32x4_t) __builtin_aarch64_lbtruncuv4sfv4si (__a); + return __aarch64_vdup_laneq_s64 (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vcvtq_s64_f64 (float64x2_t __a) +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vdup_laneq_u8 (uint8x16_t __a, const int __b) { - return __builtin_aarch64_lbtruncv2dfv2di (__a); + return __aarch64_vdup_laneq_u8 (__a, __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcvtq_u64_f64 (float64x2_t __a) +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vdup_laneq_u16 (uint16x8_t __a, const int __b) { - /* TODO: This cast should go away when builtins have - their correct types. */ - return (uint64x2_t) __builtin_aarch64_lbtruncuv2dfv2di (__a); + return __aarch64_vdup_laneq_u16 (__a, __b); } -/* vcvta */ +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vdup_laneq_u32 (uint32x4_t __a, const int __b) +{ + return __aarch64_vdup_laneq_u32 (__a, __b); +} -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vcvtad_s64_f64 (float64_t __a) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vdup_laneq_u64 (uint64x2_t __a, const int __b) { - return __builtin_aarch64_lrounddfdi (__a); + return __aarch64_vdup_laneq_u64 (__a, __b); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vcvtad_u64_f64 (float64_t __a) +/* vdupq_lane */ +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vdupq_lane_f32 (float32x2_t __a, const int __b) { - return __builtin_aarch64_lroundudfdi (__a); + return __aarch64_vdupq_lane_f32 (__a, __b); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vcvtas_s32_f32 (float32_t __a) +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vdupq_lane_f64 (float64x1_t __a, const int __b) { - return __builtin_aarch64_lroundsfsi (__a); + return __aarch64_vdupq_lane_f64 (__a, __b); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vcvtas_u32_f32 (float32_t __a) +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vdupq_lane_p8 (poly8x8_t __a, const int __b) { - return __builtin_aarch64_lroundusfsi (__a); + return __aarch64_vdupq_lane_p8 (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vcvta_s32_f32 (float32x2_t __a) +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vdupq_lane_p16 (poly16x4_t __a, const int __b) { - return __builtin_aarch64_lroundv2sfv2si (__a); + return __aarch64_vdupq_lane_p16 (__a, __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcvta_u32_f32 (float32x2_t __a) +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vdupq_lane_s8 (int8x8_t __a, const int __b) { - /* TODO: This cast should go away when builtins have - their correct types. 
*/ - return (uint32x2_t) __builtin_aarch64_lrounduv2sfv2si (__a); + return __aarch64_vdupq_lane_s8 (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vcvtaq_s32_f32 (float32x4_t __a) +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vdupq_lane_s16 (int16x4_t __a, const int __b) { - return __builtin_aarch64_lroundv4sfv4si (__a); + return __aarch64_vdupq_lane_s16 (__a, __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcvtaq_u32_f32 (float32x4_t __a) +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vdupq_lane_s32 (int32x2_t __a, const int __b) { - /* TODO: This cast should go away when builtins have - their correct types. */ - return (uint32x4_t) __builtin_aarch64_lrounduv4sfv4si (__a); + return __aarch64_vdupq_lane_s32 (__a, __b); } __extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vcvtaq_s64_f64 (float64x2_t __a) +vdupq_lane_s64 (int64x1_t __a, const int __b) { - return __builtin_aarch64_lroundv2dfv2di (__a); + return __aarch64_vdupq_lane_s64 (__a, __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcvtaq_u64_f64 (float64x2_t __a) +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vdupq_lane_u8 (uint8x8_t __a, const int __b) { - /* TODO: This cast should go away when builtins have - their correct types. */ - return (uint64x2_t) __builtin_aarch64_lrounduv2dfv2di (__a); + return __aarch64_vdupq_lane_u8 (__a, __b); } -/* vcvtm */ - -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vcvtmd_s64_f64 (float64_t __a) +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vdupq_lane_u16 (uint16x4_t __a, const int __b) { - return __builtin_lfloor (__a); + return __aarch64_vdupq_lane_u16 (__a, __b); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vcvtmd_u64_f64 (float64_t __a) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vdupq_lane_u32 (uint32x2_t __a, const int __b) { - return __builtin_aarch64_lfloorudfdi (__a); + return __aarch64_vdupq_lane_u32 (__a, __b); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vcvtms_s32_f32 (float32_t __a) +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vdupq_lane_u64 (uint64x1_t __a, const int __b) { - return __builtin_ifloorf (__a); + return __aarch64_vdupq_lane_u64 (__a, __b); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vcvtms_u32_f32 (float32_t __a) +/* vdupq_laneq */ +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vdupq_laneq_f32 (float32x4_t __a, const int __b) { - return __builtin_aarch64_lfloorusfsi (__a); + return __aarch64_vdupq_laneq_f32 (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vcvtm_s32_f32 (float32x2_t __a) +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vdupq_laneq_f64 (float64x2_t __a, const int __b) { - return __builtin_aarch64_lfloorv2sfv2si (__a); + return __aarch64_vdupq_laneq_f64 (__a, __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcvtm_u32_f32 (float32x2_t __a) +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vdupq_laneq_p8 (poly8x16_t __a, const int __b) { - /* TODO: This cast should go away when builtins have - their correct types. 
*/ - return (uint32x2_t) __builtin_aarch64_lflooruv2sfv2si (__a); + return __aarch64_vdupq_laneq_p8 (__a, __b); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vcvtmq_s32_f32 (float32x4_t __a) +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vdupq_laneq_p16 (poly16x8_t __a, const int __b) { - return __builtin_aarch64_lfloorv4sfv4si (__a); + return __aarch64_vdupq_laneq_p16 (__a, __b); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcvtmq_u32_f32 (float32x4_t __a) +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vdupq_laneq_s8 (int8x16_t __a, const int __b) { - /* TODO: This cast should go away when builtins have - their correct types. */ - return (uint32x4_t) __builtin_aarch64_lflooruv4sfv4si (__a); + return __aarch64_vdupq_laneq_s8 (__a, __b); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vcvtmq_s64_f64 (float64x2_t __a) +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vdupq_laneq_s16 (int16x8_t __a, const int __b) { - return __builtin_aarch64_lfloorv2dfv2di (__a); + return __aarch64_vdupq_laneq_s16 (__a, __b); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcvtmq_u64_f64 (float64x2_t __a) +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vdupq_laneq_s32 (int32x4_t __a, const int __b) { - /* TODO: This cast should go away when builtins have - their correct types. */ - return (uint64x2_t) __builtin_aarch64_lflooruv2dfv2di (__a); + return __aarch64_vdupq_laneq_s32 (__a, __b); } -/* vcvtn */ - -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vcvtnd_s64_f64 (float64_t __a) +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vdupq_laneq_s64 (int64x2_t __a, const int __b) { - return __builtin_aarch64_lfrintndfdi (__a); + return __aarch64_vdupq_laneq_s64 (__a, __b); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vcvtnd_u64_f64 (float64_t __a) +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vdupq_laneq_u8 (uint8x16_t __a, const int __b) { - return __builtin_aarch64_lfrintnudfdi (__a); + return __aarch64_vdupq_laneq_u8 (__a, __b); } -__extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vcvtns_s32_f32 (float32_t __a) +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vdupq_laneq_u16 (uint16x8_t __a, const int __b) { - return __builtin_aarch64_lfrintnsfsi (__a); + return __aarch64_vdupq_laneq_u16 (__a, __b); } -__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vcvtns_u32_f32 (float32_t __a) +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vdupq_laneq_u32 (uint32x4_t __a, const int __b) { - return __builtin_aarch64_lfrintnusfsi (__a); + return __aarch64_vdupq_laneq_u32 (__a, __b); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vcvtn_s32_f32 (float32x2_t __a) +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vdupq_laneq_u64 (uint64x2_t __a, const int __b) { - return __builtin_aarch64_lfrintnv2sfv2si (__a); + return __aarch64_vdupq_laneq_u64 (__a, __b); } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vcvtn_u32_f32 (float32x2_t __a) +/* vdupb_lane */ +__extension__ static __inline poly8_t __attribute__ ((__always_inline__)) +vdupb_lane_p8 
(poly8x8_t __a, const int __attribute__ ((unused)) __b) { - /* TODO: This cast should go away when builtins have - their correct types. */ - return (uint32x2_t) __builtin_aarch64_lfrintnuv2sfv2si (__a); + return __aarch64_vget_lane_p8 (__a, 0); } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vcvtnq_s32_f32 (float32x4_t __a) +__extension__ static __inline int8_t __attribute__ ((__always_inline__)) +vdupb_lane_s8 (int8x8_t __a, const int __attribute__ ((unused)) __b) { - return __builtin_aarch64_lfrintnv4sfv4si (__a); + return __aarch64_vget_lane_s8 (__a, 0); } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcvtnq_u32_f32 (float32x4_t __a) +__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) +vdupb_lane_u8 (uint8x8_t __a, const int __attribute__ ((unused)) __b) { - /* TODO: This cast should go away when builtins have - their correct types. */ - return (uint32x4_t) __builtin_aarch64_lfrintnuv4sfv4si (__a); + return __aarch64_vget_lane_u8 (__a, 0); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vcvtnq_s64_f64 (float64x2_t __a) +/* vduph_lane */ +__extension__ static __inline poly16_t __attribute__ ((__always_inline__)) +vduph_lane_p16 (poly16x4_t __a, const int __attribute__ ((unused)) __b) { - return __builtin_aarch64_lfrintnv2dfv2di (__a); + return __aarch64_vget_lane_p16 (__a, 0); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcvtnq_u64_f64 (float64x2_t __a) +__extension__ static __inline int16_t __attribute__ ((__always_inline__)) +vduph_lane_s16 (int16x4_t __a, const int __attribute__ ((unused)) __b) { - /* TODO: This cast should go away when builtins have - their correct types. */ - return (uint64x2_t) __builtin_aarch64_lfrintnuv2dfv2di (__a); + return __aarch64_vget_lane_s16 (__a, 0); } -/* vcvtp */ - -__extension__ static __inline int64_t __attribute__ ((__always_inline__)) -vcvtpd_s64_f64 (float64_t __a) +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vduph_lane_u16 (uint16x4_t __a, const int __attribute__ ((unused)) __b) { - return __builtin_lceil (__a); + return __aarch64_vget_lane_u16 (__a, 0); } -__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) -vcvtpd_u64_f64 (float64_t __a) +/* vdups_lane */ +__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +vdups_lane_f32 (float32x2_t __a, const int __attribute__ ((unused)) __b) { - return __builtin_aarch64_lceiludfdi (__a); + return __aarch64_vget_lane_f32 (__a, 0); } __extension__ static __inline int32_t __attribute__ ((__always_inline__)) -vcvtps_s32_f32 (float32_t __a) +vdups_lane_s32 (int32x2_t __a, const int __attribute__ ((unused)) __b) { - return __builtin_iceilf (__a); + return __aarch64_vget_lane_s32 (__a, 0); } __extension__ static __inline uint32_t __attribute__ ((__always_inline__)) -vcvtps_u32_f32 (float32_t __a) +vdups_lane_u32 (uint32x2_t __a, const int __attribute__ ((unused)) __b) { - return __builtin_aarch64_lceilusfsi (__a); + return __aarch64_vget_lane_u32 (__a, 0); } -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vcvtp_s32_f32 (float32x2_t __a) +/* vdupd_lane */ +__extension__ static __inline float64_t __attribute__ ((__always_inline__)) +vdupd_lane_f64 (float64x1_t __a, const int __attribute__ ((unused)) __b) { - return __builtin_aarch64_lceilv2sfv2si (__a); + return __a; } -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) 
-vcvtp_u32_f32 (float32x2_t __a) +__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +vdupd_lane_s64 (int64x1_t __a, const int __attribute__ ((unused)) __b) { - /* TODO: This cast should go away when builtins have - their correct types. */ - return (uint32x2_t) __builtin_aarch64_lceiluv2sfv2si (__a); + return __a; } -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vcvtpq_s32_f32 (float32x4_t __a) +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vdupd_lane_u64 (uint64x1_t __a, const int __attribute__ ((unused)) __b) { - return __builtin_aarch64_lceilv4sfv4si (__a); + return __a; } -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vcvtpq_u32_f32 (float32x4_t __a) +/* vdupb_laneq */ +__extension__ static __inline poly8_t __attribute__ ((__always_inline__)) +vdupb_laneq_p8 (poly8x16_t __a, const int __attribute__ ((unused)) __b) { - /* TODO: This cast should go away when builtins have - their correct types. */ - return (uint32x4_t) __builtin_aarch64_lceiluv4sfv4si (__a); + return __aarch64_vgetq_lane_p8 (__a, 0); } -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vcvtpq_s64_f64 (float64x2_t __a) +__extension__ static __inline int8_t __attribute__ ((__always_inline__)) +vdupb_laneq_s8 (int8x16_t __a, const int __attribute__ ((unused)) __b) { - return __builtin_aarch64_lceilv2dfv2di (__a); + return __aarch64_vgetq_lane_s8 (__a, 0); } -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vcvtpq_u64_f64 (float64x2_t __a) +__extension__ static __inline uint8_t __attribute__ ((__always_inline__)) +vdupb_laneq_u8 (uint8x16_t __a, const int __attribute__ ((unused)) __b) { - /* TODO: This cast should go away when builtins have - their correct types. 
*/ - return (uint64x2_t) __builtin_aarch64_lceiluv2dfv2di (__a); + return __aarch64_vgetq_lane_u8 (__a, 0); } -/* vdup */ +/* vduph_laneq */ +__extension__ static __inline poly16_t __attribute__ ((__always_inline__)) +vduph_laneq_p16 (poly16x8_t __a, const int __attribute__ ((unused)) __b) +{ + return __aarch64_vgetq_lane_p16 (__a, 0); +} -__extension__ static __inline int8x1_t __attribute__ ((__always_inline__)) -vdupb_lane_s8 (int8x16_t a, int const b) +__extension__ static __inline int16_t __attribute__ ((__always_inline__)) +vduph_laneq_s16 (int16x8_t __a, const int __attribute__ ((unused)) __b) { - return __aarch64_vgetq_lane_s8 (a, b); + return __aarch64_vgetq_lane_s16 (__a, 0); } -__extension__ static __inline uint8x1_t __attribute__ ((__always_inline__)) -vdupb_lane_u8 (uint8x16_t a, int const b) +__extension__ static __inline uint16_t __attribute__ ((__always_inline__)) +vduph_laneq_u16 (uint16x8_t __a, const int __attribute__ ((unused)) __b) { - return __aarch64_vgetq_lane_u8 (a, b); + return __aarch64_vgetq_lane_u16 (__a, 0); } -__extension__ static __inline int16x1_t __attribute__ ((__always_inline__)) -vduph_lane_s16 (int16x8_t a, int const b) +/* vdups_laneq */ +__extension__ static __inline float32_t __attribute__ ((__always_inline__)) +vdups_laneq_f32 (float32x4_t __a, const int __attribute__ ((unused)) __b) { - return __aarch64_vgetq_lane_s16 (a, b); + return __aarch64_vgetq_lane_f32 (__a, 0); } -__extension__ static __inline uint16x1_t __attribute__ ((__always_inline__)) -vduph_lane_u16 (uint16x8_t a, int const b) +__extension__ static __inline int32_t __attribute__ ((__always_inline__)) +vdups_laneq_s32 (int32x4_t __a, const int __attribute__ ((unused)) __b) { - return __aarch64_vgetq_lane_u16 (a, b); + return __aarch64_vgetq_lane_s32 (__a, 0); } -__extension__ static __inline int32x1_t __attribute__ ((__always_inline__)) -vdups_lane_s32 (int32x4_t a, int const b) +__extension__ static __inline uint32_t __attribute__ ((__always_inline__)) +vdups_laneq_u32 (uint32x4_t __a, const int __attribute__ ((unused)) __b) { - return __aarch64_vgetq_lane_s32 (a, b); + return __aarch64_vgetq_lane_u32 (__a, 0); } -__extension__ static __inline uint32x1_t __attribute__ ((__always_inline__)) -vdups_lane_u32 (uint32x4_t a, int const b) +/* vdupd_laneq */ +__extension__ static __inline float64_t __attribute__ ((__always_inline__)) +vdupd_laneq_f64 (float64x2_t __a, const int __attribute__ ((unused)) __b) { - return __aarch64_vgetq_lane_u32 (a, b); + return __aarch64_vgetq_lane_f64 (__a, 0); } -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vdupd_lane_s64 (int64x2_t a, int const b) +__extension__ static __inline int64_t __attribute__ ((__always_inline__)) +vdupd_laneq_s64 (int64x2_t __a, const int __attribute__ ((unused)) __b) { - return __aarch64_vgetq_lane_s64 (a, b); + return __aarch64_vgetq_lane_s64 (__a, 0); } -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vdupd_lane_u64 (uint64x2_t a, int const b) +__extension__ static __inline uint64_t __attribute__ ((__always_inline__)) +vdupd_laneq_u64 (uint64x2_t __a, const int __attribute__ ((unused)) __b) { - return __aarch64_vgetq_lane_u64 (a, b); + return __aarch64_vgetq_lane_u64 (__a, 0); } /* vld1 */ @@ -25636,4 +25731,54 @@ __INTERLEAVE_LIST (zip) #undef __aarch64_vgetq_lane_u32 #undef __aarch64_vgetq_lane_u64 +#undef __aarch64_vdup_lane_any +#undef __aarch64_vdup_lane_f32 +#undef __aarch64_vdup_lane_f64 +#undef __aarch64_vdup_lane_p8 +#undef __aarch64_vdup_lane_p16 +#undef 
__aarch64_vdup_lane_s8 +#undef __aarch64_vdup_lane_s16 +#undef __aarch64_vdup_lane_s32 +#undef __aarch64_vdup_lane_s64 +#undef __aarch64_vdup_lane_u8 +#undef __aarch64_vdup_lane_u16 +#undef __aarch64_vdup_lane_u32 +#undef __aarch64_vdup_lane_u64 +#undef __aarch64_vdup_laneq_f32 +#undef __aarch64_vdup_laneq_f64 +#undef __aarch64_vdup_laneq_p8 +#undef __aarch64_vdup_laneq_p16 +#undef __aarch64_vdup_laneq_s8 +#undef __aarch64_vdup_laneq_s16 +#undef __aarch64_vdup_laneq_s32 +#undef __aarch64_vdup_laneq_s64 +#undef __aarch64_vdup_laneq_u8 +#undef __aarch64_vdup_laneq_u16 +#undef __aarch64_vdup_laneq_u32 +#undef __aarch64_vdup_laneq_u64 +#undef __aarch64_vdupq_lane_f32 +#undef __aarch64_vdupq_lane_f64 +#undef __aarch64_vdupq_lane_p8 +#undef __aarch64_vdupq_lane_p16 +#undef __aarch64_vdupq_lane_s8 +#undef __aarch64_vdupq_lane_s16 +#undef __aarch64_vdupq_lane_s32 +#undef __aarch64_vdupq_lane_s64 +#undef __aarch64_vdupq_lane_u8 +#undef __aarch64_vdupq_lane_u16 +#undef __aarch64_vdupq_lane_u32 +#undef __aarch64_vdupq_lane_u64 +#undef __aarch64_vdupq_laneq_f32 +#undef __aarch64_vdupq_laneq_f64 +#undef __aarch64_vdupq_laneq_p8 +#undef __aarch64_vdupq_laneq_p16 +#undef __aarch64_vdupq_laneq_s8 +#undef __aarch64_vdupq_laneq_s16 +#undef __aarch64_vdupq_laneq_s32 +#undef __aarch64_vdupq_laneq_s64 +#undef __aarch64_vdupq_laneq_u8 +#undef __aarch64_vdupq_laneq_u16 +#undef __aarch64_vdupq_laneq_u32 +#undef __aarch64_vdupq_laneq_u64 + #endif diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 37b6cbc8dc8..ffe125b5583 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -383,7 +383,7 @@ (V4HI "V8HI") (V8HI "V8HI") (V2SI "V4SI") (V4SI "V4SI") (DI "V2DI") (V2DI "V2DI") - (V2SF "V2SF") (V4SF "V4SF") + (V2SF "V4SF") (V4SF "V4SF") (V2DF "V2DF") (SI "V4SI") (HI "V8HI") (QI "V16QI")]) @@ -527,6 +527,20 @@ (define_mode_attr fcvt_target [(V2DF "v2di") (V4SF "v4si") (V2SF "v2si")]) (define_mode_attr FCVT_TARGET [(V2DF "V2DI") (V4SF "V4SI") (V2SF "V2SI")]) +(define_mode_attr VSWAP_WIDTH [(V8QI "V16QI") (V16QI "V8QI") + (V4HI "V8HI") (V8HI "V4HI") + (V2SI "V4SI") (V4SI "V2SI") + (DI "V2DI") (V2DI "DI") + (V2SF "V4SF") (V4SF "V2SF") + (DF "V2DF") (V2DF "DF")]) + +(define_mode_attr vswap_width_name [(V8QI "to_128") (V16QI "to_64") + (V4HI "to_128") (V8HI "to_64") + (V2SI "to_128") (V4SI "to_64") + (DI "to_128") (V2DI "to_64") + (V2SF "to_128") (V4SF "to_64") + (DF "to_128") (V2DF "to_64")]) + ;; ------------------------------------------------------------------- ;; Code Iterators ;; ------------------------------------------------------------------- diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index b190c4d382a..d3892f696ea 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2013-09-02 James Greenhalgh + + * gcc.target/aarch64/scalar_intrinsics.c + (vdup_lane<8,16,32,64>): Force values to SIMD registers. 
+ 2013-09-02 Richard Biener PR middle-end/57511 diff --git a/gcc/testsuite/gcc.target/aarch64/scalar_intrinsics.c b/gcc/testsuite/gcc.target/aarch64/scalar_intrinsics.c index d84bfeb55e9..aa041cc2c20 100644 --- a/gcc/testsuite/gcc.target/aarch64/scalar_intrinsics.c +++ b/gcc/testsuite/gcc.target/aarch64/scalar_intrinsics.c @@ -198,13 +198,21 @@ test_vcltzd_s64 (int64x1_t a) int8x1_t test_vdupb_lane_s8 (int8x16_t a) { - return vdupb_lane_s8 (a, 2); + int8x1_t res; + force_simd (a); + res = vdupb_laneq_s8 (a, 2); + force_simd (res); + return res; } uint8x1_t test_vdupb_lane_u8 (uint8x16_t a) { - return vdupb_lane_u8 (a, 2); + uint8x1_t res; + force_simd (a); + res = vdupb_laneq_u8 (a, 2); + force_simd (res); + return res; } /* { dg-final { scan-assembler-times "aarch64_get_lanev8hi" 2 } } */ @@ -212,13 +220,21 @@ test_vdupb_lane_u8 (uint8x16_t a) int16x1_t test_vduph_lane_s16 (int16x8_t a) { - return vduph_lane_s16 (a, 2); + int16x1_t res; + force_simd (a); + res = vduph_laneq_s16 (a, 2); + force_simd (res); + return res; } uint16x1_t test_vduph_lane_u16 (uint16x8_t a) { - return vduph_lane_u16 (a, 2); + uint16x1_t res; + force_simd (a); + res = vduph_laneq_u16 (a, 2); + force_simd (res); + return res; } /* { dg-final { scan-assembler-times "aarch64_get_lanev4si" 2 } } */ @@ -226,13 +242,21 @@ test_vduph_lane_u16 (uint16x8_t a) int32x1_t test_vdups_lane_s32 (int32x4_t a) { - return vdups_lane_s32 (a, 2); + int32x1_t res; + force_simd (a); + res = vdups_laneq_s32 (a, 2); + force_simd (res); + return res; } uint32x1_t test_vdups_lane_u32 (uint32x4_t a) { - return vdups_lane_u32 (a, 2); + uint32x1_t res; + force_simd (a); + res = vdups_laneq_u32 (a, 2); + force_simd (res); + return res; } /* { dg-final { scan-assembler-times "aarch64_get_lanev2di" 2 } } */ @@ -240,13 +264,21 @@ test_vdups_lane_u32 (uint32x4_t a) int64x1_t test_vdupd_lane_s64 (int64x2_t a) { - return vdupd_lane_s64 (a, 1); + int64x1_t res; + force_simd (a); + res = vdupd_laneq_s64 (a, 1); + force_simd (res); + return res; } uint64x1_t test_vdupd_lane_u64 (uint64x2_t a) { - return vdupd_lane_u64 (a, 1); + uint64x1_t res; + force_simd (a); + res = vdupd_laneq_u64 (a, 1); + force_simd (res); + return res; } /* { dg-final { scan-assembler-times "\\tcmtst\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 2 } } */ -- 2.30.2
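
Reviewer note (not part of the patch): a minimal usage sketch of the lane-duplication
intrinsics rewritten in C above, assuming an AArch64 GCC with this patch applied.
The function names below are hypothetical examples for illustration; they do not come
from the patch or from scalar_intrinsics.c, and the expected assembly is a plausible
outcome of the new dup patterns, not a guarantee.

/* Illustrative only -- compile with e.g. "aarch64-linux-gnu-gcc -O2 -S".  */
#include <arm_neon.h>

int32x4_t
example_broadcast_lane1 (int32x2_t v)
{
  /* Duplicate lane 1 of a 64-bit vector across all four lanes of a
     128-bit vector; with the widened dup_lane patterns this should
     map to a single "dup v0.4s, v0.s[1]".  */
  return vdupq_lane_s32 (v, 1);
}

float32_t
example_scalar_dup (float32x4_t v)
{
  /* The scalar forms added above (vdups_laneq_f32 and friends) read a
     single lane from a Q-register operand.  */
  return vdups_laneq_f32 (v, 0);
}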