#define __aarch64_vdupq_laneq_u64(__a, __b) \
__aarch64_vdup_lane_any (u64, q, q, __a, __b)
-/* vset_lane internal macro. */
+/* vset_lane and vld1_lane internal macro. */
#ifdef __AARCH64EB__
/* For big-endian, GCC's vector indices are the opposite way around
return result;
}
-#define vld1_lane_f32(a, b, c) \
- __extension__ \
- ({ \
- float32x2_t b_ = (b); \
- const float32_t * a_ = (a); \
- float32x2_t result; \
- __asm__ ("ld1 {%0.s}[%1], %2" \
- : "=w"(result) \
- : "i" (c), "Utv"(*a_), "0"(b_) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vld1_lane_f64(a, b, c) \
- __extension__ \
- ({ \
- float64x1_t b_ = (b); \
- const float64_t * a_ = (a); \
- float64x1_t result; \
- __asm__ ("ld1 {%0.d}[%1], %2" \
- : "=w"(result) \
- : "i" (c), "Utv"(*a_), "0"(b_) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vld1_lane_p8(a, b, c) \
- __extension__ \
- ({ \
- poly8x8_t b_ = (b); \
- const poly8_t * a_ = (a); \
- poly8x8_t result; \
- __asm__ ("ld1 {%0.b}[%1], %2" \
- : "=w"(result) \
- : "i" (c), "Utv"(*a_), "0"(b_) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vld1_lane_p16(a, b, c) \
- __extension__ \
- ({ \
- poly16x4_t b_ = (b); \
- const poly16_t * a_ = (a); \
- poly16x4_t result; \
- __asm__ ("ld1 {%0.h}[%1], %2" \
- : "=w"(result) \
- : "i" (c), "Utv"(*a_), "0"(b_) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vld1_lane_s8(a, b, c) \
- __extension__ \
- ({ \
- int8x8_t b_ = (b); \
- const int8_t * a_ = (a); \
- int8x8_t result; \
- __asm__ ("ld1 {%0.b}[%1], %2" \
- : "=w"(result) \
- : "i" (c), "Utv"(*a_), "0"(b_) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vld1_lane_s16(a, b, c) \
- __extension__ \
- ({ \
- int16x4_t b_ = (b); \
- const int16_t * a_ = (a); \
- int16x4_t result; \
- __asm__ ("ld1 {%0.h}[%1], %2" \
- : "=w"(result) \
- : "i" (c), "Utv"(*a_), "0"(b_) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vld1_lane_s32(a, b, c) \
- __extension__ \
- ({ \
- int32x2_t b_ = (b); \
- const int32_t * a_ = (a); \
- int32x2_t result; \
- __asm__ ("ld1 {%0.s}[%1], %2" \
- : "=w"(result) \
- : "i" (c), "Utv"(*a_), "0"(b_) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vld1_lane_s64(a, b, c) \
- __extension__ \
- ({ \
- int64x1_t b_ = (b); \
- const int64_t * a_ = (a); \
- int64x1_t result; \
- __asm__ ("ld1 {%0.d}[%1], %2" \
- : "=w"(result) \
- : "i" (c), "Utv"(*a_), "0"(b_) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vld1_lane_u8(a, b, c) \
- __extension__ \
- ({ \
- uint8x8_t b_ = (b); \
- const uint8_t * a_ = (a); \
- uint8x8_t result; \
- __asm__ ("ld1 {%0.b}[%1], %2" \
- : "=w"(result) \
- : "i" (c), "Utv"(*a_), "0"(b_) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vld1_lane_u16(a, b, c) \
- __extension__ \
- ({ \
- uint16x4_t b_ = (b); \
- const uint16_t * a_ = (a); \
- uint16x4_t result; \
- __asm__ ("ld1 {%0.h}[%1], %2" \
- : "=w"(result) \
- : "i" (c), "Utv"(*a_), "0"(b_) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vld1_lane_u32(a, b, c) \
- __extension__ \
- ({ \
- uint32x2_t b_ = (b); \
- const uint32_t * a_ = (a); \
- uint32x2_t result; \
- __asm__ ("ld1 {%0.s}[%1], %2" \
- : "=w"(result) \
- : "i" (c), "Utv"(*a_), "0"(b_) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vld1_lane_u64(a, b, c) \
- __extension__ \
- ({ \
- uint64x1_t b_ = (b); \
- const uint64_t * a_ = (a); \
- uint64x1_t result; \
- __asm__ ("ld1 {%0.d}[%1], %2" \
- : "=w"(result) \
- : "i" (c), "Utv"(*a_), "0"(b_) \
- : /* No clobbers */); \
- result; \
- })
-
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vld1q_dup_f32 (const float32_t * a)
{
return result;
}
-#define vld1q_lane_f32(a, b, c) \
- __extension__ \
- ({ \
- float32x4_t b_ = (b); \
- const float32_t * a_ = (a); \
- float32x4_t result; \
- __asm__ ("ld1 {%0.s}[%1], %2" \
- : "=w"(result) \
- : "i"(c), "Utv"(*a_), "0"(b_) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vld1q_lane_f64(a, b, c) \
- __extension__ \
- ({ \
- float64x2_t b_ = (b); \
- const float64_t * a_ = (a); \
- float64x2_t result; \
- __asm__ ("ld1 {%0.d}[%1], %2" \
- : "=w"(result) \
- : "i"(c), "Utv"(*a_), "0"(b_) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vld1q_lane_p8(a, b, c) \
- __extension__ \
- ({ \
- poly8x16_t b_ = (b); \
- const poly8_t * a_ = (a); \
- poly8x16_t result; \
- __asm__ ("ld1 {%0.b}[%1], %2" \
- : "=w"(result) \
- : "i"(c), "Utv"(*a_), "0"(b_) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vld1q_lane_p16(a, b, c) \
- __extension__ \
- ({ \
- poly16x8_t b_ = (b); \
- const poly16_t * a_ = (a); \
- poly16x8_t result; \
- __asm__ ("ld1 {%0.h}[%1], %2" \
- : "=w"(result) \
- : "i"(c), "Utv"(*a_), "0"(b_) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vld1q_lane_s8(a, b, c) \
- __extension__ \
- ({ \
- int8x16_t b_ = (b); \
- const int8_t * a_ = (a); \
- int8x16_t result; \
- __asm__ ("ld1 {%0.b}[%1], %2" \
- : "=w"(result) \
- : "i"(c), "Utv"(*a_), "0"(b_) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vld1q_lane_s16(a, b, c) \
- __extension__ \
- ({ \
- int16x8_t b_ = (b); \
- const int16_t * a_ = (a); \
- int16x8_t result; \
- __asm__ ("ld1 {%0.h}[%1], %2" \
- : "=w"(result) \
- : "i"(c), "Utv"(*a_), "0"(b_) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vld1q_lane_s32(a, b, c) \
- __extension__ \
- ({ \
- int32x4_t b_ = (b); \
- const int32_t * a_ = (a); \
- int32x4_t result; \
- __asm__ ("ld1 {%0.s}[%1], %2" \
- : "=w"(result) \
- : "i"(c), "Utv"(*a_), "0"(b_) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vld1q_lane_s64(a, b, c) \
- __extension__ \
- ({ \
- int64x2_t b_ = (b); \
- const int64_t * a_ = (a); \
- int64x2_t result; \
- __asm__ ("ld1 {%0.d}[%1], %2" \
- : "=w"(result) \
- : "i"(c), "Utv"(*a_), "0"(b_) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vld1q_lane_u8(a, b, c) \
- __extension__ \
- ({ \
- uint8x16_t b_ = (b); \
- const uint8_t * a_ = (a); \
- uint8x16_t result; \
- __asm__ ("ld1 {%0.b}[%1], %2" \
- : "=w"(result) \
- : "i"(c), "Utv"(*a_), "0"(b_) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vld1q_lane_u16(a, b, c) \
- __extension__ \
- ({ \
- uint16x8_t b_ = (b); \
- const uint16_t * a_ = (a); \
- uint16x8_t result; \
- __asm__ ("ld1 {%0.h}[%1], %2" \
- : "=w"(result) \
- : "i"(c), "Utv"(*a_), "0"(b_) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vld1q_lane_u32(a, b, c) \
- __extension__ \
- ({ \
- uint32x4_t b_ = (b); \
- const uint32_t * a_ = (a); \
- uint32x4_t result; \
- __asm__ ("ld1 {%0.s}[%1], %2" \
- : "=w"(result) \
- : "i"(c), "Utv"(*a_), "0"(b_) \
- : /* No clobbers */); \
- result; \
- })
-
-#define vld1q_lane_u64(a, b, c) \
- __extension__ \
- ({ \
- uint64x2_t b_ = (b); \
- const uint64_t * a_ = (a); \
- uint64x2_t result; \
- __asm__ ("ld1 {%0.d}[%1], %2" \
- : "=w"(result) \
- : "i"(c), "Utv"(*a_), "0"(b_) \
- : /* No clobbers */); \
- result; \
- })
-
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vmla_n_f32 (float32x2_t a, float32x2_t b, float32_t c)
{
__builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a);
}
+/* vld1_lane */
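+
+/* Each of these loads one element from __SRC into lane __LANE of __VEC,
+   leaving the other lanes unchanged; for example, vld1_lane_s32 (p, v, 1)
+   on v = {a, b} yields {a, p[0]}.  Building these on top of vset_lane
+   (rather than the hard-coded "ld1" inline asm removed above) leaves
+   instruction selection to the compiler and reuses vset_lane's
+   big-endian lane-index handling.  */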
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vld1_lane_f32 (const float32_t *__src, float32x2_t __vec, const int __lane)
+{
+ return vset_lane_f32 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+vld1_lane_f64 (const float64_t *__src, float64x1_t __vec, const int __lane)
+{
+ return vset_lane_f64 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vld1_lane_p8 (const poly8_t *__src, poly8x8_t __vec, const int __lane)
+{
+ return vset_lane_p8 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vld1_lane_p16 (const poly16_t *__src, poly16x4_t __vec, const int __lane)
+{
+ return vset_lane_p16 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vld1_lane_s8 (const int8_t *__src, int8x8_t __vec, const int __lane)
+{
+ return vset_lane_s8 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vld1_lane_s16 (const int16_t *__src, int16x4_t __vec, const int __lane)
+{
+ return vset_lane_s16 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vld1_lane_s32 (const int32_t *__src, int32x2_t __vec, const int __lane)
+{
+ return vset_lane_s32 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vld1_lane_s64 (const int64_t *__src, int64x1_t __vec, const int __lane)
+{
+ return vset_lane_s64 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vld1_lane_u8 (const uint8_t *__src, uint8x8_t __vec, const int __lane)
+{
+ return vset_lane_u8 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vld1_lane_u16 (const uint16_t *__src, uint16x4_t __vec, const int __lane)
+{
+ return vset_lane_u16 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vld1_lane_u32 (const uint32_t *__src, uint32x2_t __vec, const int __lane)
+{
+ return vset_lane_u32 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vld1_lane_u64 (const uint64_t *__src, uint64x1_t __vec, const int __lane)
+{
+ return vset_lane_u64 (*__src, __vec, __lane);
+}
+
+/* vld1q_lane */
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vld1q_lane_f32 (const float32_t *__src, float32x4_t __vec, const int __lane)
+{
+ return vsetq_lane_f32 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vld1q_lane_f64 (const float64_t *__src, float64x2_t __vec, const int __lane)
+{
+ return vsetq_lane_f64 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vld1q_lane_p8 (const poly8_t *__src, poly8x16_t __vec, const int __lane)
+{
+ return vsetq_lane_p8 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vld1q_lane_p16 (const poly16_t *__src, poly16x8_t __vec, const int __lane)
+{
+ return vsetq_lane_p16 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vld1q_lane_s8 (const int8_t *__src, int8x16_t __vec, const int __lane)
+{
+ return vsetq_lane_s8 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vld1q_lane_s16 (const int16_t *__src, int16x8_t __vec, const int __lane)
+{
+ return vsetq_lane_s16 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vld1q_lane_s32 (const int32_t *__src, int32x4_t __vec, const int __lane)
+{
+ return vsetq_lane_s32 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vld1q_lane_s64 (const int64_t *__src, int64x2_t __vec, const int __lane)
+{
+ return vsetq_lane_s64 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vld1q_lane_u8 (const uint8_t *__src, uint8x16_t __vec, const int __lane)
+{
+ return vsetq_lane_u8 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vld1q_lane_u16 (const uint16_t *__src, uint16x8_t __vec, const int __lane)
+{
+ return vsetq_lane_u16 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vld1q_lane_u32 (const uint32_t *__src, uint32x4_t __vec, const int __lane)
+{
+ return vsetq_lane_u32 (*__src, __vec, __lane);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vld1q_lane_u64 (const uint64_t *__src, uint64x2_t __vec, const int __lane)
+{
+ return vsetq_lane_u64 (*__src, __vec, __lane);
+}
+
/* vldn */
__extension__ static __inline int64x1x2_t __attribute__ ((__always_inline__))
--- /dev/null
+/* { dg-do run } */
+/* { dg-options "-O3 -fno-inline" } */
+
+#include <arm_neon.h>
+
+extern void abort (void);
+
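+/* Each VARIANT below is (base type, D/Q register suffix, elements per
+   vector, intrinsic type suffix, lane to reload); the lane index is
+   always less than the element count.  */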
+#define VARIANTS(VARIANT) \
+VARIANT (uint8, , 8, _u8, 5) \
+VARIANT (uint16, , 4, _u16, 3) \
+VARIANT (uint32, , 2, _u32, 1) \
+VARIANT (uint64, , 1, _u64, 0) \
+VARIANT (int8, , 8, _s8, 3) \
+VARIANT (int16, , 4, _s16, 2) \
+VARIANT (int32, , 2, _s32, 0) \
+VARIANT (int64, , 1, _s64, 0) \
+VARIANT (poly8, , 8, _p8, 7) \
+VARIANT (poly16, , 4, _p16, 2) \
+VARIANT (float32, , 2, _f32, 1) \
+VARIANT (float64, , 1, _f64, 0) \
+VARIANT (uint8, q, 16, _u8, 13) \
+VARIANT (uint16, q, 8, _u16, 5) \
+VARIANT (uint32, q, 4, _u32, 1) \
+VARIANT (uint64, q, 2, _u64, 0) \
+VARIANT (int8, q, 16, _s8, 15) \
+VARIANT (int16, q, 8, _s16, 3) \
+VARIANT (int32, q, 4, _s32, 1) \
+VARIANT (int64, q, 2, _s64, 1) \
+VARIANT (poly8, q, 16, _p8, 7) \
+VARIANT (poly16, q, 8, _p16, 4) \
+VARIANT (float32, q, 4, _f32, 2) \
+VARIANT (float64, q, 2, _f64, 1)
+
+#define TESTMETH(BASE, Q, ELTS, SUFFIX, LANE) \
+__attribute__((noinline)) BASE##x##ELTS##_t \
+wrap_vld1##Q##_lane##SUFFIX (const BASE##_t *load, \
+ BASE##x##ELTS##_t vec) \
+{ return vld1##Q##_lane##SUFFIX (load, vec, LANE); } \
+int \
+test_vld1##Q##_lane##SUFFIX (const BASE##_t *data, \
+ const BASE##_t *overwrite) \
+{ \
+ BASE##_t out[ELTS]; \
+ int j; \
+ BASE##x##ELTS##_t in = vld1##Q##SUFFIX (data); \
+ in = wrap_vld1##Q##_lane##SUFFIX (overwrite, in); \
+ vst1##Q##SUFFIX (out, in); \
+ for (j = 0; j < ELTS; j++) \
+ if (out[j] != (j == LANE ? *overwrite : data[j])) \
+ return 1; \
+ return 0; \
+}
+
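+/* For example, TESTMETH (int32, , 2, _s32, 0) defines a noinline wrapper
+   wrap_vld1_lane_s32 around vld1_lane_s32, plus test_vld1_lane_s32,
+   which loads { data[0], data[1] }, reloads lane 0 from *overwrite,
+   stores the result, and checks that exactly lane 0 was replaced.  */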
+
+VARIANTS (TESTMETH)
+
+#define CHECK(BASE, Q, ELTS, SUFFIX, LANE) \
+ if (test_vld1##Q##_lane##SUFFIX ((const BASE##_t *)orig_data, \
+ BASE##_data) != 0) \
+ abort ();
+
+int
+main (int argc, char **argv)
+{
+ /* Original data for all vector formats. */
+ uint64_t orig_data[2] = {0x1234567890abcdefULL, 0x13579bdf02468aceULL};
+
+  /* Data with which vld1_lane will overwrite one lane of the above.  */
+ uint8_t uint8_data[4] = { 7, 11, 13, 17 };
+ uint16_t uint16_data[4] = { 257, 263, 269, 271 };
+ uint32_t uint32_data[4] = { 65537, 65539, 65543, 65551 };
+ uint64_t uint64_data[4] = { 0xdeadbeefcafebabeULL, 0x0123456789abcdefULL,
+			      0xfedcba9876543210ULL, 0xdeadbabecafebeefULL };
+ int8_t int8_data[4] = { -1, 3, -5, 7 };
+ int16_t int16_data[4] = { 257, -259, 261, -263 };
+ int32_t int32_data[4] = { 123456789, -987654321, -135792468, 975318642 };
+ int64_t *int64_data = (int64_t *)uint64_data;
+  poly8_t poly8_data[4] = { 0, 7, 13, 18 };
+ poly16_t poly16_data[4] = { 11111, 2222, 333, 44 };
+ float32_t float32_data[4] = { 3.14159, 2.718, 1.414, 100.0 };
+ float64_t float64_data[4] = { 1.010010001, 12345.6789, -9876.54321, 1.618 };
+
+ VARIANTS (CHECK);
+ return 0;
+}