From 5543f9411cac1528701c1fd70d3cc4a84ae42d4b Mon Sep 17 00:00:00 2001 From: Alan Lawrence Date: Tue, 14 Apr 2015 13:59:06 +0000 Subject: [PATCH] [AArch64 Intrinsics] Replace temporary assembler for vst1_lane * config/aarch64/arm_neon.h (vst1_lane_f32, vst1_lane_f64, vst1_lane_p8, vst1_lane_p16, vst1_lane_s8, vst1_lane_s16, vst1_lane_s32, vst1_lane_s64, vst1_lane_u8, vst1_lane_u16, vst1_lane_u32, vst1_lane_u64, vst1q_lane_f32, vst1q_lane_f64, vst1q_lane_p8, vst1q_lane_p16, vst1q_lane_s8, vst1q_lane_s16, vst1q_lane_s32, vst1q_lane_s64, vst1q_lane_u8, vst1q_lane_u16, vst1q_lane_u32, vst1q_lane_u64): Reimplement with pointer dereference and __aarch64_vget_lane_any. From-SVN: r222092 --- gcc/ChangeLog | 11 + gcc/config/aarch64/arm_neon.h | 418 ++++++++++++---------------------- 2 files changed, 161 insertions(+), 268 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 33c3d92764a..e3ed9678105 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,14 @@ +2015-04-14 Alan Lawrence + + * config/aarch64/arm_neon.h (vst1_lane_f32, vst1_lane_f64, + vst1_lane_p8, vst1_lane_p16, vst1_lane_s8, vst1_lane_s16, + vst1_lane_s32, vst1_lane_s64, vst1_lane_u8, vst1_lane_u16, + vst1_lane_u32, vst1_lane_u64, vst1q_lane_f32, vst1q_lane_f64, + vst1q_lane_p8, vst1q_lane_p16, vst1q_lane_s8, vst1q_lane_s16, + vst1q_lane_s32, vst1q_lane_s64, vst1q_lane_u8, vst1q_lane_u16, + vst1q_lane_u32, vst1q_lane_u64): Reimplement with pointer dereference + and __aarch64_vget_lane_any. + 2015-04-14 Jakub Jelinek PR rtl-optimization/65761 diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 4c1531222c6..71ef027d2b4 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -9824,272 +9824,6 @@ vrsqrtss_f32 (float32_t a, float32_t b) result; \ }) -#define vst1_lane_f32(a, b, c) \ - __extension__ \ - ({ \ - float32x2_t b_ = (b); \ - float32_t * a_ = (a); \ - __asm__ ("st1 {%1.s}[%2],[%0]" \ - : \ - : "r"(a_), "w"(b_), "i"(c) \ - : "memory"); \ - }) - -#define vst1_lane_f64(a, b, c) \ - __extension__ \ - ({ \ - float64x1_t b_ = (b); \ - float64_t * a_ = (a); \ - __asm__ ("st1 {%1.d}[%2],[%0]" \ - : \ - : "r"(a_), "w"(b_), "i"(c) \ - : "memory"); \ - }) - -#define vst1_lane_p8(a, b, c) \ - __extension__ \ - ({ \ - poly8x8_t b_ = (b); \ - poly8_t * a_ = (a); \ - __asm__ ("st1 {%1.b}[%2],[%0]" \ - : \ - : "r"(a_), "w"(b_), "i"(c) \ - : "memory"); \ - }) - -#define vst1_lane_p16(a, b, c) \ - __extension__ \ - ({ \ - poly16x4_t b_ = (b); \ - poly16_t * a_ = (a); \ - __asm__ ("st1 {%1.h}[%2],[%0]" \ - : \ - : "r"(a_), "w"(b_), "i"(c) \ - : "memory"); \ - }) - -#define vst1_lane_s8(a, b, c) \ - __extension__ \ - ({ \ - int8x8_t b_ = (b); \ - int8_t * a_ = (a); \ - __asm__ ("st1 {%1.b}[%2],[%0]" \ - : \ - : "r"(a_), "w"(b_), "i"(c) \ - : "memory"); \ - }) - -#define vst1_lane_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x4_t b_ = (b); \ - int16_t * a_ = (a); \ - __asm__ ("st1 {%1.h}[%2],[%0]" \ - : \ - : "r"(a_), "w"(b_), "i"(c) \ - : "memory"); \ - }) - -#define vst1_lane_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x2_t b_ = (b); \ - int32_t * a_ = (a); \ - __asm__ ("st1 {%1.s}[%2],[%0]" \ - : \ - : "r"(a_), "w"(b_), "i"(c) \ - : "memory"); \ - }) - -#define vst1_lane_s64(a, b, c) \ - __extension__ \ - ({ \ - int64x1_t b_ = (b); \ - int64_t * a_ = (a); \ - __asm__ ("st1 {%1.d}[%2],[%0]" \ - : \ - : "r"(a_), "w"(b_), "i"(c) \ - : "memory"); \ - }) - -#define vst1_lane_u8(a, b, c) \ - __extension__ \ - ({ \ - uint8x8_t b_ = (b); \ - uint8_t * a_ = (a); \ - __asm__ ("st1 {%1.b}[%2],[%0]" \ - : \ - : "r"(a_), "w"(b_), "i"(c) \ - : "memory"); \ - }) - -#define vst1_lane_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x4_t b_ = (b); \ - uint16_t * a_ = (a); \ - __asm__ ("st1 {%1.h}[%2],[%0]" \ - : \ - : "r"(a_), "w"(b_), "i"(c) \ - : "memory"); \ - }) - -#define vst1_lane_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x2_t b_ = (b); \ - uint32_t * a_ = (a); \ - __asm__ ("st1 {%1.s}[%2],[%0]" \ - : \ - : "r"(a_), "w"(b_), "i"(c) \ - : "memory"); \ - }) - -#define vst1_lane_u64(a, b, c) \ - __extension__ \ - ({ \ - uint64x1_t b_ = (b); \ - uint64_t * a_ = (a); \ - __asm__ ("st1 {%1.d}[%2],[%0]" \ - : \ - : "r"(a_), "w"(b_), "i"(c) \ - : "memory"); \ - }) - - -#define vst1q_lane_f32(a, b, c) \ - __extension__ \ - ({ \ - float32x4_t b_ = (b); \ - float32_t * a_ = (a); \ - __asm__ ("st1 {%1.s}[%2],[%0]" \ - : \ - : "r"(a_), "w"(b_), "i"(c) \ - : "memory"); \ - }) - -#define vst1q_lane_f64(a, b, c) \ - __extension__ \ - ({ \ - float64x2_t b_ = (b); \ - float64_t * a_ = (a); \ - __asm__ ("st1 {%1.d}[%2],[%0]" \ - : \ - : "r"(a_), "w"(b_), "i"(c) \ - : "memory"); \ - }) - -#define vst1q_lane_p8(a, b, c) \ - __extension__ \ - ({ \ - poly8x16_t b_ = (b); \ - poly8_t * a_ = (a); \ - __asm__ ("st1 {%1.b}[%2],[%0]" \ - : \ - : "r"(a_), "w"(b_), "i"(c) \ - : "memory"); \ - }) - -#define vst1q_lane_p16(a, b, c) \ - __extension__ \ - ({ \ - poly16x8_t b_ = (b); \ - poly16_t * a_ = (a); \ - __asm__ ("st1 {%1.h}[%2],[%0]" \ - : \ - : "r"(a_), "w"(b_), "i"(c) \ - : "memory"); \ - }) - -#define vst1q_lane_s8(a, b, c) \ - __extension__ \ - ({ \ - int8x16_t b_ = (b); \ - int8_t * a_ = (a); \ - __asm__ ("st1 {%1.b}[%2],[%0]" \ - : \ - : "r"(a_), "w"(b_), "i"(c) \ - : "memory"); \ - }) - -#define vst1q_lane_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x8_t b_ = (b); \ - int16_t * a_ = (a); \ - __asm__ ("st1 {%1.h}[%2],[%0]" \ - : \ - : "r"(a_), "w"(b_), "i"(c) \ - : "memory"); \ - }) - -#define vst1q_lane_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x4_t b_ = (b); \ - int32_t * a_ = (a); \ - __asm__ ("st1 {%1.s}[%2],[%0]" \ - : \ - : "r"(a_), "w"(b_), "i"(c) \ - : "memory"); \ - }) - -#define vst1q_lane_s64(a, b, c) \ - __extension__ \ - ({ \ - int64x2_t b_ = (b); \ - int64_t * a_ = (a); \ - __asm__ ("st1 {%1.d}[%2],[%0]" \ - : \ - : "r"(a_), "w"(b_), "i"(c) \ - : "memory"); \ - }) - -#define vst1q_lane_u8(a, b, c) \ - __extension__ \ - ({ \ - uint8x16_t b_ = (b); \ - uint8_t * a_ = (a); \ - __asm__ ("st1 {%1.b}[%2],[%0]" \ - : \ - : "r"(a_), "w"(b_), "i"(c) \ - : "memory"); \ - }) - -#define vst1q_lane_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x8_t b_ = (b); \ - uint16_t * a_ = (a); \ - __asm__ ("st1 {%1.h}[%2],[%0]" \ - : \ - : "r"(a_), "w"(b_), "i"(c) \ - : "memory"); \ - }) - -#define vst1q_lane_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x4_t b_ = (b); \ - uint32_t * a_ = (a); \ - __asm__ ("st1 {%1.s}[%2],[%0]" \ - : \ - : "r"(a_), "w"(b_), "i"(c) \ - : "memory"); \ - }) - -#define vst1q_lane_u64(a, b, c) \ - __extension__ \ - ({ \ - uint64x2_t b_ = (b); \ - uint64_t * a_ = (a); \ - __asm__ ("st1 {%1.d}[%2],[%0]" \ - : \ - : "r"(a_), "w"(b_), "i"(c) \ - : "memory"); \ - }) - - __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) vtst_p8 (poly8x8_t a, poly8x8_t b) { @@ -22302,6 +22036,8 @@ vst1_u64 (uint64_t *a, uint64x1_t b) *a = b[0]; } +/* vst1q */ + __extension__ static __inline void __attribute__ ((__always_inline__)) vst1q_f32 (float32_t *a, float32x4_t b) { @@ -22314,8 +22050,6 @@ vst1q_f64 (float64_t *a, float64x2_t b) __builtin_aarch64_st1v2df ((__builtin_aarch64_simd_df *) a, b); } -/* vst1q */ - __extension__ static __inline void __attribute__ ((__always_inline__)) vst1q_p8 (poly8_t *a, poly8x16_t b) { @@ -22382,6 +22116,154 @@ vst1q_u64 (uint64_t *a, uint64x2_t b) (int64x2_t) b); } +/* vst1_lane */ + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_lane_f32 (float32_t *__a, float32x2_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_lane_f64 (float64_t *__a, float64x1_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_lane_p8 (poly8_t *__a, poly8x8_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_lane_p16 (poly16_t *__a, poly16x4_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_lane_s8 (int8_t *__a, int8x8_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_lane_s16 (int16_t *__a, int16x4_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_lane_s32 (int32_t *__a, int32x2_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_lane_s64 (int64_t *__a, int64x1_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_lane_u8 (uint8_t *__a, uint8x8_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_lane_u16 (uint16_t *__a, uint16x4_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_lane_u32 (uint32_t *__a, uint32x2_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_lane_u64 (uint64_t *__a, uint64x1_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any (__b, __lane); +} + +/* vst1q_lane */ + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_lane_f32 (float32_t *__a, float32x4_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_lane_f64 (float64_t *__a, float64x2_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_lane_p8 (poly8_t *__a, poly8x16_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_lane_p16 (poly16_t *__a, poly16x8_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_lane_s8 (int8_t *__a, int8x16_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_lane_s16 (int16_t *__a, int16x8_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_lane_s32 (int32_t *__a, int32x4_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_lane_s64 (int64_t *__a, int64x2_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_lane_u8 (uint8_t *__a, uint8x16_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_lane_u16 (uint16_t *__a, uint16x8_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_lane_u32 (uint32_t *__a, uint32x4_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_lane_u64 (uint64_t *__a, uint64x2_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any (__b, __lane); +} + /* vstn */ __extension__ static __inline void -- 2.30.2