[AArch64][1/10] ARMv8.2-A FP16 data processing intrinsics

author Jiong Wang <jiong.wang@arm.com>

Mon, 25 Jul 2016 14:02:42 +0000 (14:02 +0000)

committer Jiong Wang <jiwang@gcc.gnu.org>

Mon, 25 Jul 2016 14:02:42 +0000 (14:02 +0000)
author Jiong Wang <jiong.wang@arm.com>
Mon, 25 Jul 2016 14:02:42 +0000 (14:02 +0000)
committer Jiong Wang <jiwang@gcc.gnu.org>
Mon, 25 Jul 2016 14:02:42 +0000 (14:02 +0000)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index f82a4d3c88894b2bcdedec61796c497afddde7b2..1e3f304a79d8a498737b317c96fc89301bb76ef3 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,51 @@
+2016-07-25  Jiong Wang  <jiong.wang@arm.com>
+
+       * config/aarch64/aarch64-simd.md
+       (aarch64_<PERMUTE:perm_insn><PERMUTE:perm_hilo><mode>): Use VALL_F16.
+       (aarch64_ext<mode>): Likewise.
+       (aarch64_rev<REVERSE:rev_op><mode>): Likewise.
+       * config/aarch64/aarch64.c (aarch64_evpc_trn): Support V4HFmode and
+       V8HFmode.
+       (aarch64_evpc_uzp): Likewise.
+       (aarch64_evpc_zip): Likewise.
+       (aarch64_evpc_ext): Likewise.
+       (aarch64_evpc_rev): Likewise.
+       * config/aarch64/arm_neon.h (__aarch64_vdup_lane_f16): New.
+       (__aarch64_vdup_laneq_f16): New..
+       (__aarch64_vdupq_lane_f16): New.
+       (__aarch64_vdupq_laneq_f16): New.
+       (vbsl_f16): New.
+       (vbslq_f16): New.
+       (vdup_n_f16): New.
+       (vdupq_n_f16): New.
+       (vdup_lane_f16): New.
+       (vdup_laneq_f16): New.
+       (vdupq_lane_f16): New.
+       (vdupq_laneq_f16): New.
+       (vduph_lane_f16): New.
+       (vduph_laneq_f16): New.
+       (vext_f16): New.
+       (vextq_f16): New.
+       (vmov_n_f16): New.
+       (vmovq_n_f16): New.
+       (vrev64_f16): New.
+       (vrev64q_f16): New.
+       (vtrn1_f16): New.
+       (vtrn1q_f16): New.
+       (vtrn2_f16): New.
+       (vtrn2q_f16): New.
+       (vtrn_f16): New.
+       (vtrnq_f16): New.
+       (__INTERLEAVE_LIST): Support float16x4_t, float16x8_t.
+       (vuzp1_f16): New.
+       (vuzp1q_f16): New.
+       (vuzp2_f16): New.
+       (vuzp2q_f16): New.
+       (vzip1_f16): New.
+       (vzip2q_f16): New.
+       (vmov_n_f16): Reimplement using vdup_n_f16.
+       (vmovq_n_f16): Reimplement using vdupq_n_f16..
+
  2016-07-25  Jiong Wang  <jiong.wang@arm.com>
  
         * config/aarch64/aarch64.c (aarch64_add_constant): New parameter
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md

index a19d1711b5bcb516e4aca6a22d1b79df4f32923f..251ad972a4bed027f8c77946fb21ce8d94dc3035 100644 (file)
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -5219,10 +5219,10 @@
  )
  
  (define_insn "aarch64_<PERMUTE:perm_insn><PERMUTE:perm_hilo><mode>"
-  [(set (match_operand:VALL 0 "register_operand" "=w")
-       (unspec:VALL [(match_operand:VALL 1 "register_operand" "w")
-                     (match_operand:VALL 2 "register_operand" "w")]
-                      PERMUTE))]
+  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
+       (unspec:VALL_F16 [(match_operand:VALL_F16 1 "register_operand" "w")
+                         (match_operand:VALL_F16 2 "register_operand" "w")]
+        PERMUTE))]
    "TARGET_SIMD"
    "<PERMUTE:perm_insn><PERMUTE:perm_hilo>\\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
    [(set_attr "type" "neon_permute<q>")]
@@ -5230,11 +5230,11 @@
  
  ;; Note immediate (third) operand is lane index not byte index.
  (define_insn "aarch64_ext<mode>"
-  [(set (match_operand:VALL 0 "register_operand" "=w")
-        (unspec:VALL [(match_operand:VALL 1 "register_operand" "w")
-                      (match_operand:VALL 2 "register_operand" "w")
-                      (match_operand:SI 3 "immediate_operand" "i")]
-                     UNSPEC_EXT))]
+  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
+        (unspec:VALL_F16 [(match_operand:VALL_F16 1 "register_operand" "w")
+                         (match_operand:VALL_F16 2 "register_operand" "w")
+                         (match_operand:SI 3 "immediate_operand" "i")]
+        UNSPEC_EXT))]
    "TARGET_SIMD"
  {
    operands[3] = GEN_INT (INTVAL (operands[3])
@@ -5245,8 +5245,8 @@
  )
  
  (define_insn "aarch64_rev<REVERSE:rev_op><mode>"
-  [(set (match_operand:VALL 0 "register_operand" "=w")
-       (unspec:VALL [(match_operand:VALL 1 "register_operand" "w")]
+  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
+       (unspec:VALL_F16 [(match_operand:VALL_F16 1 "register_operand" "w")]
                      REVERSE))]
    "TARGET_SIMD"
    "rev<REVERSE:rev_op>\\t%0.<Vtype>, %1.<Vtype>"
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c

index 5560516c9f1a903dae412dd70f3dafaffcf48446..381cf7d3b8509ff5c0d51294ef555c4ea4d6eedd 100644 (file)
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -12286,6 +12286,8 @@ aarch64_evpc_trn (struct expand_vec_perm_d *d)
         case V4SImode: gen = gen_aarch64_trn2v4si; break;
         case V2SImode: gen = gen_aarch64_trn2v2si; break;
         case V2DImode: gen = gen_aarch64_trn2v2di; break;
+       case V4HFmode: gen = gen_aarch64_trn2v4hf; break;
+       case V8HFmode: gen = gen_aarch64_trn2v8hf; break;
         case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
         case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
         case V2DFmode: gen = gen_aarch64_trn2v2df; break;
@@ -12304,6 +12306,8 @@ aarch64_evpc_trn (struct expand_vec_perm_d *d)
         case V4SImode: gen = gen_aarch64_trn1v4si; break;
         case V2SImode: gen = gen_aarch64_trn1v2si; break;
         case V2DImode: gen = gen_aarch64_trn1v2di; break;
+       case V4HFmode: gen = gen_aarch64_trn1v4hf; break;
+       case V8HFmode: gen = gen_aarch64_trn1v8hf; break;
         case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
         case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
         case V2DFmode: gen = gen_aarch64_trn1v2df; break;
@@ -12369,6 +12373,8 @@ aarch64_evpc_uzp (struct expand_vec_perm_d *d)
         case V4SImode: gen = gen_aarch64_uzp2v4si; break;
         case V2SImode: gen = gen_aarch64_uzp2v2si; break;
         case V2DImode: gen = gen_aarch64_uzp2v2di; break;
+       case V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
+       case V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
         case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
         case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
         case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
@@ -12387,6 +12393,8 @@ aarch64_evpc_uzp (struct expand_vec_perm_d *d)
         case V4SImode: gen = gen_aarch64_uzp1v4si; break;
         case V2SImode: gen = gen_aarch64_uzp1v2si; break;
         case V2DImode: gen = gen_aarch64_uzp1v2di; break;
+       case V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
+       case V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
         case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
         case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
         case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
@@ -12457,6 +12465,8 @@ aarch64_evpc_zip (struct expand_vec_perm_d *d)
         case V4SImode: gen = gen_aarch64_zip2v4si; break;
         case V2SImode: gen = gen_aarch64_zip2v2si; break;
         case V2DImode: gen = gen_aarch64_zip2v2di; break;
+       case V4HFmode: gen = gen_aarch64_zip2v4hf; break;
+       case V8HFmode: gen = gen_aarch64_zip2v8hf; break;
         case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
         case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
         case V2DFmode: gen = gen_aarch64_zip2v2df; break;
@@ -12475,6 +12485,8 @@ aarch64_evpc_zip (struct expand_vec_perm_d *d)
         case V4SImode: gen = gen_aarch64_zip1v4si; break;
         case V2SImode: gen = gen_aarch64_zip1v2si; break;
         case V2DImode: gen = gen_aarch64_zip1v2di; break;
+       case V4HFmode: gen = gen_aarch64_zip1v4hf; break;
+       case V8HFmode: gen = gen_aarch64_zip1v8hf; break;
         case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
         case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
         case V2DFmode: gen = gen_aarch64_zip1v2df; break;
@@ -12519,6 +12531,8 @@ aarch64_evpc_ext (struct expand_vec_perm_d *d)
      case V8HImode: gen = gen_aarch64_extv8hi; break;
      case V2SImode: gen = gen_aarch64_extv2si; break;
      case V4SImode: gen = gen_aarch64_extv4si; break;
+    case V4HFmode: gen = gen_aarch64_extv4hf; break;
+    case V8HFmode: gen = gen_aarch64_extv8hf; break;
      case V2SFmode: gen = gen_aarch64_extv2sf; break;
      case V4SFmode: gen = gen_aarch64_extv4sf; break;
      case V2DImode: gen = gen_aarch64_extv2di; break;
@@ -12594,6 +12608,8 @@ aarch64_evpc_rev (struct expand_vec_perm_d *d)
         case V2SImode: gen = gen_aarch64_rev64v2si;  break;
         case V4SFmode: gen = gen_aarch64_rev64v4sf;  break;
         case V2SFmode: gen = gen_aarch64_rev64v2sf;  break;
+       case V8HFmode: gen = gen_aarch64_rev64v8hf;  break;
+       case V4HFmode: gen = gen_aarch64_rev64v4hf;  break;
         default:
           return false;
         }
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h

index b0ab1d33d37418f34af8f8d61f6caf143cad8477..fd5f094de6a058065e2b1377f5ffc4c1aba01f97 100644 (file)
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -466,6 +466,8 @@ typedef struct poly16x8x4_t
  #define __aarch64_vdup_lane_any(__size, __q, __a, __b) \
    vdup##__q##_n_##__size (__aarch64_vget_lane_any (__a, __b))
  
+#define __aarch64_vdup_lane_f16(__a, __b) \
+   __aarch64_vdup_lane_any (f16, , __a, __b)
  #define __aarch64_vdup_lane_f32(__a, __b) \
     __aarch64_vdup_lane_any (f32, , __a, __b)
  #define __aarch64_vdup_lane_f64(__a, __b) \
@@ -492,6 +494,8 @@ typedef struct poly16x8x4_t
     __aarch64_vdup_lane_any (u64, , __a, __b)
  
  /* __aarch64_vdup_laneq internal macros.  */
+#define __aarch64_vdup_laneq_f16(__a, __b) \
+   __aarch64_vdup_lane_any (f16, , __a, __b)
  #define __aarch64_vdup_laneq_f32(__a, __b) \
     __aarch64_vdup_lane_any (f32, , __a, __b)
  #define __aarch64_vdup_laneq_f64(__a, __b) \
@@ -518,6 +522,8 @@ typedef struct poly16x8x4_t
     __aarch64_vdup_lane_any (u64, , __a, __b)
  
  /* __aarch64_vdupq_lane internal macros.  */
+#define __aarch64_vdupq_lane_f16(__a, __b) \
+   __aarch64_vdup_lane_any (f16, q, __a, __b)
  #define __aarch64_vdupq_lane_f32(__a, __b) \
     __aarch64_vdup_lane_any (f32, q, __a, __b)
  #define __aarch64_vdupq_lane_f64(__a, __b) \
@@ -544,6 +550,8 @@ typedef struct poly16x8x4_t
     __aarch64_vdup_lane_any (u64, q, __a, __b)
  
  /* __aarch64_vdupq_laneq internal macros.  */
+#define __aarch64_vdupq_laneq_f16(__a, __b) \
+   __aarch64_vdup_lane_any (f16, q, __a, __b)
  #define __aarch64_vdupq_laneq_f32(__a, __b) \
     __aarch64_vdup_lane_any (f32, q, __a, __b)
  #define __aarch64_vdupq_laneq_f64(__a, __b) \
@@ -10213,6 +10221,12 @@ vaddvq_f64 (float64x2_t __a)
  
  /* vbsl  */
  
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vbsl_f16 (uint16x4_t __a, float16x4_t __b, float16x4_t __c)
+{
+  return __builtin_aarch64_simd_bslv4hf_suss (__a, __b, __c);
+}
+
  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
  vbsl_f32 (uint32x2_t __a, float32x2_t __b, float32x2_t __c)
  {
@@ -10288,6 +10302,12 @@ vbsl_u64 (uint64x1_t __a, uint64x1_t __b, uint64x1_t __c)
        {__builtin_aarch64_simd_bsldi_uuuu (__a[0], __b[0], __c[0])};
  }
  
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vbslq_f16 (uint16x8_t __a, float16x8_t __b, float16x8_t __c)
+{
+  return __builtin_aarch64_simd_bslv8hf_suss (__a, __b, __c);
+}
+
  __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
  vbslq_f32 (uint32x4_t __a, float32x4_t __b, float32x4_t __c)
  {
@@ -13243,6 +13263,12 @@ vcvtpq_u64_f64 (float64x2_t __a)
  
  /* vdup_n  */
  
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vdup_n_f16 (float16_t __a)
+{
+  return (float16x4_t) {__a, __a, __a, __a};
+}
+
  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
  vdup_n_f32 (float32_t __a)
  {
@@ -13317,6 +13343,12 @@ vdup_n_u64 (uint64_t __a)
  
  /* vdupq_n  */
  
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vdupq_n_f16 (float16_t __a)
+{
+  return (float16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
+}
+
  __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
  vdupq_n_f32 (float32_t __a)
  {
@@ -13394,6 +13426,12 @@ vdupq_n_u64 (uint64_t __a)
  
  /* vdup_lane  */
  
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vdup_lane_f16 (float16x4_t __a, const int __b)
+{
+  return __aarch64_vdup_lane_f16 (__a, __b);
+}
+
  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
  vdup_lane_f32 (float32x2_t __a, const int __b)
  {
@@ -13468,6 +13506,12 @@ vdup_lane_u64 (uint64x1_t __a, const int __b)
  
  /* vdup_laneq  */
  
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vdup_laneq_f16 (float16x8_t __a, const int __b)
+{
+  return __aarch64_vdup_laneq_f16 (__a, __b);
+}
+
  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
  vdup_laneq_f32 (float32x4_t __a, const int __b)
  {
@@ -13541,6 +13585,13 @@ vdup_laneq_u64 (uint64x2_t __a, const int __b)
  }
  
  /* vdupq_lane  */
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vdupq_lane_f16 (float16x4_t __a, const int __b)
+{
+  return __aarch64_vdupq_lane_f16 (__a, __b);
+}
+
  __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
  vdupq_lane_f32 (float32x2_t __a, const int __b)
  {
@@ -13614,6 +13665,13 @@ vdupq_lane_u64 (uint64x1_t __a, const int __b)
  }
  
  /* vdupq_laneq  */
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vdupq_laneq_f16 (float16x8_t __a, const int __b)
+{
+  return __aarch64_vdupq_laneq_f16 (__a, __b);
+}
+
  __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
  vdupq_laneq_f32 (float32x4_t __a, const int __b)
  {
@@ -13706,6 +13764,13 @@ vdupb_lane_u8 (uint8x8_t __a, const int __b)
  }
  
  /* vduph_lane  */
+
+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
+vduph_lane_f16 (float16x4_t __a, const int __b)
+{
+  return __aarch64_vget_lane_any (__a, __b);
+}
+
  __extension__ static __inline poly16_t __attribute__ ((__always_inline__))
  vduph_lane_p16 (poly16x4_t __a, const int __b)
  {
@@ -13725,6 +13790,7 @@ vduph_lane_u16 (uint16x4_t __a, const int __b)
  }
  
  /* vdups_lane  */
+
  __extension__ static __inline float32_t __attribute__ ((__always_inline__))
  vdups_lane_f32 (float32x2_t __a, const int __b)
  {
@@ -13785,6 +13851,13 @@ vdupb_laneq_u8 (uint8x16_t __a, const int __b)
  }
  
  /* vduph_laneq  */
+
+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
+vduph_laneq_f16 (float16x8_t __a, const int __b)
+{
+  return __aarch64_vget_lane_any (__a, __b);
+}
+
  __extension__ static __inline poly16_t __attribute__ ((__always_inline__))
  vduph_laneq_p16 (poly16x8_t __a, const int __b)
  {
@@ -13804,6 +13877,7 @@ vduph_laneq_u16 (uint16x8_t __a, const int __b)
  }
  
  /* vdups_laneq  */
+
  __extension__ static __inline float32_t __attribute__ ((__always_inline__))
  vdups_laneq_f32 (float32x4_t __a, const int __b)
  {
@@ -13843,6 +13917,19 @@ vdupd_laneq_u64 (uint64x2_t __a, const int __b)
  
  /* vext  */
  
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vext_f16 (float16x4_t __a, float16x4_t __b, __const int __c)
+{
+  __AARCH64_LANE_CHECK (__a, __c);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__b, __a,
+                           (uint16x4_t) {4 - __c, 5 - __c, 6 - __c, 7 - __c});
+#else
+  return __builtin_shuffle (__a, __b,
+                           (uint16x4_t) {__c, __c + 1, __c + 2, __c + 3});
+#endif
+}
+
  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
  vext_f32 (float32x2_t __a, float32x2_t __b, __const int __c)
  {
@@ -13974,6 +14061,22 @@ vext_u64 (uint64x1_t __a, uint64x1_t __b, __const int __c)
    return __a;
  }
  
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vextq_f16 (float16x8_t __a, float16x8_t __b, __const int __c)
+{
+  __AARCH64_LANE_CHECK (__a, __c);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__b, __a,
+                           (uint16x8_t) {8 - __c, 9 - __c, 10 - __c, 11 - __c,
+                                         12 - __c, 13 - __c, 14 - __c,
+                                         15 - __c});
+#else
+  return __builtin_shuffle (__a, __b,
+                           (uint16x8_t) {__c, __c + 1, __c + 2, __c + 3,
+                                         __c + 4, __c + 5, __c + 6, __c + 7});
+#endif
+}
+
  __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
  vextq_f32 (float32x4_t __a, float32x4_t __b, __const int __c)
  {
@@ -14609,8 +14712,7 @@ vld1q_u64 (const uint64_t *a)
  __extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
  vld1_dup_f16 (const float16_t* __a)
  {
-  float16_t __f = *__a;
-  return (float16x4_t) { __f, __f, __f, __f };
+  return vdup_n_f16 (*__a);
  }
  
  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
@@ -14690,8 +14792,7 @@ vld1_dup_u64 (const uint64_t* __a)
  __extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
  vld1q_dup_f16 (const float16_t* __a)
  {
-  float16_t __f = *__a;
-  return (float16x8_t) { __f, __f, __f, __f, __f, __f, __f, __f };
+  return vdupq_n_f16 (*__a);
  }
  
  __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
@@ -18294,6 +18395,12 @@ vmlsq_laneq_u32 (uint32x4_t __a, uint32x4_t __b,
  
  /* vmov_n_  */
  
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vmov_n_f16 (float16_t __a)
+{
+  return vdup_n_f16 (__a);
+}
+
  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
  vmov_n_f32 (float32_t __a)
  {
@@ -18366,6 +18473,12 @@ vmov_n_u64 (uint64_t __a)
    return (uint64x1_t) {__a};
  }
  
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vmovq_n_f16 (float16_t __a)
+{
+  return vdupq_n_f16 (__a);
+}
+
  __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
  vmovq_n_f32 (float32_t __a)
  {
@@ -21123,6 +21236,12 @@ vrev32q_u16 (uint16x8_t a)
    return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
  }
  
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vrev64_f16 (float16x4_t __a)
+{
+  return __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 });
+}
+
  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
  vrev64_f32 (float32x2_t a)
  {
@@ -21177,6 +21296,12 @@ vrev64_u32 (uint32x2_t a)
    return __builtin_shuffle (a, (uint32x2_t) { 1, 0 });
  }
  
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vrev64q_f16 (float16x8_t __a)
+{
+  return __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
+}
+
  __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
  vrev64q_f32 (float32x4_t a)
  {
@@ -24129,6 +24254,16 @@ vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx)
  
  /* vtrn */
  
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vtrn1_f16 (float16x4_t __a, float16x4_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 1, 7, 3});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 2, 6});
+#endif
+}
+
  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
  vtrn1_f32 (float32x2_t __a, float32x2_t __b)
  {
@@ -24219,6 +24354,16 @@ vtrn1_u32 (uint32x2_t __a, uint32x2_t __b)
  #endif
  }
  
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vtrn1q_f16 (float16x8_t __a, float16x8_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 1, 11, 3, 13, 5, 15, 7});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 2, 10, 4, 12, 6, 14});
+#endif
+}
+
  __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
  vtrn1q_f32 (float32x4_t __a, float32x4_t __b)
  {
@@ -24345,6 +24490,16 @@ vtrn1q_u64 (uint64x2_t __a, uint64x2_t __b)
  #endif
  }
  
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vtrn2_f16 (float16x4_t __a, float16x4_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 6, 2});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 5, 3, 7});
+#endif
+}
+
  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
  vtrn2_f32 (float32x2_t __a, float32x2_t __b)
  {
@@ -24435,6 +24590,16 @@ vtrn2_u32 (uint32x2_t __a, uint32x2_t __b)
  #endif
  }
  
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vtrn2q_f16 (float16x8_t __a, float16x8_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 10, 2, 12, 4, 14, 6});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 9, 3, 11, 5, 13, 7, 15});
+#endif
+}
+
  __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
  vtrn2q_f32 (float32x4_t __a, float32x4_t __b)
  {
@@ -24561,6 +24726,12 @@ vtrn2q_u64 (uint64x2_t __a, uint64x2_t __b)
  #endif
  }
  
+__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
+vtrn_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return (float16x4x2_t) {vtrn1_f16 (__a, __b), vtrn2_f16 (__a, __b)};
+}
+
  __extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
  vtrn_f32 (float32x2_t a, float32x2_t b)
  {
@@ -24615,6 +24786,12 @@ vtrn_u32 (uint32x2_t a, uint32x2_t b)
    return (uint32x2x2_t) {vtrn1_u32 (a, b), vtrn2_u32 (a, b)};
  }
  
+__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__))
+vtrnq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return (float16x8x2_t) {vtrn1q_f16 (__a, __b), vtrn2q_f16 (__a, __b)};
+}
+
  __extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
  vtrnq_f32 (float32x4_t a, float32x4_t b)
  {
@@ -24863,6 +25040,7 @@ vuqaddd_s64 (int64_t __a, uint64_t __b)
    }
  
  #define __INTERLEAVE_LIST(op)                                  \
+  __DEFINTERLEAVE (op, float16x4x2_t, float16x4_t, f16,)       \
    __DEFINTERLEAVE (op, float32x2x2_t, float32x2_t, f32,)       \
    __DEFINTERLEAVE (op, poly8x8x2_t, poly8x8_t, p8,)            \
    __DEFINTERLEAVE (op, poly16x4x2_t, poly16x4_t, p16,)         \
@@ -24872,6 +25050,7 @@ vuqaddd_s64 (int64_t __a, uint64_t __b)
    __DEFINTERLEAVE (op, uint8x8x2_t, uint8x8_t, u8,)            \
    __DEFINTERLEAVE (op, uint16x4x2_t, uint16x4_t, u16,)         \
    __DEFINTERLEAVE (op, uint32x2x2_t, uint32x2_t, u32,)         \
+  __DEFINTERLEAVE (op, float16x8x2_t, float16x8_t, f16, q)     \
    __DEFINTERLEAVE (op, float32x4x2_t, float32x4_t, f32, q)     \
    __DEFINTERLEAVE (op, poly8x16x2_t, poly8x16_t, p8, q)                \
    __DEFINTERLEAVE (op, poly16x8x2_t, poly16x8_t, p16, q)       \
@@ -24884,6 +25063,16 @@ vuqaddd_s64 (int64_t __a, uint64_t __b)
  
  /* vuzp */
  
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vuzp1_f16 (float16x4_t __a, float16x4_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6});
+#endif
+}
+
  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
  vuzp1_f32 (float32x2_t __a, float32x2_t __b)
  {
@@ -24974,6 +25163,16 @@ vuzp1_u32 (uint32x2_t __a, uint32x2_t __b)
  #endif
  }
  
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vuzp1q_f16 (float16x8_t __a, float16x8_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
+#endif
+}
+
  __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
  vuzp1q_f32 (float32x4_t __a, float32x4_t __b)
  {
@@ -25100,6 +25299,16 @@ vuzp1q_u64 (uint64x2_t __a, uint64x2_t __b)
  #endif
  }
  
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vuzp2_f16 (float16x4_t __a, float16x4_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7});
+#endif
+}
+
  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
  vuzp2_f32 (float32x2_t __a, float32x2_t __b)
  {
@@ -25190,6 +25399,16 @@ vuzp2_u32 (uint32x2_t __a, uint32x2_t __b)
  #endif
  }
  
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vuzp2q_f16 (float16x8_t __a, float16x8_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15});
+#endif
+}
+
  __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
  vuzp2q_f32 (float32x4_t __a, float32x4_t __b)
  {
@@ -25320,6 +25539,16 @@ __INTERLEAVE_LIST (uzp)
  
  /* vzip */
  
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vzip1_f16 (float16x4_t __a, float16x4_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5});
+#endif
+}
+
  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
  vzip1_f32 (float32x2_t __a, float32x2_t __b)
  {
@@ -25410,6 +25639,18 @@ vzip1_u32 (uint32x2_t __a, uint32x2_t __b)
  #endif
  }
  
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vzip1q_f16 (float16x8_t __a, float16x8_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b,
+                           (uint16x8_t) {12, 4, 13, 5, 14, 6, 15, 7});
+#else
+  return __builtin_shuffle (__a, __b,
+                           (uint16x8_t) {0, 8, 1, 9, 2, 10, 3, 11});
+#endif
+}
+
  __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
  vzip1q_f32 (float32x4_t __a, float32x4_t __b)
  {
@@ -25539,6 +25780,16 @@ vzip1q_u64 (uint64x2_t __a, uint64x2_t __b)
  #endif
  }
  
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vzip2_f16 (float16x4_t __a, float16x4_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7});
+#endif
+}
+
  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
  vzip2_f32 (float32x2_t __a, float32x2_t __b)
  {
@@ -25629,6 +25880,18 @@ vzip2_u32 (uint32x2_t __a, uint32x2_t __b)
  #endif
  }
  
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vzip2q_f16 (float16x8_t __a, float16x8_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b,
+                           (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3});
+#else
+  return __builtin_shuffle (__a, __b,
+                           (uint16x8_t) {4, 12, 5, 13, 6, 14, 7, 15});
+#endif
+}
+
  __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
  vzip2q_f32 (float32x4_t __a, float32x4_t __b)
  {
@@ -25768,6 +26031,7 @@ __INTERLEAVE_LIST (zip)
  #undef __aarch64_vget_lane_any
  
  #undef __aarch64_vdup_lane_any
+#undef __aarch64_vdup_lane_f16
  #undef __aarch64_vdup_lane_f32
  #undef __aarch64_vdup_lane_f64
  #undef __aarch64_vdup_lane_p8
@@ -25780,6 +26044,7 @@ __INTERLEAVE_LIST (zip)
  #undef __aarch64_vdup_lane_u16
  #undef __aarch64_vdup_lane_u32
  #undef __aarch64_vdup_lane_u64
+#undef __aarch64_vdup_laneq_f16
  #undef __aarch64_vdup_laneq_f32
  #undef __aarch64_vdup_laneq_f64
  #undef __aarch64_vdup_laneq_p8
@@ -25792,6 +26057,7 @@ __INTERLEAVE_LIST (zip)
  #undef __aarch64_vdup_laneq_u16
  #undef __aarch64_vdup_laneq_u32
  #undef __aarch64_vdup_laneq_u64
+#undef __aarch64_vdupq_lane_f16
  #undef __aarch64_vdupq_lane_f32
  #undef __aarch64_vdupq_lane_f64
  #undef __aarch64_vdupq_lane_p8
@@ -25804,6 +26070,7 @@ __INTERLEAVE_LIST (zip)
  #undef __aarch64_vdupq_lane_u16
  #undef __aarch64_vdupq_lane_u32
  #undef __aarch64_vdupq_lane_u64
+#undef __aarch64_vdupq_laneq_f16
  #undef __aarch64_vdupq_laneq_f32
  #undef __aarch64_vdupq_laneq_f64
  #undef __aarch64_vdupq_laneq_p8
author	Jiong Wang <jiong.wang@arm.com>
	Mon, 25 Jul 2016 14:02:42 +0000 (14:02 +0000)
committer	Jiong Wang <jiwang@gcc.gnu.org>
	Mon, 25 Jul 2016 14:02:42 +0000 (14:02 +0000)
gcc/ChangeLog		patch \| blob \| history
gcc/config/aarch64/aarch64-simd.md		patch \| blob \| history
gcc/config/aarch64/aarch64.c		patch \| blob \| history
gcc/config/aarch64/arm_neon.h		patch \| blob \| history