aarch64: Reimplement vmovn_high_* intrinsics using builtins
authorKyrylo Tkachov <kyrylo.tkachov@arm.com>
Tue, 12 Jan 2021 10:07:19 +0000 (10:07 +0000)
committerKyrylo Tkachov <kyrylo.tkachov@arm.com>
Thu, 14 Jan 2021 08:36:19 +0000 (08:36 +0000)
The vmovn_high* intrinsics are supposed to map to XTN2 instructions that
narrow their source vector and instert it into the top half of the destination vector.
This patch reimplements them away from inline assembly to an RTL builtin
that performs a vec_concat with a truncate.

gcc/
* config/aarch64/aarch64-simd.md (aarch64_xtn2<mode>_le):
Define.
(aarch64_xtn2<mode>_be): Likewise.
(aarch64_xtn2<mode>): Likewise.
* config/aarch64/aarch64-simd-builtins.def (xtn2): Define
builtins.
* config/aarch64/arm_neon.h (vmovn_high_s16): Reimplement using
builtins.
(vmovn_high_s32): Likewise.
(vmovn_high_s64): Likewise.
(vmovn_high_u16): Likewise.
(vmovn_high_u32): Likewise.
(vmovn_high_u64): Likewise.

gcc/testsuite/
* gcc.target/aarch64/narrow_high-intrinsics.c: Adjust
scan-assembler-times for xtn2.

gcc/config/aarch64/aarch64-simd-builtins.def
gcc/config/aarch64/aarch64-simd.md
gcc/config/aarch64/arm_neon.h
gcc/testsuite/gcc.target/aarch64/narrow_high-intrinsics.c

index 0d611878c9dac5d352d86f9c30b3023ae440f122..6efc7706a41e02d947753a4cda984159b68bd39f 100644 (file)
   BUILTIN_VSDQ_I (SHIFTIMM, sqshl_n, 0, ALL)
   BUILTIN_VSDQ_I (USHIFTIMM, uqshl_n, 0, ALL)
 
+  /* Implemented by aarch64_xtn2<mode>.  */
+  BUILTIN_VQN (UNOP, xtn2, 0, NONE)
+
   /* Implemented by aarch64_reduc_plus_<mode>.  */
   BUILTIN_VALL (UNOP, reduc_plus_scal_, 10, NONE)
 
index b6629bfa93e7868442fa422cc052a8909c2a2109..65209686b7e17afca72aa2477aa26ea2472aef1f 100644 (file)
   [(set_attr "type" "neon_shift_imm_narrow_q")]
 )
 
+(define_insn "aarch64_xtn2<mode>_le"
+  [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
+       (vec_concat:<VNARROWQ2>
+         (match_operand:<VNARROWQ> 1 "register_operand" "0")
+         (truncate:<VNARROWQ> (match_operand:VQN 2 "register_operand" "w"))))]
+  "TARGET_SIMD && !BYTES_BIG_ENDIAN"
+  "xtn2\t%0.<V2ntype>, %2.<Vtype>"
+  [(set_attr "type" "neon_shift_imm_narrow_q")]
+)
+
+(define_insn "aarch64_xtn2<mode>_be"
+  [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
+       (vec_concat:<VNARROWQ2>
+         (truncate:<VNARROWQ> (match_operand:VQN 2 "register_operand" "w"))
+         (match_operand:<VNARROWQ> 1 "register_operand" "0")))]
+  "TARGET_SIMD && BYTES_BIG_ENDIAN"
+  "xtn2\t%0.<V2ntype>, %2.<Vtype>"
+  [(set_attr "type" "neon_shift_imm_narrow_q")]
+)
+
+(define_expand "aarch64_xtn2<mode>"
+  [(match_operand:<VNARROWQ2> 0 "register_operand")
+   (match_operand:<VNARROWQ> 1 "register_operand")
+   (truncate:<VNARROWQ> (match_operand:VQN 2 "register_operand"))]
+  "TARGET_SIMD"
+  {
+    if (BYTES_BIG_ENDIAN)
+      emit_insn (gen_aarch64_xtn2<mode>_be (operands[0], operands[1],
+                                            operands[2]));
+    else
+      emit_insn (gen_aarch64_xtn2<mode>_le (operands[0], operands[1],
+                                            operands[2]));
+    DONE;
+  }
+)
+
 (define_insn "aarch64_bfdot<mode>"
   [(set (match_operand:VDQSF 0 "register_operand" "=w")
        (plus:VDQSF
index d3a81dac4757e7757dfa5c1a2a4189d834a1457b..b2a6b58f8c92b896ade449f8f7978fe79c5a114f 100644 (file)
@@ -8751,72 +8751,45 @@ __extension__ extern __inline int8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmovn_high_s16 (int8x8_t __a, int16x8_t __b)
 {
-  int8x16_t __result = vcombine_s8 (__a, vcreate_s8 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("xtn2 %0.16b,%1.8h"
-           : "+w"(__result)
-           : "w"(__b)
-           : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_xtn2v8hi (__a, __b);
 }
 
 __extension__ extern __inline int16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmovn_high_s32 (int16x4_t __a, int32x4_t __b)
 {
-  int16x8_t __result = vcombine_s16 (__a, vcreate_s16 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("xtn2 %0.8h,%1.4s"
-           : "+w"(__result)
-           : "w"(__b)
-           : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_xtn2v4si (__a, __b);
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmovn_high_s64 (int32x2_t __a, int64x2_t __b)
 {
-  int32x4_t __result = vcombine_s32 (__a, vcreate_s32 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("xtn2 %0.4s,%1.2d"
-           : "+w"(__result)
-           : "w"(__b)
-           : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_xtn2v2di (__a, __b);
 }
 
 __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmovn_high_u16 (uint8x8_t __a, uint16x8_t __b)
 {
-  uint8x16_t __result = vcombine_u8 (__a, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("xtn2 %0.16b,%1.8h"
-           : "+w"(__result)
-           : "w"(__b)
-           : /* No clobbers */);
-  return __result;
+  return (uint8x16_t)
+          __builtin_aarch64_xtn2v8hi ((int8x8_t) __a, (int16x8_t) __b);
 }
 
 __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmovn_high_u32 (uint16x4_t __a, uint32x4_t __b)
 {
-  uint16x8_t __result = vcombine_u16 (__a, vcreate_u16 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("xtn2 %0.8h,%1.4s"
-           : "+w"(__result)
-           : "w"(__b)
-           : /* No clobbers */);
-  return __result;
+  return (uint16x8_t)
+          __builtin_aarch64_xtn2v4si ((int16x4_t) __a, (int32x4_t) __b);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vmovn_high_u64 (uint32x2_t __a, uint64x2_t __b)
 {
-  uint32x4_t __result = vcombine_u32 (__a, vcreate_u32 (__AARCH64_UINT64_C (0x0)));
-  __asm__ ("xtn2 %0.4s,%1.2d"
-           : "+w"(__result)
-           : "w"(__b)
-           : /* No clobbers */);
-  return __result;
+  return (uint32x4_t)
+          __builtin_aarch64_xtn2v2di ((int32x2_t) __a, (int64x2_t) __b);
 }
 
 __extension__ extern __inline int8x8_t
index 07d780300586299eea675c5ef09527d0f8066794..a2e0cb9b1008f620922d64f06acc3b66795514b0 100644 (file)
@@ -122,4 +122,4 @@ ONE (vmovn_high, uint32x4_t, uint32x2_t, uint64x2_t, u64)
 /* { dg-final { scan-assembler-times "uqxtn2 v" 3} }  */
 /* { dg-final { scan-assembler-times "sqxtn2 v" 3} }  */
 /* { dg-final { scan-assembler-times "sqxtun2 v" 3} }  */
-/* { dg-final { scan-assembler-times "\\txtn2 v" 6} }  */
+/* { dg-final { scan-assembler-times "\\txtn2\\tv" 6} }  */