From f7d6961126a7f06c8089d8a58bd21be43bc16806 Mon Sep 17 00:00:00 2001 From: Dennis Zhang Date: Tue, 3 Nov 2020 13:00:51 +0000 Subject: [PATCH] aarch64: ACLE intrinsics convert BF16 to Float32 This patch enables intrinsics to convert BFloat16 scalar and vector operands to Float32 modes. The intrinsics are implemented by shifting each BFloat16 item 16 bits to left using shl/shll/shll2 instructions. gcc/ChangeLog: 2020-11-03 Dennis Zhang * config/aarch64/aarch64-simd-builtins.def(vbfcvt): New entry. (vbfcvt_high, bfcvt): Likewise. * config/aarch64/aarch64-simd.md(aarch64_vbfcvt): New entry. (aarch64_vbfcvt_highv8bf, aarch64_bfcvtsf): Likewise. * config/aarch64/arm_bf16.h (vcvtah_f32_bf16): New intrinsic. * config/aarch64/arm_neon.h (vcvt_f32_bf16): Likewise. (vcvtq_low_f32_bf16, vcvtq_high_f32_bf16): Likewise. gcc/testsuite/ChangeLog * gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c (test_vcvt_f32_bf16, test_vcvtq_low_f32_bf16): New tests. (test_vcvtq_high_f32_bf16, test_vcvth_f32_bf16): Likewise. --- gcc/ChangeLog | 10 +++++ gcc/config/aarch64/aarch64-simd-builtins.def | 5 +++ gcc/config/aarch64/aarch64-simd.md | 28 +++++++++++++ gcc/config/aarch64/arm_bf16.h | 7 ++++ gcc/config/aarch64/arm_neon.h | 21 ++++++++++ gcc/testsuite/ChangeLog | 6 +++ .../advsimd-intrinsics/bfcvt-compile.c | 40 +++++++++++++++++++ 7 files changed, 117 insertions(+) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 9f743ecc89a..2ff5c4e76d1 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,13 @@ +2020-11-03 Dennis Zhang + + * config/aarch64/aarch64-simd-builtins.def(vbfcvt): New entry. + (vbfcvt_high, bfcvt): Likewise. + * config/aarch64/aarch64-simd.md(aarch64_vbfcvt): New entry. + (aarch64_vbfcvt_highv8bf, aarch64_bfcvtsf): Likewise. + * config/aarch64/arm_bf16.h (vcvtah_f32_bf16): New intrinsic. + * config/aarch64/arm_neon.h (vcvt_f32_bf16): Likewise. + (vcvtq_low_f32_bf16, vcvtq_high_f32_bf16): Likewise. + 2020-11-02 Alan Modra PR middle-end/97267 diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index eb8e6f7b3d8..f494b535a30 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -732,3 +732,8 @@ VAR1 (UNOP, bfcvtn_q, 0, FP, v8bf) VAR1 (BINOP, bfcvtn2, 0, FP, v8bf) VAR1 (UNOP, bfcvt, 0, FP, bf) + + /* Implemented by aarch64_{v}bfcvt{_high}. */ + VAR2 (UNOP, vbfcvt, 0, AUTO_FP, v4bf, v8bf) + VAR1 (UNOP, vbfcvt_high, 0, AUTO_FP, v8bf) + VAR1 (UNOP, bfcvt, 0, AUTO_FP, sf) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 381a702eba0..030a086d31c 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -7238,3 +7238,31 @@ "bfcvt\\t%h0, %s1" [(set_attr "type" "f_cvt")] ) + +;; Use shl/shll/shll2 to convert BF scalar/vector modes to SF modes. +(define_insn "aarch64_vbfcvt" + [(set (match_operand:V4SF 0 "register_operand" "=w") + (unspec:V4SF [(match_operand:VBF 1 "register_operand" "w")] + UNSPEC_BFCVTN))] + "TARGET_BF16_SIMD" + "shll\\t%0.4s, %1.4h, #16" + [(set_attr "type" "neon_shift_imm_long")] +) + +(define_insn "aarch64_vbfcvt_highv8bf" + [(set (match_operand:V4SF 0 "register_operand" "=w") + (unspec:V4SF [(match_operand:V8BF 1 "register_operand" "w")] + UNSPEC_BFCVTN2))] + "TARGET_BF16_SIMD" + "shll2\\t%0.4s, %1.8h, #16" + [(set_attr "type" "neon_shift_imm_long")] +) + +(define_insn "aarch64_bfcvtsf" + [(set (match_operand:SF 0 "register_operand" "=w") + (unspec:SF [(match_operand:BF 1 "register_operand" "w")] + UNSPEC_BFCVT))] + "TARGET_BF16_FP" + "shl\\t%d0, %d1, #16" + [(set_attr "type" "neon_shift_imm")] +) diff --git a/gcc/config/aarch64/arm_bf16.h b/gcc/config/aarch64/arm_bf16.h index 984875dcc01..881615498d3 100644 --- a/gcc/config/aarch64/arm_bf16.h +++ b/gcc/config/aarch64/arm_bf16.h @@ -40,6 +40,13 @@ vcvth_bf16_f32 (float32_t __a) return __builtin_aarch64_bfcvtbf (__a); } +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcvtah_f32_bf16 (bfloat16_t __a) +{ + return __builtin_aarch64_bfcvtsf (__a); +} + #pragma GCC pop_options #endif diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 95bfa5ebba2..69cccd32786 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -35680,6 +35680,27 @@ vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b, return __builtin_aarch64_bfmlalt_lane_qv4sf (__r, __a, __b, __index); } +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcvt_f32_bf16 (bfloat16x4_t __a) +{ + return __builtin_aarch64_vbfcvtv4bf (__a); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcvtq_low_f32_bf16 (bfloat16x8_t __a) +{ + return __builtin_aarch64_vbfcvtv8bf (__a); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcvtq_high_f32_bf16 (bfloat16x8_t __a) +{ + return __builtin_aarch64_vbfcvt_highv8bf (__a); +} + __extension__ extern __inline bfloat16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcvt_bf16_f32 (float32x4_t __a) diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 94bddaaee09..a7bbb1bd181 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,9 @@ +2020-11-03 Dennis Zhang + + * gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c + (test_vcvt_f32_bf16, test_vcvtq_low_f32_bf16): New tests. + (test_vcvtq_high_f32_bf16, test_vcvth_f32_bf16): Likewise. + 2020-11-02 Alan Modra PR middle-end/97267 diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c index bbea630b182..47af7c494d9 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c @@ -46,3 +46,43 @@ bfloat16_t test_bfcvt (float32_t a) { return vcvth_bf16_f32 (a); } + +/* +**test_vcvt_f32_bf16: +** shll v0.4s, v0.4h, #16 +** ret +*/ +float32x4_t test_vcvt_f32_bf16 (bfloat16x4_t a) +{ + return vcvt_f32_bf16 (a); +} + +/* +**test_vcvtq_low_f32_bf16: +** shll v0.4s, v0.4h, #16 +** ret +*/ +float32x4_t test_vcvtq_low_f32_bf16 (bfloat16x8_t a) +{ + return vcvtq_low_f32_bf16 (a); +} + +/* +**test_vcvtq_high_f32_bf16: +** shll2 v0.4s, v0.8h, #16 +** ret +*/ +float32x4_t test_vcvtq_high_f32_bf16 (bfloat16x8_t a) +{ + return vcvtq_high_f32_bf16 (a); +} + +/* +**test_vcvtah_f32_bf16: +** shl d0, d0, #16 +** ret +*/ +float32_t test_vcvtah_f32_bf16 (bfloat16_t a) +{ + return vcvtah_f32_bf16 (a); +} -- 2.30.2