From: Dennis Zhang
Date: Tue, 3 Mar 2020 13:13:31 +0000 (+0000)
Subject: arm: ACLE BFloat16 convert intrinsics
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=8e6d0dba166324f4b257329bd4b4ddc2b4522359;p=gcc.git

arm: ACLE BFloat16 convert intrinsics

This patch is part of a series adding support for Armv8.6-A features.
It implements intrinsics to convert between bfloat16 and float32 formats.

gcc/ChangeLog:

	* config/arm/arm_bf16.h (vcvtah_f32_bf16, vcvth_bf16_f32): New.
	* config/arm/arm_neon.h (vcvt_f32_bf16, vcvtq_low_f32_bf16): New.
	(vcvtq_high_f32_bf16, vcvt_bf16_f32): New.
	(vcvtq_low_bf16_f32, vcvtq_high_bf16_f32): New.
	* config/arm/arm_neon_builtins.def (vbfcvt, vbfcvt_high): New entries.
	(vbfcvtv4sf, vbfcvtv4sf_high): Likewise.
	* config/arm/iterators.md (VBFCVT, VBFCVTM): New mode iterators.
	(V_bf_low, V_bf_cvt_m): New mode attributes.
	* config/arm/neon.md (neon_vbfcvtv4sf): New.
	(neon_vbfcvtv4sf_highv8bf, neon_vbfcvtsf): New.
	(neon_vbfcvt, neon_vbfcvt_highv8bf): New.
	(neon_vbfcvtbf_cvtmode, neon_vbfcvtbf): New.
	* config/arm/unspecs.md (UNSPEC_BFCVT, UNSPEC_BFCVT_HIGH): New.

gcc/testsuite/ChangeLog:

	* gcc.target/arm/simd/bf16_cvt_1.c: New test.
---

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 2023cd3c871..862dffee119 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,19 @@
+2020-03-03  Dennis Zhang
+
+	* config/arm/arm_bf16.h (vcvtah_f32_bf16, vcvth_bf16_f32): New.
+	* config/arm/arm_neon.h (vcvt_f32_bf16, vcvtq_low_f32_bf16): New.
+	(vcvtq_high_f32_bf16, vcvt_bf16_f32): New.
+	(vcvtq_low_bf16_f32, vcvtq_high_bf16_f32): New.
+	* config/arm/arm_neon_builtins.def (vbfcvt, vbfcvt_high): New entries.
+	(vbfcvtv4sf, vbfcvtv4sf_high): Likewise.
+	* config/arm/iterators.md (VBFCVT, VBFCVTM): New mode iterators.
+	(V_bf_low, V_bf_cvt_m): New mode attributes.
+	* config/arm/neon.md (neon_vbfcvtv4sf): New.
+	(neon_vbfcvtv4sf_highv8bf, neon_vbfcvtsf): New.
+	(neon_vbfcvt, neon_vbfcvt_highv8bf): New.
+	(neon_vbfcvtbf_cvtmode, neon_vbfcvtbf): New.
+	* config/arm/unspecs.md (UNSPEC_BFCVT, UNSPEC_BFCVT_HIGH): New.
+
 2020-03-03  Jakub Jelinek
 
 	PR tree-optimization/93582
diff --git a/gcc/config/arm/arm_bf16.h b/gcc/config/arm/arm_bf16.h
index decf23f3834..1aa593192c0 100644
--- a/gcc/config/arm/arm_bf16.h
+++ b/gcc/config/arm/arm_bf16.h
@@ -34,6 +34,20 @@ extern "C" {
 typedef __bf16 bfloat16_t;
 typedef float float32_t;
 
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtah_f32_bf16 (bfloat16_t __a)
+{
+  return __builtin_neon_vbfcvtbf (__a);
+}
+
+__extension__ extern __inline bfloat16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvth_bf16_f32 (float32_t __a)
+{
+  return __builtin_neon_vbfcvtsf (__a);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 81c407f5152..a66961d0c51 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -19379,6 +19379,55 @@ vbfdotq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
 
 #pragma GCC pop_options
 
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16")
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_f32_bf16 (bfloat16x4_t __a)
+{
+  return __builtin_neon_vbfcvtv4bf (__a);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_low_f32_bf16 (bfloat16x8_t __a)
+{
+  return __builtin_neon_vbfcvtv8bf (__a);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_high_f32_bf16 (bfloat16x8_t __a)
+{
+  return __builtin_neon_vbfcvt_highv8bf (__a);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_bf16_f32 (float32x4_t __a)
+{
+  return __builtin_neon_vbfcvtv4sfv4bf (__a);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_low_bf16_f32 (float32x4_t __a)
+{
+  return __builtin_neon_vbfcvtv4sfv8bf (__a);
+}
+
+/* The 'inactive' operand is not converted but it provides the
+   low 64 bits to assemble the final 128-bit result. */
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_high_bf16_f32 (bfloat16x8_t inactive, float32x4_t __a)
+{
+  return __builtin_neon_vbfcvtv4sf_highv8bf (inactive, __a);
+}
+
+#pragma GCC pop_options
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def
index 4b4d1c808d8..48c06c43a17 100644
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -385,3 +385,9 @@ VAR1 (USTERNOP, usmmla, v16qi)
 VAR2 (TERNOP, vbfdot, v2sf, v4sf)
 VAR2 (MAC_LANE_PAIR, vbfdot_lanev4bf, v2sf, v4sf)
 VAR2 (MAC_LANE_PAIR, vbfdot_lanev8bf, v2sf, v4sf)
+
+VAR2 (UNOP, vbfcvt, sf, bf)
+VAR2 (UNOP, vbfcvt, v4bf, v8bf)
+VAR1 (UNOP, vbfcvt_high, v8bf)
+VAR2 (UNOP, vbfcvtv4sf, v4bf, v8bf)
+VAR1 (BINOP, vbfcvtv4sf_high, v8bf)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index ab30c371583..5f4e3d12358 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -229,6 +229,10 @@
 ;; Modes for polynomial or float values.
 (define_mode_iterator VPF [V8QI V16QI V2SF V4SF])
 
+;; Modes for BF16 convert instructions.
+(define_mode_iterator VBFCVT [V4BF V8BF])
+(define_mode_iterator VBFCVTM [V2SI SF])
+
 ;;----------------------------------------------------------------------------
 ;; Code iterators
 ;;----------------------------------------------------------------------------
@@ -747,6 +751,12 @@
 			(V2SF "") (V4SF "")
 			(DI "_neon") (V2DI "")])
 
+;; To select the low 64 bits of a vector.
+(define_mode_attr V_bf_low [(V4BF "P") (V8BF "e")])
+
+;; To generate intermediate modes for BF16 scalar convert.
+(define_mode_attr V_bf_cvt_m [(V2SI "BF") (SF "V2SI")])
+
 ;; Scalars to be presented to scalar multiplication instructions
 ;; must satisfy the following constraints.
 
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index fae82131e24..f5286d9c4b1 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -6660,3 +6660,80 @@ if (BYTES_BIG_ENDIAN)
 }
   [(set_attr "type" "neon_dot")]
 )
+
+(define_insn "neon_vbfcvtv4sf<mode>"
+  [(set (match_operand:VBFCVT 0 "register_operand" "=w")
+	(unspec:VBFCVT [(match_operand:V4SF 1 "register_operand" "w")]
+		       UNSPEC_BFCVT))]
+  "TARGET_BF16_SIMD"
+  "vcvt.bf16.f32\\t%<V_bf_low>0, %q1"
+  [(set_attr "type" "neon_fp_cvt_narrow_s_q")]
+)
+
+(define_insn "neon_vbfcvtv4sf_highv8bf"
+  [(set (match_operand:V8BF 0 "register_operand" "=w")
+	(unspec:V8BF [(match_operand:V8BF 1 "register_operand" "0")
+		      (match_operand:V4SF 2 "register_operand" "w")]
+		     UNSPEC_BFCVT_HIGH))]
+  "TARGET_BF16_SIMD"
+  "vcvt.bf16.f32\\t%f0, %q2"
+  [(set_attr "type" "neon_fp_cvt_narrow_s_q")]
+)
+
+(define_insn "neon_vbfcvtsf"
+  [(set (match_operand:BF 0 "register_operand" "=t")
+	(unspec:BF [(match_operand:SF 1 "register_operand" "t")]
+		   UNSPEC_BFCVT))]
+  "TARGET_BF16_FP"
+  "vcvtb.bf16.f32\\t%0, %1"
+  [(set_attr "type" "f_cvt")]
+)
+
+(define_insn "neon_vbfcvt<mode>"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+	(unspec:V4SF [(match_operand:VBFCVT 1 "register_operand" "w")]
+		     UNSPEC_BFCVT))]
+  "TARGET_BF16_SIMD"
+  "vshll.u32\\t%q0, %<V_bf_low>1, #16"
+  [(set_attr "type" "neon_shift_imm_q")]
+)
+
+(define_insn "neon_vbfcvt_highv8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+	(unspec:V4SF [(match_operand:V8BF 1 "register_operand" "w")]
+		     UNSPEC_BFCVT_HIGH))]
+  "TARGET_BF16_SIMD"
+  "vshll.u32\\t%q0, %f1, #16"
+  [(set_attr "type" "neon_shift_imm_q")]
+)
+
+;; Convert a BF scalar operand to SF via VSHL.
+;; VSHL does not accept the 32-bit registers that the BF and SF scalar
+;; operands would be allocated to, so the operands are first moved into
+;; intermediate vectors (i.e. V2SI), which can live in 64-bit registers.
+(define_expand "neon_vbfcvtbf"
+  [(match_operand:SF 0 "register_operand")
+   (unspec:SF [(match_operand:BF 1 "register_operand")] UNSPEC_BFCVT)]
+  "TARGET_BF16_FP"
+{
+  rtx op0 = gen_reg_rtx (V2SImode);
+  rtx op1 = gen_reg_rtx (V2SImode);
+  emit_insn (gen_neon_vbfcvtbf_cvtmodev2si (op1, operands[1]));
+  emit_insn (gen_neon_vshl_nv2si (op0, op1, gen_int_mode (16, SImode)));
+  emit_insn (gen_neon_vbfcvtbf_cvtmodesf (operands[0], op0));
+  DONE;
+})
+
+;; Convert BF mode to V2SI and V2SI to SF.
+;; Implement this by allocating a 32-bit operand in the low half of a 64-bit
+;; register indexed by a 32-bit sub-register number.
+;; This will generate reloads, but the compiler can optimize out the moves.
+;; Use the 'x' constraint to keep the 32-bit sub-registers in an indexable
+;; range and so avoid extra moves.
+(define_insn "neon_vbfcvtbf_cvtmode" + [(set (match_operand:VBFCVTM 0 "register_operand" "=x") + (unspec:VBFCVTM [(match_operand: 1 "register_operand" "0")] + UNSPEC_BFCVT))] + "TARGET_BF16_FP" + "" +) diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md index c8f3f95c7f3..b36ae512a6e 100644 --- a/gcc/config/arm/unspecs.md +++ b/gcc/config/arm/unspecs.md @@ -506,4 +506,6 @@ UNSPEC_MATMUL_S UNSPEC_MATMUL_U UNSPEC_MATMUL_US + UNSPEC_BFCVT + UNSPEC_BFCVT_HIGH ]) diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 7c4c852ad5d..5b174bb03f2 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,7 @@ +2020-03-03 Dennis Zhang + + * gcc.target/arm/simd/bf16_cvt_1.c: New test. + 2020-03-03 Jakub Jelinek PR tree-optimization/93582 diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_cvt_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_cvt_1.c new file mode 100644 index 00000000000..04d4eee8e47 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_cvt_1.c @@ -0,0 +1,51 @@ +/* { dg-do assemble } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-options "-save-temps -O2" } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +#include "arm_neon.h" + +float32_t test_vcvtah_f32_bf16 (bfloat16_t a) +{ + return vcvtah_f32_bf16 (a); +} + +bfloat16_t test_vcvth_bf16_f32 (float32_t a) +{ + return vcvth_bf16_f32 (a); +} + +float32x4_t test_vcvt_f32_bf16 (bfloat16x4_t a) +{ + return vcvt_f32_bf16 (a); +} + +float32x4_t test_vcvtq_low_f32_bf16 (bfloat16x8_t a) +{ + return vcvtq_low_f32_bf16 (a); +} + +float32x4_t test_vcvtq_high_f32_bf16 (bfloat16x8_t a) +{ + return vcvtq_high_f32_bf16 (a); +} + +bfloat16x4_t test_vcvt_bf16_f32 (float32x4_t a) +{ + return vcvt_bf16_f32 (a); +} + +bfloat16x8_t test_vcvtq_low_bf16_f32 (float32x4_t a) +{ + return vcvtq_low_bf16_f32 (a); +} + +bfloat16x8_t test_vcvtq_high_bf16_f32 (bfloat16x8_t inactive, float32x4_t a) +{ + return vcvtq_high_bf16_f32 (inactive, a); +} + +/* { dg-final { scan-assembler-times {vcvtb.bf16.f32\ts[0-9]+, s[0-9]+\n} 1 } } */ +/* { dg-final { scan-assembler-times {vcvt.bf16.f32\td[0-9]+, q[0-9]+\n} 3 } } */ +/* { dg-final { scan-assembler-times {vshl.i32\td[0-9]+, d[0-9]+, #16} 1 } } */ +/* { dg-final { scan-assembler-times {vshll.u32\tq[0-9]+, d[0-9]+, #16} 3 } } */