From eb7ba6c36b8a17c79936abe26245e4bc66bb8859 Mon Sep 17 00:00:00 2001 From: Dennis Zhang Date: Tue, 25 Feb 2020 17:38:00 +0000 Subject: [PATCH] arm: ACLE intrinsics for bfloat16 dot product This patch is part of a series adding support for Armv8.6-A features. It adds intrinsics for brain half-precision float-point (BF16) dot instructions with AdvSIMD support. gcc/ChangeLog: 2020-02-25 Dennis Zhang * config/arm/arm_neon.h (vbfdot_f32, vbfdotq_f32): New (vbfdot_lane_f32, vbfdotq_laneq_f32): New. (vbfdot_laneq_f32, vbfdotq_lane_f32): New. * config/arm/arm_neon_builtins.def (vbfdot): New entry. (vbfdot_lanev4bf, vbfdot_lanev8bf): Likewise. * config/arm/iterators.md (VSF2BF): New attribute. * config/arm/neon.md (neon_vbfdot): New entry. (neon_vbfdot_lanev4bf): Likewise. (neon_vbfdot_lanev8bf): Likewise. gcc/testsuite/ChangeLog: 2020-02-25 Dennis Zhang * gcc.target/arm/simd/bf16_dot_1.c: New test. * gcc.target/arm/simd/bf16_dot_2.c: New test. * gcc.target/arm/simd/bf16_dot_3.c: New test. --- gcc/ChangeLog | 12 +++ gcc/config/arm/arm_neon.h | 52 +++++++++ gcc/config/arm/arm_neon_builtins.def | 4 + gcc/config/arm/iterators.md | 2 + gcc/config/arm/neon.md | 48 +++++++++ gcc/testsuite/ChangeLog | 6 ++ .../gcc.target/arm/simd/bf16_dot_1.c | 100 ++++++++++++++++++ .../gcc.target/arm/simd/bf16_dot_2.c | 33 ++++++ .../gcc.target/arm/simd/bf16_dot_3.c | 33 ++++++ 9 files changed, 290 insertions(+) create mode 100644 gcc/testsuite/gcc.target/arm/simd/bf16_dot_1.c create mode 100644 gcc/testsuite/gcc.target/arm/simd/bf16_dot_2.c create mode 100644 gcc/testsuite/gcc.target/arm/simd/bf16_dot_3.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 14a4b1a06e8..bbb4a656553 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,15 @@ +2020-02-25 Dennis Zhang + + * config/arm/arm_neon.h (vbfdot_f32, vbfdotq_f32): New + (vbfdot_lane_f32, vbfdotq_laneq_f32): New. + (vbfdot_laneq_f32, vbfdotq_lane_f32): New. + * config/arm/arm_neon_builtins.def (vbfdot): New entry. + (vbfdot_lanev4bf, vbfdot_lanev8bf): Likewise. + * config/arm/iterators.md (VSF2BF): New attribute. + * config/arm/neon.md (neon_vbfdot): New entry. + (neon_vbfdot_lanev4bf): Likewise. + (neon_vbfdot_lanev8bf): Likewise. + 2020-02-25 Christophe Lyon * config/arm/arm.md (required_for_purecode): New attribute. diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index e81681aa415..d2ebee40538 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -18819,6 +18819,58 @@ vusmmlaq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b) #pragma GCC pop_options +/* AdvSIMD Brain half-precision float-point (Bfloat16) intrinsics. */ +#pragma GCC push_options +#pragma GCC target ("arch=armv8.2-a+bf16") + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b) +{ + return __builtin_neon_vbfdotv2sf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdotq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) +{ + return __builtin_neon_vbfdotv4sf (__r, __a, __b); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdot_lane_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b, + const int __index) +{ + return __builtin_neon_vbfdot_lanev4bfv2sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdotq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b, + const int __index) +{ + return __builtin_neon_vbfdot_lanev8bfv4sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdot_laneq_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x8_t __b, + const int __index) +{ + return __builtin_neon_vbfdot_lanev8bfv2sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdotq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b, + const int __index) +{ + return __builtin_neon_vbfdot_lanev4bfv4sf (__r, __a, __b, __index); +} + +#pragma GCC pop_options + #ifdef __cplusplus } #endif diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index f4a97fd764c..4a6f4cfc44e 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -381,3 +381,7 @@ VAR2 (MAC_LANE_PAIR, vcmlaq_lane270, v4sf, v8hf) VAR1 (TERNOP, smmla, v16qi) VAR1 (UTERNOP, ummla, v16qi) VAR1 (USTERNOP, usmmla, v16qi) + +VAR2 (TERNOP, vbfdot, v2sf, v4sf) +VAR2 (MAC_LANE_PAIR, vbfdot_lanev4bf, v2sf, v4sf) +VAR2 (MAC_LANE_PAIR, vbfdot_lanev8bf, v2sf, v4sf) diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 136c45274ae..b435a05d219 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -835,6 +835,8 @@ (define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")]) (define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")]) +(define_mode_attr VSF2BF [(V2SF "V4BF") (V4SF "V8BF")]) + ;;---------------------------------------------------------------------------- ;; Code attributes ;;---------------------------------------------------------------------------- diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 039cd90c3da..80e94de4b84 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -6596,3 +6596,51 @@ if (BYTES_BIG_ENDIAN) "vmmla.\t%q0, %q2, %q3" [(set_attr "type" "neon_mla_s_q")] ) + +(define_insn "neon_vbfdot" + [(set (match_operand:VCVTF 0 "register_operand" "=w") + (plus:VCVTF (match_operand:VCVTF 1 "register_operand" "0") + (unspec:VCVTF [ + (match_operand: 2 "register_operand" "w") + (match_operand: 3 "register_operand" "w")] + UNSPEC_DOT_S)))] + "TARGET_BF16_SIMD" + "vdot.bf16\\t%0, %2, %3" + [(set_attr "type" "neon_dot")] +) + +(define_insn "neon_vbfdot_lanev4bf" + [(set (match_operand:VCVTF 0 "register_operand" "=w") + (plus:VCVTF (match_operand:VCVTF 1 "register_operand" "0") + (unspec:VCVTF [ + (match_operand: 2 "register_operand" "w") + (match_operand:V4BF 3 "register_operand" "x") + (match_operand:SI 4 "immediate_operand" "i")] + UNSPEC_DOT_S)))] + "TARGET_BF16_SIMD" + "vdot.bf16\\t%0, %2, %P3[%c4]" + [(set_attr "type" "neon_dot")] +) + +(define_insn "neon_vbfdot_lanev8bf" + [(set (match_operand:VCVTF 0 "register_operand" "=w") + (plus:VCVTF (match_operand:VCVTF 1 "register_operand" "0") + (unspec:VCVTF [ + (match_operand: 2 "register_operand" "w") + (match_operand:V8BF 3 "register_operand" "x") + (match_operand:SI 4 "immediate_operand" "i")] + UNSPEC_DOT_S)))] + "TARGET_BF16_SIMD" + { + int lane = INTVAL (operands[4]); + int half = GET_MODE_NUNITS (GET_MODE (operands[3])) / 4; + if (lane < half) + return "vdot.bf16\\t%0, %2, %e3[%c4]"; + else + { + operands[4] = GEN_INT (lane - half); + return "vdot.bf16\\t%0, %2, %f3[%c4]"; + } + } + [(set_attr "type" "neon_dot")] +) diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 922ae5c4b7a..7b24b54fac7 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,9 @@ +2020-02-25 Dennis Zhang + + * gcc.target/arm/simd/bf16_dot_1.c: New test. + * gcc.target/arm/simd/bf16_dot_2.c: New test. + * gcc.target/arm/simd/bf16_dot_3.c: New test. + 2020-02-25 Jakub Jelinek PR rtl-optimization/93908 diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_dot_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_dot_1.c new file mode 100644 index 00000000000..4487152d6cb --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_dot_1.c @@ -0,0 +1,100 @@ +/* { dg-do assemble } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-options "-save-temps -O2" } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +#include "arm_neon.h" + +/* BF16 DOT without lane. */ +float32x2_t +test_vbfdot_f32 (float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) +{ + /* vdot.bf16 d, d, d */ + return vbfdot_f32 (r, a, b); +} + +float32x4_t +test_vbfdotq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) +{ + /* vdot.bf16 q, q, q */ + return vbfdotq_f32 (r, a, b); +} + +/* 64-bit BF16 DOT with lane. */ +float32x2_t +test_vbfdot_lane_f32_0 (float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) +{ + /* vdot.bf16 d, d, d[0] */ + return vbfdot_lane_f32 (r, a, b, 0); +} + +float32x2_t +test_vbfdot_lane_f32_1 (float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) +{ + /* vdot.bf16 d, d, d[1] */ + return vbfdot_lane_f32 (r, a, b, 1); +} + +float32x2_t +test_vbfdot_laneq_f32_0 (float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) +{ + /* vdot.bf16 d, d, d[0] */ + return vbfdot_laneq_f32 (r, a, b, 0); +} + +float32x2_t +test_vbfdot_laneq_f32_1 (float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) +{ + /* vdot.bf16 d, d, d[1] */ + return vbfdot_laneq_f32 (r, a, b, 1); +} + +float32x2_t +test_vbfdot_laneq_f32_2 (float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) +{ + /* vdot.bf16 d, d, d[0] */ + return vbfdot_laneq_f32 (r, a, b, 2); +} + +float32x2_t +test_vbfdot_laneq_f32_3 (float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) +{ + /* vdot.bf16 d, d, d[1] */ + return vbfdot_laneq_f32 (r, a, b, 3); +} + +/* 128-bit BF16 DOT with lane. */ +float32x4_t +test_vbfdotq_lane_f32_0 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) +{ + /* vdot.bf16 q, q, d[0] */ + return vbfdotq_lane_f32 (r, a, b, 0); +} + +float32x4_t +test_vbfdotq_lane_f32_1 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) +{ + /* vdot.bf16 q, q, d[1] */ + return vbfdotq_lane_f32 (r, a, b, 1); +} + +float32x4_t +test_vbfdotq_laneq_f32_0 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) +{ + /* vdot.bf16 q, q, d[0] */ + return vbfdotq_laneq_f32 (r, a, b, 0); +} + +float32x4_t +test_vbfdotq_laneq_f32_3 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) +{ + /* vdot.bf16 q, q, d[1] */ + return vbfdotq_laneq_f32 (r, a, b, 3); +} + +/* { dg-final { scan-assembler-times {\tvdot.bf16\td[0-9]+, d[0-9]+, d[0-9]+\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tvdot.bf16\tq[0-9]+, q[0-9]+, q[0-9]+\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tvdot.bf16\td[0-9]+, d[0-9]+, d[0-9]+\[0\]\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tvdot.bf16\td[0-9]+, d[0-9]+, d[0-9]+\[1\]\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tvdot.bf16\tq[0-9]+, q[0-9]+, d[0-9]+\[0\]\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tvdot.bf16\tq[0-9]+, q[0-9]+, d[0-9]+\[1\]\n} 2 } } */ diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_dot_2.c b/gcc/testsuite/gcc.target/arm/simd/bf16_dot_2.c new file mode 100644 index 00000000000..d2ef344c68b --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_dot_2.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +#include "arm_neon.h" + +float32x2_t +test_vbfdot_lane_f32_a (float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) +{ + /* { dg-error "lane -1 out of range 0 - 1" "" {target *-*-*} 0 } */ + return vbfdot_lane_f32 (r, a, b, -1); +} + +float32x2_t +test_vbfdot_lane_f32_b (float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) +{ + /* { dg-error "lane 2 out of range 0 - 1" "" {target *-*-*} 0 } */ + return vbfdot_lane_f32 (r, a, b, 2); +} + +float32x2_t +test_vbfdot_laneq_f32_a (float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) +{ + /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */ + return vbfdot_laneq_f32 (r, a, b, -1); +} + +float32x2_t +test_vbfdot_laneq_f32_b (float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) +{ + /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */ + return vbfdot_laneq_f32 (r, a, b, 4); +} diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_dot_3.c b/gcc/testsuite/gcc.target/arm/simd/bf16_dot_3.c new file mode 100644 index 00000000000..93f08f02bc7 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_dot_3.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +#include "arm_neon.h" + +float32x4_t +test_vbfdotq_lane_f32_a (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) +{ + /* { dg-error "lane -1 out of range 0 - 1" "" {target *-*-*} 0 } */ + return vbfdotq_lane_f32 (r, a, b, -1); +} + +float32x4_t +test_vbfdotq_lane_f32_b (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) +{ + /* { dg-error "lane 2 out of range 0 - 1" "" {target *-*-*} 0 } */ + return vbfdotq_lane_f32 (r, a, b, 2); +} + +float32x4_t +test_vbfdotq_laneq_f32_a (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) +{ + /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */ + return vbfdotq_laneq_f32 (r, a, b, -1); +} + +float32x4_t +test_vbfdotq_laneq_f32_b (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) +{ + /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */ + return vbfdotq_laneq_f32 (r, a, b, 4); +} -- 2.30.2