[GCC][PATCH][AArch64]Add ACLE intrinsics for bfdot for ARMv8.6 Extension

author Stam Markianos-Wright <stam.markianos-wright@arm.com>

Thu, 16 Jan 2020 14:47:30 +0000 (14:47 +0000)

committer Stam Markianos-Wright <stam.markianos-wright@arm.com>

Thu, 16 Jan 2020 14:47:30 +0000 (14:47 +0000)
author Stam Markianos-Wright <stam.markianos-wright@arm.com>
Thu, 16 Jan 2020 14:47:30 +0000 (14:47 +0000)
committer Stam Markianos-Wright <stam.markianos-wright@arm.com>
Thu, 16 Jan 2020 14:47:30 +0000 (14:47 +0000)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index 49dcecb6777e6522ebf5bfc12cd21f0e9d4d6564..d11b8d3e62a34992376d44bbf37a3c608d442053 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,16 @@
+2020-01-16  Stam Markianos-Wright  <stam.markianos-wright@arm.com>
+
+       * config/aarch64/aarch64-simd-builtins.def (aarch64_bfdot,
+       aarch64_bfdot_lane, aarch64_bfdot_laneq): New.
+       * config/aarch64/aarch64-simd.md (aarch64_bfdot, aarch64_bfdot_lane,
+       aarch64_bfdot_laneq): New.
+       * config/aarch64/arm_bf16.h (vbfdot_f32, vbfdotq_f32,
+       vbfdot_lane_f32, vbfdotq_lane_f32, vbfdot_laneq_f32,
+       vbfdotq_laneq_f32): New.
+       * config/aarch64/iterators.md (UNSPEC_BFDOT, Vbfdottype,
+       VBFMLA_W, VBF): New.
+       (isquadop): Add V4BF, V8BF.
+
  2020-01-16  Stam Markianos-Wright  <stam.markianos-wright@arm.com>
  
         * config/aarch64/aarch64-builtins.c: (enum aarch64_type_qualifiers):
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def

index 4744dd1f6b2f20327db810277ff65c59ce5abdec..a118f4f121de067c0a80f691b852247b0ab27f7a 100644 (file)
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -687,3 +687,8 @@
    BUILTIN_VSFDF (UNOP, frint32x, 0)
    BUILTIN_VSFDF (UNOP, frint64z, 0)
    BUILTIN_VSFDF (UNOP, frint64x, 0)
+
+  /* Implemented by aarch64_bfdot{_lane}{q}<mode>.  */
+  VAR2 (TERNOP, bfdot, 0, v2sf, v4sf)
+  VAR2 (QUADOP_LANE_PAIR, bfdot_lane, 0, v2sf, v4sf)
+  VAR2 (QUADOP_LANE_PAIR, bfdot_laneq, 0, v2sf, v4sf)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md

index 9e56e8caf3569b4b15f77f627ce6e6a839008083..97f46f96968a6bc2f93bbc812931537b819b3b19 100644 (file)
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7059,3 +7059,35 @@
    "xtn\t%0.<Vntype>, %1.<Vtype>"
    [(set_attr "type" "neon_shift_imm_narrow_q")]
  )
+
+(define_insn "aarch64_bfdot<mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+       (plus:VDQSF
+         (unspec:VDQSF
+          [(match_operand:<VBFMLA_W> 2 "register_operand" "w")
+           (match_operand:<VBFMLA_W> 3 "register_operand" "w")]
+           UNSPEC_BFDOT)
+         (match_operand:VDQSF 1 "register_operand" "0")))]
+  "TARGET_BF16_SIMD"
+  "bfdot\t%0.<Vtype>, %2.<Vbfdottype>, %3.<Vbfdottype>"
+  [(set_attr "type" "neon_dot<q>")]
+)
+
+(define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>"
+  [(set (match_operand:VDQSF 0 "register_operand" "=w")
+       (plus:VDQSF
+         (unspec:VDQSF
+          [(match_operand:<VDQSF:VBFMLA_W> 2 "register_operand" "w")
+           (match_operand:VBF 3 "register_operand" "w")
+           (match_operand:SI 4 "const_int_operand" "n")]
+           UNSPEC_BFDOT)
+         (match_operand:VDQSF 1 "register_operand" "0")))]
+  "TARGET_BF16_SIMD"
+{
+  int nunits = GET_MODE_NUNITS (<VBF:MODE>mode).to_constant ();
+  int lane = INTVAL (operands[4]);
+  operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane), SImode);
+  return "bfdot\t%0.<VDQSF:Vtype>, %2.<VDQSF:Vbfdottype>, %3.2h[%4]";
+}
+  [(set_attr "type" "neon_dot<VDQSF:q>")]
+)
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h

index c96214003dd2478820ca2093f6cabc8fcf3f9b92..7f05c3f9eca844b0e7b824a191223a4906c825b1 100644 (file)
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -34611,6 +34611,57 @@ vrnd64xq_f64 (float64x2_t __a)
  
  #include "arm_bf16.h"
  
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16")
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b)
+{
+  return __builtin_aarch64_bfdotv2sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdotq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_aarch64_bfdotv4sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdot_lane_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b,
+                const int __index)
+{
+  return __builtin_aarch64_bfdot_lanev2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdotq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
+                 const int __index)
+{
+  return __builtin_aarch64_bfdot_lanev4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdot_laneq_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x8_t __b,
+                 const int __index)
+{
+  return __builtin_aarch64_bfdot_laneqv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdotq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+                  const int __index)
+{
+  return __builtin_aarch64_bfdot_laneqv4sf (__r, __a, __b, __index);
+}
+
+#pragma GCC pop_options
+
  /* AdvSIMD 8-bit Integer Matrix Multiply (I8MM) intrinsics.  */
  
  #pragma GCC push_options
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md

index 83720d9802a0ef454214d04c473b0c60f0abdc2a..661c3e7b4a79e20cabd1ed93d8152108eef90c02 100644 (file)
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -122,6 +122,9 @@
  ;; Quad vector with only 2 element modes.
  (define_mode_iterator VQ_2E [V2DI V2DF])
  
+;; BFmode vector modes.
+(define_mode_iterator VBF [V4BF V8BF])
+
  ;; This mode iterator allows :P to be used for patterns that operate on
  ;; addresses in different modes.  In LP64, only DI will match, while in
  ;; ILP32, either can match.
@@ -801,6 +804,7 @@
      UNSPEC_USUBWT      ; Used in aarch64-sve2.md.
      UNSPEC_USDOT       ; Used in aarch64-simd.md.
      UNSPEC_SUDOT       ; Used in aarch64-simd.md.
+    UNSPEC_BFDOT       ; Used in aarch64-simd.md.
  ])
  
  ;; ------------------------------------------------------------------
@@ -1451,6 +1455,9 @@
  ;; Register suffix for DOTPROD input types from the return type.
  (define_mode_attr Vdottype [(V2SI "8b") (V4SI "16b")])
  
+;; Register suffix for BFDOT input types from the return type.
+(define_mode_attr Vbfdottype [(V2SF "4h") (V4SF "8h")])
+
  ;; Sum of lengths of instructions needed to move vector registers of a mode.
  (define_mode_attr insn_count [(OI "8") (CI "12") (XI "16")])
  
@@ -1461,11 +1468,14 @@
  ;; Width of 2nd and 3rd arguments to fp16 vector multiply add/sub
  (define_mode_attr VFMLA_W [(V2SF "V4HF") (V4SF "V8HF")])
  
+;; Width of 2nd and 3rd arguments to bf16 vector multiply add/sub
+(define_mode_attr VBFMLA_W [(V2SF "V4BF") (V4SF "V8BF")])
+
  (define_mode_attr VFMLA_SEL_W [(V2SF "V2HF") (V4SF "V4HF")])
  
  (define_mode_attr f16quad [(V2SF "") (V4SF "q")])
  
-(define_mode_attr isquadop [(V8QI "") (V16QI "q")])
+(define_mode_attr isquadop [(V8QI "") (V16QI "q") (V4BF "") (V8BF "q")])
  
  (define_code_attr f16mac [(plus "a") (minus "s")])
  
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog

index 8b01aa06a40df665a59a51caf6b32a352eb09484..e5963d21333d1166b03a7d264ef80347f5a486d0 100644 (file)
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,9 +1,15 @@
  2020-01-16  Stam Markianos-Wright  <stam.markianos-wright@arm.com>
  
-       * gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-1.c: New test.
-       * gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-2.c: New test.
-       * gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-3.c: New test.
-       * gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-4.c: New test.
+       * gcc.target/aarch64/advsimd-intrinsics/bfdot-1.c: New.
+       * gcc.target/aarch64/advsimd-intrinsics/bfdot-2.c: New.
+       * gcc.target/aarch64/advsimd-intrinsics/bfdot-3.c: New.
+
+2020-01-16  Stam Markianos-Wright  <stam.markianos-wright@arm.com>
+
+       * gcc.target/aarch64/advsimd-intrinsics/vdot-3-1.c: New test.
+       * gcc.target/aarch64/advsimd-intrinsics/vdot-3-2.c: New test.
+       * gcc.target/aarch64/advsimd-intrinsics/vdot-3-3.c: New test.
+       * gcc.target/aarch64/advsimd-intrinsics/vdot-3-4.c: New test.
  
  2020-01-16  Andre Vieira  <andre.simoesdiasvieira@arm.com>
  
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-1.c

new file mode 100755 (executable)

index 0000000..ad51507
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-1.c
@@ -0,0 +1,91 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+/* { dg-additional-options "-save-temps" } */
+/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+
+#include <arm_neon.h>
+
+/*
+**ufoo:
+**     bfdot   v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h)
+**     ret
+*/
+float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
+{
+  return vbfdot_f32 (r, x, y);
+}
+
+/*
+**ufooq:
+**     bfdot   v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h)
+**     ret
+*/
+float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+  return vbfdotq_f32 (r, x, y);
+}
+
+/*
+**ufoo_lane:
+**     bfdot   v0.2s, v1.4h, v2.2h\[0\]
+**     ret
+*/
+float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
+{
+  return vbfdot_lane_f32 (r, x, y, 0);
+}
+
+/*
+**ufooq_laneq:
+**     bfdot   v0.4s, v1.8h, v2.2h\[2\]
+**     ret
+*/
+float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+  return vbfdotq_laneq_f32 (r, x, y, 2);
+}
+
+/*
+**ufoo_laneq:
+**     bfdot   v0.2s, v1.4h, v2.2h\[3\]
+**     ret
+*/
+float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
+{
+  return vbfdot_laneq_f32 (r, x, y, 3);
+}
+
+/*
+**ufooq_lane:
+**     bfdot   v0.4s, v1.8h, v2.2h\[1\]
+**     ret
+*/
+float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
+{
+  return vbfdotq_lane_f32 (r, x, y, 1);
+}
+
+/*
+**ufoo_untied:
+**     mov     v0.8b, v1.8b
+**     bfdot   v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h)
+**     ret
+*/
+float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
+{
+  return vbfdot_f32 (r, x, y);
+}
+
+/*
+**ufooq_lane_untied:
+**     mov     v0.16b, v1.16b
+**     bfdot   v0.4s, v2.8h, v3.2h\[1\]
+**     ret
+*/
+float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
+{
+  return vbfdotq_lane_f32 (r, x, y, 1);
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-2.c

new file mode 100755 (executable)

index 0000000..58bdee5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-2.c
@@ -0,0 +1,91 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+/* { dg-additional-options "-mbig-endian --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+
+#include <arm_neon.h>
+
+/*
+**ufoo:
+**     bfdot   v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h)
+**     ret
+*/
+float32x2_t ufoo(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
+{
+  return vbfdot_f32 (r, x, y);
+}
+
+/*
+**ufooq:
+**     bfdot   v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h)
+**     ret
+*/
+float32x4_t ufooq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+  return vbfdotq_f32 (r, x, y);
+}
+
+/*
+**ufoo_lane:
+**     bfdot   v0.2s, v1.4h, v2.2h\[0\]
+**     ret
+*/
+float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
+{
+  return vbfdot_lane_f32 (r, x, y, 0);
+}
+
+/*
+**ufooq_laneq:
+**     bfdot   v0.4s, v1.8h, v2.2h\[2\]
+**     ret
+*/
+float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+  return vbfdotq_laneq_f32 (r, x, y, 2);
+}
+
+/*
+**ufoo_laneq:
+**     bfdot   v0.2s, v1.4h, v2.2h\[3\]
+**     ret
+*/
+float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
+{
+  return vbfdot_laneq_f32 (r, x, y, 3);
+}
+
+/*
+**ufooq_lane:
+**     bfdot   v0.4s, v1.8h, v2.2h\[1\]
+**     ret
+*/
+float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
+{
+  return vbfdotq_lane_f32 (r, x, y, 1);
+}
+
+/*
+**ufoo_untied:
+**     mov     v0.8b, v1.8b
+**     bfdot   v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h)
+**     ret
+*/
+float32x2_t ufoo_untied(float32x4_t unused, float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
+{
+  return vbfdot_f32 (r, x, y);
+}
+
+/*
+**ufooq_lane_untied:
+**     mov     v0.16b, v1.16b
+**     bfdot   v0.4s, v2.8h, v3.2h\[1\]
+**     ret
+*/
+float32x4_t ufooq_lane_untied(float32x4_t unused, float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
+{
+  return vbfdotq_lane_f32 (r, x, y, 1);
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-3.c

new file mode 100755 (executable)

index 0000000..6071262
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-3.c
@@ -0,0 +1,28 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+/* { dg-additional-options "--save-temps" } */
+
+#include <arm_neon.h>
+
+float32x2_t ufoo_lane(float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
+{
+  return vbfdot_lane_f32 (r, x, y, -1); /* { dg-error {lane -1 out of range 0 - 1} "" { target *-*-* } 0 } */
+}
+
+float32x4_t ufooq_laneq(float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+  return vbfdotq_laneq_f32 (r, x, y, -1); /* { dg-error {lane -1 out of range 0 - 3} "" { target *-*-* } 0 } */
+}
+
+float32x2_t ufoo_laneq(float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
+{
+  return vbfdot_laneq_f32 (r, x, y, 4); /* { dg-error {lane 4 out of range 0 - 3} "" { target *-*-* } 0 } */
+}
+
+float32x4_t ufooq_lane(float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
+{
+  return vbfdotq_lane_f32 (r, x, y, 2); /* { dg-error {lane 2 out of range 0 - 1} "" { target *-*-* } 0 } */
+}
+
author	Stam Markianos-Wright <stam.markianos-wright@arm.com>
	Thu, 16 Jan 2020 14:47:30 +0000 (14:47 +0000)
committer	Stam Markianos-Wright <stam.markianos-wright@arm.com>
	Thu, 16 Jan 2020 14:47:30 +0000 (14:47 +0000)
gcc/ChangeLog		patch \| blob \| history
gcc/config/aarch64/aarch64-simd-builtins.def		patch \| blob \| history
gcc/config/aarch64/aarch64-simd.md		patch \| blob \| history
gcc/config/aarch64/arm_neon.h		patch \| blob \| history
gcc/config/aarch64/iterators.md		patch \| blob \| history
gcc/testsuite/ChangeLog		patch \| blob \| history
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-1.c	[new file with mode: 0755]	patch \| blob
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-2.c	[new file with mode: 0755]	patch \| blob
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfdot-3.c	[new file with mode: 0755]	patch \| blob