From: Hongtao Liu Date: Wed, 8 May 2019 10:21:40 +0000 (+0000) Subject: Enable support for bfloat16 which will be in Future Cooper Lake. X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=4f0e90fae97a894247ec93336c8826cf4afb3d0d;p=gcc.git Enable support for bfloat16 which will be in Future Cooper Lake. There are 3 instructions for AVX512BF16: VCVTNE2PS2BF16, VCVTNEPS2BF16 and DPBF16PS instructions, which are Vector Neural Network Instructions supporting: - VCVTNE2PS2BF16: Convert Two Packed Single Data to One Packed BF16 Data. - VCVTNEPS2BF16: Convert Packed Single Data to Packed BF16 Data. - VDPBF16PS: Dot Product of BF16 Pairs Accumulated into Packed Single Precision. 2019-05-07 Wei Xiao * common/config/i386/i386-common.c (OPTION_MASK_ISA_AVX512BF16_SET OPTION_MASK_ISA_AVX512BF16_UNSET, OPTION_MASK_ISA2_AVX512BW_UNSET): New. (OPTION_MASK_ISA2_AVX512F_UNSET): Add OPTION_MASK_ISA_AVX512BF16_UNSET. (ix86_handle_option): Handle -mavx512bf16. * config.gcc: Add avx512bf16vlintrin.h and avx512bf16intrin.h to extra_headers. * config/i386/avx512bf16vlintrin.h: New. * config/i386/avx512bf16intrin.h: New. * config/i386/cpuid.h (bit_AVX512BF16): New. * config/i386/driver-i386.c (host_detect_local_cpu): Detect BF16. * config/i386/i386-builtin-types.def: Add new types. * config/i386/i386-builtin.def: Add new builtins. * config/i386/i386-c.c (ix86_target_macros_internal): Define __AVX512BF16__. * config/i386/i386-option.c (ix86_target_string): Add -mavx512bf16. (ix86_option_override_internal): Handle BF16. (ix86_valid_target_attribute_inner_p): Ditto. * config/i386/i386-expand.c (ix86_expand_args_builtin): Ditto. * config/i386/i386-builtin.c (enum processor_features): Add F_AVX512BF16. (static const _isa_names_table isa_names_table): Ditto. * config/i386/i386.h (TARGET_AVX512BF16, TARGET_AVX512BF16_P): New. (PTA_AVX512BF16): Ditto. * config/i386/i386.opt: Add -mavx512bf16. * config/i386/immintrin.h: Include avx512bf16intrin.h and avx512bf16vlintrin.h. * config/i386/sse.md (avx512f_cvtne2ps2bf16_, avx512f_cvtneps2bf16_, avx512f_dpbf16ps_): New define_insn patterns. * config/i386/subst.md (mask_half): Add new subst. * doc/invoke.texi: Document -mavx512bf16. 2019-05-07 Wei Xiao * gcc.target/i386/avx512bf16-vcvtne2ps2bf16-1.c: New test. * gcc.target/i386/avx512bf16-vcvtneps2bf16-1.c: New test. * gcc.target/i386/avx512bf16-vdpbf16ps-1.c: New test. * gcc.target/i386/avx512bf16vl-vcvtne2ps2bf16-1.c: New test. * gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c: New test. * gcc.target/i386/avx512bf16vl-vdpbf16ps-1.c: New test. * gcc.target/i386/builtin_target.c: Handle avx512bf16. * gcc.target/i386/sse-12.c: Add -mavx512bf16. * gcc.target/i386/sse-13.c: Ditto. * gcc.target/i386/sse-14.c: Ditto. * gcc.target/i386/sse-22.c: Ditto. * gcc.target/i386/sse-23.c: Ditto. * g++.dg/other/i386-2.C: Ditto. * g++.dg/other/i386-3.C: Ditto. 2019-05-07 Hongtao Liu * config/i386/cpuinfo.c (get_available_features): Detect BF16. * config/i386/cpuinfo.h (enum processor_features): Add FEATURE_AVX512BF16. From-SVN: r271006 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 90354983a74..e37aafc4d2a 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -3,6 +3,40 @@ PR tree-optimization/90356 * match.pd ((X +/- 0.0) +/- 0.0): Optimize into X +/- 0.0 if possible. +2019-05-07 Wei Xiao + + * common/config/i386/i386-common.c (OPTION_MASK_ISA_AVX512BF16_SET + OPTION_MASK_ISA_AVX512BF16_UNSET, OPTION_MASK_ISA2_AVX512BW_UNSET): New. + (OPTION_MASK_ISA2_AVX512F_UNSET): Add OPTION_MASK_ISA_AVX512BF16_UNSET. + (ix86_handle_option): Handle -mavx512bf16. + * config.gcc: Add avx512bf16vlintrin.h and avx512bf16intrin.h + to extra_headers. + * config/i386/avx512bf16vlintrin.h: New. + * config/i386/avx512bf16intrin.h: New. + * config/i386/cpuid.h (bit_AVX512BF16): New. + * config/i386/driver-i386.c (host_detect_local_cpu): Detect BF16. + * config/i386/i386-builtin-types.def: Add new types. + * config/i386/i386-builtin.def: Add new builtins. + * config/i386/i386-c.c (ix86_target_macros_internal): Define + __AVX512BF16__. + * config/i386/i386-option.c (ix86_target_string): Add -mavx512bf16. + (ix86_option_override_internal): Handle BF16. + (ix86_valid_target_attribute_inner_p): Ditto. + * config/i386/i386-expand.c (ix86_expand_args_builtin): Ditto. + * config/i386/i386-builtin.c (enum processor_features): Add + F_AVX512BF16. + (static const _isa_names_table isa_names_table): Ditto. + * config/i386/i386.h (TARGET_AVX512BF16, TARGET_AVX512BF16_P): New. + (PTA_AVX512BF16): Ditto. + * config/i386/i386.opt: Add -mavx512bf16. + * config/i386/immintrin.h: Include avx512bf16intrin.h + and avx512bf16vlintrin.h. + * config/i386/sse.md (avx512f_cvtne2ps2bf16_, + avx512f_cvtneps2bf16_, + avx512f_dpbf16ps_): New define_insn patterns. + * config/i386/subst.md (mask_half): Add new subst. + * doc/invoke.texi: Document -mavx512bf16. + 2019-05-07 Segher Boessenkool * config/rs6000/rs6000-protos.h (rs6000_legitimize_reload_address_ptr): diff --git a/gcc/common/config/i386/i386-common.c b/gcc/common/config/i386/i386-common.c index ee725a4202c..db5c3f84346 100644 --- a/gcc/common/config/i386/i386-common.c +++ b/gcc/common/config/i386/i386-common.c @@ -88,6 +88,7 @@ along with GCC; see the file COPYING3. If not see (OPTION_MASK_ISA_AVX512VPOPCNTDQ | OPTION_MASK_ISA_AVX512F_SET) #define OPTION_MASK_ISA_AVX512BITALG_SET \ (OPTION_MASK_ISA_AVX512BITALG | OPTION_MASK_ISA_AVX512F_SET) +#define OPTION_MASK_ISA_AVX512BF16_SET OPTION_MASK_ISA_AVX512BF16 #define OPTION_MASK_ISA_RTM_SET OPTION_MASK_ISA_RTM #define OPTION_MASK_ISA_PRFCHW_SET OPTION_MASK_ISA_PRFCHW #define OPTION_MASK_ISA_RDSEED_SET OPTION_MASK_ISA_RDSEED @@ -215,6 +216,7 @@ along with GCC; see the file COPYING3. If not see #define OPTION_MASK_ISA_AVX512VNNI_UNSET OPTION_MASK_ISA_AVX512VNNI #define OPTION_MASK_ISA_AVX512VPOPCNTDQ_UNSET OPTION_MASK_ISA_AVX512VPOPCNTDQ #define OPTION_MASK_ISA_AVX512BITALG_UNSET OPTION_MASK_ISA_AVX512BITALG +#define OPTION_MASK_ISA_AVX512BF16_UNSET OPTION_MASK_ISA_AVX512BF16 #define OPTION_MASK_ISA_RTM_UNSET OPTION_MASK_ISA_RTM #define OPTION_MASK_ISA_PRFCHW_UNSET OPTION_MASK_ISA_PRFCHW #define OPTION_MASK_ISA_RDSEED_UNSET OPTION_MASK_ISA_RDSEED @@ -276,10 +278,14 @@ along with GCC; see the file COPYING3. If not see | OPTION_MASK_ISA_SSE_UNSET) #define OPTION_MASK_ISA2_AVX512F_UNSET \ - (OPTION_MASK_ISA_AVX5124FMAPS_UNSET | OPTION_MASK_ISA_AVX5124VNNIW_UNSET) + (OPTION_MASK_ISA_AVX512BF16_UNSET \ + | OPTION_MASK_ISA_AVX5124FMAPS_UNSET \ + | OPTION_MASK_ISA_AVX5124VNNIW_UNSET) #define OPTION_MASK_ISA2_GENERAL_REGS_ONLY_UNSET \ (OPTION_MASK_ISA2_AVX512F_UNSET) +#define OPTION_MASK_ISA2_AVX512BW_UNSET OPTION_MASK_ISA_AVX512BF16_UNSET + /* Set 1 << value as value of -malign-FLAG option. */ static void @@ -738,6 +744,21 @@ ix86_handle_option (struct gcc_options *opts, } return true; + case OPT_mavx512bf16: + if (value) + { + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX512BF16_SET; + opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA_AVX512BF16_SET; + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW_SET; + opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_AVX512BW_SET; + } + else + { + opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA_AVX512BF16_UNSET; + opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA_AVX512BF16_UNSET; + } + return true; + case OPT_msgx: if (value) { @@ -800,6 +821,8 @@ ix86_handle_option (struct gcc_options *opts, { opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_AVX512BW_UNSET; opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_AVX512BW_UNSET; + opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA2_AVX512BW_UNSET; + opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA2_AVX512BW_UNSET; } return true; diff --git a/gcc/config.gcc b/gcc/config.gcc index baa156da4d2..b5a313f5c59 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -407,7 +407,7 @@ i[34567]86-*-*) avx512vnnivlintrin.h vaesintrin.h vpclmulqdqintrin.h avx512vpopcntdqvlintrin.h avx512bitalgintrin.h pconfigintrin.h wbnoinvdintrin.h movdirintrin.h - waitpkgintrin.h cldemoteintrin.h" + waitpkgintrin.h cldemoteintrin.h avx512bf16vlintrin.h avx512bf16intrin.h" ;; x86_64-*-*) cpu_type=i386 @@ -439,7 +439,7 @@ x86_64-*-*) avx512vnnivlintrin.h vaesintrin.h vpclmulqdqintrin.h avx512vpopcntdqvlintrin.h avx512bitalgintrin.h pconfigintrin.h wbnoinvdintrin.h movdirintrin.h - waitpkgintrin.h cldemoteintrin.h" + waitpkgintrin.h cldemoteintrin.h avx512bf16vlintrin.h avx512bf16intrin.h" ;; ia64-*-*) extra_headers=ia64intrin.h diff --git a/gcc/config/i386/avx512bf16intrin.h b/gcc/config/i386/avx512bf16intrin.h new file mode 100644 index 00000000000..cc983bdf590 --- /dev/null +++ b/gcc/config/i386/avx512bf16intrin.h @@ -0,0 +1,118 @@ +/* Copyright (C) 2019 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif + +#ifndef _AVX512BF16INTRIN_H_INCLUDED +#define _AVX512BF16INTRIN_H_INCLUDED + +#ifndef __AVX512BF16__ +#pragma GCC push_options +#pragma GCC target("avx512bf16") +#define __DISABLE_AVX512BF16__ +#endif /* __AVX512BF16__ */ + +/* Internal data types for implementing the intrinsics. */ +typedef short __v32bh __attribute__ ((__vector_size__ (64))); + +/* The Intel API is flexible enough that we must allow aliasing with other + vector types, and their scalar components. */ +typedef short __m512bh __attribute__ ((__vector_size__ (64), __may_alias__)); + +/* vcvtne2ps2bf16 */ + +extern __inline __m512bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtne2ps_pbh (__m512 __A, __m512 __B) +{ + return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi(__A, __B); +} + +extern __inline __m512bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtne2ps_pbh (__m512bh __A, __mmask32 __B, __m512 __C, __m512 __D) +{ + return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_mask(__C, __D, __A, __B); +} + +extern __inline __m512bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtne2ps_pbh (__mmask32 __A, __m512 __B, __m512 __C) +{ + return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_maskz(__B, __C, __A); +} + +/* vcvtneps2bf16 */ + +extern __inline __m256bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtneps_pbh (__m512 __A) +{ + return (__m256bh)__builtin_ia32_cvtneps2bf16_v16sf(__A); +} + +extern __inline __m256bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtneps_pbh (__m256bh __A, __mmask16 __B, __m512 __C) +{ + return (__m256bh)__builtin_ia32_cvtneps2bf16_v16sf_mask(__C, __A, __B); +} + +extern __inline __m256bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtneps_pbh (__mmask16 __A, __m512 __B) +{ + return (__m256bh)__builtin_ia32_cvtneps2bf16_v16sf_maskz(__B, __A); +} + +/* vdpbf16ps */ + +extern __inline __m512 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_dpbf16_ps (__m512 __A, __m512bh __B, __m512bh __C) +{ + return (__m512)__builtin_ia32_dpbf16ps_v16sf(__A, __B, __C); +} + +extern __inline __m512 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_dpbf16_ps (__m512 __A, __mmask16 __B, __m512bh __C, __m512bh __D) +{ + return (__m512)__builtin_ia32_dpbf16ps_v16sf_mask(__A, __C, __D, __B); +} + +extern __inline __m512 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_dpbf16_ps (__mmask16 __A, __m512 __B, __m512bh __C, __m512bh __D) +{ + return (__m512)__builtin_ia32_dpbf16ps_v16sf_maskz(__B, __C, __D, __A); +} + +#ifdef __DISABLE_AVX512BF16__ +#undef __DISABLE_AVX512BF16__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512BF16__ */ + +#endif /* _AVX512BF16INTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/avx512bf16vlintrin.h b/gcc/config/i386/avx512bf16vlintrin.h new file mode 100644 index 00000000000..fa32a7fbcbe --- /dev/null +++ b/gcc/config/i386/avx512bf16vlintrin.h @@ -0,0 +1,183 @@ +/* Copyright (C) 2019 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif + +#ifndef _AVX512BF16VLINTRIN_H_INCLUDED +#define _AVX512BF16VLINTRIN_H_INCLUDED + +#if !defined(__AVX512VL__) || !defined(__AVX512BF16__) +#pragma GCC push_options +#pragma GCC target("avx512bf16,avx512vl") +#define __DISABLE_AVX512BF16VL__ +#endif /* __AVX512BF16__ */ + +/* Internal data types for implementing the intrinsics. */ +typedef short __v16bh __attribute__ ((__vector_size__ (32))); +typedef short __v8bh __attribute__ ((__vector_size__ (16))); + +/* The Intel API is flexible enough that we must allow aliasing with other + vector types, and their scalar components. */ +typedef short __m256bh __attribute__ ((__vector_size__ (32), __may_alias__)); +typedef short __m128bh __attribute__ ((__vector_size__ (16), __may_alias__)); + +/* vcvtne2ps2bf16 */ + +extern __inline __m256bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtne2ps_pbh (__m256 __A, __m256 __B) +{ + return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi(__A, __B); +} + +extern __inline __m256bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtne2ps_pbh (__m256bh __A, __mmask16 __B, __m256 __C, __m256 __D) +{ + return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_mask(__C, __D, __A, __B); +} + +extern __inline __m256bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtne2ps_pbh (__mmask16 __A, __m256 __B, __m256 __C) +{ + return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_maskz(__B, __C, __A); +} + +extern __inline __m128bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtne2ps_pbh (__m128 __A, __m128 __B) +{ + return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi(__A, __B); +} + +extern __inline __m128bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtne2ps_pbh (__m128bh __A, __mmask8 __B, __m128 __C, __m128 __D) +{ + return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi_mask(__C, __D, __A, __B); +} + +extern __inline __m128bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtne2ps_pbh (__mmask8 __A, __m128 __B, __m128 __C) +{ + return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi_maskz(__B, __C, __A); +} + +/* vcvtneps2bf16 */ + +extern __inline __m128bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtneps_pbh (__m256 __A) +{ + return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf(__A); +} + +extern __inline __m128bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtneps_pbh (__m128bh __A, __mmask8 __B, __m256 __C) +{ + return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf_mask(__C, __A, __B); +} + +extern __inline __m128bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtneps_pbh (__mmask8 __A, __m256 __B) +{ + return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf_maskz(__B, __A); +} + +extern __inline __m128bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtneps_pbh (__m128 __A) +{ + return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf(__A); +} + +extern __inline __m128bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtneps_pbh (__m128bh __A, __mmask8 __B, __m128 __C) +{ + return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf_mask(__C, __A, __B); +} + +extern __inline __m128bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtneps_pbh (__mmask8 __A, __m128 __B) +{ + return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf_maskz(__B, __A); +} + +/* vdpbf16ps */ + +extern __inline __m256 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_dpbf16_ps (__m256 __A, __m256bh __B, __m256bh __C) +{ + return (__m256)__builtin_ia32_dpbf16ps_v8sf(__A, __B, __C); +} + +extern __inline __m256 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_dpbf16_ps (__m256 __A, __mmask8 __B, __m256bh __C, __m256bh __D) +{ + return (__m256)__builtin_ia32_dpbf16ps_v8sf_mask(__A, __C, __D, __B); +} + +extern __inline __m256 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_dpbf16_ps (__mmask8 __A, __m256 __B, __m256bh __C, __m256bh __D) +{ + return (__m256)__builtin_ia32_dpbf16ps_v8sf_maskz(__B, __C, __D, __A); +} + +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_dpbf16_ps (__m128 __A, __m128bh __B, __m128bh __C) +{ + return (__m128)__builtin_ia32_dpbf16ps_v4sf(__A, __B, __C); +} + +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_dpbf16_ps (__m128 __A, __mmask8 __B, __m128bh __C, __m128bh __D) +{ + return (__m128)__builtin_ia32_dpbf16ps_v4sf_mask(__A, __C, __D, __B); +} + +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_dpbf16_ps (__mmask8 __A, __m128 __B, __m128bh __C, __m128bh __D) +{ + return (__m128)__builtin_ia32_dpbf16ps_v4sf_maskz(__B, __C, __D, __A); +} + +#ifdef __DISABLE_AVX512BF16VL__ +#undef __DISABLE_AVX512BF16VL__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512BF16VL__ */ + +#endif /* _AVX512BF16VLINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h index 39bf0fb1b83..8ddd425c8b7 100644 --- a/gcc/config/i386/cpuid.h +++ b/gcc/config/i386/cpuid.h @@ -21,6 +21,9 @@ * . */ +/* %eax */ +#define bit_AVX512BF16 (1 << 5) + /* %ecx */ #define bit_SSE3 (1 << 0) #define bit_PCLMUL (1 << 1) diff --git a/gcc/config/i386/driver-i386.c b/gcc/config/i386/driver-i386.c index 75f70269517..22ad5bcf07d 100644 --- a/gcc/config/i386/driver-i386.c +++ b/gcc/config/i386/driver-i386.c @@ -426,6 +426,7 @@ const char *host_detect_local_cpu (int argc, const char **argv) unsigned int has_movdiri = 0, has_movdir64b = 0; unsigned int has_waitpkg = 0; unsigned int has_cldemote = 0; + unsigned int has_avx512bf16 = 0; unsigned int has_ptwrite = 0; @@ -533,6 +534,9 @@ const char *host_detect_local_cpu (int argc, const char **argv) has_shstk = ecx & bit_SHSTK; has_pconfig = edx & bit_PCONFIG; has_waitpkg = ecx & bit_WAITPKG; + + __cpuid_count (7, 1, eax, ebx, ecx, edx); + has_avx512bf16 = eax & bit_AVX512BF16; } if (max_level >= 13) @@ -1143,6 +1147,7 @@ const char *host_detect_local_cpu (int argc, const char **argv) const char *waitpkg = has_waitpkg ? " -mwaitpkg" : " -mno-waitpkg"; const char *cldemote = has_cldemote ? " -mcldemote" : " -mno-cldemote"; const char *ptwrite = has_ptwrite ? " -mptwrite" : " -mno-ptwrite"; + const char *avx512bf16 = has_avx512bf16 ? " -mavx512bf16" : " -mno-avx512bf16"; options = concat (options, mmx, mmx3dnow, sse, sse2, sse3, ssse3, sse4a, cx16, sahf, movbe, aes, sha, pclmul, @@ -1157,7 +1162,7 @@ const char *host_detect_local_cpu (int argc, const char **argv) clwb, mwaitx, clzero, pku, rdpid, gfni, shstk, avx512vbmi2, avx512vnni, vaes, vpclmulqdq, avx512bitalg, movdiri, movdir64b, waitpkg, cldemote, - ptwrite, + ptwrite, avx512bf16, NULL); } diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def index dfe13adb95a..d7b99398e87 100644 --- a/gcc/config/i386/i386-builtin-types.def +++ b/gcc/config/i386/i386-builtin-types.def @@ -1262,3 +1262,29 @@ DEF_FUNCTION_TYPE (V2DI, V2DI, V2DI, V2DI, INT) DEF_FUNCTION_TYPE (V4DI, V4DI) DEF_FUNCTION_TYPE (V4SI, V4SI, V4SI, UHI) DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI, UHI) + +# BF16 builtins +DEF_FUNCTION_TYPE (V32HI, V16SF, V16SF) +DEF_FUNCTION_TYPE (V32HI, V16SF, V16SF, V32HI, USI) +DEF_FUNCTION_TYPE (V32HI, V16SF, V16SF, USI) +DEF_FUNCTION_TYPE (V16HI, V8SF, V8SF) +DEF_FUNCTION_TYPE (V16HI, V8SF, V8SF, V16HI, UHI) +DEF_FUNCTION_TYPE (V16HI, V8SF, V8SF, UHI) +DEF_FUNCTION_TYPE (V8HI, V4SF, V4SF) +DEF_FUNCTION_TYPE (V8HI, V4SF, V4SF, V8HI, UQI) +DEF_FUNCTION_TYPE (V8HI, V4SF, V4SF, UQI) +DEF_FUNCTION_TYPE (V16HI, V16SF) +DEF_FUNCTION_TYPE (V16HI, V16SF, V16HI, UHI) +DEF_FUNCTION_TYPE (V16HI, V16SF, UHI) +DEF_FUNCTION_TYPE (V8HI, V8SF) +DEF_FUNCTION_TYPE (V8HI, V8SF, V8HI, UQI) +DEF_FUNCTION_TYPE (V8HI, V8SF, UQI) +DEF_FUNCTION_TYPE (V8HI, V4SF) +DEF_FUNCTION_TYPE (V8HI, V4SF, V8HI, UQI) +DEF_FUNCTION_TYPE (V8HI, V4SF, UQI) +DEF_FUNCTION_TYPE (V16SF, V16SF, V32HI, V32HI) +DEF_FUNCTION_TYPE (V16SF, V16SF, V32HI, V32HI, UHI) +DEF_FUNCTION_TYPE (V8SF, V8SF, V16HI, V16HI) +DEF_FUNCTION_TYPE (V8SF, V8SF, V16HI, V16HI, UQI) +DEF_FUNCTION_TYPE (V4SF, V4SF, V8HI, V8HI) +DEF_FUNCTION_TYPE (V4SF, V4SF, V8HI, V8HI, UQI) diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index 6580890edc0..e95d5d35c85 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -2703,6 +2703,35 @@ BDESC (0, OPTION_MASK_ISA_VAES, CODE_FOR_vaesenclast_v16qi, "__builtin_ia32_vaes BDESC (0, OPTION_MASK_ISA_VAES, CODE_FOR_vaesenclast_v32qi, "__builtin_ia32_vaesenclast_v32qi", IX86_BUILTIN_VAESENCLAST32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) BDESC (0, OPTION_MASK_ISA_VAES, CODE_FOR_vaesenclast_v64qi, "__builtin_ia32_vaesenclast_v64qi", IX86_BUILTIN_VAESENCLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) +/* BF16 */ +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32hi, "__builtin_ia32_cvtne2ps2bf16_v32hi", IX86_BUILTIN_CVTNE2PS2HI16_V32HI, UNKNOWN, (int) V32HI_FTYPE_V16SF_V16SF) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32hi_mask, "__builtin_ia32_cvtne2ps2bf16_v32hi_mask", IX86_BUILTIN_CVTNE2PS2HI16_V32HI_MASK, UNKNOWN, (int) V32HI_FTYPE_V16SF_V16SF_V32HI_USI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32hi_maskz, "__builtin_ia32_cvtne2ps2bf16_v32hi_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V32HI_MASKZ, UNKNOWN, (int) V32HI_FTYPE_V16SF_V16SF_USI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16hi, "__builtin_ia32_cvtne2ps2bf16_v16hi", IX86_BUILTIN_CVTNE2PS2HI16_V16HI, UNKNOWN, (int) V16HI_FTYPE_V8SF_V8SF) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16hi_mask, "__builtin_ia32_cvtne2ps2bf16_v16hi_mask", IX86_BUILTIN_CVTNE2PS2HI16_V16HI_MASK, UNKNOWN, (int) V16HI_FTYPE_V8SF_V8SF_V16HI_UHI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v16hi_maskz, "__builtin_ia32_cvtne2ps2bf16_v16hi_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V16HI_MASKZ, UNKNOWN, (int) V16HI_FTYPE_V8SF_V8SF_UHI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8hi, "__builtin_ia32_cvtne2ps2bf16_v8hi", IX86_BUILTIN_CVTNE2PS2HI16_V8HI, UNKNOWN, (int) V8HI_FTYPE_V4SF_V4SF) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8hi_mask, "__builtin_ia32_cvtne2ps2bf16_v8hi_mask", IX86_BUILTIN_CVTNE2PS2HI16_V8HI_MASK, UNKNOWN, (int) V8HI_FTYPE_V4SF_V4SF_V8HI_UQI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v8hi_maskz, "__builtin_ia32_cvtne2ps2bf16_v8hi_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V8HI_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V4SF_V4SF_UQI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf, "__builtin_ia32_cvtneps2bf16_v16sf", IX86_BUILTIN_CVTNEPS2HI16_V16SF, UNKNOWN, (int) V16HI_FTYPE_V16SF) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf_mask, "__builtin_ia32_cvtneps2bf16_v16sf_mask", IX86_BUILTIN_CVTNEPS2HI16_V16SF_MASK, UNKNOWN, (int) V16HI_FTYPE_V16SF_V16HI_UHI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v16sf_maskz, "__builtin_ia32_cvtneps2bf16_v16sf_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V16SF_MASKZ, UNKNOWN, (int) V16HI_FTYPE_V16SF_UHI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf, "__builtin_ia32_cvtneps2bf16_v8sf", IX86_BUILTIN_CVTNEPS2HI16_V8SF, UNKNOWN, (int) V8HI_FTYPE_V8SF) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf_mask, "__builtin_ia32_cvtneps2bf16_v8sf_mask", IX86_BUILTIN_CVTNEPS2HI16_V8SF_MASK, UNKNOWN, (int) V8HI_FTYPE_V8SF_V8HI_UQI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v8sf_maskz, "__builtin_ia32_cvtneps2bf16_v8sf_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V8SF_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V8SF_UQI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf, "__builtin_ia32_cvtneps2bf16_v4sf", IX86_BUILTIN_CVTNEPS2HI16_V4SF, UNKNOWN, (int) V8HI_FTYPE_V4SF) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf_mask, "__builtin_ia32_cvtneps2bf16_v4sf_mask", IX86_BUILTIN_CVTNEPS2HI16_V4SF_MASK, UNKNOWN, (int) V8HI_FTYPE_V4SF_V8HI_UQI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_cvtneps2bf16_v4sf_maskz, "__builtin_ia32_cvtneps2bf16_v4sf_maskz", IX86_BUILTIN_CVTNE2PS2HI16_V4SF_MASKZ, UNKNOWN, (int) V8HI_FTYPE_V4SF_UQI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf, "__builtin_ia32_dpbf16ps_v16sf", IX86_BUILTIN_DPHI16PS_V16SF, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32HI_V32HI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf_mask, "__builtin_ia32_dpbf16ps_v16sf_mask", IX86_BUILTIN_DPHI16PS_V16SF_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32HI_V32HI_UHI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v16sf_maskz, "__builtin_ia32_dpbf16ps_v16sf_maskz", IX86_BUILTIN_DPHI16PS_V16SF_MASKZ, UNKNOWN, (int) V16SF_FTYPE_V16SF_V32HI_V32HI_UHI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf, "__builtin_ia32_dpbf16ps_v8sf", IX86_BUILTIN_DPHI16PS_V8SF, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16HI_V16HI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_mask, "__builtin_ia32_dpbf16ps_v8sf_mask", IX86_BUILTIN_DPHI16PS_V8SF_MASK, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16HI_V16HI_UQI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_maskz, "__builtin_ia32_dpbf16ps_v8sf_maskz", IX86_BUILTIN_DPHI16PS_V8SF_MASKZ, UNKNOWN, (int) V8SF_FTYPE_V8SF_V16HI_V16HI_UQI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf, "__builtin_ia32_dpbf16ps_v4sf", IX86_BUILTIN_DPHI16PS_V4SF, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_mask, "__builtin_ia32_dpbf16ps_v4sf_mask", IX86_BUILTIN_DPHI16PS_V4SF_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI) +BDESC (0, OPTION_MASK_ISA_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_maskz, "__builtin_ia32_dpbf16ps_v4sf_maskz", IX86_BUILTIN_DPHI16PS_V4SF_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8HI_V8HI_UQI) + /* Builtins with rounding support. */ BDESC_END (ARGS, ROUND_ARGS) diff --git a/gcc/config/i386/i386-builtins.c b/gcc/config/i386/i386-builtins.c index 9779727480f..72bb5d72d86 100644 --- a/gcc/config/i386/i386-builtins.c +++ b/gcc/config/i386/i386-builtins.c @@ -1920,6 +1920,7 @@ enum processor_features F_VPCLMULQDQ, F_AVX512VNNI, F_AVX512BITALG, + F_AVX512BF16, F_MAX }; @@ -2064,7 +2065,8 @@ static const _isa_names_table isa_names_table[] = {"gfni", F_GFNI, P_ZERO}, {"vpclmulqdq", F_VPCLMULQDQ, P_ZERO}, {"avx512vnni", F_AVX512VNNI, P_ZERO}, - {"avx512bitalg", F_AVX512BITALG, P_ZERO} + {"avx512bitalg", F_AVX512BITALG, P_ZERO}, + {"avx512bf16", F_AVX512BF16, P_ZERO} }; /* This parses the attribute arguments to target in DECL and determines diff --git a/gcc/config/i386/i386-c.c b/gcc/config/i386/i386-c.c index 50cac3b1a9f..92bf066c079 100644 --- a/gcc/config/i386/i386-c.c +++ b/gcc/config/i386/i386-c.c @@ -548,6 +548,8 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, def_or_undef (parse_in, "__CLDEMOTE__"); if (isa_flag2 & OPTION_MASK_ISA_PTWRITE) def_or_undef (parse_in, "__PTWRITE__"); + if (isa_flag2 & OPTION_MASK_ISA_AVX512BF16) + def_or_undef (parse_in, "__AVX512BF16__"); if (TARGET_IAMCU) { def_or_undef (parse_in, "__iamcu"); diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 0835ebf74b7..b7ce5d0975b 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -8968,6 +8968,9 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V8DF_FTYPE_V2DF: case V8DF_FTYPE_V8DF: case V4DI_FTYPE_V4DI: + case V16HI_FTYPE_V16SF: + case V8HI_FTYPE_V8SF: + case V8HI_FTYPE_V4SF: nargs = 1; break; case V4SF_FTYPE_V4SF_VEC_MERGE: @@ -9092,6 +9095,12 @@ ix86_expand_args_builtin (const struct builtin_description *d, case USI_FTYPE_USI_USI: case UDI_FTYPE_UDI_UDI: case V16SI_FTYPE_V8DF_V8DF: + case V32HI_FTYPE_V16SF_V16SF: + case V16HI_FTYPE_V8SF_V8SF: + case V8HI_FTYPE_V4SF_V4SF: + case V16HI_FTYPE_V16SF_UHI: + case V8HI_FTYPE_V8SF_UQI: + case V8HI_FTYPE_V4SF_UQI: nargs = 2; break; case V2DI_FTYPE_V2DI_INT_CONVERT: @@ -9274,6 +9283,15 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V16HI_FTYPE_V16HI_V16HI_V16HI: case V8SI_FTYPE_V8SI_V8SI_V8SI: case V8HI_FTYPE_V8HI_V8HI_V8HI: + case V32HI_FTYPE_V16SF_V16SF_USI: + case V16HI_FTYPE_V8SF_V8SF_UHI: + case V8HI_FTYPE_V4SF_V4SF_UQI: + case V16HI_FTYPE_V16SF_V16HI_UHI: + case V8HI_FTYPE_V8SF_V8HI_UQI: + case V8HI_FTYPE_V4SF_V8HI_UQI: + case V16SF_FTYPE_V16SF_V32HI_V32HI: + case V8SF_FTYPE_V8SF_V16HI_V16HI: + case V4SF_FTYPE_V4SF_V8HI_V8HI: nargs = 3; break; case V32QI_FTYPE_V32QI_V32QI_INT: @@ -9413,6 +9431,9 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI: case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI: case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI: + case V32HI_FTYPE_V16SF_V16SF_V32HI_USI: + case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI: + case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI: nargs = 4; break; case V2DF_FTYPE_V2DF_V2DF_V2DI_INT: @@ -9456,6 +9477,9 @@ ix86_expand_args_builtin (const struct builtin_description *d, break; case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED: case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG: + case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI: + case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI: + case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI: nargs = 4; break; case UQI_FTYPE_V8DI_V8DI_INT_UQI: diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c index 95a9ae3f79f..dec8352143c 100644 --- a/gcc/config/i386/i386-options.c +++ b/gcc/config/i386/i386-options.c @@ -209,7 +209,8 @@ ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2, { "-mmovdir64b", OPTION_MASK_ISA_MOVDIR64B }, { "-mwaitpkg", OPTION_MASK_ISA_WAITPKG }, { "-mcldemote", OPTION_MASK_ISA_CLDEMOTE }, - { "-mptwrite", OPTION_MASK_ISA_PTWRITE } + { "-mptwrite", OPTION_MASK_ISA_PTWRITE }, + { "-mavx512bf16", OPTION_MASK_ISA_AVX512BF16 } }; static struct ix86_target_opts isa_opts[] = { @@ -919,6 +920,7 @@ ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[], IX86_ATTR_ISA ("waitpkg", OPT_mwaitpkg), IX86_ATTR_ISA ("cldemote", OPT_mcldemote), IX86_ATTR_ISA ("ptwrite", OPT_mptwrite), + IX86_ATTR_ISA ("avx512bf16", OPT_mavx512bf16), /* enum options */ IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_), @@ -2034,6 +2036,10 @@ ix86_option_override_internal (bool main_args_p, && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VPOPCNTDQ)) opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VPOPCNTDQ; + if (((processor_alias_table[i].flags & PTA_AVX512BF16) != 0) + && !(opts->x_ix86_isa_flags2_explicit + & OPTION_MASK_ISA_AVX512BF16)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX512BF16; if (((processor_alias_table[i].flags & PTA_SGX) != 0) && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX)) opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX; diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index ad6c36ba265..3fee779296f 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -193,6 +193,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see #define TARGET_CLDEMOTE_P(x) TARGET_ISA_CLDEMOTE_P(x) #define TARGET_PTWRITE TARGET_ISA_PTWRITE #define TARGET_PTWRITE_P(x) TARGET_ISA_PTWRITE_P(x) +#define TARGET_AVX512BF16 TARGET_ISA_AVX512BF16 +#define TARGET_AVX512BF16_P(x) TARGET_ISA_AVX512BF16_P(x) #define TARGET_LP64 TARGET_ABI_64 #define TARGET_LP64_P(x) TARGET_ABI_64_P(x) @@ -2355,6 +2357,7 @@ const wide_int_bitmask PTA_PCONFIG (0, HOST_WIDE_INT_1U << 7); const wide_int_bitmask PTA_WBNOINVD (0, HOST_WIDE_INT_1U << 8); const wide_int_bitmask PTA_WAITPKG (0, HOST_WIDE_INT_1U << 9); const wide_int_bitmask PTA_PTWRITE (0, HOST_WIDE_INT_1U << 10); +const wide_int_bitmask PTA_AVX512BF16 (0, HOST_WIDE_INT_1U << 11); const wide_int_bitmask PTA_CORE2 = PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR; diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index 5fb2ec6aeae..8f3dcf942ff 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -1101,3 +1101,8 @@ Enum(instrument_return) String(nop5) Value(instrument_return_nop5) mrecord-return Target Report Var(ix86_flag_record_return) Init(0) Generate a __return_loc section pointing to all return instrumentation code. + +mavx512bf16 +Target Report Mask(ISA_AVX512BF16) Var(ix86_isa_flags2) Save +Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX512F and +AVX512BF16 built-in functions and code generation. diff --git a/gcc/config/i386/immintrin.h b/gcc/config/i386/immintrin.h index 10e1f27c605..d99886a5e7b 100644 --- a/gcc/config/i386/immintrin.h +++ b/gcc/config/i386/immintrin.h @@ -130,6 +130,10 @@ #include +#include + +#include + #include #include diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 6b8298d957e..11363de44fe 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -187,6 +187,11 @@ ;; For AVX512BITALG support UNSPEC_VPSHUFBIT + + ;; For AVX512BF16 support + UNSPEC_VCVTNE2PS2BF16 + UNSPEC_VCVTNEPS2BF16 + UNSPEC_VDPBF16PS ]) (define_c_enum "unspecv" [ @@ -726,6 +731,15 @@ (V16SF "hi") (V8SF "qi") (V4SF "qi") (V8DF "qi") (V4DF "qi") (V2DF "qi")]) +;; Mapping of vector modes to corresponding mask half size +(define_mode_attr avx512fmaskhalfmode + [(V64QI "SI") (V32QI "HI") (V16QI "QI") + (V32HI "HI") (V16HI "QI") (V8HI "QI") (V4HI "QI") + (V16SI "QI") (V8SI "QI") (V4SI "QI") + (V8DI "QI") (V4DI "QI") (V2DI "QI") + (V16SF "QI") (V8SF "QI") (V4SF "QI") + (V8DF "QI") (V4DF "QI") (V2DF "QI")]) + ;; Mapping of vector float modes to an integer mode of the same size (define_mode_attr sseintvecmode [(V16SF "V16SI") (V8DF "V8DI") @@ -22184,3 +22198,90 @@ "vpshufbitqmb\t{%2, %1, %0|%0, %1, %2}" [(set_attr "prefix" "evex") (set_attr "mode" "")]) + +(define_mode_iterator BF16 [V32HI (V16HI "TARGET_AVX512VL") (V8HI "TARGET_AVX512VL")]) +;; Converting from BF to SF +(define_mode_attr bf16_cvt_2sf + [(V32HI "V16SF") (V16HI "V8SF") (V8HI "V4SF")]) +;; Converting from SF to BF +(define_mode_attr sf_cvt_bf16 + [(V4SF "V8HI") (V8SF "V8HI") (V16SF "V16HI")]) +;; Mapping from BF to SF +(define_mode_attr sf_bf16 + [(V4SF "V8HI") (V8SF "V16HI") (V16SF "V32HI")]) + +(define_expand "avx512f_cvtne2ps2bf16__maskz" + [(match_operand:BF16 0 "register_operand") + (match_operand: 1 "register_operand") + (match_operand: 2 "register_operand") + (match_operand: 3 "register_operand")] + "TARGET_AVX512BF16" +{ + emit_insn (gen_avx512f_cvtne2ps2bf16__mask(operands[0], operands[1], + operands[2], CONST0_RTX(mode), operands[3])); + DONE; +}) + +(define_insn "avx512f_cvtne2ps2bf16_" + [(set (match_operand:BF16 0 "register_operand" "=v") + (unspec:BF16 + [(match_operand: 1 "register_operand" "v") + (match_operand: 2 "register_operand" "v")] + UNSPEC_VCVTNE2PS2BF16))] + "TARGET_AVX512BF16" + "vcvtne2ps2bf16\t{%2, %1, %0|%0, %1, %2}") + +(define_expand "avx512f_cvtneps2bf16__maskz" + [(match_operand: 0 "register_operand") + (match_operand:VF1_AVX512VL 1 "register_operand") + (match_operand: 2 "register_operand")] + "TARGET_AVX512BF16" +{ + emit_insn (gen_avx512f_cvtneps2bf16__mask(operands[0], operands[1], + CONST0_RTX(mode), operands[2])); + DONE; +}) + +(define_insn "avx512f_cvtneps2bf16_" + [(set (match_operand: 0 "register_operand" "=v") + (unspec: + [(match_operand:VF1_AVX512VL 1 "register_operand" "v")] + UNSPEC_VCVTNEPS2BF16))] + "TARGET_AVX512BF16" + "vcvtneps2bf16\t{%1, %0|%0, %1}") + +(define_expand "avx512f_dpbf16ps__maskz" + [(match_operand:VF1_AVX512VL 0 "register_operand") + (match_operand:VF1_AVX512VL 1 "register_operand") + (match_operand: 2 "register_operand") + (match_operand: 3 "register_operand") + (match_operand: 4 "register_operand")] + "TARGET_AVX512BF16" +{ + emit_insn (gen_avx512f_dpbf16ps__maskz_1(operands[0], operands[1], + operands[2], operands[3], CONST0_RTX(mode), operands[4])); + DONE; +}) + +(define_insn "avx512f_dpbf16ps_" + [(set (match_operand:VF1_AVX512VL 0 "register_operand" "=v") + (unspec:VF1_AVX512VL + [(match_operand:VF1_AVX512VL 1 "register_operand" "0") + (match_operand: 2 "register_operand" "v") + (match_operand: 3 "register_operand" "v")] + UNSPEC_VDPBF16PS))] + "TARGET_AVX512BF16" + "vdpbf16ps\t{%3, %2, %0|%0, %2, %3}") + +(define_insn "avx512f_dpbf16ps__mask" + [(set (match_operand:VF1_AVX512VL 0 "register_operand" "=v") + (vec_merge:VF1_AVX512VL + (unspec:VF1_AVX512VL + [(match_operand:VF1_AVX512VL 1 "register_operand" "0") + (match_operand: 2 "register_operand" "v") + (match_operand: 3 "register_operand" "v")] + UNSPEC_VDPBF16PS) + (match_dup 1) + (match_operand: 4 "register_operand" "Yk")))] + "TARGET_AVX512BF16" + "vdpbf16ps\t{%3, %2, %0%{%4%}|%0%{%4%}, %2, %3}") diff --git a/gcc/config/i386/subst.md b/gcc/config/i386/subst.md index 99198a3ea69..dd5890584f4 100644 --- a/gcc/config/i386/subst.md +++ b/gcc/config/i386/subst.md @@ -313,3 +313,16 @@ (const_int 1)) (match_operand:SI 3 "const48_operand")] UNSPEC_EMBEDDED_ROUNDING))]) + +(define_subst_attr "maskz_half_name" "maskz_half" "" "_maskz_1") +(define_subst_attr "maskz_half_operand4" "maskz_half" "" "%{%5%}%N4") + +(define_subst "maskz_half" + [(set (match_operand:SUBST_V 0) + (match_operand:SUBST_V 1))] + "" + [(set (match_dup 0) + (vec_merge:SUBST_V + (match_dup 1) + (match_operand:SUBST_V 2 "const0_operand" "C") + (match_operand: 3 "register_operand" "Yk")))]) diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index de7e1aaec67..8e4a8a880ef 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -1274,7 +1274,7 @@ See RS/6000 and PowerPC Options. -msse4a -m3dnow -m3dnowa -mpopcnt -mabm -mbmi -mtbm -mfma4 -mxop @gol -madx -mlzcnt -mbmi2 -mfxsr -mxsave -mxsaveopt -mrtm -mhle -mlwp @gol -mmwaitx -mclzero -mpku -mthreads -mgfni -mvaes -mwaitpkg @gol --mshstk -mmanual-endbr -mforce-indirect-call -mavx512vbmi2 @gol +-mshstk -mmanual-endbr -mforce-indirect-call -mavx512vbmi2 -mavx512bf16 @gol -mvpclmulqdq -mavx512bitalg -mmovdiri -mmovdir64b -mavx512vpopcntdq @gol -mavx5124fmaps -mavx512vnni -mavx5124vnniw -mprfchw -mrdpid @gol -mrdseed -msgx @gol @@ -28041,6 +28041,9 @@ preferred alignment to @option{-mpreferred-stack-boundary=2}. @itemx -mavx512vbmi2 @opindex mavx512vbmi2 @need 200 +@itemx -mavx512bf16 +@opindex mavx512bf16 +@need 200 @itemx -mgfni @opindex mgfni @need 200 @@ -28083,7 +28086,7 @@ AES, PCLMUL, CLFLUSHOPT, CLWB, FSGSBASE, PTWRITE, RDRND, F16C, FMA, PCONFIG, WBNOINVD, FMA4, PREFETCHW, RDPID, PREFETCHWT1, RDSEED, SGX, XOP, LWP, 3DNow!@:, enhanced 3DNow!@:, POPCNT, ABM, ADX, BMI, BMI2, LZCNT, FXSR, XSAVE, XSAVEOPT, XSAVEC, XSAVES, RTM, HLE, TBM, MWAITX, CLZERO, PKU, AVX512VBMI2, -GFNI, VAES, WAITPKG, VPCLMULQDQ, AVX512BITALG, MOVDIRI, MOVDIR64B, +GFNI, VAES, WAITPKG, VPCLMULQDQ, AVX512BITALG, MOVDIRI, MOVDIR64B, AVX512BF16 AVX512VPOPCNTDQ, AVX5124FMAPS, AVX512VNNI, AVX5124VNNIW, or CLDEMOTE extended instruction sets. Each has a corresponding @option{-mno-} option to disable use of these instructions. diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 8e2a7c55770..d50e4e040ee 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -12,6 +12,23 @@ * gcc.dg/tree-ssa/pr90356-3.c: New test. * gcc.dg/tree-ssa/pr90356-4.c: New test. +2019-05-07 Wei Xiao + + * gcc.target/i386/avx512bf16-vcvtne2ps2bf16-1.c: New test. + * gcc.target/i386/avx512bf16-vcvtneps2bf16-1.c: New test. + * gcc.target/i386/avx512bf16-vdpbf16ps-1.c: New test. + * gcc.target/i386/avx512bf16vl-vcvtne2ps2bf16-1.c: New test. + * gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c: New test. + * gcc.target/i386/avx512bf16vl-vdpbf16ps-1.c: New test. + * gcc.target/i386/builtin_target.c: Handle avx512bf16. + * gcc.target/i386/sse-12.c: Add -mavx512bf16. + * gcc.target/i386/sse-13.c: Ditto. + * gcc.target/i386/sse-14.c: Ditto. + * gcc.target/i386/sse-22.c: Ditto. + * gcc.target/i386/sse-23.c: Ditto. + * g++.dg/other/i386-2.C: Ditto. + * g++.dg/other/i386-3.C: Ditto. + 2019-05-07 Cherry Zhang * go.dg/arrayclear.go: New test. diff --git a/gcc/testsuite/g++.dg/other/i386-2.C b/gcc/testsuite/g++.dg/other/i386-2.C index a70d9f413a6..f7a564bf46c 100644 --- a/gcc/testsuite/g++.dg/other/i386-2.C +++ b/gcc/testsuite/g++.dg/other/i386-2.C @@ -1,5 +1,5 @@ /* { dg-do compile { target i?86-*-* x86_64-*-* } } */ -/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd" } */ +/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16" } */ /* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h, xopintrin.h, abmintrin.h, bmiintrin.h, tbmintrin.h, lwpintrin.h, diff --git a/gcc/testsuite/g++.dg/other/i386-3.C b/gcc/testsuite/g++.dg/other/i386-3.C index 73eb5e74c9a..4d6f94fe9c1 100644 --- a/gcc/testsuite/g++.dg/other/i386-3.C +++ b/gcc/testsuite/g++.dg/other/i386-3.C @@ -1,5 +1,5 @@ /* { dg-do compile { target i?86-*-* x86_64-*-* } } */ -/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd" } */ +/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16" } */ /* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h, xopintrin.h, abmintrin.h, bmiintrin.h, tbmintrin.h, lwpintrin.h, diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16-vcvtne2ps2bf16-1.c b/gcc/testsuite/gcc.target/i386/avx512bf16-vcvtne2ps2bf16-1.c new file mode 100644 index 00000000000..6d19459cc97 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bf16-vcvtne2ps2bf16-1.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bf16 -O2" } */ +/* { dg-final { scan-assembler-times "vcvtne2ps2bf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtne2ps2bf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vcvtne2ps2bf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +volatile __m512bh res; +volatile __m512 x1, x2; +volatile __mmask32 m32; + +void extern +avx512bf16_test (void) +{ + res = _mm512_cvtne2ps_pbh (x1, x2); + res = _mm512_mask_cvtne2ps_pbh (res, m32, x1, x2); + res = _mm512_maskz_cvtne2ps_pbh (m32, x1, x2); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16-vcvtneps2bf16-1.c b/gcc/testsuite/gcc.target/i386/avx512bf16-vcvtneps2bf16-1.c new file mode 100644 index 00000000000..99ba4ee57f7 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bf16-vcvtneps2bf16-1.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bf16 -O2" } */ +/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +volatile __m256bh res; +volatile __m512 x1; +volatile __mmask16 m16; + +void extern +avx512bf16_test (void) +{ + res = _mm512_cvtneps_pbh (x1); + res = _mm512_mask_cvtneps_pbh (res, m16, x1); + res = _mm512_maskz_cvtneps_pbh (m16, x1); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-1.c b/gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-1.c new file mode 100644 index 00000000000..d9ad444f630 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-1.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bf16 -O2" } */ +/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\{\n\]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\[^\{\n\]*%zmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +volatile __m512 res; +volatile __m512bh x1, x2; +volatile __mmask16 m16; + +void extern +avx512bf16_test (void) +{ + res = _mm512_dpbf16_ps (res, x1, x2); + res = _mm512_mask_dpbf16_ps (res, m16, x1, x2); + res = _mm512_maskz_dpbf16_ps (m16, res, x1, x2); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c b/gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c new file mode 100644 index 00000000000..b64ad7b84dd --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bf16-vdpbf16ps-2.c @@ -0,0 +1,49 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bf16 -O2" } */ +/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\r]*%zmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +typedef union +{ + __m512 x; + float a[16]; +} union512s; + +float res_ref[16]; +union512s res; +__m512bh x1, x2; +__mmask16 m16; + +static void __attribute__((noinline, unused)) +merge_masking_s (float *arr, unsigned long long mask, int size) +{ + int i; + for (i = 0; i < size; i++) + { + arr[i] = (mask & (1LL << i)) ? arr[i] : 117; + } +} + +static int __attribute__((noinline, unused)) +check_union512s (union512s u, const float *v) +{ + int i; + int err = 0; + for (i = 0; i < (sizeof (u.a) / sizeof ((u.a)[0])); i++) + if (u.a[i] != v[i]) + { + err++; + ; + } + return err; +} + +void extern +avx512bf16_test (void) +{ + res.x = _mm512_mask_dpbf16_ps (res.x, m16, x1, x2); + merge_masking_s (res_ref, m16, 16); + if (check_union512s (res, res_ref)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtne2ps2bf16-1.c b/gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtne2ps2bf16-1.c new file mode 100644 index 00000000000..f0ec70f65c4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtne2ps2bf16-1.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bf16 -mavx512vl -O2" } */ +/* { dg-final { scan-assembler-times "vcvtne2ps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtne2ps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtne2ps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtne2ps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtne2ps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtne2ps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +volatile __m128bh res1; +volatile __m256bh res2; +volatile __m128 x1, x2; +volatile __m256 x3, x4; +volatile __mmask8 m8; +volatile __mmask16 m16; + +void extern +avx512bf16_test (void) +{ + res2 = _mm256_cvtne2ps_pbh (x3, x4); + res2 = _mm256_mask_cvtne2ps_pbh (res2, m16, x3, x4); + res2 = _mm256_maskz_cvtne2ps_pbh (m16, x3, x4); + + res1 = _mm_cvtne2ps_pbh (x1, x2); + res1 = _mm_mask_cvtne2ps_pbh (res1, m8, x1, x2); + res1 = _mm_maskz_cvtne2ps_pbh (m8, x1, x2); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c b/gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c new file mode 100644 index 00000000000..0969ae1b35e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bf16vl-vcvtneps2bf16-1.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bf16 -mavx512vl -O2" } */ +/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtneps2bf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +volatile __m128bh res1, res2; +volatile __m128 x1; +volatile __m256 x2; +volatile __mmask8 m8; + +void extern +avx512bf16_test (void) +{ + res2 = _mm256_cvtneps_pbh (x2); + res2 = _mm256_mask_cvtneps_pbh (res2, m8, x2); + res2 = _mm256_maskz_cvtneps_pbh (m8, x2); + + res1 = _mm_cvtneps_pbh (x1); + res1 = _mm_mask_cvtneps_pbh (res1, m8, x1); + res1 = _mm_maskz_cvtneps_pbh (m8, x1); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16vl-vdpbf16ps-1.c b/gcc/testsuite/gcc.target/i386/avx512bf16vl-vdpbf16ps-1.c new file mode 100644 index 00000000000..18374629fbd --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bf16vl-vdpbf16ps-1.c @@ -0,0 +1,28 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bf16 -mavx512vl -O2" } */ +/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\{\n\]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\r]*%ymm\[0-9\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\{\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\{%k\[0-9\]\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 2 } } */ +/* { dg-final { scan-assembler-times "vdpbf16ps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[0-9\]\}\{z\}\[^\n\r]*(?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +volatile __m256 res1; +volatile __m256bh x1, x2; +volatile __m128 res2; +volatile __m128bh x3, x4; +volatile __mmask8 m8; + +void extern +avx512bf16_test (void) +{ + res1 = _mm256_dpbf16_ps (res1, x1, x2); + res1 = _mm256_mask_dpbf16_ps (res1, m8, x1, x2); + res1 = _mm256_maskz_dpbf16_ps (m8, res1, x1, x2); + + res2 = _mm_dpbf16_ps (res2, x3, x4); + res2 = _mm_mask_dpbf16_ps (res2, m8, x3, x4); + res2 = _mm_maskz_dpbf16_ps (m8, res2, x3, x4); +} diff --git a/gcc/testsuite/gcc.target/i386/builtin_target.c b/gcc/testsuite/gcc.target/i386/builtin_target.c index d39626611a7..7a8b6e805ed 100644 --- a/gcc/testsuite/gcc.target/i386/builtin_target.c +++ b/gcc/testsuite/gcc.target/i386/builtin_target.c @@ -265,6 +265,10 @@ check_features (unsigned int ecx, unsigned int edx, assert (__builtin_cpu_supports ("avx5124vnniw")); if (edx & bit_AVX5124FMAPS) assert (__builtin_cpu_supports ("avx5124fmaps")); + + __cpuid_count (7, 1, eax, ebx, ecx, edx); + if (eax & bit_AVX512BF16) + assert (__builtin_cpu_supports ("avx512bf16")); } /* Check cpuid level of extended features. */ diff --git a/gcc/testsuite/gcc.target/i386/sse-12.c b/gcc/testsuite/gcc.target/i386/sse-12.c index f7f55f4317e..606697388ee 100644 --- a/gcc/testsuite/gcc.target/i386/sse-12.c +++ b/gcc/testsuite/gcc.target/i386/sse-12.c @@ -3,7 +3,7 @@ popcntintrin.h gfniintrin.h and mm_malloc.h are usable with -O -std=c89 -pedantic-errors. */ /* { dg-do compile } */ -/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512bw -mavx512dq -mavx512vl -mavx512vbmi -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd" } */ +/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512bw -mavx512dq -mavx512vl -mavx512vbmi -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16" } */ #include diff --git a/gcc/testsuite/gcc.target/i386/sse-13.c b/gcc/testsuite/gcc.target/i386/sse-13.c index e868f6d293f..2b48c455eaf 100644 --- a/gcc/testsuite/gcc.target/i386/sse-13.c +++ b/gcc/testsuite/gcc.target/i386/sse-13.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd" } */ +/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mavx512bitalg -mpconfig -mwbnoinvd -mavx512bf16" } */ /* { dg-add-options bind_pic_locally } */ #include diff --git a/gcc/testsuite/gcc.target/i386/sse-14.c b/gcc/testsuite/gcc.target/i386/sse-14.c index 748339f7d47..ae7d3d5623b 100644 --- a/gcc/testsuite/gcc.target/i386/sse-14.c +++ b/gcc/testsuite/gcc.target/i386/sse-14.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd" } */ +/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx5124fmaps -mavx5124vnniw -mavx512vpopcntdq -mclwb -mmwaitx -mclzero -mpku -msgx -mrdpid -mgfni -mpconfig -mwbnoinvd -mavx512vl -mavx512bf16" } */ /* { dg-add-options bind_pic_locally } */ #include diff --git a/gcc/testsuite/gcc.target/i386/sse-22.c b/gcc/testsuite/gcc.target/i386/sse-22.c index 0c62f2049c2..733c67011ea 100644 --- a/gcc/testsuite/gcc.target/i386/sse-22.c +++ b/gcc/testsuite/gcc.target/i386/sse-22.c @@ -101,7 +101,7 @@ #ifndef DIFFERENT_PRAGMAS -#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg") +#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg,avx512bf16") #endif /* Following intrinsics require immediate arguments. They @@ -218,7 +218,7 @@ test_4 (_mm_cmpestrz, int, __m128i, int, __m128i, int, 1) /* immintrin.h (AVX/AVX2/RDRND/FSGSBASE/F16C/RTM/AVX512F/SHA) */ #ifdef DIFFERENT_PRAGMAS -#pragma GCC target ("avx,avx2,rdrnd,fsgsbase,f16c,rtm,avx512f,avx512er,avx512cd,avx512pf,sha,avx512vl,avx512bw,avx512dq,avx512ifma,avx512vbmi,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg") +#pragma GCC target ("avx,avx2,rdrnd,fsgsbase,f16c,rtm,avx512f,avx512er,avx512cd,avx512pf,sha,avx512vl,avx512bw,avx512dq,avx512ifma,avx512vbmi,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,gfni,avx512bitalg,avx512bf16") #endif #include test_1 (_cvtss_sh, unsigned short, float, 1) diff --git a/gcc/testsuite/gcc.target/i386/sse-23.c b/gcc/testsuite/gcc.target/i386/sse-23.c index 78a3c0aa26e..58f4c824731 100644 --- a/gcc/testsuite/gcc.target/i386/sse-23.c +++ b/gcc/testsuite/gcc.target/i386/sse-23.c @@ -696,6 +696,6 @@ #define __builtin_ia32_vpclmulqdq_v2di(A, B, C) __builtin_ia32_vpclmulqdq_v2di(A, B, 1) #define __builtin_ia32_vpclmulqdq_v8di(A, B, C) __builtin_ia32_vpclmulqdq_v8di(A, B, 1) -#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,xsavec,xsaves,clflushopt,avx512bw,avx512dq,avx512vl,avx512vbmi,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,clwb,mwaitx,clzero,pku,sgx,rdpid,gfni,avx512vbmi2,vpclmulqdq,avx512bitalg,pconfig,wbnoinvd") +#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,xsavec,xsaves,clflushopt,avx512bw,avx512dq,avx512vl,avx512vbmi,avx512ifma,avx5124fmaps,avx5124vnniw,avx512vpopcntdq,clwb,mwaitx,clzero,pku,sgx,rdpid,gfni,avx512vbmi2,vpclmulqdq,avx512bitalg,pconfig,wbnoinvd,avx512bf16") #include diff --git a/libgcc/ChangeLog b/libgcc/ChangeLog index 30e4da4277e..c3c7a16b106 100644 --- a/libgcc/ChangeLog +++ b/libgcc/ChangeLog @@ -1,3 +1,9 @@ +2019-05-07 Hongtao Liu + + * config/i386/cpuinfo.c (get_available_features): Detect BF16. + * config/i386/cpuinfo.h (enum processor_features): Add + FEATURE_AVX512BF16. + 2019-04-23 Ramana Radhakrishnan Bernd Edlinger Jakub Jelinek diff --git a/libgcc/config/i386/cpuinfo.c b/libgcc/config/i386/cpuinfo.c index d6cb2de6265..5659ec89546 100644 --- a/libgcc/config/i386/cpuinfo.c +++ b/libgcc/config/i386/cpuinfo.c @@ -336,7 +336,7 @@ get_available_features (unsigned int ecx, unsigned int edx, set_feature (FEATURE_FMA); } - /* Get Advanced Features at level 7 (eax = 7, ecx = 0). */ + /* Get Advanced Features at level 7 (eax = 7, ecx = 0/1). */ if (max_cpuid_level >= 7) { __cpuid_count (7, 0, eax, ebx, ecx, edx); @@ -385,6 +385,10 @@ get_available_features (unsigned int ecx, unsigned int edx, set_feature (FEATURE_AVX5124VNNIW); if (edx & bit_AVX5124FMAPS) set_feature (FEATURE_AVX5124FMAPS); + + __cpuid_count (7, 1, eax, ebx, ecx, edx); + if (eax & bit_AVX512BF16) + set_feature (FEATURE_AVX512BF16); } } diff --git a/libgcc/config/i386/cpuinfo.h b/libgcc/config/i386/cpuinfo.h index b4492eb7eb4..68ca466fa84 100644 --- a/libgcc/config/i386/cpuinfo.h +++ b/libgcc/config/i386/cpuinfo.h @@ -119,7 +119,8 @@ enum processor_features FEATURE_GFNI, FEATURE_VPCLMULQDQ, FEATURE_AVX512VNNI, - FEATURE_AVX512BITALG + FEATURE_AVX512BITALG, + FEATURE_AVX512BF16 }; extern struct __processor_model