(OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512F_SET)
#define OPTION_MASK_ISA_AVX512VBMI_SET \
(OPTION_MASK_ISA_AVX512VBMI | OPTION_MASK_ISA_AVX512BW_SET)
+#define OPTION_MASK_ISA_AVX5124FMAPS_SET OPTION_MASK_ISA_AVX5124FMAPS
+#define OPTION_MASK_ISA_AVX5124VNNIW_SET OPTION_MASK_ISA_AVX5124VNNIW
#define OPTION_MASK_ISA_RTM_SET OPTION_MASK_ISA_RTM
#define OPTION_MASK_ISA_PRFCHW_SET OPTION_MASK_ISA_PRFCHW
#define OPTION_MASK_ISA_RDSEED_SET OPTION_MASK_ISA_RDSEED
#define OPTION_MASK_ISA_AVX512VL_UNSET OPTION_MASK_ISA_AVX512VL
#define OPTION_MASK_ISA_AVX512IFMA_UNSET OPTION_MASK_ISA_AVX512IFMA
#define OPTION_MASK_ISA_AVX512VBMI_UNSET OPTION_MASK_ISA_AVX512VBMI
+#define OPTION_MASK_ISA_AVX5124FMAPS_UNSET OPTION_MASK_ISA_AVX5124FMAPS
+#define OPTION_MASK_ISA_AVX5124VNNIW_UNSET OPTION_MASK_ISA_AVX5124VNNIW
#define OPTION_MASK_ISA_RTM_UNSET OPTION_MASK_ISA_RTM
#define OPTION_MASK_ISA_PRFCHW_UNSET OPTION_MASK_ISA_PRFCHW
#define OPTION_MASK_ISA_RDSEED_UNSET OPTION_MASK_ISA_RDSEED
{
opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_AVX512F_UNSET;
opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_AVX512F_UNSET;
+
+      /* Turn off the additional ISA flags, which live in ix86_isa_flags2.  */
+ opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA_AVX5124FMAPS_UNSET;
+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA_AVX5124FMAPS_UNSET;
+ opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA_AVX5124VNNIW_UNSET;
+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA_AVX5124VNNIW_UNSET;
}
return true;
}
return true;
+ case OPT_mavx5124fmaps:
+ if (value)
+ {
+ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS_SET;
+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA_AVX5124FMAPS_SET;
+ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F_SET;
+ opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_AVX512F_SET;
+ }
+ else
+ {
+ opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA_AVX5124FMAPS_UNSET;
+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA_AVX5124FMAPS_UNSET;
+ }
+ return true;
+
+ case OPT_mavx5124vnniw:
+ if (value)
+ {
+ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW_SET;
+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA_AVX5124VNNIW_SET;
+ opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F_SET;
+ opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_AVX512F_SET;
+ }
+ else
+ {
+ opts->x_ix86_isa_flags2 &= ~OPTION_MASK_ISA_AVX5124VNNIW_UNSET;
+ opts->x_ix86_isa_flags2_explicit |= OPTION_MASK_ISA_AVX5124VNNIW_UNSET;
+ }
+ return true;
+
case OPT_mavx512dq:
if (value)
{
xsavesintrin.h avx512dqintrin.h avx512bwintrin.h
avx512vlintrin.h avx512vlbwintrin.h avx512vldqintrin.h
avx512ifmaintrin.h avx512ifmavlintrin.h avx512vbmiintrin.h
- avx512vbmivlintrin.h clwbintrin.h mwaitxintrin.h
- clzerointrin.h pkuintrin.h"
+ avx512vbmivlintrin.h avx5124fmapsintrin.h avx5124vnniwintrin.h
+ clwbintrin.h mwaitxintrin.h clzerointrin.h pkuintrin.h"
;;
x86_64-*-*)
cpu_type=i386
xsavesintrin.h avx512dqintrin.h avx512bwintrin.h
avx512vlintrin.h avx512vlbwintrin.h avx512vldqintrin.h
avx512ifmaintrin.h avx512ifmavlintrin.h avx512vbmiintrin.h
- avx512vbmivlintrin.h clwbintrin.h mwaitxintrin.h
- clzerointrin.h pkuintrin.h"
+ avx512vbmivlintrin.h avx5124fmapsintrin.h avx5124vnniwintrin.h
+ clwbintrin.h mwaitxintrin.h clzerointrin.h pkuintrin.h"
;;
ia64-*-*)
extra_headers=ia64intrin.h
--- /dev/null
+/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+# error "Never use <avx5124fmapsintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _AVX5124FMAPSINTRIN_H_INCLUDED
+#define _AVX5124FMAPSINTRIN_H_INCLUDED
+
+#ifndef __AVX5124FMAPS__
+#pragma GCC push_options
+#pragma GCC target("avx5124fmaps")
+#define __DISABLE_AVX5124FMAPS__
+#endif /* __AVX5124FMAPS__ */
+
+extern __inline __m512
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_4fmadd_ps (__m512 __A, __m512 __B, __m512 __C,
+ __m512 __D, __m512 __E, __m128 *__F)
+{
+ return (__m512) __builtin_ia32_4fmaddps ((__v16sf) __B,
+ (__v16sf) __C,
+ (__v16sf) __D,
+ (__v16sf) __E,
+ (__v16sf) __A,
+ (const __v4sf *) __F);
+}
+
+extern __inline __m512
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_4fmadd_ps (__m512 __A, __mmask16 __U, __m512 __B,
+ __m512 __C, __m512 __D, __m512 __E, __m128 *__F)
+{
+ return (__m512) __builtin_ia32_4fmaddps_mask ((__v16sf) __B,
+ (__v16sf) __C,
+ (__v16sf) __D,
+ (__v16sf) __E,
+ (__v16sf) __A,
+ (const __v4sf *) __F,
+ (__v16sf) __A,
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_4fmadd_ps (__mmask16 __U,
+ __m512 __A, __m512 __B, __m512 __C,
+ __m512 __D, __m512 __E, __m128 *__F)
+{
+ return (__m512) __builtin_ia32_4fmaddps_mask ((__v16sf) __B,
+ (__v16sf) __C,
+ (__v16sf) __D,
+ (__v16sf) __E,
+ (__v16sf) __A,
+ (const __v4sf *) __F,
+ (__v16sf) _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_4fmadd_ss (__m128 __A, __m128 __B, __m128 __C,
+ __m128 __D, __m128 __E, __m128 *__F)
+{
+ return (__m128) __builtin_ia32_4fmaddss ((__v4sf) __B,
+ (__v4sf) __C,
+ (__v4sf) __D,
+ (__v4sf) __E,
+ (__v4sf) __A,
+ (const __v4sf *) __F);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_4fmadd_ss (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C,
+ __m128 __D, __m128 __E, __m128 *__F)
+{
+ return (__m128) __builtin_ia32_4fmaddss_mask ((__v4sf) __B,
+ (__v4sf) __C,
+ (__v4sf) __D,
+ (__v4sf) __E,
+ (__v4sf) __A,
+ (const __v4sf *) __F,
+ (__v4sf) __A,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_4fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C,
+ __m128 __D, __m128 __E, __m128 *__F)
+{
+ return (__m128) __builtin_ia32_4fmaddss_mask ((__v4sf) __B,
+ (__v4sf) __C,
+ (__v4sf) __D,
+ (__v4sf) __E,
+ (__v4sf) __A,
+ (const __v4sf *) __F,
+ (__v4sf) _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_4fnmadd_ps (__m512 __A, __m512 __B, __m512 __C,
+ __m512 __D, __m512 __E, __m128 *__F)
+{
+ return (__m512) __builtin_ia32_4fnmaddps ((__v16sf) __B,
+ (__v16sf) __C,
+ (__v16sf) __D,
+ (__v16sf) __E,
+ (__v16sf) __A,
+ (const __v4sf *) __F);
+}
+
+extern __inline __m512
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_4fnmadd_ps (__m512 __A, __mmask16 __U, __m512 __B,
+ __m512 __C, __m512 __D, __m512 __E, __m128 *__F)
+{
+ return (__m512) __builtin_ia32_4fnmaddps_mask ((__v16sf) __B,
+ (__v16sf) __C,
+ (__v16sf) __D,
+ (__v16sf) __E,
+ (__v16sf) __A,
+ (const __v4sf *) __F,
+ (__v16sf) __A,
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_4fnmadd_ps (__mmask16 __U,
+ __m512 __A, __m512 __B, __m512 __C,
+ __m512 __D, __m512 __E, __m128 *__F)
+{
+ return (__m512) __builtin_ia32_4fnmaddps_mask ((__v16sf) __B,
+ (__v16sf) __C,
+ (__v16sf) __D,
+ (__v16sf) __E,
+ (__v16sf) __A,
+ (const __v4sf *) __F,
+ (__v16sf) _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_4fnmadd_ss (__m128 __A, __m128 __B, __m128 __C,
+ __m128 __D, __m128 __E, __m128 *__F)
+{
+ return (__m128) __builtin_ia32_4fnmaddss ((__v4sf) __B,
+ (__v4sf) __C,
+ (__v4sf) __D,
+ (__v4sf) __E,
+ (__v4sf) __A,
+ (const __v4sf *) __F);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_4fnmadd_ss (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C,
+ __m128 __D, __m128 __E, __m128 *__F)
+{
+ return (__m128) __builtin_ia32_4fnmaddss_mask ((__v4sf) __B,
+ (__v4sf) __C,
+ (__v4sf) __D,
+ (__v4sf) __E,
+ (__v4sf) __A,
+ (const __v4sf *) __F,
+ (__v4sf) __A,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_4fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C,
+ __m128 __D, __m128 __E, __m128 *__F)
+{
+ return (__m128) __builtin_ia32_4fnmaddss_mask ((__v4sf) __B,
+ (__v4sf) __C,
+ (__v4sf) __D,
+ (__v4sf) __E,
+ (__v4sf) __A,
+ (const __v4sf *) __F,
+ (__v4sf) _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+#ifdef __DISABLE_AVX5124FMAPS__
+#undef __DISABLE_AVX5124FMAPS__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX5124FMAPS__ */
+
+#endif /* _AVX5124FMAPSINTRIN_H_INCLUDED */
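As a quick illustration of the new header (an editorial sketch, not part of
the patch: the function name dot4 is invented, and it assumes a compiler
carrying this patch plus -mavx5124fmaps):

    #include <x86intrin.h>

    /* acc += b0*f[0] + b1*f[1] + b2*f[2] + b3*f[3], where each f[j] is
       broadcast from the four consecutive floats at *f and the products
       are chained through a single v4fmaddps sequence.  */
    __m512
    dot4 (__m512 acc, __m512 b0, __m512 b1, __m512 b2, __m512 b3,
          __m128 *f)
    {
      return _mm512_4fmadd_ps (acc, b0, b1, b2, b3, f);
    }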
--- /dev/null
+/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+# error "Never use <avx5124vnniwintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _AVX5124VNNIWINTRIN_H_INCLUDED
+#define _AVX5124VNNIWINTRIN_H_INCLUDED
+
+#ifndef __AVX5124VNNIW__
+#pragma GCC push_options
+#pragma GCC target("avx5124vnniw")
+#define __DISABLE_AVX5124VNNIW__
+#endif /* __AVX5124VNNIW__ */
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_4dpwssd_epi32 (__m512i __A, __m512i __B, __m512i __C,
+ __m512i __D, __m512i __E, __m128i *__F)
+{
+ return (__m512i) __builtin_ia32_vp4dpwssd ((__v16si) __B,
+ (__v16si) __C,
+ (__v16si) __D,
+ (__v16si) __E,
+ (__v16si) __A,
+ (const __v4si *) __F);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_4dpwssd_epi32 (__m512i __A, __mmask16 __U, __m512i __B,
+ __m512i __C, __m512i __D, __m512i __E,
+ __m128i *__F)
+{
+ return (__m512i) __builtin_ia32_vp4dpwssd_mask ((__v16si) __B,
+ (__v16si) __C,
+ (__v16si) __D,
+ (__v16si) __E,
+ (__v16si) __A,
+ (const __v4si *) __F,
+ (__v16si) __A,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_4dpwssd_epi32 (__mmask16 __U, __m512i __A, __m512i __B,
+ __m512i __C, __m512i __D, __m512i __E,
+ __m128i *__F)
+{
+ return (__m512i) __builtin_ia32_vp4dpwssd_mask ((__v16si) __B,
+ (__v16si) __C,
+ (__v16si) __D,
+ (__v16si) __E,
+ (__v16si) __A,
+ (const __v4si *) __F,
+						  (__v16si) _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_4dpwssds_epi32 (__m512i __A, __m512i __B, __m512i __C,
+ __m512i __D, __m512i __E, __m128i *__F)
+{
+ return (__m512i) __builtin_ia32_vp4dpwssds ((__v16si) __B,
+ (__v16si) __C,
+ (__v16si) __D,
+ (__v16si) __E,
+ (__v16si) __A,
+ (const __v4si *) __F);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_4dpwssds_epi32 (__m512i __A, __mmask16 __U, __m512i __B,
+ __m512i __C, __m512i __D, __m512i __E,
+ __m128i *__F)
+{
+ return (__m512i) __builtin_ia32_vp4dpwssds_mask ((__v16si) __B,
+ (__v16si) __C,
+ (__v16si) __D,
+ (__v16si) __E,
+ (__v16si) __A,
+ (const __v4si *) __F,
+ (__v16si) __A,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_4dpwssds_epi32 (__mmask16 __U, __m512i __A, __m512i __B,
+ __m512i __C, __m512i __D, __m512i __E,
+ __m128i *__F)
+{
+ return (__m512i) __builtin_ia32_vp4dpwssds_mask ((__v16si) __B,
+ (__v16si) __C,
+ (__v16si) __D,
+ (__v16si) __E,
+ (__v16si) __A,
+ (const __v4si *) __F,
+						  (__v16si) _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+#ifdef __DISABLE_AVX5124VNNIW__
+#undef __DISABLE_AVX5124VNNIW__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX5124VNNIW__ */
+
+#endif /* _AVX5124VNNIWINTRIN_H_INCLUDED */
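Likewise for the 4VNNIW header, a masked-call sketch (dp4 is a made-up
name; assumes -mavx5124vnniw):

    #include <x86intrin.h>

    /* Four chained vp4dpwssd steps accumulate signed word-pair dot
       products into acc; lanes whose mask bit is clear keep acc's
       old value.  */
    __m512i
    dp4 (__m512i acc, __mmask16 m, __m512i w0, __m512i w1,
         __m512i w2, __m512i w3, __m128i *f)
    {
      return _mm512_mask_4dpwssd_epi32 (acc, m, w0, w1, w2, w3, f);
    }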
;; f x87 register when 80387 floating point arithmetic is enabled
;; r SSE regs not requiring REX prefix when prefixes avoidance is enabled
;; and all SSE regs otherwise
+;; h EVEX encodable SSE register whose number is a multiple of four
(define_register_constraint "Yz" "TARGET_SSE ? SSE_FIRST_REG : NO_REGS"
"First SSE register (@code{%xmm0}).")
"TARGET_AVX512VL ? ALL_SSE_REGS : TARGET_SSE ? SSE_REGS : NO_REGS"
"@internal For AVX512VL, any EVEX encodable SSE register (@code{%xmm0-%xmm31}), otherwise any SSE register.")
+(define_register_constraint "Yh" "TARGET_AVX512F ? MOD4_SSE_REGS : NO_REGS"
+ "@internal Any EVEX encodable SSE register, which has number factor of four.")
+
;; We use the B prefix to denote any number of internal operands:
;; f FLAGS_REG
;; g GOT memory operand.
#define bit_MWAITX (1 << 29)
/* %edx */
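+/* AVX5124VNNIW and AVX5124FMAPS are reported in bits 2 and 3 of %edx
+   from CPUID leaf 7, sub-leaf 0.  */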
+#define bit_AVX5124VNNIW (1 << 2)
+#define bit_AVX5124FMAPS (1 << 3)
#define bit_MMXEXT (1 << 22)
#define bit_LM (1 << 29)
#define bit_3DNOWP (1 << 30)
unsigned int has_avx512dq = 0, has_avx512bw = 0, has_avx512vl = 0;
unsigned int has_avx512vbmi = 0, has_avx512ifma = 0, has_clwb = 0;
unsigned int has_mwaitx = 0, has_clzero = 0, has_pku = 0;
+ unsigned int has_avx5124fmaps = 0, has_avx5124vnniw = 0;
bool arch;
has_prefetchwt1 = ecx & bit_PREFETCHWT1;
has_avx512vbmi = ecx & bit_AVX512VBMI;
has_pku = ecx & bit_OSPKE;
+ has_avx5124vnniw = edx & bit_AVX5124VNNIW;
+ has_avx5124fmaps = edx & bit_AVX5124FMAPS;
}
if (max_level >= 13)
const char *avx512vl = has_avx512vl ? " -mavx512vl" : " -mno-avx512vl";
const char *avx512ifma = has_avx512ifma ? " -mavx512ifma" : " -mno-avx512ifma";
const char *avx512vbmi = has_avx512vbmi ? " -mavx512vbmi" : " -mno-avx512vbmi";
+ const char *avx5124vnniw = has_avx5124vnniw ? " -mavx5124vnniw" : " -mno-avx5124vnniw";
+ const char *avx5124fmaps = has_avx5124fmaps ? " -mavx5124fmaps" : " -mno-avx5124fmaps";
const char *clwb = has_clwb ? " -mclwb" : " -mno-clwb";
const char *mwaitx = has_mwaitx ? " -mmwaitx" : " -mno-mwaitx";
const char *clzero = has_clzero ? " -mclzero" : " -mno-clzero";
fxsr, xsave, xsaveopt, avx512f, avx512er,
avx512cd, avx512pf, prefetchwt1, clflushopt,
xsavec, xsaves, avx512dq, avx512bw, avx512vl,
- avx512ifma, avx512vbmi, clwb, mwaitx,
- clzero, pku, NULL);
+ avx512ifma, avx512vbmi, avx5124fmaps, avx5124vnniw,
+ clwb, mwaitx, clzero, pku, NULL);
}
done:
DEF_FUNCTION_TYPE (VOID, UNSIGNED, UNSIGNED, UNSIGNED)
DEF_FUNCTION_TYPE (VOID, PV8DI, V8DI)
+DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, V16SF, V16SF, V16SF, PCV4SF, V16SF, UHI)
+DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, V16SF, V16SF, V16SF, PCV4SF)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, V4SF, V4SF, V4SF, PCV4SF)
+DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, V4SF, V4SF, V4SF, PCV4SF, V4SF, UQI)
+
+DEF_FUNCTION_TYPE (V16SI, V16SI, V16SI, V16SI, V16SI, V16SI, PCV4SI, V16SI, UHI)
+DEF_FUNCTION_TYPE (V16SI, V16SI, V16SI, V16SI, V16SI, V16SI, PCV4SI)
+
# Instructions returning mask
DEF_FUNCTION_TYPE (UHI, UHI)
DEF_FUNCTION_TYPE (UHI, V16QI)
BDESC (OPTION_MASK_ISA_AVX512DQ, CODE_FOR_avx512dq_rangepv16sf_mask_round, "__builtin_ia32_rangeps512_mask", IX86_BUILTIN_RANGEPS512, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT)
BDESC (OPTION_MASK_ISA_AVX512DQ, CODE_FOR_avx512dq_rangepv8df_mask_round, "__builtin_ia32_rangepd512_mask", IX86_BUILTIN_RANGEPD512, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT)
-BDESC_END (ROUND_ARGS, MPX)
+BDESC_END (ROUND_ARGS, ARGS2)
+
+/* AVX5124FMAPS and AVX5124VNNIW builtins with a variable number of
+   operands.  These live in the additional ix86_isa_flags2 set.  */
+BDESC_FIRST (args2, ARGS2,
+ OPTION_MASK_ISA_AVX5124FMAPS, CODE_FOR_avx5124fmaddps_4fmaddps_mask, "__builtin_ia32_4fmaddps_mask", IX86_BUILTIN_4FMAPS_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_V16SF_V16SF_PCV4SF_V16SF_UHI)
+BDESC (OPTION_MASK_ISA_AVX5124FMAPS, CODE_FOR_avx5124fmaddps_4fmaddps, "__builtin_ia32_4fmaddps", IX86_BUILTIN_4FMAPS, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_V16SF_V16SF_PCV4SF)
+BDESC (OPTION_MASK_ISA_AVX5124FMAPS, CODE_FOR_avx5124fmaddps_4fmaddss, "__builtin_ia32_4fmaddss", IX86_BUILTIN_4FMASS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF_V4SF_V4SF_PCV4SF)
+BDESC (OPTION_MASK_ISA_AVX5124FMAPS, CODE_FOR_avx5124fmaddps_4fmaddss_mask, "__builtin_ia32_4fmaddss_mask", IX86_BUILTIN_4FMASS_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF_V4SF_V4SF_PCV4SF_V4SF_UQI)
+BDESC (OPTION_MASK_ISA_AVX5124FMAPS, CODE_FOR_avx5124fmaddps_4fnmaddps_mask, "__builtin_ia32_4fnmaddps_mask", IX86_BUILTIN_4FNMAPS_MASK, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_V16SF_V16SF_PCV4SF_V16SF_UHI)
+BDESC (OPTION_MASK_ISA_AVX5124FMAPS, CODE_FOR_avx5124fmaddps_4fnmaddps, "__builtin_ia32_4fnmaddps", IX86_BUILTIN_4FNMAPS, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_V16SF_V16SF_V16SF_PCV4SF)
+BDESC (OPTION_MASK_ISA_AVX5124FMAPS, CODE_FOR_avx5124fmaddps_4fnmaddss, "__builtin_ia32_4fnmaddss", IX86_BUILTIN_4FNMASS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF_V4SF_V4SF_PCV4SF)
+BDESC (OPTION_MASK_ISA_AVX5124FMAPS, CODE_FOR_avx5124fmaddps_4fnmaddss_mask, "__builtin_ia32_4fnmaddss_mask", IX86_BUILTIN_4FNMASS_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF_V4SF_V4SF_PCV4SF_V4SF_UQI)
+BDESC (OPTION_MASK_ISA_AVX5124VNNIW, CODE_FOR_avx5124vnniw_vp4dpwssd, "__builtin_ia32_vp4dpwssd", IX86_BUILTIN_4DPWSSD, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_V16SI_V16SI_PCV4SI)
+BDESC (OPTION_MASK_ISA_AVX5124VNNIW, CODE_FOR_avx5124vnniw_vp4dpwssd_mask, "__builtin_ia32_vp4dpwssd_mask", IX86_BUILTIN_4DPWSSD_MASK, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_V16SI_V16SI_PCV4SI_V16SI_UHI)
+BDESC (OPTION_MASK_ISA_AVX5124VNNIW, CODE_FOR_avx5124vnniw_vp4dpwssds, "__builtin_ia32_vp4dpwssds", IX86_BUILTIN_4DPWSSDS, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_V16SI_V16SI_PCV4SI)
+BDESC (OPTION_MASK_ISA_AVX5124VNNIW, CODE_FOR_avx5124vnniw_vp4dpwssds_mask, "__builtin_ia32_vp4dpwssds_mask", IX86_BUILTIN_4DPWSSDS_MASK, UNKNOWN, (int) V16SI_FTYPE_V16SI_V16SI_V16SI_V16SI_V16SI_PCV4SI_V16SI_UHI)
+
+BDESC_END (ARGS2, MPX)
/* Builtins for MPX. */
BDESC_FIRST (mpx, MPX,
static bool ix86_pragma_target_parse (tree, tree);
static void ix86_target_macros_internal
- (HOST_WIDE_INT, enum processor_type, enum processor_type, enum fpmath_unit,
+  (HOST_WIDE_INT, HOST_WIDE_INT, enum processor_type, enum processor_type,
+   enum fpmath_unit,
void (*def_or_undef) (cpp_reader *, const char *));
-\f
/* Internal function to either define or undef the appropriate system
macros. */
static void
ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
+ HOST_WIDE_INT isa_flag2,
enum processor_type arch,
enum processor_type tune,
enum fpmath_unit fpmath,
def_or_undef (parse_in, "__AVX512VBMI__");
if (isa_flag & OPTION_MASK_ISA_AVX512IFMA)
def_or_undef (parse_in, "__AVX512IFMA__");
+ if (isa_flag2 & OPTION_MASK_ISA_AVX5124VNNIW)
+ def_or_undef (parse_in, "__AVX5124VNNIW__");
+ if (isa_flag2 & OPTION_MASK_ISA_AVX5124FMAPS)
+ def_or_undef (parse_in, "__AVX5124FMAPS__");
if (isa_flag & OPTION_MASK_ISA_FMA)
def_or_undef (parse_in, "__FMA__");
if (isa_flag & OPTION_MASK_ISA_RTM)
HOST_WIDE_INT prev_isa;
HOST_WIDE_INT cur_isa;
HOST_WIDE_INT diff_isa;
+ HOST_WIDE_INT prev_isa2;
+ HOST_WIDE_INT cur_isa2;
+ HOST_WIDE_INT diff_isa2;
enum processor_type prev_arch;
enum processor_type prev_tune;
enum processor_type cur_arch;
prev_isa = prev_opt->x_ix86_isa_flags;
cur_isa = cur_opt->x_ix86_isa_flags;
diff_isa = (prev_isa ^ cur_isa);
+ prev_isa2 = prev_opt->x_ix86_isa_flags2;
+ cur_isa2 = cur_opt->x_ix86_isa_flags2;
+ diff_isa2 = (prev_isa2 ^ cur_isa2);
prev_arch = (enum processor_type) prev_opt->arch;
prev_tune = (enum processor_type) prev_opt->tune;
cur_arch = (enum processor_type) cur_opt->arch;
/* Undef all of the macros for that are no longer current. */
ix86_target_macros_internal (prev_isa & diff_isa,
+ prev_isa2 & diff_isa2,
prev_arch,
prev_tune,
(enum fpmath_unit) prev_opt->x_ix86_fpmath,
/* Define all of the macros for new options that were just turned on. */
ix86_target_macros_internal (cur_isa & diff_isa,
+ cur_isa2 & diff_isa2,
cur_arch,
cur_tune,
(enum fpmath_unit) cur_opt->x_ix86_fpmath,
cpp_define (parse_in, "__GCC_ASM_FLAG_OUTPUTS__");
ix86_target_macros_internal (ix86_isa_flags,
+ ix86_isa_flags2,
ix86_arch,
ix86_tune,
ix86_fpmath,
VECTOR_MODES (INT, 32); /* V32QI V16HI V8SI V4DI */
VECTOR_MODES (INT, 64); /* V64QI V32HI V16SI V8DI */
VECTOR_MODES (INT, 128); /* V128QI V64HI V32SI V16DI */
-VECTOR_MODES (FLOAT, 8); /* V4HF V2SF */
-VECTOR_MODES (FLOAT, 16); /* V8HF V4SF V2DF */
-VECTOR_MODES (FLOAT, 32); /* V16HF V8SF V4DF */
-VECTOR_MODES (FLOAT, 64); /* V32HF V16SF V8DF */
-VECTOR_MODES (FLOAT, 128); /* V64HF V32SF V16DF */
+VECTOR_MODES (FLOAT, 8); /* V2SF */
+VECTOR_MODES (FLOAT, 16); /* V4SF V2DF */
+VECTOR_MODES (FLOAT, 32); /* V8SF V4DF V2TF */
+VECTOR_MODES (FLOAT, 64); /* V16SF V8DF V4TF */
+VECTOR_MODES (FLOAT, 128); /* V32SF V16DF V8TF */
+VECTOR_MODES (FLOAT, 256); /* V64SF V32DF V16TF */
VECTOR_MODE (INT, TI, 1); /* V1TI */
VECTOR_MODE (INT, DI, 1); /* V1DI */
VECTOR_MODE (INT, SI, 1); /* V1SI */
VECTOR_MODE (INT, QI, 12); /* V12QI */
VECTOR_MODE (INT, QI, 14); /* V14QI */
VECTOR_MODE (INT, HI, 6); /* V6HI */
+VECTOR_MODE (INT, SI, 64); /* V64SI */
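+/* V64SF (from the FLOAT 256 row above) and V64SI model the groups of
+   four 512-bit registers used by the AVX5124FMAPS/AVX5124VNNIW insns.  */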
POINTER_BOUNDS_MODE (BND32, 8);
POINTER_BOUNDS_MODE (BND64, 16);
static void ix86_compute_frame_layout (struct ix86_frame *);
static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
rtx, rtx, int);
-static void ix86_add_new_builtins (HOST_WIDE_INT);
+static void ix86_add_new_builtins (HOST_WIDE_INT, HOST_WIDE_INT);
static tree ix86_canonical_va_list_type (tree);
static void predict_jump (int);
static unsigned int split_stack_prologue_scratch_regno (void);
IX86_FUNCTION_SPECIFIC_MAX
};
-static char *ix86_target_string (HOST_WIDE_INT, int, int, const char *,
- const char *, enum fpmath_unit, bool);
+static char *ix86_target_string (HOST_WIDE_INT, HOST_WIDE_INT, int, int,
+ const char *, const char *, enum fpmath_unit,
+ bool);
static void ix86_function_specific_save (struct cl_target_option *,
struct gcc_options *opts);
static void ix86_function_specific_restore (struct gcc_options *opts,
responsible for freeing the string. */
static char *
-ix86_target_string (HOST_WIDE_INT isa, int flags, int ix86_flags,
- const char *arch, const char *tune,
+ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2, int flags,
+ int ix86_flags, const char *arch, const char *tune,
enum fpmath_unit fpmath, bool add_nl_p)
{
struct ix86_target_opts
{ "-mclzero", OPTION_MASK_ISA_CLZERO },
{ "-mpku", OPTION_MASK_ISA_PKU },
};
-
+  /* Additional ISA options, kept in ix86_isa_flags2.  */
+ static struct ix86_target_opts isa_opts2[] =
+ {
+ { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW },
+ { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS },
+ };
/* Flag options. */
static struct ix86_target_opts flag_opts[] =
{
{ "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY },
};
- const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts)
- + ARRAY_SIZE (ix86_flag_opts) + 6][2];
+ const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa_opts2)
+ + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (ix86_flag_opts) + 6][2];
char isa_other[40];
char target_other[40];
isa);
}
+  /* Pick out the isa2 options.  */
+ for (i = 0; i < ARRAY_SIZE (isa_opts2); i++)
+ {
+ if ((isa2 & isa_opts2[i].mask) != 0)
+ {
+ opts[num++][0] = isa_opts2[i].option;
+	  isa2 &= ~isa_opts2[i].mask;
+ }
+ }
+
/* Add flag options. */
for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
{
void ATTRIBUTE_UNUSED
ix86_debug_options (void)
{
- char *opts = ix86_target_string (ix86_isa_flags, target_flags,
- ix86_target_flags,
- ix86_arch_string, ix86_tune_string,
+ char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2,
+ target_flags, ix86_target_flags,
+				   ix86_arch_string, ix86_tune_string,
ix86_fpmath, true);
if (opts)
#define PTA_CLZERO (HOST_WIDE_INT_1 << 57)
#define PTA_NO_80387 (HOST_WIDE_INT_1 << 58)
#define PTA_PKU (HOST_WIDE_INT_1 << 59)
+#define PTA_AVX5124VNNIW (HOST_WIDE_INT_1 << 60)
+#define PTA_AVX5124FMAPS (HOST_WIDE_INT_1 << 61)
#define PTA_CORE2 \
(PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 \
if (processor_alias_table[i].flags & PTA_AVX512IFMA
&& !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
+
+ if (processor_alias_table[i].flags & PTA_AVX5124VNNIW
+ && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124VNNIW))
+ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW;
+ if (processor_alias_table[i].flags & PTA_AVX5124FMAPS
+ && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124FMAPS))
+ opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS;
+
if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
x86_prefetch_sse = true;
if (processor_alias_table[i].flags & PTA_MWAITX
ptr->tune_defaulted = ix86_tune_defaulted;
ptr->arch_specified = ix86_arch_specified;
ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
+ ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit;
ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
ix86_tune_defaulted = ptr->tune_defaulted;
ix86_arch_specified = ptr->arch_specified;
opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
+ opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit;
opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
struct cl_target_option *ptr)
{
char *target_string
- = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
- ptr->x_ix86_target_flags, NULL, NULL,
- ptr->x_ix86_fpmath, false);
+ = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2,
+ ptr->x_target_flags, ptr->x_ix86_target_flags,
+ NULL, NULL, ptr->x_ix86_fpmath, false);
gcc_assert (ptr->arch < PROCESSOR_max);
fprintf (file, "%*sarch = %d (%s)\n",
IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq),
IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw),
IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl),
+ IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps),
+ IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw),
IX86_ATTR_ISA ("mmx", OPT_mmmx),
IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
The string options are attribute options, and will be undone
when we copy the save structure. */
if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
+ || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2
|| opts->x_target_flags != def->x_target_flags
|| option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
|| option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
| OPTION_MASK_ABI_64
| OPTION_MASK_ABI_X32
| OPTION_MASK_CODE16);
-
+      opts->x_ix86_isa_flags2 = 0;
}
else if (!orig_arch_specified)
opts->x_ix86_arch_string = NULL;
}
/* Add any builtin functions with the new isa if any. */
- ix86_add_new_builtins (opts->x_ix86_isa_flags);
+ ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2);
/* Save the current options unless we are validating options for
#pragma. */
/* Callee's ISA options should be a subset of the caller's, i.e. an SSE4
   function can inline an SSE2 function but an SSE2 function can't inline
   an SSE4 function.  */
- if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
- != callee_opts->x_ix86_isa_flags)
+  if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
+       != callee_opts->x_ix86_isa_flags)
+      || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2)
+	  != callee_opts->x_ix86_isa_flags2))
ret = false;
/* See if we have the same non-isa options. */
&& df_regs_ever_live_p (regno)));
}
+/* Return true if register class CL should be an additional allocno
+ class. */
+
+static bool
+ix86_additional_allocno_class_p (reg_class_t cl)
+{
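+  /* IRA does not form an allocno class for MOD4_SSE_REGS by itself;
+     reporting it here makes the mod-4 register groups behind the Yh
+     constraint available to the register allocator.  */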
+ return cl == MOD4_SSE_REGS;
+}
+
/* Return TRUE if we need to save REGNO. */
static bool
const char *name; /* function name */
enum ix86_builtin_func_type tcode; /* type to use in the declaration */
HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
+ HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */
bool const_p; /* true if the declaration is constant */
bool leaf_p; /* true if the declaration has leaf attribute */
bool nothrow_p; /* true if the declaration has nothrow attribute */
/* Bits that can still enable any inclusion of a builtin. */
static HOST_WIDE_INT deferred_isa_values = 0;
+static HOST_WIDE_INT deferred_isa_values2 = 0;
/* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
of which isa_flags to use in the ix86_builtins_isa array. Stores the
return decl;
}
+/* Like def_builtin, but for additional isa2 flags. */
+
+static inline tree
+def_builtin2 (HOST_WIDE_INT mask, const char *name,
+ enum ix86_builtin_func_type tcode,
+ enum ix86_builtins code)
+{
+ tree decl = NULL_TREE;
+
+ ix86_builtins_isa[(int) code].isa2 = mask;
+
+  if (mask == 0
+      || (mask & ix86_isa_flags2) != 0
+      || (lang_hooks.builtin_function
+	  == lang_hooks.builtin_function_ext_scope))
+    {
+ tree type = ix86_get_builtin_func_type (tcode);
+ decl = add_builtin_function (name, type, code, BUILT_IN_MD,
+ NULL, NULL_TREE);
+ ix86_builtins[(int) code] = decl;
+ ix86_builtins_isa[(int) code].set_and_not_built_p = false;
+ }
+ else
+ {
+ /* Just a MASK where set_and_not_built_p == true can potentially
+ include a builtin. */
+ deferred_isa_values2 |= mask;
+ ix86_builtins[(int) code] = NULL_TREE;
+ ix86_builtins_isa[(int) code].tcode = tcode;
+ ix86_builtins_isa[(int) code].name = name;
+ ix86_builtins_isa[(int) code].leaf_p = false;
+ ix86_builtins_isa[(int) code].nothrow_p = false;
+ ix86_builtins_isa[(int) code].const_p = false;
+ ix86_builtins_isa[(int) code].set_and_not_built_p = true;
+ }
+
+ return decl;
+}
+
+/* Like def_builtin2, but also marks the function decl "const".  */
+
+static inline tree
+def_builtin_const2 (HOST_WIDE_INT mask, const char *name,
+ enum ix86_builtin_func_type tcode, enum ix86_builtins code)
+{
+ tree decl = def_builtin2 (mask, name, tcode, code);
+ if (decl)
+ TREE_READONLY (decl) = 1;
+ else
+ ix86_builtins_isa[(int) code].const_p = true;
+
+ return decl;
+}
+
/* Add any new builtin functions for a given ISA that may not have been
declared. This saves a bit of space compared to adding all of the
declarations to the tree, even if we didn't use them. */
static void
-ix86_add_new_builtins (HOST_WIDE_INT isa)
+ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2)
{
- if ((isa & deferred_isa_values) == 0)
+ if (((isa & deferred_isa_values) == 0)
+ && ((isa2 & deferred_isa_values2) == 0))
return;
/* Bits in ISA value can be removed from potential isa values. */
deferred_isa_values &= ~isa;
+ deferred_isa_values2 &= ~isa2;
int i;
tree saved_current_target_pragma = current_target_pragma;
for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
{
- if ((ix86_builtins_isa[i].isa & isa) != 0
+      if (((ix86_builtins_isa[i].isa & isa) != 0
+	   || (ix86_builtins_isa[i].isa2 & isa2) != 0)
&& ix86_builtins_isa[i].set_and_not_built_p)
{
tree decl, type;
IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
IX86_BUILTIN__BDESC_ARGS_LAST, 1);
-BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_FIRST,
+BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_FIRST,
IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
+BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_FIRST,
+ IX86_BUILTIN__BDESC_ARGS2_LAST, 1);
BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
IX86_BUILTIN__BDESC_MPX_LAST, 1);
BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
IX86_BUILTIN__BDESC_ARGS_FIRST,
ARRAY_SIZE (bdesc_args) - 1);
+  /* Add all builtins with a variable number of operands.  */
+ for (i = 0, d = bdesc_args2;
+ i < ARRAY_SIZE (bdesc_args2);
+ i++, d++)
+ {
+ if (d->name == 0)
+ continue;
+
+ ftype = (enum ix86_builtin_func_type) d->flag;
+ def_builtin_const2 (d->mask, d->name, ftype, d->code);
+ }
+
/* Add all builtins with rounding. */
for (i = 0, d = bdesc_round_args;
i < ARRAY_SIZE (bdesc_round_args);
current ISA based on the command line switches. With function specific
options, we need to check in the context of the function making the call
whether it is supported. */
- if (ix86_builtins_isa[fcode].isa
- && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
+  if ((ix86_builtins_isa[fcode].isa
+       && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
+      || (ix86_builtins_isa[fcode].isa2
+	  && !(ix86_builtins_isa[fcode].isa2 & ix86_isa_flags2)))
{
- char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, 0,
+ char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa,
+ ix86_builtins_isa[fcode].isa2, 0, 0,
NULL, NULL, (enum fpmath_unit) 0,
false);
if (!opts)
}
}
+ if (fcode >= IX86_BUILTIN__BDESC_ARGS2_FIRST
+ && fcode <= IX86_BUILTIN__BDESC_ARGS2_LAST)
+ {
+ i = fcode - IX86_BUILTIN__BDESC_ARGS2_FIRST;
+ rtx (*fcn) (rtx, rtx, rtx, rtx);
+ rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
+ rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
+ rtx (*msk_mov) (rtx, rtx, rtx, rtx);
+ int masked = 1;
+ machine_mode mode, wide_mode, nar_mode;
+
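+      /* The defaults below cover the V16SF (4FMAPS) variants; the
+	 4VNNIW and scalar cases override them before expanding.  */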
+ nar_mode = V4SFmode;
+ mode = V16SFmode;
+ wide_mode = V64SFmode;
+ msk_mov = gen_avx512f_loadv16sf_mask;
+ fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
+ fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
+
+ switch (fcode)
+ {
+ case IX86_BUILTIN_4FMAPS:
+ fcn = gen_avx5124fmaddps_4fmaddps;
+ masked = 0;
+ goto v4fma_expand;
+
+ case IX86_BUILTIN_4DPWSSD:
+ nar_mode = V4SImode;
+ mode = V16SImode;
+ wide_mode = V64SImode;
+ fcn = gen_avx5124vnniw_vp4dpwssd;
+ masked = 0;
+ goto v4fma_expand;
+
+ case IX86_BUILTIN_4DPWSSDS:
+ nar_mode = V4SImode;
+ mode = V16SImode;
+ wide_mode = V64SImode;
+ fcn = gen_avx5124vnniw_vp4dpwssds;
+ masked = 0;
+ goto v4fma_expand;
+
+ case IX86_BUILTIN_4FNMAPS:
+ fcn = gen_avx5124fmaddps_4fnmaddps;
+ masked = 0;
+ goto v4fma_expand;
+
+ case IX86_BUILTIN_4FNMAPS_MASK:
+ fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
+ fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
+ goto v4fma_expand;
+
+ case IX86_BUILTIN_4DPWSSD_MASK:
+ nar_mode = V4SImode;
+ mode = V16SImode;
+ wide_mode = V64SImode;
+ fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
+ fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
+ msk_mov = gen_avx512f_loadv16si_mask;
+ goto v4fma_expand;
+
+ case IX86_BUILTIN_4DPWSSDS_MASK:
+ nar_mode = V4SImode;
+ mode = V16SImode;
+ wide_mode = V64SImode;
+ fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
+ fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
+ msk_mov = gen_avx512f_loadv16si_mask;
+ goto v4fma_expand;
+
+ case IX86_BUILTIN_4FMAPS_MASK:
+ {
+ tree args[4];
+ rtx ops[4];
+ rtx wide_reg;
+ rtx accum;
+ rtx addr;
+ rtx mem;
+
+v4fma_expand:
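+	  /* Pack the four 512-bit sources into one V64SF/V64SI pseudo so
+	     that they are allocated to a consecutive, mod-4-aligned group
+	     of registers (see the IMOD4 patterns and the Yh constraint
+	     in sse.md).  */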
+ wide_reg = gen_reg_rtx (wide_mode);
+ for (i = 0; i < 4; i++)
+ {
+ args[i] = CALL_EXPR_ARG (exp, i);
+ ops[i] = expand_normal (args[i]);
+
+ emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, (i) * 64),
+ ops[i]);
+ }
+
+ accum = expand_normal (CALL_EXPR_ARG (exp, 4));
+ accum = force_reg (mode, accum);
+
+ addr = expand_normal (CALL_EXPR_ARG (exp, 5));
+ addr = force_reg (Pmode, addr);
+
+ mem = gen_rtx_MEM (nar_mode, addr);
+
+ target = gen_reg_rtx (mode);
+
+ emit_move_insn (target, accum);
+
+ if (! masked)
+ emit_insn (fcn (target, accum, wide_reg, mem));
+ else
+ {
+ rtx merge, mask;
+ merge = expand_normal (CALL_EXPR_ARG (exp, 6));
+
+ mask = expand_normal (CALL_EXPR_ARG (exp, 7));
+
+ if (CONST_INT_P (mask))
+ mask = fixup_modeless_constant (mask, HImode);
+
+ mask = force_reg (HImode, mask);
+
+ if (GET_MODE (mask) != HImode)
+ mask = gen_rtx_SUBREG (HImode, mask, 0);
+
+ /* If merge is 0 then we're about to emit z-masked variant. */
+ if (const0_operand (merge, mode))
+ emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
+ /* If merge is the same as accum then emit merge-masked variant. */
+ else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
+ {
+ merge = force_reg (mode, merge);
+ emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
+ }
+	      /* Merging with an unknown value can happen when we
+		 z-mask at -O0.  */
+ else
+ {
+ rtx tmp = target;
+ emit_insn (fcn_mask (tmp, wide_reg, mem, tmp, mask));
+
+ target = force_reg (mode, merge);
+ emit_insn (msk_mov (target, tmp, target, mask));
+ }
+ }
+ return target;
+ }
+
+ case IX86_BUILTIN_4FNMASS:
+ fcn = gen_avx5124fmaddps_4fnmaddss;
+ masked = 0;
+ goto s4fma_expand;
+
+ case IX86_BUILTIN_4FMASS:
+ fcn = gen_avx5124fmaddps_4fmaddss;
+ masked = 0;
+ goto s4fma_expand;
+
+ case IX86_BUILTIN_4FNMASS_MASK:
+ fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
+ fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
+ msk_mov = gen_avx512vl_loadv4sf_mask;
+ goto s4fma_expand;
+
+ case IX86_BUILTIN_4FMASS_MASK:
+ {
+ tree args[4];
+ rtx ops[4];
+ rtx wide_reg;
+ rtx accum;
+ rtx addr;
+ rtx mem;
+
+ fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
+ fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
+ msk_mov = gen_avx512vl_loadv4sf_mask;
+
+s4fma_expand:
+ mode = V4SFmode;
+ wide_reg = gen_reg_rtx (V64SFmode);
+ for (i = 0; i < 4; i++)
+ {
+ rtx tmp;
+ args[i] = CALL_EXPR_ARG (exp, i);
+ ops[i] = expand_normal (args[i]);
+
+ tmp = gen_reg_rtx (SFmode);
+ emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
+
+ emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
+ gen_rtx_SUBREG (V16SFmode, tmp, 0));
+ }
+
+ accum = expand_normal (CALL_EXPR_ARG (exp, 4));
+ accum = force_reg (V4SFmode, accum);
+
+ addr = expand_normal (CALL_EXPR_ARG (exp, 5));
+ addr = force_reg (Pmode, addr);
+
+ mem = gen_rtx_MEM (V4SFmode, addr);
+
+ target = gen_reg_rtx (V4SFmode);
+
+ emit_move_insn (target, accum);
+
+ if (! masked)
+ emit_insn (fcn (target, accum, wide_reg, mem));
+ else
+ {
+ rtx merge, mask;
+ merge = expand_normal (CALL_EXPR_ARG (exp, 6));
+
+ mask = expand_normal (CALL_EXPR_ARG (exp, 7));
+
+ if (CONST_INT_P (mask))
+ mask = fixup_modeless_constant (mask, QImode);
+
+ mask = force_reg (QImode, mask);
+
+ if (GET_MODE (mask) != QImode)
+ mask = gen_rtx_SUBREG (QImode, mask, 0);
+
+ /* If merge is 0 then we're about to emit z-masked variant. */
+ if (const0_operand (merge, mode))
+ emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
+ /* If merge is the same as accum then emit merge-masked variant. */
+ else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
+ {
+ merge = force_reg (mode, merge);
+ emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
+ }
+	      /* Merging with an unknown value can happen when we
+		 z-mask at -O0.  */
+ else
+ {
+ rtx tmp = target;
+ emit_insn (fcn_mask (tmp, wide_reg, mem, tmp, mask));
+
+ target = force_reg (mode, merge);
+ emit_insn (msk_mov (target, tmp, target, mask));
+ }
+ }
+ return target;
+ }
+ default:
+ return ix86_expand_args_builtin (bdesc_args2 + i, exp, target);
+ }
+ }
+
if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
&& fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
{
opts = TREE_TARGET_OPTION (target_tree);
- if (ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
+  if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
+      || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2))
return ix86_builtin_decl (code, true);
else
return NULL_TREE;
|| VALID_AVX512F_SCALAR_MODE (mode)))
return true;
+  /* For AVX5124FMAPS and AVX5124VNNIW, allow the V64SF and V64SI
+     register-group modes on register numbers divisible by four.  */
+  if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
+      && MOD4_SSE_REGNO_P (regno)
+      && (mode == V64SFmode || mode == V64SImode))
+    return true;
+
/* TODO check for QI/HI scalars. */
/* AVX512VL allows sse regs16+ for 128/256 bit modes. */
if (TARGET_AVX512VL
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 1
+#undef TARGET_ADDITIONAL_ALLOCNO_CLASS_P
+#define TARGET_ADDITIONAL_ALLOCNO_CLASS_P ix86_additional_allocno_class_p
+
#undef TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID
#define TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID ix86_addr_space_zero_address_valid
#define TARGET_AVX512VBMI_P(x) TARGET_ISA_AVX512VBMI_P(x)
#define TARGET_AVX512IFMA TARGET_ISA_AVX512IFMA
#define TARGET_AVX512IFMA_P(x) TARGET_ISA_AVX512IFMA_P(x)
+#define TARGET_AVX5124FMAPS TARGET_ISA_AVX5124FMAPS
+#define TARGET_AVX5124FMAPS_P(x) TARGET_ISA_AVX5124FMAPS_P(x)
+#define TARGET_AVX5124VNNIW TARGET_ISA_AVX5124VNNIW
+#define TARGET_AVX5124VNNIW_P(x) TARGET_ISA_AVX5124VNNIW_P(x)
#define TARGET_FMA TARGET_ISA_FMA
#define TARGET_FMA_P(x) TARGET_ISA_FMA_P(x)
#define TARGET_SSE4A TARGET_ISA_SSE4A
#define HARD_REGNO_NREGS(REGNO, MODE) \
(STACK_REGNO_P (REGNO) || SSE_REGNO_P (REGNO) || MMX_REGNO_P (REGNO) \
|| MASK_REGNO_P (REGNO) || BND_REGNO_P (REGNO) \
- ? (COMPLEX_MODE_P (MODE) ? 2 : 1) \
+ ? (COMPLEX_MODE_P (MODE) ? 2 : \
+	   (((MODE) == V64SFmode || (MODE) == V64SImode) ? 4 : 1))	\
: ((MODE) == XFmode \
? (TARGET_64BIT ? 2 : 3) \
: ((MODE) == XCmode \
FLOAT_INT_SSE_REGS,
MASK_EVEX_REGS,
MASK_REGS,
+ MOD4_SSE_REGS,
ALL_REGS, LIM_REG_CLASSES
};
"FLOAT_INT_SSE_REGS", \
"MASK_EVEX_REGS", \
"MASK_REGS", \
+ "MOD4_SSE_REGS" \
"ALL_REGS" }
/* Define which registers fit in which classes. This is an initializer
{ 0x11ffff, 0x1fe0, 0x0 }, /* FLOAT_INT_REGS */ \
{ 0x1ff100ff,0xffffffe0, 0x1f }, /* INT_SSE_REGS */ \
{ 0x1ff1ffff,0xffffffe0, 0x1f }, /* FLOAT_INT_SSE_REGS */ \
- { 0x0, 0x0, 0x1fc0 }, /* MASK_EVEX_REGS */ \
+ { 0x0, 0x0, 0x1fc0 }, /* MASK_EVEX_REGS */ \
{ 0x0, 0x0, 0x1fe0 }, /* MASK_REGS */ \
-{ 0xffffffff,0xffffffff,0x1ffff } \
+{ 0x1fe00000,0xffffe000, 0x1f }, /* MOD4_SSE_REGS */ \
+{ 0xffffffff,0xffffffff,0x1ffff } \
}
/* The same information, inverted:
#define BND_REG_P(X) (REG_P (X) && BND_REGNO_P (REGNO (X)))
#define BND_REGNO_P(N) IN_RANGE ((N), FIRST_BND_REG, LAST_BND_REG)
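+/* SSE registers whose number is a multiple of four: the only registers
+   that can start the four-register groups used by 4FMAPS/4VNNIW.  */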
+#define MOD4_SSE_REG_P(X) (REG_P (X) && MOD4_SSE_REGNO_P (REGNO (X)))
+#define MOD4_SSE_REGNO_P(N) ((N) == XMM0_REG \
+ || (N) == XMM4_REG \
+ || (N) == XMM8_REG \
+ || (N) == XMM12_REG \
+ || (N) == XMM16_REG \
+ || (N) == XMM20_REG \
+ || (N) == XMM24_REG \
+ || (N) == XMM28_REG)
+
/* First floating point reg */
#define FIRST_FLOAT_REG FIRST_STACK_REG
#define STACK_TOP_P(X) (REG_P (X) && REGNO (X) == FIRST_FLOAT_REG)
Variable
HOST_WIDE_INT ix86_isa_flags = TARGET_64BIT_DEFAULT | TARGET_SUBTARGET_ISA_DEFAULT
+Variable
+HOST_WIDE_INT ix86_isa_flags2 = 0
+
; A mask of ix86_isa_flags that includes bit X if X was set or cleared
; on the command line.
Variable
HOST_WIDE_INT ix86_isa_flags_explicit
+Variable
+HOST_WIDE_INT ix86_isa_flags2_explicit
+
; Additional target flags
Variable
int ix86_target_flags
TargetSave
unsigned char branch_cost
+;; which flags were passed by the user
+TargetSave
+HOST_WIDE_INT x_ix86_isa_flags2_explicit
+
;; which flags were passed by the user
TargetSave
HOST_WIDE_INT x_ix86_isa_flags_explicit
Target Report Mask(ISA_AVX512VBMI) Var(ix86_isa_flags) Save
Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2 and AVX512F and AVX512VBMI built-in functions and code generation.
+mavx5124fmaps
+Target Report Mask(ISA_AVX5124FMAPS) Var(ix86_isa_flags2) Save
+Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX512F and AVX5124FMAPS built-in functions and code generation.
+
+mavx5124vnniw
+Target Report Mask(ISA_AVX5124VNNIW) Var(ix86_isa_flags2) Save
+Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX512F and AVX5124VNNIW built-in functions and code generation.
+
mfma
Target Report Mask(ISA_FMA) Var(ix86_isa_flags) Save
Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX and FMA built-in functions and code generation.
#include <avx512vbmivlintrin.h>
+#include <avx5124fmapsintrin.h>
+#include <avx5124vnniwintrin.h>
#include <shaintrin.h>
#include <lzcntintrin.h>
;; For AVX512VBMI support
UNSPEC_VPMULTISHIFT
+
+ ;; For AVX5124FMAPS/AVX5124VNNIW support
+ UNSPEC_VP4FMADD
+ UNSPEC_VP4FNMADD
+ UNSPEC_VP4DPWSSD
+ UNSPEC_VP4DPWSSDS
])
(define_c_enum "unspecv" [
[(set_attr "type" "sselog")
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
+
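+;; V64SF and V64SI do not correspond to a single hardware register; they
+;; model the group of four consecutive ZMM registers that v4fmadd*,
+;; v4fnmadd* and vp4dpwssd* read implicitly.  Only a register whose
+;; number is a multiple of four can start such a group (constraint Yh,
+;; class MOD4_SSE_REGS), and moves in these modes are split back into
+;; four V16SF/V16SI moves after reload.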
+(define_mode_iterator IMOD4
+ [(V64SF "TARGET_AVX5124FMAPS") (V64SI "TARGET_AVX5124VNNIW")])
+
+(define_mode_attr imod4_narrow
+ [(V64SF "V16SF") (V64SI "V16SI")])
+
+(define_insn "mov<mode>"
+  [(set (match_operand:IMOD4 0 "nonimmediate_operand" "=v")
+	(match_operand:IMOD4 1 "general_operand" "v"))]
+ "TARGET_AVX512F"
+ "#")
+
+(define_split
+ [(set (match_operand:IMOD4 0 "register_operand")
+ (match_operand:IMOD4 1 "nonimmediate_operand"))]
+ "TARGET_AVX512F && reload_completed"
+ [(set (subreg:<imod4_narrow> (match_dup 0) 0)
+ (subreg:<imod4_narrow> (match_dup 1) 0))
+ (set (subreg:<imod4_narrow> (match_dup 0) 64)
+ (subreg:<imod4_narrow> (match_dup 1) 64))
+ (set (subreg:<imod4_narrow> (match_dup 0) 128)
+ (subreg:<imod4_narrow> (match_dup 1) 128))
+ (set (subreg:<imod4_narrow> (match_dup 0) 192)
+ (subreg:<imod4_narrow> (match_dup 1) 192))])
+
+(define_insn "avx5124fmaddps_4fmaddps"
+ [(set (match_operand:V16SF 0 "register_operand" "=v")
+ (unspec:V16SF
+ [(match_operand:V16SF 1 "register_operand" "0")
+ (match_operand:V64SF 2 "register_operand" "Yh")
+ (match_operand:V4SF 3 "memory_operand" "m")] UNSPEC_VP4FMADD))]
+ "TARGET_AVX5124FMAPS"
+ "v4fmaddps\t{%3, %g2, %0|%0, %g2, %3}"
+ [(set_attr ("type") ("ssemuladd"))
+ (set_attr ("prefix") ("evex"))
+ (set_attr ("mode") ("V16SF"))])
+
+(define_insn "avx5124fmaddps_4fmaddps_mask"
+ [(set (match_operand:V16SF 0 "register_operand" "=v")
+ (vec_merge:V16SF
+ (unspec:V16SF
+ [(match_operand:V64SF 1 "register_operand" "Yh")
+ (match_operand:V4SF 2 "memory_operand" "m")] UNSPEC_VP4FMADD)
+ (match_operand:V16SF 3 "register_operand" "0")
+ (match_operand:HI 4 "register_operand" "Yk")))]
+ "TARGET_AVX5124FMAPS"
+ "v4fmaddps\t{%2, %g1, %0%{%4%}|%{%4%}%0, %g1, %2}"
+ [(set_attr ("type") ("ssemuladd"))
+ (set_attr ("prefix") ("evex"))
+ (set_attr ("mode") ("V16SF"))])
+
+(define_insn "avx5124fmaddps_4fmaddps_maskz"
+ [(set (match_operand:V16SF 0 "register_operand" "=v")
+ (vec_merge:V16SF
+ (unspec:V16SF
+ [(match_operand:V16SF 1 "register_operand" "0")
+ (match_operand:V64SF 2 "register_operand" "Yh")
+ (match_operand:V4SF 3 "memory_operand" "m")] UNSPEC_VP4FMADD)
+ (match_operand:V16SF 4 "const0_operand" "C")
+ (match_operand:HI 5 "register_operand" "Yk")))]
+ "TARGET_AVX5124FMAPS"
+ "v4fmaddps\t{%3, %g2, %0%{%5%}%{z%}|%{%5%}%{z%}%0, %g2, %3}"
+ [(set_attr ("type") ("ssemuladd"))
+ (set_attr ("prefix") ("evex"))
+ (set_attr ("mode") ("V16SF"))])
+
+(define_insn "avx5124fmaddps_4fmaddss"
+ [(set (match_operand:V4SF 0 "register_operand" "=v")
+ (unspec:V4SF
+ [(match_operand:V4SF 1 "register_operand" "0")
+ (match_operand:V64SF 2 "register_operand" "Yh")
+ (match_operand:V4SF 3 "memory_operand" "m")] UNSPEC_VP4FMADD))]
+ "TARGET_AVX5124FMAPS"
+ "v4fmaddss\t{%3, %x2, %0|%0, %x2, %3}"
+ [(set_attr ("type") ("ssemuladd"))
+ (set_attr ("prefix") ("evex"))
+ (set_attr ("mode") ("SF"))])
+
+(define_insn "avx5124fmaddps_4fmaddss_mask"
+ [(set (match_operand:V4SF 0 "register_operand" "=v")
+ (vec_merge:V4SF
+ (unspec:V4SF
+ [(match_operand:V64SF 1 "register_operand" "Yh")
+ (match_operand:V4SF 2 "memory_operand" "m")] UNSPEC_VP4FMADD)
+ (match_operand:V4SF 3 "register_operand" "0")
+ (match_operand:QI 4 "register_operand" "Yk")))]
+ "TARGET_AVX5124FMAPS"
+ "v4fmaddss\t{%2, %x1, %0%{%4%}|%{%4%}%0, %x1, %2}"
+ [(set_attr ("type") ("ssemuladd"))
+ (set_attr ("prefix") ("evex"))
+ (set_attr ("mode") ("SF"))])
+
+(define_insn "avx5124fmaddps_4fmaddss_maskz"
+ [(set (match_operand:V4SF 0 "register_operand" "=v")
+ (vec_merge:V4SF
+ (unspec:V4SF
+ [(match_operand:V4SF 1 "register_operand" "0")
+ (match_operand:V64SF 2 "register_operand" "Yh")
+ (match_operand:V4SF 3 "memory_operand" "m")] UNSPEC_VP4FMADD)
+ (match_operand:V4SF 4 "const0_operand" "C")
+ (match_operand:QI 5 "register_operand" "Yk")))]
+ "TARGET_AVX5124FMAPS"
+ "v4fmaddss\t{%3, %x2, %0%{%5%}%{z%}|%{%5%}%{z%}%0, %x2, %3}"
+ [(set_attr ("type") ("ssemuladd"))
+ (set_attr ("prefix") ("evex"))
+ (set_attr ("mode") ("SF"))])
+
+(define_insn "avx5124fmaddps_4fnmaddps"
+ [(set (match_operand:V16SF 0 "register_operand" "=v")
+ (unspec:V16SF
+ [(match_operand:V16SF 1 "register_operand" "0")
+ (match_operand:V64SF 2 "register_operand" "Yh")
+ (match_operand:V4SF 3 "memory_operand" "m")] UNSPEC_VP4FNMADD))]
+ "TARGET_AVX5124FMAPS"
+ "v4fnmaddps\t{%3, %g2, %0|%0, %g2, %3}"
+ [(set_attr ("type") ("ssemuladd"))
+ (set_attr ("prefix") ("evex"))
+ (set_attr ("mode") ("V16SF"))])
+
+(define_insn "avx5124fmaddps_4fnmaddps_mask"
+ [(set (match_operand:V16SF 0 "register_operand" "=v")
+ (vec_merge:V16SF
+ (unspec:V16SF
+ [(match_operand:V64SF 1 "register_operand" "Yh")
+ (match_operand:V4SF 2 "memory_operand" "m")] UNSPEC_VP4FNMADD)
+ (match_operand:V16SF 3 "register_operand" "0")
+ (match_operand:HI 4 "register_operand" "Yk")))]
+ "TARGET_AVX5124FMAPS"
+ "v4fnmaddps\t{%2, %g1, %0%{%4%}|%{%4%}%0, %g1, %2}"
+ [(set_attr ("type") ("ssemuladd"))
+ (set_attr ("prefix") ("evex"))
+ (set_attr ("mode") ("V16SF"))])
+
+(define_insn "avx5124fmaddps_4fnmaddps_maskz"
+ [(set (match_operand:V16SF 0 "register_operand" "=v")
+ (vec_merge:V16SF
+ (unspec:V16SF
+ [(match_operand:V16SF 1 "register_operand" "0")
+ (match_operand:V64SF 2 "register_operand" "Yh")
+ (match_operand:V4SF 3 "memory_operand" "m")] UNSPEC_VP4FNMADD)
+ (match_operand:V16SF 4 "const0_operand" "C")
+ (match_operand:HI 5 "register_operand" "Yk")))]
+ "TARGET_AVX5124FMAPS"
+ "v4fnmaddps\t{%3, %g2, %0%{%5%}%{z%}|%{%5%}%{z%}%0, %g2, %3}"
+ [(set_attr ("type") ("ssemuladd"))
+ (set_attr ("prefix") ("evex"))
+ (set_attr ("mode") ("V16SF"))])
+
+(define_insn "avx5124fmaddps_4fnmaddss"
+ [(set (match_operand:V4SF 0 "register_operand" "=v")
+ (unspec:V4SF
+ [(match_operand:V4SF 1 "register_operand" "0")
+ (match_operand:V64SF 2 "register_operand" "Yh")
+ (match_operand:V4SF 3 "memory_operand" "m")] UNSPEC_VP4FNMADD))]
+ "TARGET_AVX5124FMAPS"
+ "v4fnmaddss\t{%3, %x2, %0|%0, %x2, %3}"
+ [(set_attr ("type") ("ssemuladd"))
+ (set_attr ("prefix") ("evex"))
+ (set_attr ("mode") ("SF"))])
+
+(define_insn "avx5124fmaddps_4fnmaddss_mask"
+ [(set (match_operand:V4SF 0 "register_operand" "=v")
+ (vec_merge:V4SF
+ (unspec:V4SF
+ [(match_operand:V64SF 1 "register_operand" "Yh")
+ (match_operand:V4SF 2 "memory_operand" "m")] UNSPEC_VP4FNMADD)
+ (match_operand:V4SF 3 "register_operand" "0")
+ (match_operand:QI 4 "register_operand" "Yk")))]
+ "TARGET_AVX5124FMAPS"
+ "v4fnmaddss\t{%2, %x1, %0%{%4%}|%{%4%}%0, %x1, %2}"
+ [(set_attr ("type") ("ssemuladd"))
+ (set_attr ("prefix") ("evex"))
+ (set_attr ("mode") ("SF"))])
+
+(define_insn "avx5124fmaddps_4fnmaddss_maskz"
+ [(set (match_operand:V4SF 0 "register_operand" "=v")
+ (vec_merge:V4SF
+ (unspec:V4SF
+ [(match_operand:V4SF 1 "register_operand" "0")
+ (match_operand:V64SF 2 "register_operand" "Yh")
+ (match_operand:V4SF 3 "memory_operand" "m")] UNSPEC_VP4FNMADD)
+ (match_operand:V4SF 4 "const0_operand" "C")
+ (match_operand:QI 5 "register_operand" "Yk")))]
+ "TARGET_AVX5124FMAPS"
+ "v4fnmaddss\t{%3, %x2, %0%{%5%}%{z%}|%{%5%}%{z%}%0, %x2, %3}"
+ [(set_attr ("type") ("ssemuladd"))
+ (set_attr ("prefix") ("evex"))
+ (set_attr ("mode") ("SF"))])
+
+(define_insn "avx5124vnniw_vp4dpwssd"
+ [(set (match_operand:V16SI 0 "register_operand" "=v")
+ (unspec:V16SI
+ [(match_operand:V16SI 1 "register_operand" "0")
+ (match_operand:V64SI 2 "register_operand" "Yh")
+ (match_operand:V4SI 3 "memory_operand" "m")] UNSPEC_VP4DPWSSD))]
+ "TARGET_AVX5124VNNIW"
+ "vp4dpwssd\t{%3, %g2, %0|%0, %g2, %3}"
+ [(set_attr ("type") ("ssemuladd"))
+ (set_attr ("prefix") ("evex"))
+ (set_attr ("mode") ("TI"))])
+
+(define_insn "avx5124vnniw_vp4dpwssd_mask"
+ [(set (match_operand:V16SI 0 "register_operand" "=v")
+ (vec_merge:V16SI
+ (unspec:V16SI
+ [(match_operand:V64SI 1 "register_operand" "Yh")
+ (match_operand:V4SI 2 "memory_operand" "m")] UNSPEC_VP4DPWSSD)
+ (match_operand:V16SI 3 "register_operand" "0")
+ (match_operand:HI 4 "register_operand" "Yk")))]
+ "TARGET_AVX5124VNNIW"
+ "vp4dpwssd\t{%2, %g1, %0%{%4%}|%{%4%}%0, %g1, %2}"
+ [(set_attr ("type") ("ssemuladd"))
+ (set_attr ("prefix") ("evex"))
+ (set_attr ("mode") ("TI"))])
+
+(define_insn "avx5124vnniw_vp4dpwssd_maskz"
+ [(set (match_operand:V16SI 0 "register_operand" "=v")
+ (vec_merge:V16SI
+ (unspec:V16SI
+ [(match_operand:V16SI 1 "register_operand" "0")
+ (match_operand:V64SI 2 "register_operand" "Yh")
+ (match_operand:V4SI 3 "memory_operand" "m")] UNSPEC_VP4DPWSSD)
+ (match_operand:V16SI 4 "const0_operand" "C")
+ (match_operand:HI 5 "register_operand" "Yk")))]
+ "TARGET_AVX5124VNNIW"
+ "vp4dpwssd\t{%3, %g2, %0%{%5%}%{z%}|%{%5%}%{z%}%0, %g2, %3}"
+ [(set_attr ("type") ("ssemuladd"))
+ (set_attr ("prefix") ("evex"))
+ (set_attr ("mode") ("TI"))])
+
+(define_insn "avx5124vnniw_vp4dpwssds"
+ [(set (match_operand:V16SI 0 "register_operand" "=v")
+ (unspec:V16SI
+ [(match_operand:V16SI 1 "register_operand" "0")
+ (match_operand:V64SI 2 "register_operand" "Yh")
+ (match_operand:V4SI 3 "memory_operand" "m")] UNSPEC_VP4DPWSSDS))]
+ "TARGET_AVX5124VNNIW"
+ "vp4dpwssds\t{%3, %g2, %0|%0, %g2, %3}"
+ [(set_attr ("type") ("ssemuladd"))
+ (set_attr ("prefix") ("evex"))
+ (set_attr ("mode") ("TI"))])
+
+(define_insn "avx5124vnniw_vp4dpwssds_mask"
+ [(set (match_operand:V16SI 0 "register_operand" "=v")
+ (vec_merge:V16SI
+ (unspec:V16SI
+ [(match_operand:V64SI 1 "register_operand" "Yh")
+ (match_operand:V4SI 2 "memory_operand" "m")] UNSPEC_VP4DPWSSDS)
+ (match_operand:V16SI 3 "register_operand" "0")
+ (match_operand:HI 4 "register_operand" "Yk")))]
+ "TARGET_AVX5124VNNIW"
+ "vp4dpwssds\t{%2, %g1, %0%{%4%}|%{%4%}%0, %g1, %2}"
+ [(set_attr ("type") ("ssemuladd"))
+ (set_attr ("prefix") ("evex"))
+ (set_attr ("mode") ("TI"))])
+
+(define_insn "avx5124vnniw_vp4dpwssds_maskz"
+ [(set (match_operand:V16SI 0 "register_operand" "=v")
+ (vec_merge:V16SI
+ (unspec:V16SI
+ [(match_operand:V16SI 1 "register_operand" "0")
+ (match_operand:V64SI 2 "register_operand" "Yh")
+ (match_operand:V4SI 3 "memory_operand" "m")] UNSPEC_VP4DPWSSDS)
+ (match_operand:V16SI 4 "const0_operand" "C")
+ (match_operand:HI 5 "register_operand" "Yk")))]
+ "TARGET_AVX5124VNNIW"
+ "vp4dpwssds\t{%3, %g2, %0%{%5%}%{z%}|%{%5%}%{z%}%0, %g2, %3}"
+ [(set_attr ("type") ("ssemuladd"))
+ (set_attr ("prefix") ("evex"))
+ (set_attr ("mode") ("TI"))])
#else\n\
extern __inline__ __attribute__((__always_inline__, __gnu_inline__))\n\
#endif\n\
-unsigned char\n\
+unsigned short\n\
mode_size_inline (machine_mode mode)\n\
{\n\
- extern %sunsigned char mode_size[NUM_MACHINE_MODES];\n\
+ extern %sunsigned short mode_size[NUM_MACHINE_MODES];\n\
gcc_assert (mode >= 0 && mode < NUM_MACHINE_MODES);\n\
switch (mode)\n\
{\n", adj_bytesize ? "" : "const ");
int c;
struct mode_data *m;
- print_maybe_const_decl ("%sunsigned char", "mode_size",
+ print_maybe_const_decl ("%sunsigned short", "mode_size",
"NUM_MACHINE_MODES", bytesize);
for_all_modes (c, m)
int c;
struct mode_data *m;
- print_maybe_const_decl ("%sunsigned char",
+ print_maybe_const_decl ("%sunsigned short",
"mode_base_align", "NUM_MACHINE_MODES",
alignment);
bitmap_set_bit (already_genned, regno);
start_sequence ();
+ emit_clobber (reg);
emit_move_insn (reg, CONST0_RTX (GET_MODE (reg)));
move_insn = get_insns ();
end_sequence ();
/* Get the size in bytes and bits of an object of mode MODE. */
-extern CONST_MODE_SIZE unsigned char mode_size[NUM_MACHINE_MODES];
+extern CONST_MODE_SIZE unsigned short mode_size[NUM_MACHINE_MODES];
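+/* (Widened from unsigned char: the new V64SF/V64SI modes are 256 bytes
+   wide, which does not fit in eight bits.)  */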
#if GCC_VERSION >= 4001
#define GET_MODE_SIZE(MODE) \
((unsigned short) (__builtin_constant_p (MODE) \
/* Determine alignment, 1<=result<=BIGGEST_ALIGNMENT. */
-extern CONST_MODE_BASE_ALIGN unsigned char mode_base_align[NUM_MACHINE_MODES];
+extern CONST_MODE_BASE_ALIGN unsigned short mode_base_align[NUM_MACHINE_MODES];
extern unsigned get_mode_alignment (machine_mode);
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mclwb -mmwaitx -mclzero -mpku" } */
+/* { dg-options "-O -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx5124fmaps -mavx5124vnniw -mclwb -mmwaitx -mclzero -mpku" } */
/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
xopintrin.h, abmintrin.h, bmiintrin.h, tbmintrin.h, lwpintrin.h,
- popcntintrin.h, fmaintrin.h, pkuintrin.h and mm_malloc.h.h are usable with
+ popcntintrin.h, fmaintrin.h, pkuintrin.h, avx5124fmapsintrin.h,
+ avx5124vnniwintrin.h and mm_malloc.h are usable with
-O -pedantic-errors. */
#include <x86intrin.h>
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
-/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mclwb -mmwaitx -mclzero -mpku" } */
+/* { dg-options "-O -fkeep-inline-functions -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512dq -mavx512bw -mavx512vl -mavx512ifma -mavx512vbmi -mavx5124fmaps -mavx5124vnniw -mclwb -mmwaitx -mclzero -mpku" } */
/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h, fma4intrin.h,
xopintrin.h, abmintrin.h, bmiintrin.h, tbmintrin.h, lwpintrin.h,
- popcntintrin.h, fmaintrin.h, pkuintrin.h and mm_malloc.h are usable with
+ popcntintrin.h, fmaintrin.h, pkuintrin.h, avx5124fmapsintrin.h,
+ avx5124vnniwintrin.h and mm_malloc.h are usable with
-O -fkeep-inline-functions. */
#include <x86intrin.h>
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx5124fmaps" } */
+/* { dg-final { scan-assembler-times "v4fmaddps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "v4fmaddps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "v4fmaddps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
+
+#include <x86intrin.h>
+
+__m512 a, b, c, d, e, f, g, x1, x2, x3;
+__m128 *mem;
+__mmask16 m;
+
+void foo (void)
+{
+ x1 = _mm512_4fmadd_ps (a, b, c, d, e, mem);
+ x2 = _mm512_mask_4fmadd_ps (a, m, b, c, d, e, mem);
+ x3 = _mm512_maskz_4fmadd_ps (m, a, b, c, d, e, mem);
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx5124fmaps" } */
+/* { dg-require-effective-target avx5124fmaps } */
+
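+/* Loosen the FP tolerance: CALC builds the reference in double
+   precision while v4fmaddps works in single precision.  */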
+#define ESP_FLOAT 1.0
+
+#define AVX5124FMAPS
+#include "avx512f-helper.h"
+
+#define SIZE (AVX512F_LEN / 32)
+
+#include "avx512f-mask-type.h"
+
+void
+CALC (float *src1, float *src2, float *src3,
+      float *src4, float *prev_dst, float *mult, float *dst)
+{
+ int i;
+
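+ /* Reference model: each lane accumulates prev_dst plus the four
+    products srcN[i] * mult[N-1], computed in double.  */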
+ for (i = 0; i < SIZE; i++)
+ {
+ dst[i] = (double)prev_dst[i]
+ + (double)src1[i] * (double)mult[0]
+ + (double)src2[i] * (double)mult[1]
+ + (double)src3[i] * (double)mult[2]
+ + (double)src4[i] * (double)mult[3];
+ }
+}
+
+void
+TEST (void)
+{
+ int i, sign;
+ UNION_TYPE (AVX512F_LEN,) src1, src2, src3, src4, src5, dst, res1, res2, res3;
+ UNION_TYPE (128,) mult;
+ MASK_TYPE mask = MASK_VALUE;
+ float res_ref[SIZE];
+
+ sign = -1;
+ for (i = 0; i < SIZE; i++)
+ {
+ src1.a[i] = 1.5 + 34.67 * i * sign;
+ src2.a[i] = -22.17 * i * sign;
+ src3.a[i] = src1.a[i] * src1.a[i];
+ src4.a[i] = src2.a[i] * src2.a[i];
+ sign = sign * -1;
+ }
+ for (i = 0; i < 4; i++)
+ mult.a[i] = 3.1415 + i * 2.71828;
+
+ for (i = 0; i < SIZE; i++)
+ src5.a[i] = DEFAULT_VALUE;
+
+ CALC (src1.a, src2.a, src3.a, src4.a, src5.a, mult.a, res_ref);
+
+ res1.x = INTRINSIC (_4fmadd_ps) (src5.x, src1.x, src2.x, src3.x,
+                                  src4.x, &mult.x);
+ res2.x = INTRINSIC (_mask_4fmadd_ps) (src5.x, mask, src1.x, src2.x,
+                                       src3.x, src4.x, &mult.x);
+ res3.x = INTRINSIC (_maskz_4fmadd_ps) (mask, src5.x, src1.x, src2.x,
+                                        src3.x, src4.x, &mult.x);
+
+ if (UNION_FP_CHECK (AVX512F_LEN,) (res1, res_ref))
+ abort ();
+
+ MASK_MERGE () (res_ref, mask, SIZE);
+ if (UNION_FP_CHECK (AVX512F_LEN,) (res2, res_ref))
+ abort ();
+
+ MASK_ZERO () (res_ref, mask, SIZE);
+ if (UNION_FP_CHECK (AVX512F_LEN,) (res3, res_ref))
+ abort ();
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx5124fmaps" } */
+/* { dg-final { scan-assembler-times "v4fmaddss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "v4fmaddss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "v4fmaddss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
+
+#include <x86intrin.h>
+
+__m128 a, b, c, d, e, f, x1, x2, x3;
+__m128 *mem;
+__mmask8 m;
+
+void foo (void)
+{
+ x1 = _mm_4fmadd_ss (a, b, c, d, e, mem);
+ x2 = _mm_mask_4fmadd_ss (a, m, b, c, d, e, mem);
+ x3 = _mm_maskz_4fmadd_ss (m, a, b, c, d, e, mem);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx5124fmaps" } */
+/* { dg-final { scan-assembler-times "v4fnmaddps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "v4fnmaddps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "v4fnmaddps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
+
+#include <x86intrin.h>
+
+__m512 a, b, c, d, e, f, g, x1, x2, x3;
+__m128 *mem;
+__mmask16 m;
+
+void foo (void)
+{
+ x1 = _mm512_4fnmadd_ps (a, b, c, d, e, mem);
+ x2 = _mm512_mask_4fnmadd_ps (a, m, b, c, d, e, mem);
+ x3 = _mm512_maskz_4fnmadd_ps (m, a, b, c, d, e, mem);
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx5124fmaps" } */
+/* { dg-require-effective-target avx5124fmaps } */
+
+#define ESP_FLOAT 1.0
+
+#define AVX5124FMAPS
+#include "avx512f-helper.h"
+
+#define SIZE (AVX512F_LEN / 32)
+
+#include "avx512f-mask-type.h"
+
+void
+CALC (float *src1, float *src2, float *src3,
+      float *src4, float *prev_dst, float *mult, float *dst)
+{
+ int i;
+
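+ /* Reference model: as for v4fmaddps, but the four products are
+    subtracted (negated multiply-add).  */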
+ for (i = 0; i < SIZE; i++)
+ {
+ dst[i] = (double)prev_dst[i]
+ - (double)src1[i] * (double)mult[0]
+ - (double)src2[i] * (double)mult[1]
+ - (double)src3[i] * (double)mult[2]
+ - (double)src4[i] * (double)mult[3];
+ }
+}
+
+void
+TEST (void)
+{
+ int i, sign;
+ UNION_TYPE (AVX512F_LEN,) src1, src2, src3, src4, src5, dst, res1, res2, res3;
+ UNION_TYPE (128,) mult;
+ MASK_TYPE mask = MASK_VALUE;
+ float res_ref[SIZE];
+
+ sign = -1;
+ for (i = 0; i < SIZE; i++)
+ {
+ src1.a[i] = 1.5 + 34.67 * i * sign;
+ src2.a[i] = -22.17 * i * sign;
+ src3.a[i] = src1.a[i] * src1.a[i];
+ src4.a[i] = src2.a[i] * src2.a[i];
+ sign = sign * -1;
+ }
+ for (i = 0; i < 4; i++)
+ mult.a[i] = 3.1415 + i * 2.71828;
+
+ for (i = 0; i < SIZE; i++)
+ src5.a[i] = DEFAULT_VALUE;
+
+ CALC (src1.a, src2.a, src3.a, src4.a, src5.a, mult.a, res_ref);
+
+ res1.x = INTRINSIC (_4fnmadd_ps) (src5.x, src1.x, src2.x, src3.x,
+                                   src4.x, &mult.x);
+ res2.x = INTRINSIC (_mask_4fnmadd_ps) (src5.x, mask, src1.x, src2.x,
+                                        src3.x, src4.x, &mult.x);
+ res3.x = INTRINSIC (_maskz_4fnmadd_ps) (mask, src5.x, src1.x, src2.x,
+                                         src3.x, src4.x, &mult.x);
+
+ if (UNION_FP_CHECK (AVX512F_LEN,) (res1, res_ref))
+ abort ();
+
+ MASK_MERGE () (res_ref, mask, SIZE);
+ if (UNION_FP_CHECK (AVX512F_LEN,) (res2, res_ref))
+ abort ();
+
+ MASK_ZERO () (res_ref, mask, SIZE);
+ if (UNION_FP_CHECK (AVX512F_LEN,) (res3, res_ref))
+ abort ();
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx5124fmaps" } */
+/* { dg-final { scan-assembler-times "v4fnmaddss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "v4fnmaddss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "v4fnmaddss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
+
+#include <x86intrin.h>
+
+__m128 a, b, c, d, e, f, x1, x2, x3;
+__m128 *mem;
+__mmask8 m;
+
+void foo (void)
+{
+ x1 = _mm_4fnmadd_ss (a, b, c, d, e, mem);
+ x2 = _mm_mask_4fnmadd_ss (a, m, b, c, d, e, mem);
+ x3 = _mm_maskz_4fnmadd_ss (m, a, b, c, d, e, mem);
+}
--- /dev/null
+#include <stdlib.h>
+#include "cpuid.h"
+#include "m512-check.h"
+#include "avx512f-os-support.h"
+
+static void avx5124fmaps_test (void);
+
+static void __attribute__ ((noinline)) do_test (void)
+{
+ avx5124fmaps_test ();
+}
+
+int
+main ()
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ if (!__get_cpuid (1, &eax, &ebx, &ecx, &edx))
+ return 0;
+
+ /* Run the AVX512_4FMAPS test only if the host supports it.  OSXSAVE in
+    CPUID.1:ECX means XGETBV is usable, so avx512f_os_support () can
+    verify that the OS enables the AVX-512 state components.  */
+ if ((ecx & bit_OSXSAVE) == bit_OSXSAVE)
+ {
+ if (__get_cpuid_max (0, NULL) < 7)
+ return 0;
+
+ __cpuid_count (7, 0, eax, ebx, ecx, edx);
+
+ if (avx512f_os_support ()
+     && ((edx & bit_AVX5124FMAPS) == bit_AVX5124FMAPS))
+ {
+ do_test ();
+#ifdef DEBUG
+ printf ("PASSED\n");
+#endif
+ return 0;
+ }
+#ifdef DEBUG
+ printf ("SKIPPED\n");
+#endif
+ }
+#ifdef DEBUG
+ else
+ printf ("SKIPPED\n");
+#endif
+
+ return 0;
+}
--- /dev/null
+#include <stdlib.h>
+#include "cpuid.h"
+#include "m512-check.h"
+#include "avx512f-os-support.h"
+
+static void avx5124vnniw_test (void);
+
+static void __attribute__ ((noinline)) do_test (void)
+{
+ avx5124vnniw_test ();
+}
+
+int
+main ()
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ if (!__get_cpuid (1, &eax, &ebx, &ecx, &edx))
+ return 0;
+
+ /* Run the AVX512_4VNNIW test only if the host supports it.  OSXSAVE in
+    CPUID.1:ECX means XGETBV is usable, so avx512f_os_support () can
+    verify that the OS enables the AVX-512 state components.  */
+ if ((ecx & bit_OSXSAVE) == bit_OSXSAVE)
+ {
+ if (__get_cpuid_max (0, NULL) < 7)
+ return 0;
+
+ __cpuid_count (7, 0, eax, ebx, ecx, edx);
+
+ if (avx512f_os_support ()
+     && ((edx & bit_AVX5124VNNIW) == bit_AVX5124VNNIW))
+ {
+ do_test ();
+#ifdef DEBUG
+ printf ("PASSED\n");
+#endif
+ return 0;
+ }
+#ifdef DEBUG
+ printf ("SKIPPED\n");
+#endif
+ }
+#ifdef DEBUG
+ else
+ printf ("SKIPPED\n");
+#endif
+
+ return 0;
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx5124vnniw" } */
+/* { dg-final { scan-assembler-times "vp4dpwssd\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vp4dpwssd\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vp4dpwssd\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
+
+#include <x86intrin.h>
+
+__m512i a, b, c, d, e, f, g, x1, x2, x3;
+__m128i *mem;
+__mmask16 m;
+
+void foo (void)
+{
+ x1 = _mm512_4dpwssd_epi32 (a, b, c, d, e, mem);
+ x2 = _mm512_mask_4dpwssd_epi32 (a, m, b, c, d, e, mem);
+ x3 = _mm512_maskz_4dpwssd_epi32 (m, a, b, c, d, e, mem);
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx5124vnniw" } */
+/* { dg-require-effective-target avx5124vnniw } */
+
+#define AVX5124VNNIW
+#include "avx512f-helper.h"
+
+#define SIZE (AVX512F_LEN / 32)
+
+#include "avx512f-mask-type.h"
+
+void
+CALC (short *src1, short *src2, short *src3,
+      short *src4, int *prev_dst, short *mult, int *dst)
+{
+ int i;
+
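+ /* Reference model of vp4dpwssd: each dword lane accumulates four
+    word-pair dot products with ordinary wraparound arithmetic.  */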
+ for (i = 0; i < SIZE; i++)
+ {
+ int p1dword, p2dword;
+ dst[i] = prev_dst[i];
+ p1dword = (int)(src1[2*i ]) * (int)(mult[0]);
+ p2dword = (int)(src1[2*i+1]) * (int)(mult[1]);
+ dst[i] += p1dword + p2dword;
+
+ p1dword = (int)(src2[2*i ]) * (int)(mult[2]);
+ p2dword = (int)(src2[2*i+1]) * (int)(mult[3]);
+ dst[i] += p1dword + p2dword;
+
+ p1dword = (int)(src3[2*i ]) * (int)(mult[4]);
+ p2dword = (int)(src3[2*i+1]) * (int)(mult[5]);
+ dst[i] += p1dword + p2dword;
+
+ p1dword = (int)(src4[2*i ]) * (int)(mult[6]);
+ p2dword = (int)(src4[2*i+1]) * (int)(mult[7]);
+ dst[i] += p1dword + p2dword;
+ }
+}
+
+void
+TEST (void)
+{
+ int i;
+ UNION_TYPE (AVX512F_LEN, i_w) src1, src2, src3, src4;
+ UNION_TYPE (AVX512F_LEN, i_d) src5, dst, res1, res2, res3;
+ UNION_TYPE (128, i_w) mult;
+ MASK_TYPE mask = MASK_VALUE;
+ int res_ref[SIZE];
+
+ for (i = 0; i < SIZE * 2; i++)
+ {
+ src1.a[i] = 2 + 7 * i % 291;
+ src2.a[i] = 3 + 11 * (i % 377) * i;
+ src3.a[i] = src1.a[i] * src1.a[i];
+ src4.a[i] = src2.a[i] * src2.a[i];
+ }
+ for (i = 0; i < 8; i++)
+ mult.a[i] = 3 + i * 2;
+
+ for (i = 0; i < SIZE; i++)
+ src5.a[i] = DEFAULT_VALUE;
+
+ CALC (src1.a, src2.a, src3.a, src4.a, src5.a, mult.a, res_ref);
+
+ res1.x = INTRINSIC (_4dpwssd_epi32) (src5.x, src1.x, src2.x, src3.x,
+                                      src4.x, &mult.x);
+ res2.x = INTRINSIC (_mask_4dpwssd_epi32) (src5.x, mask, src1.x, src2.x,
+                                           src3.x, src4.x, &mult.x);
+ res3.x = INTRINSIC (_maskz_4dpwssd_epi32) (mask, src5.x, src1.x, src2.x,
+                                            src3.x, src4.x, &mult.x);
+
+ if (UNION_CHECK (AVX512F_LEN, i_d) (res1, res_ref))
+ abort ();
+
+ MASK_MERGE (i_d) (res_ref, mask, SIZE);
+ if (UNION_CHECK (AVX512F_LEN, i_d) (res2, res_ref))
+ abort ();
+
+ MASK_ZERO (i_d) (res_ref, mask, SIZE);
+ if (UNION_CHECK (AVX512F_LEN, i_d) (res3, res_ref))
+ abort ();
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx5124vnniw" } */
+/* { dg-final { scan-assembler-times "vp4dpwssds\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vp4dpwssds\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vp4dpwssds\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
+
+#include <x86intrin.h>
+
+__m512i a, b, c, d, e, f, g, x1, x2, x3;
+__m128i *mem;
+__mmask16 m;
+
+void foo (void)
+{
+ x1 = _mm512_4dpwssds_epi32 (a, b, c, d, e, mem);
+ x2 = _mm512_mask_4dpwssds_epi32 (a, m, b, c, d, e, mem);
+ x3 = _mm512_maskz_4dpwssds_epi32 (m, a, b, c, d, e, mem);
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx5124vnniw" } */
+/* { dg-require-effective-target avx5124vnniw } */
+
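+/* Start the accumulators near INT_MAX so vp4dpwssds actually exercises
+   its saturation path.  */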
+#define DEFAULT_VALUE 0x7ffffffe
+
+#define AVX5124VNNIW
+#include "avx512f-helper.h"
+
+#define SIZE (AVX512F_LEN / 32)
+
+#include "avx512f-mask-type.h"
+
+void
+CALC (short *src1, short *src2, short *src3,
+      short *src4, int *prev_dst, short *mult, int *dst)
+{
+ int i;
+
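+ /* Reference model of vp4dpwssds: as vp4dpwssd, but the running dword
+    sum saturates at INT_MAX after each of the four steps.  */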
+ for (i = 0; i < SIZE; i++)
+ {
+ int p1dword, p2dword;
+ long long int tmp;
+ dst[i] = prev_dst[i];
+ p1dword = (int)(src1[2*i ]) * (int)(mult[0]);
+ p2dword = (int)(src1[2*i+1]) * (int)(mult[1]);
+ tmp = (long long)dst[i] + p1dword + p2dword;
+ if (tmp > 0x7fffffff)
+ dst[i] = 0x7fffffff;
+ else
+ dst[i] += p1dword + p2dword;
+
+ p1dword = (int)(src2[2*i ]) * (int)(mult[2]);
+ p2dword = (int)(src2[2*i+1]) * (int)(mult[3]);
+ tmp = (long long)dst[i] + p1dword + p2dword;
+ if (tmp > 0x7fffffff)
+ dst[i] = 0x7fffffff;
+ else
+ dst[i] += p1dword + p2dword;
+
+ p1dword = (int)(src3[2*i ]) * (int)(mult[4]);
+ p2dword = (int)(src3[2*i+1]) * (int)(mult[5]);
+ tmp = (long long)dst[i] + p1dword + p2dword;
+ if (tmp > 0x7fffffff)
+ dst[i] = 0x7fffffff;
+ else
+ dst[i] += p1dword + p2dword;
+
+ p1dword = (int)(src4[2*i ]) * (int)(mult[6]);
+ p2dword = (int)(src4[2*i+1]) * (int)(mult[7]);
+ tmp = (long long)dst[i] + p1dword + p2dword;
+ if (tmp > 0x7fffffff)
+ dst[i] = 0x7fffffff;
+ else
+ dst[i] += p1dword + p2dword;
+ }
+}
+
+void
+TEST (void)
+{
+ int i;
+ UNION_TYPE (AVX512F_LEN, i_w) src1, src2, src3, src4;
+ UNION_TYPE (AVX512F_LEN, i_d) src5, dst, res1, res2, res3;
+ UNION_TYPE (128, i_w) mult;
+ MASK_TYPE mask = MASK_VALUE;
+ int res_ref[SIZE];
+
+ for (i = 0; i < SIZE * 2; i++)
+ {
+ src1.a[i] = 2 + 7 * i % 291;
+ src2.a[i] = 3 + 11 * (i % 377) * i;
+ src3.a[i] = src1.a[i] * src1.a[i];
+ src4.a[i] = src2.a[i] * src2.a[i];
+ }
+ for (i = 0; i < 8; i++)
+ mult.a[i] = 3 + i * 2;
+
+ for (i = 0; i < SIZE; i++)
+ src5.a[i] = DEFAULT_VALUE;
+
+ CALC (src1.a, src2.a, src3.a, src4.a, src5.a, mult.a, res_ref);
+
+ res1.x = INTRINSIC (_4dpwssds_epi32) (src5.x, src1.x, src2.x, src3.x,
+                                       src4.x, &mult.x);
+ res2.x = INTRINSIC (_mask_4dpwssds_epi32) (src5.x, mask, src1.x, src2.x,
+                                            src3.x, src4.x, &mult.x);
+ res3.x = INTRINSIC (_maskz_4dpwssds_epi32) (mask, src5.x, src1.x, src2.x,
+                                             src3.x, src4.x, &mult.x);
+
+ if (UNION_CHECK (AVX512F_LEN, i_d) (res1, res_ref))
+ abort ();
+
+ MASK_MERGE (i_d) (res_ref, mask, SIZE);
+ if (UNION_CHECK (AVX512F_LEN, i_d) (res2, res_ref))
+ abort ();
+
+ MASK_ZERO (i_d) (res_ref, mask, SIZE);
+ if (UNION_CHECK (AVX512F_LEN, i_d) (res3, res_ref))
+ abort ();
+}
#include "avx512ifma-check.h"
#elif defined (AVX512VBMI) && !defined (AVX512VL)
#include "avx512vbmi-check.h"
+#elif defined (AVX5124FMAPS) && !defined (AVX512VL)
+#include "avx5124fmaps-check.h"
+#elif defined (AVX5124VNNIW) && !defined (AVX512VL)
+#include "avx5124vnniw-check.h"
#elif defined (AVX512VL)
#include "avx512vl-check.h"
#endif
/* Value to be written into destination.
We have one value for all types so it must be small enough
to fit into signed char. */
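+/* Tests may override this by defining DEFAULT_VALUE before including
+   this file; the AVX5124VNNIW saturation test uses 0x7ffffffe.  */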
+#ifndef DEFAULT_VALUE
#define DEFAULT_VALUE 117
+#endif
#define MAKE_MASK_MERGE(NAME, TYPE) \
static void \
#elif defined (AVX512VBMI) && !defined (AVX512VL)
void
avx512vbmi_test (void) { test_512 (); }
+#elif defined (AVX5124FMAPS) && !defined (AVX512VL)
+void
+avx5124fmaps_test (void) { test_512 (); }
+#elif defined (AVX5124VNNIW) && !defined (AVX512VL)
+void
+avx5124vnniw_test (void) { test_512 (); }
#elif defined (AVX512VL)
void
avx512vl_test (void) { test_256 (); test_128 (); }
} "-mavx512vbmi" ]
}
+# Return 1 if avx512_4fmaps instructions can be compiled.
+proc check_effective_target_avx5124fmaps { } {
+ return [check_no_compiler_messages avx5124fmaps object {
+ typedef float __v16sf __attribute__ ((__vector_size__ (64)));
+ typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+ __v16sf
+ _mm512_mask_4fmadd_ps (__v16sf __DEST, __v16sf __A, __v16sf __B, __v16sf __C,
+ __v16sf __D, __v16sf __E, __v4sf *__F)
+ {
+ return (__v16sf) __builtin_ia32_4fmaddps_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__v16sf) __D,
+ (__v16sf) __E,
+ (const __v4sf *) __F,
+ (__v16sf) __DEST,
+ 0xffff);
+ }
+ } "-mavx5124fmaps" ]
+}
+
+# Return 1 if avx512_4vnniw instructions can be compiled.
+proc check_effective_target_avx5124vnniw { } {
+ return [check_no_compiler_messages avx5124vnniw object {
+ typedef int __v16si __attribute__ ((__vector_size__ (64)));
+ typedef int __v4si __attribute__ ((__vector_size__ (16)));
+
+ __v16si
+ _mm512_4dpwssd_epi32 (__v16si __A, __v16si __B, __v16si __C,
+ __v16si __D, __v16si __E, __v4si *__F)
+ {
+ return (__v16si) __builtin_ia32_vp4dpwssd ((__v16si) __B,
+ (__v16si) __C,
+ (__v16si) __D,
+ (__v16si) __E,
+ (__v16si) __A,
+ (const __v4si *) __F);
+ }
+ } "-mavx5124vnniw" ]
+}
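+# Run tests use these via { dg-require-effective-target avx5124fmaps }
+# and { dg-require-effective-target avx5124vnniw }.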
+
# If a testcase doesn't have special options, use these.
global DEFAULT_CFLAGS
if ![info exists DEFAULT_CFLAGS] then {
CHECK_EXP (union128, float, "%f")
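+/* Default FP comparison tolerances; tests may predefine looser ones,
+   as the AVX5124FMAPS run tests do with 1.0.  */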
+#ifndef ESP_FLOAT
#define ESP_FLOAT 0.000001
+#endif
+#ifndef ESP_DOUBLE
#define ESP_DOUBLE 0.000001
+#endif
#define CHECK_ARRAY(ARRAY, TYPE, FMT) \
static int \
__attribute__((noinline, unused)) \
popcntintrin.h and mm_malloc.h are usable
with -O -std=c89 -pedantic-errors. */
/* { dg-do compile } */
-/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512bw -mavx512dq -mavx512vl -mavx512vbmi -mavx512ifma -mclwb -mmwaitx -mclzero -mpku" } */
+/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512bw -mavx512dq -mavx512vl -mavx512vbmi -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mclwb -mmwaitx -mclzero -mpku" } */
#include <x86intrin.h>
/* { dg-do compile } */
-/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi -mavx512ifma -mclwb -mmwaitx -mclzero -mpku" } */
+/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -msse4a -m3dnow -mavx -mavx2 -mfma4 -mxop -maes -mpclmul -mpopcnt -mabm -mlzcnt -mbmi -mbmi2 -mtbm -mlwp -mfsgsbase -mrdrnd -mf16c -mfma -mrtm -mrdseed -mprfchw -madx -mfxsr -mxsaveopt -mavx512f -mavx512er -mavx512cd -mavx512pf -msha -mprefetchwt1 -mxsavec -mxsaves -mclflushopt -mavx512vl -mavx512dq -mavx512bw -mavx512vbmi -mavx512ifma -mavx5124fmaps -mavx5124vnniw -mclwb -mmwaitx -mclzero -mpku" } */
/* { dg-add-options bind_pic_locally } */
#include <mm_malloc.h>
/* Test that the intrinsics compile with optimization. All of them
are defined as inline functions in {,x,e,p,t,s,w,a,b,i}mmintrin.h,
mm3dnow.h, fma4intrin.h, xopintrin.h, abmintrin.h, bmiintrin.h,
- tbmintrin.h, lwpintrin.h, popcntintrin.h, fmaintrin.h and mm_malloc.h
+ tbmintrin.h, lwpintrin.h, popcntintrin.h, fmaintrin.h,
+ avx5124fmapsintrin.h, avx5124vnniwintrin.h and mm_malloc.h
that reference the proper builtin functions.
Defining away "extern" and "__inline" results in all of them being
#ifndef DIFFERENT_PRAGMAS
-#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512ifma")
+#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512ifma,avx5124fmaps,avx5124vnniw")
#endif
/* Following intrinsics require immediate arguments. They
/* Test that the intrinsics compile with optimization. All of them
are defined as inline functions in {,x,e,p,t,s,w,a,b,i}mmintrin.h,
mm3dnow.h, fma4intrin.h, xopintrin.h, abmintrin.h, bmiintrin.h,
- tbmintrin.h, lwpintrin.h, popcntintrin.h, fmaintrin.h and mm_malloc.h
+ tbmintrin.h, lwpintrin.h, popcntintrin.h, fmaintrin.h,
+ avx5124fmapsintrin.h, avx5124vnniwintrin.h and mm_malloc.h
that reference the proper builtin functions.
Defining away "extern" and "__inline" results in all of them being
#define __builtin_ia32_extracti64x2_256_mask(A, E, C, D) __builtin_ia32_extracti64x2_256_mask(A, 1, C, D)
#define __builtin_ia32_extractf64x2_256_mask(A, E, C, D) __builtin_ia32_extractf64x2_256_mask(A, 1, C, D)
-#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,xsavec,xsaves,clflushopt,avx512bw,avx512dq,avx512vl,avx512vbmi,avx512ifma,clwb,mwaitx,clzero,pku")
+#pragma GCC target ("sse4a,3dnow,avx,avx2,fma4,xop,aes,pclmul,popcnt,abm,lzcnt,bmi,bmi2,tbm,lwp,fsgsbase,rdrnd,f16c,fma,rtm,rdseed,prfchw,adx,fxsr,xsaveopt,avx512f,avx512er,avx512cd,avx512pf,sha,prefetchwt1,xsavec,xsaves,clflushopt,avx512bw,avx512dq,avx512vl,avx512vbmi,avx512ifma,avx5124fmaps,avx5124vnniw,clwb,mwaitx,clzero,pku")
#include <x86intrin.h>