From: Sunil K Pandey Date: Wed, 14 Oct 2020 18:36:39 +0000 (-0700) Subject: x86: Add missing intrinsics [PR95483] X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=93103603fd66a9fcf3ea2d8b52657e4b2496f544;p=gcc.git x86: Add missing intrinsics [PR95483] Tested on x86-64. gcc/ChangeLog: PR target/95483 * config/i386/avx2intrin.h (_mm_broadcastsi128_si256): New intrinsics. (_mm_broadcastsd_pd): Ditto. * config/i386/avx512bwintrin.h (_mm512_loadu_epi16): New intrinsics. (_mm512_storeu_epi16): Ditto. (_mm512_loadu_epi8): Ditto. (_mm512_storeu_epi8): Ditto. * config/i386/avx512dqintrin.h (_mm_reduce_round_sd): New intrinsics. (_mm_mask_reduce_round_sd): Ditto. (_mm_maskz_reduce_round_sd): Ditto. (_mm_reduce_round_ss): Ditto. (_mm_mask_reduce_round_ss): Ditto. (_mm_maskz_reduce_round_ss): Ditto. (_mm512_reduce_round_pd): Ditto. (_mm512_mask_reduce_round_pd): Ditto. (_mm512_maskz_reduce_round_pd): Ditto. (_mm512_reduce_round_ps): Ditto. (_mm512_mask_reduce_round_ps): Ditto. (_mm512_maskz_reduce_round_ps): Ditto. * config/i386/avx512erintrin.h (_mm_mask_rcp28_round_sd): New intrinsics. (_mm_maskz_rcp28_round_sd): Ditto. (_mm_mask_rcp28_round_ss): Ditto. (_mm_maskz_rcp28_round_ss): Ditto. (_mm_mask_rsqrt28_round_sd): Ditto. (_mm_maskz_rsqrt28_round_sd): Ditto. (_mm_mask_rsqrt28_round_ss): Ditto. (_mm_maskz_rsqrt28_round_ss): Ditto. (_mm_mask_rcp28_sd): Ditto. (_mm_maskz_rcp28_sd): Ditto. (_mm_mask_rcp28_ss): Ditto. (_mm_maskz_rcp28_ss): Ditto. (_mm_mask_rsqrt28_sd): Ditto. (_mm_maskz_rsqrt28_sd): Ditto. (_mm_mask_rsqrt28_ss): Ditto. (_mm_maskz_rsqrt28_ss): Ditto. * config/i386/avx512fintrin.h (_mm_mask_sqrt_sd): New intrinsics. (_mm_maskz_sqrt_sd): Ditto. (_mm_mask_sqrt_ss): Ditto. (_mm_maskz_sqrt_ss): Ditto. (_mm_mask_scalef_sd): Ditto. (_mm_maskz_scalef_sd): Ditto. (_mm_mask_scalef_ss): Ditto. (_mm_maskz_scalef_ss): Ditto. (_mm_mask_cvt_roundsd_ss): Ditto. (_mm_maskz_cvt_roundsd_ss): Ditto. (_mm_mask_cvt_roundss_sd): Ditto. (_mm_maskz_cvt_roundss_sd): Ditto. 
(_mm_mask_cvtss_sd): Ditto. (_mm_maskz_cvtss_sd): Ditto. (_mm_mask_cvtsd_ss): Ditto. (_mm_maskz_cvtsd_ss): Ditto. (_mm512_cvtsi512_si32): Ditto. (_mm_cvtsd_i32): Ditto. (_mm_cvtss_i32): Ditto. (_mm_cvti32_sd): Ditto. (_mm_cvti32_ss): Ditto. (_mm_cvtsd_i64): Ditto. (_mm_cvtss_i64): Ditto. (_mm_cvti64_sd): Ditto. (_mm_cvti64_ss): Ditto. * config/i386/avx512vlbwintrin.h (_mm256_storeu_epi8): New intrinsics. (_mm_storeu_epi8): Ditto. (_mm256_loadu_epi16): Ditto. (_mm_loadu_epi16): Ditto. (_mm256_loadu_epi8): Ditto. (_mm_loadu_epi8): Ditto. (_mm256_storeu_epi16): Ditto. (_mm_storeu_epi16): Ditto. * config/i386/avx512vlintrin.h (_mm256_load_epi64): New intrinsics. (_mm_load_epi64): Ditto. (_mm256_load_epi32): Ditto. (_mm_load_epi32): Ditto. (_mm256_store_epi32): Ditto. (_mm_store_epi32): Ditto. (_mm256_loadu_epi64): Ditto. (_mm_loadu_epi64): Ditto. (_mm256_loadu_epi32): Ditto. (_mm_loadu_epi32): Ditto. (_mm256_mask_cvt_roundps_ph): Ditto. (_mm256_maskz_cvt_roundps_ph): Ditto. (_mm_mask_cvt_roundps_ph): Ditto. (_mm_maskz_cvt_roundps_ph): Ditto. * config/i386/avxintrin.h (_mm256_cvtsi256_si32): New intrinsics. * config/i386/emmintrin.h (_mm_loadu_si32): New intrinsics. (_mm_loadu_si16): Ditto. (_mm_storeu_si32): Ditto. (_mm_storeu_si16): Ditto. * config/i386/i386-builtin-types.def (V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT): Add new type. (V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT): Ditto. (V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT): Ditto. (V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT): Ditto. * config/i386/i386-builtin.def (__builtin_ia32_cvtsd2ss_mask_round): New builtin. (__builtin_ia32_cvtss2sd_mask_round): Ditto. (__builtin_ia32_rcp28sd_mask_round): Ditto. (__builtin_ia32_rcp28ss_mask_round): Ditto. (__builtin_ia32_rsqrt28sd_mask_round): Ditto. (__builtin_ia32_rsqrt28ss_mask_round): Ditto. (__builtin_ia32_reducepd512_mask_round): Ditto. (__builtin_ia32_reduceps512_mask_round): Ditto. (__builtin_ia32_reducesd_mask_round): Ditto. (__builtin_ia32_reducess_mask_round): Ditto. 
* config/i386/i386-expand.c (ix86_expand_round_builtin): Expand round builtin for new type. (V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT) (V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT) (V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT) (V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT) * config/i386/mmintrin.h () Define datatype __m32 and __m16. Define datatype __m32_u and __m16_u. * config/i386/sse.md: Adjust pattern. (reducep): Adjust. (reduces): Ditto. (sse2_cvtsd2ss): Ditto. (sse2_cvtss2sd): Ditto. (avx512er_vmrcp28): Ditto. (avx512er_vmrsqrt28): Ditto. gcc/testsuite/ChangeLog: PR target/95483 * gcc.target/i386/avx-1.c: Add test. * gcc.target/i386/avx2-vbroadcastsi128-1.c: Ditto. * gcc.target/i386/avx2-vbroadcastsi128-2.c: Ditto. * gcc.target/i386/avx512bw-vmovdqu16-1.c: Ditto. * gcc.target/i386/avx512bw-vmovdqu8-1.c: Ditto. * gcc.target/i386/avx512dq-vreducesd-1.c: Ditto. * gcc.target/i386/avx512dq-vreducesd-2.c: Ditto. * gcc.target/i386/avx512dq-vreducess-1.c: Ditto. * gcc.target/i386/avx512dq-vreducess-2.c: Ditto. * gcc.target/i386/avx512er-vrcp28sd-1.c: Ditto. * gcc.target/i386/avx512er-vrcp28sd-2.c: Ditto. * gcc.target/i386/avx512er-vrcp28ss-1.c: Ditto. * gcc.target/i386/avx512er-vrcp28ss-2.c: Ditto. * gcc.target/i386/avx512er-vrsqrt28sd-1.c: Ditto. * gcc.target/i386/avx512er-vrsqrt28sd-2.c: Ditto. * gcc.target/i386/avx512er-vrsqrt28ss-1.c: Ditto. * gcc.target/i386/avx512er-vrsqrt28ss-2.c: Ditto. * gcc.target/i386/avx512f-vcvtsd2si-1.c: Ditto. * gcc.target/i386/avx512f-vcvtsd2si64-1.c: Ditto. * gcc.target/i386/avx512f-vcvtsd2ss-1.c: Ditto. * gcc.target/i386/avx512f-vcvtsi2sd64-1.c: Ditto. * gcc.target/i386/avx512f-vcvtsi2ss-1.c: Ditto. * gcc.target/i386/avx512f-vcvtsi2ss64-1.c: Ditto. * gcc.target/i386/avx512f-vcvtss2sd-1.c: Ditto. * gcc.target/i386/avx512f-vcvtss2si-1.c: Ditto. * gcc.target/i386/avx512f-vcvtss2si64-1.c: Ditto. * gcc.target/i386/avx512f-vscalefsd-1.c: Ditto. * gcc.target/i386/avx512f-vscalefsd-2.c: Ditto. * gcc.target/i386/avx512f-vscalefss-1.c: Ditto. 
* gcc.target/i386/avx512f-vscalefss-2.c: Ditto. * gcc.target/i386/avx512f-vsqrtsd-1.c: Ditto. * gcc.target/i386/avx512f-vsqrtsd-2.c: Ditto. * gcc.target/i386/avx512f-vsqrtss-1.c: Ditto. * gcc.target/i386/avx512f-vsqrtss-2.c: Ditto. * gcc.target/i386/avx512vl-vmovdqa32-1.c: Ditto. * gcc.target/i386/avx512vl-vmovdqa64-1.c: Ditto. * gcc.target/i386/sse-13.c: Ditto. * gcc.target/i386/sse-23.c: Ditto. * gcc.target/i386/avx512dq-vreducepd-3.c: New test. * gcc.target/i386/avx512dq-vreducepd-4.c: New test. * gcc.target/i386/avx512dq-vreduceps-3.c: New test. * gcc.target/i386/avx512dq-vreduceps-4.c: New test. * gcc.target/i386/avx512f-vcvtsi2sd-1.c: New test. * gcc.target/i386/pr95483-1.c: New test. * gcc.target/i386/pr95483-2.c: New test. * gcc.target/i386/pr95483-3.c: New test. * gcc.target/i386/pr95483-4.c: New test. * gcc.target/i386/pr95483-5.c: New test. * gcc.target/i386/pr95483-6.c: New test. * gcc.target/i386/pr95483-7.c: New test. --- diff --git a/gcc/config/i386/avx2intrin.h b/gcc/config/i386/avx2intrin.h index 6bf1f8c4333..e29c53200aa 100644 --- a/gcc/config/i386/avx2intrin.h +++ b/gcc/config/i386/avx2intrin.h @@ -950,6 +950,9 @@ _mm256_broadcastsi128_si256 (__m128i __X) return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X); } +#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X) +#define _mm_broadcastsd_pd(X) _mm_movedup_pd(X) + #ifdef __OPTIMIZE__ extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) diff --git a/gcc/config/i386/avx512bwintrin.h b/gcc/config/i386/avx512bwintrin.h index d19c1044471..3da05e15dd6 100644 --- a/gcc/config/i386/avx512bwintrin.h +++ b/gcc/config/i386/avx512bwintrin.h @@ -36,7 +36,11 @@ /* Internal data types for implementing the intrinsics. 
*/ typedef short __v32hi __attribute__ ((__vector_size__ (64))); +typedef short __v32hi_u __attribute__ ((__vector_size__ (64), \ + __may_alias__, __aligned__ (1))); typedef char __v64qi __attribute__ ((__vector_size__ (64))); +typedef char __v64qi_u __attribute__ ((__vector_size__ (64), \ + __may_alias__, __aligned__ (1))); typedef unsigned long long __mmask64; @@ -301,6 +305,13 @@ _mm512_maskz_mov_epi16 (__mmask32 __U, __m512i __A) (__mmask32) __U); } +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_loadu_epi16 (void const *__P) +{ + return (__m512i) (*(__v32hi_u *) __P); +} + extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_loadu_epi16 (__m512i __W, __mmask32 __U, void const *__P) @@ -320,6 +331,13 @@ _mm512_maskz_loadu_epi16 (__mmask32 __U, void const *__P) (__mmask32) __U); } +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_storeu_epi16 (void *__P, __m512i __A) +{ + *(__v32hi_u *) __P = (__v32hi_u) __A; +} + extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_storeu_epi16 (void *__P, __mmask32 __U, __m512i __A) @@ -380,6 +398,13 @@ _kunpackd_mask64 (__mmask32 __A, __mmask32 __B) (__mmask64) __B); } +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_loadu_epi8 (void const *__P) +{ + return (__m512i) (*(__v64qi_u *) __P); +} + extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_loadu_epi8 (__m512i __W, __mmask64 __U, void const *__P) @@ -399,6 +424,13 @@ _mm512_maskz_loadu_epi8 (__mmask64 __U, void const *__P) (__mmask64) __U); } +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_storeu_epi8 (void *__P, __m512i __A) +{ + *(__v64qi_u *) __P = (__v64qi_u) __A; +} + extern __inline void __attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_storeu_epi8 (void *__P, __mmask64 __U, __m512i __A) diff --git a/gcc/config/i386/avx512dqintrin.h b/gcc/config/i386/avx512dqintrin.h index d28dfab8e4e..fd61b70fa19 100644 --- a/gcc/config/i386/avx512dqintrin.h +++ b/gcc/config/i386/avx512dqintrin.h @@ -1166,6 +1166,17 @@ _mm_reduce_sd (__m128d __A, __m128d __B, int __C) (__mmask8) -1); } +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_round_sd (__m128d __A, __m128d __B, int __C, const int __R) +{ + return (__m128d) __builtin_ia32_reducesd_mask_round ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1, __R); +} + extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_reduce_sd (__m128d __W, __mmask8 __U, __m128d __A, @@ -1177,6 +1188,17 @@ _mm_mask_reduce_sd (__m128d __W, __mmask8 __U, __m128d __A, (__mmask8) __U); } +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, int __C, const int __R) +{ + return (__m128d) __builtin_ia32_reducesd_mask_round ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) __W, + __U, __R); +} + extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_reduce_sd (__mmask8 __U, __m128d __A, __m128d __B, int __C) @@ -1187,6 +1209,18 @@ _mm_maskz_reduce_sd (__mmask8 __U, __m128d __A, __m128d __B, int __C) (__mmask8) __U); } +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_reduce_round_sd (__mmask8 __U, __m128d __A, __m128d __B, + int __C, const int __R) +{ + return (__m128d) __builtin_ia32_reducesd_mask_round ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) + _mm_setzero_pd (), + __U, __R); +} + extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) _mm_reduce_ss (__m128 __A, __m128 __B, int __C) @@ -1197,6 +1231,16 @@ _mm_reduce_ss (__m128 __A, __m128 __B, int __C) (__mmask8) -1); } +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_round_ss (__m128 __A, __m128 __B, int __C, const int __R) +{ + return (__m128) __builtin_ia32_reducess_mask_round ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1, __R); +} extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) @@ -1209,6 +1253,17 @@ _mm_mask_reduce_ss (__m128 __W, __mmask8 __U, __m128 __A, (__mmask8) __U); } +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_round_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, int __C, const int __R) +{ + return (__m128) __builtin_ia32_reducess_mask_round ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) __W, + __U, __R); +} + extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_maskz_reduce_ss (__mmask8 __U, __m128 __A, __m128 __B, int __C) @@ -1219,6 +1274,18 @@ _mm_maskz_reduce_ss (__mmask8 __U, __m128 __A, __m128 __B, int __C) (__mmask8) __U); } +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_reduce_round_ss (__mmask8 __U, __m128 __A, __m128 __B, + int __C, const int __R) +{ + return (__m128) __builtin_ia32_reducess_mask_round ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) + _mm_setzero_ps (), + __U, __R); +} + extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_range_sd (__m128d __A, __m128d __B, int __C) @@ -1806,6 +1873,17 @@ _mm512_reduce_pd (__m512d __A, int __B) (__mmask8) -1); } +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_round_pd (__m512d __A, int __B, const int __R) +{ + return (__m512d) 
__builtin_ia32_reducepd512_mask_round ((__v8df) __A, + __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, __R); +} + extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_reduce_pd (__m512d __W, __mmask8 __U, __m512d __A, int __B) @@ -1815,6 +1893,17 @@ _mm512_mask_reduce_pd (__m512d __W, __mmask8 __U, __m512d __A, int __B) (__mmask8) __U); } +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + int __B, const int __R) +{ + return (__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df) __A, + __B, + (__v8df) __W, + __U, __R); +} + extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_reduce_pd (__mmask8 __U, __m512d __A, int __B) @@ -1825,6 +1914,18 @@ _mm512_maskz_reduce_pd (__mmask8 __U, __m512d __A, int __B) (__mmask8) __U); } +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_reduce_round_pd (__mmask8 __U, __m512d __A, int __B, + const int __R) +{ + return (__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df) __A, + __B, + (__v8df) + _mm512_setzero_pd (), + __U, __R); +} + extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_reduce_ps (__m512 __A, int __B) @@ -1835,6 +1936,17 @@ _mm512_reduce_ps (__m512 __A, int __B) (__mmask16) -1); } +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_round_ps (__m512 __A, int __B, const int __R) +{ + return (__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf) __A, + __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, __R); +} + extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_mask_reduce_ps (__m512 __W, __mmask16 __U, __m512 __A, int __B) @@ -1844,6 +1956,17 @@ _mm512_mask_reduce_ps 
(__m512 __W, __mmask16 __U, __m512 __A, int __B) (__mmask16) __U); } +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_round_ps (__m512 __W, __mmask16 __U, __m512 __A, int __B, + const int __R) +{ + return (__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf) __A, + __B, + (__v16sf) __W, + __U, __R); +} + extern __inline __m512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_maskz_reduce_ps (__mmask16 __U, __m512 __A, int __B) @@ -1854,6 +1977,18 @@ _mm512_maskz_reduce_ps (__mmask16 __U, __m512 __A, int __B) (__mmask16) __U); } +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_reduce_round_ps (__mmask16 __U, __m512 __A, int __B, + const int __R) +{ + return (__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf) __A, + __B, + (__v16sf) + _mm512_setzero_ps (), + __U, __R); +} + extern __inline __m256 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_extractf32x8_ps (__m512 __A, const int __imm) @@ -2440,26 +2575,50 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm) ((__m512d) __builtin_ia32_reducepd512_mask ((__v8df)(__m512d)(A), \ (int)(B), (__v8df)_mm512_setzero_pd (), (__mmask8)-1)) +#define _mm512_reduce_round_pd(A, B, R) \ + ((__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df)(__m512d)(A),\ + (int)(B), (__v8df)_mm512_setzero_pd (), (__mmask8)-1, (R))) + #define _mm512_mask_reduce_pd(W, U, A, B) \ ((__m512d) __builtin_ia32_reducepd512_mask ((__v8df)(__m512d)(A), \ (int)(B), (__v8df)(__m512d)(W), (__mmask8)(U))) +#define _mm512_mask_reduce_round_pd(W, U, A, B, R) \ + ((__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df)(__m512d)(A),\ + (int)(B), (__v8df)(__m512d)(W), (U), (R))) + #define _mm512_maskz_reduce_pd(U, A, B) \ ((__m512d) __builtin_ia32_reducepd512_mask ((__v8df)(__m512d)(A), \ (int)(B), (__v8df)_mm512_setzero_pd (), (__mmask8)(U))) +#define 
_mm512_maskz_reduce_round_pd(U, A, B, R) \ + ((__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df)(__m512d)(A),\ + (int)(B), (__v8df)_mm512_setzero_pd (), (U), (R))) + #define _mm512_reduce_ps(A, B) \ ((__m512) __builtin_ia32_reduceps512_mask ((__v16sf)(__m512)(A), \ (int)(B), (__v16sf)_mm512_setzero_ps (), (__mmask16)-1)) +#define _mm512_reduce_round_ps(A, B, R) \ + ((__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf)(__m512)(A),\ + (int)(B), (__v16sf)_mm512_setzero_ps (), (__mmask16)-1, (R))) + #define _mm512_mask_reduce_ps(W, U, A, B) \ ((__m512) __builtin_ia32_reduceps512_mask ((__v16sf)(__m512)(A), \ (int)(B), (__v16sf)(__m512)(W), (__mmask16)(U))) +#define _mm512_mask_reduce_round_ps(W, U, A, B, R) \ + ((__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf)(__m512)(A),\ + (int)(B), (__v16sf)(__m512)(W), (U), (R))) + #define _mm512_maskz_reduce_ps(U, A, B) \ ((__m512) __builtin_ia32_reduceps512_mask ((__v16sf)(__m512)(A), \ (int)(B), (__v16sf)_mm512_setzero_ps (), (__mmask16)(U))) +#define _mm512_maskz_reduce_round_ps(U, A, B, R) \ + ((__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf)(__m512)(A),\ + (int)(B), (__v16sf)_mm512_setzero_ps (), (__mmask16)(U), (R))) + #define _mm512_extractf32x8_ps(X, C) \ ((__m256) __builtin_ia32_extractf32x8_mask ((__v16sf)(__m512) (X), \ (int) (C), (__v8sf)(__m256) _mm256_setzero_ps (), (__mmask8)-1)) @@ -2679,6 +2838,20 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm) (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), \ (__mmask8)(U))) +#define _mm_reduce_round_sd(A, B, C, R) \ + ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), (__mmask8)-1, (int)(R))) + +#define _mm_mask_reduce_round_sd(W, U, A, B, C, R) \ + ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_reduce_round_sd(U, A, B, C, R) \ + ((__m128d)
__builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), \ + (__mmask8)(U), (int)(R))) + #define _mm_reduce_ss(A, B, C) \ ((__m128) __builtin_ia32_reducess_mask ((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), \ @@ -2693,6 +2866,19 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm) (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), \ (__mmask8)(U))) +#define _mm_reduce_round_ss(A, B, C, R) \ + ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), (__mmask8)-1, (int)(R))) + +#define _mm_mask_reduce_round_ss(W, U, A, B, C, R) \ + ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_reduce_round_ss(U, A, B, C, R) \ + ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), \ + (__mmask8)(U), (int)(R))) #endif diff --git a/gcc/config/i386/avx512erintrin.h b/gcc/config/i386/avx512erintrin.h index b9804c97a61..6ec8ee2d3cb 100644 --- a/gcc/config/i386/avx512erintrin.h +++ b/gcc/config/i386/avx512erintrin.h @@ -168,6 +168,30 @@ _mm_rcp28_round_sd (__m128d __A, __m128d __B, int __R) __R); } +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rcp28_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, int __R) +{ + return (__m128d) __builtin_ia32_rcp28sd_mask_round ((__v2df) __B, + (__v2df) __A, + (__v2df) __W, + __U, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rcp28_round_sd (__mmask8 __U, __m128d __A, __m128d __B, int __R) +{ + return (__m128d) __builtin_ia32_rcp28sd_mask_round ((__v2df) __B, + (__v2df) __A, + (__v2df) + _mm_setzero_pd (), + __U, + __R); +} + extern __inline __m128 __attribute__
((__gnu_inline__, __always_inline__, __artificial__)) _mm_rcp28_round_ss (__m128 __A, __m128 __B, int __R) @@ -177,6 +201,30 @@ _mm_rcp28_round_ss (__m128 __A, __m128 __B, int __R) __R); } +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rcp28_round_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, int __R) +{ + return (__m128) __builtin_ia32_rcp28ss_mask_round ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) __W, + __U, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rcp28_round_ss (__mmask8 __U, __m128 __A, __m128 __B, int __R) +{ + return (__m128) __builtin_ia32_rcp28ss_mask_round ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + __U, + __R); +} + extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_rsqrt28_round_pd (__m512d __A, int __R) @@ -242,6 +290,30 @@ _mm_rsqrt28_round_sd (__m128d __A, __m128d __B, int __R) __R); } +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rsqrt28_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, int __R) +{ + return (__m128d) __builtin_ia32_rsqrt28sd_mask_round ((__v2df) __B, + (__v2df) __A, + (__v2df) __W, + __U, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rsqrt28_round_sd (__mmask8 __U, __m128d __A, __m128d __B, int __R) +{ + return (__m128d) __builtin_ia32_rsqrt28sd_mask_round ((__v2df) __B, + (__v2df) __A, + (__v2df) + _mm_setzero_pd (), + __U, + __R); +} + extern __inline __m128 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_rsqrt28_round_ss (__m128 __A, __m128 __B, int __R) @@ -251,6 +323,30 @@ _mm_rsqrt28_round_ss (__m128 __A, __m128 __B, int __R) __R); } +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rsqrt28_round_ss 
(__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, int __R) +{ + return (__m128) __builtin_ia32_rsqrt28ss_mask_round ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) __W, + __U, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rsqrt28_round_ss (__mmask8 __U, __m128 __A, __m128 __B, int __R) +{ + return (__m128) __builtin_ia32_rsqrt28ss_mask_round ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + __U, + __R); +} + #else #define _mm512_exp2a23_round_pd(A, C) \ __builtin_ia32_exp2pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C) @@ -309,17 +405,69 @@ _mm_rsqrt28_round_ss (__m128 __A, __m128 __B, int __R) #define _mm_rcp28_round_sd(A, B, R) \ __builtin_ia32_rcp28sd_round(A, B, R) +#define _mm_mask_rcp28_round_sd(W, U, A, B, R) \ + __builtin_ia32_rcp28sd_mask_round ((A), (B), (W), (U), (R)) + +#define _mm_maskz_rcp28_round_sd(U, A, B, R) \ + __builtin_ia32_rcp28sd_mask_round ((A), (B), (__v2df) _mm_setzero_pd (), \ + (U), (R)) + #define _mm_rcp28_round_ss(A, B, R) \ __builtin_ia32_rcp28ss_round(A, B, R) +#define _mm_mask_rcp28_round_ss(W, U, A, B, R) \ + __builtin_ia32_rcp28ss_mask_round ((A), (B), (W), (U), (R)) + +#define _mm_maskz_rcp28_round_ss(U, A, B, R) \ + __builtin_ia32_rcp28ss_mask_round ((A), (B), (__v4sf) _mm_setzero_ps (), \ + (U), (R)) + #define _mm_rsqrt28_round_sd(A, B, R) \ __builtin_ia32_rsqrt28sd_round(A, B, R) +#define _mm_mask_rsqrt28_round_sd(W, U, A, B, R) \ + __builtin_ia32_rsqrt28sd_mask_round ((A), (B), (W), (U), (R)) + +#define _mm_maskz_rsqrt28_round_sd(U, A, B, R) \ + __builtin_ia32_rsqrt28sd_mask_round ((A), (B), (__v2df) _mm_setzero_pd (),\ + (U), (R)) + #define _mm_rsqrt28_round_ss(A, B, R) \ __builtin_ia32_rsqrt28ss_round(A, B, R) +#define _mm_mask_rsqrt28_round_ss(W, U, A, B, R) \ + __builtin_ia32_rsqrt28ss_mask_round ((A), (B), (W), (U), (R)) + +#define _mm_maskz_rsqrt28_round_ss(U, A, B, R) \ + __builtin_ia32_rsqrt28ss_mask_round ((A), (B), (__v4sf) 
_mm_setzero_ps (),\ + (U), (R)) + #endif +#define _mm_mask_rcp28_sd(W, U, A, B)\ + _mm_mask_rcp28_round_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_rcp28_sd(U, A, B)\ + _mm_maskz_rcp28_round_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_rcp28_ss(W, U, A, B)\ + _mm_mask_rcp28_round_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_rcp28_ss(U, A, B)\ + _mm_maskz_rcp28_round_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_rsqrt28_sd(W, U, A, B)\ + _mm_mask_rsqrt28_round_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_rsqrt28_sd(U, A, B)\ + _mm_maskz_rsqrt28_round_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_rsqrt28_ss(W, U, A, B)\ + _mm_mask_rsqrt28_round_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_rsqrt28_ss(U, A, B)\ + _mm_maskz_rsqrt28_round_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) + #define _mm512_exp2a23_pd(A) \ _mm512_exp2a23_round_pd(A, _MM_FROUND_CUR_DIRECTION) diff --git a/gcc/config/i386/avx512fintrin.h b/gcc/config/i386/avx512fintrin.h index 729d5686d68..6342fdebf91 100644 --- a/gcc/config/i386/avx512fintrin.h +++ b/gcc/config/i386/avx512fintrin.h @@ -2124,6 +2124,18 @@ _mm_maskz_sqrt_round_ss (__mmask8 __U, __m128 __A, __m128 __B, const int __R) (__v4sf) _mm_setzero_ps (), U, C) #endif +#define _mm_mask_sqrt_sd(W, U, A, B) \ + _mm_mask_sqrt_round_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_sqrt_sd(U, A, B) \ + _mm_maskz_sqrt_round_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_sqrt_ss(W, U, A, B) \ + _mm_mask_sqrt_round_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_sqrt_ss(U, A, B) \ + _mm_maskz_sqrt_round_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) + extern __inline __m512i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_cvtepi8_epi32 (__m128i __A) @@ -3259,6 +3271,18 @@ _mm_maskz_scalef_round_ss 
(__mmask8 __U, __m128 __A, __m128 __B, const int __R) (__v4sf)_mm_setzero_ps (), -1, C) #endif +#define _mm_mask_scalef_sd(W, U, A, B) \ + _mm_mask_scalef_round_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_scalef_sd(U, A, B) \ + _mm_maskz_scalef_round_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_scalef_ss(W, U, A, B) \ + _mm_mask_scalef_round_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_scalef_ss(U, A, B) \ + _mm_maskz_scalef_round_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) + #ifdef __OPTIMIZE__ extern __inline __m512d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) @@ -8621,6 +8645,30 @@ _mm_cvt_roundsd_ss (__m128 __A, __m128d __B, const int __R) __R); } +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvt_roundsd_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128d __B, const int __R) +{ + return (__m128) __builtin_ia32_cvtsd2ss_mask_round ((__v4sf) __A, + (__v2df) __B, + (__v4sf) __W, + __U, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvt_roundsd_ss (__mmask8 __U, __m128 __A, + __m128d __B, const int __R) +{ + return (__m128) __builtin_ia32_cvtsd2ss_mask_round ((__v4sf) __A, + (__v2df) __B, + _mm_setzero_ps (), + __U, + __R); +} + extern __inline __m128d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvt_roundss_sd (__m128d __A, __m128 __B, const int __R) @@ -8629,6 +8677,30 @@ _mm_cvt_roundss_sd (__m128d __A, __m128 __B, const int __R) (__v4sf) __B, __R); } + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvt_roundss_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128 __B, const int __R) +{ + return (__m128d) __builtin_ia32_cvtss2sd_mask_round ((__v2df) __A, + (__v4sf) __B, + (__v2df) __W, + __U, + __R); +} + +extern __inline __m128d +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvt_roundss_sd (__mmask8 __U, __m128d __A, + __m128 __B, const int __R) +{ + return (__m128d) __builtin_ia32_cvtss2sd_mask_round ((__v2df) __A, + (__v4sf) __B, + _mm_setzero_pd (), + __U, + __R); +} #else #define _mm512_cvt_roundpd_ps(A, B) \ (__m256)__builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)_mm256_undefined_ps(), -1, B) @@ -8642,10 +8714,37 @@ _mm_cvt_roundss_sd (__m128d __A, __m128 __B, const int __R) #define _mm_cvt_roundsd_ss(A, B, C) \ (__m128)__builtin_ia32_cvtsd2ss_round(A, B, C) +#define _mm_mask_cvt_roundsd_ss(W, U, A, B, C) \ + (__m128)__builtin_ia32_cvtsd2ss_mask_round ((A), (B), (W), (U), (C)) + +#define _mm_maskz_cvt_roundsd_ss(U, A, B, C) \ + (__m128)__builtin_ia32_cvtsd2ss_mask_round ((A), (B), _mm_setzero_ps (), \ + (U), (C)) + #define _mm_cvt_roundss_sd(A, B, C) \ (__m128d)__builtin_ia32_cvtss2sd_round(A, B, C) + +#define _mm_mask_cvt_roundss_sd(W, U, A, B, C) \ + (__m128d)__builtin_ia32_cvtss2sd_mask_round ((A), (B), (W), (U), (C)) + +#define _mm_maskz_cvt_roundss_sd(U, A, B, C) \ + (__m128d)__builtin_ia32_cvtss2sd_mask_round ((A), (B), _mm_setzero_pd (), \ + (U), (C)) + #endif +#define _mm_mask_cvtss_sd(W, U, A, B) \ + _mm_mask_cvt_roundss_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_cvtss_sd(U, A, B) \ + _mm_maskz_cvt_roundss_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_cvtsd_ss(W, U, A, B) \ + _mm_mask_cvt_roundsd_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_cvtsd_ss(U, A, B) \ + _mm_maskz_cvt_roundsd_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) + extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm512_stream_si512 (__m512i * __P, __m512i __A) @@ -14265,6 +14364,14 @@ _mm_cvttss_i64 (__m128 __A) } #endif /* __x86_64__ */ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtsi512_si32 (__m512i __A) +{ + __v16si __B = 
(__v16si) __A; + return __B[0]; +} + extern __inline unsigned __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtss_u32 (__m128 __A) @@ -14289,6 +14396,34 @@ _mm_cvttss_i32 (__m128 __A) _MM_FROUND_CUR_DIRECTION); } +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_i32 (__m128d __A) +{ + return (int) __builtin_ia32_cvtsd2si ((__v2df) __A); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_i32 (__m128 __A) +{ + return (int) __builtin_ia32_cvtss2si ((__v4sf) __A); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvti32_sd (__m128d __A, int __B) +{ + return (__m128d) __builtin_ia32_cvtsi2sd ((__v2df) __A, __B); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvti32_ss (__m128 __A, int __B) +{ + return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B); +} + #ifdef __x86_64__ extern __inline unsigned long long __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) @@ -14315,6 +14450,34 @@ _mm_cvttsd_i64 (__m128d __A) return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, _MM_FROUND_CUR_DIRECTION); } + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_i64 (__m128d __A) +{ + return (long long) __builtin_ia32_cvtsd2si64 ((__v2df) __A); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_i64 (__m128 __A) +{ + return (long long) __builtin_ia32_cvtss2si64 ((__v4sf) __A); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvti64_sd (__m128d __A, long long __B) +{ + return (__m128d) __builtin_ia32_cvtsi642sd ((__v2df) __A, __B); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvti64_ss (__m128 __A, 
long long __B) +{ + return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B); +} #endif /* __x86_64__ */ extern __inline unsigned diff --git a/gcc/config/i386/avx512vlbwintrin.h b/gcc/config/i386/avx512vlbwintrin.h index cd4275e0781..b4b1d7fb19e 100644 --- a/gcc/config/i386/avx512vlbwintrin.h +++ b/gcc/config/i386/avx512vlbwintrin.h @@ -34,6 +34,15 @@ #define __DISABLE_AVX512VLBW__ #endif /* __AVX512VLBW__ */ +/* Internal data types for implementing the intrinsics. */ +typedef short __v16hi_u __attribute__ ((__vector_size__ (32), \ + __may_alias__, __aligned__ (1))); +typedef short __v8hi_u __attribute__ ((__vector_size__ (16), \ + __may_alias__, __aligned__ (1))); +typedef char __v32qi_u __attribute__ ((__vector_size__ (32), \ + __may_alias__, __aligned__ (1))); +typedef char __v16qi_u __attribute__ ((__vector_size__ (16), \ + __may_alias__, __aligned__ (1))); extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) @@ -73,6 +82,13 @@ _mm_maskz_mov_epi8 (__mmask16 __U, __m128i __A) (__mmask16) __U); } +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu_epi8 (void *__P, __m256i __A) +{ + *(__v32qi_u *) __P = (__v32qi_u) __A; +} + extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_storeu_epi8 (void *__P, __mmask32 __U, __m256i __A) @@ -82,6 +98,13 @@ _mm256_mask_storeu_epi8 (void *__P, __mmask32 __U, __m256i __A) (__mmask32) __U); } +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_epi8 (void *__P, __m128i __A) +{ + *(__v16qi_u *) __P = (__v16qi_u) __A; +} + extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_storeu_epi8 (void *__P, __mmask16 __U, __m128i __A) @@ -91,6 +114,13 @@ _mm_mask_storeu_epi8 (void *__P, __mmask16 __U, __m128i __A) (__mmask16) __U); } +extern __inline __m256i +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm256_loadu_epi16 (void const *__P) +{ + return (__m256i) (*(__v16hi_u *) __P); +} + extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_loadu_epi16 (__m256i __W, __mmask16 __U, void const *__P) @@ -110,6 +140,13 @@ _mm256_maskz_loadu_epi16 (__mmask16 __U, void const *__P) (__mmask16) __U); } +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_epi16 (void const *__P) +{ + return (__m128i) (*(__v8hi_u *) __P); +} + extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_loadu_epi16 (__m128i __W, __mmask8 __U, void const *__P) @@ -168,6 +205,13 @@ _mm_maskz_mov_epi16 (__mmask8 __U, __m128i __A) (__mmask8) __U); } +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu_epi8 (void const *__P) +{ + return (__m256i) (*(__v32qi_u *) __P); +} + extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_loadu_epi8 (__m256i __W, __mmask32 __U, void const *__P) @@ -187,6 +231,13 @@ _mm256_maskz_loadu_epi8 (__mmask32 __U, void const *__P) (__mmask32) __U); } +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_epi8 (void const *__P) +{ + return (__m128i) (*(__v16qi_u *) __P); +} + extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_loadu_epi8 (__m128i __W, __mmask16 __U, void const *__P) @@ -3708,6 +3759,13 @@ _mm256_cmple_epu16_mask (__m256i __X, __m256i __Y) (__mmask16) -1); } +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu_epi16 (void *__P, __m256i __A) +{ + *(__v16hi_u *) __P = (__v16hi_u) __A; +} + extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_storeu_epi16 (void *__P, __mmask16 __U, 
__m256i __A) @@ -3717,6 +3775,13 @@ _mm256_mask_storeu_epi16 (void *__P, __mmask16 __U, __m256i __A) (__mmask16) __U); } +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_epi16 (void *__P, __m128i __A) +{ + *(__v8hi_u *) __P = (__v8hi_u) __A; +} + extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_storeu_epi16 (void *__P, __mmask8 __U, __m128i __A) diff --git a/gcc/config/i386/avx512vlintrin.h b/gcc/config/i386/avx512vlintrin.h index 7abd6018f4f..99666c7f9e0 100644 --- a/gcc/config/i386/avx512vlintrin.h +++ b/gcc/config/i386/avx512vlintrin.h @@ -36,6 +36,14 @@ /* Internal data types for implementing the intrinsics. */ typedef unsigned int __mmask32; +typedef int __v4si_u __attribute__ ((__vector_size__ (16), \ + __may_alias__, __aligned__ (1))); +typedef int __v8si_u __attribute__ ((__vector_size__ (32), \ + __may_alias__, __aligned__ (1))); +typedef long long __v2di_u __attribute__ ((__vector_size__ (16), \ + __may_alias__, __aligned__ (1))); +typedef long long __v4di_u __attribute__ ((__vector_size__ (32), \ + __may_alias__, __aligned__ (1))); extern __inline __m256d __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) @@ -263,6 +271,13 @@ _mm_maskz_mov_epi64 (__mmask8 __U, __m128i __A) (__mmask8) __U); } +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_load_epi64 (void const *__P) +{ + return (__m256i) (*(__v4di *) __P); +} + extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_load_epi64 (__m256i __W, __mmask8 __U, void const *__P) @@ -284,6 +299,13 @@ _mm256_maskz_load_epi64 (__mmask8 __U, void const *__P) __U); } +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_epi64 (void const *__P) +{ + return (__m128i) (*(__v2di *) __P); +} + extern __inline __m128i __attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) _mm_mask_load_epi64 (__m128i __W, __mmask8 __U, void const *__P) @@ -361,6 +383,13 @@ _mm_maskz_mov_epi32 (__mmask8 __U, __m128i __A) (__mmask8) __U); } +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_load_epi32 (void const *__P) +{ + return (__m256i) (*(__v8si *) __P); +} + extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_load_epi32 (__m256i __W, __mmask8 __U, void const *__P) @@ -382,6 +411,13 @@ _mm256_maskz_load_epi32 (__mmask8 __U, void const *__P) __U); } +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_epi32 (void const *__P) +{ + return (__m128i) (*(__v4si *) __P); +} + extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_load_epi32 (__m128i __W, __mmask8 __U, void const *__P) @@ -403,6 +439,13 @@ _mm_maskz_load_epi32 (__mmask8 __U, void const *__P) __U); } +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_store_epi32 (void *__P, __m256i __A) +{ + *(__v8si *) __P = (__v8si) __A; +} + extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_store_epi32 (void *__P, __mmask8 __U, __m256i __A) @@ -412,6 +455,13 @@ _mm256_mask_store_epi32 (void *__P, __mmask8 __U, __m256i __A) (__mmask8) __U); } +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_epi32 (void *__P, __m128i __A) +{ + *(__v4si *) __P = (__v4si) __A; +} + extern __inline void __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_store_epi32 (void *__P, __mmask8 __U, __m128i __A) @@ -717,6 +767,13 @@ _mm_mask_storeu_ps (void *__P, __mmask8 __U, __m128 __A) (__mmask8) __U); } +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu_epi64 (void 
const *__P) +{ + return (__m256i) (*(__v4di_u *) __P); +} + extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_loadu_epi64 (__m256i __W, __mmask8 __U, void const *__P) @@ -736,6 +793,13 @@ _mm256_maskz_loadu_epi64 (__mmask8 __U, void const *__P) (__mmask8) __U); } +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_epi64 (void const *__P) +{ + return (__m128i) (*(__v2di_u *) __P); +} + extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_loadu_epi64 (__m128i __W, __mmask8 __U, void const *__P) @@ -787,6 +851,13 @@ _mm_mask_storeu_epi64 (void *__P, __mmask8 __U, __m128i __A) (__mmask8) __U); } +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu_epi32 (void const *__P) +{ + return (__m256i) (*(__v8si_u *) __P); +} + extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_mask_loadu_epi32 (__m256i __W, __mmask8 __U, void const *__P) @@ -806,6 +877,13 @@ _mm256_maskz_loadu_epi32 (__mmask8 __U, void const *__P) (__mmask8) __U); } +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_epi32 (void const *__P) +{ + return (__m128i) (*(__v4si_u *) __P); +} + extern __inline __m128i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm_mask_loadu_epi32 (__m128i __W, __mmask8 __U, void const *__P) @@ -13730,6 +13808,13 @@ _mm256_permutex_pd (__m256d __X, const int __M) #endif #define _mm256_permutexvar_ps(A, B) _mm256_permutevar8x32_ps ((B), (A)) +#define _mm256_mask_cvt_roundps_ph(A, B, C, D) \ + _mm256_mask_cvtps_ph ((A), (B), (C), (D)) +#define _mm256_maskz_cvt_roundps_ph(A, B, C) \ + _mm256_maskz_cvtps_ph ((A), (B), (C)) +#define _mm_mask_cvt_roundps_ph(A, B, C, D) \ + _mm_mask_cvtps_ph ((A), (B), (C), (D)) +#define _mm_maskz_cvt_roundps_ph(A, B, C) 
_mm_maskz_cvtps_ph ((A), (B), (C)) #ifdef __DISABLE_AVX512VL__ #undef __DISABLE_AVX512VL__ diff --git a/gcc/config/i386/avxintrin.h b/gcc/config/i386/avxintrin.h index 22b2baeb17b..fd5cf6ad53a 100644 --- a/gcc/config/i386/avxintrin.h +++ b/gcc/config/i386/avxintrin.h @@ -444,6 +444,13 @@ _mm_cmp_ss (__m128 __X, __m128 __Y, const int __P) (__v4sf)(__m128)(Y), (int)(P))) #endif +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtsi256_si32 (__m256i __A) +{ + __v8si __B = (__v8si) __A; + return __B[0]; +} + extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm256_cvtepi32_pd (__m128i __A) { diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h index 545d3bfc2af..8ff240eef6d 100644 --- a/gcc/config/i386/emmintrin.h +++ b/gcc/config/i386/emmintrin.h @@ -715,6 +715,19 @@ _mm_loadu_si64 (void const *__P) return _mm_loadl_epi64 ((__m128i_u *)__P); } +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_si32 (void const *__P) +{ + return _mm_set_epi32 (0, 0, 0, (*(__m32_u *)__P)[0]); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_si16 (void const *__P) +{ + return _mm_set_epi16 (0, 0, 0, 0, + 0, 0, 0, (*(__m16_u *)__P)[0]); +} + extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_store_si128 (__m128i *__P, __m128i __B) { @@ -739,6 +752,18 @@ _mm_storeu_si64 (void *__P, __m128i __B) _mm_storel_epi64 ((__m128i_u *)__P, __B); } +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_si32 (void *__P, __m128i __B) +{ + *(__m32_u *)__P = (__m32) ((__v4si)__B)[0]; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_si16 (void *__P, __m128i __B) +{ + *(__m16_u *)__P = (__m16) 
((__v8hi)__B)[0]; +} + extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movepi64_pi64 (__m128i __B) { diff --git a/gcc/config/i386/i386-builtin-types.def b/gcc/config/i386/i386-builtin-types.def index 1adf7c44f4a..ff2fa3f5b9d 100644 --- a/gcc/config/i386/i386-builtin-types.def +++ b/gcc/config/i386/i386-builtin-types.def @@ -443,6 +443,7 @@ DEF_FUNCTION_TYPE (V8DF, V8DF, V8DF, INT) DEF_FUNCTION_TYPE (V8DF, V8DF, V8DF, INT, V8DF, UQI) DEF_FUNCTION_TYPE (V8DF, V8DF, V8DF, INT, V8DF, QI, INT) DEF_FUNCTION_TYPE (V8DF, V8DF, INT, V8DF, UQI) +DEF_FUNCTION_TYPE (V8DF, V8DF, INT, V8DF, UQI, INT) DEF_FUNCTION_TYPE (V8DF, V8DF, V8DF, V8DI, INT) DEF_FUNCTION_TYPE (V4DF, V4DF, V4DF, V4DI, INT, UQI) DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, V2DI, INT, UQI) @@ -452,6 +453,7 @@ DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, INT) DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, INT, V16SF, UHI) DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, INT, V16SF, HI, INT) DEF_FUNCTION_TYPE (V16SF, V16SF, INT, V16SF, UHI) +DEF_FUNCTION_TYPE (V16SF, V16SF, INT, V16SF, UHI, INT) DEF_FUNCTION_TYPE (V16SI, V16SI, V4SI, INT, V16SI, UHI) DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, V16SI, INT) DEF_FUNCTION_TYPE (V16SF, V16SF, V16SF, V16SI, INT, HI, INT) @@ -1026,8 +1028,10 @@ DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, V2DF, UQI, INT) DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, V4SF, UQI, INT) DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, V4SF, QI, INT) DEF_FUNCTION_TYPE (V4SF, V4SF, V2DF, V4SF, QI, INT) +DEF_FUNCTION_TYPE (V4SF, V4SF, V2DF, V4SF, UQI, INT) DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, V2DF, QI, INT) DEF_FUNCTION_TYPE (V2DF, V2DF, V4SF, V2DF, QI, INT) +DEF_FUNCTION_TYPE (V2DF, V2DF, V4SF, V2DF, UQI, INT) DEF_FUNCTION_TYPE (V2DF, V2DF, V2DF, V2DF, INT) DEF_FUNCTION_TYPE (V4SF, V4SF, V4SF, V4SF, INT) diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index fec5cef0b55..e6deaa2c5c2 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ 
-2772,10 +2772,12 @@ BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_fix_notruncv16sfv16si_mask_r BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_cvtps2pd512_mask_round, "__builtin_ia32_cvtps2pd512_mask", IX86_BUILTIN_CVTPS2PD512, UNKNOWN, (int) V8DF_FTYPE_V8SF_V8DF_QI_INT) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_ufix_notruncv16sfv16si_mask_round, "__builtin_ia32_cvtps2udq512_mask", IX86_BUILTIN_CVTPS2UDQ512, UNKNOWN, (int) V16SI_FTYPE_V16SF_V16SI_HI_INT) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_sse2_cvtsd2ss_round, "__builtin_ia32_cvtsd2ss_round", IX86_BUILTIN_CVTSD2SS_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF_INT) +BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_sse2_cvtsd2ss_mask_round, "__builtin_ia32_cvtsd2ss_mask_round", IX86_BUILTIN_CVTSD2SS_MASK_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT) BDESC (OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_sse2_cvtsi2sdq_round, "__builtin_ia32_cvtsi2sd64", IX86_BUILTIN_CVTSI2SD64, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT64_INT) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_sse_cvtsi2ss_round, "__builtin_ia32_cvtsi2ss32", IX86_BUILTIN_CVTSI2SS32, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT_INT) BDESC (OPTION_MASK_ISA_AVX512F | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_sse_cvtsi2ssq_round, "__builtin_ia32_cvtsi2ss64", IX86_BUILTIN_CVTSI2SS64, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT64_INT) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_sse2_cvtss2sd_round, "__builtin_ia32_cvtss2sd_round", IX86_BUILTIN_CVTSS2SD_ROUND, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF_INT) +BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_sse2_cvtss2sd_mask_round, "__builtin_ia32_cvtss2sd_mask_round", IX86_BUILTIN_CVTSS2SD_MASK_ROUND, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_fix_truncv8dfv8si2_mask_round, "__builtin_ia32_cvttpd2dq512_mask", IX86_BUILTIN_CVTTPD2DQ512, UNKNOWN, (int) V8SI_FTYPE_V8DF_V8SI_QI_INT) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_fixuns_truncv8dfv8si2_mask_round, 
"__builtin_ia32_cvttpd2udq512_mask", IX86_BUILTIN_CVTTPD2UDQ512, UNKNOWN, (int) V8SI_FTYPE_V8DF_V8SI_QI_INT) BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_fix_truncv16sfv16si2_mask_round, "__builtin_ia32_cvttps2dq512_mask", IX86_BUILTIN_CVTTPS2DQ512, UNKNOWN, (int) V16SI_FTYPE_V16SF_V16SI_HI_INT) @@ -2911,13 +2913,21 @@ BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_exp2v16sf_mask_round, "__b BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_rcp28v8df_mask_round, "__builtin_ia32_rcp28pd_mask", IX86_BUILTIN_RCP28PD, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI_INT) BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_rcp28v16sf_mask_round, "__builtin_ia32_rcp28ps_mask", IX86_BUILTIN_RCP28PS, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI_INT) BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_vmrcp28v2df_round, "__builtin_ia32_rcp28sd_round", IX86_BUILTIN_RCP28SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT) +BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_vmrcp28v2df_mask_round, "__builtin_ia32_rcp28sd_mask_round", IX86_BUILTIN_RCP28SD_MASK_ROUND, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT) BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_vmrcp28v4sf_round, "__builtin_ia32_rcp28ss_round", IX86_BUILTIN_RCP28SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT) +BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_vmrcp28v4sf_mask_round, "__builtin_ia32_rcp28ss_mask_round", IX86_BUILTIN_RCP28SS_MASK_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT) BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_rsqrt28v8df_mask_round, "__builtin_ia32_rsqrt28pd_mask", IX86_BUILTIN_RSQRT28PD, UNKNOWN, (int) V8DF_FTYPE_V8DF_V8DF_QI_INT) BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_rsqrt28v16sf_mask_round, "__builtin_ia32_rsqrt28ps_mask", IX86_BUILTIN_RSQRT28PS, UNKNOWN, (int) V16SF_FTYPE_V16SF_V16SF_HI_INT) BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_vmrsqrt28v2df_round, "__builtin_ia32_rsqrt28sd_round", IX86_BUILTIN_RSQRT28SD, UNKNOWN, (int) 
V2DF_FTYPE_V2DF_V2DF_INT) +BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_vmrsqrt28v2df_mask_round, "__builtin_ia32_rsqrt28sd_mask_round", IX86_BUILTIN_RSQRT28SD_MASK_ROUND, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT) BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_vmrsqrt28v4sf_round, "__builtin_ia32_rsqrt28ss_round", IX86_BUILTIN_RSQRT28SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT) +BDESC (OPTION_MASK_ISA_AVX512ER, 0, CODE_FOR_avx512er_vmrsqrt28v4sf_mask_round, "__builtin_ia32_rsqrt28ss_mask_round", IX86_BUILTIN_RSQRT28SS_MASK_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT) /* AVX512DQ. */ +BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_reducepv8df_mask_round, "__builtin_ia32_reducepd512_mask_round", IX86_BUILTIN_REDUCEPD512_MASK_ROUND, UNKNOWN, (int) V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT) +BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_reducepv16sf_mask_round, "__builtin_ia32_reduceps512_mask_round", IX86_BUILTIN_REDUCEPS512_MASK_ROUND, UNKNOWN, (int) V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT) +BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_reducesv2df_mask_round, "__builtin_ia32_reducesd_mask_round", IX86_BUILTIN_REDUCESD128_MASK_ROUND, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT) +BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_reducesv4sf_mask_round, "__builtin_ia32_reducess_mask_round", IX86_BUILTIN_REDUCESS128_MASK_ROUND, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT) BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_rangesv2df_mask_round, "__builtin_ia32_rangesd128_mask_round", IX86_BUILTIN_RANGESD128, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT) BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_avx512dq_rangesv4sf_mask_round, "__builtin_ia32_rangess128_mask_round", IX86_BUILTIN_RANGESS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT) BDESC (OPTION_MASK_ISA_AVX512DQ, 0, CODE_FOR_fix_notruncv8dfv8di2_mask_round, "__builtin_ia32_cvtpd2qq512_mask", IX86_BUILTIN_CVTPD2QQ512, UNKNOWN, (int) V8DI_FTYPE_V8DF_V8DI_QI_INT) 
diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index e6f8b314f18..d35c6540cea 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -10225,12 +10225,16 @@ ix86_expand_round_builtin (const struct builtin_description *d, case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT: case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT: case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT: + case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT: case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT: case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT: + case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT: nargs = 5; break; case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT: case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT: + case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT: + case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT: nargs_constant = 4; nargs = 5; break; diff --git a/gcc/config/i386/mmintrin.h b/gcc/config/i386/mmintrin.h index 77de7cae95c..dff42fd73c4 100644 --- a/gcc/config/i386/mmintrin.h +++ b/gcc/config/i386/mmintrin.h @@ -42,9 +42,15 @@ /* The Intel API is flexible enough that we must allow aliasing with other vector types, and their scalar components. */ typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__)); +typedef int __m32 __attribute__ ((__vector_size__ (4), __may_alias__)); +typedef short __m16 __attribute__ ((__vector_size__ (2), __may_alias__)); /* Unaligned version of the same type */ typedef int __m64_u __attribute__ ((__vector_size__ (8), __may_alias__, __aligned__ (1))); +typedef int __m32_u __attribute__ ((__vector_size__ (4), \ + __may_alias__, __aligned__ (1))); +typedef short __m16_u __attribute__ ((__vector_size__ (2), \ + __may_alias__, __aligned__ (1))); /* Internal data types for implementing the intrinsics. 
*/ typedef int __v2si __attribute__ ((__vector_size__ (8))); diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 934b60a288f..3689ab0e7b7 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -2861,30 +2861,30 @@ DONE; }) -(define_insn "reducep" +(define_insn "reducep" [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v") (unspec:VF_AVX512VL - [(match_operand:VF_AVX512VL 1 "nonimmediate_operand" "vm") + [(match_operand:VF_AVX512VL 1 "" "") (match_operand:SI 2 "const_0_to_255_operand")] UNSPEC_REDUCE))] "TARGET_AVX512DQ" - "vreduce\t{%2, %1, %0|%0, %1, %2}" + "vreduce\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sse") (set_attr "prefix" "evex") (set_attr "mode" "")]) -(define_insn "reduces" +(define_insn "reduces" [(set (match_operand:VF_128 0 "register_operand" "=v") (vec_merge:VF_128 (unspec:VF_128 [(match_operand:VF_128 1 "register_operand" "v") - (match_operand:VF_128 2 "nonimmediate_operand" "vm") + (match_operand:VF_128 2 "" "") (match_operand:SI 3 "const_0_to_255_operand")] UNSPEC_REDUCE) (match_dup 1) (const_int 1)))] "TARGET_AVX512DQ" - "vreduce\t{%3, %2, %1, %0|%0, %1, %2, %3}" + "vreduce\t{%3, %2, %1, %0|%0, %1, %2, %3}" [(set_attr "type" "sse") (set_attr "prefix" "evex") (set_attr "mode" "")]) @@ -6374,7 +6374,7 @@ (set_attr "prefix" "evex") (set_attr "mode" "TI")]) -(define_insn "sse2_cvtsd2ss" +(define_insn "sse2_cvtsd2ss" [(set (match_operand:V4SF 0 "register_operand" "=x,x,v") (vec_merge:V4SF (vec_duplicate:V4SF @@ -6386,7 +6386,7 @@ "@ cvtsd2ss\t{%2, %0|%0, %2} cvtsd2ss\t{%2, %0|%0, %q2} - vcvtsd2ss\t{%2, %1, %0|%0, %1, %q2}" + vcvtsd2ss\t{%2, %1, %0|%0, %1, %q2}" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "ssecvt") (set_attr "athlon_decode" "vector,double,*") @@ -6417,7 +6417,7 @@ (set_attr "prefix" "orig,orig,vex") (set_attr "mode" "SF")]) -(define_insn "sse2_cvtss2sd" +(define_insn "sse2_cvtss2sd" [(set (match_operand:V2DF 0 "register_operand" "=x,x,v") (vec_merge:V2DF (float_extend:V2DF @@ -6430,7 
+6430,7 @@ "@ cvtss2sd\t{%2, %0|%0, %2} cvtss2sd\t{%2, %0|%0, %k2} - vcvtss2sd\t{%2, %1, %0|%0, %1, %k2}" + vcvtss2sd\t{%2, %1, %0|%0, %1, %k2}" [(set_attr "isa" "noavx,noavx,avx") (set_attr "type" "ssecvt") (set_attr "amdfam10_decode" "vector,double,*") @@ -19092,7 +19092,7 @@ (set_attr "type" "sse") (set_attr "mode" "")]) -(define_insn "avx512er_vmrcp28" +(define_insn "avx512er_vmrcp28" [(set (match_operand:VF_128 0 "register_operand" "=v") (vec_merge:VF_128 (unspec:VF_128 @@ -19101,7 +19101,7 @@ (match_operand:VF_128 2 "register_operand" "v") (const_int 1)))] "TARGET_AVX512ER" - "vrcp28\t{%1, %2, %0|%0, %2, %1}" + "vrcp28\t{%1, %2, %0|%0, %2, %1}" [(set_attr "length_immediate" "1") (set_attr "prefix" "evex") (set_attr "type" "sse") @@ -19118,7 +19118,7 @@ (set_attr "type" "sse") (set_attr "mode" "")]) -(define_insn "avx512er_vmrsqrt28" +(define_insn "avx512er_vmrsqrt28" [(set (match_operand:VF_128 0 "register_operand" "=v") (vec_merge:VF_128 (unspec:VF_128 @@ -19127,7 +19127,7 @@ (match_operand:VF_128 2 "register_operand" "v") (const_int 1)))] "TARGET_AVX512ER" - "vrsqrt28\t{%1, %2, %0|%0, %2, %1}" + "vrsqrt28\t{%1, %2, %0|%0, %2, %1}" [(set_attr "length_immediate" "1") (set_attr "type" "sse") (set_attr "prefix" "evex") diff --git a/gcc/testsuite/gcc.target/i386/avx-1.c b/gcc/testsuite/gcc.target/i386/avx-1.c index 2dbed1cc943..6178e38ce02 100644 --- a/gcc/testsuite/gcc.target/i386/avx-1.c +++ b/gcc/testsuite/gcc.target/i386/avx-1.c @@ -381,6 +381,8 @@ #define __builtin_ia32_vfmaddss3_mask3(A, B, C, D, E) __builtin_ia32_vfmaddss3_mask3(A, B, C, D, 8) #define __builtin_ia32_vfmaddss3_maskz(A, B, C, D, E) __builtin_ia32_vfmaddss3_maskz(A, B, C, D, 8) #define __builtin_ia32_vfmsubss3_mask3(A, B, C, D, E) __builtin_ia32_vfmsubss3_mask3(A, B, C, D, 8) +#define __builtin_ia32_cvtsd2ss_mask_round(A, B, C, D, E) __builtin_ia32_cvtsd2ss_mask_round(A, B, C, D, 8) +#define __builtin_ia32_cvtss2sd_mask_round(A, B, C, D, E) __builtin_ia32_cvtss2sd_mask_round(A, B, C, D, 8) 
/* avx512erintrin.h */ #define __builtin_ia32_exp2ps_mask(A, B, C, D) __builtin_ia32_exp2ps_mask(A, B, C, 8) @@ -393,6 +395,10 @@ #define __builtin_ia32_rcp28sd_round(A, B, C) __builtin_ia32_rcp28sd_round(A, B, 8) #define __builtin_ia32_rsqrt28ss_round(A, B, C) __builtin_ia32_rsqrt28ss_round(A, B, 8) #define __builtin_ia32_rsqrt28sd_round(A, B, C) __builtin_ia32_rsqrt28sd_round(A, B, 8) +#define __builtin_ia32_rcp28sd_mask_round(A, B, C, D, E) __builtin_ia32_rcp28sd_mask_round(A, B, C, D, 8) +#define __builtin_ia32_rcp28ss_mask_round(A, B, C, D, E) __builtin_ia32_rcp28ss_mask_round(A, B, C, D, 8) +#define __builtin_ia32_rsqrt28sd_mask_round(A, B, C, D, E) __builtin_ia32_rsqrt28sd_mask_round(A, B, C, D, 8) +#define __builtin_ia32_rsqrt28ss_mask_round(A, B, C, D, E) __builtin_ia32_rsqrt28ss_mask_round(A, B, C, D, 8) /* avx512pfintrin.h */ #define __builtin_ia32_gatherpfdps(A, B, C, D, E) __builtin_ia32_gatherpfdps(A, B, C, 1, _MM_HINT_T0) @@ -464,6 +470,10 @@ #define __builtin_ia32_cvtps2qq512_mask(A, B, C, D) __builtin_ia32_cvtps2qq512_mask(A, B, C, 8) #define __builtin_ia32_cvtpd2uqq512_mask(A, B, C, D) __builtin_ia32_cvtpd2uqq512_mask(A, B, C, 8) #define __builtin_ia32_cvtpd2qq512_mask(A, B, C, D) __builtin_ia32_cvtpd2qq512_mask(A, B, C, 8) +#define __builtin_ia32_reducesd_mask_round(A, B, C, D, E, F) __builtin_ia32_reducesd_mask_round(A, B, 8, D, E, 8) +#define __builtin_ia32_reducess_mask_round(A, B, C, D, E, F) __builtin_ia32_reducess_mask_round(A, B, 8, D, E, 8) +#define __builtin_ia32_reducepd512_mask_round(A, B, C, D, E) __builtin_ia32_reducepd512_mask_round(A, 8, C, D, 8) +#define __builtin_ia32_reduceps512_mask_round(A, B, C, D, E) __builtin_ia32_reduceps512_mask_round(A, 8, C, D, 8) /* avx512vlintrin.h */ #define __builtin_ia32_vpermilps_mask(A, E, C, D) __builtin_ia32_vpermilps_mask(A, 1, C, D) diff --git a/gcc/testsuite/gcc.target/i386/avx2-vbroadcastsi128-1.c b/gcc/testsuite/gcc.target/i386/avx2-vbroadcastsi128-1.c index 7805e3ddbc4..24dadbef4c8 100644 
--- a/gcc/testsuite/gcc.target/i386/avx2-vbroadcastsi128-1.c +++ b/gcc/testsuite/gcc.target/i386/avx2-vbroadcastsi128-1.c @@ -1,14 +1,15 @@ /* { dg-do compile } */ /* { dg-options "-mavx2 -O2" } */ -/* { dg-final { scan-assembler "vbroadcasti128\[ \\t\]+\[^\n\]*%ymm\[0-9\]" } } */ +/* { dg-final { scan-assembler-times "vbroadcasti128\[ \\t\]+\[^\n\]*%ymm\[0-9\]+" 2 } } */ #include -volatile __m256i x; -__m128i y; +volatile __m256i x,xx; +__m128i y,yy; void extern avx2_test (void) { x = _mm256_broadcastsi128_si256 (y); + xx = _mm_broadcastsi128_si256 (yy); } diff --git a/gcc/testsuite/gcc.target/i386/avx2-vbroadcastsi128-2.c b/gcc/testsuite/gcc.target/i386/avx2-vbroadcastsi128-2.c index ef1d370ab48..a19464e718c 100644 --- a/gcc/testsuite/gcc.target/i386/avx2-vbroadcastsi128-2.c +++ b/gcc/testsuite/gcc.target/i386/avx2-vbroadcastsi128-2.c @@ -9,7 +9,7 @@ void static avx2_test (void) { union128i_q s1; - union256i_q res; + union256i_q res, res1; long long int res_ref[4]; int i, j; int fail = 0; @@ -20,11 +20,13 @@ avx2_test (void) s1.a[j] = j * i; res.x = _mm256_broadcastsi128_si256 (s1.x); + res1.x = _mm_broadcastsi128_si256 (s1.x); memcpy (res_ref, s1.a, 16); memcpy (res_ref + 2, s1.a, 16); fail += check_union256i_q (res, res_ref); + fail += check_union256i_q (res1, res_ref); } if (fail != 0) diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-vmovdqu16-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-vmovdqu16-1.c index a0d0e36389b..dcb8caaa73e 100644 --- a/gcc/testsuite/gcc.target/i386/avx512bw-vmovdqu16-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512bw-vmovdqu16-1.c @@ -15,13 +15,19 @@ /* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*\\)\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*\\)\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*\\)\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ 
+/* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "(?:vmovdqu16|vinserti128)\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*\\)(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "(?:vmovdqu16|vextracti128)\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*\\)(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*\\)(?:\n|\[ \\t\]+#)" 1 } } */ #include -short *p; -volatile __m512i x1, yy; -volatile __m256i x2, y2; -volatile __m128i x3, y3; +short *p, *p1, *p2, *p3, *p4, *p5, *p6; +volatile __m512i x1, yy, zzz; +volatile __m256i x2, y2, yyy; +volatile __m128i x3, y3, xxx; volatile __mmask32 m32; volatile __mmask16 m16; volatile __mmask8 m8; @@ -45,7 +51,15 @@ avx512bw_test (void) x2 = _mm256_maskz_loadu_epi16 (m16, p); x3 = _mm_maskz_loadu_epi16 (m8, p); + zzz = _mm512_loadu_epi16 (p5); + yyy = _mm256_loadu_epi16 (p3); + xxx = _mm_loadu_epi16 (p1); + _mm512_mask_storeu_epi16 (p, m32, x1); _mm256_mask_storeu_epi16 (p, m16, x2); _mm_mask_storeu_epi16 (p, m8, x3); + + _mm512_storeu_epi16 (p6, zzz); + _mm256_storeu_epi16 (p4, yyy); + _mm_storeu_epi16 (p2, xxx); } diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-vmovdqu8-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-vmovdqu8-1.c index 6d24e79bf66..a335bcab3b2 100644 --- a/gcc/testsuite/gcc.target/i386/avx512bw-vmovdqu8-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512bw-vmovdqu8-1.c @@ -15,13 +15,17 @@ /* { dg-final { scan-assembler-times "vmovdqu8\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*\\)\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vmovdqu8\[ 
\\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*\\)\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vmovdqu8\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*\\)\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vmovdqu8\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vmovdqu8\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vmovdqu8\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*\\)(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vmovdqu8\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*\\)(?:\n|\[ \\t\]+#)" 1 } } */ #include -char *p; -volatile __m512i x1, yy; +char *p, *p1, *p2, *p3, *p4; +volatile __m512i x1, yy, zzz; volatile __m256i x2, y2; -volatile __m128i x3, y3; +volatile __m128i x3, y3, xxx; volatile __mmask64 m64; volatile __mmask32 m32; volatile __mmask16 m16; @@ -45,7 +49,13 @@ avx512bw_test (void) x2 = _mm256_maskz_loadu_epi8 (m32, p); x3 = _mm_maskz_loadu_epi8 (m16, p); + zzz = _mm512_loadu_epi8 (p3); + xxx = _mm_loadu_epi8 (p1); + _mm512_mask_storeu_epi8 (p, m64, x1); _mm256_mask_storeu_epi8 (p, m32, x2); _mm_mask_storeu_epi8 (p, m16, x3); + + _mm512_storeu_epi8 (p4, zzz); + _mm_storeu_epi8 (p2, xxx); } diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-vreducepd-3.c b/gcc/testsuite/gcc.target/i386/avx512dq-vreducepd-3.c new file mode 100644 index 00000000000..4dc1585691b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512dq-vreducepd-3.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512dq -O2" } */ +/* { dg-final { scan-assembler-times "vreducepd\[ \\t\]+\[^\{\n\]*\{sae\}\[^\n\]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vreducepd\[ \\t\]+\[^\{\n\]*\{sae\}\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vreducepd\[ \\t\]+\[^\{\n\]*\{sae\}\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ 
\\t\]+#)" 1 } } */ + +#include + +#define IMM 123 + +volatile __m512d xx1; +volatile __mmask8 m; + +void extern +avx512dq_test (void) +{ + xx1 = _mm512_reduce_round_pd(xx1, IMM, _MM_FROUND_NO_EXC); + + xx1 = _mm512_mask_reduce_round_pd (xx1, m, xx1, IMM, _MM_FROUND_NO_EXC); + + xx1 = _mm512_maskz_reduce_round_pd (m, xx1, IMM, _MM_FROUND_NO_EXC); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-vreducepd-4.c b/gcc/testsuite/gcc.target/i386/avx512dq-vreducepd-4.c new file mode 100644 index 00000000000..2b39670847a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512dq-vreducepd-4.c @@ -0,0 +1,61 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512dq" } */ +/* { dg-require-effective-target avx512dq } */ + +#define AVX512DQ +#include "avx512f-helper.h" + +#define SIZE (AVX512F_LEN / 64) +#include "avx512f-mask-type.h" + +#define IMM 0x23 + +void +CALC (double *s, double *r) +{ + int i; + + for (i = 0; i < SIZE; i++) + { + double tmp = (int) (4 * s[i]) / 4.0; + r[i] = s[i] - tmp; + } +} + +void +TEST (void) +{ + UNION_TYPE (AVX512F_LEN, d) s, res1, res2, res3; + MASK_TYPE mask = MASK_VALUE; + double res_ref[SIZE]; + int i, sign = 1; + + for (i = 0; i < SIZE; i++) + { + s.a[i] = 123.456 * (i + 2000) * sign; + res2.a[i] = DEFAULT_VALUE; + sign = -sign; + } + + res1.x = INTRINSIC (_reduce_round_pd) (s.x, IMM, _MM_FROUND_TO_NEAREST_INT + | _MM_FROUND_NO_EXC); + res2.x = INTRINSIC (_mask_reduce_round_pd) (res2.x, mask, s.x, + IMM, _MM_FROUND_TO_NEAREST_INT + | _MM_FROUND_NO_EXC); + res3.x = INTRINSIC (_maskz_reduce_round_pd) (mask, s.x, IMM, + _MM_FROUND_TO_NEAREST_INT + | _MM_FROUND_NO_EXC); + + CALC (s.a, res_ref); + + if (UNION_FP_CHECK (AVX512F_LEN, d) (res1, res_ref)) + abort (); + + MASK_MERGE (d) (res_ref, mask, SIZE); + if (UNION_FP_CHECK (AVX512F_LEN, d) (res2, res_ref)) + abort (); + + MASK_ZERO (d) (res_ref, mask, SIZE); + if (UNION_FP_CHECK (AVX512F_LEN, d) (res3, res_ref)) + abort (); +} diff --git 
a/gcc/testsuite/gcc.target/i386/avx512dq-vreduceps-3.c b/gcc/testsuite/gcc.target/i386/avx512dq-vreduceps-3.c new file mode 100644 index 00000000000..959d51ff9e0 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512dq-vreduceps-3.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512dq -O2" } */ +/* { dg-final { scan-assembler-times "vreduceps\[ \\t\]+\[^\{\n\]*\{sae\}\[^\n\]*%zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vreduceps\[ \\t\]+\[^\{\n\]*\{sae\}\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vreduceps\[ \\t\]+\[^\{\n\]*\{sae\}\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +#define IMM 123 + +volatile __m512 xx1; +volatile __mmask16 m16; + +void extern +avx512dq_test (void) +{ + xx1 = _mm512_reduce_round_ps (xx1, IMM, _MM_FROUND_NO_EXC); + + xx1 = _mm512_mask_reduce_round_ps (xx1, m16, xx1, IMM, _MM_FROUND_NO_EXC); + + xx1 = _mm512_maskz_reduce_round_ps (m16, xx1, IMM, _MM_FROUND_NO_EXC); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-vreduceps-4.c b/gcc/testsuite/gcc.target/i386/avx512dq-vreduceps-4.c new file mode 100644 index 00000000000..6b687b292e0 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512dq-vreduceps-4.c @@ -0,0 +1,61 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx512dq" } */ +/* { dg-require-effective-target avx512dq } */ + +#define AVX512DQ +#include "avx512f-helper.h" + +#define SIZE (AVX512F_LEN / 32) +#include "avx512f-mask-type.h" + +#define IMM 0x23 + +void +CALC (float *s, float *r) +{ + int i; + + for (i = 0; i < SIZE; i++) + { + float tmp = (int) (4 * s[i]) / 4.0; + r[i] = s[i] - tmp; + } +} + +void +TEST (void) +{ + UNION_TYPE (AVX512F_LEN,) s, res1, res2, res3; + MASK_TYPE mask = MASK_VALUE; + float res_ref[SIZE]; + int i, sign = 1; + + for (i = 0; i < SIZE; i++) + { + s.a[i] = 123.456 * (i + 2000) * sign; + res2.a[i] = DEFAULT_VALUE; + sign = -sign; + } + + res1.x = INTRINSIC 
(_reduce_round_ps) (s.x, IMM, _MM_FROUND_TO_NEAREST_INT + | _MM_FROUND_NO_EXC); + res2.x = INTRINSIC (_mask_reduce_round_ps) (res2.x, mask, s.x, + IMM, _MM_FROUND_TO_NEAREST_INT + | _MM_FROUND_NO_EXC); + res3.x = INTRINSIC (_maskz_reduce_round_ps) (mask, s.x, IMM, + _MM_FROUND_TO_NEAREST_INT + | _MM_FROUND_NO_EXC); + + CALC (s.a, res_ref); + + if (UNION_FP_CHECK (AVX512F_LEN,) (res1, res_ref)) + abort (); + + MASK_MERGE () (res_ref, mask, SIZE); + if (UNION_FP_CHECK (AVX512F_LEN,) (res2, res_ref)) + abort (); + + MASK_ZERO () (res_ref, mask, SIZE); + if (UNION_FP_CHECK (AVX512F_LEN,) (res3, res_ref)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-vreducesd-1.c b/gcc/testsuite/gcc.target/i386/avx512dq-vreducesd-1.c index b8f24a0ccbd..63d9537dc70 100644 --- a/gcc/testsuite/gcc.target/i386/avx512dq-vreducesd-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512dq-vreducesd-1.c @@ -2,8 +2,11 @@ /* { dg-options "-mavx512dq -O2" } */ /* { dg-final { scan-assembler-times "vreducesd\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vreducesd\[ \\t\]+\[^\{\n\]*\{sae\}\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vreducesd\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vreducesd\[ \\t\]+\[^\{\n\]*\{sae\}\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vreducesd\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vreducesd\[ \\t\]+\[^\{\n\]*\{sae\}\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vreducesd\[ 
\\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ @@ -11,15 +14,18 @@ #define IMM 123 -volatile __m128d x1, x2; +volatile __m128d x1, x2, xx1, xx2; volatile __mmask8 m; void extern avx512dq_test (void) { + xx1 = _mm_reduce_round_sd (xx1, xx2, IMM, _MM_FROUND_NO_EXC); x1 = _mm_reduce_sd (x1, x2, IMM); + xx1 = _mm_mask_reduce_round_sd(xx1, m, xx1, xx2, IMM, _MM_FROUND_NO_EXC); x1 = _mm_mask_reduce_sd(x1, m, x1, x2, IMM); + xx1 = _mm_maskz_reduce_round_sd(m, xx1, xx2, IMM, _MM_FROUND_NO_EXC); x1 = _mm_maskz_reduce_sd(m, x1, x2, IMM); } diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-vreducesd-2.c b/gcc/testsuite/gcc.target/i386/avx512dq-vreducesd-2.c index 93e18271cbd..17448c0fcee 100644 --- a/gcc/testsuite/gcc.target/i386/avx512dq-vreducesd-2.c +++ b/gcc/testsuite/gcc.target/i386/avx512dq-vreducesd-2.c @@ -28,7 +28,7 @@ CALC (double *r, double *s) void TEST (void) { - union128d res1, res2, res3; + union128d res1, res2, res3, res4, res5, res6; union128d s1, s2, src; double res_ref[2]; MASK_TYPE mask = MASK_VALUE; @@ -42,25 +42,45 @@ TEST (void) res1.a[j] = DEFAULT_VALUE; res2.a[j] = DEFAULT_VALUE; res3.a[j] = DEFAULT_VALUE; + res4.a[j] = DEFAULT_VALUE; + res5.a[j] = DEFAULT_VALUE; + res6.a[j] = DEFAULT_VALUE; } res1.x = _mm_reduce_sd (s1.x, s2.x, IMM); res2.x = _mm_mask_reduce_sd (s1.x, mask, s1.x, s2.x, IMM); res3.x = _mm_maskz_reduce_sd (mask, s1.x, s2.x, IMM); + res4.x = _mm_reduce_round_sd (s1.x, s2.x, IMM,_MM_FROUND_TO_NEAREST_INT + | _MM_FROUND_NO_EXC); + res5.x = _mm_mask_reduce_round_sd (s1.x, mask, s1.x, s2.x, IMM, + _MM_FROUND_TO_NEAREST_INT + | _MM_FROUND_NO_EXC); + res6.x = _mm_maskz_reduce_round_sd (mask, s1.x, s2.x, IMM, + _MM_FROUND_TO_NEAREST_INT + | _MM_FROUND_NO_EXC); CALC (res_ref, s2.a); if (check_union128d (res1, res_ref)) abort (); + if (check_union128d (res4, res_ref)) + abort (); + MASK_MERGE (d) (res_ref, mask, 1); if (check_union128d (res2, res_ref)) abort (); + if 
(check_union128d (res5, res_ref)) + abort (); + MASK_ZERO (d) (res_ref, mask, 1); if (check_union128d (res3, res_ref)) abort (); + if (check_union128d (res6, res_ref)) + abort (); + } diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-vreducess-1.c b/gcc/testsuite/gcc.target/i386/avx512dq-vreducess-1.c index 804074e2ba6..341bd46aa56 100644 --- a/gcc/testsuite/gcc.target/i386/avx512dq-vreducess-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512dq-vreducess-1.c @@ -2,23 +2,29 @@ /* { dg-options "-mavx512dq -O2" } */ /* { dg-final { scan-assembler-times "vreducess\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vreducess\[ \\t\]+\[^\{\n\]*\{sae\}\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vreducess\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vreducess\[ \\t\]+\[^\{\n\]*\{sae\}\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vreducess\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vreducess\[ \\t\]+\[^\{\n\]*\{sae\}\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vreducess\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ #include #define IMM 123 -volatile __m128 x1, x2; +volatile __m128 x1, x2, xx1, xx2; volatile __mmask8 m; void extern avx512dq_test (void) { + xx1 = _mm_reduce_round_ss (xx1, xx2, IMM, _MM_FROUND_NO_EXC); x1 = _mm_reduce_ss (x1, x2, IMM); + xx1 = _mm_mask_reduce_round_ss (xx1, m, xx1, xx2, IMM, _MM_FROUND_NO_EXC); x1 = _mm_mask_reduce_ss (x1, m, x1, x2, IMM); + xx1 = 
_mm_maskz_reduce_round_ss (m, xx1, xx2, IMM, _MM_FROUND_NO_EXC); x1 = _mm_maskz_reduce_ss (m, x1, x2, IMM); } diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-vreducess-2.c b/gcc/testsuite/gcc.target/i386/avx512dq-vreducess-2.c index 8558c3b3468..6d8938deb30 100644 --- a/gcc/testsuite/gcc.target/i386/avx512dq-vreducess-2.c +++ b/gcc/testsuite/gcc.target/i386/avx512dq-vreducess-2.c @@ -30,7 +30,7 @@ TEST (void) { printf("\nsize = %d\n\n", SIZE); - union128 res1, res2, res3; + union128 res1, res2, res3, res4, res5, res6; union128 s1, s2, src; float res_ref[4]; MASK_TYPE mask = MASK_VALUE; @@ -44,25 +44,45 @@ TEST (void) res1.a[j] = DEFAULT_VALUE; res2.a[j] = DEFAULT_VALUE; res3.a[j] = DEFAULT_VALUE; + res4.a[j] = DEFAULT_VALUE; + res5.a[j] = DEFAULT_VALUE; + res6.a[j] = DEFAULT_VALUE; } res1.x = _mm_reduce_ss (s1.x, s2.x, IMM); res2.x = _mm_mask_reduce_ss (s1.x, mask, s1.x, s2.x, IMM); res3.x = _mm_maskz_reduce_ss (mask, s1.x, s2.x, IMM); + res4.x = _mm_reduce_round_ss (s1.x, s2.x, IMM, _MM_FROUND_TO_NEAREST_INT + | _MM_FROUND_NO_EXC); + res5.x = _mm_mask_reduce_round_ss (s1.x, mask, s1.x, s2.x, + IMM, _MM_FROUND_TO_NEAREST_INT + | _MM_FROUND_NO_EXC); + res6.x = _mm_maskz_reduce_round_ss (mask, s1.x, s2.x, IMM, + _MM_FROUND_TO_NEAREST_INT + | _MM_FROUND_NO_EXC); CALC (res_ref, s2.a); if (check_union128 (res1, res_ref)) abort (); + if (check_union128 (res4, res_ref)) + abort (); + MASK_MERGE () (res_ref, mask, 1); if (check_union128 (res2, res_ref)) abort (); + if (check_union128 (res5, res_ref)) + abort (); + MASK_ZERO () (res_ref, mask, 1); if (check_union128 (res3, res_ref)) abort (); + if (check_union128 (res6, res_ref)) + abort (); + } diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrcp28sd-1.c b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28sd-1.c index 9c7608573c4..03e75cc5f3b 100644 --- a/gcc/testsuite/gcc.target/i386/avx512er-vrcp28sd-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28sd-1.c @@ -2,14 +2,23 @@ /* { dg-options "-mavx512er -O2" } */ 
/* { dg-final { scan-assembler-times "vrcp28sd\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vrcp28sd\[ \\t\]+\[^\{\n\]*\{sae\}\[^\n\]*%xmm\[0-9\]+\[^\{\]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrcp28sd\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrcp28sd\[ \\t\]+\[^\n\]*\{sae\}\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrcp28sd\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrcp28sd\[ \\t\]+\[^\n\]*\{sae\}\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ #include -volatile __m128d x, y; +volatile __m128d x, y, z; +volatile __mmask8 m; void extern avx512er_test (void) { x = _mm_rcp28_sd (x, y); x = _mm_rcp28_round_sd (x, y, _MM_FROUND_NO_EXC); + x = _mm_mask_rcp28_sd (z, m, x, y); + x = _mm_mask_rcp28_round_sd (z, m, x, y, _MM_FROUND_NO_EXC); + x = _mm_maskz_rcp28_sd (m, x, y); + x = _mm_maskz_rcp28_round_sd (m, x, y, _MM_FROUND_NO_EXC); } diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrcp28sd-2.c b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28sd-2.c index 889f990acfe..93d370d0d78 100644 --- a/gcc/testsuite/gcc.target/i386/avx512er-vrcp28sd-2.c +++ b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28sd-2.c @@ -7,11 +7,14 @@ #include "avx512f-helper.h" #include +#define IMM 0x23 + void static avx512er_test (void) { - union128d src1, src2, res; + union128d src1, src2, res, res1, res2, res3, res4; double res_ref[2]; + MASK_TYPE mask = MASK_VALUE; int i; for (i = 0; i < 2; i++) @@ -24,7 +27,32 @@ avx512er_test (void) res_ref[0] = 1.0 / src2.a[0]; res.x = _mm_rcp28_round_sd (src1.x, src2.x, _MM_FROUND_NO_EXC); + res1.x = _mm_mask_rcp28_sd (src1.x, IMM, src1.x, src2.x); + res2.x = _mm_mask_rcp28_round_sd (src1.x, IMM, src1.x, src2.x, + _MM_FROUND_TO_NEAREST_INT + | _MM_FROUND_NO_EXC); + 
res3.x = _mm_maskz_rcp28_sd (IMM, src1.x, src2.x); + res4.x = _mm_maskz_rcp28_round_sd (IMM, src1.x, src2.x, + _MM_FROUND_TO_NEAREST_INT + | _MM_FROUND_NO_EXC); + if (checkVd (res.a, res_ref, 2)) abort (); + + MASK_MERGE (d) (res_ref, mask, 1); + + if (checkVd (res1.a, res_ref, 2)) + abort (); + + if (checkVd (res2.a, res_ref, 2)) + abort (); + + MASK_ZERO (d) (res_ref, mask, 1); + + if (checkVd (res3.a, res_ref, 2)) + abort (); + + if (checkVd (res4.a, res_ref, 2)) + abort (); } diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ss-1.c b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ss-1.c index 335c44fca30..87a8ac3026f 100644 --- a/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ss-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ss-1.c @@ -2,14 +2,23 @@ /* { dg-options "-mavx512er -O2" } */ /* { dg-final { scan-assembler-times "vrcp28ss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vrcp28ss\[ \\t\]+\[^\{\n\]*\{sae\}\[^\n\]*%xmm\[0-9\]+\[^\{\]*(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrcp28ss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrcp28ss\[ \\t\]+\[^\n\]*\{sae\}\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrcp28ss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrcp28ss\[ \\t\]+\[^\n\]*\{sae\}\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ #include -volatile __m128 x, y; +volatile __m128 x, y, z; +volatile __mmask8 m; void extern avx512er_test (void) { x = _mm_rcp28_ss (x, y); x = _mm_rcp28_round_ss (x, y, _MM_FROUND_NO_EXC); + x = _mm_mask_rcp28_ss (z, m, x, y); + x = _mm_mask_rcp28_round_ss (z, m, x, y, _MM_FROUND_NO_EXC); + x = _mm_maskz_rcp28_ss (m, x, y); + x = _mm_maskz_rcp28_round_ss (m, x, y, _MM_FROUND_NO_EXC); } diff --git 
a/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ss-2.c b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ss-2.c index 3280879107e..4ffa92c66ee 100644 --- a/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ss-2.c +++ b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ss-2.c @@ -7,11 +7,14 @@ #include "avx512f-helper.h" #include +#define IMM 0x23 + void static avx512er_test (void) { - union128 src1, src2, res; + union128 src1, src2, res, res1, res2, res3, res4; float res_ref[4]; + MASK_TYPE mask = MASK_VALUE; int i; for (i = 0; i < 4; i++) @@ -24,7 +27,31 @@ avx512er_test (void) res_ref[0] = 1.0 / src2.a[0]; res.x = _mm_rcp28_round_ss (src1.x, src2.x, _MM_FROUND_NO_EXC); + res1.x = _mm_mask_rcp28_ss (src1.x, IMM, src1.x, src2.x); + res2.x = _mm_mask_rcp28_round_ss (src1.x, IMM, src1.x, src2.x, + _MM_FROUND_TO_NEAREST_INT + | _MM_FROUND_NO_EXC); + res3.x = _mm_maskz_rcp28_ss (IMM, src1.x, src2.x); + res4.x = _mm_maskz_rcp28_round_ss (IMM, src1.x, src2.x, + _MM_FROUND_TO_NEAREST_INT + | _MM_FROUND_NO_EXC); if (checkVf (res.a, res_ref, 4)) abort (); + + MASK_MERGE () (res_ref, mask, 1); + + if (checkVf (res1.a, res_ref, 2)) + abort (); + + if (checkVf (res2.a, res_ref, 2)) + abort (); + + MASK_ZERO () (res_ref, mask, 1); + + if (checkVf (res3.a, res_ref, 2)) + abort (); + + if (checkVf (res4.a, res_ref, 2)) + abort (); } diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28sd-1.c b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28sd-1.c index de3341f9bed..ca549062b75 100644 --- a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28sd-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28sd-1.c @@ -2,14 +2,23 @@ /* { dg-options "-mavx512er -O2" } */ /* { dg-final { scan-assembler-times "vrsqrt28sd\[ \\t\]+\[^\{^\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vrsqrt28sd\[ \\t\]+\[^\{\n\]*\{sae\}\[^\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrsqrt28sd\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ 
\\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrsqrt28sd\[ \\t\]+\[^\n\]*\{sae\}\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrsqrt28sd\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrsqrt28sd\[ \\t\]+\[^\n\]*\{sae\}\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ #include -volatile __m128d x, y; +volatile __m128d x, y, z; +volatile __mmask8 m; void extern avx512er_test (void) { x = _mm_rsqrt28_sd (x, y); x = _mm_rsqrt28_round_sd (x, y, _MM_FROUND_NO_EXC); + x = _mm_mask_rsqrt28_sd (z, m, x, y); + x = _mm_mask_rsqrt28_round_sd (z, m, x, y, _MM_FROUND_NO_EXC); + x = _mm_maskz_rsqrt28_sd (m, x, y); + x = _mm_maskz_rsqrt28_round_sd (m, x, y, _MM_FROUND_NO_EXC); } diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28sd-2.c b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28sd-2.c index bd217e8228f..2606191b97b 100644 --- a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28sd-2.c +++ b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28sd-2.c @@ -7,11 +7,14 @@ #include "avx512f-helper.h" #include +#define IMM 0x23 + void static avx512er_test (void) { - union128d src1, src2, res; + union128d src1, src2, res, res1, res2, res3, res4; double res_ref[2]; + MASK_TYPE mask = MASK_VALUE; int i; for (i = 0; i < 2; i++) @@ -24,7 +27,31 @@ avx512er_test (void) res_ref[0] = 1.0 / sqrt (src2.a[0]); res.x = _mm_rsqrt28_round_sd (src1.x, src2.x, _MM_FROUND_NO_EXC); + res1.x = _mm_mask_rsqrt28_sd (src1.x, IMM, src1.x, src2.x); + res2.x = _mm_mask_rsqrt28_round_sd (src1.x, IMM, src1.x, src2.x, + _MM_FROUND_TO_NEAREST_INT + | _MM_FROUND_NO_EXC); + res3.x = _mm_maskz_rsqrt28_sd (IMM, src1.x, src2.x); + res4.x = _mm_maskz_rsqrt28_round_sd (IMM, src1.x, src2.x, + _MM_FROUND_TO_NEAREST_INT + | _MM_FROUND_NO_EXC); if (checkVd (res.a, res_ref, 2)) abort (); + + MASK_MERGE (d) (res_ref, mask, 1); + + if (checkVd (res1.a, res_ref, 2)) + abort 
(); + + if (checkVd (res2.a, res_ref, 2)) + abort (); + + MASK_ZERO (d) (res_ref, mask, 1); + + if (checkVd (res3.a, res_ref, 2)) + abort (); + + if (checkVd (res4.a, res_ref, 2)) + abort (); } diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ss-1.c b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ss-1.c index 701cb557190..c97376ed746 100644 --- a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ss-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ss-1.c @@ -2,14 +2,23 @@ /* { dg-options "-mavx512er -O2" } */ /* { dg-final { scan-assembler-times "vrsqrt28ss\[ \\t\]+\[^\{^\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vrsqrt28ss\[ \\t\]+\[^\{\n\]*\{sae\}\[^\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrsqrt28ss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrsqrt28ss\[ \\t\]+\[^\n\]*\{sae\}\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrsqrt28ss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrsqrt28ss\[ \\t\]+\[^\n\]*\{sae\}\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ #include -volatile __m128 x, y; +volatile __m128 x, y, z; +volatile __mmask8 m; void extern avx512er_test (void) { x = _mm_rsqrt28_ss (x, y); x = _mm_rsqrt28_round_ss (x, y, _MM_FROUND_NO_EXC); + x = _mm_mask_rsqrt28_ss (z, m, x, y); + x = _mm_mask_rsqrt28_round_ss (z, m, x, y, _MM_FROUND_NO_EXC); + x = _mm_maskz_rsqrt28_ss (m, x, y); + x = _mm_maskz_rsqrt28_round_ss (m, x, y, _MM_FROUND_NO_EXC); } diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ss-2.c b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ss-2.c index f7bfff5a50d..fa1c19b9f6b 100644 --- a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ss-2.c +++ b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ss-2.c @@ -7,11 +7,14 @@ #include 
"avx512f-helper.h" #include +#define IMM 0x23 + void static avx512er_test (void) { - union128 src1, src2, res; + union128 src1, src2, res, res1, res2, res3, res4; float res_ref[4]; + MASK_TYPE mask = MASK_VALUE; int i; for (i = 0; i < 4; i++) @@ -24,7 +27,31 @@ avx512er_test (void) res_ref[0] = 1.0 / sqrt (src2.a[0]); res.x = _mm_rsqrt28_round_ss (src1.x, src2.x, _MM_FROUND_NO_EXC); + res1.x = _mm_mask_rsqrt28_ss (src1.x, IMM, src1.x, src2.x); + res2.x = _mm_mask_rsqrt28_round_ss (src1.x, IMM, src1.x, src2.x, + _MM_FROUND_TO_NEAREST_INT + | _MM_FROUND_NO_EXC); + res3.x = _mm_maskz_rsqrt28_ss (IMM, src1.x, src2.x); + res4.x = _mm_maskz_rsqrt28_round_ss (IMM, src1.x, src2.x, + _MM_FROUND_TO_NEAREST_INT + | _MM_FROUND_NO_EXC); if (checkVf (res.a, res_ref, 4)) abort (); + + MASK_MERGE () (res_ref, mask, 1); + + if (checkVf (res1.a, res_ref, 2)) + abort (); + + if (checkVf (res2.a, res_ref, 2)) + abort (); + + MASK_ZERO () (res_ref, mask, 1); + + if (checkVf (res3.a, res_ref, 2)) + abort (); + + if (checkVf (res4.a, res_ref, 2)) + abort (); } diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vcvtsd2si-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vcvtsd2si-1.c index 845e206e025..402af5048f6 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-vcvtsd2si-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-vcvtsd2si-1.c @@ -1,13 +1,15 @@ /* { dg-do compile } */ /* { dg-options "-O2 -mavx512f" } */ /* { dg-final { scan-assembler-times "vcvtsd2sil?\[ \\t\]+\[^\n\]*\{rn-sae\}\[^\n\]*%xmm\[0-9\]+.{6}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtsd2sil?\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+.{6}(?:\n|\[ \\t\]+#)" 1 } } */ #include volatile __m128d x; -volatile unsigned y; +volatile unsigned y, z; void extern avx512f_test (void) { y = _mm_cvt_roundsd_i32 (x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + z = _mm_cvtsd_i32 (x); } diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vcvtsd2si64-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vcvtsd2si64-1.c index 
ad11e8bd238..dad26e4b729 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-vcvtsd2si64-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-vcvtsd2si64-1.c @@ -1,14 +1,16 @@ /* { dg-do compile { target { ! ia32 } } } */ /* { dg-options "-O2 -mavx512f" } */ /* { dg-final { scan-assembler-times "vcvtsd2siq\[ \\t\]+\[^\n\]*\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+.{6}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtsd2siq\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+.{6}(?:\n|\[ \\t\]+#)" 1 } } */ #include volatile __m128d x; -volatile unsigned long long y; +volatile unsigned long long y, z; void extern avx512f_test (void) { y = _mm_cvt_roundsd_i64 (x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + z = _mm_cvtsd_i64 (x); } diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vcvtsd2ss-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vcvtsd2ss-1.c index 84d8e05f10d..d61e76c84d2 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-vcvtsd2ss-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-vcvtsd2ss-1.c @@ -1,14 +1,23 @@ /* { dg-do compile } */ /* { dg-options "-mavx512f -O2" } */ /* { dg-final { scan-assembler-times "vcvtsd2ss\[ \\t\]+\[^\n\]*\{rn-sae\}\[^\n\]*%xmm\[0-9\]+\[^\{\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtsd2ss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtsd2ss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtsd2ss\[ \\t\]+\[^\n\]*\{rn-sae\}\[^\n\]*%xmm\[0-9\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtsd2ss\[ \\t\]+\[^\n\]*\{rn-sae\}\[^\n\]*%xmm\[0-9\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ #include -volatile __m128 s1, r; +volatile __m128 s1, r, s3; volatile __m128d s2; +volatile __mmask8 m; void extern avx512f_test (void) { r = _mm_cvt_roundsd_ss (s1, s2, 
_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + r = _mm_mask_cvtsd_ss (s3, m, s1, s2); + r = _mm_maskz_cvtsd_ss (m, s1, s2); + r = _mm_mask_cvt_roundsd_ss (s3, m, s1, s2, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + r = _mm_maskz_cvt_roundsd_ss (m, s1, s2, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vcvtsi2sd-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vcvtsi2sd-1.c new file mode 100644 index 00000000000..2035eeaadb0 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512f-vcvtsi2sd-1.c @@ -0,0 +1,13 @@ +/* { dg-options "-mavx512f -O2" } */ +/* { dg-final { scan-assembler-times "vcvtsi2sdl\[ \\t\]+\[^%\n\]*%e\[^\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +volatile __m128d x; +volatile int n; + +void extern +avx512f_test (void) +{ + x = _mm_cvti32_sd (x, n); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vcvtsi2sd64-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vcvtsi2sd64-1.c index 440ddbcdb87..4843ace8f0a 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-vcvtsi2sd64-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-vcvtsi2sd64-1.c @@ -1,14 +1,16 @@ /* { dg-do compile { target { ! 
ia32 } } } */ /* { dg-options "-mavx512f -O2" } */ /* { dg-final { scan-assembler-times "vcvtsi2sdq\[ \\t\]+\[^%\n\]*%r\[^\{\n\]*\{ru-sae\}\[^\{\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtsi2sdq\[ \\t\]+\[^%\n\]*%r\[^\{\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ #include -volatile __m128d x; +volatile __m128d x, y; volatile long long n; void extern avx512f_test (void) { x = _mm_cvt_roundi64_sd (x, n, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + y = _mm_cvti64_sd (x, n); } diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vcvtsi2ss-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vcvtsi2ss-1.c index 8e26576ee7f..0b3f518e566 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-vcvtsi2ss-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-vcvtsi2ss-1.c @@ -1,14 +1,16 @@ /* { dg-do compile } */ /* { dg-options "-mavx512f -O2" } */ /* { dg-final { scan-assembler-times "vcvtsi2ssl\[ \\t\]+\[^%\n\]*%e\[^\{\n\]*\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtsi2ssl\[ \\t\]+\[^%\n\]*%e\[^\{\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ #include -volatile __m128 x; +volatile __m128 x, y; volatile int n; void extern avx512f_test (void) { x = _mm_cvt_roundi32_ss (x, n, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + y = _mm_cvti32_ss (x, n); } diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vcvtsi2ss64-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vcvtsi2ss64-1.c index ceef7cd9605..9b2bce13684 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-vcvtsi2ss64-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-vcvtsi2ss64-1.c @@ -1,14 +1,16 @@ /* { dg-do compile { target { ! 
ia32 } } } */ /* { dg-options "-mavx512f -O2" } */ /* { dg-final { scan-assembler-times "vcvtsi2ssq\[ \\t\]+\[^%\n\]*%r\[^\{\n\]*\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtsi2ssq\[ \\t\]+\[^%\n\]*%r\[^\{\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ #include -volatile __m128 x; +volatile __m128 x, y; volatile long long n; void extern avx512f_test (void) { x = _mm_cvt_roundi64_ss (x, n, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + y = _mm_cvti64_ss (x, n); } diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vcvtss2sd-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vcvtss2sd-1.c index ee98d3638e8..48cbac50fef 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-vcvtss2sd-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-vcvtss2sd-1.c @@ -1,14 +1,23 @@ /* { dg-do compile } */ /* { dg-options "-mavx512f -O2" } */ /* { dg-final { scan-assembler-times "vcvtss2sd\[ \\t\]+\[^\{\n\]*\{sae\}\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtss2sd\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtss2sd\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtss2sd\[ \\t\]+\[^\n\]*\{sae\}\[^\n\]*%xmm\[0-9\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtss2sd\[ \\t\]+\[^\n\]*\{sae\}\[^\n\]*%xmm\[0-9\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ #include -volatile __m128d s1, r; +volatile __m128d s1, r, s3; volatile __m128 s2; +volatile __mmask8 m; void extern avx512f_test (void) { r = _mm_cvt_roundss_sd (s1, s2, _MM_FROUND_NO_EXC); + r = _mm_mask_cvtss_sd (s3, m, s1, s2); + r = _mm_maskz_cvtss_sd (m, s1, s2); + r = _mm_mask_cvt_roundss_sd (s3, m, s1, s2, _MM_FROUND_NO_EXC); + r = _mm_maskz_cvt_roundss_sd (m, s1, s2, 
_MM_FROUND_NO_EXC); } diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vcvtss2si-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vcvtss2si-1.c index 96289ef95c5..e3f42238097 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-vcvtss2si-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-vcvtss2si-1.c @@ -1,13 +1,15 @@ /* { dg-do compile } */ /* { dg-options "-O2 -mavx512f" } */ /* { dg-final { scan-assembler-times "vcvtss2sil?\[ \\t\]+\[^\n\]*\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+.{6}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtss2sil?\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+.{6}(?:\n|\[ \\t\]+#)" 1 } } */ #include volatile __m128 x; -volatile unsigned y; +volatile unsigned y, z; void extern avx512f_test (void) { y = _mm_cvt_roundss_i32 (x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + z = _mm_cvtss_i32 (x); } diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vcvtss2si64-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vcvtss2si64-1.c index 7ada1f4cd48..86ef95a64fd 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-vcvtss2si64-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-vcvtss2si64-1.c @@ -1,14 +1,16 @@ /* { dg-do compile { target { ! 
ia32 } } } */ /* { dg-options "-O2 -mavx512f" } */ /* { dg-final { scan-assembler-times "vcvtss2siq\[ \\t\]+\[^\n\]*\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+.{6}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vcvtss2siq\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+.{6}(?:\n|\[ \\t\]+#)" 1 } } */ #include volatile __m128 x; -volatile unsigned long long y; +volatile unsigned long long y, z; void extern avx512f_test (void) { y = _mm_cvt_roundss_i64 (x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + z = _mm_cvtss_i64 (x); } diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vscalefsd-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vscalefsd-1.c index 09bc5c63bb7..d0ba9cf0e0e 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-vscalefsd-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-vscalefsd-1.c @@ -4,6 +4,8 @@ /* { dg-final { scan-assembler-times "vscalefsd\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vscalefsd\[ \\t\]+\[^\n\]*\{rd-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vscalefsd\[ \\t\]+\[^\n\]*\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vscalefsd\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vscalefsd\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ #include @@ -18,4 +20,6 @@ avx512f_test (void) x = _mm_scalef_round_sd (x, x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); x = _mm_mask_scalef_round_sd (x, m, x, x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); x = _mm_maskz_scalef_round_sd (m, x, x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + x = _mm_mask_scalef_sd (x, m, x, x); + x = _mm_maskz_scalef_sd (m, x, x); } diff --git 
a/gcc/testsuite/gcc.target/i386/avx512f-vscalefsd-2.c b/gcc/testsuite/gcc.target/i386/avx512f-vscalefsd-2.c index afe73dc5e83..986e9787a25 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-vscalefsd-2.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-vscalefsd-2.c @@ -18,7 +18,7 @@ compute_scalefsd (double *s1, double *s2, double *r) void static avx512f_test (void) { - union128d res1, res2, res3, res4; + union128d res1, res2, res3, res4, res5, res6; union128d s1, s2; double res_ref[SIZE]; MASK_TYPE mask = MASK_VALUE; @@ -33,6 +33,8 @@ avx512f_test (void) res2.a[i] = DEFAULT_VALUE; res3.a[i] = DEFAULT_VALUE; res4.a[i] = DEFAULT_VALUE; + res5.a[i] = DEFAULT_VALUE; + res6.a[i] = DEFAULT_VALUE; } res1.x = _mm_scalef_sd (s1.x, s2.x); @@ -42,6 +44,8 @@ avx512f_test (void) _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); res4.x = _mm_maskz_scalef_round_sd (mask, s1.x, s2.x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + res5.x = _mm_mask_scalef_sd (s1.x, mask, s1.x, s2.x); + res6.x = _mm_maskz_scalef_sd (mask, s1.x, s2.x); compute_scalefsd (s1.a, s2.a, res_ref); @@ -55,8 +59,14 @@ avx512f_test (void) if (check_union128d (res3, res_ref)) abort (); + if (check_union128d (res5, res_ref)) + abort (); + MASK_ZERO (d) (res_ref, mask, 1); if (check_union128d (res4, res_ref)) abort (); + + if (check_union128d (res6, res_ref)) + abort (); } diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vscalefss-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vscalefss-1.c index d1af336c267..381d39ef21b 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-vscalefss-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-vscalefss-1.c @@ -4,6 +4,8 @@ /* { dg-final { scan-assembler-times "vscalefss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vscalefss\[ \\t\]+\[^\n\]*\{ru-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vscalefss\[ 
\\t\]+\[^\n\]*\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vscalefss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vscalefss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ #include @@ -17,4 +19,6 @@ avx512f_test (void) x = _mm_scalef_round_ss (x, x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); x = _mm_mask_scalef_round_ss (x, m, x, x, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); x = _mm_maskz_scalef_round_ss (m, x, x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + x = _mm_mask_scalef_ss (x, m, x, x); + x = _mm_maskz_scalef_ss (m, x, x); } diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vscalefss-2.c b/gcc/testsuite/gcc.target/i386/avx512f-vscalefss-2.c index 811ff15e5e6..d83feecf97d 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-vscalefss-2.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-vscalefss-2.c @@ -20,7 +20,7 @@ compute_scalefss (float *s1, float *s2, float *r) static void avx512f_test (void) { - union128 res1, res2, res3, res4; + union128 res1, res2, res3, res4, res5, res6; union128 s1, s2; float res_ref[SIZE]; MASK_TYPE mask = MASK_VALUE; @@ -35,6 +35,8 @@ avx512f_test (void) res2.a[i] = DEFAULT_VALUE; res3.a[i] = DEFAULT_VALUE; res4.a[i] = DEFAULT_VALUE; + res5.a[i] = DEFAULT_VALUE; + res6.a[i] = DEFAULT_VALUE; } res1.x = _mm_scalef_ss (s1.x, s2.x); @@ -44,6 +46,8 @@ avx512f_test (void) _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); res4.x = _mm_maskz_scalef_round_ss (mask, s1.x, s2.x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + res5.x = _mm_mask_scalef_ss (s1.x, mask, s1.x, s2.x); + res6.x = _mm_maskz_scalef_ss (mask, s1.x, s2.x); compute_scalefss (s1.a, s2.a, res_ref); @@ -57,8 +61,14 @@ avx512f_test (void) if (check_union128 (res3, res_ref)) abort (); + if 
(check_union128 (res5, res_ref)) + abort (); + MASK_ZERO () (res_ref, mask, 1); if (check_union128 (res4, res_ref)) abort (); + + if (check_union128 (res6, res_ref)) + abort (); } diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vsqrtsd-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vsqrtsd-1.c index a7d7af9b81b..22601e9a9e6 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-vsqrtsd-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-vsqrtsd-1.c @@ -3,10 +3,12 @@ /* { dg-final { scan-assembler-times "vsqrtsd\[ \\t\]+\[^\n\]*\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vsqrtsd\[ \\t\]+\[^\n\]*\{rd-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vsqrtsd\[ \\t\]+\[^\n\]*\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vsqrtsd\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vsqrtsd\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ #include -volatile __m128d x1, x2; +volatile __m128d x1, x2, x3; volatile __mmask8 m; void extern @@ -15,4 +17,6 @@ avx512f_test (void) x1 = _mm_sqrt_round_sd (x1, x2, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); x1 = _mm_mask_sqrt_round_sd (x1, m, x1, x2, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); x1 = _mm_maskz_sqrt_round_sd (m, x1, x2, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + x1 = _mm_mask_sqrt_sd (x3, m, x1, x2); + x1 = _mm_maskz_sqrt_sd (m, x1, x2); } diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vsqrtsd-2.c b/gcc/testsuite/gcc.target/i386/avx512f-vsqrtsd-2.c index 49ca7eea39f..ec908fdc803 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-vsqrtsd-2.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-vsqrtsd-2.c @@ 
-18,7 +18,7 @@ compute_sqrtsd (double *s1, double *s2, double *r) void static avx512f_test (void) { - union128d res1, res2, res3; + union128d res1, res2, res3, res4, res5; union128d s1, s2; double res_ref[SIZE]; MASK_TYPE mask = MASK_VALUE; @@ -32,6 +32,8 @@ avx512f_test (void) res1.a[i] = DEFAULT_VALUE; res2.a[i] = DEFAULT_VALUE; res3.a[i] = DEFAULT_VALUE; + res4.a[i] = DEFAULT_VALUE; + res5.a[i] = DEFAULT_VALUE; } res1.x = _mm_sqrt_round_sd (s1.x, s2.x, @@ -40,6 +42,8 @@ avx512f_test (void) _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); res3.x = _mm_maskz_sqrt_round_sd (mask, s1.x, s2.x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + res4.x = _mm_mask_sqrt_sd (s1.x, mask, s1.x, s2.x); + res5.x = _mm_maskz_sqrt_sd (mask, s1.x, s2.x); compute_sqrtsd (s1.a, s2.a, res_ref); @@ -51,10 +55,16 @@ avx512f_test (void) if (check_union128d (res2, res_ref)) abort (); + if (check_union128d (res4, res_ref)) + abort (); + MASK_ZERO (d) (res_ref, mask, 1); if (check_union128d (res3, res_ref)) abort (); + + if (check_union128d (res5, res_ref)) + abort (); } diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vsqrtss-1.c b/gcc/testsuite/gcc.target/i386/avx512f-vsqrtss-1.c index 103ff30e560..c32903b702f 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-vsqrtss-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-vsqrtss-1.c @@ -3,11 +3,13 @@ /* { dg-final { scan-assembler-times "vsqrtss\[ \\t\]+\[^\n\]*\{rn-sae\}\[^\{\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vsqrtss\[ \\t\]+\[^\n\]*\{rd-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vsqrtss\[ \\t\]+\[^\n\]*\{rz-sae\}\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vsqrtss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { 
scan-assembler-times "vsqrtss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ #include -volatile __m128 x1, x2; +volatile __m128 x1, x2, x3; volatile __mmask8 m; void extern @@ -16,4 +18,6 @@ avx512f_test (void) x1 = _mm_sqrt_round_ss (x1, x2, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); x1 = _mm_mask_sqrt_round_ss (x1, m, x1, x2, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); x1 = _mm_maskz_sqrt_round_ss (m, x1, x2, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + x1 = _mm_mask_sqrt_ss (x3, m, x1, x2); + x1 = _mm_maskz_sqrt_ss (m, x1, x2); } diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vsqrtss-2.c b/gcc/testsuite/gcc.target/i386/avx512f-vsqrtss-2.c index 90f88bef904..33222bb0424 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-vsqrtss-2.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-vsqrtss-2.c @@ -22,7 +22,7 @@ compute_sqrtss (float *s1, float *s2, float *r) static void avx512f_test (void) { - union128 res1, res2, res3; + union128 res1, res2, res3, res4, res5; union128 s1, s2; float res_ref[SIZE]; MASK_TYPE mask = MASK_VALUE; @@ -36,6 +36,8 @@ avx512f_test (void) res1.a[i] = DEFAULT_VALUE; res2.a[i] = DEFAULT_VALUE; res3.a[i] = DEFAULT_VALUE; + res4.a[i] = DEFAULT_VALUE; + res5.a[i] = DEFAULT_VALUE; } res1.x = _mm_sqrt_round_ss (s1.x, s2.x, @@ -44,6 +46,8 @@ avx512f_test (void) _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); res3.x = _mm_maskz_sqrt_round_ss (mask, s1.x, s2.x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + res4.x = _mm_mask_sqrt_ss (s1.x, mask, s1.x, s2.x); + res5.x = _mm_maskz_sqrt_ss (mask, s1.x, s2.x); compute_sqrtss (s1.a, s2.a, res_ref); @@ -55,9 +59,15 @@ avx512f_test (void) if (check_union128 (res2, res_ref)) abort (); + if (check_union128 (res4, res_ref)) + abort (); + MASK_ZERO () (res_ref, mask, 1); if (check_union128 (res3, res_ref)) abort (); + + if (check_union128 (res5, res_ref)) + abort (); } diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vmovdqa32-1.c 
b/gcc/testsuite/gcc.target/i386/avx512vl-vmovdqa32-1.c index 217afbc6904..5c6a3d0bfb4 100644 --- a/gcc/testsuite/gcc.target/i386/avx512vl-vmovdqa32-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vmovdqa32-1.c @@ -10,12 +10,16 @@ /* { dg-final { scan-assembler-times "vmovdqa32\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vmovdqa32\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*\\)\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vmovdqa32\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*\\)\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vmovdqa\[ \\t\]+\\(\[^\{\n\]*\\)\[^\n\]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vmovdqa\[ \\t\]+\\(\[^\{\n\]*\\)\[^\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vmovdqa\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*\[ \\t\]+\\(\[^\n\]*\\)(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vmovdqa\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*\[ \\t\]+\\(\[^\n\]*\\)(?:\n|\[ \\t\]+#)" 1 } } */ #include -int *p; -volatile __m256i yy, y2; -volatile __m128i xx, x2; +int *p, *p1, *p2; +volatile __m256i yy, y2, yyy; +volatile __m128i xx, x2, xxx; volatile __mmask8 m; void extern @@ -30,9 +34,15 @@ avx512vl_test (void) yy = _mm256_mask_load_epi32 (yy, m, p); xx = _mm_mask_load_epi32 (xx, m, p); + yyy = _mm256_load_epi32 (p2); + xxx = _mm_load_epi32 (p1); + yy = _mm256_maskz_load_epi32 (m, p); xx = _mm_maskz_load_epi32 (m, p); _mm256_mask_store_epi32 (p, m, yy); _mm_mask_store_epi32 (p, m, xx); + + _mm256_store_epi32 (p2, yyy); + _mm_store_epi32 (p2, xxx); } diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vmovdqa64-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vmovdqa64-1.c index 9dc794d6a80..592541aeb8e 100644 --- a/gcc/testsuite/gcc.target/i386/avx512vl-vmovdqa64-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vmovdqa64-1.c @@ -4,8 +4,8 @@ 
/* { dg-final { scan-assembler-times "(?:vmovdqa64|vpblendmq)\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ -/* { dg-final { scan-assembler-times "vmovdqa\[ \\t\]+\\(\[^\n\]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { target nonpic } } } */ -/* { dg-final { scan-assembler-times "vmovdqa\[ \\t\]+\\(\[^\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 { target nonpic } } } */ +/* { dg-final { scan-assembler-times "vmovdqa\[ \\t\]+\\(\[^\n\]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 { target nonpic } } } */ +/* { dg-final { scan-assembler-times "vmovdqa\[ \\t\]+\\(\[^\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 { target nonpic } } } */ /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */ @@ -35,6 +35,9 @@ avx512vl_test (void) yy = _mm256_load_si256 (p1); xx = _mm_load_si128 (p2); + yy = _mm256_load_epi64 (p); + xx = _mm_load_epi64 (p); + yy = _mm256_mask_load_epi64 (yy, m, p); xx = _mm_mask_load_epi64 (xx, m, p); diff --git a/gcc/testsuite/gcc.target/i386/pr95483-1.c b/gcc/testsuite/gcc.target/i386/pr95483-1.c new file mode 100644 index 00000000000..6b008261f35 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr95483-1.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse" } */ +/* { dg-final { scan-assembler-times "pxor\[ \\t\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { 
scan-assembler-times "pinsrw\[ \\t\]+\[^\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "pextrw\[ \\t\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*(?:\n|\[ \\t\]+#)" 1 } } */ + + +#include +unsigned short *p1,*p2; +volatile __m128i x1,x2; + +void foo (void) +{ + x1=_mm_loadu_si16 (p1); + _mm_storeu_si16 (p2, x2); +} diff --git a/gcc/testsuite/gcc.target/i386/pr95483-2.c b/gcc/testsuite/gcc.target/i386/pr95483-2.c new file mode 100644 index 00000000000..a12aea8f83b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr95483-2.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse2" } */ +/* { dg-final { scan-assembler-times "(?:vpinsrd|movd)\[ \\t\]+\[^\n\]*\\)\[^\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "movd\[ \\t\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*\\)(?:\n|\[ \\t\]+#)" 1 } } */ + +#include +unsigned int *p1,*p2; +volatile __m128i x1,x2; + +void foo (void) +{ + x1=_mm_loadu_si32 (p1); + _mm_storeu_si32 (p2, x2); +} diff --git a/gcc/testsuite/gcc.target/i386/pr95483-3.c b/gcc/testsuite/gcc.target/i386/pr95483-3.c new file mode 100644 index 00000000000..ec9018965b2 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr95483-3.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-times "vmovd\[ \\t\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*(?:\n|\[ \\t\]+#)" 1 } } */ + +#include +volatile __m256i x1; + +int foo (void) +{ + return _mm256_cvtsi256_si32 (x1); +} diff --git a/gcc/testsuite/gcc.target/i386/pr95483-4.c b/gcc/testsuite/gcc.target/i386/pr95483-4.c new file mode 100644 index 00000000000..1d4cdb45020 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr95483-4.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx512f" } */ +/* { dg-final { scan-assembler-times "vmovd\[ \\t\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*(?:\n|\[ \\t\]+#)" 1 } } */ + +#include +volatile __m512i x1; + +int foo (void) +{ + return _mm512_cvtsi512_si32 (x1); +} diff --git 
a/gcc/testsuite/gcc.target/i386/pr95483-5.c b/gcc/testsuite/gcc.target/i386/pr95483-5.c new file mode 100644 index 00000000000..b52e39dff79 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr95483-5.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512bw -mavx512vl -O2" } */ +/* { dg-final { scan-assembler-times "(?:vmovdqu8|vinserti128)\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "(?:vmovdqu8|vextracti128)\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*\\)(?:\n|\[ \\t\]+#)" 1 } } */ + +#include + +char *p, *p1; +volatile __m256i yyy; + +void extern +avx512bw_test (void) +{ + yyy = _mm256_loadu_epi8 (p); + _mm256_storeu_epi8 (p1, yyy); +} diff --git a/gcc/testsuite/gcc.target/i386/pr95483-6.c b/gcc/testsuite/gcc.target/i386/pr95483-6.c new file mode 100644 index 00000000000..3540bf8218e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr95483-6.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512vl -O2" } */ +/* { dg-final { scan-assembler-times "(?:vinserti128|vmovdqu)\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */ + +#include + +int *p; +long long *p1; +volatile __m256i x1, x2; + +void extern +avx512vl_test (void) +{ + x1 = _mm256_loadu_epi32 (p); + x2 = _mm256_loadu_epi64 (p1); +} diff --git a/gcc/testsuite/gcc.target/i386/pr95483-7.c b/gcc/testsuite/gcc.target/i386/pr95483-7.c new file mode 100644 index 00000000000..f72aa93eae2 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr95483-7.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512vl -O2" } */ +/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */ + +#include + +int *p; +long long *p1; +volatile __m128i x1, x2; + +void extern +avx512vl_test (void) +{ + x1 = _mm_loadu_epi32 (p); + x2 = _mm_loadu_epi64 (p1); +} diff --git a/gcc/testsuite/gcc.target/i386/sse-13.c b/gcc/testsuite/gcc.target/i386/sse-13.c index 
4d6c9b3a17a..dd6dd55ae8b 100644 --- a/gcc/testsuite/gcc.target/i386/sse-13.c +++ b/gcc/testsuite/gcc.target/i386/sse-13.c @@ -398,6 +398,8 @@ #define __builtin_ia32_vfmaddss3_mask3(A, B, C, D, E) __builtin_ia32_vfmaddss3_mask3(A, B, C, D, 8) #define __builtin_ia32_vfmaddss3_maskz(A, B, C, D, E) __builtin_ia32_vfmaddss3_maskz(A, B, C, D, 8) #define __builtin_ia32_vfmsubss3_mask3(A, B, C, D, E) __builtin_ia32_vfmsubss3_mask3(A, B, C, D, 8) +#define __builtin_ia32_cvtsd2ss_mask_round(A, B, C, D, E) __builtin_ia32_cvtsd2ss_mask_round(A, B, C, D, 8) +#define __builtin_ia32_cvtss2sd_mask_round(A, B, C, D, E) __builtin_ia32_cvtss2sd_mask_round(A, B, C, D, 8) /* avx512erintrin.h */ #define __builtin_ia32_exp2ps_mask(A, B, C, D) __builtin_ia32_exp2ps_mask(A, B, C, 8) @@ -410,6 +412,10 @@ #define __builtin_ia32_rcp28sd_round(A, B, C) __builtin_ia32_rcp28sd_round(A, B, 8) #define __builtin_ia32_rsqrt28ss_round(A, B, C) __builtin_ia32_rsqrt28ss_round(A, B, 8) #define __builtin_ia32_rsqrt28sd_round(A, B, C) __builtin_ia32_rsqrt28sd_round(A, B, 8) +#define __builtin_ia32_rcp28sd_mask_round(A, B, C, D, E) __builtin_ia32_rcp28sd_mask_round(A, B, C, D, 8) +#define __builtin_ia32_rcp28ss_mask_round(A, B, C, D, E) __builtin_ia32_rcp28ss_mask_round(A, B, C, D, 8) +#define __builtin_ia32_rsqrt28sd_mask_round(A, B, C, D, E) __builtin_ia32_rsqrt28sd_mask_round(A, B, C, D, 8) +#define __builtin_ia32_rsqrt28ss_mask_round(A, B, C, D, E) __builtin_ia32_rsqrt28ss_mask_round(A, B, C, D, 8) /* avx512pfintrin.h */ #define __builtin_ia32_gatherpfdps(A, B, C, D, E) __builtin_ia32_gatherpfdps(A, B, C, 1, _MM_HINT_T0) @@ -621,6 +627,10 @@ #define __builtin_ia32_cmpw128_mask(A, B, E, D) __builtin_ia32_cmpw128_mask(A, B, 1, D) #define __builtin_ia32_cmpb256_mask(A, B, E, D) __builtin_ia32_cmpb256_mask(A, B, 1, D) #define __builtin_ia32_cmpb128_mask(A, B, E, D) __builtin_ia32_cmpb128_mask(A, B, 1, D) +#define __builtin_ia32_reducepd512_mask_round(A,B,C,D,E) 
__builtin_ia32_reducepd512_mask_round(A,1,C,D,8) +#define __builtin_ia32_reduceps512_mask_round(A,B,C,D,E) __builtin_ia32_reduceps512_mask_round(A,1,C,D,8) +#define __builtin_ia32_reducesd_mask_round(A, B, F, W, U, E) __builtin_ia32_reducesd_mask_round(A, B, 1, W, U, 8) +#define __builtin_ia32_reducess_mask_round(A, B, F, W, U, E) __builtin_ia32_reducess_mask_round(A, B, 1, W, U, 8) /* avx512vldqintrin.h */ #define __builtin_ia32_reduceps256_mask(A, E, C, D) __builtin_ia32_reduceps256_mask(A, 1, C, D) diff --git a/gcc/testsuite/gcc.target/i386/sse-23.c b/gcc/testsuite/gcc.target/i386/sse-23.c index 9ca7c5d919d..b51644c70e2 100644 --- a/gcc/testsuite/gcc.target/i386/sse-23.c +++ b/gcc/testsuite/gcc.target/i386/sse-23.c @@ -399,6 +399,8 @@ #define __builtin_ia32_vfmaddss3_mask3(A, B, C, D, E) __builtin_ia32_vfmaddss3_mask3(A, B, C, D, 8) #define __builtin_ia32_vfmaddss3_maskz(A, B, C, D, E) __builtin_ia32_vfmaddss3_maskz(A, B, C, D, 8) #define __builtin_ia32_vfmsubss3_mask3(A, B, C, D, E) __builtin_ia32_vfmsubss3_mask3(A, B, C, D, 8) +#define __builtin_ia32_cvtsd2ss_mask_round(A, B, C, D, E) __builtin_ia32_cvtsd2ss_mask_round(A, B, C, D, 8) +#define __builtin_ia32_cvtss2sd_mask_round(A, B, C, D, E) __builtin_ia32_cvtss2sd_mask_round(A, B, C, D, 8) /* avx512pfintrin.h */ #define __builtin_ia32_gatherpfdps(A, B, C, D, E) __builtin_ia32_gatherpfdps(A, B, C, 1, _MM_HINT_T0) @@ -421,6 +423,10 @@ #define __builtin_ia32_rcp28ss_round(A, B, C) __builtin_ia32_rcp28ss_round(A, B, 8) #define __builtin_ia32_rsqrt28sd_round(A, B, C) __builtin_ia32_rsqrt28sd_round(A, B, 8) #define __builtin_ia32_rsqrt28ss_round(A, B, C) __builtin_ia32_rsqrt28ss_round(A, B, 8) +#define __builtin_ia32_rcp28sd_mask_round(A, B, C, D, E) __builtin_ia32_rcp28sd_mask_round(A, B, C, D, 8) +#define __builtin_ia32_rcp28ss_mask_round(A, B, C, D, E) __builtin_ia32_rcp28ss_mask_round(A, B, C, D, 8) +#define __builtin_ia32_rsqrt28sd_mask_round(A, B, C, D, E) __builtin_ia32_rsqrt28sd_mask_round(A, B, C, D, 8) 
+#define __builtin_ia32_rsqrt28ss_mask_round(A, B, C, D, E) __builtin_ia32_rsqrt28ss_mask_round(A, B, C, D, 8) /* shaintrin.h */ #define __builtin_ia32_sha1rnds4(A, B, C) __builtin_ia32_sha1rnds4(A, B, 1) @@ -482,6 +488,10 @@ #define __builtin_ia32_cvtps2qq512_mask(A, B, C, D) __builtin_ia32_cvtps2qq512_mask(A, B, C, 8) #define __builtin_ia32_cvtpd2uqq512_mask(A, B, C, D) __builtin_ia32_cvtpd2uqq512_mask(A, B, C, 8) #define __builtin_ia32_cvtpd2qq512_mask(A, B, C, D) __builtin_ia32_cvtpd2qq512_mask(A, B, C, 8) +#define __builtin_ia32_reducesd_mask_round(A, B, C, D, E, F) __builtin_ia32_reducesd_mask_round(A, B, 8, D, E, 8) +#define __builtin_ia32_reducess_mask_round(A, B, C, D, E, F) __builtin_ia32_reducess_mask_round(A, B, 8, D, E, 8) +#define __builtin_ia32_reducepd512_mask_round(A, B, C, D, E) __builtin_ia32_reducepd512_mask_round(A, 8, C, D, 8) +#define __builtin_ia32_reduceps512_mask_round(A, B, C, D, E) __builtin_ia32_reduceps512_mask_round(A, 8, C, D, 8) /* avx512vlintrin.h */ #define __builtin_ia32_vpermilps_mask(A, E, C, D) __builtin_ia32_vpermilps_mask(A, 1, C, D)