re PR target/79932 (_mm512_packus_epi32 does not compile under -O0)
author: Jakub Jelinek <jakub@redhat.com>
Thu, 9 Mar 2017 09:11:06 +0000 (10:11 +0100)
committer: Jakub Jelinek <jakub@gcc.gnu.org>
Thu, 9 Mar 2017 09:11:06 +0000 (10:11 +0100)
PR target/79932
* config/i386/avx512vlintrin.h (_mm256_cmpge_epi32_mask,
_mm256_cmpge_epi64_mask, _mm256_cmpge_epu32_mask,
_mm256_cmpge_epu64_mask, _mm256_cmple_epi32_mask,
_mm256_cmple_epi64_mask, _mm256_cmple_epu32_mask,
_mm256_cmple_epu64_mask, _mm256_cmplt_epi32_mask,
_mm256_cmplt_epi64_mask, _mm256_cmplt_epu32_mask,
_mm256_cmplt_epu64_mask, _mm256_cmpneq_epi32_mask,
_mm256_cmpneq_epi64_mask, _mm256_cmpneq_epu32_mask,
_mm256_cmpneq_epu64_mask, _mm256_mask_cmpge_epi32_mask,
_mm256_mask_cmpge_epi64_mask, _mm256_mask_cmpge_epu32_mask,
_mm256_mask_cmpge_epu64_mask, _mm256_mask_cmple_epi32_mask,
_mm256_mask_cmple_epi64_mask, _mm256_mask_cmple_epu32_mask,
_mm256_mask_cmple_epu64_mask, _mm256_mask_cmplt_epi32_mask,
_mm256_mask_cmplt_epi64_mask, _mm256_mask_cmplt_epu32_mask,
_mm256_mask_cmplt_epu64_mask, _mm256_mask_cmpneq_epi32_mask,
_mm256_mask_cmpneq_epi64_mask, _mm256_mask_cmpneq_epu32_mask,
_mm256_mask_cmpneq_epu64_mask, _mm_cmpge_epi32_mask,
_mm_cmpge_epi64_mask, _mm_cmpge_epu32_mask, _mm_cmpge_epu64_mask,
_mm_cmple_epi32_mask, _mm_cmple_epi64_mask, _mm_cmple_epu32_mask,
_mm_cmple_epu64_mask, _mm_cmplt_epi32_mask, _mm_cmplt_epi64_mask,
_mm_cmplt_epu32_mask, _mm_cmplt_epu64_mask, _mm_cmpneq_epi32_mask,
_mm_cmpneq_epi64_mask, _mm_cmpneq_epu32_mask, _mm_cmpneq_epu64_mask,
_mm_mask_cmpge_epi32_mask, _mm_mask_cmpge_epi64_mask,
_mm_mask_cmpge_epu32_mask, _mm_mask_cmpge_epu64_mask,
_mm_mask_cmple_epi32_mask, _mm_mask_cmple_epi64_mask,
_mm_mask_cmple_epu32_mask, _mm_mask_cmple_epu64_mask,
_mm_mask_cmplt_epi32_mask, _mm_mask_cmplt_epi64_mask,
_mm_mask_cmplt_epu32_mask, _mm_mask_cmplt_epu64_mask,
_mm_mask_cmpneq_epi32_mask, _mm_mask_cmpneq_epi64_mask,
_mm_mask_cmpneq_epu32_mask, _mm_mask_cmpneq_epu64_mask): Move
definitions outside of __OPTIMIZE__ guarded section.

* gcc.target/i386/pr79932-2.c: New test.

From-SVN: r245990

gcc/ChangeLog
gcc/config/i386/avx512vlintrin.h
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.target/i386/pr79932-2.c [new file with mode: 0644]

index 33f340b49b8ccecab464401c9c1fe6ce5a4cc53f..3d85fe0e98180187f6e53152122b7f3354085201 100644 (file)
@@ -1,5 +1,38 @@
 2017-03-09  Jakub Jelinek  <jakub@redhat.com>
 
+       PR target/79932
+       * config/i386/avx512vlintrin.h (_mm256_cmpge_epi32_mask,
+       _mm256_cmpge_epi64_mask, _mm256_cmpge_epu32_mask,
+       _mm256_cmpge_epu64_mask, _mm256_cmple_epi32_mask,
+       _mm256_cmple_epi64_mask, _mm256_cmple_epu32_mask,
+       _mm256_cmple_epu64_mask, _mm256_cmplt_epi32_mask,
+       _mm256_cmplt_epi64_mask, _mm256_cmplt_epu32_mask,
+       _mm256_cmplt_epu64_mask, _mm256_cmpneq_epi32_mask,
+       _mm256_cmpneq_epi64_mask, _mm256_cmpneq_epu32_mask,
+       _mm256_cmpneq_epu64_mask, _mm256_mask_cmpge_epi32_mask,
+       _mm256_mask_cmpge_epi64_mask, _mm256_mask_cmpge_epu32_mask,
+       _mm256_mask_cmpge_epu64_mask, _mm256_mask_cmple_epi32_mask,
+       _mm256_mask_cmple_epi64_mask, _mm256_mask_cmple_epu32_mask,
+       _mm256_mask_cmple_epu64_mask, _mm256_mask_cmplt_epi32_mask,
+       _mm256_mask_cmplt_epi64_mask, _mm256_mask_cmplt_epu32_mask,
+       _mm256_mask_cmplt_epu64_mask, _mm256_mask_cmpneq_epi32_mask,
+       _mm256_mask_cmpneq_epi64_mask, _mm256_mask_cmpneq_epu32_mask,
+       _mm256_mask_cmpneq_epu64_mask, _mm_cmpge_epi32_mask,
+       _mm_cmpge_epi64_mask, _mm_cmpge_epu32_mask, _mm_cmpge_epu64_mask,
+       _mm_cmple_epi32_mask, _mm_cmple_epi64_mask, _mm_cmple_epu32_mask,
+       _mm_cmple_epu64_mask, _mm_cmplt_epi32_mask, _mm_cmplt_epi64_mask,
+       _mm_cmplt_epu32_mask, _mm_cmplt_epu64_mask, _mm_cmpneq_epi32_mask,
+       _mm_cmpneq_epi64_mask, _mm_cmpneq_epu32_mask, _mm_cmpneq_epu64_mask,
+       _mm_mask_cmpge_epi32_mask, _mm_mask_cmpge_epi64_mask,
+       _mm_mask_cmpge_epu32_mask, _mm_mask_cmpge_epu64_mask,
+       _mm_mask_cmple_epi32_mask, _mm_mask_cmple_epi64_mask,
+       _mm_mask_cmple_epu32_mask, _mm_mask_cmple_epu64_mask,
+       _mm_mask_cmplt_epi32_mask, _mm_mask_cmplt_epi64_mask,
+       _mm_mask_cmplt_epu32_mask, _mm_mask_cmplt_epu64_mask,
+       _mm_mask_cmpneq_epi32_mask, _mm_mask_cmpneq_epi64_mask,
+       _mm_mask_cmpneq_epu32_mask, _mm_mask_cmpneq_epu64_mask): Move
+       definitions outside of __OPTIMIZE__ guarded section.
+
        PR target/79932
        * config/i386/avx512bwintrin.h (_mm512_packs_epi32,
        _mm512_maskz_packs_epi32, _mm512_mask_packs_epi32,
index 9750cd811aade7724413f4e80a5cf8abcce9e537..f62f641188ee65c26f0011d81cbd446f25334416 100644 (file)
@@ -9172,3192 +9172,3192 @@ _mm256_mask_permutexvar_epi32 (__m256i __W, __mmask8 __M, __m256i __X,
                                                     __M);
 }
 
-#ifdef __OPTIMIZE__
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_permutex_epi64 (__m256i __W, __mmask8 __M,
-                           __m256i __X, const int __I)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpneq_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_permdi256_mask ((__v4di) __X,
-                                                 __I,
-                                                 (__v4di) __W,
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+                                                 (__v8si) __Y, 4,
                                                  (__mmask8) __M);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_permutex_epi64 (__mmask8 __M, __m256i __X, const int __I)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpneq_epu32_mask (__m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_permdi256_mask ((__v4di) __X,
-                                                 __I,
-                                                 (__v4di)
-                                                 _mm256_setzero_si256 (),
-                                                 (__mmask8) __M);
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+                                                 (__v8si) __Y, 4,
+                                                 (__mmask8) -1);
 }
 
-extern __inline __m256d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_shuffle_pd (__m256d __W, __mmask8 __U, __m256d __A,
-                       __m256d __B, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmplt_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256d) __builtin_ia32_shufpd256_mask ((__v4df) __A,
-                                                 (__v4df) __B, __imm,
-                                                 (__v4df) __W,
-                                                 (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+                                                 (__v8si) __Y, 1,
+                                                 (__mmask8) __M);
 }
 
-extern __inline __m256d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_shuffle_pd (__mmask8 __U, __m256d __A, __m256d __B,
-                        const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmplt_epu32_mask (__m256i __X, __m256i __Y)
 {
-  return (__m256d) __builtin_ia32_shufpd256_mask ((__v4df) __A,
-                                                 (__v4df) __B, __imm,
-                                                 (__v4df)
-                                                 _mm256_setzero_pd (),
-                                                 (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+                                                 (__v8si) __Y, 1,
+                                                 (__mmask8) -1);
 }
 
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_shuffle_pd (__m128d __W, __mmask8 __U, __m128d __A,
-                    __m128d __B, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpge_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m128d) __builtin_ia32_shufpd128_mask ((__v2df) __A,
-                                                 (__v2df) __B, __imm,
-                                                 (__v2df) __W,
-                                                 (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+                                                 (__v8si) __Y, 5,
+                                                 (__mmask8) __M);
 }
 
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_shuffle_pd (__mmask8 __U, __m128d __A, __m128d __B,
-                     const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpge_epu32_mask (__m256i __X, __m256i __Y)
 {
-  return (__m128d) __builtin_ia32_shufpd128_mask ((__v2df) __A,
-                                                 (__v2df) __B, __imm,
-                                                 (__v2df)
-                                                 _mm_setzero_pd (),
-                                                 (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+                                                 (__v8si) __Y, 5,
+                                                 (__mmask8) -1);
 }
 
-extern __inline __m256
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_shuffle_ps (__m256 __W, __mmask8 __U, __m256 __A,
-                       __m256 __B, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmple_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256) __builtin_ia32_shufps256_mask ((__v8sf) __A,
-                                                (__v8sf) __B, __imm,
-                                                (__v8sf) __W,
-                                                (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+                                                 (__v8si) __Y, 2,
+                                                 (__mmask8) __M);
 }
 
-extern __inline __m256
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_shuffle_ps (__mmask8 __U, __m256 __A, __m256 __B,
-                        const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmple_epu32_mask (__m256i __X, __m256i __Y)
 {
-  return (__m256) __builtin_ia32_shufps256_mask ((__v8sf) __A,
-                                                (__v8sf) __B, __imm,
-                                                (__v8sf)
-                                                _mm256_setzero_ps (),
-                                                (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+                                                 (__v8si) __Y, 2,
+                                                 (__mmask8) -1);
 }
 
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_shuffle_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
-                    const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpneq_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m128) __builtin_ia32_shufps128_mask ((__v4sf) __A,
-                                                (__v4sf) __B, __imm,
-                                                (__v4sf) __W,
-                                                (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+                                                 (__v4di) __Y, 4,
+                                                 (__mmask8) __M);
 }
 
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_shuffle_ps (__mmask8 __U, __m128 __A, __m128 __B,
-                     const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpneq_epu64_mask (__m256i __X, __m256i __Y)
 {
-  return (__m128) __builtin_ia32_shufps128_mask ((__v4sf) __A,
-                                                (__v4sf) __B, __imm,
-                                                (__v4sf)
-                                                _mm_setzero_ps (),
-                                                (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+                                                 (__v4di) __Y, 4,
+                                                 (__mmask8) -1);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_inserti32x4 (__m256i __A, __m128i __B, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmplt_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si) __A,
-                                                       (__v4si) __B,
-                                                       __imm,
-                                                       (__v8si)
-                                                       _mm256_setzero_si256 (),
-                                                       (__mmask8) -1);
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+                                                 (__v4di) __Y, 1,
+                                                 (__mmask8) __M);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_inserti32x4 (__m256i __W, __mmask8 __U, __m256i __A,
-                        __m128i __B, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmplt_epu64_mask (__m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si) __A,
-                                                       (__v4si) __B,
-                                                       __imm,
-                                                       (__v8si) __W,
-                                                       (__mmask8)
-                                                       __U);
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+                                                 (__v4di) __Y, 1,
+                                                 (__mmask8) -1);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_inserti32x4 (__mmask8 __U, __m256i __A, __m128i __B,
-                         const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpge_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si) __A,
-                                                       (__v4si) __B,
-                                                       __imm,
-                                                       (__v8si)
-                                                       _mm256_setzero_si256 (),
-                                                       (__mmask8)
-                                                       __U);
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+                                                 (__v4di) __Y, 5,
+                                                 (__mmask8) __M);
 }
 
-extern __inline __m256
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_insertf32x4 (__m256 __A, __m128 __B, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpge_epu64_mask (__m256i __X, __m256i __Y)
 {
-  return (__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf) __A,
-                                                      (__v4sf) __B,
-                                                      __imm,
-                                                      (__v8sf)
-                                                      _mm256_setzero_ps (),
-                                                      (__mmask8) -1);
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+                                                 (__v4di) __Y, 5,
+                                                 (__mmask8) -1);
 }
 
-extern __inline __m256
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_insertf32x4 (__m256 __W, __mmask8 __U, __m256 __A,
-                        __m128 __B, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmple_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf) __A,
-                                                      (__v4sf) __B,
-                                                      __imm,
-                                                      (__v8sf) __W,
-                                                      (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+                                                 (__v4di) __Y, 2,
+                                                 (__mmask8) __M);
 }
 
-extern __inline __m256
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_insertf32x4 (__mmask8 __U, __m256 __A, __m128 __B,
-                         const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmple_epu64_mask (__m256i __X, __m256i __Y)
 {
-  return (__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf) __A,
-                                                      (__v4sf) __B,
-                                                      __imm,
-                                                      (__v8sf)
-                                                      _mm256_setzero_ps (),
-                                                      (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+                                                 (__v4di) __Y, 2,
+                                                 (__mmask8) -1);
 }
 
-extern __inline __m128i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_extracti32x4_epi32 (__m256i __A, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpneq_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si) __A,
-                                                        __imm,
-                                                        (__v4si)
-                                                        _mm_setzero_si128 (),
-                                                        (__mmask8) -1);
+  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+                                                (__v8si) __Y, 4,
+                                                (__mmask8) __M);
 }
 
-extern __inline __m128i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_extracti32x4_epi32 (__m128i __W, __mmask8 __U, __m256i __A,
-                               const int __imm)
-{
-  return (__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si) __A,
-                                                        __imm,
-                                                        (__v4si) __W,
-                                                        (__mmask8)
-                                                        __U);
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpneq_epi32_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+                                                (__v8si) __Y, 4,
+                                                (__mmask8) -1);
 }
 
-extern __inline __m128i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_extracti32x4_epi32 (__mmask8 __U, __m256i __A,
-                                const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmplt_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si) __A,
-                                                        __imm,
-                                                        (__v4si)
-                                                        _mm_setzero_si128 (),
-                                                        (__mmask8)
-                                                        __U);
+  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+                                                (__v8si) __Y, 1,
+                                                (__mmask8) __M);
 }
 
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_extractf32x4_ps (__m256 __A, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmplt_epi32_mask (__m256i __X, __m256i __Y)
 {
-  return (__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf) __A,
-                                                       __imm,
-                                                       (__v4sf)
-                                                       _mm_setzero_ps (),
-                                                       (__mmask8) -1);
+  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+                                                (__v8si) __Y, 1,
+                                                (__mmask8) -1);
 }
 
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_extractf32x4_ps (__m128 __W, __mmask8 __U, __m256 __A,
-                            const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpge_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf) __A,
-                                                       __imm,
-                                                       (__v4sf) __W,
-                                                       (__mmask8)
-                                                       __U);
+  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+                                                (__v8si) __Y, 5,
+                                                (__mmask8) __M);
 }
 
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_extractf32x4_ps (__mmask8 __U, __m256 __A,
-                             const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpge_epi32_mask (__m256i __X, __m256i __Y)
 {
-  return (__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf) __A,
-                                                       __imm,
-                                                       (__v4sf)
-                                                       _mm_setzero_ps (),
-                                                       (__mmask8)
-                                                       __U);
+  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+                                                (__v8si) __Y, 5,
+                                                (__mmask8) -1);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_shuffle_i64x2 (__m256i __A, __m256i __B, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmple_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di) __A,
-                                                      (__v4di) __B,
-                                                      __imm,
-                                                      (__v4di)
-                                                      _mm256_setzero_si256 (),
-                                                      (__mmask8) -1);
+  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+                                                (__v8si) __Y, 2,
+                                                (__mmask8) __M);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_shuffle_i64x2 (__m256i __W, __mmask8 __U, __m256i __A,
-                          __m256i __B, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmple_epi32_mask (__m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di) __A,
-                                                      (__v4di) __B,
-                                                      __imm,
-                                                      (__v4di) __W,
-                                                      (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+                                                (__v8si) __Y, 2,
+                                                (__mmask8) -1);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_shuffle_i64x2 (__mmask8 __U, __m256i __A, __m256i __B,
-                           const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpneq_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di) __A,
-                                                      (__v4di) __B,
-                                                      __imm,
-                                                      (__v4di)
-                                                      _mm256_setzero_si256 (),
-                                                      (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+                                                (__v4di) __Y, 4,
+                                                (__mmask8) __M);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_shuffle_i32x4 (__m256i __A, __m256i __B, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpneq_epi64_mask (__m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si) __A,
-                                                      (__v8si) __B,
-                                                      __imm,
-                                                      (__v8si)
-                                                      _mm256_setzero_si256 (),
-                                                      (__mmask8) -1);
+  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+                                                (__v4di) __Y, 4,
+                                                (__mmask8) -1);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_shuffle_i32x4 (__m256i __W, __mmask8 __U, __m256i __A,
-                          __m256i __B, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmplt_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si) __A,
-                                                      (__v8si) __B,
-                                                      __imm,
-                                                      (__v8si) __W,
-                                                      (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+                                                (__v4di) __Y, 1,
+                                                (__mmask8) __M);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_shuffle_i32x4 (__mmask8 __U, __m256i __A, __m256i __B,
-                           const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmplt_epi64_mask (__m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si) __A,
-                                                      (__v8si) __B,
-                                                      __imm,
-                                                      (__v8si)
-                                                      _mm256_setzero_si256 (),
-                                                      (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+                                                (__v4di) __Y, 1,
+                                                (__mmask8) -1);
 }
 
-extern __inline __m256d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_shuffle_f64x2 (__m256d __A, __m256d __B, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpge_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df) __A,
-                                                      (__v4df) __B,
-                                                      __imm,
-                                                      (__v4df)
-                                                      _mm256_setzero_pd (),
-                                                      (__mmask8) -1);
+  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+                                                (__v4di) __Y, 5,
+                                                (__mmask8) __M);
 }
 
-extern __inline __m256d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_shuffle_f64x2 (__m256d __W, __mmask8 __U, __m256d __A,
-                          __m256d __B, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpge_epi64_mask (__m256i __X, __m256i __Y)
 {
-  return (__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df) __A,
-                                                      (__v4df) __B,
-                                                      __imm,
-                                                      (__v4df) __W,
-                                                      (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+                                                (__v4di) __Y, 5,
+                                                (__mmask8) -1);
 }
 
-extern __inline __m256d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_shuffle_f64x2 (__mmask8 __U, __m256d __A, __m256d __B,
-                           const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmple_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df) __A,
-                                                      (__v4df) __B,
-                                                      __imm,
-                                                      (__v4df)
-                                                      _mm256_setzero_pd (),
-                                                      (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+                                                (__v4di) __Y, 2,
+                                                (__mmask8) __M);
 }
 
-extern __inline __m256
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_shuffle_f32x4 (__m256 __A, __m256 __B, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmple_epi64_mask (__m256i __X, __m256i __Y)
 {
-  return (__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf) __A,
-                                                     (__v8sf) __B,
-                                                     __imm,
-                                                     (__v8sf)
-                                                     _mm256_setzero_ps (),
-                                                     (__mmask8) -1);
+  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+                                                (__v4di) __Y, 2,
+                                                (__mmask8) -1);
 }
 
-extern __inline __m256
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_shuffle_f32x4 (__m256 __W, __mmask8 __U, __m256 __A,
-                          __m256 __B, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpneq_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
 {
-  return (__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf) __A,
-                                                     (__v8sf) __B,
-                                                     __imm,
-                                                     (__v8sf) __W,
-                                                     (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
+                                                 (__v4si) __Y, 4,
+                                                 (__mmask8) __M);
 }
 
-extern __inline __m256
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_shuffle_f32x4 (__mmask8 __U, __m256 __A, __m256 __B,
-                           const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_epu32_mask (__m128i __X, __m128i __Y)
 {
-  return (__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf) __A,
-                                                     (__v8sf) __B,
-                                                     __imm,
-                                                     (__v8sf)
-                                                     _mm256_setzero_ps (),
-                                                     (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
+                                                 (__v4si) __Y, 4,
+                                                 (__mmask8) -1);
 }
 
-extern __inline __m256d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_fixupimm_pd (__m256d __A, __m256d __B, __m256i __C,
-                   const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmplt_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
 {
-  return (__m256d) __builtin_ia32_fixupimmpd256_mask ((__v4df) __A,
-                                                     (__v4df) __B,
-                                                     (__v4di) __C,
-                                                     __imm,
-                                                     (__mmask8) -1);
+  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
+                                                 (__v4si) __Y, 1,
+                                                 (__mmask8) __M);
 }
 
-extern __inline __m256d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_fixupimm_pd (__m256d __A, __mmask8 __U, __m256d __B,
-                        __m256i __C, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epu32_mask (__m128i __X, __m128i __Y)
 {
-  return (__m256d) __builtin_ia32_fixupimmpd256_mask ((__v4df) __A,
-                                                     (__v4df) __B,
-                                                     (__v4di) __C,
-                                                     __imm,
-                                                     (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
+                                                 (__v4si) __Y, 1,
+                                                 (__mmask8) -1);
 }
 
-extern __inline __m256d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_fixupimm_pd (__mmask8 __U, __m256d __A, __m256d __B,
-                         __m256i __C, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpge_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
 {
-  return (__m256d) __builtin_ia32_fixupimmpd256_maskz ((__v4df) __A,
-                                                      (__v4df) __B,
-                                                      (__v4di) __C,
-                                                      __imm,
-                                                      (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
+                                                 (__v4si) __Y, 5,
+                                                 (__mmask8) __M);
 }
 
-extern __inline __m256
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_fixupimm_ps (__m256 __A, __m256 __B, __m256i __C,
-                   const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_epu32_mask (__m128i __X, __m128i __Y)
 {
-  return (__m256) __builtin_ia32_fixupimmps256_mask ((__v8sf) __A,
-                                                    (__v8sf) __B,
-                                                    (__v8si) __C,
-                                                    __imm,
-                                                    (__mmask8) -1);
+  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
+                                                 (__v4si) __Y, 5,
+                                                 (__mmask8) -1);
 }
 
-extern __inline __m256
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_fixupimm_ps (__m256 __A, __mmask8 __U, __m256 __B,
-                        __m256i __C, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmple_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
 {
-  return (__m256) __builtin_ia32_fixupimmps256_mask ((__v8sf) __A,
-                                                    (__v8sf) __B,
-                                                    (__v8si) __C,
-                                                    __imm,
-                                                    (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
+                                                 (__v4si) __Y, 2,
+                                                 (__mmask8) __M);
 }
 
-extern __inline __m256
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_fixupimm_ps (__mmask8 __U, __m256 __A, __m256 __B,
-                         __m256i __C, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_epu32_mask (__m128i __X, __m128i __Y)
 {
-  return (__m256) __builtin_ia32_fixupimmps256_maskz ((__v8sf) __A,
-                                                     (__v8sf) __B,
-                                                     (__v8si) __C,
-                                                     __imm,
-                                                     (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
+                                                 (__v4si) __Y, 2,
+                                                 (__mmask8) -1);
 }
 
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_fixupimm_pd (__m128d __A, __m128d __B, __m128i __C,
-                const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpneq_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
 {
-  return (__m128d) __builtin_ia32_fixupimmpd128_mask ((__v2df) __A,
-                                                     (__v2df) __B,
-                                                     (__v2di) __C,
-                                                     __imm,
-                                                     (__mmask8) -1);
+  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
+                                                 (__v2di) __Y, 4,
+                                                 (__mmask8) __M);
 }
 
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_fixupimm_pd (__m128d __A, __mmask8 __U, __m128d __B,
-                     __m128i __C, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_epu64_mask (__m128i __X, __m128i __Y)
 {
-  return (__m128d) __builtin_ia32_fixupimmpd128_mask ((__v2df) __A,
-                                                     (__v2df) __B,
-                                                     (__v2di) __C,
-                                                     __imm,
-                                                     (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
+                                                 (__v2di) __Y, 4,
+                                                 (__mmask8) -1);
 }
 
-extern __inline __m128d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_fixupimm_pd (__mmask8 __U, __m128d __A, __m128d __B,
-                      __m128i __C, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmplt_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
 {
-  return (__m128d) __builtin_ia32_fixupimmpd128_maskz ((__v2df) __A,
-                                                      (__v2df) __B,
-                                                      (__v2di) __C,
-                                                      __imm,
-                                                      (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
+                                                 (__v2di) __Y, 1,
+                                                 (__mmask8) __M);
 }
 
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_fixupimm_ps (__m128 __A, __m128 __B, __m128i __C, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epu64_mask (__m128i __X, __m128i __Y)
 {
-  return (__m128) __builtin_ia32_fixupimmps128_mask ((__v4sf) __A,
-                                                    (__v4sf) __B,
-                                                    (__v4si) __C,
-                                                    __imm,
-                                                    (__mmask8) -1);
+  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
+                                                 (__v2di) __Y, 1,
+                                                 (__mmask8) -1);
 }
 
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_fixupimm_ps (__m128 __A, __mmask8 __U, __m128 __B,
-                     __m128i __C, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpge_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
 {
-  return (__m128) __builtin_ia32_fixupimmps128_mask ((__v4sf) __A,
-                                                    (__v4sf) __B,
-                                                    (__v4si) __C,
-                                                    __imm,
-                                                    (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
+                                                 (__v2di) __Y, 5,
+                                                 (__mmask8) __M);
 }
 
-extern __inline __m128
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_fixupimm_ps (__mmask8 __U, __m128 __A, __m128 __B,
-                      __m128i __C, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_epu64_mask (__m128i __X, __m128i __Y)
 {
-  return (__m128) __builtin_ia32_fixupimmps128_maskz ((__v4sf) __A,
-                                                     (__v4sf) __B,
-                                                     (__v4si) __C,
-                                                     __imm,
-                                                     (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
+                                                 (__v2di) __Y, 5,
+                                                 (__mmask8) -1);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_srli_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
-                       const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmple_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
 {
-  return (__m256i) __builtin_ia32_psrldi256_mask ((__v8si) __A, __imm,
-                                                 (__v8si) __W,
-                                                 (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
+                                                 (__v2di) __Y, 2,
+                                                 (__mmask8) __M);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_srli_epi32 (__mmask8 __U, __m256i __A, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_epu64_mask (__m128i __X, __m128i __Y)
 {
-  return (__m256i) __builtin_ia32_psrldi256_mask ((__v8si) __A, __imm,
-                                                 (__v8si)
-                                                 _mm256_setzero_si256 (),
-                                                 (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
+                                                 (__v2di) __Y, 2,
+                                                 (__mmask8) -1);
 }
 
-extern __inline __m128i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_srli_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
-                    const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpneq_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
 {
-  return (__m128i) __builtin_ia32_psrldi128_mask ((__v4si) __A, __imm,
-                                                 (__v4si) __W,
-                                                 (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
+                                                (__v4si) __Y, 4,
+                                                (__mmask8) __M);
 }
 
-extern __inline __m128i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_srli_epi32 (__mmask8 __U, __m128i __A, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_epi32_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
+                                                (__v4si) __Y, 4,
+                                                (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmplt_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
+                                                (__v4si) __Y, 1,
+                                                (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epi32_mask (__m128i __X, __m128i __Y)
 {
-  return (__m128i) __builtin_ia32_psrldi128_mask ((__v4si) __A, __imm,
-                                                 (__v4si)
-                                                 _mm_setzero_si128 (),
-                                                 (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
+                                                (__v4si) __Y, 1,
+                                                (__mmask8) -1);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_srli_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
-                       const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpge_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
 {
-  return (__m256i) __builtin_ia32_psrlqi256_mask ((__v4di) __A, __imm,
-                                                 (__v4di) __W,
-                                                 (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
+                                                (__v4si) __Y, 5,
+                                                (__mmask8) __M);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_srli_epi64 (__mmask8 __U, __m256i __A, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_epi32_mask (__m128i __X, __m128i __Y)
 {
-  return (__m256i) __builtin_ia32_psrlqi256_mask ((__v4di) __A, __imm,
-                                                 (__v4di)
-                                                 _mm256_setzero_si256 (),
-                                                 (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
+                                                (__v4si) __Y, 5,
+                                                (__mmask8) -1);
 }
 
-extern __inline __m128i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_srli_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
-                    const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmple_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
 {
-  return (__m128i) __builtin_ia32_psrlqi128_mask ((__v2di) __A, __imm,
-                                                 (__v2di) __W,
-                                                 (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
+                                                (__v4si) __Y, 2,
+                                                (__mmask8) __M);
 }
 
-extern __inline __m128i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_srli_epi64 (__mmask8 __U, __m128i __A, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_epi32_mask (__m128i __X, __m128i __Y)
 {
-  return (__m128i) __builtin_ia32_psrlqi128_mask ((__v2di) __A, __imm,
-                                                 (__v2di)
-                                                 _mm_setzero_si128 (),
-                                                 (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
+                                                (__v4si) __Y, 2,
+                                                (__mmask8) -1);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_ternarylogic_epi64 (__m256i __A, __m256i __B, __m256i __C,
-                          const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpneq_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
 {
-  return (__m256i) __builtin_ia32_pternlogq256_mask ((__v4di) __A,
-                                                    (__v4di) __B,
-                                                    (__v4di) __C, __imm,
-                                                    (__mmask8) -1);
+  return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
+                                                (__v2di) __Y, 4,
+                                                (__mmask8) __M);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_ternarylogic_epi64 (__m256i __A, __mmask8 __U,
-                               __m256i __B, __m256i __C,
-                               const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_epi64_mask (__m128i __X, __m128i __Y)
 {
-  return (__m256i) __builtin_ia32_pternlogq256_mask ((__v4di) __A,
-                                                    (__v4di) __B,
-                                                    (__v4di) __C, __imm,
-                                                    (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
+                                                (__v2di) __Y, 4,
+                                                (__mmask8) -1);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_ternarylogic_epi64 (__mmask8 __U, __m256i __A,
-                                __m256i __B, __m256i __C,
-                                const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmplt_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
 {
-  return (__m256i) __builtin_ia32_pternlogq256_maskz ((__v4di) __A,
-                                                     (__v4di) __B,
-                                                     (__v4di) __C,
-                                                     __imm,
-                                                     (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
+                                                (__v2di) __Y, 1,
+                                                (__mmask8) __M);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_ternarylogic_epi32 (__m256i __A, __m256i __B, __m256i __C,
-                          const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epi64_mask (__m128i __X, __m128i __Y)
 {
-  return (__m256i) __builtin_ia32_pternlogd256_mask ((__v8si) __A,
-                                                    (__v8si) __B,
-                                                    (__v8si) __C, __imm,
-                                                    (__mmask8) -1);
+  return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
+                                                (__v2di) __Y, 1,
+                                                (__mmask8) -1);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_ternarylogic_epi32 (__m256i __A, __mmask8 __U,
-                               __m256i __B, __m256i __C,
-                               const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpge_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
 {
-  return (__m256i) __builtin_ia32_pternlogd256_mask ((__v8si) __A,
-                                                    (__v8si) __B,
-                                                    (__v8si) __C, __imm,
-                                                    (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
+                                                (__v2di) __Y, 5,
+                                                (__mmask8) __M);
 }
 
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_ternarylogic_epi32 (__mmask8 __U, __m256i __A,
-                                __m256i __B, __m256i __C,
-                                const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_epi64_mask (__m128i __X, __m128i __Y)
 {
-  return (__m256i) __builtin_ia32_pternlogd256_maskz ((__v8si) __A,
-                                                     (__v8si) __B,
-                                                     (__v8si) __C,
-                                                     __imm,
-                                                     (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
+                                                (__v2di) __Y, 5,
+                                                (__mmask8) -1);
 }
 
-extern __inline __m128i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_ternarylogic_epi64 (__m128i __A, __m128i __B, __m128i __C,
-                       const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmple_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
 {
-  return (__m128i) __builtin_ia32_pternlogq128_mask ((__v2di) __A,
-                                                    (__v2di) __B,
-                                                    (__v2di) __C, __imm,
-                                                    (__mmask8) -1);
+  return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
+                                                (__v2di) __Y, 2,
+                                                (__mmask8) __M);
 }
 
-extern __inline __m128i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_ternarylogic_epi64 (__m128i __A, __mmask8 __U,
-                            __m128i __B, __m128i __C, const int __imm)
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_epi64_mask (__m128i __X, __m128i __Y)
 {
-  return (__m128i) __builtin_ia32_pternlogq128_mask ((__v2di) __A,
-                                                    (__v2di) __B,
-                                                    (__v2di) __C, __imm,
-                                                    (__mmask8) __U);
+  return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
+                                                (__v2di) __Y, 2,
+                                                (__mmask8) -1);
 }
 
-extern __inline __m128i
+#ifdef __OPTIMIZE__
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_ternarylogic_epi64 (__mmask8 __U, __m128i __A,
-                             __m128i __B, __m128i __C, const int __imm)
+_mm256_mask_permutex_epi64 (__m256i __W, __mmask8 __M,
+                           __m256i __X, const int __I)
 {
-  return (__m128i) __builtin_ia32_pternlogq128_maskz ((__v2di) __A,
-                                                     (__v2di) __B,
-                                                     (__v2di) __C,
-                                                     __imm,
-                                                     (__mmask8) __U);
+  return (__m256i) __builtin_ia32_permdi256_mask ((__v4di) __X,
+                                                 __I,
+                                                 (__v4di) __W,
+                                                 (__mmask8) __M);
 }
 
-extern __inline __m128i
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_ternarylogic_epi32 (__m128i __A, __m128i __B, __m128i __C,
-                       const int __imm)
+_mm256_maskz_permutex_epi64 (__mmask8 __M, __m256i __X, const int __I)
 {
-  return (__m128i) __builtin_ia32_pternlogd128_mask ((__v4si) __A,
-                                                    (__v4si) __B,
-                                                    (__v4si) __C, __imm,
-                                                    (__mmask8) -1);
+  return (__m256i) __builtin_ia32_permdi256_mask ((__v4di) __X,
+                                                 __I,
+                                                 (__v4di)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) __M);
 }
 
-extern __inline __m128i
+extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_ternarylogic_epi32 (__m128i __A, __mmask8 __U,
-                            __m128i __B, __m128i __C, const int __imm)
+_mm256_mask_shuffle_pd (__m256d __W, __mmask8 __U, __m256d __A,
+                       __m256d __B, const int __imm)
 {
-  return (__m128i) __builtin_ia32_pternlogd128_mask ((__v4si) __A,
-                                                    (__v4si) __B,
-                                                    (__v4si) __C, __imm,
-                                                    (__mmask8) __U);
+  return (__m256d) __builtin_ia32_shufpd256_mask ((__v4df) __A,
+                                                 (__v4df) __B, __imm,
+                                                 (__v4df) __W,
+                                                 (__mmask8) __U);
 }
 
-extern __inline __m128i
+extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_ternarylogic_epi32 (__mmask8 __U, __m128i __A,
-                             __m128i __B, __m128i __C, const int __imm)
+_mm256_maskz_shuffle_pd (__mmask8 __U, __m256d __A, __m256d __B,
+                        const int __imm)
 {
-  return (__m128i) __builtin_ia32_pternlogd128_maskz ((__v4si) __A,
-                                                     (__v4si) __B,
-                                                     (__v4si) __C,
-                                                     __imm,
-                                                     (__mmask8) __U);
+  return (__m256d) __builtin_ia32_shufpd256_mask ((__v4df) __A,
+                                                 (__v4df) __B, __imm,
+                                                 (__v4df)
+                                                 _mm256_setzero_pd (),
+                                                 (__mmask8) __U);
 }
 
-extern __inline __m256
+extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_roundscale_ps (__m256 __A, const int __imm)
+_mm_mask_shuffle_pd (__m128d __W, __mmask8 __U, __m128d __A,
+                    __m128d __B, const int __imm)
 {
-  return (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A,
-                                                     __imm,
-                                                     (__v8sf)
-                                                     _mm256_setzero_ps (),
-                                                     (__mmask8) -1);
+  return (__m128d) __builtin_ia32_shufpd128_mask ((__v2df) __A,
+                                                 (__v2df) __B, __imm,
+                                                 (__v2df) __W,
+                                                 (__mmask8) __U);
 }
 
-extern __inline __m256
+extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_roundscale_ps (__m256 __W, __mmask8 __U, __m256 __A,
-                          const int __imm)
+_mm_maskz_shuffle_pd (__mmask8 __U, __m128d __A, __m128d __B,
+                     const int __imm)
 {
-  return (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A,
-                                                     __imm,
-                                                     (__v8sf) __W,
-                                                     (__mmask8) __U);
+  return (__m128d) __builtin_ia32_shufpd128_mask ((__v2df) __A,
+                                                 (__v2df) __B, __imm,
+                                                 (__v2df)
+                                                 _mm_setzero_pd (),
+                                                 (__mmask8) __U);
 }
 
 extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_roundscale_ps (__mmask8 __U, __m256 __A, const int __imm)
+_mm256_mask_shuffle_ps (__m256 __W, __mmask8 __U, __m256 __A,
+                       __m256 __B, const int __imm)
 {
-  return (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A,
-                                                     __imm,
-                                                     (__v8sf)
-                                                     _mm256_setzero_ps (),
-                                                     (__mmask8) __U);
+  return (__m256) __builtin_ia32_shufps256_mask ((__v8sf) __A,
+                                                (__v8sf) __B, __imm,
+                                                (__v8sf) __W,
+                                                (__mmask8) __U);
 }
 
-extern __inline __m256d
+extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_roundscale_pd (__m256d __A, const int __imm)
+_mm256_maskz_shuffle_ps (__mmask8 __U, __m256 __A, __m256 __B,
+                        const int __imm)
 {
-  return (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A,
-                                                      __imm,
-                                                      (__v4df)
-                                                      _mm256_setzero_pd (),
-                                                      (__mmask8) -1);
+  return (__m256) __builtin_ia32_shufps256_mask ((__v8sf) __A,
+                                                (__v8sf) __B, __imm,
+                                                (__v8sf)
+                                                _mm256_setzero_ps (),
+                                                (__mmask8) __U);
 }
 
-extern __inline __m256d
+extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_roundscale_pd (__m256d __W, __mmask8 __U, __m256d __A,
-                          const int __imm)
+_mm_mask_shuffle_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+                    const int __imm)
 {
-  return (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A,
-                                                      __imm,
-                                                      (__v4df) __W,
-                                                      (__mmask8) __U);
+  return (__m128) __builtin_ia32_shufps128_mask ((__v4sf) __A,
+                                                (__v4sf) __B, __imm,
+                                                (__v4sf) __W,
+                                                (__mmask8) __U);
 }
 
-extern __inline __m256d
+extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_roundscale_pd (__mmask8 __U, __m256d __A, const int __imm)
+_mm_maskz_shuffle_ps (__mmask8 __U, __m128 __A, __m128 __B,
+                     const int __imm)
 {
-  return (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A,
-                                                      __imm,
-                                                      (__v4df)
-                                                      _mm256_setzero_pd (),
-                                                      (__mmask8) __U);
+  return (__m128) __builtin_ia32_shufps128_mask ((__v4sf) __A,
+                                                (__v4sf) __B, __imm,
+                                                (__v4sf)
+                                                _mm_setzero_ps (),
+                                                (__mmask8) __U);
 }
 
-extern __inline __m128
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_roundscale_ps (__m128 __A, const int __imm)
+_mm256_inserti32x4 (__m256i __A, __m128i __B, const int __imm)
 {
-  return (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A,
-                                                     __imm,
-                                                     (__v4sf)
-                                                     _mm_setzero_ps (),
-                                                     (__mmask8) -1);
+  return (__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si) __A,
+                                                       (__v4si) __B,
+                                                       __imm,
+                                                       (__v8si)
+                                                       _mm256_setzero_si256 (),
+                                                       (__mmask8) -1);
 }
 
-extern __inline __m128
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_roundscale_ps (__m128 __W, __mmask8 __U, __m128 __A,
-                       const int __imm)
+_mm256_mask_inserti32x4 (__m256i __W, __mmask8 __U, __m256i __A,
+                        __m128i __B, const int __imm)
 {
-  return (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A,
-                                                     __imm,
-                                                     (__v4sf) __W,
-                                                     (__mmask8) __U);
+  return (__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si) __A,
+                                                       (__v4si) __B,
+                                                       __imm,
+                                                       (__v8si) __W,
+                                                       (__mmask8)
+                                                       __U);
 }
 
-extern __inline __m128
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_roundscale_ps (__mmask8 __U, __m128 __A, const int __imm)
+_mm256_maskz_inserti32x4 (__mmask8 __U, __m256i __A, __m128i __B,
+                         const int __imm)
 {
-  return (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A,
-                                                     __imm,
-                                                     (__v4sf)
-                                                     _mm_setzero_ps (),
-                                                     (__mmask8) __U);
+  return (__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si) __A,
+                                                       (__v4si) __B,
+                                                       __imm,
+                                                       (__v8si)
+                                                       _mm256_setzero_si256 (),
+                                                       (__mmask8)
+                                                       __U);
 }
 
-extern __inline __m128d
+extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_roundscale_pd (__m128d __A, const int __imm)
+_mm256_insertf32x4 (__m256 __A, __m128 __B, const int __imm)
 {
-  return (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A,
+  return (__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf) __A,
+                                                      (__v4sf) __B,
                                                       __imm,
-                                                      (__v2df)
-                                                      _mm_setzero_pd (),
+                                                      (__v8sf)
+                                                      _mm256_setzero_ps (),
                                                       (__mmask8) -1);
 }
 
-extern __inline __m128d
+extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_roundscale_pd (__m128d __W, __mmask8 __U, __m128d __A,
-                       const int __imm)
+_mm256_mask_insertf32x4 (__m256 __W, __mmask8 __U, __m256 __A,
+                        __m128 __B, const int __imm)
 {
-  return (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A,
+  return (__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf) __A,
+                                                      (__v4sf) __B,
                                                       __imm,
-                                                      (__v2df) __W,
+                                                      (__v8sf) __W,
                                                       (__mmask8) __U);
 }
 
-extern __inline __m128d
+extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_roundscale_pd (__mmask8 __U, __m128d __A, const int __imm)
+_mm256_maskz_insertf32x4 (__mmask8 __U, __m256 __A, __m128 __B,
+                         const int __imm)
 {
-  return (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A,
+  return (__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf) __A,
+                                                      (__v4sf) __B,
                                                       __imm,
-                                                      (__v2df)
-                                                      _mm_setzero_pd (),
+                                                      (__v8sf)
+                                                      _mm256_setzero_ps (),
                                                       (__mmask8) __U);
 }
 
-extern __inline __m256
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_getmant_ps (__m256 __A, _MM_MANTISSA_NORM_ENUM __B,
-                  _MM_MANTISSA_SIGN_ENUM __C)
+_mm256_extracti32x4_epi32 (__m256i __A, const int __imm)
 {
-  return (__m256) __builtin_ia32_getmantps256_mask ((__v8sf) __A,
-                                                   (__C << 2) | __B,
-                                                   (__v8sf)
-                                                   _mm256_setzero_ps (),
-                                                   (__mmask8) -1);
+  return (__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si) __A,
+                                                        __imm,
+                                                        (__v4si)
+                                                        _mm_setzero_si128 (),
+                                                        (__mmask8) -1);
 }
 
-extern __inline __m256
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_getmant_ps (__m256 __W, __mmask8 __U, __m256 __A,
-                       _MM_MANTISSA_NORM_ENUM __B,
-                       _MM_MANTISSA_SIGN_ENUM __C)
+_mm256_mask_extracti32x4_epi32 (__m128i __W, __mmask8 __U, __m256i __A,
+                               const int __imm)
 {
-  return (__m256) __builtin_ia32_getmantps256_mask ((__v8sf) __A,
-                                                   (__C << 2) | __B,
-                                                   (__v8sf) __W,
-                                                   (__mmask8) __U);
+  return (__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si) __A,
+                                                        __imm,
+                                                        (__v4si) __W,
+                                                        (__mmask8)
+                                                        __U);
 }
 
-extern __inline __m256
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_getmant_ps (__mmask8 __U, __m256 __A,
-                        _MM_MANTISSA_NORM_ENUM __B,
-                        _MM_MANTISSA_SIGN_ENUM __C)
+_mm256_maskz_extracti32x4_epi32 (__mmask8 __U, __m256i __A,
+                                const int __imm)
 {
-  return (__m256) __builtin_ia32_getmantps256_mask ((__v8sf) __A,
-                                                   (__C << 2) | __B,
-                                                   (__v8sf)
-                                                   _mm256_setzero_ps (),
-                                                   (__mmask8) __U);
+  return (__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si) __A,
+                                                        __imm,
+                                                        (__v4si)
+                                                        _mm_setzero_si128 (),
+                                                        (__mmask8)
+                                                        __U);
 }
 
 extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_getmant_ps (__m128 __A, _MM_MANTISSA_NORM_ENUM __B,
-               _MM_MANTISSA_SIGN_ENUM __C)
+_mm256_extractf32x4_ps (__m256 __A, const int __imm)
 {
-  return (__m128) __builtin_ia32_getmantps128_mask ((__v4sf) __A,
-                                                   (__C << 2) | __B,
-                                                   (__v4sf)
-                                                   _mm_setzero_ps (),
-                                                   (__mmask8) -1);
+  return (__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf) __A,
+                                                       __imm,
+                                                       (__v4sf)
+                                                       _mm_setzero_ps (),
+                                                       (__mmask8) -1);
 }
 
 extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_getmant_ps (__m128 __W, __mmask8 __U, __m128 __A,
-                    _MM_MANTISSA_NORM_ENUM __B,
-                    _MM_MANTISSA_SIGN_ENUM __C)
+_mm256_mask_extractf32x4_ps (__m128 __W, __mmask8 __U, __m256 __A,
+                            const int __imm)
 {
-  return (__m128) __builtin_ia32_getmantps128_mask ((__v4sf) __A,
-                                                   (__C << 2) | __B,
-                                                   (__v4sf) __W,
-                                                   (__mmask8) __U);
+  return (__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf) __A,
+                                                       __imm,
+                                                       (__v4sf) __W,
+                                                       (__mmask8)
+                                                       __U);
 }
 
 extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_getmant_ps (__mmask8 __U, __m128 __A,
-                     _MM_MANTISSA_NORM_ENUM __B,
-                     _MM_MANTISSA_SIGN_ENUM __C)
+_mm256_maskz_extractf32x4_ps (__mmask8 __U, __m256 __A,
+                             const int __imm)
+{
+  return (__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf) __A,
+                                                       __imm,
+                                                       (__v4sf)
+                                                       _mm_setzero_ps (),
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shuffle_i64x2 (__m256i __A, __m256i __B, const int __imm)
+{
+  return (__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di) __A,
+                                                      (__v4di) __B,
+                                                      __imm,
+                                                      (__v4di)
+                                                      _mm256_setzero_si256 (),
+                                                      (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shuffle_i64x2 (__m256i __W, __mmask8 __U, __m256i __A,
+                          __m256i __B, const int __imm)
 {
-  return (__m128) __builtin_ia32_getmantps128_mask ((__v4sf) __A,
-                                                   (__C << 2) | __B,
-                                                   (__v4sf)
-                                                   _mm_setzero_ps (),
-                                                   (__mmask8) __U);
+  return (__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di) __A,
+                                                      (__v4di) __B,
+                                                      __imm,
+                                                      (__v4di) __W,
+                                                      (__mmask8) __U);
 }
 
-extern __inline __m256d
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_getmant_pd (__m256d __A, _MM_MANTISSA_NORM_ENUM __B,
-                  _MM_MANTISSA_SIGN_ENUM __C)
+_mm256_maskz_shuffle_i64x2 (__mmask8 __U, __m256i __A, __m256i __B,
+                           const int __imm)
 {
-  return (__m256d) __builtin_ia32_getmantpd256_mask ((__v4df) __A,
-                                                    (__C << 2) | __B,
-                                                    (__v4df)
-                                                    _mm256_setzero_pd (),
-                                                    (__mmask8) -1);
+  return (__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di) __A,
+                                                      (__v4di) __B,
+                                                      __imm,
+                                                      (__v4di)
+                                                      _mm256_setzero_si256 (),
+                                                      (__mmask8) __U);
 }
 
-extern __inline __m256d
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_getmant_pd (__m256d __W, __mmask8 __U, __m256d __A,
-                       _MM_MANTISSA_NORM_ENUM __B,
-                       _MM_MANTISSA_SIGN_ENUM __C)
+_mm256_shuffle_i32x4 (__m256i __A, __m256i __B, const int __imm)
 {
-  return (__m256d) __builtin_ia32_getmantpd256_mask ((__v4df) __A,
-                                                    (__C << 2) | __B,
-                                                    (__v4df) __W,
-                                                    (__mmask8) __U);
+  return (__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si) __A,
+                                                      (__v8si) __B,
+                                                      __imm,
+                                                      (__v8si)
+                                                      _mm256_setzero_si256 (),
+                                                      (__mmask8) -1);
 }
 
-extern __inline __m256d
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_getmant_pd (__mmask8 __U, __m256d __A,
-                        _MM_MANTISSA_NORM_ENUM __B,
-                        _MM_MANTISSA_SIGN_ENUM __C)
+_mm256_mask_shuffle_i32x4 (__m256i __W, __mmask8 __U, __m256i __A,
+                          __m256i __B, const int __imm)
 {
-  return (__m256d) __builtin_ia32_getmantpd256_mask ((__v4df) __A,
-                                                    (__C << 2) | __B,
-                                                    (__v4df)
-                                                    _mm256_setzero_pd (),
-                                                    (__mmask8) __U);
+  return (__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si) __A,
+                                                      (__v8si) __B,
+                                                      __imm,
+                                                      (__v8si) __W,
+                                                      (__mmask8) __U);
 }
 
-extern __inline __m128d
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_getmant_pd (__m128d __A, _MM_MANTISSA_NORM_ENUM __B,
-               _MM_MANTISSA_SIGN_ENUM __C)
+_mm256_maskz_shuffle_i32x4 (__mmask8 __U, __m256i __A, __m256i __B,
+                           const int __imm)
 {
-  return (__m128d) __builtin_ia32_getmantpd128_mask ((__v2df) __A,
-                                                    (__C << 2) | __B,
-                                                    (__v2df)
-                                                    _mm_setzero_pd (),
-                                                    (__mmask8) -1);
+  return (__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si) __A,
+                                                      (__v8si) __B,
+                                                      __imm,
+                                                      (__v8si)
+                                                      _mm256_setzero_si256 (),
+                                                      (__mmask8) __U);
 }
 
-extern __inline __m128d
+extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_getmant_pd (__m128d __W, __mmask8 __U, __m128d __A,
-                    _MM_MANTISSA_NORM_ENUM __B,
-                    _MM_MANTISSA_SIGN_ENUM __C)
+_mm256_shuffle_f64x2 (__m256d __A, __m256d __B, const int __imm)
 {
-  return (__m128d) __builtin_ia32_getmantpd128_mask ((__v2df) __A,
-                                                    (__C << 2) | __B,
-                                                    (__v2df) __W,
-                                                    (__mmask8) __U);
+  return (__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df) __A,
+                                                      (__v4df) __B,
+                                                      __imm,
+                                                      (__v4df)
+                                                      _mm256_setzero_pd (),
+                                                      (__mmask8) -1);
 }
 
-extern __inline __m128d
+extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_getmant_pd (__mmask8 __U, __m128d __A,
-                     _MM_MANTISSA_NORM_ENUM __B,
-                     _MM_MANTISSA_SIGN_ENUM __C)
+_mm256_mask_shuffle_f64x2 (__m256d __W, __mmask8 __U, __m256d __A,
+                          __m256d __B, const int __imm)
 {
-  return (__m128d) __builtin_ia32_getmantpd128_mask ((__v2df) __A,
-                                                    (__C << 2) | __B,
-                                                    (__v2df)
-                                                    _mm_setzero_pd (),
-                                                    (__mmask8) __U);
+  return (__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df) __A,
+                                                      (__v4df) __B,
+                                                      __imm,
+                                                      (__v4df) __W,
+                                                      (__mmask8) __U);
 }
 
-extern __inline __m256
+extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mmask_i32gather_ps (__m256 __v1_old, __mmask8 __mask,
-                          __m256i __index, void const *__addr,
-                          int __scale)
+_mm256_maskz_shuffle_f64x2 (__mmask8 __U, __m256d __A, __m256d __B,
+                           const int __imm)
 {
-  return (__m256) __builtin_ia32_gather3siv8sf ((__v8sf) __v1_old,
-                                               __addr,
-                                               (__v8si) __index,
-                                               __mask, __scale);
+  return (__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df) __A,
+                                                      (__v4df) __B,
+                                                      __imm,
+                                                      (__v4df)
+                                                      _mm256_setzero_pd (),
+                                                      (__mmask8) __U);
 }
 
-extern __inline __m128
+extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mmask_i32gather_ps (__m128 __v1_old, __mmask8 __mask,
-                       __m128i __index, void const *__addr,
-                       int __scale)
+_mm256_shuffle_f32x4 (__m256 __A, __m256 __B, const int __imm)
 {
-  return (__m128) __builtin_ia32_gather3siv4sf ((__v4sf) __v1_old,
-                                               __addr,
-                                               (__v4si) __index,
-                                               __mask, __scale);
+  return (__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf) __A,
+                                                     (__v8sf) __B,
+                                                     __imm,
+                                                     (__v8sf)
+                                                     _mm256_setzero_ps (),
+                                                     (__mmask8) -1);
 }
 
-extern __inline __m256d
+extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mmask_i32gather_pd (__m256d __v1_old, __mmask8 __mask,
-                          __m128i __index, void const *__addr,
-                          int __scale)
+_mm256_mask_shuffle_f32x4 (__m256 __W, __mmask8 __U, __m256 __A,
+                          __m256 __B, const int __imm)
 {
-  return (__m256d) __builtin_ia32_gather3siv4df ((__v4df) __v1_old,
-                                                __addr,
-                                                (__v4si) __index,
-                                                __mask, __scale);
+  return (__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf) __A,
+                                                     (__v8sf) __B,
+                                                     __imm,
+                                                     (__v8sf) __W,
+                                                     (__mmask8) __U);
 }
 
-extern __inline __m128d
+extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mmask_i32gather_pd (__m128d __v1_old, __mmask8 __mask,
-                       __m128i __index, void const *__addr,
-                       int __scale)
+_mm256_maskz_shuffle_f32x4 (__mmask8 __U, __m256 __A, __m256 __B,
+                           const int __imm)
 {
-  return (__m128d) __builtin_ia32_gather3siv2df ((__v2df) __v1_old,
-                                                __addr,
-                                                (__v4si) __index,
-                                                __mask, __scale);
+  return (__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf) __A,
+                                                     (__v8sf) __B,
+                                                     __imm,
+                                                     (__v8sf)
+                                                     _mm256_setzero_ps (),
+                                                     (__mmask8) __U);
 }
 
-extern __inline __m128
+extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mmask_i64gather_ps (__m128 __v1_old, __mmask8 __mask,
-                          __m256i __index, void const *__addr,
-                          int __scale)
+_mm256_fixupimm_pd (__m256d __A, __m256d __B, __m256i __C,
+                   const int __imm)
 {
-  return (__m128) __builtin_ia32_gather3div8sf ((__v4sf) __v1_old,
-                                               __addr,
-                                               (__v4di) __index,
-                                               __mask, __scale);
+  return (__m256d) __builtin_ia32_fixupimmpd256_mask ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4di) __C,
+                                                     __imm,
+                                                     (__mmask8) -1);
 }
 
-extern __inline __m128
+extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mmask_i64gather_ps (__m128 __v1_old, __mmask8 __mask,
-                       __m128i __index, void const *__addr,
-                       int __scale)
+_mm256_mask_fixupimm_pd (__m256d __A, __mmask8 __U, __m256d __B,
+                        __m256i __C, const int __imm)
 {
-  return (__m128) __builtin_ia32_gather3div4sf ((__v4sf) __v1_old,
-                                               __addr,
-                                               (__v2di) __index,
-                                               __mask, __scale);
+  return (__m256d) __builtin_ia32_fixupimmpd256_mask ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4di) __C,
+                                                     __imm,
+                                                     (__mmask8) __U);
 }
 
 extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mmask_i64gather_pd (__m256d __v1_old, __mmask8 __mask,
-                          __m256i __index, void const *__addr,
-                          int __scale)
+_mm256_maskz_fixupimm_pd (__mmask8 __U, __m256d __A, __m256d __B,
+                         __m256i __C, const int __imm)
 {
-  return (__m256d) __builtin_ia32_gather3div4df ((__v4df) __v1_old,
-                                                __addr,
-                                                (__v4di) __index,
-                                                __mask, __scale);
+  return (__m256d) __builtin_ia32_fixupimmpd256_maskz ((__v4df) __A,
+                                                      (__v4df) __B,
+                                                      (__v4di) __C,
+                                                      __imm,
+                                                      (__mmask8) __U);
 }
 
-extern __inline __m128d
+extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mmask_i64gather_pd (__m128d __v1_old, __mmask8 __mask,
-                       __m128i __index, void const *__addr,
-                       int __scale)
+_mm256_fixupimm_ps (__m256 __A, __m256 __B, __m256i __C,
+                   const int __imm)
 {
-  return (__m128d) __builtin_ia32_gather3div2df ((__v2df) __v1_old,
-                                                __addr,
-                                                (__v2di) __index,
-                                                __mask, __scale);
+  return (__m256) __builtin_ia32_fixupimmps256_mask ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8si) __C,
+                                                    __imm,
+                                                    (__mmask8) -1);
 }
 
-extern __inline __m256i
+extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mmask_i32gather_epi32 (__m256i __v1_old, __mmask8 __mask,
-                             __m256i __index, void const *__addr,
-                             int __scale)
+_mm256_mask_fixupimm_ps (__m256 __A, __mmask8 __U, __m256 __B,
+                        __m256i __C, const int __imm)
 {
-  return (__m256i) __builtin_ia32_gather3siv8si ((__v8si) __v1_old,
-                                                __addr,
-                                                (__v8si) __index,
-                                                __mask, __scale);
+  return (__m256) __builtin_ia32_fixupimmps256_mask ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8si) __C,
+                                                    __imm,
+                                                    (__mmask8) __U);
 }
 
-extern __inline __m128i
+extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mmask_i32gather_epi32 (__m128i __v1_old, __mmask8 __mask,
-                          __m128i __index, void const *__addr,
-                          int __scale)
+_mm256_maskz_fixupimm_ps (__mmask8 __U, __m256 __A, __m256 __B,
+                         __m256i __C, const int __imm)
 {
-  return (__m128i) __builtin_ia32_gather3siv4si ((__v4si) __v1_old,
-                                                __addr,
-                                                (__v4si) __index,
-                                                __mask, __scale);
+  return (__m256) __builtin_ia32_fixupimmps256_maskz ((__v8sf) __A,
+                                                     (__v8sf) __B,
+                                                     (__v8si) __C,
+                                                     __imm,
+                                                     (__mmask8) __U);
 }
 
-extern __inline __m256i
+extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mmask_i32gather_epi64 (__m256i __v1_old, __mmask8 __mask,
-                             __m128i __index, void const *__addr,
-                             int __scale)
+_mm_fixupimm_pd (__m128d __A, __m128d __B, __m128i __C,
+                const int __imm)
 {
-  return (__m256i) __builtin_ia32_gather3siv4di ((__v4di) __v1_old,
-                                                __addr,
-                                                (__v4si) __index,
-                                                __mask, __scale);
+  return (__m128d) __builtin_ia32_fixupimmpd128_mask ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2di) __C,
+                                                     __imm,
+                                                     (__mmask8) -1);
 }
 
-extern __inline __m128i
+extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mmask_i32gather_epi64 (__m128i __v1_old, __mmask8 __mask,
-                          __m128i __index, void const *__addr,
-                          int __scale)
+_mm_mask_fixupimm_pd (__m128d __A, __mmask8 __U, __m128d __B,
+                     __m128i __C, const int __imm)
 {
-  return (__m128i) __builtin_ia32_gather3siv2di ((__v2di) __v1_old,
-                                                __addr,
-                                                (__v4si) __index,
-                                                __mask, __scale);
+  return (__m128d) __builtin_ia32_fixupimmpd128_mask ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2di) __C,
+                                                     __imm,
+                                                     (__mmask8) __U);
 }
 
-extern __inline __m128i
+extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mmask_i64gather_epi32 (__m128i __v1_old, __mmask8 __mask,
-                             __m256i __index, void const *__addr,
-                             int __scale)
+_mm_maskz_fixupimm_pd (__mmask8 __U, __m128d __A, __m128d __B,
+                      __m128i __C, const int __imm)
 {
-  return (__m128i) __builtin_ia32_gather3div8si ((__v4si) __v1_old,
-                                                __addr,
-                                                (__v4di) __index,
-                                                __mask, __scale);
+  return (__m128d) __builtin_ia32_fixupimmpd128_maskz ((__v2df) __A,
+                                                      (__v2df) __B,
+                                                      (__v2di) __C,
+                                                      __imm,
+                                                      (__mmask8) __U);
 }
 
-extern __inline __m128i
+extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mmask_i64gather_epi32 (__m128i __v1_old, __mmask8 __mask,
-                          __m128i __index, void const *__addr,
-                          int __scale)
+_mm_fixupimm_ps (__m128 __A, __m128 __B, __m128i __C, const int __imm)
 {
-  return (__m128i) __builtin_ia32_gather3div4si ((__v4si) __v1_old,
-                                                __addr,
-                                                (__v2di) __index,
-                                                __mask, __scale);
+  return (__m128) __builtin_ia32_fixupimmps128_mask ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4si) __C,
+                                                    __imm,
+                                                    (__mmask8) -1);
 }
 
-extern __inline __m256i
+extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mmask_i64gather_epi64 (__m256i __v1_old, __mmask8 __mask,
-                             __m256i __index, void const *__addr,
-                             int __scale)
+_mm_mask_fixupimm_ps (__m128 __A, __mmask8 __U, __m128 __B,
+                     __m128i __C, const int __imm)
 {
-  return (__m256i) __builtin_ia32_gather3div4di ((__v4di) __v1_old,
-                                                __addr,
-                                                (__v4di) __index,
-                                                __mask, __scale);
+  return (__m128) __builtin_ia32_fixupimmps128_mask ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4si) __C,
+                                                    __imm,
+                                                    (__mmask8) __U);
 }
 
-extern __inline __m128i
+extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mmask_i64gather_epi64 (__m128i __v1_old, __mmask8 __mask,
-                          __m128i __index, void const *__addr,
-                          int __scale)
+_mm_maskz_fixupimm_ps (__mmask8 __U, __m128 __A, __m128 __B,
+                      __m128i __C, const int __imm)
 {
-  return (__m128i) __builtin_ia32_gather3div2di ((__v2di) __v1_old,
-                                                __addr,
-                                                (__v2di) __index,
-                                                __mask, __scale);
+  return (__m128) __builtin_ia32_fixupimmps128_maskz ((__v4sf) __A,
+                                                     (__v4sf) __B,
+                                                     (__v4si) __C,
+                                                     __imm,
+                                                     (__mmask8) __U);
 }
 
-extern __inline void
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_i32scatter_ps (void *__addr, __m256i __index,
-                     __m256 __v1, const int __scale)
+_mm256_mask_srli_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+                       const int __imm)
 {
-  __builtin_ia32_scattersiv8sf (__addr, (__mmask8) 0xFF,
-                               (__v8si) __index, (__v8sf) __v1,
-                               __scale);
+  return (__m256i) __builtin_ia32_psrldi256_mask ((__v8si) __A, __imm,
+                                                 (__v8si) __W,
+                                                 (__mmask8) __U);
 }
 
-extern __inline void
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_i32scatter_ps (void *__addr, __mmask8 __mask,
-                          __m256i __index, __m256 __v1,
-                          const int __scale)
+_mm256_maskz_srli_epi32 (__mmask8 __U, __m256i __A, const int __imm)
 {
-  __builtin_ia32_scattersiv8sf (__addr, __mask, (__v8si) __index,
-                               (__v8sf) __v1, __scale);
+  return (__m256i) __builtin_ia32_psrldi256_mask ((__v8si) __A, __imm,
+                                                 (__v8si)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) __U);
 }
 
-extern __inline void
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_i32scatter_ps (void *__addr, __m128i __index, __m128 __v1,
-                  const int __scale)
+_mm_mask_srli_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+                    const int __imm)
 {
-  __builtin_ia32_scattersiv4sf (__addr, (__mmask8) 0xFF,
-                               (__v4si) __index, (__v4sf) __v1,
-                               __scale);
+  return (__m128i) __builtin_ia32_psrldi128_mask ((__v4si) __A, __imm,
+                                                 (__v4si) __W,
+                                                 (__mmask8) __U);
 }
 
-extern __inline void
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_i32scatter_ps (void *__addr, __mmask8 __mask,
-                       __m128i __index, __m128 __v1,
-                       const int __scale)
+_mm_maskz_srli_epi32 (__mmask8 __U, __m128i __A, const int __imm)
 {
-  __builtin_ia32_scattersiv4sf (__addr, __mask, (__v4si) __index,
-                               (__v4sf) __v1, __scale);
+  return (__m128i) __builtin_ia32_psrldi128_mask ((__v4si) __A, __imm,
+                                                 (__v4si)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
 }
 
-extern __inline void
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_i32scatter_pd (void *__addr, __m128i __index,
-                     __m256d __v1, const int __scale)
+_mm256_mask_srli_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+                       const int __imm)
 {
-  __builtin_ia32_scattersiv4df (__addr, (__mmask8) 0xFF,
-                               (__v4si) __index, (__v4df) __v1,
-                               __scale);
+  return (__m256i) __builtin_ia32_psrlqi256_mask ((__v4di) __A, __imm,
+                                                 (__v4di) __W,
+                                                 (__mmask8) __U);
 }
 
-extern __inline void
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_i32scatter_pd (void *__addr, __mmask8 __mask,
-                          __m128i __index, __m256d __v1,
-                          const int __scale)
+_mm256_maskz_srli_epi64 (__mmask8 __U, __m256i __A, const int __imm)
 {
-  __builtin_ia32_scattersiv4df (__addr, __mask, (__v4si) __index,
-                               (__v4df) __v1, __scale);
+  return (__m256i) __builtin_ia32_psrlqi256_mask ((__v4di) __A, __imm,
+                                                 (__v4di)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) __U);
 }
 
-extern __inline void
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_i32scatter_pd (void *__addr, __m128i __index,
-                  __m128d __v1, const int __scale)
+_mm_mask_srli_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+                    const int __imm)
 {
-  __builtin_ia32_scattersiv2df (__addr, (__mmask8) 0xFF,
-                               (__v4si) __index, (__v2df) __v1,
-                               __scale);
+  return (__m128i) __builtin_ia32_psrlqi128_mask ((__v2di) __A, __imm,
+                                                 (__v2di) __W,
+                                                 (__mmask8) __U);
 }
 
-extern __inline void
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_i32scatter_pd (void *__addr, __mmask8 __mask,
-                       __m128i __index, __m128d __v1,
-                       const int __scale)
+_mm_maskz_srli_epi64 (__mmask8 __U, __m128i __A, const int __imm)
 {
-  __builtin_ia32_scattersiv2df (__addr, __mask, (__v4si) __index,
-                               (__v2df) __v1, __scale);
+  return (__m128i) __builtin_ia32_psrlqi128_mask ((__v2di) __A, __imm,
+                                                 (__v2di)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
 }
 
-extern __inline void
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_i64scatter_ps (void *__addr, __m256i __index,
-                     __m128 __v1, const int __scale)
+_mm256_ternarylogic_epi64 (__m256i __A, __m256i __B, __m256i __C,
+                          const int __imm)
 {
-  __builtin_ia32_scatterdiv8sf (__addr, (__mmask8) 0xFF,
-                               (__v4di) __index, (__v4sf) __v1,
-                               __scale);
+  return (__m256i) __builtin_ia32_pternlogq256_mask ((__v4di) __A,
+                                                    (__v4di) __B,
+                                                    (__v4di) __C, __imm,
+                                                    (__mmask8) -1);
 }
 
-extern __inline void
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_i64scatter_ps (void *__addr, __mmask8 __mask,
-                          __m256i __index, __m128 __v1,
-                          const int __scale)
+_mm256_mask_ternarylogic_epi64 (__m256i __A, __mmask8 __U,
+                               __m256i __B, __m256i __C,
+                               const int __imm)
 {
-  __builtin_ia32_scatterdiv8sf (__addr, __mask, (__v4di) __index,
-                               (__v4sf) __v1, __scale);
+  return (__m256i) __builtin_ia32_pternlogq256_mask ((__v4di) __A,
+                                                    (__v4di) __B,
+                                                    (__v4di) __C, __imm,
+                                                    (__mmask8) __U);
 }
 
-extern __inline void
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_i64scatter_ps (void *__addr, __m128i __index, __m128 __v1,
-                  const int __scale)
+_mm256_maskz_ternarylogic_epi64 (__mmask8 __U, __m256i __A,
+                                __m256i __B, __m256i __C,
+                                const int __imm)
 {
-  __builtin_ia32_scatterdiv4sf (__addr, (__mmask8) 0xFF,
-                               (__v2di) __index, (__v4sf) __v1,
-                               __scale);
+  return (__m256i) __builtin_ia32_pternlogq256_maskz ((__v4di) __A,
+                                                     (__v4di) __B,
+                                                     (__v4di) __C,
+                                                     __imm,
+                                                     (__mmask8) __U);
 }
 
-extern __inline void
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_i64scatter_ps (void *__addr, __mmask8 __mask,
-                       __m128i __index, __m128 __v1,
-                       const int __scale)
+_mm256_ternarylogic_epi32 (__m256i __A, __m256i __B, __m256i __C,
+                          const int __imm)
 {
-  __builtin_ia32_scatterdiv4sf (__addr, __mask, (__v2di) __index,
-                               (__v4sf) __v1, __scale);
+  return (__m256i) __builtin_ia32_pternlogd256_mask ((__v8si) __A,
+                                                    (__v8si) __B,
+                                                    (__v8si) __C, __imm,
+                                                    (__mmask8) -1);
 }
 
-extern __inline void
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_i64scatter_pd (void *__addr, __m256i __index,
-                     __m256d __v1, const int __scale)
+_mm256_mask_ternarylogic_epi32 (__m256i __A, __mmask8 __U,
+                               __m256i __B, __m256i __C,
+                               const int __imm)
 {
-  __builtin_ia32_scatterdiv4df (__addr, (__mmask8) 0xFF,
-                               (__v4di) __index, (__v4df) __v1,
-                               __scale);
+  return (__m256i) __builtin_ia32_pternlogd256_mask ((__v8si) __A,
+                                                    (__v8si) __B,
+                                                    (__v8si) __C, __imm,
+                                                    (__mmask8) __U);
 }
 
-extern __inline void
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_i64scatter_pd (void *__addr, __mmask8 __mask,
-                          __m256i __index, __m256d __v1,
-                          const int __scale)
+_mm256_maskz_ternarylogic_epi32 (__mmask8 __U, __m256i __A,
+                                __m256i __B, __m256i __C,
+                                const int __imm)
 {
-  __builtin_ia32_scatterdiv4df (__addr, __mask, (__v4di) __index,
-                               (__v4df) __v1, __scale);
+  return (__m256i) __builtin_ia32_pternlogd256_maskz ((__v8si) __A,
+                                                     (__v8si) __B,
+                                                     (__v8si) __C,
+                                                     __imm,
+                                                     (__mmask8) __U);
 }
 
-extern __inline void
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_i64scatter_pd (void *__addr, __m128i __index,
-                  __m128d __v1, const int __scale)
+_mm_ternarylogic_epi64 (__m128i __A, __m128i __B, __m128i __C,
+                       const int __imm)
 {
-  __builtin_ia32_scatterdiv2df (__addr, (__mmask8) 0xFF,
-                               (__v2di) __index, (__v2df) __v1,
-                               __scale);
+  return (__m128i) __builtin_ia32_pternlogq128_mask ((__v2di) __A,
+                                                    (__v2di) __B,
+                                                    (__v2di) __C, __imm,
+                                                    (__mmask8) -1);
 }
 
-extern __inline void
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_i64scatter_pd (void *__addr, __mmask8 __mask,
-                       __m128i __index, __m128d __v1,
-                       const int __scale)
+_mm_mask_ternarylogic_epi64 (__m128i __A, __mmask8 __U,
+                            __m128i __B, __m128i __C, const int __imm)
 {
-  __builtin_ia32_scatterdiv2df (__addr, __mask, (__v2di) __index,
-                               (__v2df) __v1, __scale);
+  return (__m128i) __builtin_ia32_pternlogq128_mask ((__v2di) __A,
+                                                    (__v2di) __B,
+                                                    (__v2di) __C, __imm,
+                                                    (__mmask8) __U);
 }
 
-extern __inline void
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_i32scatter_epi32 (void *__addr, __m256i __index,
-                        __m256i __v1, const int __scale)
+_mm_maskz_ternarylogic_epi64 (__mmask8 __U, __m128i __A,
+                             __m128i __B, __m128i __C, const int __imm)
 {
-  __builtin_ia32_scattersiv8si (__addr, (__mmask8) 0xFF,
-                               (__v8si) __index, (__v8si) __v1,
-                               __scale);
+  return (__m128i) __builtin_ia32_pternlogq128_maskz ((__v2di) __A,
+                                                     (__v2di) __B,
+                                                     (__v2di) __C,
+                                                     __imm,
+                                                     (__mmask8) __U);
 }
 
-extern __inline void
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_i32scatter_epi32 (void *__addr, __mmask8 __mask,
-                             __m256i __index, __m256i __v1,
-                             const int __scale)
+_mm_ternarylogic_epi32 (__m128i __A, __m128i __B, __m128i __C,
+                       const int __imm)
 {
-  __builtin_ia32_scattersiv8si (__addr, __mask, (__v8si) __index,
-                               (__v8si) __v1, __scale);
+  return (__m128i) __builtin_ia32_pternlogd128_mask ((__v4si) __A,
+                                                    (__v4si) __B,
+                                                    (__v4si) __C, __imm,
+                                                    (__mmask8) -1);
 }
 
-extern __inline void
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_i32scatter_epi32 (void *__addr, __m128i __index,
-                     __m128i __v1, const int __scale)
+_mm_mask_ternarylogic_epi32 (__m128i __A, __mmask8 __U,
+                            __m128i __B, __m128i __C, const int __imm)
 {
-  __builtin_ia32_scattersiv4si (__addr, (__mmask8) 0xFF,
-                               (__v4si) __index, (__v4si) __v1,
-                               __scale);
+  return (__m128i) __builtin_ia32_pternlogd128_mask ((__v4si) __A,
+                                                    (__v4si) __B,
+                                                    (__v4si) __C, __imm,
+                                                    (__mmask8) __U);
 }
 
-extern __inline void
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_i32scatter_epi32 (void *__addr, __mmask8 __mask,
-                          __m128i __index, __m128i __v1,
-                          const int __scale)
+_mm_maskz_ternarylogic_epi32 (__mmask8 __U, __m128i __A,
+                             __m128i __B, __m128i __C, const int __imm)
 {
-  __builtin_ia32_scattersiv4si (__addr, __mask, (__v4si) __index,
-                               (__v4si) __v1, __scale);
+  return (__m128i) __builtin_ia32_pternlogd128_maskz ((__v4si) __A,
+                                                     (__v4si) __B,
+                                                     (__v4si) __C,
+                                                     __imm,
+                                                     (__mmask8) __U);
 }
 
-extern __inline void
+extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_i32scatter_epi64 (void *__addr, __m128i __index,
-                        __m256i __v1, const int __scale)
+_mm256_roundscale_ps (__m256 __A, const int __imm)
 {
-  __builtin_ia32_scattersiv4di (__addr, (__mmask8) 0xFF,
-                               (__v4si) __index, (__v4di) __v1,
-                               __scale);
+  return (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A,
+                                                     __imm,
+                                                     (__v8sf)
+                                                     _mm256_setzero_ps (),
+                                                     (__mmask8) -1);
 }
 
-extern __inline void
+extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_i32scatter_epi64 (void *__addr, __mmask8 __mask,
-                             __m128i __index, __m256i __v1,
-                             const int __scale)
+_mm256_mask_roundscale_ps (__m256 __W, __mmask8 __U, __m256 __A,
+                          const int __imm)
 {
-  __builtin_ia32_scattersiv4di (__addr, __mask, (__v4si) __index,
-                               (__v4di) __v1, __scale);
+  return (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A,
+                                                     __imm,
+                                                     (__v8sf) __W,
+                                                     (__mmask8) __U);
 }
 
-extern __inline void
+extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_i32scatter_epi64 (void *__addr, __m128i __index,
-                     __m128i __v1, const int __scale)
+_mm256_maskz_roundscale_ps (__mmask8 __U, __m256 __A, const int __imm)
 {
-  __builtin_ia32_scattersiv2di (__addr, (__mmask8) 0xFF,
-                               (__v4si) __index, (__v2di) __v1,
-                               __scale);
+  return (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A,
+                                                     __imm,
+                                                     (__v8sf)
+                                                     _mm256_setzero_ps (),
+                                                     (__mmask8) __U);
 }
 
-extern __inline void
+extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_i32scatter_epi64 (void *__addr, __mmask8 __mask,
-                          __m128i __index, __m128i __v1,
-                          const int __scale)
+_mm256_roundscale_pd (__m256d __A, const int __imm)
 {
-  __builtin_ia32_scattersiv2di (__addr, __mask, (__v4si) __index,
-                               (__v2di) __v1, __scale);
+  return (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A,
+                                                      __imm,
+                                                      (__v4df)
+                                                      _mm256_setzero_pd (),
+                                                      (__mmask8) -1);
 }
 
-extern __inline void
+extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_i64scatter_epi32 (void *__addr, __m256i __index,
-                        __m128i __v1, const int __scale)
+_mm256_mask_roundscale_pd (__m256d __W, __mmask8 __U, __m256d __A,
+                          const int __imm)
 {
-  __builtin_ia32_scatterdiv8si (__addr, (__mmask8) 0xFF,
-                               (__v4di) __index, (__v4si) __v1,
-                               __scale);
+  return (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A,
+                                                      __imm,
+                                                      (__v4df) __W,
+                                                      (__mmask8) __U);
 }
 
-extern __inline void
+extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_i64scatter_epi32 (void *__addr, __mmask8 __mask,
-                             __m256i __index, __m128i __v1,
-                             const int __scale)
+_mm256_maskz_roundscale_pd (__mmask8 __U, __m256d __A, const int __imm)
 {
-  __builtin_ia32_scatterdiv8si (__addr, __mask, (__v4di) __index,
-                               (__v4si) __v1, __scale);
+  return (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A,
+                                                      __imm,
+                                                      (__v4df)
+                                                      _mm256_setzero_pd (),
+                                                      (__mmask8) __U);
 }
 
-extern __inline void
+extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_i64scatter_epi32 (void *__addr, __m128i __index,
-                     __m128i __v1, const int __scale)
+_mm_roundscale_ps (__m128 __A, const int __imm)
 {
-  __builtin_ia32_scatterdiv4si (__addr, (__mmask8) 0xFF,
-                               (__v2di) __index, (__v4si) __v1,
-                               __scale);
+  return (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A,
+                                                     __imm,
+                                                     (__v4sf)
+                                                     _mm_setzero_ps (),
+                                                     (__mmask8) -1);
 }
 
-extern __inline void
+extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_i64scatter_epi32 (void *__addr, __mmask8 __mask,
-                          __m128i __index, __m128i __v1,
-                          const int __scale)
+_mm_mask_roundscale_ps (__m128 __W, __mmask8 __U, __m128 __A,
+                       const int __imm)
 {
-  __builtin_ia32_scatterdiv4si (__addr, __mask, (__v2di) __index,
-                               (__v4si) __v1, __scale);
+  return (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A,
+                                                     __imm,
+                                                     (__v4sf) __W,
+                                                     (__mmask8) __U);
 }
 
-extern __inline void
+extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_i64scatter_epi64 (void *__addr, __m256i __index,
-                        __m256i __v1, const int __scale)
+_mm_maskz_roundscale_ps (__mmask8 __U, __m128 __A, const int __imm)
 {
-  __builtin_ia32_scatterdiv4di (__addr, (__mmask8) 0xFF,
-                               (__v4di) __index, (__v4di) __v1,
-                               __scale);
+  return (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A,
+                                                     __imm,
+                                                     (__v4sf)
+                                                     _mm_setzero_ps (),
+                                                     (__mmask8) __U);
 }
 
-extern __inline void
+extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_i64scatter_epi64 (void *__addr, __mmask8 __mask,
-                             __m256i __index, __m256i __v1,
-                             const int __scale)
+_mm_roundscale_pd (__m128d __A, const int __imm)
 {
-  __builtin_ia32_scatterdiv4di (__addr, __mask, (__v4di) __index,
-                               (__v4di) __v1, __scale);
+  return (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A,
+                                                      __imm,
+                                                      (__v2df)
+                                                      _mm_setzero_pd (),
+                                                      (__mmask8) -1);
 }
 
-extern __inline void
+extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_i64scatter_epi64 (void *__addr, __m128i __index,
-                     __m128i __v1, const int __scale)
+_mm_mask_roundscale_pd (__m128d __W, __mmask8 __U, __m128d __A,
+                       const int __imm)
 {
-  __builtin_ia32_scatterdiv2di (__addr, (__mmask8) 0xFF,
-                               (__v2di) __index, (__v2di) __v1,
-                               __scale);
+  return (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A,
+                                                      __imm,
+                                                      (__v2df) __W,
+                                                      (__mmask8) __U);
 }
 
-extern __inline void
+extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_i64scatter_epi64 (void *__addr, __mmask8 __mask,
-                          __m128i __index, __m128i __v1,
-                          const int __scale)
+_mm_maskz_roundscale_pd (__mmask8 __U, __m128d __A, const int __imm)
 {
-  __builtin_ia32_scatterdiv2di (__addr, __mask, (__v2di) __index,
-                               (__v2di) __v1, __scale);
+  return (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A,
+                                                      __imm,
+                                                      (__v2df)
+                                                      _mm_setzero_pd (),
+                                                      (__mmask8) __U);
 }
 
-extern __inline __m256i
+extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_shuffle_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
-                          _MM_PERM_ENUM __mask)
+_mm256_getmant_ps (__m256 __A, _MM_MANTISSA_NORM_ENUM __B,
+                  _MM_MANTISSA_SIGN_ENUM __C)
 {
-  return (__m256i) __builtin_ia32_pshufd256_mask ((__v8si) __A, __mask,
-                                                 (__v8si) __W,
-                                                 (__mmask8) __U);
+  return (__m256) __builtin_ia32_getmantps256_mask ((__v8sf) __A,
+                                                   (__C << 2) | __B,
+                                                   (__v8sf)
+                                                   _mm256_setzero_ps (),
+                                                   (__mmask8) -1);
 }
 
-extern __inline __m256i
+extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_shuffle_epi32 (__mmask8 __U, __m256i __A,
-                           _MM_PERM_ENUM __mask)
+_mm256_mask_getmant_ps (__m256 __W, __mmask8 __U, __m256 __A,
+                       _MM_MANTISSA_NORM_ENUM __B,
+                       _MM_MANTISSA_SIGN_ENUM __C)
 {
-  return (__m256i) __builtin_ia32_pshufd256_mask ((__v8si) __A, __mask,
-                                                 (__v8si)
-                                                 _mm256_setzero_si256 (),
-                                                 (__mmask8) __U);
+  return (__m256) __builtin_ia32_getmantps256_mask ((__v8sf) __A,
+                                                   (__C << 2) | __B,
+                                                   (__v8sf) __W,
+                                                   (__mmask8) __U);
 }
 
-extern __inline __m128i
+extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_shuffle_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
-                       _MM_PERM_ENUM __mask)
+_mm256_maskz_getmant_ps (__mmask8 __U, __m256 __A,
+                        _MM_MANTISSA_NORM_ENUM __B,
+                        _MM_MANTISSA_SIGN_ENUM __C)
 {
-  return (__m128i) __builtin_ia32_pshufd128_mask ((__v4si) __A, __mask,
-                                                 (__v4si) __W,
-                                                 (__mmask8) __U);
+  return (__m256) __builtin_ia32_getmantps256_mask ((__v8sf) __A,
+                                                   (__C << 2) | __B,
+                                                   (__v8sf)
+                                                   _mm256_setzero_ps (),
+                                                   (__mmask8) __U);
 }
 
-extern __inline __m128i
+extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_shuffle_epi32 (__mmask8 __U, __m128i __A,
-                        _MM_PERM_ENUM __mask)
+_mm_getmant_ps (__m128 __A, _MM_MANTISSA_NORM_ENUM __B,
+               _MM_MANTISSA_SIGN_ENUM __C)
 {
-  return (__m128i) __builtin_ia32_pshufd128_mask ((__v4si) __A, __mask,
-                                                 (__v4si)
-                                                 _mm_setzero_si128 (),
-                                                 (__mmask8) __U);
+  return (__m128) __builtin_ia32_getmantps128_mask ((__v4sf) __A,
+                                                   (__C << 2) | __B,
+                                                   (__v4sf)
+                                                   _mm_setzero_ps (),
+                                                   (__mmask8) -1);
 }
 
-extern __inline __m256i
+extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_rol_epi32 (__m256i __A, const int __B)
+_mm_mask_getmant_ps (__m128 __W, __mmask8 __U, __m128 __A,
+                    _MM_MANTISSA_NORM_ENUM __B,
+                    _MM_MANTISSA_SIGN_ENUM __C)
 {
-  return (__m256i) __builtin_ia32_prold256_mask ((__v8si) __A, __B,
-                                                (__v8si)
-                                                _mm256_setzero_si256 (),
-                                                (__mmask8) -1);
+  return (__m128) __builtin_ia32_getmantps128_mask ((__v4sf) __A,
+                                                   (__C << 2) | __B,
+                                                   (__v4sf) __W,
+                                                   (__mmask8) __U);
 }
 
-extern __inline __m256i
+extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_rol_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
-                      const int __B)
+_mm_maskz_getmant_ps (__mmask8 __U, __m128 __A,
+                     _MM_MANTISSA_NORM_ENUM __B,
+                     _MM_MANTISSA_SIGN_ENUM __C)
 {
-  return (__m256i) __builtin_ia32_prold256_mask ((__v8si) __A, __B,
-                                                (__v8si) __W,
-                                                (__mmask8) __U);
+  return (__m128) __builtin_ia32_getmantps128_mask ((__v4sf) __A,
+                                                   (__C << 2) | __B,
+                                                   (__v4sf)
+                                                   _mm_setzero_ps (),
+                                                   (__mmask8) __U);
 }
 
-extern __inline __m256i
+extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_rol_epi32 (__mmask8 __U, __m256i __A, const int __B)
+_mm256_getmant_pd (__m256d __A, _MM_MANTISSA_NORM_ENUM __B,
+                  _MM_MANTISSA_SIGN_ENUM __C)
 {
-  return (__m256i) __builtin_ia32_prold256_mask ((__v8si) __A, __B,
-                                                (__v8si)
-                                                _mm256_setzero_si256 (),
-                                                (__mmask8) __U);
+  return (__m256d) __builtin_ia32_getmantpd256_mask ((__v4df) __A,
+                                                    (__C << 2) | __B,
+                                                    (__v4df)
+                                                    _mm256_setzero_pd (),
+                                                    (__mmask8) -1);
 }
 
-extern __inline __m128i
+extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_rol_epi32 (__m128i __A, const int __B)
+_mm256_mask_getmant_pd (__m256d __W, __mmask8 __U, __m256d __A,
+                       _MM_MANTISSA_NORM_ENUM __B,
+                       _MM_MANTISSA_SIGN_ENUM __C)
 {
-  return (__m128i) __builtin_ia32_prold128_mask ((__v4si) __A, __B,
-                                                (__v4si)
-                                                _mm_setzero_si128 (),
-                                                (__mmask8) -1);
+  return (__m256d) __builtin_ia32_getmantpd256_mask ((__v4df) __A,
+                                                    (__C << 2) | __B,
+                                                    (__v4df) __W,
+                                                    (__mmask8) __U);
 }
 
-extern __inline __m128i
+extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_rol_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
-                   const int __B)
+_mm256_maskz_getmant_pd (__mmask8 __U, __m256d __A,
+                        _MM_MANTISSA_NORM_ENUM __B,
+                        _MM_MANTISSA_SIGN_ENUM __C)
 {
-  return (__m128i) __builtin_ia32_prold128_mask ((__v4si) __A, __B,
-                                                (__v4si) __W,
-                                                (__mmask8) __U);
+  return (__m256d) __builtin_ia32_getmantpd256_mask ((__v4df) __A,
+                                                    (__C << 2) | __B,
+                                                    (__v4df)
+                                                    _mm256_setzero_pd (),
+                                                    (__mmask8) __U);
 }
 
-extern __inline __m128i
+extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_rol_epi32 (__mmask8 __U, __m128i __A, const int __B)
+_mm_getmant_pd (__m128d __A, _MM_MANTISSA_NORM_ENUM __B,
+               _MM_MANTISSA_SIGN_ENUM __C)
 {
-  return (__m128i) __builtin_ia32_prold128_mask ((__v4si) __A, __B,
-                                                (__v4si)
-                                                _mm_setzero_si128 (),
-                                                (__mmask8) __U);
+  return (__m128d) __builtin_ia32_getmantpd128_mask ((__v2df) __A,
+                                                    (__C << 2) | __B,
+                                                    (__v2df)
+                                                    _mm_setzero_pd (),
+                                                    (__mmask8) -1);
 }
 
-extern __inline __m256i
+extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_ror_epi32 (__m256i __A, const int __B)
+_mm_mask_getmant_pd (__m128d __W, __mmask8 __U, __m128d __A,
+                    _MM_MANTISSA_NORM_ENUM __B,
+                    _MM_MANTISSA_SIGN_ENUM __C)
 {
-  return (__m256i) __builtin_ia32_prord256_mask ((__v8si) __A, __B,
-                                                (__v8si)
-                                                _mm256_setzero_si256 (),
-                                                (__mmask8) -1);
+  return (__m128d) __builtin_ia32_getmantpd128_mask ((__v2df) __A,
+                                                    (__C << 2) | __B,
+                                                    (__v2df) __W,
+                                                    (__mmask8) __U);
 }
 
-extern __inline __m256i
+extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_ror_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
-                      const int __B)
+_mm_maskz_getmant_pd (__mmask8 __U, __m128d __A,
+                     _MM_MANTISSA_NORM_ENUM __B,
+                     _MM_MANTISSA_SIGN_ENUM __C)
 {
-  return (__m256i) __builtin_ia32_prord256_mask ((__v8si) __A, __B,
-                                                (__v8si) __W,
-                                                (__mmask8) __U);
+  return (__m128d) __builtin_ia32_getmantpd128_mask ((__v2df) __A,
+                                                    (__C << 2) | __B,
+                                                    (__v2df)
+                                                    _mm_setzero_pd (),
+                                                    (__mmask8) __U);
 }
 
-extern __inline __m256i
+extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_ror_epi32 (__mmask8 __U, __m256i __A, const int __B)
+_mm256_mmask_i32gather_ps (__m256 __v1_old, __mmask8 __mask,
+                          __m256i __index, void const *__addr,
+                          int __scale)
 {
-  return (__m256i) __builtin_ia32_prord256_mask ((__v8si) __A, __B,
-                                                (__v8si)
-                                                _mm256_setzero_si256 (),
-                                                (__mmask8) __U);
+  return (__m256) __builtin_ia32_gather3siv8sf ((__v8sf) __v1_old,
+                                               __addr,
+                                               (__v8si) __index,
+                                               __mask, __scale);
 }
 
-extern __inline __m128i
+extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_ror_epi32 (__m128i __A, const int __B)
+_mm_mmask_i32gather_ps (__m128 __v1_old, __mmask8 __mask,
+                       __m128i __index, void const *__addr,
+                       int __scale)
 {
-  return (__m128i) __builtin_ia32_prord128_mask ((__v4si) __A, __B,
-                                                (__v4si)
-                                                _mm_setzero_si128 (),
-                                                (__mmask8) -1);
+  return (__m128) __builtin_ia32_gather3siv4sf ((__v4sf) __v1_old,
+                                               __addr,
+                                               (__v4si) __index,
+                                               __mask, __scale);
 }
 
-extern __inline __m128i
+extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_ror_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
-                   const int __B)
+_mm256_mmask_i32gather_pd (__m256d __v1_old, __mmask8 __mask,
+                          __m128i __index, void const *__addr,
+                          int __scale)
 {
-  return (__m128i) __builtin_ia32_prord128_mask ((__v4si) __A, __B,
-                                                (__v4si) __W,
-                                                (__mmask8) __U);
+  return (__m256d) __builtin_ia32_gather3siv4df ((__v4df) __v1_old,
+                                                __addr,
+                                                (__v4si) __index,
+                                                __mask, __scale);
 }
 
-extern __inline __m128i
+extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_ror_epi32 (__mmask8 __U, __m128i __A, const int __B)
+_mm_mmask_i32gather_pd (__m128d __v1_old, __mmask8 __mask,
+                       __m128i __index, void const *__addr,
+                       int __scale)
 {
-  return (__m128i) __builtin_ia32_prord128_mask ((__v4si) __A, __B,
-                                                (__v4si)
-                                                _mm_setzero_si128 (),
-                                                (__mmask8) __U);
+  return (__m128d) __builtin_ia32_gather3siv2df ((__v2df) __v1_old,
+                                                __addr,
+                                                (__v4si) __index,
+                                                __mask, __scale);
 }
 
-extern __inline __m256i
+extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_rol_epi64 (__m256i __A, const int __B)
-{
-  return (__m256i) __builtin_ia32_prolq256_mask ((__v4di) __A, __B,
-                                                (__v4di)
-                                                _mm256_setzero_si256 (),
-                                                (__mmask8) -1);
+_mm256_mmask_i64gather_ps (__m128 __v1_old, __mmask8 __mask,
+                          __m256i __index, void const *__addr,
+                          int __scale)
+{
+  return (__m128) __builtin_ia32_gather3div8sf ((__v4sf) __v1_old,
+                                               __addr,
+                                               (__v4di) __index,
+                                               __mask, __scale);
 }
 
-extern __inline __m256i
+extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_rol_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
-                      const int __B)
+_mm_mmask_i64gather_ps (__m128 __v1_old, __mmask8 __mask,
+                       __m128i __index, void const *__addr,
+                       int __scale)
 {
-  return (__m256i) __builtin_ia32_prolq256_mask ((__v4di) __A, __B,
-                                                (__v4di) __W,
-                                                (__mmask8) __U);
+  return (__m128) __builtin_ia32_gather3div4sf ((__v4sf) __v1_old,
+                                               __addr,
+                                               (__v2di) __index,
+                                               __mask, __scale);
 }
 
-extern __inline __m256i
+extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_rol_epi64 (__mmask8 __U, __m256i __A, const int __B)
+_mm256_mmask_i64gather_pd (__m256d __v1_old, __mmask8 __mask,
+                          __m256i __index, void const *__addr,
+                          int __scale)
 {
-  return (__m256i) __builtin_ia32_prolq256_mask ((__v4di) __A, __B,
-                                                (__v4di)
-                                                _mm256_setzero_si256 (),
-                                                (__mmask8) __U);
+  return (__m256d) __builtin_ia32_gather3div4df ((__v4df) __v1_old,
+                                                __addr,
+                                                (__v4di) __index,
+                                                __mask, __scale);
 }
 
-extern __inline __m128i
+extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_rol_epi64 (__m128i __A, const int __B)
+_mm_mmask_i64gather_pd (__m128d __v1_old, __mmask8 __mask,
+                       __m128i __index, void const *__addr,
+                       int __scale)
 {
-  return (__m128i) __builtin_ia32_prolq128_mask ((__v2di) __A, __B,
-                                                (__v2di)
-                                                _mm_setzero_si128 (),
-                                                (__mmask8) -1);
+  return (__m128d) __builtin_ia32_gather3div2df ((__v2df) __v1_old,
+                                                __addr,
+                                                (__v2di) __index,
+                                                __mask, __scale);
 }
 
-extern __inline __m128i
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_rol_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
-                   const int __B)
+_mm256_mmask_i32gather_epi32 (__m256i __v1_old, __mmask8 __mask,
+                             __m256i __index, void const *__addr,
+                             int __scale)
 {
-  return (__m128i) __builtin_ia32_prolq128_mask ((__v2di) __A, __B,
-                                                (__v2di) __W,
-                                                (__mmask8) __U);
+  return (__m256i) __builtin_ia32_gather3siv8si ((__v8si) __v1_old,
+                                                __addr,
+                                                (__v8si) __index,
+                                                __mask, __scale);
 }
 
 extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_rol_epi64 (__mmask8 __U, __m128i __A, const int __B)
+_mm_mmask_i32gather_epi32 (__m128i __v1_old, __mmask8 __mask,
+                          __m128i __index, void const *__addr,
+                          int __scale)
 {
-  return (__m128i) __builtin_ia32_prolq128_mask ((__v2di) __A, __B,
-                                                (__v2di)
-                                                _mm_setzero_si128 (),
-                                                (__mmask8) __U);
+  return (__m128i) __builtin_ia32_gather3siv4si ((__v4si) __v1_old,
+                                                __addr,
+                                                (__v4si) __index,
+                                                __mask, __scale);
 }
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_ror_epi64 (__m256i __A, const int __B)
+_mm256_mmask_i32gather_epi64 (__m256i __v1_old, __mmask8 __mask,
+                             __m128i __index, void const *__addr,
+                             int __scale)
 {
-  return (__m256i) __builtin_ia32_prorq256_mask ((__v4di) __A, __B,
-                                                (__v4di)
-                                                _mm256_setzero_si256 (),
-                                                (__mmask8) -1);
+  return (__m256i) __builtin_ia32_gather3siv4di ((__v4di) __v1_old,
+                                                __addr,
+                                                (__v4si) __index,
+                                                __mask, __scale);
 }
 
-extern __inline __m256i
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_ror_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
-                      const int __B)
+_mm_mmask_i32gather_epi64 (__m128i __v1_old, __mmask8 __mask,
+                          __m128i __index, void const *__addr,
+                          int __scale)
 {
-  return (__m256i) __builtin_ia32_prorq256_mask ((__v4di) __A, __B,
-                                                (__v4di) __W,
-                                                (__mmask8) __U);
+  return (__m128i) __builtin_ia32_gather3siv2di ((__v2di) __v1_old,
+                                                __addr,
+                                                (__v4si) __index,
+                                                __mask, __scale);
 }
 
-extern __inline __m256i
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_ror_epi64 (__mmask8 __U, __m256i __A, const int __B)
+_mm256_mmask_i64gather_epi32 (__m128i __v1_old, __mmask8 __mask,
+                             __m256i __index, void const *__addr,
+                             int __scale)
 {
-  return (__m256i) __builtin_ia32_prorq256_mask ((__v4di) __A, __B,
-                                                (__v4di)
-                                                _mm256_setzero_si256 (),
-                                                (__mmask8) __U);
+  return (__m128i) __builtin_ia32_gather3div8si ((__v4si) __v1_old,
+                                                __addr,
+                                                (__v4di) __index,
+                                                __mask, __scale);
 }
 
 extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_ror_epi64 (__m128i __A, const int __B)
+_mm_mmask_i64gather_epi32 (__m128i __v1_old, __mmask8 __mask,
+                          __m128i __index, void const *__addr,
+                          int __scale)
 {
-  return (__m128i) __builtin_ia32_prorq128_mask ((__v2di) __A, __B,
-                                                (__v2di)
-                                                _mm_setzero_si128 (),
-                                                (__mmask8) -1);
+  return (__m128i) __builtin_ia32_gather3div4si ((__v4si) __v1_old,
+                                                __addr,
+                                                (__v2di) __index,
+                                                __mask, __scale);
 }
 
-extern __inline __m128i
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_ror_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
-                   const int __B)
+_mm256_mmask_i64gather_epi64 (__m256i __v1_old, __mmask8 __mask,
+                             __m256i __index, void const *__addr,
+                             int __scale)
 {
-  return (__m128i) __builtin_ia32_prorq128_mask ((__v2di) __A, __B,
-                                                (__v2di) __W,
-                                                (__mmask8) __U);
+  return (__m256i) __builtin_ia32_gather3div4di ((__v4di) __v1_old,
+                                                __addr,
+                                                (__v4di) __index,
+                                                __mask, __scale);
 }
 
 extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_ror_epi64 (__mmask8 __U, __m128i __A, const int __B)
+_mm_mmask_i64gather_epi64 (__m128i __v1_old, __mmask8 __mask,
+                          __m128i __index, void const *__addr,
+                          int __scale)
 {
-  return (__m128i) __builtin_ia32_prorq128_mask ((__v2di) __A, __B,
-                                                (__v2di)
-                                                _mm_setzero_si128 (),
-                                                (__mmask8) __U);
+  return (__m128i) __builtin_ia32_gather3div2di ((__v2di) __v1_old,
+                                                __addr,
+                                                (__v2di) __index,
+                                                __mask, __scale);
 }
 
-extern __inline __m128i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_alignr_epi32 (__m128i __A, __m128i __B, const int __imm)
+_mm256_i32scatter_ps (void *__addr, __m256i __index,
+                     __m256 __v1, const int __scale)
 {
-  return (__m128i) __builtin_ia32_alignd128_mask ((__v4si) __A,
-                                                 (__v4si) __B, __imm,
-                                                 (__v4si)
-                                                 _mm_setzero_si128 (),
-                                                 (__mmask8) -1);
+  __builtin_ia32_scattersiv8sf (__addr, (__mmask8) 0xFF,
+                               (__v8si) __index, (__v8sf) __v1,
+                               __scale);
 }
 
-extern __inline __m128i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_alignr_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
-                      __m128i __B, const int __imm)
+_mm256_mask_i32scatter_ps (void *__addr, __mmask8 __mask,
+                          __m256i __index, __m256 __v1,
+                          const int __scale)
 {
-  return (__m128i) __builtin_ia32_alignd128_mask ((__v4si) __A,
-                                                 (__v4si) __B, __imm,
-                                                 (__v4si) __W,
-                                                 (__mmask8) __U);
+  __builtin_ia32_scattersiv8sf (__addr, __mask, (__v8si) __index,
+                               (__v8sf) __v1, __scale);
 }
 
-extern __inline __m128i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_alignr_epi32 (__mmask8 __U, __m128i __A, __m128i __B,
-                       const int __imm)
+_mm_i32scatter_ps (void *__addr, __m128i __index, __m128 __v1,
+                  const int __scale)
 {
-  return (__m128i) __builtin_ia32_alignd128_mask ((__v4si) __A,
-                                                 (__v4si) __B, __imm,
-                                                 (__v4si)
-                                                 _mm_setzero_si128 (),
-                                                 (__mmask8) __U);
+  __builtin_ia32_scattersiv4sf (__addr, (__mmask8) 0xFF,
+                               (__v4si) __index, (__v4sf) __v1,
+                               __scale);
 }
 
-extern __inline __m128i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_alignr_epi64 (__m128i __A, __m128i __B, const int __imm)
+_mm_mask_i32scatter_ps (void *__addr, __mmask8 __mask,
+                       __m128i __index, __m128 __v1,
+                       const int __scale)
 {
-  return (__m128i) __builtin_ia32_alignq128_mask ((__v2di) __A,
-                                                 (__v2di) __B, __imm,
-                                                 (__v2di)
-                                                 _mm_setzero_si128 (),
-                                                 (__mmask8) -1);
+  __builtin_ia32_scattersiv4sf (__addr, __mask, (__v4si) __index,
+                               (__v4sf) __v1, __scale);
 }
 
-extern __inline __m128i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_alignr_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
-                      __m128i __B, const int __imm)
+_mm256_i32scatter_pd (void *__addr, __m128i __index,
+                     __m256d __v1, const int __scale)
 {
-  return (__m128i) __builtin_ia32_alignq128_mask ((__v2di) __A,
-                                                 (__v2di) __B, __imm,
-                                                 (__v2di) __W,
-                                                 (__mmask8) __U);
+  __builtin_ia32_scattersiv4df (__addr, (__mmask8) 0xFF,
+                               (__v4si) __index, (__v4df) __v1,
+                               __scale);
 }
 
-extern __inline __m128i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_alignr_epi64 (__mmask8 __U, __m128i __A, __m128i __B,
-                       const int __imm)
+_mm256_mask_i32scatter_pd (void *__addr, __mmask8 __mask,
+                          __m128i __index, __m256d __v1,
+                          const int __scale)
 {
-  return (__m128i) __builtin_ia32_alignq128_mask ((__v2di) __A,
-                                                 (__v2di) __B, __imm,
-                                                 (__v2di)
-                                                 _mm_setzero_si128 (),
-                                                 (__mmask8) __U);
+  __builtin_ia32_scattersiv4df (__addr, __mask, (__v4si) __index,
+                               (__v4df) __v1, __scale);
 }
 
-extern __inline __m256i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_alignr_epi32 (__m256i __A, __m256i __B, const int __imm)
+_mm_i32scatter_pd (void *__addr, __m128i __index,
+                  __m128d __v1, const int __scale)
 {
-  return (__m256i) __builtin_ia32_alignd256_mask ((__v8si) __A,
-                                                 (__v8si) __B, __imm,
-                                                 (__v8si)
-                                                 _mm256_setzero_si256 (),
-                                                 (__mmask8) -1);
+  __builtin_ia32_scattersiv2df (__addr, (__mmask8) 0xFF,
+                               (__v4si) __index, (__v2df) __v1,
+                               __scale);
 }
 
-extern __inline __m256i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_alignr_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
-                         __m256i __B, const int __imm)
+_mm_mask_i32scatter_pd (void *__addr, __mmask8 __mask,
+                       __m128i __index, __m128d __v1,
+                       const int __scale)
 {
-  return (__m256i) __builtin_ia32_alignd256_mask ((__v8si) __A,
-                                                 (__v8si) __B, __imm,
-                                                 (__v8si) __W,
-                                                 (__mmask8) __U);
+  __builtin_ia32_scattersiv2df (__addr, __mask, (__v4si) __index,
+                               (__v2df) __v1, __scale);
 }
 
-extern __inline __m256i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_alignr_epi32 (__mmask8 __U, __m256i __A, __m256i __B,
-                          const int __imm)
+_mm256_i64scatter_ps (void *__addr, __m256i __index,
+                     __m128 __v1, const int __scale)
 {
-  return (__m256i) __builtin_ia32_alignd256_mask ((__v8si) __A,
-                                                 (__v8si) __B, __imm,
-                                                 (__v8si)
-                                                 _mm256_setzero_si256 (),
-                                                 (__mmask8) __U);
+  __builtin_ia32_scatterdiv8sf (__addr, (__mmask8) 0xFF,
+                               (__v4di) __index, (__v4sf) __v1,
+                               __scale);
 }
 
-extern __inline __m256i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_alignr_epi64 (__m256i __A, __m256i __B, const int __imm)
+_mm256_mask_i64scatter_ps (void *__addr, __mmask8 __mask,
+                          __m256i __index, __m128 __v1,
+                          const int __scale)
 {
-  return (__m256i) __builtin_ia32_alignq256_mask ((__v4di) __A,
-                                                 (__v4di) __B, __imm,
-                                                 (__v4di)
-                                                 _mm256_setzero_si256 (),
-                                                 (__mmask8) -1);
+  __builtin_ia32_scatterdiv8sf (__addr, __mask, (__v4di) __index,
+                               (__v4sf) __v1, __scale);
 }
 
-extern __inline __m256i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_alignr_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
-                         __m256i __B, const int __imm)
+_mm_i64scatter_ps (void *__addr, __m128i __index, __m128 __v1,
+                  const int __scale)
 {
-  return (__m256i) __builtin_ia32_alignq256_mask ((__v4di) __A,
-                                                 (__v4di) __B, __imm,
-                                                 (__v4di) __W,
-                                                 (__mmask8) __U);
+  __builtin_ia32_scatterdiv4sf (__addr, (__mmask8) 0xFF,
+                               (__v2di) __index, (__v4sf) __v1,
+                               __scale);
 }
 
-extern __inline __m256i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_alignr_epi64 (__mmask8 __U, __m256i __A, __m256i __B,
-                          const int __imm)
+_mm_mask_i64scatter_ps (void *__addr, __mmask8 __mask,
+                       __m128i __index, __m128 __v1,
+                       const int __scale)
 {
-  return (__m256i) __builtin_ia32_alignq256_mask ((__v4di) __A,
-                                                 (__v4di) __B, __imm,
-                                                 (__v4di)
-                                                 _mm256_setzero_si256 (),
-                                                 (__mmask8) __U);
+  __builtin_ia32_scatterdiv4sf (__addr, __mask, (__v2di) __index,
+                               (__v4sf) __v1, __scale);
 }
 
-extern __inline __m128i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_cvtps_ph (__m128i __W, __mmask8 __U, __m128 __A,
-                  const int __I)
+_mm256_i64scatter_pd (void *__addr, __m256i __index,
+                     __m256d __v1, const int __scale)
 {
-  return (__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf) __A, __I,
-                                                 (__v8hi) __W,
-                                                 (__mmask8) __U);
+  __builtin_ia32_scatterdiv4df (__addr, (__mmask8) 0xFF,
+                               (__v4di) __index, (__v4df) __v1,
+                               __scale);
 }
 
-extern __inline __m128i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_cvtps_ph (__mmask8 __U, __m128 __A, const int __I)
+_mm256_mask_i64scatter_pd (void *__addr, __mmask8 __mask,
+                          __m256i __index, __m256d __v1,
+                          const int __scale)
 {
-  return (__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf) __A, __I,
-                                                 (__v8hi)
-                                                 _mm_setzero_si128 (),
-                                                 (__mmask8) __U);
+  __builtin_ia32_scatterdiv4df (__addr, __mask, (__v4di) __index,
+                               (__v4df) __v1, __scale);
 }
 
-extern __inline __m128i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cvtps_ph (__m128i __W, __mmask8 __U, __m256 __A,
-                     const int __I)
+_mm_i64scatter_pd (void *__addr, __m128i __index,
+                  __m128d __v1, const int __scale)
 {
-  return (__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf) __A, __I,
-                                                    (__v8hi) __W,
-                                                    (__mmask8) __U);
+  __builtin_ia32_scatterdiv2df (__addr, (__mmask8) 0xFF,
+                               (__v2di) __index, (__v2df) __v1,
+                               __scale);
 }
 
-extern __inline __m128i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_cvtps_ph (__mmask8 __U, __m256 __A, const int __I)
+_mm_mask_i64scatter_pd (void *__addr, __mmask8 __mask,
+                       __m128i __index, __m128d __v1,
+                       const int __scale)
 {
-  return (__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf) __A, __I,
-                                                    (__v8hi)
-                                                    _mm_setzero_si128 (),
-                                                    (__mmask8) __U);
+  __builtin_ia32_scatterdiv2df (__addr, __mask, (__v2di) __index,
+                               (__v2df) __v1, __scale);
 }
 
-extern __inline __m256i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_srai_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
-                       const int __imm)
+_mm256_i32scatter_epi32 (void *__addr, __m256i __index,
+                        __m256i __v1, const int __scale)
 {
-  return (__m256i) __builtin_ia32_psradi256_mask ((__v8si) __A, __imm,
-                                                 (__v8si) __W,
-                                                 (__mmask8) __U);
+  __builtin_ia32_scattersiv8si (__addr, (__mmask8) 0xFF,
+                               (__v8si) __index, (__v8si) __v1,
+                               __scale);
 }
 
-extern __inline __m256i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_srai_epi32 (__mmask8 __U, __m256i __A, const int __imm)
+_mm256_mask_i32scatter_epi32 (void *__addr, __mmask8 __mask,
+                             __m256i __index, __m256i __v1,
+                             const int __scale)
 {
-  return (__m256i) __builtin_ia32_psradi256_mask ((__v8si) __A, __imm,
-                                                 (__v8si)
-                                                 _mm256_setzero_si256 (),
-                                                 (__mmask8) __U);
+  __builtin_ia32_scattersiv8si (__addr, __mask, (__v8si) __index,
+                               (__v8si) __v1, __scale);
 }
 
-extern __inline __m128i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_srai_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
-                    const int __imm)
+_mm_i32scatter_epi32 (void *__addr, __m128i __index,
+                     __m128i __v1, const int __scale)
 {
-  return (__m128i) __builtin_ia32_psradi128_mask ((__v4si) __A, __imm,
-                                                 (__v4si) __W,
-                                                 (__mmask8) __U);
+  __builtin_ia32_scattersiv4si (__addr, (__mmask8) 0xFF,
+                               (__v4si) __index, (__v4si) __v1,
+                               __scale);
 }
 
-extern __inline __m128i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_srai_epi32 (__mmask8 __U, __m128i __A, const int __imm)
+_mm_mask_i32scatter_epi32 (void *__addr, __mmask8 __mask,
+                          __m128i __index, __m128i __v1,
+                          const int __scale)
 {
-  return (__m128i) __builtin_ia32_psradi128_mask ((__v4si) __A, __imm,
-                                                 (__v4si)
-                                                 _mm_setzero_si128 (),
-                                                 (__mmask8) __U);
+  __builtin_ia32_scattersiv4si (__addr, __mask, (__v4si) __index,
+                               (__v4si) __v1, __scale);
 }
 
-extern __inline __m256i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_srai_epi64 (__m256i __A, const int __imm)
+_mm256_i32scatter_epi64 (void *__addr, __m128i __index,
+                        __m256i __v1, const int __scale)
 {
-  return (__m256i) __builtin_ia32_psraqi256_mask ((__v4di) __A, __imm,
-                                                 (__v4di)
-                                                 _mm256_setzero_si256 (),
-                                                 (__mmask8) -1);
+  __builtin_ia32_scattersiv4di (__addr, (__mmask8) 0xFF,
+                               (__v4si) __index, (__v4di) __v1,
+                               __scale);
 }
 
-extern __inline __m256i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_srai_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
-                       const int __imm)
+_mm256_mask_i32scatter_epi64 (void *__addr, __mmask8 __mask,
+                             __m128i __index, __m256i __v1,
+                             const int __scale)
 {
-  return (__m256i) __builtin_ia32_psraqi256_mask ((__v4di) __A, __imm,
-                                                 (__v4di) __W,
-                                                 (__mmask8) __U);
+  __builtin_ia32_scattersiv4di (__addr, __mask, (__v4si) __index,
+                               (__v4di) __v1, __scale);
 }
 
-extern __inline __m256i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_srai_epi64 (__mmask8 __U, __m256i __A, const int __imm)
+_mm_i32scatter_epi64 (void *__addr, __m128i __index,
+                     __m128i __v1, const int __scale)
 {
-  return (__m256i) __builtin_ia32_psraqi256_mask ((__v4di) __A, __imm,
-                                                 (__v4di)
-                                                 _mm256_setzero_si256 (),
-                                                 (__mmask8) __U);
+  __builtin_ia32_scattersiv2di (__addr, (__mmask8) 0xFF,
+                               (__v4si) __index, (__v2di) __v1,
+                               __scale);
 }
 
-extern __inline __m128i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_srai_epi64 (__m128i __A, const int __imm)
+_mm_mask_i32scatter_epi64 (void *__addr, __mmask8 __mask,
+                          __m128i __index, __m128i __v1,
+                          const int __scale)
 {
-  return (__m128i) __builtin_ia32_psraqi128_mask ((__v2di) __A, __imm,
-                                                 (__v2di)
-                                                 _mm_setzero_si128 (),
-                                                 (__mmask8) -1);
+  __builtin_ia32_scattersiv2di (__addr, __mask, (__v4si) __index,
+                               (__v2di) __v1, __scale);
 }
 
-extern __inline __m128i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_srai_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
-                    const int __imm)
+_mm256_i64scatter_epi32 (void *__addr, __m256i __index,
+                        __m128i __v1, const int __scale)
 {
-  return (__m128i) __builtin_ia32_psraqi128_mask ((__v2di) __A, __imm,
-                                                 (__v2di) __W,
-                                                 (__mmask8) __U);
+  __builtin_ia32_scatterdiv8si (__addr, (__mmask8) 0xFF,
+                               (__v4di) __index, (__v4si) __v1,
+                               __scale);
 }
 
-extern __inline __m128i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_srai_epi64 (__mmask8 __U, __m128i __A, const int __imm)
+_mm256_mask_i64scatter_epi32 (void *__addr, __mmask8 __mask,
+                             __m256i __index, __m128i __v1,
+                             const int __scale)
 {
-  return (__m128i) __builtin_ia32_psraqi128_mask ((__v2di) __A, __imm,
-                                                 (__v2di)
-                                                 _mm_setzero_si128 (),
-                                                 (__mmask8) __U);
+  __builtin_ia32_scatterdiv8si (__addr, __mask, (__v4di) __index,
+                               (__v4si) __v1, __scale);
 }
 
-extern __inline __m128i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_slli_epi32 (__m128i __W, __mmask8 __U, __m128i __A, int __B)
+_mm_i64scatter_epi32 (void *__addr, __m128i __index,
+                     __m128i __v1, const int __scale)
 {
-  return (__m128i) __builtin_ia32_pslldi128_mask ((__v4si) __A, __B,
-                                                 (__v4si) __W,
-                                                 (__mmask8) __U);
+  __builtin_ia32_scatterdiv4si (__addr, (__mmask8) 0xFF,
+                               (__v2di) __index, (__v4si) __v1,
+                               __scale);
 }
 
-extern __inline __m128i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_slli_epi32 (__mmask8 __U, __m128i __A, int __B)
+_mm_mask_i64scatter_epi32 (void *__addr, __mmask8 __mask,
+                          __m128i __index, __m128i __v1,
+                          const int __scale)
 {
-  return (__m128i) __builtin_ia32_pslldi128_mask ((__v4si) __A, __B,
-                                                 (__v4si)
-                                                 _mm_setzero_si128 (),
-                                                 (__mmask8) __U);
+  __builtin_ia32_scatterdiv4si (__addr, __mask, (__v2di) __index,
+                               (__v4si) __v1, __scale);
 }
 
-extern __inline __m128i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_slli_epi64 (__m128i __W, __mmask8 __U, __m128i __A, int __B)
+_mm256_i64scatter_epi64 (void *__addr, __m256i __index,
+                        __m256i __v1, const int __scale)
 {
-  return (__m128i) __builtin_ia32_psllqi128_mask ((__v2di) __A, __B,
-                                                 (__v2di) __W,
-                                                 (__mmask8) __U);
+  __builtin_ia32_scatterdiv4di (__addr, (__mmask8) 0xFF,
+                               (__v4di) __index, (__v4di) __v1,
+                               __scale);
 }
 
-extern __inline __m128i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_slli_epi64 (__mmask8 __U, __m128i __A, int __B)
+_mm256_mask_i64scatter_epi64 (void *__addr, __mmask8 __mask,
+                             __m256i __index, __m256i __v1,
+                             const int __scale)
 {
-  return (__m128i) __builtin_ia32_psllqi128_mask ((__v2di) __A, __B,
-                                                 (__v2di)
-                                                 _mm_setzero_si128 (),
-                                                 (__mmask8) __U);
+  __builtin_ia32_scatterdiv4di (__addr, __mask, (__v4di) __index,
+                               (__v4di) __v1, __scale);
 }
 
-extern __inline __m256i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_slli_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
-                       int __B)
+_mm_i64scatter_epi64 (void *__addr, __m128i __index,
+                     __m128i __v1, const int __scale)
 {
-  return (__m256i) __builtin_ia32_pslldi256_mask ((__v8si) __A, __B,
-                                                 (__v8si) __W,
-                                                 (__mmask8) __U);
+  __builtin_ia32_scatterdiv2di (__addr, (__mmask8) 0xFF,
+                               (__v2di) __index, (__v2di) __v1,
+                               __scale);
 }
 
-extern __inline __m256i
+extern __inline void
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_slli_epi32 (__mmask8 __U, __m256i __A, int __B)
+_mm_mask_i64scatter_epi64 (void *__addr, __mmask8 __mask,
+                          __m128i __index, __m128i __v1,
+                          const int __scale)
 {
-  return (__m256i) __builtin_ia32_pslldi256_mask ((__v8si) __A, __B,
-                                                 (__v8si)
-                                                 _mm256_setzero_si256 (),
-                                                 (__mmask8) __U);
+  __builtin_ia32_scatterdiv2di (__addr, __mask, (__v2di) __index,
+                               (__v2di) __v1, __scale);
 }
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_slli_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
-                       int __B)
+_mm256_mask_shuffle_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+                          _MM_PERM_ENUM __mask)
 {
-  return (__m256i) __builtin_ia32_psllqi256_mask ((__v4di) __A, __B,
-                                                 (__v4di) __W,
+  return (__m256i) __builtin_ia32_pshufd256_mask ((__v8si) __A, __mask,
+                                                 (__v8si) __W,
                                                  (__mmask8) __U);
 }
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_slli_epi64 (__mmask8 __U, __m256i __A, int __B)
+_mm256_maskz_shuffle_epi32 (__mmask8 __U, __m256i __A,
+                           _MM_PERM_ENUM __mask)
 {
-  return (__m256i) __builtin_ia32_psllqi256_mask ((__v4di) __A, __B,
-                                                 (__v4di)
+  return (__m256i) __builtin_ia32_pshufd256_mask ((__v8si) __A, __mask,
+                                                 (__v8si)
                                                  _mm256_setzero_si256 (),
                                                  (__mmask8) __U);
 }
 
-extern __inline __m256d
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_permutex_pd (__m256d __W, __mmask8 __U, __m256d __X,
-                        const int __imm)
+_mm_mask_shuffle_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+                       _MM_PERM_ENUM __mask)
 {
-  return (__m256d) __builtin_ia32_permdf256_mask ((__v4df) __X, __imm,
-                                                 (__v4df) __W,
+  return (__m128i) __builtin_ia32_pshufd128_mask ((__v4si) __A, __mask,
+                                                 (__v4si) __W,
                                                  (__mmask8) __U);
 }
 
-extern __inline __m256d
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_permutex_pd (__mmask8 __U, __m256d __X, const int __imm)
+_mm_maskz_shuffle_epi32 (__mmask8 __U, __m128i __A,
+                        _MM_PERM_ENUM __mask)
 {
-  return (__m256d) __builtin_ia32_permdf256_mask ((__v4df) __X, __imm,
-                                                 (__v4df)
-                                                 _mm256_setzero_pd (),
+  return (__m128i) __builtin_ia32_pshufd128_mask ((__v4si) __A, __mask,
+                                                 (__v4si)
+                                                 _mm_setzero_si128 (),
                                                  (__mmask8) __U);
 }
 
-extern __inline __m256d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_permute_pd (__m256d __W, __mmask8 __U, __m256d __X,
-                       const int __C)
-{
-  return (__m256d) __builtin_ia32_vpermilpd256_mask ((__v4df) __X, __C,
-                                                    (__v4df) __W,
-                                                    (__mmask8) __U);
-}
-
-extern __inline __m256d
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_permute_pd (__mmask8 __U, __m256d __X, const int __C)
-{
-  return (__m256d) __builtin_ia32_vpermilpd256_mask ((__v4df) __X, __C,
-                                                    (__v4df)
-                                                    _mm256_setzero_pd (),
-                                                    (__mmask8) __U);
-}
-
-extern __inline __m128d
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_permute_pd (__m128d __W, __mmask8 __U, __m128d __X,
-                    const int __C)
+_mm256_rol_epi32 (__m256i __A, const int __B)
 {
-  return (__m128d) __builtin_ia32_vpermilpd_mask ((__v2df) __X, __C,
-                                                 (__v2df) __W,
-                                                 (__mmask8) __U);
+  return (__m256i) __builtin_ia32_prold256_mask ((__v8si) __A, __B,
+                                                (__v8si)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) -1);
 }
 
-extern __inline __m128d
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_permute_pd (__mmask8 __U, __m128d __X, const int __C)
+_mm256_mask_rol_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+                      const int __B)
 {
-  return (__m128d) __builtin_ia32_vpermilpd_mask ((__v2df) __X, __C,
-                                                 (__v2df)
-                                                 _mm_setzero_pd (),
-                                                 (__mmask8) __U);
+  return (__m256i) __builtin_ia32_prold256_mask ((__v8si) __A, __B,
+                                                (__v8si) __W,
+                                                (__mmask8) __U);
 }
 
-extern __inline __m256
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_permute_ps (__m256 __W, __mmask8 __U, __m256 __X,
-                       const int __C)
+_mm256_maskz_rol_epi32 (__mmask8 __U, __m256i __A, const int __B)
 {
-  return (__m256) __builtin_ia32_vpermilps256_mask ((__v8sf) __X, __C,
-                                                   (__v8sf) __W,
-                                                   (__mmask8) __U);
+  return (__m256i) __builtin_ia32_prold256_mask ((__v8si) __A, __B,
+                                                (__v8si)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) __U);
 }
 
-extern __inline __m256
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_maskz_permute_ps (__mmask8 __U, __m256 __X, const int __C)
+_mm_rol_epi32 (__m128i __A, const int __B)
 {
-  return (__m256) __builtin_ia32_vpermilps256_mask ((__v8sf) __X, __C,
-                                                   (__v8sf)
-                                                   _mm256_setzero_ps (),
-                                                   (__mmask8) __U);
+  return (__m128i) __builtin_ia32_prold128_mask ((__v4si) __A, __B,
+                                                (__v4si)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) -1);
 }
 
-extern __inline __m128
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_permute_ps (__m128 __W, __mmask8 __U, __m128 __X,
-                    const int __C)
+_mm_mask_rol_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+                   const int __B)
 {
-  return (__m128) __builtin_ia32_vpermilps_mask ((__v4sf) __X, __C,
-                                                (__v4sf) __W,
+  return (__m128i) __builtin_ia32_prold128_mask ((__v4si) __A, __B,
+                                                (__v4si) __W,
                                                 (__mmask8) __U);
 }
 
-extern __inline __m128
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_maskz_permute_ps (__mmask8 __U, __m128 __X, const int __C)
+_mm_maskz_rol_epi32 (__mmask8 __U, __m128i __A, const int __B)
 {
-  return (__m128) __builtin_ia32_vpermilps_mask ((__v4sf) __X, __C,
-                                                (__v4sf)
-                                                _mm_setzero_ps (),
+  return (__m128i) __builtin_ia32_prold128_mask ((__v4si) __A, __B,
+                                                (__v4si)
+                                                _mm_setzero_si128 (),
                                                 (__mmask8) __U);
 }
 
-extern __inline __m256d
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_blend_pd (__mmask8 __U, __m256d __A, __m256d __W)
+_mm256_ror_epi32 (__m256i __A, const int __B)
 {
-  return (__m256d) __builtin_ia32_blendmpd_256_mask ((__v4df) __A,
-                                                    (__v4df) __W,
-                                                    (__mmask8) __U);
+  return (__m256i) __builtin_ia32_prord256_mask ((__v8si) __A, __B,
+                                                (__v8si)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) -1);
 }
 
-extern __inline __m256
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_blend_ps (__mmask8 __U, __m256 __A, __m256 __W)
+_mm256_mask_ror_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+                      const int __B)
 {
-  return (__m256) __builtin_ia32_blendmps_256_mask ((__v8sf) __A,
-                                                   (__v8sf) __W,
-                                                   (__mmask8) __U);
+  return (__m256i) __builtin_ia32_prord256_mask ((__v8si) __A, __B,
+                                                (__v8si) __W,
+                                                (__mmask8) __U);
 }
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_blend_epi64 (__mmask8 __U, __m256i __A, __m256i __W)
+_mm256_maskz_ror_epi32 (__mmask8 __U, __m256i __A, const int __B)
 {
-  return (__m256i) __builtin_ia32_blendmq_256_mask ((__v4di) __A,
-                                                   (__v4di) __W,
-                                                   (__mmask8) __U);
+  return (__m256i) __builtin_ia32_prord256_mask ((__v8si) __A, __B,
+                                                (__v8si)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) __U);
 }
 
-extern __inline __m256i
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_blend_epi32 (__mmask8 __U, __m256i __A, __m256i __W)
+_mm_ror_epi32 (__m128i __A, const int __B)
 {
-  return (__m256i) __builtin_ia32_blendmd_256_mask ((__v8si) __A,
-                                                   (__v8si) __W,
-                                                   (__mmask8) __U);
+  return (__m128i) __builtin_ia32_prord128_mask ((__v4si) __A, __B,
+                                                (__v4si)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) -1);
 }
 
-extern __inline __m128d
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_blend_pd (__mmask8 __U, __m128d __A, __m128d __W)
+_mm_mask_ror_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+                   const int __B)
 {
-  return (__m128d) __builtin_ia32_blendmpd_128_mask ((__v2df) __A,
-                                                    (__v2df) __W,
-                                                    (__mmask8) __U);
+  return (__m128i) __builtin_ia32_prord128_mask ((__v4si) __A, __B,
+                                                (__v4si) __W,
+                                                (__mmask8) __U);
 }
 
-extern __inline __m128
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_blend_ps (__mmask8 __U, __m128 __A, __m128 __W)
+_mm_maskz_ror_epi32 (__mmask8 __U, __m128i __A, const int __B)
 {
-  return (__m128) __builtin_ia32_blendmps_128_mask ((__v4sf) __A,
-                                                   (__v4sf) __W,
-                                                   (__mmask8) __U);
+  return (__m128i) __builtin_ia32_prord128_mask ((__v4si) __A, __B,
+                                                (__v4si)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
 }
 
-extern __inline __m128i
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_blend_epi64 (__mmask8 __U, __m128i __A, __m128i __W)
+_mm256_rol_epi64 (__m256i __A, const int __B)
 {
-  return (__m128i) __builtin_ia32_blendmq_128_mask ((__v2di) __A,
-                                                   (__v2di) __W,
-                                                   (__mmask8) __U);
+  return (__m256i) __builtin_ia32_prolq256_mask ((__v4di) __A, __B,
+                                                (__v4di)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) -1);
 }
 
-extern __inline __m128i
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_blend_epi32 (__mmask8 __U, __m128i __A, __m128i __W)
+_mm256_mask_rol_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+                      const int __B)
 {
-  return (__m128i) __builtin_ia32_blendmd_128_mask ((__v4si) __A,
-                                                   (__v4si) __W,
-                                                   (__mmask8) __U);
+  return (__m256i) __builtin_ia32_prolq256_mask ((__v4di) __A, __B,
+                                                (__v4di) __W,
+                                                (__mmask8) __U);
 }
 
-extern __inline __mmask8
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmp_epi64_mask (__m256i __X, __m256i __Y, const int __P)
-{
-  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
-                                                (__v4di) __Y, __P,
-                                                (__mmask8) -1);
+_mm256_maskz_rol_epi64 (__mmask8 __U, __m256i __A, const int __B)
+{
+  return (__m256i) __builtin_ia32_prolq256_mask ((__v4di) __A, __B,
+                                                (__v4di)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) __U);
 }
 
-extern __inline __mmask8
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmp_epi32_mask (__m256i __X, __m256i __Y, const int __P)
+_mm_rol_epi64 (__m128i __A, const int __B)
 {
-  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
-                                                (__v8si) __Y, __P,
+  return (__m128i) __builtin_ia32_prolq128_mask ((__v2di) __A, __B,
+                                                (__v2di)
+                                                _mm_setzero_si128 (),
                                                 (__mmask8) -1);
 }
 
-extern __inline __mmask8
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmp_epu64_mask (__m256i __X, __m256i __Y, const int __P)
+_mm_mask_rol_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+                   const int __B)
 {
-  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
-                                                 (__v4di) __Y, __P,
-                                                 (__mmask8) -1);
+  return (__m128i) __builtin_ia32_prolq128_mask ((__v2di) __A, __B,
+                                                (__v2di) __W,
+                                                (__mmask8) __U);
 }
 
-extern __inline __mmask8
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmp_epu32_mask (__m256i __X, __m256i __Y, const int __P)
+_mm_maskz_rol_epi64 (__mmask8 __U, __m128i __A, const int __B)
 {
-  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
-                                                 (__v8si) __Y, __P,
-                                                 (__mmask8) -1);
+  return (__m128i) __builtin_ia32_prolq128_mask ((__v2di) __A, __B,
+                                                (__v2di)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
 }
 
-extern __inline __mmask8
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmp_pd_mask (__m256d __X, __m256d __Y, const int __P)
+_mm256_ror_epi64 (__m256i __A, const int __B)
 {
-  return (__mmask8) __builtin_ia32_cmppd256_mask ((__v4df) __X,
-                                                 (__v4df) __Y, __P,
-                                                 (__mmask8) -1);
+  return (__m256i) __builtin_ia32_prorq256_mask ((__v4di) __A, __B,
+                                                (__v4di)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) -1);
 }
 
-extern __inline __mmask8
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmp_ps_mask (__m256 __X, __m256 __Y, const int __P)
+_mm256_mask_ror_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+                      const int __B)
 {
-  return (__mmask8) __builtin_ia32_cmpps256_mask ((__v8sf) __X,
-                                                 (__v8sf) __Y, __P,
-                                                 (__mmask8) -1);
+  return (__m256i) __builtin_ia32_prorq256_mask ((__v4di) __A, __B,
+                                                (__v4di) __W,
+                                                (__mmask8) __U);
 }
 
-extern __inline __mmask8
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmp_epi64_mask (__mmask8 __U, __m256i __X, __m256i __Y,
-                           const int __P)
+_mm256_maskz_ror_epi64 (__mmask8 __U, __m256i __A, const int __B)
 {
-  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
-                                                (__v4di) __Y, __P,
+  return (__m256i) __builtin_ia32_prorq256_mask ((__v4di) __A, __B,
+                                                (__v4di)
+                                                _mm256_setzero_si256 (),
                                                 (__mmask8) __U);
 }
 
-extern __inline __mmask8
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmp_epi32_mask (__mmask8 __U, __m256i __X, __m256i __Y,
-                           const int __P)
+_mm_ror_epi64 (__m128i __A, const int __B)
 {
-  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
-                                                (__v8si) __Y, __P,
-                                                (__mmask8) __U);
+  return (__m128i) __builtin_ia32_prorq128_mask ((__v2di) __A, __B,
+                                                (__v2di)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) -1);
 }
 
-extern __inline __mmask8
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmp_epu64_mask (__mmask8 __U, __m256i __X, __m256i __Y,
-                           const int __P)
+_mm_mask_ror_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+                   const int __B)
 {
-  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
-                                                 (__v4di) __Y, __P,
-                                                 (__mmask8) __U);
+  return (__m128i) __builtin_ia32_prorq128_mask ((__v2di) __A, __B,
+                                                (__v2di) __W,
+                                                (__mmask8) __U);
 }
 
-extern __inline __mmask8
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmp_epu32_mask (__mmask8 __U, __m256i __X, __m256i __Y,
-                           const int __P)
+_mm_maskz_ror_epi64 (__mmask8 __U, __m128i __A, const int __B)
 {
-  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
-                                                 (__v8si) __Y, __P,
-                                                 (__mmask8) __U);
+  return (__m128i) __builtin_ia32_prorq128_mask ((__v2di) __A, __B,
+                                                (__v2di)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
 }
 
-extern __inline __mmask8
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmp_pd_mask (__mmask8 __U, __m256d __X, __m256d __Y,
-                        const int __P)
+_mm_alignr_epi32 (__m128i __A, __m128i __B, const int __imm)
 {
-  return (__mmask8) __builtin_ia32_cmppd256_mask ((__v4df) __X,
-                                                 (__v4df) __Y, __P,
-                                                 (__mmask8) __U);
+  return (__m128i) __builtin_ia32_alignd128_mask ((__v4si) __A,
+                                                 (__v4si) __B, __imm,
+                                                 (__v4si)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) -1);
 }
 
-extern __inline __mmask8
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmp_ps_mask (__mmask8 __U, __m256 __X, __m256 __Y,
-                        const int __P)
+_mm_mask_alignr_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+                      __m128i __B, const int __imm)
 {
-  return (__mmask8) __builtin_ia32_cmpps256_mask ((__v8sf) __X,
-                                                 (__v8sf) __Y, __P,
+  return (__m128i) __builtin_ia32_alignd128_mask ((__v4si) __A,
+                                                 (__v4si) __B, __imm,
+                                                 (__v4si) __W,
                                                  (__mmask8) __U);
 }
 
-extern __inline __mmask8
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmp_epi64_mask (__m128i __X, __m128i __Y, const int __P)
+_mm_maskz_alignr_epi32 (__mmask8 __U, __m128i __A, __m128i __B,
+                       const int __imm)
 {
-  return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
-                                                (__v2di) __Y, __P,
-                                                (__mmask8) -1);
+  return (__m128i) __builtin_ia32_alignd128_mask ((__v4si) __A,
+                                                 (__v4si) __B, __imm,
+                                                 (__v4si)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
 }
 
-extern __inline __mmask8
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmp_epi32_mask (__m128i __X, __m128i __Y, const int __P)
+_mm_alignr_epi64 (__m128i __A, __m128i __B, const int __imm)
 {
-  return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
-                                                (__v4si) __Y, __P,
-                                                (__mmask8) -1);
+  return (__m128i) __builtin_ia32_alignq128_mask ((__v2di) __A,
+                                                 (__v2di) __B, __imm,
+                                                 (__v2di)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) -1);
 }
 
-extern __inline __mmask8
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmp_epu64_mask (__m128i __X, __m128i __Y, const int __P)
+_mm_mask_alignr_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+                      __m128i __B, const int __imm)
 {
-  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
-                                                 (__v2di) __Y, __P,
-                                                 (__mmask8) -1);
+  return (__m128i) __builtin_ia32_alignq128_mask ((__v2di) __A,
+                                                 (__v2di) __B, __imm,
+                                                 (__v2di) __W,
+                                                 (__mmask8) __U);
 }
 
-extern __inline __mmask8
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmp_epu32_mask (__m128i __X, __m128i __Y, const int __P)
+_mm_maskz_alignr_epi64 (__mmask8 __U, __m128i __A, __m128i __B,
+                       const int __imm)
 {
-  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
-                                                 (__v4si) __Y, __P,
-                                                 (__mmask8) -1);
+  return (__m128i) __builtin_ia32_alignq128_mask ((__v2di) __A,
+                                                 (__v2di) __B, __imm,
+                                                 (__v2di)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
 }
 
-extern __inline __mmask8
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmp_pd_mask (__m128d __X, __m128d __Y, const int __P)
+_mm256_alignr_epi32 (__m256i __A, __m256i __B, const int __imm)
 {
-  return (__mmask8) __builtin_ia32_cmppd128_mask ((__v2df) __X,
-                                                 (__v2df) __Y, __P,
+  return (__m256i) __builtin_ia32_alignd256_mask ((__v8si) __A,
+                                                 (__v8si) __B, __imm,
+                                                 (__v8si)
+                                                 _mm256_setzero_si256 (),
                                                  (__mmask8) -1);
 }
 
-extern __inline __mmask8
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmp_ps_mask (__m128 __X, __m128 __Y, const int __P)
+_mm256_mask_alignr_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+                         __m256i __B, const int __imm)
 {
-  return (__mmask8) __builtin_ia32_cmpps128_mask ((__v4sf) __X,
-                                                 (__v4sf) __Y, __P,
-                                                 (__mmask8) -1);
+  return (__m256i) __builtin_ia32_alignd256_mask ((__v8si) __A,
+                                                 (__v8si) __B, __imm,
+                                                 (__v8si) __W,
+                                                 (__mmask8) __U);
 }
 
-extern __inline __mmask8
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_cmp_epi64_mask (__mmask8 __U, __m128i __X, __m128i __Y,
-                        const int __P)
+_mm256_maskz_alignr_epi32 (__mmask8 __U, __m256i __A, __m256i __B,
+                          const int __imm)
 {
-  return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
-                                                (__v2di) __Y, __P,
-                                                (__mmask8) __U);
+  return (__m256i) __builtin_ia32_alignd256_mask ((__v8si) __A,
+                                                 (__v8si) __B, __imm,
+                                                 (__v8si)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) __U);
 }
 
-extern __inline __mmask8
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_cmp_epi32_mask (__mmask8 __U, __m128i __X, __m128i __Y,
-                        const int __P)
+_mm256_alignr_epi64 (__m256i __A, __m256i __B, const int __imm)
 {
-  return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
-                                                (__v4si) __Y, __P,
-                                                (__mmask8) __U);
+  return (__m256i) __builtin_ia32_alignq256_mask ((__v4di) __A,
+                                                 (__v4di) __B, __imm,
+                                                 (__v4di)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) -1);
 }
 
-extern __inline __mmask8
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_cmp_epu64_mask (__mmask8 __U, __m128i __X, __m128i __Y,
-                        const int __P)
+_mm256_mask_alignr_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+                         __m256i __B, const int __imm)
 {
-  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
-                                                 (__v2di) __Y, __P,
+  return (__m256i) __builtin_ia32_alignq256_mask ((__v4di) __A,
+                                                 (__v4di) __B, __imm,
+                                                 (__v4di) __W,
                                                  (__mmask8) __U);
 }
 
-extern __inline __mmask8
+extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_cmp_epu32_mask (__mmask8 __U, __m128i __X, __m128i __Y,
-                        const int __P)
+_mm256_maskz_alignr_epi64 (__mmask8 __U, __m256i __A, __m256i __B,
+                          const int __imm)
 {
-  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
-                                                 (__v4si) __Y, __P,
+  return (__m256i) __builtin_ia32_alignq256_mask ((__v4di) __A,
+                                                 (__v4di) __B, __imm,
+                                                 (__v4di)
+                                                 _mm256_setzero_si256 (),
                                                  (__mmask8) __U);
 }
 
-extern __inline __mmask8
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_cmp_pd_mask (__mmask8 __U, __m128d __X, __m128d __Y,
-                     const int __P)
+_mm_mask_cvtps_ph (__m128i __W, __mmask8 __U, __m128 __A,
+                  const int __I)
 {
-  return (__mmask8) __builtin_ia32_cmppd128_mask ((__v2df) __X,
-                                                 (__v2df) __Y, __P,
+  return (__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf) __A, __I,
+                                                 (__v8hi) __W,
                                                  (__mmask8) __U);
 }
 
-extern __inline __mmask8
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_cmp_ps_mask (__mmask8 __U, __m128 __X, __m128 __Y,
-                     const int __P)
+_mm_maskz_cvtps_ph (__mmask8 __U, __m128 __A, const int __I)
 {
-  return (__mmask8) __builtin_ia32_cmpps128_mask ((__v4sf) __X,
-                                                 (__v4sf) __Y, __P,
+  return (__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf) __A, __I,
+                                                 (__v8hi)
+                                                 _mm_setzero_si128 (),
                                                  (__mmask8) __U);
 }
 
-extern __inline __m256d
+extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_permutex_pd (__m256d __X, const int __M)
+_mm256_mask_cvtps_ph (__m128i __W, __mmask8 __U, __m256 __A,
+                     const int __I)
 {
-  return (__m256d) __builtin_ia32_permdf256_mask ((__v4df) __X, __M,
-                                                 (__v4df)
-                                                 _mm256_undefined_pd (),
-                                                 (__mmask8) -1);
+  return (__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf) __A, __I,
+                                                    (__v8hi) __W,
+                                                    (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmpneq_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtps_ph (__mmask8 __U, __m256 __A, const int __I)
 {
-  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
-                                                 (__v8si) __Y, 4,
-                                                 (__mmask8) __M);
+  return (__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf) __A, __I,
+                                                    (__v8hi)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmpneq_epu32_mask (__m256i __X, __m256i __Y)
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srai_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+                       const int __imm)
 {
-  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
-                                                 (__v8si) __Y, 4,
-                                                 (__mmask8) -1);
+  return (__m256i) __builtin_ia32_psradi256_mask ((__v8si) __A, __imm,
+                                                 (__v8si) __W,
+                                                 (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmplt_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srai_epi32 (__mmask8 __U, __m256i __A, const int __imm)
 {
-  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
-                                                 (__v8si) __Y, 1,
-                                                 (__mmask8) __M);
+  return (__m256i) __builtin_ia32_psradi256_mask ((__v8si) __A, __imm,
+                                                 (__v8si)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmplt_epu32_mask (__m256i __X, __m256i __Y)
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srai_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+                    const int __imm)
 {
-  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
-                                                 (__v8si) __Y, 1,
-                                                 (__mmask8) -1);
+  return (__m128i) __builtin_ia32_psradi128_mask ((__v4si) __A, __imm,
+                                                 (__v4si) __W,
+                                                 (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmpge_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srai_epi32 (__mmask8 __U, __m128i __A, const int __imm)
 {
-  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
-                                                 (__v8si) __Y, 5,
-                                                 (__mmask8) __M);
+  return (__m128i) __builtin_ia32_psradi128_mask ((__v4si) __A, __imm,
+                                                 (__v4si)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmpge_epu32_mask (__m256i __X, __m256i __Y)
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srai_epi64 (__m256i __A, const int __imm)
 {
-  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
-                                                 (__v8si) __Y, 5,
+  return (__m256i) __builtin_ia32_psraqi256_mask ((__v4di) __A, __imm,
+                                                 (__v4di)
+                                                 _mm256_setzero_si256 (),
                                                  (__mmask8) -1);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmple_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
-{
-  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
-                                                 (__v8si) __Y, 2,
-                                                 (__mmask8) __M);
-}
-
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmple_epu32_mask (__m256i __X, __m256i __Y)
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srai_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+                       const int __imm)
 {
-  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
-                                                 (__v8si) __Y, 2,
-                                                 (__mmask8) -1);
+  return (__m256i) __builtin_ia32_psraqi256_mask ((__v4di) __A, __imm,
+                                                 (__v4di) __W,
+                                                 (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmpneq_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srai_epi64 (__mmask8 __U, __m256i __A, const int __imm)
 {
-  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
-                                                 (__v4di) __Y, 4,
-                                                 (__mmask8) __M);
+  return (__m256i) __builtin_ia32_psraqi256_mask ((__v4di) __A, __imm,
+                                                 (__v4di)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmpneq_epu64_mask (__m256i __X, __m256i __Y)
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srai_epi64 (__m128i __A, const int __imm)
 {
-  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
-                                                 (__v4di) __Y, 4,
+  return (__m128i) __builtin_ia32_psraqi128_mask ((__v2di) __A, __imm,
+                                                 (__v2di)
+                                                 _mm_setzero_si128 (),
                                                  (__mmask8) -1);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmplt_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
-{
-  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
-                                                 (__v4di) __Y, 1,
-                                                 (__mmask8) __M);
-}
-
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmplt_epu64_mask (__m256i __X, __m256i __Y)
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srai_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+                    const int __imm)
 {
-  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
-                                                 (__v4di) __Y, 1,
-                                                 (__mmask8) -1);
+  return (__m128i) __builtin_ia32_psraqi128_mask ((__v2di) __A, __imm,
+                                                 (__v2di) __W,
+                                                 (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmpge_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srai_epi64 (__mmask8 __U, __m128i __A, const int __imm)
 {
-  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
-                                                 (__v4di) __Y, 5,
-                                                 (__mmask8) __M);
+  return (__m128i) __builtin_ia32_psraqi128_mask ((__v2di) __A, __imm,
+                                                 (__v2di)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmpge_epu64_mask (__m256i __X, __m256i __Y)
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_slli_epi32 (__m128i __W, __mmask8 __U, __m128i __A, int __B)
 {
-  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
-                                                 (__v4di) __Y, 5,
-                                                 (__mmask8) -1);
+  return (__m128i) __builtin_ia32_pslldi128_mask ((__v4si) __A, __B,
+                                                 (__v4si) __W,
+                                                 (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmple_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_slli_epi32 (__mmask8 __U, __m128i __A, int __B)
 {
-  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
-                                                 (__v4di) __Y, 2,
-                                                 (__mmask8) __M);
+  return (__m128i) __builtin_ia32_pslldi128_mask ((__v4si) __A, __B,
+                                                 (__v4si)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmple_epu64_mask (__m256i __X, __m256i __Y)
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_slli_epi64 (__m128i __W, __mmask8 __U, __m128i __A, int __B)
 {
-  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
-                                                 (__v4di) __Y, 2,
-                                                 (__mmask8) -1);
+  return (__m128i) __builtin_ia32_psllqi128_mask ((__v2di) __A, __B,
+                                                 (__v2di) __W,
+                                                 (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmpneq_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_slli_epi64 (__mmask8 __U, __m128i __A, int __B)
 {
-  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
-                                                (__v8si) __Y, 4,
-                                                (__mmask8) __M);
+  return (__m128i) __builtin_ia32_psllqi128_mask ((__v2di) __A, __B,
+                                                 (__v2di)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmpneq_epi32_mask (__m256i __X, __m256i __Y)
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_slli_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+                       int __B)
 {
-  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
-                                                (__v8si) __Y, 4,
-                                                (__mmask8) -1);
+  return (__m256i) __builtin_ia32_pslldi256_mask ((__v8si) __A, __B,
+                                                 (__v8si) __W,
+                                                 (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmplt_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
-{
-  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
-                                                (__v8si) __Y, 1,
-                                                (__mmask8) __M);
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_slli_epi32 (__mmask8 __U, __m256i __A, int __B)
+{
+  return (__m256i) __builtin_ia32_pslldi256_mask ((__v8si) __A, __B,
+                                                 (__v8si)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmplt_epi32_mask (__m256i __X, __m256i __Y)
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_slli_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+                       int __B)
 {
-  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
-                                                (__v8si) __Y, 1,
-                                                (__mmask8) -1);
+  return (__m256i) __builtin_ia32_psllqi256_mask ((__v4di) __A, __B,
+                                                 (__v4di) __W,
+                                                 (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmpge_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_slli_epi64 (__mmask8 __U, __m256i __A, int __B)
 {
-  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
-                                                (__v8si) __Y, 5,
-                                                (__mmask8) __M);
+  return (__m256i) __builtin_ia32_psllqi256_mask ((__v4di) __A, __B,
+                                                 (__v4di)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmpge_epi32_mask (__m256i __X, __m256i __Y)
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutex_pd (__m256d __W, __mmask8 __U, __m256d __X,
+                        const int __imm)
 {
-  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
-                                                (__v8si) __Y, 5,
-                                                (__mmask8) -1);
+  return (__m256d) __builtin_ia32_permdf256_mask ((__v4df) __X, __imm,
+                                                 (__v4df) __W,
+                                                 (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmple_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutex_pd (__mmask8 __U, __m256d __X, const int __imm)
 {
-  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
-                                                (__v8si) __Y, 2,
-                                                (__mmask8) __M);
+  return (__m256d) __builtin_ia32_permdf256_mask ((__v4df) __X, __imm,
+                                                 (__v4df)
+                                                 _mm256_setzero_pd (),
+                                                 (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmple_epi32_mask (__m256i __X, __m256i __Y)
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permute_pd (__m256d __W, __mmask8 __U, __m256d __X,
+                       const int __C)
 {
-  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
-                                                (__v8si) __Y, 2,
-                                                (__mmask8) -1);
+  return (__m256d) __builtin_ia32_vpermilpd256_mask ((__v4df) __X, __C,
+                                                    (__v4df) __W,
+                                                    (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmpneq_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permute_pd (__mmask8 __U, __m256d __X, const int __C)
 {
-  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
-                                                (__v4di) __Y, 4,
-                                                (__mmask8) __M);
+  return (__m256d) __builtin_ia32_vpermilpd256_mask ((__v4df) __X, __C,
+                                                    (__v4df)
+                                                    _mm256_setzero_pd (),
+                                                    (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmpneq_epi64_mask (__m256i __X, __m256i __Y)
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_permute_pd (__m128d __W, __mmask8 __U, __m128d __X,
+                    const int __C)
 {
-  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
-                                                (__v4di) __Y, 4,
-                                                (__mmask8) -1);
+  return (__m128d) __builtin_ia32_vpermilpd_mask ((__v2df) __X, __C,
+                                                 (__v2df) __W,
+                                                 (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmplt_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_permute_pd (__mmask8 __U, __m128d __X, const int __C)
 {
-  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
-                                                (__v4di) __Y, 1,
-                                                (__mmask8) __M);
+  return (__m128d) __builtin_ia32_vpermilpd_mask ((__v2df) __X, __C,
+                                                 (__v2df)
+                                                 _mm_setzero_pd (),
+                                                 (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmplt_epi64_mask (__m256i __X, __m256i __Y)
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permute_ps (__m256 __W, __mmask8 __U, __m256 __X,
+                       const int __C)
 {
-  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
-                                                (__v4di) __Y, 1,
-                                                (__mmask8) -1);
+  return (__m256) __builtin_ia32_vpermilps256_mask ((__v8sf) __X, __C,
+                                                   (__v8sf) __W,
+                                                   (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmpge_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permute_ps (__mmask8 __U, __m256 __X, const int __C)
 {
-  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
-                                                (__v4di) __Y, 5,
-                                                (__mmask8) __M);
+  return (__m256) __builtin_ia32_vpermilps256_mask ((__v8sf) __X, __C,
+                                                   (__v8sf)
+                                                   _mm256_setzero_ps (),
+                                                   (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmpge_epi64_mask (__m256i __X, __m256i __Y)
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_permute_ps (__m128 __W, __mmask8 __U, __m128 __X,
+                    const int __C)
 {
-  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
-                                                (__v4di) __Y, 5,
-                                                (__mmask8) -1);
+  return (__m128) __builtin_ia32_vpermilps_mask ((__v4sf) __X, __C,
+                                                (__v4sf) __W,
+                                                (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_mask_cmple_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_permute_ps (__mmask8 __U, __m128 __X, const int __C)
 {
-  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
-                                                (__v4di) __Y, 2,
-                                                (__mmask8) __M);
+  return (__m128) __builtin_ia32_vpermilps_mask ((__v4sf) __X, __C,
+                                                (__v4sf)
+                                                _mm_setzero_ps (),
+                                                (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_cmple_epi64_mask (__m256i __X, __m256i __Y)
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_pd (__mmask8 __U, __m256d __A, __m256d __W)
 {
-  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
-                                                (__v4di) __Y, 2,
-                                                (__mmask8) -1);
+  return (__m256d) __builtin_ia32_blendmpd_256_mask ((__v4df) __A,
+                                                    (__v4df) __W,
+                                                    (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_cmpneq_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_ps (__mmask8 __U, __m256 __A, __m256 __W)
 {
-  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
-                                                 (__v4si) __Y, 4,
-                                                 (__mmask8) __M);
+  return (__m256) __builtin_ia32_blendmps_256_mask ((__v8sf) __A,
+                                                   (__v8sf) __W,
+                                                   (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpneq_epu32_mask (__m128i __X, __m128i __Y)
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_epi64 (__mmask8 __U, __m256i __A, __m256i __W)
 {
-  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
-                                                 (__v4si) __Y, 4,
-                                                 (__mmask8) -1);
+  return (__m256i) __builtin_ia32_blendmq_256_mask ((__v4di) __A,
+                                                   (__v4di) __W,
+                                                   (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_cmplt_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_epi32 (__mmask8 __U, __m256i __A, __m256i __W)
 {
-  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
-                                                 (__v4si) __Y, 1,
-                                                 (__mmask8) __M);
+  return (__m256i) __builtin_ia32_blendmd_256_mask ((__v8si) __A,
+                                                   (__v8si) __W,
+                                                   (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmplt_epu32_mask (__m128i __X, __m128i __Y)
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_pd (__mmask8 __U, __m128d __A, __m128d __W)
 {
-  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
-                                                 (__v4si) __Y, 1,
-                                                 (__mmask8) -1);
+  return (__m128d) __builtin_ia32_blendmpd_128_mask ((__v2df) __A,
+                                                    (__v2df) __W,
+                                                    (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_cmpge_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_ps (__mmask8 __U, __m128 __A, __m128 __W)
 {
-  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
-                                                 (__v4si) __Y, 5,
-                                                 (__mmask8) __M);
+  return (__m128) __builtin_ia32_blendmps_128_mask ((__v4sf) __A,
+                                                   (__v4sf) __W,
+                                                   (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpge_epu32_mask (__m128i __X, __m128i __Y)
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_epi64 (__mmask8 __U, __m128i __A, __m128i __W)
 {
-  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
-                                                 (__v4si) __Y, 5,
-                                                 (__mmask8) -1);
+  return (__m128i) __builtin_ia32_blendmq_128_mask ((__v2di) __A,
+                                                   (__v2di) __W,
+                                                   (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_cmple_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_epi32 (__mmask8 __U, __m128i __A, __m128i __W)
 {
-  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
-                                                 (__v4si) __Y, 2,
-                                                 (__mmask8) __M);
+  return (__m128i) __builtin_ia32_blendmd_128_mask ((__v4si) __A,
+                                                   (__v4si) __W,
+                                                   (__mmask8) __U);
 }
 
 extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmple_epu32_mask (__m128i __X, __m128i __Y)
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_epi64_mask (__m256i __X, __m256i __Y, const int __P)
 {
-  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
-                                                 (__v4si) __Y, 2,
-                                                 (__mmask8) -1);
+  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+                                                (__v4di) __Y, __P,
+                                                (__mmask8) -1);
 }
 
 extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_cmpneq_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_epi32_mask (__m256i __X, __m256i __Y, const int __P)
 {
-  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
-                                                 (__v2di) __Y, 4,
-                                                 (__mmask8) __M);
+  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+                                                (__v8si) __Y, __P,
+                                                (__mmask8) -1);
 }
 
 extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpneq_epu64_mask (__m128i __X, __m128i __Y)
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_epu64_mask (__m256i __X, __m256i __Y, const int __P)
 {
-  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
-                                                 (__v2di) __Y, 4,
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+                                                 (__v4di) __Y, __P,
                                                  (__mmask8) -1);
 }
 
 extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_cmplt_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_epu32_mask (__m256i __X, __m256i __Y, const int __P)
 {
-  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
-                                                 (__v2di) __Y, 1,
-                                                 (__mmask8) __M);
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+                                                 (__v8si) __Y, __P,
+                                                 (__mmask8) -1);
 }
 
 extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmplt_epu64_mask (__m128i __X, __m128i __Y)
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_pd_mask (__m256d __X, __m256d __Y, const int __P)
 {
-  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
-                                                 (__v2di) __Y, 1,
+  return (__mmask8) __builtin_ia32_cmppd256_mask ((__v4df) __X,
+                                                 (__v4df) __Y, __P,
                                                  (__mmask8) -1);
 }
 
 extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_cmpge_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_ps_mask (__m256 __X, __m256 __Y, const int __P)
 {
-  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
-                                                 (__v2di) __Y, 5,
-                                                 (__mmask8) __M);
+  return (__mmask8) __builtin_ia32_cmpps256_mask ((__v8sf) __X,
+                                                 (__v8sf) __Y, __P,
+                                                 (__mmask8) -1);
 }
 
 extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpge_epu64_mask (__m128i __X, __m128i __Y)
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_epi64_mask (__mmask8 __U, __m256i __X, __m256i __Y,
+                           const int __P)
 {
-  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
-                                                 (__v2di) __Y, 5,
-                                                 (__mmask8) -1);
+  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+                                                (__v4di) __Y, __P,
+                                                (__mmask8) __U);
 }
 
 extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_cmple_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_epi32_mask (__mmask8 __U, __m256i __X, __m256i __Y,
+                           const int __P)
 {
-  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
-                                                 (__v2di) __Y, 2,
-                                                 (__mmask8) __M);
+  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+                                                (__v8si) __Y, __P,
+                                                (__mmask8) __U);
 }
 
 extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmple_epu64_mask (__m128i __X, __m128i __Y)
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_epu64_mask (__mmask8 __U, __m256i __X, __m256i __Y,
+                           const int __P)
 {
-  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
-                                                 (__v2di) __Y, 2,
-                                                 (__mmask8) -1);
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+                                                 (__v4di) __Y, __P,
+                                                 (__mmask8) __U);
 }
 
 extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_cmpneq_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_epu32_mask (__mmask8 __U, __m256i __X, __m256i __Y,
+                           const int __P)
 {
-  return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
-                                                (__v4si) __Y, 4,
-                                                (__mmask8) __M);
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+                                                 (__v8si) __Y, __P,
+                                                 (__mmask8) __U);
 }
 
 extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpneq_epi32_mask (__m128i __X, __m128i __Y)
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_pd_mask (__mmask8 __U, __m256d __X, __m256d __Y,
+                        const int __P)
 {
-  return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
-                                                (__v4si) __Y, 4,
-                                                (__mmask8) -1);
+  return (__mmask8) __builtin_ia32_cmppd256_mask ((__v4df) __X,
+                                                 (__v4df) __Y, __P,
+                                                 (__mmask8) __U);
 }
 
 extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_cmplt_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_ps_mask (__mmask8 __U, __m256 __X, __m256 __Y,
+                        const int __P)
 {
-  return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
-                                                (__v4si) __Y, 1,
-                                                (__mmask8) __M);
+  return (__mmask8) __builtin_ia32_cmpps256_mask ((__v8sf) __X,
+                                                 (__v8sf) __Y, __P,
+                                                 (__mmask8) __U);
 }
 
 extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmplt_epi32_mask (__m128i __X, __m128i __Y)
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_epi64_mask (__m128i __X, __m128i __Y, const int __P)
 {
-  return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
-                                                (__v4si) __Y, 1,
+  return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
+                                                (__v2di) __Y, __P,
                                                 (__mmask8) -1);
 }
 
 extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_cmpge_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_epi32_mask (__m128i __X, __m128i __Y, const int __P)
 {
   return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
-                                                (__v4si) __Y, 5,
-                                                (__mmask8) __M);
+                                                (__v4si) __Y, __P,
+                                                (__mmask8) -1);
 }
 
 extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpge_epi32_mask (__m128i __X, __m128i __Y)
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_epu64_mask (__m128i __X, __m128i __Y, const int __P)
 {
-  return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
-                                                (__v4si) __Y, 5,
-                                                (__mmask8) -1);
+  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
+                                                 (__v2di) __Y, __P,
+                                                 (__mmask8) -1);
 }
 
 extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_cmple_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_epu32_mask (__m128i __X, __m128i __Y, const int __P)
 {
-  return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
-                                                (__v4si) __Y, 2,
-                                                (__mmask8) __M);
+  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
+                                                 (__v4si) __Y, __P,
+                                                 (__mmask8) -1);
 }
 
 extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmple_epi32_mask (__m128i __X, __m128i __Y)
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_pd_mask (__m128d __X, __m128d __Y, const int __P)
 {
-  return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
-                                                (__v4si) __Y, 2,
-                                                (__mmask8) -1);
+  return (__mmask8) __builtin_ia32_cmppd128_mask ((__v2df) __X,
+                                                 (__v2df) __Y, __P,
+                                                 (__mmask8) -1);
 }
 
 extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_cmpneq_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_ps_mask (__m128 __X, __m128 __Y, const int __P)
 {
-  return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
-                                                (__v2di) __Y, 4,
-                                                (__mmask8) __M);
+  return (__mmask8) __builtin_ia32_cmpps128_mask ((__v4sf) __X,
+                                                 (__v4sf) __Y, __P,
+                                                 (__mmask8) -1);
 }
 
 extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpneq_epi64_mask (__m128i __X, __m128i __Y)
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_epi64_mask (__mmask8 __U, __m128i __X, __m128i __Y,
+                        const int __P)
 {
   return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
-                                                (__v2di) __Y, 4,
-                                                (__mmask8) -1);
+                                                (__v2di) __Y, __P,
+                                                (__mmask8) __U);
 }
 
 extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_cmplt_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_epi32_mask (__mmask8 __U, __m128i __X, __m128i __Y,
+                        const int __P)
 {
-  return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
-                                                (__v2di) __Y, 1,
-                                                (__mmask8) __M);
+  return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
+                                                (__v4si) __Y, __P,
+                                                (__mmask8) __U);
 }
 
 extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmplt_epi64_mask (__m128i __X, __m128i __Y)
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_epu64_mask (__mmask8 __U, __m128i __X, __m128i __Y,
+                        const int __P)
 {
-  return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
-                                                (__v2di) __Y, 1,
-                                                (__mmask8) -1);
+  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
+                                                 (__v2di) __Y, __P,
+                                                 (__mmask8) __U);
 }
 
 extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_cmpge_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_epu32_mask (__mmask8 __U, __m128i __X, __m128i __Y,
+                        const int __P)
 {
-  return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
-                                                (__v2di) __Y, 5,
-                                                (__mmask8) __M);
+  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
+                                                 (__v4si) __Y, __P,
+                                                 (__mmask8) __U);
 }
 
 extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmpge_epi64_mask (__m128i __X, __m128i __Y)
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_pd_mask (__mmask8 __U, __m128d __X, __m128d __Y,
+                     const int __P)
 {
-  return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
-                                                (__v2di) __Y, 5,
-                                                (__mmask8) -1);
+  return (__mmask8) __builtin_ia32_cmppd128_mask ((__v2df) __X,
+                                                 (__v2df) __Y, __P,
+                                                 (__mmask8) __U);
 }
 
 extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mask_cmple_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_ps_mask (__mmask8 __U, __m128 __X, __m128 __Y,
+                     const int __P)
 {
-  return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
-                                                (__v2di) __Y, 2,
-                                                (__mmask8) __M);
+  return (__mmask8) __builtin_ia32_cmpps128_mask ((__v4sf) __X,
+                                                 (__v4sf) __Y, __P,
+                                                 (__mmask8) __U);
 }
 
-extern __inline __mmask8
-  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cmple_epi64_mask (__m128i __X, __m128i __Y)
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutex_pd (__m256d __X, const int __M)
 {
-  return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
-                                                (__v2di) __Y, 2,
-                                                (__mmask8) -1);
+  return (__m256d) __builtin_ia32_permdf256_mask ((__v4df) __X, __M,
+                                                 (__v4df)
+                                                 _mm256_undefined_pd (),
+                                                 (__mmask8) -1);
 }
 
 #else
index ebe5a4a342c84de938d200242f967dc100d1d5e6..45839d25ddcde626e0a5f45399d84c5bad7607f0 100644 (file)
@@ -1,5 +1,8 @@
 2017-03-09  Jakub Jelinek  <jakub@redhat.com>
 
+       PR target/79932
+       * gcc.target/i386/pr79932-2.c: New test.
+
        PR target/79932
        * gcc.target/i386/pr79932-1.c: New test.
 
diff --git a/gcc/testsuite/gcc.target/i386/pr79932-2.c b/gcc/testsuite/gcc.target/i386/pr79932-2.c
new file mode 100644
index 0000000..dc8178b
--- /dev/null
@@ -0,0 +1,78 @@
+/* PR target/79932 */
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx512vl" } */
+
+#include <x86intrin.h>
+
+__m256i a, b;  /* Operands passed to every _mm256_* call in foo.  */
+__m128i c, d;  /* Operands passed to every _mm_* call in foo.  */
+__mmask32 e, f[64];  /* e: mask argument of the *_mask_cmp* calls; f: one result slot per call.  */
+
+void
+foo (void)  /* Calls each compare-to-mask intrinsic below once; call number i stores into f[i].  */
+{
+  f[0] = _mm256_cmpge_epi32_mask (a, b);  /* f[0..15]: _mm256_* forms without a mask argument.  */
+  f[1] = _mm256_cmpge_epi64_mask (a, b);
+  f[2] = _mm256_cmpge_epu32_mask (a, b);
+  f[3] = _mm256_cmpge_epu64_mask (a, b);
+  f[4] = _mm256_cmple_epi32_mask (a, b);
+  f[5] = _mm256_cmple_epi64_mask (a, b);
+  f[6] = _mm256_cmple_epu32_mask (a, b);
+  f[7] = _mm256_cmple_epu64_mask (a, b);
+  f[8] = _mm256_cmplt_epi32_mask (a, b);
+  f[9] = _mm256_cmplt_epi64_mask (a, b);
+  f[10] = _mm256_cmplt_epu32_mask (a, b);
+  f[11] = _mm256_cmplt_epu64_mask (a, b);
+  f[12] = _mm256_cmpneq_epi32_mask (a, b);
+  f[13] = _mm256_cmpneq_epi64_mask (a, b);
+  f[14] = _mm256_cmpneq_epu32_mask (a, b);
+  f[15] = _mm256_cmpneq_epu64_mask (a, b);
+  f[16] = _mm256_mask_cmpge_epi32_mask (e, a, b);  /* f[16..31]: _mm256_mask_* forms, with e as first argument.  */
+  f[17] = _mm256_mask_cmpge_epi64_mask (e, a, b);
+  f[18] = _mm256_mask_cmpge_epu32_mask (e, a, b);
+  f[19] = _mm256_mask_cmpge_epu64_mask (e, a, b);
+  f[20] = _mm256_mask_cmple_epi32_mask (e, a, b);
+  f[21] = _mm256_mask_cmple_epi64_mask (e, a, b);
+  f[22] = _mm256_mask_cmple_epu32_mask (e, a, b);
+  f[23] = _mm256_mask_cmple_epu64_mask (e, a, b);
+  f[24] = _mm256_mask_cmplt_epi32_mask (e, a, b);
+  f[25] = _mm256_mask_cmplt_epi64_mask (e, a, b);
+  f[26] = _mm256_mask_cmplt_epu32_mask (e, a, b);
+  f[27] = _mm256_mask_cmplt_epu64_mask (e, a, b);
+  f[28] = _mm256_mask_cmpneq_epi32_mask (e, a, b);
+  f[29] = _mm256_mask_cmpneq_epi64_mask (e, a, b);
+  f[30] = _mm256_mask_cmpneq_epu32_mask (e, a, b);
+  f[31] = _mm256_mask_cmpneq_epu64_mask (e, a, b);
+  f[32] = _mm_cmpge_epi32_mask (c, d);  /* f[32..47]: _mm_* forms without a mask argument.  */
+  f[33] = _mm_cmpge_epi64_mask (c, d);
+  f[34] = _mm_cmpge_epu32_mask (c, d);
+  f[35] = _mm_cmpge_epu64_mask (c, d);
+  f[36] = _mm_cmple_epi32_mask (c, d);
+  f[37] = _mm_cmple_epi64_mask (c, d);
+  f[38] = _mm_cmple_epu32_mask (c, d);
+  f[39] = _mm_cmple_epu64_mask (c, d);
+  f[40] = _mm_cmplt_epi32_mask (c, d);
+  f[41] = _mm_cmplt_epi64_mask (c, d);
+  f[42] = _mm_cmplt_epu32_mask (c, d);
+  f[43] = _mm_cmplt_epu64_mask (c, d);
+  f[44] = _mm_cmpneq_epi32_mask (c, d);
+  f[45] = _mm_cmpneq_epi64_mask (c, d);
+  f[46] = _mm_cmpneq_epu32_mask (c, d);
+  f[47] = _mm_cmpneq_epu64_mask (c, d);
+  f[48] = _mm_mask_cmpge_epi32_mask (e, c, d);  /* f[48..63]: _mm_mask_* forms, with e as first argument.  */
+  f[49] = _mm_mask_cmpge_epi64_mask (e, c, d);
+  f[50] = _mm_mask_cmpge_epu32_mask (e, c, d);
+  f[51] = _mm_mask_cmpge_epu64_mask (e, c, d);
+  f[52] = _mm_mask_cmple_epi32_mask (e, c, d);
+  f[53] = _mm_mask_cmple_epi64_mask (e, c, d);
+  f[54] = _mm_mask_cmple_epu32_mask (e, c, d);
+  f[55] = _mm_mask_cmple_epu64_mask (e, c, d);
+  f[56] = _mm_mask_cmplt_epi32_mask (e, c, d);
+  f[57] = _mm_mask_cmplt_epi64_mask (e, c, d);
+  f[58] = _mm_mask_cmplt_epu32_mask (e, c, d);
+  f[59] = _mm_mask_cmplt_epu64_mask (e, c, d);
+  f[60] = _mm_mask_cmpneq_epi32_mask (e, c, d);
+  f[61] = _mm_mask_cmpneq_epi64_mask (e, c, d);
+  f[62] = _mm_mask_cmpneq_epu32_mask (e, c, d);
+  f[63] = _mm_mask_cmpneq_epu64_mask (e, c, d);
+}