/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/

#ifndef __SWR_SIMD16INTRIN_H__
#define __SWR_SIMD16INTRIN_H__

#if ENABLE_AVX512_SIMD16

#if KNOB_SIMD16_WIDTH == 16

#if ENABLE_AVX512_EMULATION

// Emulated mode: a 16-wide register is carried as a pair of 8-wide (AVX)
// registers, and a 16-bit lane mask is a plain integer, low byte = lanes
// 0-7, high byte = lanes 8-15.
struct simd16scalar
{
    simdscalar  lo;
    simdscalar  hi;
};
struct simd16scalard
{
    simdscalard lo;
    simdscalard hi;
};
struct simd16scalari
{
    simdscalari lo;
    simdscalari hi;
};
typedef uint16_t simd16mask;

#define _simd16_masklo(mask) ((mask) & 0xFF)
#define _simd16_maskhi(mask) (((mask) >> 8))
#define _simd16_setmask(hi, lo) (((hi) << 8) | (lo))

#else

typedef __m512  simd16scalar;
typedef __m512d simd16scalard;
typedef __m512i simd16scalari;
typedef __mmask16 simd16mask;

#endif//ENABLE_AVX512_EMULATION
#else
#error Unsupported vector width
#endif//KNOB_SIMD16_WIDTH == 16
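
// simd16vector: four 16-wide components, addressable either by index (v[i])
// or by name (x, y, z, w) through the anonymous struct below.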
OSALIGN(union, KNOB_SIMD16_BYTES) simd16vector
{
    simd16scalar v[4];
    struct
    {
        simd16scalar x, y, z, w;
    };

    simd16scalar& operator[] (const int i) { return v[i]; }
    const simd16scalar& operator[] (const int i) const { return v[i]; }
};

#if ENABLE_AVX512_EMULATION
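
// Each SIMD16_EMU_AVX512_<N> macro stamps out an N-operand 16-wide wrapper
// that applies the given 8-wide intrinsic to the lo and hi halves
// independently.  For example,
//
//   SIMD16_EMU_AVX512_2(simd16scalar, _simd16_mul_ps, _mm256_mul_ps)
//
// expands to a function _simd16_mul_ps(a, b) whose result is
// { _mm256_mul_ps(a.lo, b.lo), _mm256_mul_ps(a.hi, b.hi) }.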

#define SIMD16_EMU_AVX512_0(type, func, intrin) \
INLINE type func()\
{\
    type result;\
    result.lo = intrin();\
    result.hi = intrin();\
    return result;\
}

#define SIMD16_EMU_AVX512_1(type, func, intrin) \
INLINE type func(type a)\
{\
    type result;\
    result.lo = intrin(a.lo);\
    result.hi = intrin(a.hi);\
    return result;\
}

#define SIMD16_EMU_AVX512_2(type, func, intrin) \
INLINE type func(type a, type b)\
{\
    type result;\
    result.lo = intrin(a.lo, b.lo);\
    result.hi = intrin(a.hi, b.hi);\
    return result;\
}

#define SIMD16_EMU_AVX512_3(type, func, intrin) \
INLINE type func(type a, type b, type c)\
{\
    type result;\
    result.lo = intrin(a.lo, b.lo, c.lo);\
    result.hi = intrin(a.hi, b.hi, c.hi);\
    return result;\
}

SIMD16_EMU_AVX512_0(simd16scalar, _simd16_setzero_ps, _mm256_setzero_ps)
SIMD16_EMU_AVX512_0(simd16scalari, _simd16_setzero_si, _mm256_setzero_si256)
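
// e.g. the expansions above provide:  simd16scalar zero = _simd16_setzero_ps();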

INLINE simd16scalar _simd16_set1_ps(float a)
{
    simd16scalar result;

    result.lo = _mm256_set1_ps(a);
    result.hi = _mm256_set1_ps(a);

    return result;
}

INLINE simd16scalari _simd16_set1_epi8(char a)
{
    simd16scalari result;

    result.lo = _mm256_set1_epi8(a);
    result.hi = _mm256_set1_epi8(a);

    return result;
}

INLINE simd16scalari _simd16_set1_epi32(int a)
{
    simd16scalari result;

    result.lo = _mm256_set1_epi32(a);
    result.hi = _mm256_set1_epi32(a);

    return result;
}

INLINE simd16scalar _simd16_set_ps(float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    simd16scalar result;

    result.lo = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
    result.hi = _mm256_set_ps(e15, e14, e13, e12, e11, e10, e9, e8);

    return result;
}

INLINE simd16scalari _simd16_set_epi32(int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    simd16scalari result;

    result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
    result.hi = _mm256_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8);

    return result;
}

// the 8-element forms replicate the same eight values into both halves
INLINE simd16scalar _simd16_set_ps(float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    simd16scalar result;

    result.lo = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
    result.hi = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);

    return result;
}

INLINE simd16scalari _simd16_set_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    simd16scalari result;

    result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
    result.hi = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);

    return result;
}

INLINE simd16scalar _simd16_load_ps(float const *m)
{
    simd16scalar result;

    float const *n = reinterpret_cast<float const *>(reinterpret_cast<uint8_t const *>(m) + sizeof(result.lo));

    result.lo = _mm256_load_ps(m);
    result.hi = _mm256_load_ps(n);

    return result;
}

INLINE simd16scalar _simd16_loadu_ps(float const *m)
{
    simd16scalar result;

    float const *n = reinterpret_cast<float const *>(reinterpret_cast<uint8_t const *>(m) + sizeof(result.lo));

    result.lo = _mm256_loadu_ps(m);
    result.hi = _mm256_loadu_ps(n);

    return result;
}

INLINE simd16scalar _simd16_load1_ps(float const *m)
{
    simd16scalar result;

    result.lo = _mm256_broadcast_ss(m);
    result.hi = _mm256_broadcast_ss(m);

    return result;
}

INLINE simd16scalari _simd16_load_si(simd16scalari const *m)
{
    simd16scalari result;

    result.lo = _mm256_load_si256(&m[0].lo);
    result.hi = _mm256_load_si256(&m[0].hi);

    return result;
}

INLINE simd16scalari _simd16_loadu_si(simd16scalari const *m)
{
    simd16scalari result;

    result.lo = _mm256_loadu_si256(&m[0].lo);
    result.hi = _mm256_loadu_si256(&m[0].hi);

    return result;
}

INLINE simd16scalar _simd16_broadcast_ss(float const *m)
{
    simd16scalar result;

    result.lo = _mm256_broadcast_ss(m);
    result.hi = _mm256_broadcast_ss(m);

    return result;
}

INLINE simd16scalar _simd16_broadcast_ps(__m128 const *m)
{
    simd16scalar result;

    result.lo = _mm256_broadcast_ps(m);
    result.hi = _mm256_broadcast_ps(m);

    return result;
}

INLINE void _simd16_store_ps(float *m, simd16scalar a)
{
    float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + sizeof(a.lo));

    _mm256_store_ps(m, a.lo);
    _mm256_store_ps(n, a.hi);
}

INLINE void _simd16_maskstore_ps(float *m, simd16scalari mask, simd16scalar a)
{
    float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + sizeof(a.lo));

    _mm256_maskstore_ps(m, mask.lo, a.lo);
    _mm256_maskstore_ps(n, mask.hi, a.hi);
}

INLINE void _simd16_store_si(simd16scalari *m, simd16scalari a)
{
    _mm256_store_si256(&m[0].lo, a.lo);
    _mm256_store_si256(&m[0].hi, a.hi);
}
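
// _simd16_extract_* / _simd16_insert_* move one 8-wide half in or out of a
// 16-wide register; imm8 selects the half (0 = lo, 1 = hi).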

INLINE simdscalar _simd16_extract_ps(simd16scalar a, int imm8)
{
    switch (imm8)
    {
    case 0: return a.lo;
    case 1: return a.hi;
    }
    return _simd_set1_ps(0.0f);
}

INLINE simdscalari _simd16_extract_si(simd16scalari a, int imm8)
{
    switch (imm8)
    {
    case 0: return a.lo;
    case 1: return a.hi;
    }
    return _simd_set1_epi32(0);
}

INLINE simd16scalar _simd16_insert_ps(simd16scalar a, simdscalar b, int imm8)
{
    switch (imm8)
    {
    case 0: a.lo = b; break;
    case 1: a.hi = b; break;
    }
    return a;
}

INLINE simd16scalari _simd16_insert_si(simd16scalari a, simdscalari b, int imm8)
{
    switch (imm8)
    {
    case 0: a.lo = b; break;
    case 1: a.hi = b; break;
    }
    return a;
}

template <simd16mask mask>
INLINE simd16scalar _simd16_blend_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _mm256_blend_ps(a.lo, b.lo, _simd16_masklo(mask));
    result.hi = _mm256_blend_ps(a.hi, b.hi, _simd16_maskhi(mask));

    return result;
}

#define _simd16_blend_ps(a, b, mask) _simd16_blend_ps_temp<mask>(a, b)
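
// The detour through a function template keeps the mask a compile-time
// constant, which _mm256_blend_ps requires for its immediate operand.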

SIMD16_EMU_AVX512_3(simd16scalar, _simd16_blendv_ps, _mm256_blendv_ps)

INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalar mask)
{
    simd16scalari result;

    result.lo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), _mm256_castsi256_ps(b.lo), mask.lo));
    result.hi = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), _mm256_castsi256_ps(b.hi), mask.hi));

    return result;
}

INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalari mask)
{
    simd16scalari result;

    result.lo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), _mm256_castsi256_ps(b.lo), _mm256_castsi256_ps(mask.lo)));
    result.hi = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), _mm256_castsi256_ps(b.hi), _mm256_castsi256_ps(mask.hi)));

    return result;
}

SIMD16_EMU_AVX512_2(simd16scalar, _simd16_mul_ps, _mm256_mul_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_add_ps, _mm256_add_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_sub_ps, _mm256_sub_ps)
SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rsqrt_ps, _mm256_rsqrt_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_min_ps, _mm256_min_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_max_ps, _mm256_max_ps)

INLINE simd16mask _simd16_movemask_ps(simd16scalar a)
{
    simd16mask mask;

    reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_ps(a.lo);
    reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_ps(a.hi);

    return mask;
}

INLINE simd16mask _simd16_movemask_pd(simd16scalard a)
{
    simd16mask mask;

    reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_pd(a.lo);
    reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_pd(a.hi);

    return mask;
}

INLINE simd16mask _simd16_movemask_epi8(simd16scalari a)
{
    simd16mask mask;

    reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_epi8(a.lo);
    reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_epi8(a.hi);

    return mask;
}

INLINE simd16scalari _simd16_cvtps_epi32(simd16scalar a)
{
    simd16scalari result;

    result.lo = _mm256_cvtps_epi32(a.lo);
    result.hi = _mm256_cvtps_epi32(a.hi);

    return result;
}

INLINE simd16scalari _simd16_cvttps_epi32(simd16scalar a)
{
    simd16scalari result;

    result.lo = _mm256_cvttps_epi32(a.lo);
    result.hi = _mm256_cvttps_epi32(a.hi);

    return result;
}

INLINE simd16scalar _simd16_cvtepi32_ps(simd16scalari a)
{
    simd16scalar result;

    result.lo = _mm256_cvtepi32_ps(a.lo);
    result.hi = _mm256_cvtepi32_ps(a.hi);

    return result;
}

template <int comp>
INLINE simd16scalar _simd16_cmp_ps(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _mm256_cmp_ps(a.lo, b.lo, comp);
    result.hi = _mm256_cmp_ps(a.hi, b.hi, comp);

    return result;
}

#define _simd16_cmplt_ps(a, b) _simd16_cmp_ps<_CMP_LT_OQ>(a, b)
#define _simd16_cmpgt_ps(a, b) _simd16_cmp_ps<_CMP_GT_OQ>(a, b)
#define _simd16_cmpneq_ps(a, b) _simd16_cmp_ps<_CMP_NEQ_OQ>(a, b)
#define _simd16_cmpeq_ps(a, b) _simd16_cmp_ps<_CMP_EQ_OQ>(a, b)
#define _simd16_cmpge_ps(a, b) _simd16_cmp_ps<_CMP_GE_OQ>(a, b)
#define _simd16_cmple_ps(a, b) _simd16_cmp_ps<_CMP_LE_OQ>(a, b)
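
// Each comparison yields all-ones or all-zeros per lane, so the result can
// feed _simd16_blendv_ps directly, e.g. (hypothetical usage):
//
//   simd16scalar sel = _simd16_cmplt_ps(x, y);        // x < y ?
//   simd16scalar m   = _simd16_blendv_ps(y, x, sel);  // per-lane minimum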

SIMD16_EMU_AVX512_2(simd16scalar, _simd16_and_ps, _simd_and_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_andnot_ps, _simd_andnot_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_or_ps, _simd_or_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_xor_ps, _simd_xor_ps)

SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rcp_ps, _simd_rcp_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_div_ps, _simd_div_ps)

INLINE simd16scalar _simd16_castsi_ps(simd16scalari a)
{
    return *reinterpret_cast<simd16scalar *>(&a);
}

INLINE simd16scalari _simd16_castps_si(simd16scalar a)
{
    return *reinterpret_cast<simd16scalari *>(&a);
}

INLINE simd16scalard _simd16_castsi_pd(simd16scalari a)
{
    return *reinterpret_cast<simd16scalard *>(&a);
}

INLINE simd16scalari _simd16_castpd_si(simd16scalard a)
{
    return *reinterpret_cast<simd16scalari *>(&a);
}

INLINE simd16scalar _simd16_castpd_ps(simd16scalard a)
{
    return *reinterpret_cast<simd16scalar *>(&a);
}

INLINE simd16scalard _simd16_castps_pd(simd16scalar a)
{
    return *reinterpret_cast<simd16scalard *>(&a);
}

template <int mode>
INLINE simd16scalar _simd16_round_ps_temp(simd16scalar a)
{
    simd16scalar result;

    result.lo = _mm256_round_ps(a.lo, mode);
    result.hi = _mm256_round_ps(a.hi, mode);

    return result;
}

#define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a)
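
// e.g. _simd16_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)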

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mul_epi32, _simd_mul_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mullo_epi32, _simd_mullo_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi32, _simd_sub_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi64, _simd_sub_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epi32, _simd_min_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epi32, _simd_max_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epu32, _simd_min_epu32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epu32, _simd_max_epu32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi32, _simd_add_epi32)

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_and_si, _simd_and_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_andnot_si, _simd_andnot_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_or_si, _simd_or_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_xor_si, _simd_xor_si)

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi32, _simd_cmpeq_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi32, _simd_cmpgt_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmplt_epi32, _simd_cmplt_epi32)

INLINE int _simd16_testz_ps(simd16scalar a, simd16scalar b)
{
    int lo = _mm256_testz_ps(a.lo, b.lo);
    int hi = _mm256_testz_ps(a.hi, b.hi);

    return lo & hi;
}

#define _simd16_cmplt_epi32(a, b) _simd16_cmpgt_epi32(b, a)

SIMD16_EMU_AVX512_2(simd16scalar, _simd16_unpacklo_ps, _simd_unpacklo_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_unpackhi_ps, _simd_unpackhi_ps)
SIMD16_EMU_AVX512_2(simd16scalard, _simd16_unpacklo_pd, _simd_unpacklo_pd)
SIMD16_EMU_AVX512_2(simd16scalard, _simd16_unpackhi_pd, _simd_unpackhi_pd)

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi8, _simd_unpacklo_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi8, _simd_unpackhi_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi16, _simd_unpacklo_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi16, _simd_unpackhi_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi32, _simd_unpacklo_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi32, _simd_unpackhi_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi64, _simd_unpacklo_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi64, _simd_unpackhi_epi64)

template <int imm8>
INLINE simd16scalari _simd16_slli_epi32_temp(simd16scalari a)
{
    simd16scalari result;

    result.lo = _simd_slli_epi32(a.lo, imm8);
    result.hi = _simd_slli_epi32(a.hi, imm8);

    return result;
}

#define _simd16_slli_epi32(a, imm8) _simd16_slli_epi32_temp<imm8>(a)

template <int imm8>
INLINE simd16scalari _simd16_srai_epi32_temp(simd16scalari a)
{
    simd16scalari result;

    result.lo = _simd_srai_epi32(a.lo, imm8);
    result.hi = _simd_srai_epi32(a.hi, imm8);

    return result;
}

#define _simd16_srai_epi32(a, imm8) _simd16_srai_epi32_temp<imm8>(a)

template <int imm8>
INLINE simd16scalari _simd16_srli_epi32_temp(simd16scalari a)
{
    simd16scalari result;

    result.lo = _simd_srli_epi32(a.lo, imm8);
    result.hi = _simd_srli_epi32(a.hi, imm8);

    return result;
}

#define _simd16_srli_epi32(a, imm8) _simd16_srli_epi32_temp<imm8>(a)

SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmadd_ps, _simd_fmadd_ps)
SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmsub_ps, _simd_fmsub_ps)

template <int scale>
INLINE simd16scalar _simd16_i32gather_ps_temp(const float *m, simd16scalari index)
{
    simd16scalar result;

    result.lo = _simd_i32gather_ps(m, index.lo, scale);
    result.hi = _simd_i32gather_ps(m, index.hi, scale);

    return result;
}

#define _simd16_i32gather_ps(m, index, scale) _simd16_i32gather_ps_temp<scale>(m, index)
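
// Gathers load element i from address m + index[i] * scale bytes; scale must
// be a compile-time constant (1, 2, 4, or 8), hence the template.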

template <int scale>
INLINE simd16scalar _simd16_mask_i32gather_ps_temp(simd16scalar a, const float *m, simd16scalari index, simd16scalari mask)
{
    simd16scalar result;

    result.lo = _simd_mask_i32gather_ps(a.lo, m, index.lo, _simd_castsi_ps(mask.lo), scale);
    result.hi = _simd_mask_i32gather_ps(a.hi, m, index.hi, _simd_castsi_ps(mask.hi), scale);

    return result;
}

#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _simd16_mask_i32gather_ps_temp<scale>(a, m, index, mask)

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_shuffle_epi8, _simd_shuffle_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_adds_epu8, _simd_adds_epu8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_subs_epu8, _simd_subs_epu8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi8, _simd_add_epi8)
SIMD16_EMU_AVX512_1(simd16scalari, _simd16_abs_epi32, _simd_abs_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi64, _simd_cmpeq_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi64, _simd_cmpgt_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi16, _simd_cmpeq_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi16, _simd_cmpgt_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi8, _simd_cmpeq_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi8, _simd_cmpgt_epi8)

INLINE simd16scalar _simd16_permute_ps(simd16scalar a, simd16scalari i)
{
    simd16scalar result;

    // each 8-wide permute can only pull from its own source, so compute all
    // four cross products and select by whether the index is greater than 7
    const simdscalari mask = _simd_set1_epi32(7);

    simdscalar lolo = _simd_permute_ps(a.lo, _simd_and_si(i.lo, mask));
    simdscalar lohi = _simd_permute_ps(a.hi, _simd_and_si(i.lo, mask));

    simdscalar hilo = _simd_permute_ps(a.lo, _simd_and_si(i.hi, mask));
    simdscalar hihi = _simd_permute_ps(a.hi, _simd_and_si(i.hi, mask));

    result.lo = _simd_blendv_ps(lolo, lohi, _simd_castsi_ps(_simd_cmpgt_epi32(i.lo, mask)));
    result.hi = _simd_blendv_ps(hilo, hihi, _simd_castsi_ps(_simd_cmpgt_epi32(i.hi, mask)));

    return result;
}

INLINE simd16scalari _simd16_permute_epi32(simd16scalari a, simd16scalari i)
{
    return _simd16_castps_si(_simd16_permute_ps(_simd16_castsi_ps(a), i));
}

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_srlv_epi32, _simd_srlv_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sllv_epi32, _simd_sllv_epi32)

template <int imm8>
INLINE simd16scalar _simd16_permute2f128_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _simd_permute2f128_ps(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
    result.hi = _simd_permute2f128_ps(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));

    return result;
}

#define _simd16_permute2f128_ps(a, b, imm8) _simd16_permute2f128_ps_temp<imm8>(a, b)
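
// imm8 here follows _mm512_shuffle_f32x4 semantics: four 2-bit fields, each
// selecting one of the four 128-bit chunks of {a, b}.  The bit twiddling
// above repacks fields 0/1 (and 2/3) into the two selector positions that
// _mm256_permute2f128_ps expects (bits [1:0] and [5:4]).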

template <int imm8>
INLINE simd16scalard _simd16_permute2f128_pd_temp(simd16scalard a, simd16scalard b)
{
    simd16scalard result;

    result.lo = _simd_permute2f128_pd(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
    result.hi = _simd_permute2f128_pd(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));

    return result;
}

#define _simd16_permute2f128_pd(a, b, imm8) _simd16_permute2f128_pd_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalari _simd16_permute2f128_si_temp(simd16scalari a, simd16scalari b)
{
    simd16scalari result;

    result.lo = _simd_permute2f128_si(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
    result.hi = _simd_permute2f128_si(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));

    return result;
}

#define _simd16_permute2f128_si(a, b, imm8) _simd16_permute2f128_si_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalar _simd16_shuffle_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _simd_shuffle_ps(a.lo, b.lo, imm8);
    result.hi = _simd_shuffle_ps(a.hi, b.hi, imm8);

    return result;
}

#define _simd16_shuffle_ps(a, b, imm8) _simd16_shuffle_ps_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalard _simd16_shuffle_pd_temp(simd16scalard a, simd16scalard b)
{
    simd16scalard result;

    result.lo = _simd_shuffle_pd(a.lo, b.lo, (imm8 & 15));
    result.hi = _simd_shuffle_pd(a.hi, b.hi, (imm8 >> 4));

    return result;
}

#define _simd16_shuffle_pd(a, b, imm8) _simd16_shuffle_pd_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalari _simd16_shuffle_epi32_temp(simd16scalari a, simd16scalari b)
{
    return _simd16_castps_si(_simd16_shuffle_ps(_simd16_castsi_ps(a), _simd16_castsi_ps(b), imm8));
}

#define _simd16_shuffle_epi32(a, b, imm8) _simd16_shuffle_epi32_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalari _simd16_shuffle_epi64_temp(simd16scalari a, simd16scalari b)
{
    return _simd16_castpd_si(_simd16_shuffle_pd(_simd16_castsi_pd(a), _simd16_castsi_pd(b), imm8));
}

#define _simd16_shuffle_epi64(a, b, imm8) _simd16_shuffle_epi64_temp<imm8>(a, b)

INLINE simd16scalari _simd16_cvtepu8_epi16(simdscalari a)
{
    simd16scalari result;

    result.lo = _simd_cvtepu8_epi16(_mm256_extractf128_si256(a, 0));
    result.hi = _simd_cvtepu8_epi16(_mm256_extractf128_si256(a, 1));

    return result;
}

INLINE simd16scalari _simd16_cvtepu8_epi32(__m128i a)
{
    simd16scalari result;

    result.lo = _simd_cvtepu8_epi32(a);
    result.hi = _simd_cvtepu8_epi32(_mm_srli_si128(a, 8));

    return result;
}

INLINE simd16scalari _simd16_cvtepu16_epi32(simdscalari a)
{
    simd16scalari result;

    result.lo = _simd_cvtepu16_epi32(_mm256_extractf128_si256(a, 0));
    result.hi = _simd_cvtepu16_epi32(_mm256_extractf128_si256(a, 1));

    return result;
}

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packus_epi16, _simd_packus_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packs_epi16, _simd_packs_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packus_epi32, _simd_packus_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packs_epi32, _simd_packs_epi32)

INLINE simd16mask _simd16_int2mask(int mask)
{
    return mask;
}

INLINE int _simd16_mask2int(simd16mask mask)
{
    return mask;
}

INLINE simd16mask _simd16_cmplt_ps_mask(simd16scalar a, simd16scalar b)
{
    return _simd16_movemask_ps(_simd16_cmplt_ps(a, b));
}

// convert bitmask to vector mask
INLINE simd16scalar vMask16(int32_t mask)
{
    simd16scalari temp = _simd16_set1_epi32(mask);

    simd16scalari bits = _simd16_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);

    simd16scalari result = _simd16_cmplt_epi32(_simd16_setzero_si(), _simd16_and_si(temp, bits));

    return _simd16_castsi_ps(result);
}
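
// vMask16 broadcasts the bitmask, isolates lane i's bit with the per-lane
// constant in `bits`, and compares against zero, yielding all-ones in every
// lane whose mask bit is set.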

#else

INLINE simd16mask _simd16_scalari2mask(simd16scalari mask)
{
    return _mm512_cmpneq_epu32_mask(mask, _mm512_setzero_epi32());
}

INLINE simd16mask _simd16_scalard2mask(simd16scalard mask)
{
    return _mm512_cmpneq_epu64_mask(_mm512_castpd_si512(mask), _mm512_setzero_si512());
}

#define _simd16_setzero_ps  _mm512_setzero_ps
#define _simd16_setzero_si  _mm512_setzero_si512
#define _simd16_set1_ps     _mm512_set1_ps
#define _simd16_set1_epi8   _mm512_set1_epi8
#define _simd16_set1_epi32  _mm512_set1_epi32

INLINE simd16scalar _simd16_set_ps(float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    return _mm512_set_ps(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
}

INLINE simd16scalari _simd16_set_epi32(int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    return _mm512_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
}

INLINE simd16scalar _simd16_set_ps(float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    return _mm512_set_ps(e7, e6, e5, e4, e3, e2, e1, e0, e7, e6, e5, e4, e3, e2, e1, e0);
}

INLINE simd16scalari _simd16_set_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    return _mm512_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0, e7, e6, e5, e4, e3, e2, e1, e0);
}

#define _simd16_load_ps     _mm512_load_ps
#define _simd16_loadu_ps    _mm512_loadu_ps
#define _simd16_load1_ps    _simd16_broadcast_ss
#define _simd16_load_si     _mm512_load_si512
#define _simd16_loadu_si    _mm512_loadu_si512
#define _simd16_broadcast_ss(m) _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, 0)
#define _simd16_broadcast_ps(m) _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, 0)
#define _simd16_store_ps    _mm512_store_ps
#define _simd16_store_si    _mm512_store_si512
#define _simd16_extract_ps(a, imm8)   _mm256_castsi256_ps(_mm512_extracti64x4_epi64(_mm512_castps_si512(a), imm8))
#define _simd16_extract_si  _mm512_extracti64x4_epi64
#define _simd16_insert_ps(a, b, imm8) _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castps_si512(a), _mm256_castps_si256(b), imm8))
#define _simd16_insert_si   _mm512_inserti64x4

INLINE void _simd16_maskstore_ps(float *m, simd16scalari mask, simd16scalar a)
{
    simd16mask k = _simd16_scalari2mask(mask);

    _mm512_mask_store_ps(m, k, a);
}

#define _simd16_blend_ps(a, b, mask) _mm512_mask_blend_ps(mask, a, b)
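
// In the native path the blend/select helpers lean on hardware mask
// registers: simd16mask is __mmask16, and a vector mask is first reduced to
// one via _simd16_scalari2mask.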

INLINE simd16scalar _simd16_blendv_ps(simd16scalar a, simd16scalar b, const simd16scalar mask)
{
    simd16mask k = _simd16_scalari2mask(_mm512_castps_si512(mask));

    return _mm512_mask_blend_ps(k, a, b);
}

INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalar mask)
{
    simd16mask k = _simd16_scalari2mask(_mm512_castps_si512(mask));

    return _mm512_mask_blend_epi32(k, a, b);
}

INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalari mask)
{
    simd16mask k = _simd16_scalari2mask(mask);

    return _mm512_mask_blend_epi32(k, a, b);
}

#define _simd16_mul_ps      _mm512_mul_ps
#define _simd16_add_ps      _mm512_add_ps
#define _simd16_sub_ps      _mm512_sub_ps
#define _simd16_rsqrt_ps    _mm512_rsqrt14_ps
#define _simd16_min_ps      _mm512_min_ps
#define _simd16_max_ps      _mm512_max_ps

INLINE simd16mask _simd16_movemask_ps(simd16scalar a)
{
    return _simd16_scalari2mask(_mm512_castps_si512(a));
}

INLINE simd16mask _simd16_movemask_pd(simd16scalard a)
{
    return _simd16_scalard2mask(a);
}

INLINE int _simd16_movemask_epi8(simd16scalari a)
{
    return _simd16_scalari2mask(a);
}

#define _simd16_cvtps_epi32     _mm512_cvtps_epi32
#define _simd16_cvttps_epi32    _mm512_cvttps_epi32
#define _simd16_cvtepi32_ps     _mm512_cvtepi32_ps

template <int comp>
INLINE simd16scalar _simd16_cmp_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16mask k = _mm512_cmp_ps_mask(a, b, comp);

    return _mm512_castsi512_ps(_mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF)));
}

#define _simd16_cmp_ps(a, b, comp) _simd16_cmp_ps_temp<comp>(a, b)

#define _simd16_cmplt_ps(a, b) _simd16_cmp_ps(a, b, _CMP_LT_OQ)
#define _simd16_cmpgt_ps(a, b) _simd16_cmp_ps(a, b, _CMP_GT_OQ)
#define _simd16_cmpneq_ps(a, b) _simd16_cmp_ps(a, b, _CMP_NEQ_OQ)
#define _simd16_cmpeq_ps(a, b) _simd16_cmp_ps(a, b, _CMP_EQ_OQ)
#define _simd16_cmpge_ps(a, b) _simd16_cmp_ps(a, b, _CMP_GE_OQ)
#define _simd16_cmple_ps(a, b) _simd16_cmp_ps(a, b, _CMP_LE_OQ)

#define _simd16_castsi_ps   _mm512_castsi512_ps
#define _simd16_castps_si   _mm512_castps_si512
#define _simd16_castsi_pd   _mm512_castsi512_pd
#define _simd16_castpd_si   _mm512_castpd_si512
#define _simd16_castpd_ps   _mm512_castpd_ps
#define _simd16_castps_pd   _mm512_castps_pd

#define _simd16_and_ps      _mm512_and_ps
#define _simd16_andnot_ps   _mm512_andnot_ps
#define _simd16_or_ps       _mm512_or_ps
#define _simd16_xor_ps      _mm512_xor_ps

template <int mode>
INLINE simd16scalar _simd16_round_ps_temp(simd16scalar a)
{
    return _mm512_roundscale_ps(a, mode);
}

#define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a)

#define _simd16_mul_epi32   _mm512_mul_epi32
#define _simd16_mullo_epi32 _mm512_mullo_epi32
#define _simd16_sub_epi32   _mm512_sub_epi32
#define _simd16_sub_epi64   _mm512_sub_epi64
#define _simd16_min_epi32   _mm512_min_epi32
#define _simd16_max_epi32   _mm512_max_epi32
#define _simd16_min_epu32   _mm512_min_epu32
#define _simd16_max_epu32   _mm512_max_epu32
#define _simd16_add_epi32   _mm512_add_epi32

#define _simd16_and_si      _mm512_and_si512
#define _simd16_andnot_si   _mm512_andnot_si512
#define _simd16_or_si       _mm512_or_si512
#define _simd16_xor_si      _mm512_xor_si512

INLINE simd16scalari _simd16_cmpeq_epi32(simd16scalari a, simd16scalari b)
{
    simd16mask k = _mm512_cmpeq_epi32_mask(a, b);

    return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpgt_epi32(simd16scalari a, simd16scalari b)
{
    simd16mask k = _mm512_cmpgt_epi32_mask(a, b);

    return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmplt_epi32(simd16scalari a, simd16scalari b)
{
    simd16mask k = _mm512_cmplt_epi32_mask(a, b);

    return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE int _simd16_testz_ps(simd16scalar a, simd16scalar b)
{
    // note: the extracted source repeated the emulated lo/hi-half version
    // here, which cannot compile against __m512; a reasonable native
    // equivalent tests the whole register with a mask compare
    simd16mask k = _mm512_test_epi32_mask(_mm512_castps_si512(a), _mm512_castps_si512(b));

    return k == 0;
}

#define _simd16_unpacklo_ps     _mm512_unpacklo_ps
#define _simd16_unpackhi_ps     _mm512_unpackhi_ps
#define _simd16_unpacklo_pd     _mm512_unpacklo_pd
#define _simd16_unpackhi_pd     _mm512_unpackhi_pd
#define _simd16_unpacklo_epi8   _mm512_unpacklo_epi8
#define _simd16_unpackhi_epi8   _mm512_unpackhi_epi8
#define _simd16_unpacklo_epi16  _mm512_unpacklo_epi16
#define _simd16_unpackhi_epi16  _mm512_unpackhi_epi16
#define _simd16_unpacklo_epi32  _mm512_unpacklo_epi32
#define _simd16_unpackhi_epi32  _mm512_unpackhi_epi32
#define _simd16_unpacklo_epi64  _mm512_unpacklo_epi64
#define _simd16_unpackhi_epi64  _mm512_unpackhi_epi64
#define _simd16_slli_epi32      _mm512_slli_epi32
#define _simd16_srli_epi32      _mm512_srli_epi32
#define _simd16_srai_epi32      _mm512_srai_epi32
#define _simd16_fmadd_ps        _mm512_fmadd_ps
#define _simd16_fmsub_ps        _mm512_fmsub_ps
#define _simd16_adds_epu8       _mm512_adds_epu8
#define _simd16_subs_epu8       _mm512_subs_epu8
#define _simd16_add_epi8        _mm512_add_epi8
#define _simd16_shuffle_epi8    _mm512_shuffle_epi8

#define _simd16_i32gather_ps(m, index, scale) _mm512_i32gather_ps(index, m, scale)

template <int scale>
INLINE simd16scalar _simd16_mask_i32gather_ps_temp(simd16scalar a, const float *m, simd16scalari index, simd16scalari mask)
{
    __mmask16 k = _mm512_cmpneq_epi32_mask(mask, _mm512_setzero_si512());

    return _mm512_mask_i32gather_ps(a, k, index, m, scale);
}

#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _simd16_mask_i32gather_ps_temp<scale>(a, m, index, mask)

#define _simd16_abs_epi32   _mm512_abs_epi32

INLINE simd16scalari _simd16_cmpeq_epi64(simd16scalari a, simd16scalari b)
{
    __mmask8 k = _mm512_cmpeq_epi64_mask(a, b);

    return _mm512_mask_blend_epi64(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpgt_epi64(simd16scalari a, simd16scalari b)
{
    __mmask8 k = _mm512_cmpgt_epi64_mask(a, b);

    return _mm512_mask_blend_epi64(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpeq_epi16(simd16scalari a, simd16scalari b)
{
    __mmask32 k = _mm512_cmpeq_epi16_mask(a, b);

    return _mm512_mask_blend_epi16(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpgt_epi16(simd16scalari a, simd16scalari b)
{
    __mmask32 k = _mm512_cmpgt_epi16_mask(a, b);

    return _mm512_mask_blend_epi16(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpeq_epi8(simd16scalari a, simd16scalari b)
{
    __mmask64 k = _mm512_cmpeq_epi8_mask(a, b);

    return _mm512_mask_blend_epi8(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpgt_epi8(simd16scalari a, simd16scalari b)
{
    __mmask64 k = _mm512_cmpgt_epi8_mask(a, b);

    return _mm512_mask_blend_epi8(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

#define _simd16_permute_ps(a, i)    _mm512_permutexvar_ps(i, a)
#define _simd16_permute_epi32(a, i) _mm512_permutexvar_epi32(i, a)
#define _simd16_sllv_epi32          _mm512_sllv_epi32
#define _simd16_srlv_epi32          _mm512_srlv_epi32
#define _simd16_permute2f128_ps     _mm512_shuffle_f32x4
#define _simd16_permute2f128_pd     _mm512_shuffle_f64x2
#define _simd16_permute2f128_si     _mm512_shuffle_i32x4
#define _simd16_shuffle_ps          _mm512_shuffle_ps
#define _simd16_shuffle_pd          _mm512_shuffle_pd
#define _simd16_cvtepu8_epi16       _mm512_cvtepu8_epi16
#define _simd16_cvtepu8_epi32       _mm512_cvtepu8_epi32
#define _simd16_cvtepu16_epi32      _mm512_cvtepu16_epi32
#define _simd16_packus_epi16        _mm512_packus_epi16
#define _simd16_packs_epi16         _mm512_packs_epi16
#define _simd16_packus_epi32        _mm512_packus_epi32
#define _simd16_packs_epi32         _mm512_packs_epi32

template <int imm8>
INLINE simd16scalari _simd16_shuffle_epi32_temp(simd16scalari a, simd16scalari b)
{
    return _simd16_castps_si(_simd16_shuffle_ps(_simd16_castsi_ps(a), _simd16_castsi_ps(b), imm8));
}

#define _simd16_shuffle_epi32(a, b, imm8) _simd16_shuffle_epi32_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalari _simd16_shuffle_epi64_temp(simd16scalari a, simd16scalari b)
{
    return _simd16_castpd_si(_simd16_shuffle_pd(_simd16_castsi_pd(a), _simd16_castsi_pd(b), imm8));
}

#define _simd16_shuffle_epi64(a, b, imm8) _simd16_shuffle_epi64_temp<imm8>(a, b)

INLINE simd16mask _simd16_int2mask(int mask)
{
    return _mm512_int2mask(mask);
}

INLINE int _simd16_mask2int(simd16mask mask)
{
    return _mm512_mask2int(mask);
}

INLINE simd16mask _simd16_cmplt_ps_mask(simd16scalar a, simd16scalar b)
{
    return _mm512_cmplt_ps_mask(a, b);
}

// convert bitmask to vector mask
INLINE simd16scalar vMask16(int32_t mask)
{
    simd16scalari temp = _simd16_set1_epi32(mask);

    simd16scalari bits = _simd16_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);

    simd16scalari result = _simd16_cmplt_epi32(_simd16_setzero_si(), _simd16_and_si(temp, bits));

    return _simd16_castsi_ps(result);
}

#endif//ENABLE_AVX512_EMULATION

#endif//ENABLE_AVX512_SIMD16

#endif//__SWR_SIMD16INTRIN_H__