1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 ****************************************************************************/
// Include guard: this header defines the 16-wide SIMD layer and is only
// meaningful when AVX-512-sized vectors are enabled.
24 #ifndef __SWR_SIMD16INTRIN_H__
25 #define __SWR_SIMD16INTRIN_H__
27 #if ENABLE_AVX512_SIMD16
29 #if KNOB_SIMD16_WIDTH == 16
31 #if ENABLE_AVX512_EMULATION
// Emulation path (two AVX2 halves): a 16-wide lane mask is carried as a
// plain 16-bit integer, one bit per float lane.
47 typedef uint16_t simd16mask
;
// Native AVX-512 path: vectors map directly onto the 512-bit register
// types and the mask onto a hardware __mmask16 register.
// NOTE(review): the #else separating the emulation and native branches is
// not visible here -- interior lines appear to have been dropped in
// extraction; confirm against the original file.
50 typedef __m512 simd16scalar
;
51 typedef __m512d simd16scalard
;
52 typedef __m512i simd16scalari
;
53 typedef __mmask16 simd16mask
;
54 #endif//ENABLE_AVX512_EMULATION
// Any KNOB_SIMD16_WIDTH other than 16 is unsupported by this header.
56 #error Unsupported vector width
57 #endif//KNOB_SIMD16_WIDTH == 16
// Split a 16-bit lane mask into its low / high 8-lane halves, and rebuild
// a 16-bit mask from two 8-bit halves (used by the emulation path).
59 #define _simd16_masklo(mask) ((mask) & 0xFF)
60 #define _simd16_maskhi(mask) (((mask) >> 8) & 0xFF)
61 #define _simd16_setmask(hi, lo) (((hi) << 8) | (lo))
// Vector calling convention: pass SIMD arguments in registers.
// NOTE(review): presumably guarded by a compiler/platform check in the
// full file -- __vectorcall is MSVC/ICC-specific; confirm.
64 #define SIMDAPI __vectorcall
// 16-wide, 4-component (x, y, z, w) vector, aligned to the full SIMD
// register width so loads/stores can use aligned forms.
// NOTE(review): the union's opening brace and the underlying
// "simd16scalar v[4];" member are not visible in this extraction, but
// operator[] below indexes a member named v -- confirm against original.
69 OSALIGN(union, KNOB_SIMD16_BYTES
) simd16vector
74 simd16scalar x
, y
, z
, w
;
// Component access by index (0..3); aliases the named x/y/z/w members
// through the v array of the union.
77 simd16scalar
& operator[] (const int i
) { return v
[i
]; }
78 const simd16scalar
& operator[] (const int i
) const { return v
[i
]; }
// AVX2 emulation helpers: each SIMD16_EMU_AVX512_<N> macro stamps out a
// 16-wide operation taking N vector operands by applying the given 8-wide
// AVX2 intrinsic independently to the .lo and .hi halves of each operand.
//   _0: no vector operands (e.g. setzero)
//   _1: one operand (rcp, rsqrt, abs, ...)
//   _2: two operands (add, mul, min/max, logic, compares, unpack, ...)
//   _3: three operands (fmadd, fmsub, blendv)
// NOTE(review): the braces, "type result;" declarations, and
// "return result;" lines of the generated bodies are not visible in this
// extraction -- interior lines were dropped; confirm against original.
81 #if ENABLE_AVX512_EMULATION
83 #define SIMD16_EMU_AVX512_0(type, func, intrin) \
84 INLINE type SIMDAPI func()\
88 result.lo = intrin();\
89 result.hi = intrin();\
94 #define SIMD16_EMU_AVX512_1(type, func, intrin) \
95 INLINE type SIMDAPI func(type a)\
99 result.lo = intrin(a.lo);\
100 result.hi = intrin(a.hi);\
105 #define SIMD16_EMU_AVX512_2(type, func, intrin) \
106 INLINE type SIMDAPI func(type a, type b)\
110 result.lo = intrin(a.lo, b.lo);\
111 result.hi = intrin(a.hi, b.hi);\
116 #define SIMD16_EMU_AVX512_3(type, func, intrin) \
117 INLINE type SIMDAPI func(type a, type b, type c)\
121 result.lo = intrin(a.lo, b.lo, c.lo);\
122 result.hi = intrin(a.hi, b.hi, c.hi);\
127 SIMD16_EMU_AVX512_0(simd16scalar
, _simd16_setzero_ps
, _mm256_setzero_ps
)
128 SIMD16_EMU_AVX512_0(simd16scalari
, _simd16_setzero_si
, _mm256_setzero_si256
)
130 INLINE simd16scalar SIMDAPI
_simd16_set1_ps(float a
)
134 result
.lo
= _mm256_set1_ps(a
);
135 result
.hi
= _mm256_set1_ps(a
);
140 INLINE simd16scalari SIMDAPI
_simd16_set1_epi8(char a
)
142 simd16scalari result
;
144 result
.lo
= _mm256_set1_epi8(a
);
145 result
.hi
= _mm256_set1_epi8(a
);
150 INLINE simd16scalari SIMDAPI
_simd16_set1_epi32(int a
)
152 simd16scalari result
;
154 result
.lo
= _mm256_set1_epi32(a
);
155 result
.hi
= _mm256_set1_epi32(a
);
160 INLINE simd16scalar SIMDAPI
_simd16_set_ps(float e15
, float e14
, float e13
, float e12
, float e11
, float e10
, float e9
, float e8
, float e7
, float e6
, float e5
, float e4
, float e3
, float e2
, float e1
, float e0
)
164 result
.lo
= _mm256_set_ps(e7
, e6
, e5
, e4
, e3
, e2
, e1
, e0
);
165 result
.hi
= _mm256_set_ps(e15
, e14
, e13
, e12
, e11
, e10
, e9
, e8
);
170 INLINE simd16scalari SIMDAPI
_simd16_set_epi32(int e15
, int e14
, int e13
, int e12
, int e11
, int e10
, int e9
, int e8
, int e7
, int e6
, int e5
, int e4
, int e3
, int e2
, int e1
, int e0
)
172 simd16scalari result
;
174 result
.lo
= _mm256_set_epi32(e7
, e6
, e5
, e4
, e3
, e2
, e1
, e0
);
175 result
.hi
= _mm256_set_epi32(e15
, e14
, e13
, e12
, e11
, e10
, e9
, e8
);
180 INLINE simd16scalar SIMDAPI
_simd16_set_ps(float e7
, float e6
, float e5
, float e4
, float e3
, float e2
, float e1
, float e0
)
184 result
.lo
= _mm256_set_ps(e7
, e6
, e5
, e4
, e3
, e2
, e1
, e0
);
185 result
.hi
= _mm256_set_ps(e7
, e6
, e5
, e4
, e3
, e2
, e1
, e0
);
190 INLINE simd16scalari SIMDAPI
_simd16_set_epi32(int e7
, int e6
, int e5
, int e4
, int e3
, int e2
, int e1
, int e0
)
192 simd16scalari result
;
194 result
.lo
= _mm256_set_epi32(e7
, e6
, e5
, e4
, e3
, e2
, e1
, e0
);
195 result
.hi
= _mm256_set_epi32(e7
, e6
, e5
, e4
, e3
, e2
, e1
, e0
);
200 INLINE simd16scalar SIMDAPI
_simd16_load_ps(float const *m
)
204 float const *n
= reinterpret_cast<float const *>(reinterpret_cast<uint8_t const *>(m
) + sizeof(result
.lo
));
206 result
.lo
= _mm256_load_ps(m
);
207 result
.hi
= _mm256_load_ps(n
);
212 INLINE simd16scalar SIMDAPI
_simd16_loadu_ps(float const *m
)
216 float const *n
= reinterpret_cast<float const *>(reinterpret_cast<uint8_t const *>(m
) + sizeof(result
.lo
));
218 result
.lo
= _mm256_loadu_ps(m
);
219 result
.hi
= _mm256_loadu_ps(n
);
224 INLINE simd16scalar SIMDAPI
_simd16_load1_ps(float const *m
)
228 result
.lo
= _mm256_broadcast_ss(m
);
229 result
.hi
= _mm256_broadcast_ss(m
);
234 INLINE simd16scalari SIMDAPI
_simd16_load_si(simd16scalari
const *m
)
236 simd16scalari result
;
238 result
.lo
= _mm256_load_si256(&m
[0].lo
);
239 result
.hi
= _mm256_load_si256(&m
[0].hi
);
244 INLINE simd16scalari SIMDAPI
_simd16_loadu_si(simd16scalari
const *m
)
246 simd16scalari result
;
248 result
.lo
= _mm256_loadu_si256(&m
[0].lo
);
249 result
.hi
= _mm256_loadu_si256(&m
[0].hi
);
254 INLINE simd16scalar SIMDAPI
_simd16_broadcast_ss(float const *m
)
258 result
.lo
= _mm256_broadcast_ss(m
);
259 result
.hi
= _mm256_broadcast_ss(m
);
264 INLINE simd16scalar SIMDAPI
_simd16_broadcast_ps(__m128
const *m
)
268 result
.lo
= _mm256_broadcast_ps(m
);
269 result
.hi
= _mm256_broadcast_ps(m
);
274 INLINE
void SIMDAPI
_simd16_store_ps(float *m
, simd16scalar a
)
276 float *n
= reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m
) + sizeof(a
.lo
));
278 _mm256_store_ps(m
, a
.lo
);
279 _mm256_store_ps(n
, a
.hi
);
282 INLINE
void SIMDAPI
_simd16_maskstore_ps(float *m
, simd16scalari mask
, simd16scalar a
)
284 float *n
= reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m
) + sizeof(a
.lo
));
286 _mm256_maskstore_ps(m
, mask
.lo
, a
.lo
);
287 _mm256_maskstore_ps(n
, mask
.hi
, a
.hi
);
290 INLINE
void SIMDAPI
_simd16_store_si(simd16scalari
*m
, simd16scalari a
)
292 _mm256_store_si256(&m
[0].lo
, a
.lo
);
293 _mm256_store_si256(&m
[0].hi
, a
.hi
);
296 INLINE simdscalar SIMDAPI
_simd16_extract_ps(simd16scalar a
, int imm8
)
305 return _simd_set1_ps(0.0f
);
308 INLINE simdscalari SIMDAPI
_simd16_extract_si(simd16scalari a
, int imm8
)
317 return _simd_set1_epi32(0);
320 INLINE simd16scalar SIMDAPI
_simd16_insert_ps(simd16scalar a
, simdscalar b
, int imm8
)
334 INLINE simd16scalari SIMDAPI
_simd16_insert_si(simd16scalari a
, simdscalari b
, int imm8
)
348 template <simd16mask mask
>
349 INLINE simd16scalar SIMDAPI
_simd16_blend_ps_temp(simd16scalar a
, simd16scalar b
)
353 result
.lo
= _mm256_blend_ps(a
.lo
, b
.lo
, _simd16_masklo(mask
));
354 result
.hi
= _mm256_blend_ps(a
.hi
, b
.hi
, _simd16_maskhi(mask
));
359 #define _simd16_blend_ps(a, b, mask) _simd16_blend_ps_temp<mask>(a, b)
361 SIMD16_EMU_AVX512_3(simd16scalar
, _simd16_blendv_ps
, _mm256_blendv_ps
)
363 INLINE simd16scalari SIMDAPI
_simd16_blendv_epi32(simd16scalari a
, simd16scalari b
, const simd16scalar mask
)
365 simd16scalari result
;
367 result
.lo
= _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a
.lo
), _mm256_castsi256_ps(b
.lo
), mask
.lo
));
368 result
.hi
= _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a
.hi
), _mm256_castsi256_ps(b
.hi
), mask
.hi
));
373 INLINE simd16scalari SIMDAPI
_simd16_blendv_epi32(simd16scalari a
, simd16scalari b
, const simd16scalari mask
)
375 simd16scalari result
;
377 result
.lo
= _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a
.lo
), _mm256_castsi256_ps(b
.lo
), _mm256_castsi256_ps(mask
.lo
)));
378 result
.hi
= _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a
.hi
), _mm256_castsi256_ps(b
.hi
), _mm256_castsi256_ps(mask
.hi
)));
383 SIMD16_EMU_AVX512_2(simd16scalar
, _simd16_mul_ps
, _mm256_mul_ps
)
384 SIMD16_EMU_AVX512_2(simd16scalar
, _simd16_add_ps
, _mm256_add_ps
)
385 SIMD16_EMU_AVX512_2(simd16scalar
, _simd16_sub_ps
, _mm256_sub_ps
)
386 SIMD16_EMU_AVX512_1(simd16scalar
, _simd16_rsqrt_ps
, _mm256_rsqrt_ps
)
387 SIMD16_EMU_AVX512_2(simd16scalar
, _simd16_min_ps
, _mm256_min_ps
)
388 SIMD16_EMU_AVX512_2(simd16scalar
, _simd16_max_ps
, _mm256_max_ps
)
390 INLINE simd16mask SIMDAPI
_simd16_movemask_ps(simd16scalar a
)
392 simdmask mask_lo
= _mm256_movemask_ps(a
.lo
);
393 simdmask mask_hi
= _mm256_movemask_ps(a
.hi
);
395 return static_cast<simd16mask
>(mask_lo
) | (static_cast<simd16mask
>(mask_hi
) << 8);
398 INLINE simd16mask SIMDAPI
_simd16_movemask_pd(simd16scalard a
)
400 simdmask mask_lo
= _mm256_movemask_pd(a
.lo
);
401 simdmask mask_hi
= _mm256_movemask_pd(a
.hi
);
403 return static_cast<simd16mask
>(mask_lo
) | (static_cast<simd16mask
>(mask_hi
) << 4);
406 INLINE
uint64_t SIMDAPI
_simd16_movemask_epi8(simd16scalari a
)
408 uint32_t mask_lo
= _mm256_movemask_epi8(a
.lo
);
409 uint32_t mask_hi
= _mm256_movemask_epi8(a
.hi
);
411 return static_cast<uint64_t>(mask_lo
) | (static_cast<uint64_t>(mask_hi
) << 32);
414 INLINE simd16scalari SIMDAPI
_simd16_cvtps_epi32(simd16scalar a
)
416 simd16scalari result
;
418 result
.lo
= _mm256_cvtps_epi32(a
.lo
);
419 result
.hi
= _mm256_cvtps_epi32(a
.hi
);
424 INLINE simd16scalari SIMDAPI
_simd16_cvttps_epi32(simd16scalar a
)
426 simd16scalari result
;
428 result
.lo
= _mm256_cvttps_epi32(a
.lo
);
429 result
.hi
= _mm256_cvttps_epi32(a
.hi
);
434 INLINE simd16scalar SIMDAPI
_simd16_cvtepi32_ps(simd16scalari a
)
438 result
.lo
= _mm256_cvtepi32_ps(a
.lo
);
439 result
.hi
= _mm256_cvtepi32_ps(a
.hi
);
445 INLINE simd16scalar SIMDAPI
_simd16_cmp_ps_temp(simd16scalar a
, simd16scalar b
)
449 result
.lo
= _mm256_cmp_ps(a
.lo
, b
.lo
, comp
);
450 result
.hi
= _mm256_cmp_ps(a
.hi
, b
.hi
, comp
);
455 #define _simd16_cmp_ps(a, b, comp) _simd16_cmp_ps_temp<comp>(a, b)
457 #define _simd16_cmplt_ps(a, b) _simd16_cmp_ps(a, b, _CMP_LT_OQ)
458 #define _simd16_cmpgt_ps(a, b) _simd16_cmp_ps(a, b, _CMP_GT_OQ)
459 #define _simd16_cmpneq_ps(a, b) _simd16_cmp_ps(a, b, _CMP_NEQ_OQ)
460 #define _simd16_cmpeq_ps(a, b) _simd16_cmp_ps(a, b, _CMP_EQ_OQ)
461 #define _simd16_cmpge_ps(a, b) _simd16_cmp_ps(a, b, _CMP_GE_OQ)
462 #define _simd16_cmple_ps(a, b) _simd16_cmp_ps(a, b, _CMP_LE_OQ)
464 SIMD16_EMU_AVX512_2(simd16scalar
, _simd16_and_ps
, _simd_and_ps
)
465 SIMD16_EMU_AVX512_2(simd16scalar
, _simd16_andnot_ps
, _simd_andnot_ps
)
466 SIMD16_EMU_AVX512_2(simd16scalar
, _simd16_or_ps
, _simd_or_ps
)
467 SIMD16_EMU_AVX512_2(simd16scalar
, _simd16_xor_ps
, _simd_xor_ps
)
469 SIMD16_EMU_AVX512_1(simd16scalar
, _simd16_rcp_ps
, _simd_rcp_ps
)
470 SIMD16_EMU_AVX512_2(simd16scalar
, _simd16_div_ps
, _simd_div_ps
)
472 INLINE simd16scalar SIMDAPI
_simd16_castsi_ps(simd16scalari a
)
474 return *reinterpret_cast<simd16scalar
*>(&a
);
477 INLINE simd16scalari SIMDAPI
_simd16_castps_si(simd16scalar a
)
479 return *reinterpret_cast<simd16scalari
*>(&a
);
482 INLINE simd16scalard SIMDAPI
_simd16_castsi_pd(simd16scalari a
)
484 return *reinterpret_cast<simd16scalard
*>(&a
);
487 INLINE simd16scalari SIMDAPI
_simd16_castpd_si(simd16scalard a
)
489 return *reinterpret_cast<simd16scalari
*>(&a
);
492 INLINE simd16scalar SIMDAPI
_simd16_castpd_ps(simd16scalard a
)
494 return *reinterpret_cast<simd16scalar
*>(&a
);
497 INLINE simd16scalard SIMDAPI
_simd16_castps_pd(simd16scalar a
)
499 return *reinterpret_cast<simd16scalard
*>(&a
);
503 INLINE simd16scalar SIMDAPI
_simd16_round_ps_temp(simd16scalar a
)
507 result
.lo
= _mm256_round_ps(a
.lo
, mode
);
508 result
.hi
= _mm256_round_ps(a
.hi
, mode
);
513 #define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a)
515 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_mul_epi32
, _simd_mul_epi32
)
516 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_mullo_epi32
, _simd_mullo_epi32
)
517 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_sub_epi32
, _simd_sub_epi32
)
518 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_sub_epi64
, _simd_sub_epi64
)
519 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_min_epi32
, _simd_min_epi32
)
520 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_max_epi32
, _simd_max_epi32
)
521 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_min_epu32
, _simd_min_epu32
)
522 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_max_epu32
, _simd_max_epu32
)
523 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_add_epi32
, _simd_add_epi32
)
525 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_and_si
, _simd_and_si
)
526 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_andnot_si
, _simd_andnot_si
)
527 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_or_si
, _simd_or_si
)
528 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_xor_si
, _simd_xor_si
)
530 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_cmpeq_epi32
, _simd_cmpeq_epi32
)
531 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_cmpgt_epi32
, _simd_cmpgt_epi32
)
532 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_cmplt_epi32
, _simd_cmplt_epi32
)
534 INLINE
int SIMDAPI
_simd16_testz_ps(simd16scalar a
, simd16scalar b
)
536 int lo
= _simd_testz_ps(a
.lo
, b
.lo
);
537 int hi
= _simd_testz_ps(a
.hi
, b
.hi
);
542 #define _simd16_cmplt_epi32(a, b) _simd16_cmpgt_epi32(b, a)
544 SIMD16_EMU_AVX512_2(simd16scalar
, _simd16_unpacklo_ps
, _simd_unpacklo_ps
)
545 SIMD16_EMU_AVX512_2(simd16scalar
, _simd16_unpackhi_ps
, _simd_unpackhi_ps
)
546 SIMD16_EMU_AVX512_2(simd16scalard
, _simd16_unpacklo_pd
, _simd_unpacklo_pd
)
547 SIMD16_EMU_AVX512_2(simd16scalard
, _simd16_unpackhi_pd
, _simd_unpackhi_pd
)
549 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_unpacklo_epi8
, _simd_unpacklo_epi8
)
550 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_unpackhi_epi8
, _simd_unpackhi_epi8
)
551 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_unpacklo_epi16
, _simd_unpacklo_epi16
)
552 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_unpackhi_epi16
, _simd_unpackhi_epi16
)
553 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_unpacklo_epi32
, _simd_unpacklo_epi32
)
554 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_unpackhi_epi32
, _simd_unpackhi_epi32
)
555 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_unpacklo_epi64
, _simd_unpacklo_epi64
)
556 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_unpackhi_epi64
, _simd_unpackhi_epi64
)
559 INLINE simd16scalari SIMDAPI
_simd16_slli_epi32_temp(simd16scalari a
)
561 simd16scalari result
;
563 result
.lo
= _simd_slli_epi32(a
.lo
, imm8
);
564 result
.hi
= _simd_slli_epi32(a
.hi
, imm8
);
569 #define _simd16_slli_epi32(a, imm8) _simd16_slli_epi32_temp<imm8>(a)
572 INLINE simd16scalari SIMDAPI
_simd16_srai_epi32_temp(simd16scalari a
)
574 simd16scalari result
;
576 result
.lo
= _simd_srai_epi32(a
.lo
, imm8
);
577 result
.hi
= _simd_srai_epi32(a
.hi
, imm8
);
582 #define _simd16_srai_epi32(a, imm8) _simd16_srai_epi32_temp<imm8>(a)
585 INLINE simd16scalari SIMDAPI
_simd16_srli_epi32_temp(simd16scalari a
)
587 simd16scalari result
;
589 result
.lo
= _simd_srli_epi32(a
.lo
, imm8
);
590 result
.hi
= _simd_srli_epi32(a
.hi
, imm8
);
595 #define _simd16_srli_epi32(a, imm8) _simd16_srli_epi32_temp<imm8>(a)
597 SIMD16_EMU_AVX512_3(simd16scalar
, _simd16_fmadd_ps
, _simd_fmadd_ps
)
598 SIMD16_EMU_AVX512_3(simd16scalar
, _simd16_fmsub_ps
, _simd_fmsub_ps
)
601 INLINE simd16scalar SIMDAPI
_simd16_i32gather_ps_temp(const float *m
, simd16scalari index
)
605 result
.lo
= _simd_i32gather_ps(m
, index
.lo
, scale
);
606 result
.hi
= _simd_i32gather_ps(m
, index
.hi
, scale
);
611 #define _simd16_i32gather_ps(m, index, scale) _simd16_i32gather_ps_temp<scale>(m, index)
614 INLINE simd16scalar SIMDAPI
_simd16_mask_i32gather_ps_temp(simd16scalar a
, const float *m
, simd16scalari index
, simd16scalari mask
)
618 result
.lo
= _simd_mask_i32gather_ps(a
.lo
, m
, index
.lo
, _simd_castsi_ps(mask
.lo
), scale
);
619 result
.hi
= _simd_mask_i32gather_ps(a
.hi
, m
, index
.hi
, _simd_castsi_ps(mask
.hi
), scale
);
624 #define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _simd16_mask_i32gather_ps_temp<scale>(a, m, index, mask)
626 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_shuffle_epi8
, _simd_shuffle_epi8
)
627 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_adds_epu8
, _simd_adds_epu8
)
628 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_subs_epu8
, _simd_subs_epu8
)
629 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_add_epi8
, _simd_add_epi8
)
630 SIMD16_EMU_AVX512_1(simd16scalari
, _simd16_abs_epi32
, _simd_abs_epi32
)
631 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_cmpeq_epi64
, _simd_cmpeq_epi64
)
632 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_cmpgt_epi64
, _simd_cmpgt_epi64
)
633 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_cmpeq_epi16
, _simd_cmpeq_epi16
)
634 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_cmpgt_epi16
, _simd_cmpgt_epi16
)
635 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_cmpeq_epi8
, _simd_cmpeq_epi8
)
636 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_cmpgt_epi8
, _simd_cmpgt_epi8
)
// Emulate a full 16-lane cross-lane permute (gather a[i[k]] per lane k)
// on AVX2, which only has 8-wide permutes.  Strategy: for each output
// half, permute BOTH input halves by (index mod 8), then blend between
// the two candidates based on whether the index selects the upper half
// (index > 7).
638 INLINE simd16scalar SIMDAPI
_simd16_permute_ps(simd16scalar a
, simd16scalari i
)
// mask = 7 in every lane: used both to reduce indices mod 8 and as the
// threshold for the upper-half test below.
642 const simdscalari mask
= _simd_set1_epi32(7);
// lolo: low output half sourced from the low input half.
644 simdscalar lolo
= _simd_permute_ps(a
.lo
, _simd_and_si(i
.lo
, mask
));
// lohi: low output half sourced from the high input half (same low
// indices, reduced mod 8).
645 simdscalar lohi
= _simd_permute_ps(a
.hi
, _simd_and_si(i
.lo
, mask
));
// hilo / hihi: the same two candidates for the high output half, driven
// by the high half of the index vector.
647 simdscalar hilo
= _simd_permute_ps(a
.lo
, _simd_and_si(i
.hi
, mask
));
648 simdscalar hihi
= _simd_permute_ps(a
.hi
, _simd_and_si(i
.hi
, mask
));
// Select per lane: indices > 7 came from a.hi, otherwise from a.lo.
// (cmpgt produces all-ones lanes for index > 7, which blendv uses as the
// selector after a bit-preserving cast to float.)
650 result
.lo
= _simd_blendv_ps(lolo
, lohi
, _simd_castsi_ps(_simd_cmpgt_epi32(i
.lo
, mask
)));
651 result
.hi
= _simd_blendv_ps(hilo
, hihi
, _simd_castsi_ps(_simd_cmpgt_epi32(i
.hi
, mask
)));
656 INLINE simd16scalari SIMDAPI
_simd16_permute_epi32(simd16scalari a
, simd16scalari i
)
658 return _simd16_castps_si(_simd16_permute_ps(_simd16_castsi_ps(a
), i
));
661 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_srlv_epi32
, _simd_srlv_epi32
)
662 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_sllv_epi32
, _simd_sllv_epi32
)
665 INLINE simd16scalar SIMDAPI
_simd16_permute2f128_ps_temp(simd16scalar a
, simd16scalar b
)
669 result
.lo
= _simd_permute2f128_ps(a
.lo
, a
.hi
, ((imm8
& 0x03) << 0) | ((imm8
& 0x0C) << 2));
670 result
.hi
= _simd_permute2f128_ps(b
.lo
, b
.hi
, ((imm8
& 0x30) >> 4) | ((imm8
& 0xC0) >> 2));
675 #define _simd16_permute2f128_ps(a, b, imm8) _simd16_permute2f128_ps_temp<imm8>(a, b)
678 INLINE simd16scalard SIMDAPI
_simd16_permute2f128_pd_temp(simd16scalard a
, simd16scalard b
)
680 simd16scalard result
;
682 result
.lo
= _simd_permute2f128_pd(a
.lo
, a
.hi
, ((imm8
& 0x03) << 0) | ((imm8
& 0x0C) << 2));
683 result
.hi
= _simd_permute2f128_pd(b
.lo
, b
.hi
, ((imm8
& 0x30) >> 4) | ((imm8
& 0xC0) >> 2));
688 #define _simd16_permute2f128_pd(a, b, imm8) _simd16_permute2f128_pd_temp<imm8>(a, b)
691 INLINE simd16scalari SIMDAPI
_simd16_permute2f128_si_temp(simd16scalari a
, simd16scalari b
)
693 simd16scalari result
;
695 result
.lo
= _simd_permute2f128_si(a
.lo
, a
.hi
, ((imm8
& 0x03) << 0) | ((imm8
& 0x0C) << 2));
696 result
.hi
= _simd_permute2f128_si(b
.lo
, b
.hi
, ((imm8
& 0x30) >> 4) | ((imm8
& 0xC0) >> 2));
701 #define _simd16_permute2f128_si(a, b, imm8) _simd16_permute2f128_si_temp<imm8>(a, b)
704 INLINE simd16scalar SIMDAPI
_simd16_shuffle_ps_temp(simd16scalar a
, simd16scalar b
)
708 result
.lo
= _simd_shuffle_ps(a
.lo
, b
.lo
, imm8
);
709 result
.hi
= _simd_shuffle_ps(a
.hi
, b
.hi
, imm8
);
714 #define _simd16_shuffle_ps(a, b, imm8) _simd16_shuffle_ps_temp<imm8>(a, b)
717 INLINE simd16scalard SIMDAPI
_simd16_shuffle_pd_temp(simd16scalard a
, simd16scalard b
)
719 simd16scalard result
;
721 result
.lo
= _simd_shuffle_pd(a
.lo
, b
.lo
, (imm8
& 15));
722 result
.hi
= _simd_shuffle_pd(a
.hi
, b
.hi
, (imm8
>> 4));
727 #define _simd16_shuffle_pd(a, b, imm8) _simd16_shuffle_pd_temp<imm8>(a, b)
730 INLINE simd16scalari SIMDAPI
_simd16_shuffle_epi32_temp(simd16scalari a
, simd16scalari b
)
732 return _simd16_castps_si(_simd16_shuffle_ps(_simd16_castsi_ps(a
), _simd16_castsi_ps(b
), imm8
));
735 #define _simd16_shuffle_epi32(a, b, imm8) _simd16_shuffle_epi32_temp<imm8>(a, b)
738 INLINE simd16scalari SIMDAPI
_simd16_shuffle_epi64_temp(simd16scalari a
, simd16scalari b
)
740 return _simd16_castpd_si(_simd16_shuffle_pd(_simd16_castsi_pd(a
), _simd16_castsi_pd(b
), imm8
));
743 #define _simd16_shuffle_epi64(a, b, imm8) _simd16_shuffle_epi64_temp<imm8>(a, b)
745 INLINE simd16scalari SIMDAPI
_simd16_cvtepu8_epi16(simdscalari a
)
747 simd16scalari result
;
749 result
.lo
= _simd_cvtepu8_epi16(_mm256_extractf128_si256(a
, 0));
750 result
.hi
= _simd_cvtepu8_epi16(_mm256_extractf128_si256(a
, 1));
755 INLINE simd16scalari SIMDAPI
_simd16_cvtepu8_epi32(__m128i a
)
757 simd16scalari result
;
759 result
.lo
= _simd_cvtepu8_epi32(a
);
760 result
.hi
= _simd_cvtepu8_epi32(_mm_srli_si128(a
, 8));
765 INLINE simd16scalari SIMDAPI
_simd16_cvtepu16_epi32(simdscalari a
)
767 simd16scalari result
;
769 result
.lo
= _simd_cvtepu16_epi32(_mm256_extractf128_si256(a
, 0));
770 result
.hi
= _simd_cvtepu16_epi32(_mm256_extractf128_si256(a
, 1));
775 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_packus_epi16
, _simd_packus_epi16
)
776 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_packs_epi16
, _simd_packs_epi16
)
777 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_packus_epi32
, _simd_packus_epi32
)
778 SIMD16_EMU_AVX512_2(simd16scalari
, _simd16_packs_epi32
, _simd_packs_epi32
)
780 INLINE simd16mask SIMDAPI
_simd16_int2mask(int mask
)
785 INLINE
int SIMDAPI SIMDAPI
_simd16_mask2int(simd16mask mask
)
790 INLINE simd16mask SIMDAPI
_simd16_cmplt_ps_mask(simd16scalar a
, simd16scalar b
)
792 return _simd16_movemask_ps(_simd16_cmplt_ps(a
, b
));
795 // convert bitmask to vector mask
796 INLINE simd16scalar SIMDAPI
vMask16(int32_t mask
)
798 simd16scalari temp
= _simd16_set1_epi32(mask
);
800 simd16scalari bits
= _simd16_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);
802 simd16scalari result
= _simd16_cmplt_epi32(_simd16_setzero_si(), _simd16_and_si(temp
, bits
));
804 return _simd16_castsi_ps(result
);
809 INLINE simd16mask SIMDAPI
_simd16_scalari2mask(simd16scalari mask
)
811 return _mm512_cmpneq_epu32_mask(mask
, _mm512_setzero_epi32());
814 INLINE simd16mask SIMDAPI
_simd16_scalard2mask(simd16scalard mask
)
816 return _mm512_cmpneq_epu64_mask(_mm512_castpd_si512(mask
), _mm512_setzero_si512());
819 #define _simd16_setzero_ps _mm512_setzero_ps
820 #define _simd16_setzero_si _mm512_setzero_si512
821 #define _simd16_set1_ps _mm512_set1_ps
822 #define _simd16_set1_epi8 _mm512_set1_epi8
823 #define _simd16_set1_epi32 _mm512_set1_epi32
825 INLINE simd16scalar SIMDAPI
_simd16_set_ps(float e15
, float e14
, float e13
, float e12
, float e11
, float e10
, float e9
, float e8
, float e7
, float e6
, float e5
, float e4
, float e3
, float e2
, float e1
, float e0
)
827 return _mm512_set_ps(e15
, e14
, e13
, e12
, e11
, e10
, e9
, e8
, e7
, e6
, e5
, e4
, e3
, e2
, e1
, e0
);
830 INLINE simd16scalari SIMDAPI
_simd16_set_epi32(int e15
, int e14
, int e13
, int e12
, int e11
, int e10
, int e9
, int e8
, int e7
, int e6
, int e5
, int e4
, int e3
, int e2
, int e1
, int e0
)
832 return _mm512_set_epi32(e15
, e14
, e13
, e12
, e11
, e10
, e9
, e8
, e7
, e6
, e5
, e4
, e3
, e2
, e1
, e0
);
835 INLINE simd16scalar SIMDAPI
_simd16_set_ps(float e7
, float e6
, float e5
, float e4
, float e3
, float e2
, float e1
, float e0
)
837 return _mm512_set_ps(e7
, e6
, e5
, e4
, e3
, e2
, e1
, e0
, e7
, e6
, e5
, e4
, e3
, e2
, e1
, e0
);
840 INLINE simd16scalari SIMDAPI
_simd16_set_epi32(int e7
, int e6
, int e5
, int e4
, int e3
, int e2
, int e1
, int e0
)
842 return _mm512_set_epi32(e7
, e6
, e5
, e4
, e3
, e2
, e1
, e0
, e7
, e6
, e5
, e4
, e3
, e2
, e1
, e0
);
845 #define _simd16_load_ps _mm512_load_ps
846 #define _simd16_loadu_ps _mm512_loadu_ps
848 #define _simd16_load1_ps _simd16_broadcast_ss
850 #define _simd16_load_si _mm512_load_si512
851 #define _simd16_loadu_si _mm512_loadu_si512
852 #define _simd16_broadcast_ss(m) _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, 0)
853 #define _simd16_broadcast_ps(m) _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, 0)
854 #define _simd16_store_ps _mm512_store_ps
855 #define _simd16_store_si _mm512_store_si512
856 #define _simd16_extract_ps(a, imm8) _mm256_castsi256_ps(_mm512_extracti64x4_epi64(_mm512_castps_si512(a), imm8))
857 #define _simd16_extract_si _mm512_extracti64x4_epi64
858 #define _simd16_insert_ps(a, b, imm8) _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castps_si512(a), _mm256_castps_si256(b), imm8))
859 #define _simd16_insert_si _mm512_inserti64x4
861 INLINE
void SIMDAPI
_simd16_maskstore_ps(float *m
, simd16scalari mask
, simd16scalar a
)
863 simd16mask k
= _simd16_scalari2mask(mask
);
865 _mm512_mask_store_ps(m
, k
, a
);
868 #define _simd16_blend_ps(a, b, mask) _mm512_mask_blend_ps(mask, a, b)
870 INLINE simd16scalar SIMDAPI
_simd16_blendv_ps(simd16scalar a
, simd16scalar b
, const simd16scalar mask
)
872 simd16mask k
= _simd16_scalari2mask(_mm512_castps_si512(mask
));
874 return _mm512_mask_blend_ps(k
, a
, b
);
877 INLINE simd16scalari SIMDAPI
_simd16_blendv_epi32(simd16scalari a
, simd16scalari b
, const simd16scalar mask
)
879 simd16mask k
= _simd16_scalari2mask(_mm512_castps_si512(mask
));
881 return _mm512_mask_blend_epi32(k
, a
, b
);
884 INLINE simd16scalari SIMDAPI
_simd16_blendv_epi32(simd16scalari a
, simd16scalari b
, const simd16scalari mask
)
886 simd16mask k
= _simd16_scalari2mask(mask
);
888 return _mm512_mask_blend_epi32(k
, a
, b
);
891 #define _simd16_mul_ps _mm512_mul_ps
892 #define _simd16_div_ps _mm512_div_ps
893 #define _simd16_add_ps _mm512_add_ps
894 #define _simd16_sub_ps _mm512_sub_ps
895 #define _simd16_rsqrt_ps _mm512_rsqrt14_ps
896 #define _simd16_min_ps _mm512_min_ps
897 #define _simd16_max_ps _mm512_max_ps
899 INLINE simd16mask SIMDAPI
_simd16_movemask_ps(simd16scalar a
)
901 return _simd16_scalari2mask(_mm512_castps_si512(a
));
904 INLINE simd16mask SIMDAPI
_simd16_movemask_pd(simd16scalard a
)
906 return _simd16_scalard2mask(a
);
910 INLINE
int SIMDAPI
_simd16_movemask_epi8(simd16scalari a
)
912 return _simd16_scalar2mask(a
);
916 #define _simd16_cvtps_epi32 _mm512_cvtps_epi32
917 #define _simd16_cvttps_epi32 _mm512_cvttps_epi32
918 #define _simd16_cvtepi32_ps _mm512_cvtepi32_ps
921 INLINE simd16scalar SIMDAPI
_simd16_cmp_ps_temp(simd16scalar a
, simd16scalar b
)
923 simd16mask k
= _mm512_cmp_ps_mask(a
, b
, comp
);
925 return _mm512_castsi512_ps(_mm512_mask_blend_epi32(k
, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF)));
928 #define _simd16_cmp_ps(a, b, comp) _simd16_cmp_ps_temp<comp>(a, b)
930 #define _simd16_cmplt_ps(a, b) _simd16_cmp_ps(a, b, _CMP_LT_OQ)
931 #define _simd16_cmpgt_ps(a, b) _simd16_cmp_ps(a, b, _CMP_GT_OQ)
932 #define _simd16_cmpneq_ps(a, b) _simd16_cmp_ps(a, b, _CMP_NEQ_OQ)
933 #define _simd16_cmpeq_ps(a, b) _simd16_cmp_ps(a, b, _CMP_EQ_OQ)
934 #define _simd16_cmpge_ps(a, b) _simd16_cmp_ps(a, b, _CMP_GE_OQ)
935 #define _simd16_cmple_ps(a, b) _simd16_cmp_ps(a, b, _CMP_LE_OQ)
937 #define _simd16_castsi_ps _mm512_castsi512_ps
938 #define _simd16_castps_si _mm512_castps_si512
939 #define _simd16_castsi_pd _mm512_castsi512_pd
940 #define _simd16_castpd_si _mm512_castpd_si512
941 #define _simd16_castpd_ps _mm512_castpd_ps
942 #define _simd16_castps_pd _mm512_castps_pd
944 #define _simd16_and_ps _mm512_and_ps
945 #define _simd16_andnot_ps _mm512_andnot_ps
946 #define _simd16_or_ps _mm512_or_ps
947 #define _simd16_xor_ps _mm512_xor_ps
950 INLINE simd16scalar SIMDAPI
_simd16_round_ps_temp(simd16scalar a
)
952 return _mm512_roundscale_ps(a
, mode
);
955 #define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a)
957 #define _simd16_mul_epi32 _mm512_mul_epi32
958 #define _simd16_mullo_epi32 _mm512_mullo_epi32
959 #define _simd16_sub_epi32 _mm512_sub_epi32
960 #define _simd16_sub_epi64 _mm512_sub_epi64
961 #define _simd16_min_epi32 _mm512_min_epi32
962 #define _simd16_max_epi32 _mm512_max_epi32
963 #define _simd16_min_epu32 _mm512_min_epu32
964 #define _simd16_max_epu32 _mm512_max_epu32
965 #define _simd16_add_epi32 _mm512_add_epi32
967 #define _simd16_and_si _mm512_and_si512
968 #define _simd16_andnot_si _mm512_andnot_si512
969 #define _simd16_or_si _mm512_or_si512
970 #define _simd16_xor_si _mm512_xor_si512
972 INLINE simd16scalari SIMDAPI
_simd16_cmpeq_epi32(simd16scalari a
, simd16scalari b
)
974 simd16mask k
= _mm512_cmpeq_epi32_mask(a
, b
);
976 return _mm512_mask_blend_epi32(k
, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
979 INLINE simd16scalari SIMDAPI
_simd16_cmpgt_epi32(simd16scalari a
, simd16scalari b
)
981 simd16mask k
= _mm512_cmpgt_epi32_mask(a
, b
);
983 return _mm512_mask_blend_epi32(k
, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
986 INLINE simd16scalari SIMDAPI
_simd16_cmplt_epi32(simd16scalari a
, simd16scalari b
)
988 simd16mask k
= _mm512_cmplt_epi32_mask(a
, b
);
990 return _mm512_mask_blend_epi32(k
, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
993 INLINE
int SIMDAPI
_simd16_testz_ps(simd16scalar a
, simd16scalar b
)
995 int lo
= _simd_testz_ps(_simd16_extract_ps(a
, 0), _simd16_extract_ps(b
, 0));
996 int hi
= _simd_testz_ps(_simd16_extract_ps(a
, 1), _simd16_extract_ps(b
, 1));
1001 #define _simd16_unpacklo_ps _mm512_unpacklo_ps
1002 #define _simd16_unpackhi_ps _mm512_unpackhi_ps
1003 #define _simd16_unpacklo_pd _mm512_unpacklo_pd
1004 #define _simd16_unpackhi_pd _mm512_unpackhi_pd
1005 #define _simd16_unpacklo_epi8 _mm512_unpacklo_epi8
1006 #define _simd16_unpackhi_epi8 _mm512_unpackhi_epi8
1007 #define _simd16_unpacklo_epi16 _mm512_unpacklo_epi16
1008 #define _simd16_unpackhi_epi16 _mm512_unpackhi_epi16
1009 #define _simd16_unpacklo_epi32 _mm512_unpacklo_epi32
1010 #define _simd16_unpackhi_epi32 _mm512_unpackhi_epi32
1011 #define _simd16_unpacklo_epi64 _mm512_unpacklo_epi64
1012 #define _simd16_unpackhi_epi64 _mm512_unpackhi_epi64
1013 #define _simd16_slli_epi32 _mm512_slli_epi32
1014 #define _simd16_srli_epi32 _mm512_srli_epi32
1015 #define _simd16_srai_epi32 _mm512_srai_epi32
1016 #define _simd16_fmadd_ps _mm512_fmadd_ps
1017 #define _simd16_fmsub_ps _mm512_fmsub_ps
1018 #define _simd16_adds_epu8 _mm512_adds_epu8
1019 #define _simd16_subs_epu8 _mm512_subs_epu8
1020 #define _simd16_add_epi8 _mm512_add_epi8
1021 #define _simd16_shuffle_epi8 _mm512_shuffle_epi8
1023 #define _simd16_fmadd_ps _mm512_fmadd_ps
1024 #define _simd16_fmsub_ps _mm512_fmsub_ps
1026 #define _simd16_i32gather_ps(m, index, scale) _mm512_i32gather_ps(index, m, scale)
1028 template <int scale
>
1029 INLINE simd16scalar SIMDAPI
_simd16_mask_i32gather_ps_temp(simd16scalar a
, const float *m
, simd16scalari index
, simd16scalari mask
)
1031 __mmask16 k
= _mm512_cmpneq_epi32_mask(mask
, _mm512_setzero_si512());
1033 return _mm512_mask_i32gather_ps(a
, k
, index
, m
, scale
);
1036 #define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _simd16_mask_i32gather_ps_temp<scale>(a, m, index, mask)
1038 #define _simd16_abs_epi32 _mm512_abs_epi32
1040 INLINE simd16scalari SIMDAPI
_simd16_cmpeq_epi64(simd16scalari a
, simd16scalari b
)
1042 __mmask8 k
= _mm512_cmpeq_epi64_mask(a
, b
);
1044 return _mm512_mask_blend_epi64(k
, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
1047 INLINE simd16scalari SIMDAPI
_simd16_cmpgt_epi64(simd16scalari a
, simd16scalari b
)
1049 __mmask8 k
= _mm512_cmpgt_epi64_mask(a
, b
);
1051 return _mm512_mask_blend_epi64(k
, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
1054 INLINE simd16scalari SIMDAPI
_simd16_cmpeq_epi16(simd16scalari a
, simd16scalari b
)
1056 __mmask32 k
= _mm512_cmpeq_epi16_mask(a
, b
);
1058 return _mm512_mask_blend_epi16(k
, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
1061 INLINE simd16scalari SIMDAPI
_simd16_cmpgt_epi16(simd16scalari a
, simd16scalari b
)
1063 __mmask32 k
= _mm512_cmpgt_epi16_mask(a
, b
);
1065 return _mm512_mask_blend_epi16(k
, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
1068 INLINE simd16scalari SIMDAPI
_simd16_cmpeq_epi8(simd16scalari a
, simd16scalari b
)
1070 __mmask64 k
= _mm512_cmpeq_epi8_mask(a
, b
);
1072 return _mm512_mask_blend_epi8(k
, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
1075 INLINE simd16scalari SIMDAPI
_simd16_cmpgt_epi8(simd16scalari a
, simd16scalari b
)
1077 __mmask64 k
= _mm512_cmpgt_epi8_mask(a
, b
);
1079 return _mm512_mask_blend_epi8(k
, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
1082 #define _simd16_permute_ps(a, i) _mm512_permutexvar_ps(i, a)
1083 #define _simd16_permute_epi32(a, i) _mm512_permutexvar_epi32(i, a)
// Per-lane variable shifts (native AVX-512 path).
// Fix applied: these two macros were cross-wired -- the shift-LEFT
// wrapper expanded to the shift-RIGHT intrinsic and vice versa, silently
// inverting every variable shift on this path.
#define _simd16_sllv_epi32 _mm512_sllv_epi32
#define _simd16_srlv_epi32 _mm512_srlv_epi32
1086 #define _simd16_permute2f128_ps _mm512_shuffle_f32x4
1087 #define _simd16_permute2f128_pd _mm512_shuffle_f64x2
1088 #define _simd16_permute2f128_si _mm512_shuffle_i32x4
1089 #define _simd16_shuffle_ps _mm512_shuffle_ps
1090 #define _simd16_shuffle_pd _mm512_shuffle_pd
1091 #define _simd16_cvtepu8_epi16 _mm512_cvtepu8_epi16
1092 #define _simd16_cvtepu8_epi32 _mm512_cvtepu8_epi32
1093 #define _simd16_cvtepu16_epi32 _mm512_cvtepu16_epi32
1094 #define _simd16_packus_epi16 _mm512_packus_epi16
1095 #define _simd16_packs_epi16 _mm512_packs_epi16
1096 #define _simd16_packus_epi32 _mm512_packus_epi32
1097 #define _simd16_packs_epi32 _mm512_packs_epi32
1100 INLINE simd16scalari SIMDAPI
_simd16_shuffle_epi32_temp(simd16scalari a
, simd16scalari b
)
1102 return _simd16_castps_si(_simd16_shuffle_ps(_simd16_castsi_ps(a
), _simd16_castsi_ps(b
), imm8
));
1105 #define _simd16_shuffle_epi32(a, b, imm8) _simd16_shuffle_epi32_temp<imm8>(a, b)
1108 INLINE simd16scalari SIMDAPI
_simd16_shuffle_epi64_temp(simd16scalari a
, simd16scalari b
)
1110 return _simd16_castpd_si(_simd16_shuffle_pd(_simd16_castsi_pd(a
), _simd16_castsi_pd(b
), imm8
));
1113 #define _simd16_shuffle_epi64(a, b, imm8) _simd16_shuffle_epi64_temp<imm8>(a, b)
1115 INLINE simd16mask SIMDAPI
_simd16_int2mask(int mask
)
1117 return _mm512_int2mask(mask
);
1120 INLINE
int SIMDAPI
_simd16_mask2int(simd16mask mask
)
1122 return _mm512_mask2int(mask
);
1125 INLINE simd16mask SIMDAPI
_simd16_cmplt_ps_mask(simd16scalar a
, simd16scalar b
)
1127 return _mm512_cmplt_ps_mask(a
, b
);
1130 // convert bitmask to vector mask
1131 INLINE simd16scalar SIMDAPI
vMask16(int32_t mask
)
1133 simd16scalari temp
= _simd16_set1_epi32(mask
);
1135 simd16scalari bits
= _simd16_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);
1137 simd16scalari result
= _simd16_cmplt_epi32(_simd16_setzero_si(), _simd16_and_si(temp
, bits
));
1139 return _simd16_castsi_ps(result
);
1142 #endif//ENABLE_AVX512_EMULATION
1144 #endif//ENABLE_AVX512_SIMD16
1146 #endif//__SWR_SIMD16INTRIN_H__