/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/

#ifndef __SWR_SIMD16INTRIN_H__
#define __SWR_SIMD16INTRIN_H__

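// 16-wide SIMD wrappers for the SWR rasterizer. Each _simd16_* operation maps
// either to a native AVX512 intrinsic or, under ENABLE_AVX512_EMULATION, to a
// pair of 8-wide operations on the lo/hi halves of the value.
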
#if ENABLE_AVX512_SIMD16

#if KNOB_SIMD16_WIDTH == 16

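// Emulation path: each 512-bit type is represented as two 256-bit halves,
// with lo holding lanes 0-7 and hi holding lanes 8-15.
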
#if ENABLE_AVX512_EMULATION
struct simd16scalar
{
    simdscalar  lo;
    simdscalar  hi;
};
struct simd16scalard
{
    simdscalard lo;
    simdscalard hi;
};
struct simd16scalari
{
    simdscalari lo;
    simdscalari hi;
};
typedef uint16_t simd16mask;

#define _simd16_masklo(mask) ((mask) & 0xFF)
#define _simd16_maskhi(mask) (((mask) >> 8))
#define _simd16_setmask(hi, lo) (((hi) << 8) | (lo))

#else
typedef __m512    simd16scalar;
typedef __m512d   simd16scalard;
typedef __m512i   simd16scalari;
typedef __mmask16 simd16mask;
#endif//ENABLE_AVX512_EMULATION
#else
#error Unsupported vector width
#endif//KNOB_SIMD16_WIDTH == 16

OSALIGN(union, KNOB_SIMD16_BYTES) simd16vector
{
    simd16scalar v[4]; // x, y, z, w

    struct
    {
        simd16scalar x, y, z, w;
    };

    simd16scalar&       operator[] (const int i)       { return v[i]; }
    const simd16scalar& operator[] (const int i) const { return v[i]; }
};

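// Illustrative sketch of how these wrappers compose (hypothetical 'data'
// pointing to a suitably aligned float[16]); computes r = 2 * data + 1
// across all 16 lanes:
//
//     simd16scalar x = _simd16_set1_ps(2.0f);
//     simd16scalar y = _simd16_load_ps(data);
//     simd16scalar r = _simd16_fmadd_ps(x, y, _simd16_set1_ps(1.0f));
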
#if ENABLE_AVX512_EMULATION

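// The SIMD16_EMU_AVX512_<N> helpers stamp out an emulated 16-wide function
// taking N vector arguments by applying the given 8-wide intrinsic once to
// the lo halves and once to the hi halves.
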
#define SIMD16_EMU_AVX512_0(type, func, intrin) \
INLINE type func()\
{\
    type result;\
\
    result.lo = intrin();\
    result.hi = intrin();\
\
    return result;\
}

#define SIMD16_EMU_AVX512_1(type, func, intrin) \
INLINE type func(type a)\
{\
    type result;\
\
    result.lo = intrin(a.lo);\
    result.hi = intrin(a.hi);\
\
    return result;\
}

#define SIMD16_EMU_AVX512_2(type, func, intrin) \
INLINE type func(type a, type b)\
{\
    type result;\
\
    result.lo = intrin(a.lo, b.lo);\
    result.hi = intrin(a.hi, b.hi);\
\
    return result;\
}

#define SIMD16_EMU_AVX512_3(type, func, intrin) \
INLINE type func(type a, type b, type c)\
{\
    type result;\
\
    result.lo = intrin(a.lo, b.lo, c.lo);\
    result.hi = intrin(a.hi, b.hi, c.hi);\
\
    return result;\
}

SIMD16_EMU_AVX512_0(simd16scalar, _simd16_setzero_ps, _mm256_setzero_ps)
SIMD16_EMU_AVX512_0(simd16scalari, _simd16_setzero_si, _mm256_setzero_si256)

INLINE simd16scalar _simd16_set1_ps(float a)
{
    simd16scalar result;

    result.lo = _mm256_set1_ps(a);
    result.hi = _mm256_set1_ps(a);

    return result;
}

INLINE simd16scalari _simd16_set1_epi8(char a)
{
    simd16scalari result;

    result.lo = _mm256_set1_epi8(a);
    result.hi = _mm256_set1_epi8(a);

    return result;
}

INLINE simd16scalari _simd16_set1_epi32(int a)
{
    simd16scalari result;

    result.lo = _mm256_set1_epi32(a);
    result.hi = _mm256_set1_epi32(a);

    return result;
}

INLINE simd16scalar _simd16_set_ps(float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    simd16scalar result;

    result.lo = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
    result.hi = _mm256_set_ps(e15, e14, e13, e12, e11, e10, e9, e8);

    return result;
}

INLINE simd16scalari _simd16_set_epi32(int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    simd16scalari result;

    result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
    result.hi = _mm256_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8);

    return result;
}

INLINE simd16scalar _simd16_set_ps(float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    simd16scalar result;

    result.lo = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
    result.hi = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);

    return result;
}

INLINE simd16scalari _simd16_set_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    simd16scalari result;

    result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
    result.hi = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);

    return result;
}

INLINE simd16scalar _simd16_load_ps(float const *m)
{
    simd16scalar result;

    float const *n = reinterpret_cast<float const *>(reinterpret_cast<uint8_t const *>(m) + sizeof(result.lo));

    result.lo = _mm256_load_ps(m);
    result.hi = _mm256_load_ps(n);

    return result;
}

INLINE simd16scalar _simd16_loadu_ps(float const *m)
{
    simd16scalar result;

    float const *n = reinterpret_cast<float const *>(reinterpret_cast<uint8_t const *>(m) + sizeof(result.lo));

    result.lo = _mm256_loadu_ps(m);
    result.hi = _mm256_loadu_ps(n);

    return result;
}

INLINE simd16scalar _simd16_load1_ps(float const *m)
{
    simd16scalar result;

    result.lo = _mm256_broadcast_ss(m);
    result.hi = _mm256_broadcast_ss(m);

    return result;
}

INLINE simd16scalari _simd16_load_si(simd16scalari const *m)
{
    simd16scalari result;

    result.lo = _mm256_load_si256(&m[0].lo);
    result.hi = _mm256_load_si256(&m[0].hi);

    return result;
}

INLINE simd16scalari _simd16_loadu_si(simd16scalari const *m)
{
    simd16scalari result;

    result.lo = _mm256_loadu_si256(&m[0].lo);
    result.hi = _mm256_loadu_si256(&m[0].hi);

    return result;
}

INLINE simd16scalar _simd16_broadcast_ss(float const *m)
{
    simd16scalar result;

    result.lo = _mm256_broadcast_ss(m);
    result.hi = _mm256_broadcast_ss(m);

    return result;
}

INLINE simd16scalar _simd16_broadcast_ps(__m128 const *m)
{
    simd16scalar result;

    result.lo = _mm256_broadcast_ps(m);
    result.hi = _mm256_broadcast_ps(m);

    return result;
}

INLINE void _simd16_store_ps(float *m, simd16scalar a)
{
    float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + sizeof(a.lo));

    _mm256_store_ps(m, a.lo);
    _mm256_store_ps(n, a.hi);
}

INLINE void _simd16_maskstore_ps(float *m, simd16scalari mask, simd16scalar a)
{
    float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + sizeof(a.lo));

    _mm256_maskstore_ps(m, mask.lo, a.lo);
    _mm256_maskstore_ps(n, mask.hi, a.hi);
}

INLINE void _simd16_store_si(simd16scalari *m, simd16scalari a)
{
    _mm256_store_si256(&m[0].lo, a.lo);
    _mm256_store_si256(&m[0].hi, a.hi);
}

INLINE simdscalar _simd16_extract_ps(simd16scalar a, int imm8)
{
    switch (imm8)
    {
    case 0:
        return a.lo;
    case 1:
        return a.hi;
    }
    return _simd_set1_ps(0.0f);
}

INLINE simdscalari _simd16_extract_si(simd16scalari a, int imm8)
{
    switch (imm8)
    {
    case 0:
        return a.lo;
    case 1:
        return a.hi;
    }
    return _simd_set1_epi32(0);
}

INLINE simd16scalar _simd16_insert_ps(simd16scalar a, simdscalar b, int imm8)
{
    switch (imm8)
    {
    case 0:
        a.lo = b;
        break;
    case 1:
        a.hi = b;
        break;
    }
    return a;
}

INLINE simd16scalari _simd16_insert_si(simd16scalari a, simdscalari b, int imm8)
{
    switch (imm8)
    {
    case 0:
        a.lo = b;
        break;
    case 1:
        a.hi = b;
        break;
    }
    return a;
}

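// AVX blend/round/shift/gather immediates must be compile-time constants, so
// the _temp functions below take the immediate as a template parameter and a
// function-style macro forwards it.
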
template <simd16mask mask>
INLINE simd16scalar _simd16_blend_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _mm256_blend_ps(a.lo, b.lo, _simd16_masklo(mask));
    result.hi = _mm256_blend_ps(a.hi, b.hi, _simd16_maskhi(mask));

    return result;
}

#define _simd16_blend_ps(a, b, mask) _simd16_blend_ps_temp<mask>(a, b)

SIMD16_EMU_AVX512_3(simd16scalar, _simd16_blendv_ps, _mm256_blendv_ps)

INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalar mask)
{
    simd16scalari result;

    result.lo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), _mm256_castsi256_ps(b.lo), mask.lo));
    result.hi = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), _mm256_castsi256_ps(b.hi), mask.hi));

    return result;
}

INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalari mask)
{
    simd16scalari result;

    result.lo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), _mm256_castsi256_ps(b.lo), _mm256_castsi256_ps(mask.lo)));
    result.hi = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), _mm256_castsi256_ps(b.hi), _mm256_castsi256_ps(mask.hi)));

    return result;
}

SIMD16_EMU_AVX512_2(simd16scalar, _simd16_mul_ps, _mm256_mul_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_add_ps, _mm256_add_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_sub_ps, _mm256_sub_ps)
SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rsqrt_ps, _mm256_rsqrt_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_min_ps, _mm256_min_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_max_ps, _mm256_max_ps)

INLINE simd16mask _simd16_movemask_ps(simd16scalar a)
{
    simd16mask mask;

    reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_ps(a.lo);
    reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_ps(a.hi);

    return mask;
}

INLINE simd16mask _simd16_movemask_pd(simd16scalard a)
{
    simd16mask mask;

    reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_pd(a.lo);
    reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_pd(a.hi);

    return mask;
}

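// NB: _mm256_movemask_epi8 produces 32 bits per 256-bit half; the uint8_t
// stores below retain only the low 8 bits of each half's result.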
INLINE simd16mask _simd16_movemask_epi8(simd16scalari a)
{
    simd16mask mask;

    reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_epi8(a.lo);
    reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_epi8(a.hi);

    return mask;
}

INLINE simd16scalari _simd16_cvtps_epi32(simd16scalar a)
{
    simd16scalari result;

    result.lo = _mm256_cvtps_epi32(a.lo);
    result.hi = _mm256_cvtps_epi32(a.hi);

    return result;
}

INLINE simd16scalari _simd16_cvttps_epi32(simd16scalar a)
{
    simd16scalari result;

    result.lo = _mm256_cvttps_epi32(a.lo);
    result.hi = _mm256_cvttps_epi32(a.hi);

    return result;
}

INLINE simd16scalar _simd16_cvtepi32_ps(simd16scalari a)
{
    simd16scalar result;

    result.lo = _mm256_cvtepi32_ps(a.lo);
    result.hi = _mm256_cvtepi32_ps(a.hi);

    return result;
}

template <int comp>
INLINE simd16scalar _simd16_cmp_ps(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _mm256_cmp_ps(a.lo, b.lo, comp);
    result.hi = _mm256_cmp_ps(a.hi, b.hi, comp);

    return result;
}

#define _simd16_cmplt_ps(a, b) _simd16_cmp_ps<_CMP_LT_OQ>(a, b)
#define _simd16_cmpgt_ps(a, b) _simd16_cmp_ps<_CMP_GT_OQ>(a, b)
#define _simd16_cmpneq_ps(a, b) _simd16_cmp_ps<_CMP_NEQ_OQ>(a, b)
#define _simd16_cmpeq_ps(a, b) _simd16_cmp_ps<_CMP_EQ_OQ>(a, b)
#define _simd16_cmpge_ps(a, b) _simd16_cmp_ps<_CMP_GE_OQ>(a, b)
#define _simd16_cmple_ps(a, b) _simd16_cmp_ps<_CMP_LE_OQ>(a, b)

SIMD16_EMU_AVX512_2(simd16scalar, _simd16_and_ps, _mm256_and_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_or_ps, _mm256_or_ps)
SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rcp_ps, _mm256_rcp_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_div_ps, _mm256_div_ps)

INLINE simd16scalar _simd16_castsi_ps(simd16scalari a)
{
    return *reinterpret_cast<simd16scalar *>(&a);
}

INLINE simd16scalari _simd16_castps_si(simd16scalar a)
{
    return *reinterpret_cast<simd16scalari *>(&a);
}

INLINE simd16scalard _simd16_castsi_pd(simd16scalari a)
{
    return *reinterpret_cast<simd16scalard *>(&a);
}

INLINE simd16scalari _simd16_castpd_si(simd16scalard a)
{
    return *reinterpret_cast<simd16scalari *>(&a);
}

INLINE simd16scalar _simd16_castpd_ps(simd16scalard a)
{
    return *reinterpret_cast<simd16scalar *>(&a);
}

INLINE simd16scalard _simd16_castps_pd(simd16scalar a)
{
    return *reinterpret_cast<simd16scalard *>(&a);
}

SIMD16_EMU_AVX512_2(simd16scalar, _simd16_andnot_ps, _mm256_andnot_ps)

template <int mode>
INLINE simd16scalar _simd16_round_ps_temp(simd16scalar a)
{
    simd16scalar result;

    result.lo = _mm256_round_ps(a.lo, mode);
    result.hi = _mm256_round_ps(a.hi, mode);

    return result;
}

#define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a)

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mul_epi32, _mm256_mul_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mullo_epi32, _mm256_mullo_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi32, _mm256_sub_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi64, _mm256_sub_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epi32, _mm256_min_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epi32, _mm256_max_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epu32, _mm256_min_epu32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epu32, _mm256_max_epu32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi32, _mm256_add_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_and_si, _simd_and_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_andnot_si, _simd_andnot_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_or_si, _simd_or_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_xor_si, _simd_xor_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi32, _mm256_cmpeq_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi32, _mm256_cmpgt_epi32)

INLINE int _simd16_testz_ps(simd16scalar a, simd16scalar b)
{
    int lo = _mm256_testz_ps(a.lo, b.lo);
    int hi = _mm256_testz_ps(a.hi, b.hi);

    return lo & hi;
}

#define _simd16_cmplt_epi32(a, b) _simd16_cmpgt_epi32(b, a)

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi32, _simd_unpacklo_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi32, _simd_unpackhi_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi64, _simd_unpacklo_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi64, _simd_unpackhi_epi64)

template <int imm8>
INLINE simd16scalari _simd16_slli_epi32_temp(simd16scalari a)
{
    simd16scalari result;

    result.lo = _simd_slli_epi32(a.lo, imm8);
    result.hi = _simd_slli_epi32(a.hi, imm8);

    return result;
}

#define _simd16_slli_epi32(a, imm8) _simd16_slli_epi32_temp<imm8>(a)

template <int imm8>
INLINE simd16scalari _simd16_srai_epi32_temp(simd16scalari a)
{
    simd16scalari result;

    result.lo = _simd_srai_epi32(a.lo, imm8);
    result.hi = _simd_srai_epi32(a.hi, imm8);

    return result;
}

#define _simd16_srai_epi32(a, imm8) _simd16_srai_epi32_temp<imm8>(a)

template <int imm8>
INLINE simd16scalari _simd16_srli_epi32_temp(simd16scalari a)
{
    simd16scalari result;

    result.lo = _simd_srli_epi32(a.lo, imm8);
    result.hi = _simd_srli_epi32(a.hi, imm8);

    return result;
}

#define _simd16_srli_epi32(a, imm8) _simd16_srli_epi32_temp<imm8>(a)

SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmadd_ps, _mm256_fmadd_ps)
SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmsub_ps, _mm256_fmsub_ps)

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_shuffle_epi8, _mm256_shuffle_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_adds_epu8, _mm256_adds_epu8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_subs_epu8, _mm256_subs_epu8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi8, _mm256_add_epi8)

template <int imm8>
INLINE simd16scalar _simd16_i32gather_ps_temp(float const *m, simd16scalari a)
{
    simd16scalar result;

    result.lo = _mm256_i32gather_ps(m, a.lo, imm8);
    result.hi = _mm256_i32gather_ps(m, a.hi, imm8);

    return result;
}

#define _simd16_i32gather_ps(m, a, imm8) _simd16_i32gather_ps_temp<imm8>(m, a)

SIMD16_EMU_AVX512_1(simd16scalari, _simd16_abs_epi32, _mm256_abs_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi64, _mm256_cmpeq_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi64, _mm256_cmpgt_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi16, _mm256_cmpeq_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi16, _mm256_cmpgt_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi8, _mm256_cmpeq_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi8, _mm256_cmpgt_epi8)

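// Emulated cross-half permute: _simd_permute_ps only permutes within an
// 8-wide half, so each output half blends two in-half permutes (one sourcing
// a.lo, one sourcing a.hi) according to whether each index selects lane 0-7
// or lane 8-15.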
INLINE simd16scalar _simd16_permute_ps(simd16scalar a, simd16scalari i)
{
    simd16scalar result;

    const simdscalari mask = _simd_set1_epi32(7);

    simdscalar lolo = _simd_permute_ps(a.lo, _simd_and_si(i.lo, mask));
    simdscalar lohi = _simd_permute_ps(a.hi, _simd_and_si(i.lo, mask));

    simdscalar hilo = _simd_permute_ps(a.lo, _simd_and_si(i.hi, mask));
    simdscalar hihi = _simd_permute_ps(a.hi, _simd_and_si(i.hi, mask));

    result.lo = _simd_blendv_ps(lolo, lohi, _simd_castsi_ps(_simd_cmpgt_epi32(i.lo, mask)));
    result.hi = _simd_blendv_ps(hilo, hihi, _simd_castsi_ps(_simd_cmpgt_epi32(i.hi, mask)));

    return result;
}

INLINE simd16scalari _simd16_permute_epi32(simd16scalari a, simd16scalari i)
{
    return _simd16_castps_si(_simd16_permute_ps(_simd16_castsi_ps(a), i));
}

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_srlv_epi32, _mm256_srlv_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sllv_epi32, _mm256_sllv_epi32)

template <int imm8>
INLINE simd16scalar _simd16_permute2f128_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _simd_permute2f128_ps(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
    result.hi = _simd_permute2f128_ps(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));

    return result;
}

#define _simd16_permute2f128_ps(a, b, imm8) _simd16_permute2f128_ps_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalard _simd16_permute2f128_pd_temp(simd16scalard a, simd16scalard b)
{
    simd16scalard result;

    result.lo = _simd_permute2f128_pd(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
    result.hi = _simd_permute2f128_pd(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));

    return result;
}

#define _simd16_permute2f128_pd(a, b, imm8) _simd16_permute2f128_pd_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalari _simd16_permute2f128_si_temp(simd16scalari a, simd16scalari b)
{
    simd16scalari result;

    result.lo = _simd_permute2f128_si(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
    result.hi = _simd_permute2f128_si(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));

    return result;
}

#define _simd16_permute2f128_si(a, b, imm8) _simd16_permute2f128_si_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalar _simd16_shuffle_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _simd_shuffle_ps(a.lo, b.lo, imm8);
    result.hi = _simd_shuffle_ps(a.hi, b.hi, imm8);

    return result;
}

#define _simd16_shuffle_ps(a, b, imm8) _simd16_shuffle_ps_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalard _simd16_shuffle_pd_temp(simd16scalard a, simd16scalard b)
{
    simd16scalard result;

    result.lo = _simd_shuffle_pd(a.lo, b.lo, (imm8 & 15));
    result.hi = _simd_shuffle_pd(a.hi, b.hi, (imm8 >> 4));

    return result;
}

#define _simd16_shuffle_pd(a, b, imm8) _simd16_shuffle_pd_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalari _simd16_shuffle_epi32_temp(simd16scalari a, simd16scalari b)
{
    return _simd16_castps_si(_simd16_shuffle_ps(_simd16_castsi_ps(a), _simd16_castsi_ps(b), imm8));
}

#define _simd16_shuffle_epi32(a, b, imm8) _simd16_shuffle_epi32_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalari _simd16_shuffle_epi64_temp(simd16scalari a, simd16scalari b)
{
    return _simd16_castpd_si(_simd16_shuffle_pd(_simd16_castsi_pd(a), _simd16_castsi_pd(b), imm8));
}

#define _simd16_shuffle_epi64(a, b, imm8) _simd16_shuffle_epi64_temp<imm8>(a, b)

INLINE simd16mask _simd16_int2mask(int mask)
{
    return static_cast<simd16mask>(mask);
}

INLINE int _simd16_mask2int(simd16mask mask)
{
    return mask;
}

INLINE simd16mask _simd16_cmplt_ps_mask(simd16scalar a, simd16scalar b)
{
    return _simd16_movemask_ps(_simd16_cmplt_ps(a, b));
}

// convert bitmask to vector mask
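// (lane i of the result is all ones iff bit i of 'mask' is set: the mask is
// broadcast, ANDed with per-lane single-bit constants, and nonzero lanes are
// widened to ~0 by the compare)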
INLINE simd16scalar vMask16(int32_t mask)
{
    simd16scalari temp = _simd16_set1_epi32(mask);

    simd16scalari bits = _simd16_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);

    simd16scalari result = _simd16_cmplt_epi32(_simd16_setzero_si(), _simd16_and_si(temp, bits));

    return _simd16_castsi_ps(result);
}

#else

INLINE simd16mask _simd16_scalari2mask(simd16scalari mask)
{
    return _mm512_cmpneq_epu32_mask(mask, _mm512_setzero_epi32());
}

// NOTE: takes the operand as an integer vector; _mm512_cmpneq_epu64_mask
// operates on __m512i, and _mm512_setzero_si512 stands in for the
// nonexistent _mm512_setzero_epi64.
INLINE simd16mask _simd16_scalard2mask(simd16scalari mask)
{
    return _mm512_cmpneq_epu64_mask(mask, _mm512_setzero_si512());
}

#define _simd16_setzero_ps _mm512_setzero_ps
#define _simd16_setzero_si _mm512_setzero_si512
#define _simd16_set1_ps _mm512_set1_ps
#define _simd16_set1_epi8 _mm512_set1_epi8
#define _simd16_set1_epi32 _mm512_set1_epi32

INLINE simd16scalar _simd16_set_ps(float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    return _mm512_set_ps(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
}

INLINE simd16scalari _simd16_set_epi32(int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    return _mm512_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
}

INLINE simd16scalar _simd16_set_ps(float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    return _mm512_set_ps(e7, e6, e5, e4, e3, e2, e1, e0, e7, e6, e5, e4, e3, e2, e1, e0);
}

INLINE simd16scalari _simd16_set_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    return _mm512_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0, e7, e6, e5, e4, e3, e2, e1, e0);
}

#define _simd16_load_ps _mm512_load_ps
#define _simd16_loadu_ps _mm512_loadu_ps
#define _simd16_load1_ps _simd16_broadcast_ss
#define _simd16_load_si _mm512_load_si512
#define _simd16_loadu_si _mm512_loadu_si512

// NOTE: _mm512_extload_ps is a KNC-only intrinsic; AVX512F equivalents are
// assumed here instead.
#define _simd16_broadcast_ss(m) _mm512_set1_ps(*(m))
#define _simd16_broadcast_ps(m) _mm512_broadcast_f32x4(*(m))

#define _simd16_store_ps _mm512_store_ps
#define _simd16_store_si _mm512_store_si512
#define _simd16_extract_ps _mm512_extractf32x8_ps
#define _simd16_extract_si _mm512_extracti32x8_epi32
#define _simd16_insert_ps _mm512_insertf32x8
#define _simd16_insert_si _mm512_inserti32x8

INLINE void _simd16_maskstore_ps(float *m, simd16scalari mask, simd16scalar a)
{
    simd16mask k = _simd16_scalari2mask(mask);

    _mm512_mask_store_ps(m, k, a);
}

#define _simd16_blend_ps(a, b, mask) _mm512_mask_blend_ps(mask, a, b)

INLINE simd16scalar _simd16_blendv_ps(simd16scalar a, simd16scalar b, const simd16scalar mask)
{
    simd16mask k = _simd16_scalari2mask(_mm512_castps_si512(mask));

    return _mm512_mask_blend_ps(k, a, b);
}

INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalar mask)
{
    simd16mask k = _simd16_scalari2mask(_mm512_castps_si512(mask));

    return _mm512_mask_blend_epi32(k, a, b);
}

INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalari mask)
{
    simd16mask k = _simd16_scalari2mask(mask);

    return _mm512_mask_blend_epi32(k, a, b);
}

#define _simd16_mul_ps _mm512_mul_ps
#define _simd16_add_ps _mm512_add_ps
#define _simd16_sub_ps _mm512_sub_ps
#define _simd16_rsqrt_ps _mm512_rsqrt14_ps
#define _simd16_min_ps _mm512_min_ps
#define _simd16_max_ps _mm512_max_ps

INLINE simd16mask _simd16_movemask_ps(simd16scalar a)
{
    return _simd16_scalari2mask(_mm512_castps_si512(a));
}

INLINE simd16mask _simd16_movemask_pd(simd16scalard a)
{
    return _simd16_scalard2mask(_mm512_castpd_si512(a));
}

INLINE int _simd16_movemask_epi8(simd16scalari a)
{
    return _simd16_scalari2mask(a);
}

#define _simd16_cvtps_epi32 _mm512_cvtps_epi32
#define _simd16_cvttps_epi32 _mm512_cvttps_epi32
#define _simd16_cvtepi32_ps _mm512_cvtepi32_ps

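// AVX512 compares produce a 16-bit predicate mask; the blend below widens it
// back into an AVX2-style all-ones/all-zeros vector so both code paths hand
// callers the same representation.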
template <int comp>
INLINE simd16scalar _simd16_cmp_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16mask k = _mm512_cmp_ps_mask(a, b, comp);

    return _mm512_castsi512_ps(_mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF)));
}

#define _simd16_cmp_ps(a, b, comp) _simd16_cmp_ps_temp<comp>(a, b)

#define _simd16_cmplt_ps(a, b) _simd16_cmp_ps(a, b, _CMP_LT_OQ)
#define _simd16_cmpgt_ps(a, b) _simd16_cmp_ps(a, b, _CMP_GT_OQ)
#define _simd16_cmpneq_ps(a, b) _simd16_cmp_ps(a, b, _CMP_NEQ_OQ)
#define _simd16_cmpeq_ps(a, b) _simd16_cmp_ps(a, b, _CMP_EQ_OQ)
#define _simd16_cmpge_ps(a, b) _simd16_cmp_ps(a, b, _CMP_GE_OQ)
#define _simd16_cmple_ps(a, b) _simd16_cmp_ps(a, b, _CMP_LE_OQ)

#define _simd16_castsi_ps _mm512_castsi512_ps
#define _simd16_castps_si _mm512_castps_si512
#define _simd16_castsi_pd _mm512_castsi512_pd
#define _simd16_castpd_si _mm512_castpd_si512
#define _simd16_castpd_ps _mm512_castpd_ps
#define _simd16_castps_pd _mm512_castps_pd

#define _simd16_andnot_ps _mm512_andnot_ps

template <int mode>
INLINE simd16scalar _simd16_round_ps_temp(simd16scalar a)
{
    return _mm512_roundscale_ps(a, mode);
}

#define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a)

#define _simd16_mul_epi32 _mm512_mul_epi32
#define _simd16_mullo_epi32 _mm512_mullo_epi32
#define _simd16_sub_epi32 _mm512_sub_epi32
#define _simd16_sub_epi64 _mm512_sub_epi64
#define _simd16_min_epi32 _mm512_min_epi32
#define _simd16_max_epi32 _mm512_max_epi32
#define _simd16_min_epu32 _mm512_min_epu32
#define _simd16_max_epu32 _mm512_max_epu32
#define _simd16_add_epi32 _mm512_add_epi32
#define _simd16_and_si _mm512_and_si512
#define _simd16_andnot_si _mm512_andnot_si512
#define _simd16_or_si _mm512_or_si512
#define _simd16_xor_si _mm512_xor_si512

INLINE simd16scalari _simd16_cmpeq_epi32(simd16scalari a, simd16scalari b)
{
    simd16mask k = _mm512_cmpeq_epi32_mask(a, b);

    return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpgt_epi32(simd16scalari a, simd16scalari b)
{
    simd16mask k = _mm512_cmpgt_epi32_mask(a, b);

    return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmplt_epi32(simd16scalari a, simd16scalari b)
{
    simd16mask k = _mm512_cmplt_epi32_mask(a, b);

    return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
}

// NOTE: assumed native-AVX512 form; __m512 has no lo/hi halves, so the sign
// bits of (a AND b) are tested directly, matching _mm256_testz_ps semantics
// (returns 1 when no sign bit of the AND is set).
INLINE int _simd16_testz_ps(simd16scalar a, simd16scalar b)
{
    simd16scalari and_ab = _mm512_and_si512(_mm512_castps_si512(a), _mm512_castps_si512(b));

    return _mm512_cmplt_epi32_mask(and_ab, _mm512_setzero_epi32()) == 0;
}

#define _simd16_unpacklo_epi32 _mm512_unpacklo_epi32
#define _simd16_unpackhi_epi32 _mm512_unpackhi_epi32
#define _simd16_unpacklo_epi64 _mm512_unpacklo_epi64
#define _simd16_unpackhi_epi64 _mm512_unpackhi_epi64
#define _simd16_slli_epi32 _mm512_slli_epi32
#define _simd16_srli_epi32 _mm512_srli_epi32
#define _simd16_srai_epi32 _mm512_srai_epi32
#define _simd16_fmadd_ps _mm512_fmadd_ps
#define _simd16_fmsub_ps _mm512_fmsub_ps
#define _simd16_adds_epu8 _mm512_adds_epu8
#define _simd16_subs_epu8 _mm512_subs_epu8
#define _simd16_add_epi8 _mm512_add_epi8
#define _simd16_shuffle_epi8 _mm512_shuffle_epi8

#define _simd16_i32gather_ps(m, index, scale) _mm512_i32gather_ps(index, m, scale)

#define _simd16_abs_epi32 _mm512_abs_epi32

INLINE simd16scalari _simd16_cmpeq_epi64(simd16scalari a, simd16scalari b)
{
    __mmask8 k = _mm512_cmpeq_epi64_mask(a, b);

    return _mm512_mask_blend_epi64(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpgt_epi64(simd16scalari a, simd16scalari b)
{
    __mmask8 k = _mm512_cmpgt_epi64_mask(a, b);

    return _mm512_mask_blend_epi64(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpeq_epi16(simd16scalari a, simd16scalari b)
{
    __mmask32 k = _mm512_cmpeq_epi16_mask(a, b);

    return _mm512_mask_blend_epi16(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpgt_epi16(simd16scalari a, simd16scalari b)
{
    __mmask32 k = _mm512_cmpgt_epi16_mask(a, b);

    return _mm512_mask_blend_epi16(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpeq_epi8(simd16scalari a, simd16scalari b)
{
    __mmask64 k = _mm512_cmpeq_epi8_mask(a, b);

    return _mm512_mask_blend_epi8(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpgt_epi8(simd16scalari a, simd16scalari b)
{
    __mmask64 k = _mm512_cmpgt_epi8_mask(a, b);

    return _mm512_mask_blend_epi8(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

#define _simd16_permute_ps(a, i) _mm512_permutexvar_ps(i, a)
#define _simd16_permute_epi32(a, i) _mm512_permutexvar_epi32(i, a)
#define _simd16_sllv_epi32 _mm512_sllv_epi32
#define _simd16_srlv_epi32 _mm512_srlv_epi32
#define _simd16_permute2f128_ps _mm512_shuffle_f32x4
#define _simd16_permute2f128_pd _mm512_shuffle_f64x2
#define _simd16_permute2f128_si _mm512_shuffle_i32x4
#define _simd16_shuffle_ps _mm512_shuffle_ps
#define _simd16_shuffle_pd _mm512_shuffle_pd

template <int imm8>
INLINE simd16scalari _simd16_shuffle_epi32_temp(simd16scalari a, simd16scalari b)
{
    return _simd16_castps_si(_simd16_shuffle_ps(_simd16_castsi_ps(a), _simd16_castsi_ps(b), imm8));
}

#define _simd16_shuffle_epi32(a, b, imm8) _simd16_shuffle_epi32_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalari _simd16_shuffle_epi64_temp(simd16scalari a, simd16scalari b)
{
    return _simd16_castpd_si(_simd16_shuffle_pd(_simd16_castsi_pd(a), _simd16_castsi_pd(b), imm8));
}

#define _simd16_shuffle_epi64(a, b, imm8) _simd16_shuffle_epi64_temp<imm8>(a, b)

INLINE simd16mask _simd16_int2mask(int mask)
{
    return _mm512_int2mask(mask);
}

INLINE int _simd16_mask2int(simd16mask mask)
{
    return _mm512_mask2int(mask);
}

INLINE simd16mask _simd16_cmplt_ps_mask(simd16scalar a, simd16scalar b)
{
    return _mm512_cmplt_ps_mask(a, b);
}

// convert bitmask to vector mask
INLINE simd16scalar vMask16(int32_t mask)
{
    simd16scalari temp = _simd16_set1_epi32(mask);

    simd16scalari bits = _simd16_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);

    simd16scalari result = _simd16_cmplt_epi32(_simd16_setzero_si(), _simd16_and_si(temp, bits));

    return _simd16_castsi_ps(result);
}

#endif//ENABLE_AVX512_EMULATION

#endif//ENABLE_AVX512_SIMD16

#endif//__SWR_SIMD16INTRIN_H__