rasterizer/common/rdtsc_buckets_shared.h \
rasterizer/common/simd16intrin.h \
rasterizer/common/simdintrin.h \
+ rasterizer/common/simdlib.hpp \
+ rasterizer/common/simdlib_128_avx.inl \
+ rasterizer/common/simdlib_128_avx2.inl \
+ rasterizer/common/simdlib_128_avx512.inl \
+ rasterizer/common/simdlib_256_avx.inl \
+ rasterizer/common/simdlib_256_avx2.inl \
+ rasterizer/common/simdlib_256_avx512.inl \
+ rasterizer/common/simdlib_512_avx512.inl \
+ rasterizer/common/simdlib_512_avx512_masks.inl \
+ rasterizer/common/simdlib_512_emu.inl \
+ rasterizer/common/simdlib_512_emu_masks.inl \
+ rasterizer/common/simdlib_interface.hpp \
+ rasterizer/common/simdlib_types.hpp \
rasterizer/common/swr_assert.cpp \
rasterizer/common/swr_assert.h
#include "os.h"
-#include <cassert>
-
-#include <emmintrin.h>
-#include <immintrin.h>
-#include <xmmintrin.h>
+#define SIMD_ARCH KNOB_ARCH
+#include "simdlib_types.hpp"
+
+typedef SIMDImpl::SIMD128Impl::Float simd4scalar;
+typedef SIMDImpl::SIMD128Impl::Double simd4scalard;
+typedef SIMDImpl::SIMD128Impl::Integer simd4scalari;
+typedef SIMDImpl::SIMD128Impl::Vec4 simd4vector;
+typedef SIMDImpl::SIMD128Impl::Mask simd4mask;
+
+typedef SIMDImpl::SIMD256Impl::Float simd8scalar;
+typedef SIMDImpl::SIMD256Impl::Double simd8scalard;
+typedef SIMDImpl::SIMD256Impl::Integer simd8scalari;
+typedef SIMDImpl::SIMD256Impl::Vec4 simd8vector;
+typedef SIMDImpl::SIMD256Impl::Mask simd8mask;
+
+typedef SIMDImpl::SIMD512Impl::Float simd16scalar;
+typedef SIMDImpl::SIMD512Impl::Double simd16scalard;
+typedef SIMDImpl::SIMD512Impl::Integer simd16scalari;
+typedef SIMDImpl::SIMD512Impl::Vec4 simd16vector;
+typedef SIMDImpl::SIMD512Impl::Mask simd16mask;
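
// The numeric prefix encodes the lane count: simd4* types are 128-bit, simd8*
// are 256-bit, and simd16* are 512-bit. Illustrative sanity check (disabled),
// assuming the SIMDImpl wrappers add no padding over the underlying vectors:
#if 0
static_assert(sizeof(simd4scalar)  == 16, "simd4scalar should be 128 bits");
static_assert(sizeof(simd8scalar)  == 32, "simd8scalar should be 256 bits");
static_assert(sizeof(simd16scalar) == 64, "simd16scalar should be 512 bits");
#endif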
#if KNOB_SIMD_WIDTH == 8
-typedef __m256 simdscalar;
-typedef __m256i simdscalari;
-typedef uint8_t simdmask;
-#else
-#error Unsupported vector width
-#endif
-
-// simd vector
-OSALIGNSIMD(union) simdvector
-{
- simdscalar v[4];
- struct
- {
- simdscalar x, y, z, w;
- };
-
- simdscalar& operator[] (const int i) { return v[i]; }
- const simdscalar& operator[] (const int i) const { return v[i]; }
-};
-
-#if ENABLE_AVX512_SIMD16
-
-#if KNOB_SIMD16_WIDTH == 16
-
-#if ENABLE_AVX512_EMULATION
-struct simd16scalar
-{
- __m256 lo;
- __m256 hi;
-};
-struct simd16scalard
-{
- __m256d lo;
- __m256d hi;
-};
-struct simd16scalari
-{
- __m256i lo;
- __m256i hi;
-};
-typedef uint16_t simd16mask;
-
-#else
-typedef __m512 simd16scalar;
-typedef __m512d simd16scalard;
-typedef __m512i simd16scalari;
-typedef __mmask16 simd16mask;
-#endif//ENABLE_AVX512_EMULATION
+typedef simd8scalar simdscalar;
+typedef simd8scalard simdscalard;
+typedef simd8scalari simdscalari;
+typedef simd8vector simdvector;
+typedef simd8mask simdmask;
#else
#error Unsupported vector width
-#endif//KNOB_SIMD16_WIDTH == 16
-
-#define _simd16_masklo(mask) ((mask) & 0xFF)
-#define _simd16_maskhi(mask) (((mask) >> 8) & 0xFF)
-#define _simd16_setmask(hi, lo) (((hi) << 8) | (lo))
-
-#if defined(_WIN32)
-#define SIMDAPI __vectorcall
-#else
-#define SIMDAPI
#endif
-OSALIGN(union, KNOB_SIMD16_BYTES) simd16vector
-{
- simd16scalar v[4];
- struct
- {
- simd16scalar x, y, z, w;
- };
-
- simd16scalar& operator[] (const int i) { return v[i]; }
- const simd16scalar& operator[] (const int i) const { return v[i]; }
-};
-
-#endif // ENABLE_AVX512_SIMD16
-
INLINE
UINT pdep_u32(UINT a, UINT mask)
{
#if ENABLE_AVX512_SIMD16
-#if ENABLE_AVX512_EMULATION
-
-#define SIMD16_EMU_AVX512_0(type, func, intrin) \
-INLINE type SIMDAPI func()\
-{\
- type result;\
-\
- result.lo = intrin();\
- result.hi = intrin();\
-\
- return result;\
-}
-
-#define SIMD16_EMU_AVX512_1(type, func, intrin) \
-INLINE type SIMDAPI func(type a)\
-{\
- type result;\
-\
- result.lo = intrin(a.lo);\
- result.hi = intrin(a.hi);\
-\
- return result;\
-}
-
-#define SIMD16_EMU_AVX512_2(type, func, intrin) \
-INLINE type SIMDAPI func(type a, type b)\
-{\
- type result;\
-\
- result.lo = intrin(a.lo, b.lo);\
- result.hi = intrin(a.hi, b.hi);\
-\
- return result;\
-}
-
-#define SIMD16_EMU_AVX512_3(type, func, intrin) \
-INLINE type SIMDAPI func(type a, type b, type c)\
-{\
- type result;\
-\
- result.lo = intrin(a.lo, b.lo, c.lo);\
- result.hi = intrin(a.hi, b.hi, c.hi);\
-\
- return result;\
-}
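
// Illustrative expansion (disabled): each SIMD16_EMU_AVX512_n wrapper simply
// applies the 256-bit intrinsic to the lo and hi halves. For example, the
// _simd16_add_ps instantiation further below is equivalent to:
#if 0
INLINE simd16scalar SIMDAPI _simd16_add_ps(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _mm256_add_ps(a.lo, b.lo); // low 8 lanes
    result.hi = _mm256_add_ps(a.hi, b.hi); // high 8 lanes

    return result;
}
#endif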
-
-SIMD16_EMU_AVX512_0(simd16scalar, _simd16_setzero_ps, _mm256_setzero_ps)
-SIMD16_EMU_AVX512_0(simd16scalari, _simd16_setzero_si, _mm256_setzero_si256)
-
-INLINE simd16scalar SIMDAPI _simd16_set1_ps(float a)
-{
- simd16scalar result;
-
- result.lo = _mm256_set1_ps(a);
- result.hi = _mm256_set1_ps(a);
-
- return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_set1_epi8(char a)
-{
- simd16scalari result;
-
- result.lo = _mm256_set1_epi8(a);
- result.hi = _mm256_set1_epi8(a);
-
- return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_set1_epi32(int a)
-{
- simd16scalari result;
-
- result.lo = _mm256_set1_epi32(a);
- result.hi = _mm256_set1_epi32(a);
-
- return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_set_ps(float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
-{
- simd16scalar result;
-
- result.lo = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
- result.hi = _mm256_set_ps(e15, e14, e13, e12, e11, e10, e9, e8);
-
- return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_set_epi32(int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
-{
- simd16scalari result;
-
- result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
- result.hi = _mm256_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8);
-
- return result;
-}
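
// Note the Intel set-intrinsic argument convention: e0 is the least-significant
// lane, so the low 256-bit half receives e7..e0 and the high half e15..e8.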
-
-INLINE simd16scalar SIMDAPI _simd16_set_ps(float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
-{
- simd16scalar result;
-
- result.lo = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
- result.hi = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
-
- return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_set_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
-{
- simd16scalari result;
-
- result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
- result.hi = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
-
- return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_load_ps(float const *m)
-{
- simd16scalar result;
-
- float const *n = reinterpret_cast<float const *>(reinterpret_cast<uint8_t const *>(m) + sizeof(result.lo));
-
- result.lo = _mm256_load_ps(m);
- result.hi = _mm256_load_ps(n);
-
- return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_loadu_ps(float const *m)
-{
- simd16scalar result;
-
- float const *n = reinterpret_cast<float const *>(reinterpret_cast<uint8_t const *>(m) + sizeof(result.lo));
-
- result.lo = _mm256_loadu_ps(m);
- result.hi = _mm256_loadu_ps(n);
-
- return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_load1_ps(float const *m)
-{
- simd16scalar result;
-
- result.lo = _mm256_broadcast_ss(m);
- result.hi = _mm256_broadcast_ss(m);
-
- return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_load_si(simd16scalari const *m)
-{
- simd16scalari result;
-
- result.lo = _mm256_load_si256(&m[0].lo);
- result.hi = _mm256_load_si256(&m[0].hi);
-
- return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_loadu_si(simd16scalari const *m)
-{
- simd16scalari result;
-
- result.lo = _mm256_loadu_si256(&m[0].lo);
- result.hi = _mm256_loadu_si256(&m[0].hi);
-
- return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_broadcast_ss(float const *m)
-{
- simd16scalar result;
-
- result.lo = _mm256_broadcast_ss(m);
- result.hi = _mm256_broadcast_ss(m);
-
- return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_broadcast_ps(__m128 const *m)
-{
- simd16scalar result;
-
- result.lo = _mm256_broadcast_ps(m);
- result.hi = _mm256_broadcast_ps(m);
-
- return result;
-}
-
-INLINE void SIMDAPI _simd16_store_ps(float *m, simd16scalar a)
-{
- float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + sizeof(a.lo));
-
- _mm256_store_ps(m, a.lo);
- _mm256_store_ps(n, a.hi);
-}
-
-INLINE void SIMDAPI _simd16_maskstore_ps(float *m, simd16scalari mask, simd16scalar a)
-{
- float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + sizeof(a.lo));
-
- _mm256_maskstore_ps(m, mask.lo, a.lo);
- _mm256_maskstore_ps(n, mask.hi, a.hi);
-}
-
-INLINE void SIMDAPI _simd16_store_si(simd16scalari *m, simd16scalari a)
-{
- _mm256_store_si256(&m[0].lo, a.lo);
- _mm256_store_si256(&m[0].hi, a.hi);
-}
-
-INLINE simdscalar SIMDAPI _simd16_extract_ps(simd16scalar a, int imm8)
-{
- switch (imm8)
- {
- case 0:
- return a.lo;
- case 1:
- return a.hi;
- }
- return _simd_set1_ps(0.0f);
-}
-
-INLINE simdscalari SIMDAPI _simd16_extract_si(simd16scalari a, int imm8)
-{
- switch (imm8)
- {
- case 0:
- return a.lo;
- case 1:
- return a.hi;
- }
- return _simd_set1_epi32(0);
-}
-
-INLINE simd16scalar SIMDAPI _simd16_insert_ps(simd16scalar a, simdscalar b, int imm8)
-{
- switch (imm8)
- {
- case 0:
- a.lo = b;
- break;
- case 1:
- a.hi = b;
- break;
- }
- return a;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_insert_si(simd16scalari a, simdscalari b, int imm8)
-{
- switch (imm8)
- {
- case 0:
- a.lo = b;
- break;
- case 1:
- a.hi = b;
- break;
- }
- return a;
-}
-
-template <simd16mask mask>
-INLINE simd16scalar SIMDAPI _simd16_blend_ps_temp(simd16scalar a, simd16scalar b)
-{
- simd16scalar result;
-
- result.lo = _mm256_blend_ps(a.lo, b.lo, _simd16_masklo(mask));
- result.hi = _mm256_blend_ps(a.hi, b.hi, _simd16_maskhi(mask));
-
- return result;
-}
-
-#define _simd16_blend_ps(a, b, mask) _simd16_blend_ps_temp<mask>(a, b)
-
-SIMD16_EMU_AVX512_3(simd16scalar, _simd16_blendv_ps, _mm256_blendv_ps)
-
-INLINE simd16scalari SIMDAPI _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalar mask)
-{
- simd16scalari result;
-
- result.lo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), _mm256_castsi256_ps(b.lo), mask.lo));
- result.hi = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), _mm256_castsi256_ps(b.hi), mask.hi));
-
- return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalari mask)
-{
- simd16scalari result;
-
- result.lo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), _mm256_castsi256_ps(b.lo), _mm256_castsi256_ps(mask.lo)));
- result.hi = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), _mm256_castsi256_ps(b.hi), _mm256_castsi256_ps(mask.hi)));
-
- return result;
-}
-
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_mul_ps, _mm256_mul_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_add_ps, _mm256_add_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_sub_ps, _mm256_sub_ps)
-SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rsqrt_ps, _mm256_rsqrt_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_min_ps, _mm256_min_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_max_ps, _mm256_max_ps)
-
-INLINE simd16mask SIMDAPI _simd16_movemask_ps(simd16scalar a)
-{
- simdmask mask_lo = _mm256_movemask_ps(a.lo);
- simdmask mask_hi = _mm256_movemask_ps(a.hi);
-
- return static_cast<simd16mask>(mask_lo) | (static_cast<simd16mask>(mask_hi) << 8);
-}
-
-INLINE simd16mask SIMDAPI _simd16_movemask_pd(simd16scalard a)
-{
- simdmask mask_lo = _mm256_movemask_pd(a.lo);
- simdmask mask_hi = _mm256_movemask_pd(a.hi);
-
- return static_cast<simd16mask>(mask_lo) | (static_cast<simd16mask>(mask_hi) << 4);
-}
-
-INLINE uint64_t SIMDAPI _simd16_movemask_epi8(simd16scalari a)
-{
- uint32_t mask_lo = _mm256_movemask_epi8(a.lo);
- uint32_t mask_hi = _mm256_movemask_epi8(a.hi);
-
- return static_cast<uint64_t>(mask_lo) | (static_cast<uint64_t>(mask_hi) << 32);
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cvtps_epi32(simd16scalar a)
-{
- simd16scalari result;
-
- result.lo = _mm256_cvtps_epi32(a.lo);
- result.hi = _mm256_cvtps_epi32(a.hi);
-
- return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cvttps_epi32(simd16scalar a)
-{
- simd16scalari result;
-
- result.lo = _mm256_cvttps_epi32(a.lo);
- result.hi = _mm256_cvttps_epi32(a.hi);
-
- return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_cvtepi32_ps(simd16scalari a)
-{
- simd16scalar result;
-
- result.lo = _mm256_cvtepi32_ps(a.lo);
- result.hi = _mm256_cvtepi32_ps(a.hi);
-
- return result;
-}
-
-template <int comp>
-INLINE simd16scalar SIMDAPI _simd16_cmp_ps_temp(simd16scalar a, simd16scalar b)
-{
- simd16scalar result;
-
- result.lo = _mm256_cmp_ps(a.lo, b.lo, comp);
- result.hi = _mm256_cmp_ps(a.hi, b.hi, comp);
-
- return result;
-}
-
-#define _simd16_cmp_ps(a, b, comp) _simd16_cmp_ps_temp<comp>(a, b)
-
-#define _simd16_cmplt_ps(a, b) _simd16_cmp_ps(a, b, _CMP_LT_OQ)
-#define _simd16_cmpgt_ps(a, b) _simd16_cmp_ps(a, b, _CMP_GT_OQ)
-#define _simd16_cmpneq_ps(a, b) _simd16_cmp_ps(a, b, _CMP_NEQ_OQ)
-#define _simd16_cmpeq_ps(a, b) _simd16_cmp_ps(a, b, _CMP_EQ_OQ)
-#define _simd16_cmpge_ps(a, b) _simd16_cmp_ps(a, b, _CMP_GE_OQ)
-#define _simd16_cmple_ps(a, b) _simd16_cmp_ps(a, b, _CMP_LE_OQ)
-
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_and_ps, _simd_and_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_andnot_ps, _simd_andnot_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_or_ps, _simd_or_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_xor_ps, _simd_xor_ps)
-
-SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rcp_ps, _simd_rcp_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_div_ps, _simd_div_ps)
-
-INLINE simd16scalar SIMDAPI _simd16_castsi_ps(simd16scalari a)
-{
- return *reinterpret_cast<simd16scalar *>(&a);
-}
-
-INLINE simd16scalari SIMDAPI _simd16_castps_si(simd16scalar a)
-{
- return *reinterpret_cast<simd16scalari *>(&a);
-}
-
-INLINE simd16scalard SIMDAPI _simd16_castsi_pd(simd16scalari a)
-{
- return *reinterpret_cast<simd16scalard *>(&a);
-}
-
-INLINE simd16scalari SIMDAPI _simd16_castpd_si(simd16scalard a)
-{
- return *reinterpret_cast<simd16scalari *>(&a);
-}
-
-INLINE simd16scalar SIMDAPI _simd16_castpd_ps(simd16scalard a)
-{
- return *reinterpret_cast<simd16scalar *>(&a);
-}
-
-INLINE simd16scalard SIMDAPI _simd16_castps_pd(simd16scalar a)
-{
- return *reinterpret_cast<simd16scalard *>(&a);
-}
-
-template <int mode>
-INLINE simd16scalar SIMDAPI _simd16_round_ps_temp(simd16scalar a)
-{
- simd16scalar result;
-
- result.lo = _mm256_round_ps(a.lo, mode);
- result.hi = _mm256_round_ps(a.hi, mode);
-
- return result;
-}
-
-#define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a)
-
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mul_epi32, _simd_mul_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mullo_epi32, _simd_mullo_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi32, _simd_sub_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi64, _simd_sub_epi64)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epi32, _simd_min_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epi32, _simd_max_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epu32, _simd_min_epu32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epu32, _simd_max_epu32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi32, _simd_add_epi32)
-
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_and_si, _simd_and_si)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_andnot_si, _simd_andnot_si)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_or_si, _simd_or_si)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_xor_si, _simd_xor_si)
-
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi32, _simd_cmpeq_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi32, _simd_cmpgt_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmplt_epi32, _simd_cmplt_epi32)
-
-INLINE int SIMDAPI _simd16_testz_ps(simd16scalar a, simd16scalar b)
-{
- int lo = _simd_testz_ps(a.lo, b.lo);
- int hi = _simd_testz_ps(a.hi, b.hi);
-
- return lo & hi;
-}
-
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_unpacklo_ps, _simd_unpacklo_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_unpackhi_ps, _simd_unpackhi_ps)
-SIMD16_EMU_AVX512_2(simd16scalard, _simd16_unpacklo_pd, _simd_unpacklo_pd)
-SIMD16_EMU_AVX512_2(simd16scalard, _simd16_unpackhi_pd, _simd_unpackhi_pd)
-
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi8, _simd_unpacklo_epi8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi8, _simd_unpackhi_epi8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi16, _simd_unpacklo_epi16)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi16, _simd_unpackhi_epi16)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi32, _simd_unpacklo_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi32, _simd_unpackhi_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi64, _simd_unpacklo_epi64)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi64, _simd_unpackhi_epi64)
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_slli_epi32_temp(simd16scalari a)
-{
- simd16scalari result;
-
- result.lo = _simd_slli_epi32(a.lo, imm8);
- result.hi = _simd_slli_epi32(a.hi, imm8);
-
- return result;
-}
-
-#define _simd16_slli_epi32(a, imm8) _simd16_slli_epi32_temp<imm8>(a)
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_srai_epi32_temp(simd16scalari a)
-{
- simd16scalari result;
-
- result.lo = _simd_srai_epi32(a.lo, imm8);
- result.hi = _simd_srai_epi32(a.hi, imm8);
-
- return result;
-}
-
-#define _simd16_srai_epi32(a, imm8) _simd16_srai_epi32_temp<imm8>(a)
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_srli_epi32_temp(simd16scalari a)
-{
- simd16scalari result;
-
- result.lo = _simd_srli_epi32(a.lo, imm8);
- result.hi = _simd_srli_epi32(a.hi, imm8);
-
- return result;
-}
-
-#define _simd16_srli_epi32(a, imm8) _simd16_srli_epi32_temp<imm8>(a)
-
-SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmadd_ps, _simd_fmadd_ps)
-SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmsub_ps, _simd_fmsub_ps)
-
-template <int scale>
-INLINE simd16scalar SIMDAPI _simd16_i32gather_ps_temp(const float *m, simd16scalari index)
-{
- simd16scalar result;
-
- result.lo = _simd_i32gather_ps(m, index.lo, scale);
- result.hi = _simd_i32gather_ps(m, index.hi, scale);
-
- return result;
-}
-
-#define _simd16_i32gather_ps(m, index, scale) _simd16_i32gather_ps_temp<scale>(m, index)
-
-template <int scale>
-INLINE simd16scalar SIMDAPI _simd16_mask_i32gather_ps_temp(simd16scalar a, const float *m, simd16scalari index, simd16scalari mask)
-{
- simd16scalar result;
-
- result.lo = _simd_mask_i32gather_ps(a.lo, m, index.lo, _simd_castsi_ps(mask.lo), scale);
- result.hi = _simd_mask_i32gather_ps(a.hi, m, index.hi, _simd_castsi_ps(mask.hi), scale);
-
- return result;
-}
-
-#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _simd16_mask_i32gather_ps_temp<scale>(a, m, index, mask)
-
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_shuffle_epi8, _simd_shuffle_epi8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_adds_epu8, _simd_adds_epu8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_subs_epu8, _simd_subs_epu8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi8, _simd_add_epi8)
-SIMD16_EMU_AVX512_1(simd16scalari, _simd16_abs_epi32, _simd_abs_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi64, _simd_cmpeq_epi64)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi64, _simd_cmpgt_epi64)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi16, _simd_cmpeq_epi16)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi16, _simd_cmpgt_epi16)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi8, _simd_cmpeq_epi8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi8, _simd_cmpgt_epi8)
-
-INLINE simd16scalar SIMDAPI _simd16_permute_ps(simd16scalar a, simd16scalari i)
-{
- simd16scalar result;
-
- const simdscalari mask = _simd_set1_epi32(7);
-
- simdscalar lolo = _simd_permute_ps(a.lo, _simd_and_si(i.lo, mask));
- simdscalar lohi = _simd_permute_ps(a.hi, _simd_and_si(i.lo, mask));
-
- simdscalar hilo = _simd_permute_ps(a.lo, _simd_and_si(i.hi, mask));
- simdscalar hihi = _simd_permute_ps(a.hi, _simd_and_si(i.hi, mask));
-
- result.lo = _simd_blendv_ps(lolo, lohi, _simd_castsi_ps(_simd_cmpgt_epi32(i.lo, mask)));
- result.hi = _simd_blendv_ps(hilo, hihi, _simd_castsi_ps(_simd_cmpgt_epi32(i.hi, mask)));
-
- return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_permute_epi32(simd16scalari a, simd16scalari i)
-{
- return _simd16_castps_si(_simd16_permute_ps(_simd16_castsi_ps(a), i));
-}
-
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_srlv_epi32, _simd_srlv_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sllv_epi32, _simd_sllv_epi32)
-
-template <int imm8>
-INLINE simd16scalar SIMDAPI _simd16_permute2f128_ps_temp(simd16scalar a, simd16scalar b)
-{
- simd16scalar result;
-
- result.lo = _simd_permute2f128_ps(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
- result.hi = _simd_permute2f128_ps(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));
-
- return result;
-}
-
-#define _simd16_permute2f128_ps(a, b, imm8) _simd16_permute2f128_ps_temp<imm8>(a, b)
-
-template <int imm8>
-INLINE simd16scalard SIMDAPI _simd16_permute2f128_pd_temp(simd16scalard a, simd16scalard b)
-{
- simd16scalard result;
-
- result.lo = _simd_permute2f128_pd(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
- result.hi = _simd_permute2f128_pd(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));
-
- return result;
-}
-
-#define _simd16_permute2f128_pd(a, b, imm8) _simd16_permute2f128_pd_temp<imm8>(a, b)
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_permute2f128_si_temp(simd16scalari a, simd16scalari b)
-{
- simd16scalari result;
-
- result.lo = _simd_permute2f128_si(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
- result.hi = _simd_permute2f128_si(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));
-
- return result;
-}
-
-#define _simd16_permute2f128_si(a, b, imm8) _simd16_permute2f128_si_temp<imm8>(a, b)
-
-template <int imm8>
-INLINE simd16scalar SIMDAPI _simd16_shuffle_ps_temp(simd16scalar a, simd16scalar b)
-{
- simd16scalar result;
-
- result.lo = _simd_shuffle_ps(a.lo, b.lo, imm8);
- result.hi = _simd_shuffle_ps(a.hi, b.hi, imm8);
-
- return result;
-}
-
-#define _simd16_shuffle_ps(a, b, imm8) _simd16_shuffle_ps_temp<imm8>(a, b)
-
-template <int imm8>
-INLINE simd16scalard SIMDAPI _simd16_shuffle_pd_temp(simd16scalard a, simd16scalard b)
-{
- simd16scalard result;
-
- result.lo = _simd_shuffle_pd(a.lo, b.lo, (imm8 & 15));
- result.hi = _simd_shuffle_pd(a.hi, b.hi, (imm8 >> 4));
-
- return result;
-}
-
-#define _simd16_shuffle_pd(a, b, imm8) _simd16_shuffle_pd_temp<imm8>(a, b)
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_shuffle_epi32_temp(simd16scalari a, simd16scalari b)
-{
- return _simd16_castps_si(_simd16_shuffle_ps(_simd16_castsi_ps(a), _simd16_castsi_ps(b), imm8));
-}
-
-#define _simd16_shuffle_epi32(a, b, imm8) _simd16_shuffle_epi32_temp<imm8>(a, b)
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_shuffle_epi64_temp(simd16scalari a, simd16scalari b)
-{
- return _simd16_castpd_si(_simd16_shuffle_pd(_simd16_castsi_pd(a), _simd16_castsi_pd(b), imm8));
-}
-
-#define _simd16_shuffle_epi64(a, b, imm8) _simd16_shuffle_epi64_temp<imm8>(a, b)
-
-INLINE simd16scalari SIMDAPI _simd16_cvtepu8_epi16(simdscalari a)
-{
- simd16scalari result;
-
- result.lo = _simd_cvtepu8_epi16(_mm256_extractf128_si256(a, 0));
- result.hi = _simd_cvtepu8_epi16(_mm256_extractf128_si256(a, 1));
-
- return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cvtepu8_epi32(__m128i a)
-{
- simd16scalari result;
-
- result.lo = _simd_cvtepu8_epi32(a);
- result.hi = _simd_cvtepu8_epi32(_mm_srli_si128(a, 8));
-
- return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cvtepu16_epi32(simdscalari a)
-{
- simd16scalari result;
-
- result.lo = _simd_cvtepu16_epi32(_mm256_extractf128_si256(a, 0));
- result.hi = _simd_cvtepu16_epi32(_mm256_extractf128_si256(a, 1));
-
- return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cvtepu16_epi64(simdscalari a)
-{
- simd16scalari result;
-
- result.lo = _simd_cvtepu16_epi64(_mm256_extractf128_si256(a, 0));
- result.hi = _simd_cvtepu16_epi64(_mm256_extractf128_si256(a, 1));
-
- return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cvtepu32_epi64(simdscalari a)
-{
- simd16scalari result;
-
- result.lo = _simd_cvtepu32_epi64(_mm256_extractf128_si256(a, 0));
- result.hi = _simd_cvtepu32_epi64(_mm256_extractf128_si256(a, 1));
-
- return result;
-}
-
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packus_epi16, _simd_packus_epi16)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packs_epi16, _simd_packs_epi16)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packus_epi32, _simd_packus_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packs_epi32, _simd_packs_epi32)
-
-INLINE simd16mask SIMDAPI _simd16_int2mask(int mask)
-{
- return mask;
-}
-
-INLINE int SIMDAPI _simd16_mask2int(simd16mask mask)
-{
- return mask;
-}
-
-INLINE simd16mask SIMDAPI _simd16_cmplt_ps_mask(simd16scalar a, simd16scalar b)
-{
- return _simd16_movemask_ps(_simd16_cmplt_ps(a, b));
-}
-
-// convert bitmask to vector mask
-INLINE simd16scalar SIMDAPI vMask16(int32_t mask)
-{
- simd16scalari temp = _simd16_set1_epi32(mask);
-
- simd16scalari bits = _simd16_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);
-
- simd16scalari result = _simd16_cmplt_epi32(_simd16_setzero_si(), _simd16_and_si(temp, bits));
-
- return _simd16_castsi_ps(result);
-}
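
// Example: vMask16(0x5) yields all-ones (0xFFFFFFFF) in lanes 0 and 2 and zero
// elsewhere, converting a 16-bit scalar mask into a per-lane vector mask.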
-
+#if KNOB_SIMD16_WIDTH == 16
+typedef SIMD512 SIMD16;
#else
-
-INLINE simd16mask SIMDAPI _simd16_scalari2mask(simd16scalari mask)
-{
- return _mm512_cmpneq_epu32_mask(mask, _mm512_setzero_epi32());
-}
-
-INLINE simd16mask SIMDAPI _simd16_scalard2mask(simd16scalard mask)
-{
- return _mm512_cmpneq_epu64_mask(_mm512_castpd_si512(mask), _mm512_setzero_si512());
-}
-
-#define _simd16_setzero_ps _mm512_setzero_ps
-#define _simd16_setzero_si _mm512_setzero_si512
-#define _simd16_set1_ps _mm512_set1_ps
-#define _simd16_set1_epi8 _mm512_set1_epi8
-#define _simd16_set1_epi32 _mm512_set1_epi32
-
-INLINE simd16scalar SIMDAPI _simd16_set_ps(float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
-{
- return _mm512_set_ps(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
-}
-
-INLINE simd16scalari SIMDAPI _simd16_set_epi32(int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
-{
- return _mm512_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
-}
-
-INLINE simd16scalar SIMDAPI _simd16_set_ps(float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
-{
- return _mm512_set_ps(e7, e6, e5, e4, e3, e2, e1, e0, e7, e6, e5, e4, e3, e2, e1, e0);
-}
-
-INLINE simd16scalari SIMDAPI _simd16_set_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
-{
- return _mm512_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0, e7, e6, e5, e4, e3, e2, e1, e0);
-}
-
-#define _simd16_load_ps _mm512_load_ps
-#define _simd16_loadu_ps _mm512_loadu_ps
-#if 1
-#define _simd16_load1_ps _simd16_broadcast_ss
-#endif
-#define _simd16_load_si _mm512_load_si512
-#define _simd16_loadu_si _mm512_loadu_si512
-#define _simd16_broadcast_ss(m) _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, 0)
-#define _simd16_broadcast_ps(m) _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, 0)
-#define _simd16_store_ps _mm512_store_ps
-#define _simd16_store_si _mm512_store_si512
-#define _simd16_extract_ps(a, imm8) _mm256_castsi256_ps(_mm512_extracti64x4_epi64(_mm512_castps_si512(a), imm8))
-#define _simd16_extract_si _mm512_extracti64x4_epi64
-#define _simd16_insert_ps(a, b, imm8) _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castps_si512(a), _mm256_castps_si256(b), imm8))
-#define _simd16_insert_si _mm512_inserti64x4
-
-INLINE void SIMDAPI _simd16_maskstore_ps(float *m, simd16scalari mask, simd16scalar a)
-{
- simd16mask k = _simd16_scalari2mask(mask);
-
- _mm512_mask_store_ps(m, k, a);
-}
-
-#define _simd16_blend_ps(a, b, mask) _mm512_mask_blend_ps(mask, a, b)
-
-INLINE simd16scalar SIMDAPI _simd16_blendv_ps(simd16scalar a, simd16scalar b, const simd16scalar mask)
-{
- simd16mask k = _simd16_scalari2mask(_mm512_castps_si512(mask));
-
- return _mm512_mask_blend_ps(k, a, b);
-}
-
-INLINE simd16scalari SIMDAPI _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalar mask)
-{
- simd16mask k = _simd16_scalari2mask(_mm512_castps_si512(mask));
-
- return _mm512_mask_blend_epi32(k, a, b);
-}
-
-INLINE simd16scalari SIMDAPI _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalari mask)
-{
- simd16mask k = _simd16_scalari2mask(mask);
-
- return _mm512_mask_blend_epi32(k, a, b);
-}
-
-#define _simd16_mul_ps _mm512_mul_ps
-#define _simd16_div_ps _mm512_div_ps
-#define _simd16_add_ps _mm512_add_ps
-#define _simd16_sub_ps _mm512_sub_ps
-#define _simd16_rsqrt_ps _mm512_rsqrt14_ps
-#define _simd16_min_ps _mm512_min_ps
-#define _simd16_max_ps _mm512_max_ps
-
-INLINE simd16mask SIMDAPI _simd16_movemask_ps(simd16scalar a)
-{
- // movemask_ps only checks the sign (top) bit of each single-precision element
- return _simd16_scalari2mask(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x80000000)));
-}
-
-INLINE simd16mask SIMDAPI _simd16_movemask_pd(simd16scalard a)
-{
- // movemask_pd only checks the sign (top) bit of each double-precision element
- return _simd16_scalard2mask(_mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a), _mm512_set1_epi64(0x8000000000000000))));
-}
-
-#if 0
-INLINE int SIMDAPI _simd16_movemask_epi8(simd16scalari a)
-{
- return _simd16_scalar2mask(a);
-}
-#endif
-
-#define _simd16_cvtps_epi32 _mm512_cvtps_epi32
-#define _simd16_cvttps_epi32 _mm512_cvttps_epi32
-#define _simd16_cvtepi32_ps _mm512_cvtepi32_ps
-
-template <int comp>
-INLINE simd16scalar SIMDAPI _simd16_cmp_ps_temp(simd16scalar a, simd16scalar b)
-{
- simd16mask k = _mm512_cmp_ps_mask(a, b, comp);
-
- return _mm512_castsi512_ps(_mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF)));
-}
-
-#define _simd16_cmp_ps(a, b, comp) _simd16_cmp_ps_temp<comp>(a, b)
-
-#define _simd16_cmplt_ps(a, b) _simd16_cmp_ps(a, b, _CMP_LT_OQ)
-#define _simd16_cmpgt_ps(a, b) _simd16_cmp_ps(a, b, _CMP_GT_OQ)
-#define _simd16_cmpneq_ps(a, b) _simd16_cmp_ps(a, b, _CMP_NEQ_OQ)
-#define _simd16_cmpeq_ps(a, b) _simd16_cmp_ps(a, b, _CMP_EQ_OQ)
-#define _simd16_cmpge_ps(a, b) _simd16_cmp_ps(a, b, _CMP_GE_OQ)
-#define _simd16_cmple_ps(a, b) _simd16_cmp_ps(a, b, _CMP_LE_OQ)
-
-#define _simd16_castsi_ps _mm512_castsi512_ps
-#define _simd16_castps_si _mm512_castps_si512
-#define _simd16_castsi_pd _mm512_castsi512_pd
-#define _simd16_castpd_si _mm512_castpd_si512
-#define _simd16_castpd_ps _mm512_castpd_ps
-#define _simd16_castps_pd _mm512_castps_pd
-
-// _mm512_and_ps (and other bitwise operations) exist in AVX512DQ,
-// while the functionally equivalent _mm512_and_epi32 is in AVX512F.
-// Define the _simd16_*_ps versions in terms of AVX512F for broader
-// support.
-#define _simd16_logicop_ps(a, b, op) _simd16_castsi_ps(op##_epi32(_simd16_castps_si(a), _simd16_castps_si(b)))
-
-#define _simd16_and_ps(a, b) _simd16_logicop_ps(a, b, _mm512_and)
-#define _simd16_andnot_ps(a, b) _simd16_logicop_ps(a, b, _mm512_andnot)
-#define _simd16_or_ps(a, b) _simd16_logicop_ps(a, b, _mm512_or)
-#define _simd16_xor_ps(a, b) _simd16_logicop_ps(a, b, _mm512_xor)
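
// For example, _simd16_and_ps(a, b) expands to
//   _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(a), _mm512_castps_si512(b)))
// so only AVX512F instructions are required.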
-
-template <int mode>
-INLINE simd16scalar SIMDAPI _simd16_round_ps_temp(simd16scalar a)
-{
- return _mm512_roundscale_ps(a, mode);
-}
-
-#define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a)
-
-#define _simd16_mul_epi32 _mm512_mul_epi32
-#define _simd16_mullo_epi32 _mm512_mullo_epi32
-#define _simd16_sub_epi32 _mm512_sub_epi32
-#define _simd16_sub_epi64 _mm512_sub_epi64
-#define _simd16_min_epi32 _mm512_min_epi32
-#define _simd16_max_epi32 _mm512_max_epi32
-#define _simd16_min_epu32 _mm512_min_epu32
-#define _simd16_max_epu32 _mm512_max_epu32
-#define _simd16_add_epi32 _mm512_add_epi32
-
-#define _simd16_and_si _mm512_and_si512
-#define _simd16_andnot_si _mm512_andnot_si512
-#define _simd16_or_si _mm512_or_si512
-#define _simd16_xor_si _mm512_xor_si512
-
-INLINE simd16scalari SIMDAPI _simd16_cmpeq_epi32(simd16scalari a, simd16scalari b)
-{
- simd16mask k = _mm512_cmpeq_epi32_mask(a, b);
-
- return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cmpgt_epi32(simd16scalari a, simd16scalari b)
-{
- simd16mask k = _mm512_cmpgt_epi32_mask(a, b);
-
- return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cmplt_epi32(simd16scalari a, simd16scalari b)
-{
- simd16mask k = _mm512_cmplt_epi32_mask(a, b);
-
- return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE int SIMDAPI _simd16_testz_ps(simd16scalar a, simd16scalar b)
-{
- int lo = _simd_testz_ps(_simd16_extract_ps(a, 0), _simd16_extract_ps(b, 0));
- int hi = _simd_testz_ps(_simd16_extract_ps(a, 1), _simd16_extract_ps(b, 1));
-
- return lo & hi;
-}
-
-#define _simd16_unpacklo_ps _mm512_unpacklo_ps
-#define _simd16_unpackhi_ps _mm512_unpackhi_ps
-#define _simd16_unpacklo_pd _mm512_unpacklo_pd
-#define _simd16_unpackhi_pd _mm512_unpackhi_pd
-#define _simd16_unpacklo_epi8 _mm512_unpacklo_epi8
-#define _simd16_unpackhi_epi8 _mm512_unpackhi_epi8
-#define _simd16_unpacklo_epi16 _mm512_unpacklo_epi16
-#define _simd16_unpackhi_epi16 _mm512_unpackhi_epi16
-#define _simd16_unpacklo_epi32 _mm512_unpacklo_epi32
-#define _simd16_unpackhi_epi32 _mm512_unpackhi_epi32
-#define _simd16_unpacklo_epi64 _mm512_unpacklo_epi64
-#define _simd16_unpackhi_epi64 _mm512_unpackhi_epi64
-#define _simd16_slli_epi32 _mm512_slli_epi32
-#define _simd16_srli_epi32 _mm512_srli_epi32
-#define _simd16_srai_epi32 _mm512_srai_epi32
-#define _simd16_fmadd_ps _mm512_fmadd_ps
-#define _simd16_fmsub_ps _mm512_fmsub_ps
-#define _simd16_adds_epu8 _mm512_adds_epu8
-#define _simd16_subs_epu8 _mm512_subs_epu8
-#define _simd16_add_epi8 _mm512_add_epi8
-#define _simd16_shuffle_epi8 _mm512_shuffle_epi8
-
-#define _simd16_i32gather_ps(m, index, scale) _mm512_i32gather_ps(index, m, scale)
-
-template <int scale>
-INLINE simd16scalar SIMDAPI _simd16_mask_i32gather_ps_temp(simd16scalar a, const float *m, simd16scalari index, simd16scalari mask)
-{
- __mmask16 k = _mm512_cmpneq_epi32_mask(mask, _mm512_setzero_si512());
-
- return _mm512_mask_i32gather_ps(a, k, index, m, scale);
-}
-
-#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _simd16_mask_i32gather_ps_temp<scale>(a, m, index, mask)
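
// Unlike the emulated path, the AVX512 gather takes a k-register predicate, so
// the vector mask is first reduced to a __mmask16 with _mm512_cmpneq_epi32_mask.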
-
-#define _simd16_abs_epi32 _mm512_abs_epi32
-
-INLINE simd16scalari SIMDAPI _simd16_cmpeq_epi64(simd16scalari a, simd16scalari b)
-{
- __mmask8 k = _mm512_cmpeq_epi64_mask(a, b);
-
- return _mm512_mask_blend_epi64(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cmpgt_epi64(simd16scalari a, simd16scalari b)
-{
- __mmask8 k = _mm512_cmpgt_epi64_mask(a, b);
-
- return _mm512_mask_blend_epi64(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cmpeq_epi16(simd16scalari a, simd16scalari b)
-{
- __mmask32 k = _mm512_cmpeq_epi16_mask(a, b);
-
- return _mm512_mask_blend_epi16(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cmpgt_epi16(simd16scalari a, simd16scalari b)
-{
- __mmask32 k = _mm512_cmpgt_epi16_mask(a, b);
-
- return _mm512_mask_blend_epi16(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cmpeq_epi8(simd16scalari a, simd16scalari b)
-{
- __mmask64 k = _mm512_cmpeq_epi8_mask(a, b);
-
- return _mm512_mask_blend_epi8(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cmpgt_epi8(simd16scalari a, simd16scalari b)
-{
- __mmask64 k = _mm512_cmpgt_epi8_mask(a, b);
-
- return _mm512_mask_blend_epi8(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-#define _simd16_permute_ps(a, i) _mm512_permutexvar_ps(i, a)
-#define _simd16_permute_epi32(a, i) _mm512_permutexvar_epi32(i, a)
-#define _simd16_sllv_epi32 _mm512_srlv_epi32
-#define _simd16_srlv_epi32 _mm512_sllv_epi32
-#define _simd16_permute2f128_ps _mm512_shuffle_f32x4
-#define _simd16_permute2f128_pd _mm512_shuffle_f64x2
-#define _simd16_permute2f128_si _mm512_shuffle_i32x4
-#define _simd16_shuffle_ps _mm512_shuffle_ps
-#define _simd16_shuffle_pd _mm512_shuffle_pd
-#define _simd16_cvtepu8_epi16 _mm512_cvtepu8_epi16
-#define _simd16_cvtepu8_epi32 _mm512_cvtepu8_epi32
-#define _simd16_cvtepu16_epi32 _mm512_cvtepu16_epi32
-#define _simd16_cvtepu16_epi64 _mm512_cvtepu16_epi64
-#define _simd16_cvtepu32_epi64 _mm512_cvtepu32_epi64
-#define _simd16_packus_epi16 _mm512_packus_epi16
-#define _simd16_packs_epi16 _mm512_packs_epi16
-#define _simd16_packus_epi32 _mm512_packus_epi32
-#define _simd16_packs_epi32 _mm512_packs_epi32
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_shuffle_epi32_temp(simd16scalari a, simd16scalari b)
-{
- return _simd16_castps_si(_simd16_shuffle_ps(_simd16_castsi_ps(a), _simd16_castsi_ps(b), imm8));
-}
-
-#define _simd16_shuffle_epi32(a, b, imm8) _simd16_shuffle_epi32_temp<imm8>(a, b)
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_shuffle_epi64_temp(simd16scalari a, simd16scalari b)
-{
- return _simd16_castpd_si(_simd16_shuffle_pd(_simd16_castsi_pd(a), _simd16_castsi_pd(b), imm8));
-}
-
-#define _simd16_shuffle_epi64(a, b, imm8) _simd16_shuffle_epi64_temp<imm8>(a, b)
-
-INLINE simd16mask SIMDAPI _simd16_int2mask(int mask)
-{
- return _mm512_int2mask(mask);
-}
-
-INLINE int SIMDAPI _simd16_mask2int(simd16mask mask)
-{
- return _mm512_mask2int(mask);
-}
-
-INLINE simd16mask SIMDAPI _simd16_cmplt_ps_mask(simd16scalar a, simd16scalar b)
-{
- return _mm512_cmplt_ps_mask(a, b);
-}
-
-// convert bitmask to vector mask
-INLINE simd16scalar SIMDAPI vMask16(int32_t mask)
-{
- simd16scalari temp = _simd16_set1_epi32(mask);
-
- simd16scalari bits = _simd16_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);
-
- simd16scalari result = _simd16_cmplt_epi32(_simd16_setzero_si(), _simd16_and_si(temp, bits));
-
- return _simd16_castsi_ps(result);
-}
-
-#endif//ENABLE_AVX512_EMULATION
+#error Unsupported vector width
+#endif//KNOB_SIMD16_WIDTH == 16
+
+#define _simd16_setzero_ps SIMD16::setzero_ps
+#define _simd16_setzero_si SIMD16::setzero_si
+#define _simd16_set1_ps SIMD16::set1_ps
+#define _simd16_set1_epi8 SIMD16::set1_epi8
+#define _simd16_set1_epi32 SIMD16::set1_epi32
+#define _simd16_set_ps SIMD16::set_ps
+#define _simd16_set_epi32 SIMD16::set_epi32
+#define _simd16_load_ps SIMD16::load_ps
+#define _simd16_loadu_ps SIMD16::loadu_ps
+#if 1
+#define _simd16_load1_ps SIMD16::broadcast_ss
+#endif
+#define _simd16_load_si SIMD16::load_si
+#define _simd16_loadu_si SIMD16::loadu_si
+#define _simd16_broadcast_ss(m) SIMD16::broadcast_ss((float const*)m)
+#define _simd16_store_ps SIMD16::store_ps
+#define _simd16_store_si SIMD16::store_si
+#define _simd16_extract_ps(a, imm8) SIMD16::extract_ps<imm8>(a)
+#define _simd16_extract_si(a, imm8) SIMD16::extract_si<imm8>(a)
+#define _simd16_insert_ps(a, b, imm8) SIMD16::insert_ps<imm8>(a, b)
+#define _simd16_insert_si(a, b, imm8) SIMD16::insert_si<imm8>(a, b)
+#define _simd16_maskstore_ps SIMD16::maskstore_ps
+#define _simd16_blend_ps(a, b, mask) SIMD16::blend_ps<mask>(a, b)
+#define _simd16_blendv_ps SIMD16::blendv_ps
+#define _simd16_blendv_epi32 SIMD16::blendv_epi32
+#define _simd16_mul_ps SIMD16::mul_ps
+#define _simd16_div_ps SIMD16::div_ps
+#define _simd16_add_ps SIMD16::add_ps
+#define _simd16_sub_ps SIMD16::sub_ps
+#define _simd16_rsqrt_ps SIMD16::rsqrt_ps
+#define _simd16_min_ps SIMD16::min_ps
+#define _simd16_max_ps SIMD16::max_ps
+#define _simd16_movemask_ps SIMD16::movemask_ps
+#define _simd16_movemask_pd SIMD16::movemask_pd
+#define _simd16_cvtps_epi32 SIMD16::cvtps_epi32
+#define _simd16_cvttps_epi32 SIMD16::cvttps_epi32
+#define _simd16_cvtepi32_ps SIMD16::cvtepi32_ps
+#define _simd16_cmp_ps(a, b, comp) SIMD16::cmp_ps<SIMD16::CompareType(comp)>(a, b)
+#define _simd16_cmplt_ps SIMD16::cmplt_ps
+#define _simd16_cmpgt_ps SIMD16::cmpgt_ps
+#define _simd16_cmpneq_ps SIMD16::cmpneq_ps
+#define _simd16_cmpeq_ps SIMD16::cmpeq_ps
+#define _simd16_cmpge_ps SIMD16::cmpge_ps
+#define _simd16_cmple_ps SIMD16::cmple_ps
+#define _simd16_castsi_ps SIMD16::castsi_ps
+#define _simd16_castps_si SIMD16::castps_si
+#define _simd16_castsi_pd SIMD16::castsi_pd
+#define _simd16_castpd_si SIMD16::castpd_si
+#define _simd16_castpd_ps SIMD16::castpd_ps
+#define _simd16_castps_pd SIMD16::castps_pd
+#define _simd16_and_ps SIMD16::and_ps
+#define _simd16_andnot_ps SIMD16::andnot_ps
+#define _simd16_or_ps SIMD16::or_ps
+#define _simd16_xor_ps SIMD16::xor_ps
+#define _simd16_round_ps(a, mode) SIMD16::round_ps<SIMD16::RoundMode(mode)>(a)
+#define _simd16_mul_epi32 SIMD16::mul_epi32
+#define _simd16_mullo_epi32 SIMD16::mullo_epi32
+#define _simd16_sub_epi32 SIMD16::sub_epi32
+#define _simd16_sub_epi64 SIMD16::sub_epi64
+#define _simd16_min_epi32 SIMD16::min_epi32
+#define _simd16_max_epi32 SIMD16::max_epi32
+#define _simd16_min_epu32 SIMD16::min_epu32
+#define _simd16_max_epu32 SIMD16::max_epu32
+#define _simd16_add_epi32 SIMD16::add_epi32
+#define _simd16_and_si SIMD16::and_si
+#define _simd16_andnot_si SIMD16::andnot_si
+#define _simd16_or_si SIMD16::or_si
+#define _simd16_xor_si SIMD16::xor_si
+#define _simd16_cmpeq_epi32 SIMD16::cmpeq_epi32
+#define _simd16_cmpgt_epi32 SIMD16::cmpgt_epi32
+#define _simd16_cmplt_epi32 SIMD16::cmplt_epi32
+#define _simd16_testz_ps SIMD16::testz_ps
+#define _simd16_unpacklo_ps SIMD16::unpacklo_ps
+#define _simd16_unpackhi_ps SIMD16::unpackhi_ps
+#define _simd16_unpacklo_pd SIMD16::unpacklo_pd
+#define _simd16_unpackhi_pd SIMD16::unpackhi_pd
+#define _simd16_unpacklo_epi8 SIMD16::unpacklo_epi8
+#define _simd16_unpackhi_epi8 SIMD16::unpackhi_epi8
+#define _simd16_unpacklo_epi16 SIMD16::unpacklo_epi16
+#define _simd16_unpackhi_epi16 SIMD16::unpackhi_epi16
+#define _simd16_unpacklo_epi32 SIMD16::unpacklo_epi32
+#define _simd16_unpackhi_epi32 SIMD16::unpackhi_epi32
+#define _simd16_unpacklo_epi64 SIMD16::unpacklo_epi64
+#define _simd16_unpackhi_epi64 SIMD16::unpackhi_epi64
+#define _simd16_slli_epi32(a, i) SIMD16::slli_epi32<i>(a)
+#define _simd16_srli_epi32(a, i) SIMD16::srli_epi32<i>(a)
+#define _simd16_srai_epi32(a, i) SIMD16::srai_epi32<i>(a)
+#define _simd16_fmadd_ps SIMD16::fmadd_ps
+#define _simd16_fmsub_ps SIMD16::fmsub_ps
+#define _simd16_adds_epu8 SIMD16::adds_epu8
+#define _simd16_subs_epu8 SIMD16::subs_epu8
+#define _simd16_add_epi8 SIMD16::add_epi8
+#define _simd16_shuffle_epi8 SIMD16::shuffle_epi8
+
+#define _simd16_i32gather_ps(m, index, scale) SIMD16::i32gather_ps<SIMD16::ScaleFactor(scale)>(index, m)
+#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) SIMD16::mask_i32gather_ps<SIMD16::ScaleFactor(scale)>(a, m, index, mask)
+
+#define _simd16_abs_epi32 SIMD16::abs_epi32
+
+#define _simd16_cmpeq_epi64 SIMD16::cmpeq_epi64
+#define _simd16_cmpgt_epi64 SIMD16::cmpgt_epi64
+#define _simd16_cmpeq_epi16 SIMD16::cmpeq_epi16
+#define _simd16_cmpgt_epi16 SIMD16::cmpgt_epi16
+#define _simd16_cmpeq_epi8 SIMD16::cmpeq_epi8
+#define _simd16_cmpgt_epi8 SIMD16::cmpgt_epi8
+
+#define _simd16_permute_ps SIMD16::permute_ps
+#define _simd16_permute_epi32 SIMD16::permute_epi32
+#define _simd16_sllv_epi32 SIMD16::sllv_epi32
+#define _simd16_srlv_epi32 SIMD16::srlv_epi32
+#define _simd16_permute2f128_ps(a, b, i) SIMD16::permute2f128_ps<i>(a, b)
+#define _simd16_permute2f128_pd(a, b, i) SIMD16::permute2f128_pd<i>(a, b)
+#define _simd16_permute2f128_si(a, b, i) SIMD16::permute2f128_si<i>(a, b)
+#define _simd16_shuffle_ps(a, b, i) SIMD16::shuffle_ps<i>(a, b)
+#define _simd16_shuffle_pd(a, b, i) SIMD16::shuffle_pd<i>(a, b)
+#define _simd16_shuffle_epi32(a, b, imm8) SIMD16::shuffle_epi32<imm8>(a, b)
+#define _simd16_shuffle_epi64(a, b, imm8) SIMD16::shuffle_epi64<imm8>(a, b)
+#define _simd16_cvtepu8_epi16 SIMD16::cvtepu8_epi16
+#define _simd16_cvtepu8_epi32 SIMD16::cvtepu8_epi32
+#define _simd16_cvtepu16_epi32 SIMD16::cvtepu16_epi32
+#define _simd16_cvtepu16_epi64 SIMD16::cvtepu16_epi64
+#define _simd16_cvtepu32_epi64 SIMD16::cvtepu32_epi64
+#define _simd16_packus_epi16 SIMD16::packus_epi16
+#define _simd16_packs_epi16 SIMD16::packs_epi16
+#define _simd16_packus_epi32 SIMD16::packus_epi32
+#define _simd16_packs_epi32 SIMD16::packs_epi32
+#define _simd16_cmplt_ps_mask SIMD16::cmp_ps_mask<SIMD16::CompareType::LT_OQ>
+#define _simd16_int2mask(mask) simd16mask(mask)
+#define _simd16_mask2int(mask) int(mask)
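
// Illustrative usage (disabled; nothing assumed beyond the mappings above):
// callers keep the legacy intrinsic-style names, which now forward to static
// members of SIMD16, with immediate operands passed as template arguments.
#if 0
INLINE simd16scalar Simd16MacroExample()
{
    simd16scalar a = _simd16_set1_ps(1.0f);  // SIMD16::set1_ps(1.0f)
    simd16scalar b = _simd16_add_ps(a, a);   // SIMD16::add_ps(a, a)
    return _simd16_blend_ps(a, b, 0x0F);     // SIMD16::blend_ps<0x0F>(a, b)
}
#endif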
#endif//ENABLE_AVX512_SIMD16
#ifndef __SWR_SIMDINTRIN_H__
#define __SWR_SIMDINTRIN_H__
-#include "common/os.h"
#include "common/intrin.h"
+#include "common/simdlib.hpp"
#if KNOB_SIMD_WIDTH == 8
-#define _simd128_maskstore_ps _mm_maskstore_ps
-#define _simd_load_ps _mm256_load_ps
-#define _simd_load1_ps _mm256_broadcast_ss
-#define _simd_loadu_ps _mm256_loadu_ps
-#define _simd_setzero_ps _mm256_setzero_ps
-#define _simd_set1_ps _mm256_set1_ps
-#define _simd_blend_ps _mm256_blend_ps
-#define _simd_blendv_ps _mm256_blendv_ps
-#define _simd_store_ps _mm256_store_ps
-#define _simd_mul_ps _mm256_mul_ps
-#define _simd_add_ps _mm256_add_ps
-#define _simd_sub_ps _mm256_sub_ps
-#define _simd_rsqrt_ps _mm256_rsqrt_ps
-#define _simd_min_ps _mm256_min_ps
-#define _simd_max_ps _mm256_max_ps
-#define _simd_movemask_ps _mm256_movemask_ps
-#define _simd_cvtps_epi32 _mm256_cvtps_epi32
-#define _simd_cvttps_epi32 _mm256_cvttps_epi32
-#define _simd_cvtepi32_ps _mm256_cvtepi32_ps
-#define _simd_cmplt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LT_OQ)
-#define _simd_cmpgt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GT_OQ)
-#define _simd_cmpneq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_NEQ_OQ)
-#define _simd_cmpeq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_OQ)
-#define _simd_cmpge_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ)
-#define _simd_cmple_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ)
-#define _simd_cmp_ps(a, b, imm) _mm256_cmp_ps(a, b, imm)
-#define _simd_and_ps _mm256_and_ps
-#define _simd_or_ps _mm256_or_ps
-
-#define _simd_rcp_ps _mm256_rcp_ps
-#define _simd_div_ps _mm256_div_ps
-#define _simd_castsi_ps _mm256_castsi256_ps
-#define _simd_andnot_ps _mm256_andnot_ps
-#define _simd_round_ps _mm256_round_ps
-#define _simd_castpd_ps _mm256_castpd_ps
-#define _simd_broadcast_ps(a) _mm256_broadcast_ps((const __m128*)(a))
-#define _simd_stream_ps _mm256_stream_ps
-
-#define _simd_load_sd _mm256_load_sd
-#define _simd_movemask_pd _mm256_movemask_pd
-#define _simd_castsi_pd _mm256_castsi256_pd
-
-// emulated integer simd
-#define SIMD_EMU_EPI(func, intrin) \
-INLINE \
-__m256i func(__m256i a, __m256i b)\
-{\
- __m128i aHi = _mm256_extractf128_si256(a, 1);\
- __m128i bHi = _mm256_extractf128_si256(b, 1);\
- __m128i aLo = _mm256_castsi256_si128(a);\
- __m128i bLo = _mm256_castsi256_si128(b);\
-\
- __m128i subLo = intrin(aLo, bLo);\
- __m128i subHi = intrin(aHi, bHi);\
-\
- __m256i result = _mm256_castsi128_si256(subLo);\
- result = _mm256_insertf128_si256(result, subHi, 1);\
-\
- return result;\
-}
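
// Illustrative expansion (disabled): SIMD_EMU_EPI(_simdemu_add_epi32, _mm_add_epi32)
// builds an AVX1-safe 256-bit integer add out of two 128-bit halves:
#if 0
INLINE __m256i _simdemu_add_epi32(__m256i a, __m256i b)
{
    __m128i aHi = _mm256_extractf128_si256(a, 1);
    __m128i bHi = _mm256_extractf128_si256(b, 1);
    __m128i aLo = _mm256_castsi256_si128(a);
    __m128i bLo = _mm256_castsi256_si128(b);

    __m128i subLo = _mm_add_epi32(aLo, bLo);
    __m128i subHi = _mm_add_epi32(aHi, bHi);

    __m256i result = _mm256_castsi128_si256(subLo);
    result = _mm256_insertf128_si256(result, subHi, 1);

    return result;
}
#endif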
-
-#if (KNOB_ARCH == KNOB_ARCH_AVX)
-INLINE
-__m256 _simdemu_permute_ps(__m256 a, __m256i b)
-{
- __m128 aHi = _mm256_extractf128_ps(a, 1);
- __m128i bHi = _mm256_extractf128_si256(b, 1);
- __m128 aLo = _mm256_castps256_ps128(a);
- __m128i bLo = _mm256_castsi256_si128(b);
-
- __m128i indexHi = _mm_cmpgt_epi32(bLo, _mm_set1_epi32(3));
- __m128 resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bLo, _mm_set1_epi32(0x3)));
- __m128 resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bLo, _mm_set1_epi32(0x3)));
- __m128 blendLowRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi));
-
- indexHi = _mm_cmpgt_epi32(bHi, _mm_set1_epi32(3));
- resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bHi, _mm_set1_epi32(0x3)));
- resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bHi, _mm_set1_epi32(0x3)));
- __m128 blendHiRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi));
-
- __m256 result = _mm256_castps128_ps256(blendLowRes);
- result = _mm256_insertf128_ps(result, blendHiRes, 1);
-
- return result;
-}
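
// Example: an index of 5 selects lane (5 & 3) = 1 of the upper 128-bit half.
// AVX1 has no cross-lane variable permute, so each half is permuted separately
// and the index-greater-than-3 mask blends in the upper-half result.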
-
-INLINE
-__m256i _simdemu_permute_epi32(__m256i a, __m256i b)
-{
- return _mm256_castps_si256(_simdemu_permute_ps(_mm256_castsi256_ps(a), b));
-}
-
-INLINE
-__m256i _simdemu_srlv_epi32(__m256i vA, __m256i vCount)
-{
- int32_t aHi, aLow, countHi, countLow;
- __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
- __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
- __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
- __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
-
- aHi = _mm_extract_epi32(vAHi, 0);
- countHi = _mm_extract_epi32(vCountHi, 0);
- aHi >>= countHi;
- vAHi = _mm_insert_epi32(vAHi, aHi, 0);
-
- aLow = _mm_extract_epi32(vALow, 0);
- countLow = _mm_extract_epi32(vCountLow, 0);
- aLow >>= countLow;
- vALow = _mm_insert_epi32(vALow, aLow, 0);
-
- aHi = _mm_extract_epi32(vAHi, 1);
- countHi = _mm_extract_epi32(vCountHi, 1);
- aHi >>= countHi;
- vAHi = _mm_insert_epi32(vAHi, aHi, 1);
-
- aLow = _mm_extract_epi32(vALow, 1);
- countLow = _mm_extract_epi32(vCountLow, 1);
- aLow >>= countLow;
- vALow = _mm_insert_epi32(vALow, aLow, 1);
-
- aHi = _mm_extract_epi32(vAHi, 2);
- countHi = _mm_extract_epi32(vCountHi, 2);
- aHi >>= countHi;
- vAHi = _mm_insert_epi32(vAHi, aHi, 2);
-
- aLow = _mm_extract_epi32(vALow, 2);
- countLow = _mm_extract_epi32(vCountLow, 2);
- aLow >>= countLow;
- vALow = _mm_insert_epi32(vALow, aLow, 2);
-
- aHi = _mm_extract_epi32(vAHi, 3);
- countHi = _mm_extract_epi32(vCountHi, 3);
- aHi >>= countHi;
- vAHi = _mm_insert_epi32(vAHi, aHi, 3);
-
- aLow = _mm_extract_epi32(vALow, 3);
- countLow = _mm_extract_epi32(vCountLow, 3);
- aLow >>= countLow;
- vALow = _mm_insert_epi32(vALow, aLow, 3);
-
- __m256i ret = _mm256_set1_epi32(0);
- ret = _mm256_insertf128_si256(ret, vAHi, 1);
- ret = _mm256_insertf128_si256(ret, vALow, 0);
- return ret;
-}
-
-
-INLINE
-__m256i _simdemu_sllv_epi32(__m256i vA, __m256i vCount)
-{
- int32_t aHi, aLow, countHi, countLow;
- __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
- __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
- __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
- __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
-
- aHi = _mm_extract_epi32(vAHi, 0);
- countHi = _mm_extract_epi32(vCountHi, 0);
- aHi <<= countHi;
- vAHi = _mm_insert_epi32(vAHi, aHi, 0);
-
- aLow = _mm_extract_epi32(vALow, 0);
- countLow = _mm_extract_epi32(vCountLow, 0);
- aLow <<= countLow;
- vALow = _mm_insert_epi32(vALow, aLow, 0);
-
- aHi = _mm_extract_epi32(vAHi, 1);
- countHi = _mm_extract_epi32(vCountHi, 1);
- aHi <<= countHi;
- vAHi = _mm_insert_epi32(vAHi, aHi, 1);
-
- aLow = _mm_extract_epi32(vALow, 1);
- countLow = _mm_extract_epi32(vCountLow, 1);
- aLow <<= countLow;
- vALow = _mm_insert_epi32(vALow, aLow, 1);
-
- aHi = _mm_extract_epi32(vAHi, 2);
- countHi = _mm_extract_epi32(vCountHi, 2);
- aHi <<= countHi;
- vAHi = _mm_insert_epi32(vAHi, aHi, 2);
-
- aLow = _mm_extract_epi32(vALow, 2);
- countLow = _mm_extract_epi32(vCountLow, 2);
- aLow <<= countLow;
- vALow = _mm_insert_epi32(vALow, aLow, 2);
-
- aHi = _mm_extract_epi32(vAHi, 3);
- countHi = _mm_extract_epi32(vCountHi, 3);
- aHi <<= countHi;
- vAHi = _mm_insert_epi32(vAHi, aHi, 3);
-
- aLow = _mm_extract_epi32(vALow, 3);
- countLow = _mm_extract_epi32(vCountLow, 3);
- aLow <<= countLow;
- vALow = _mm_insert_epi32(vALow, aLow, 3);
-
- __m256i ret = _mm256_set1_epi32(0);
- ret = _mm256_insertf128_si256(ret, vAHi, 1);
- ret = _mm256_insertf128_si256(ret, vALow, 0);
- return ret;
-}
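
// Note: AVX2 provides _mm256_srlv_epi32 / _mm256_sllv_epi32 directly; the fully
// unrolled extract/shift/insert sequences above are needed only on AVX. Beware
// that the srlv emulation shifts signed int32_t values, giving an arithmetic
// rather than logical right shift for negative lanes. Scalar reference of the
// intended srlv semantics (disabled; counts assumed < 32):
#if 0
INLINE void srlv_scalar_ref(uint32_t out[8], const uint32_t a[8], const uint32_t count[8])
{
    for (uint32_t i = 0; i < 8; ++i)
        out[i] = a[i] >> count[i]; // logical right shift per lane
}
#endif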
-
-#define _simd_mul_epi32 _simdemu_mul_epi32
-#define _simd_mullo_epi32 _simdemu_mullo_epi32
-#define _simd_sub_epi32 _simdemu_sub_epi32
-#define _simd_sub_epi64 _simdemu_sub_epi64
-#define _simd_min_epi32 _simdemu_min_epi32
-#define _simd_min_epu32 _simdemu_min_epu32
-#define _simd_max_epi32 _simdemu_max_epi32
-#define _simd_max_epu32 _simdemu_max_epu32
-#define _simd_add_epi32 _simdemu_add_epi32
-#define _simd_and_si _simdemu_and_si
-#define _simd_andnot_si _simdemu_andnot_si
-#define _simd_cmpeq_epi32 _simdemu_cmpeq_epi32
-#define _simd_cmplt_epi32 _simdemu_cmplt_epi32
-#define _simd_cmpgt_epi32 _simdemu_cmpgt_epi32
-#define _simd_or_si _simdemu_or_si
-#define _simd_xor_si _simdemu_xor_si
-#define _simd_castps_si _mm256_castps_si256
-#define _simd_adds_epu8 _simdemu_adds_epu8
-#define _simd_subs_epu8 _simdemu_subs_epu8
-#define _simd_add_epi8 _simdemu_add_epi8
-#define _simd_cmpeq_epi64 _simdemu_cmpeq_epi64
-#define _simd_cmpgt_epi64 _simdemu_cmpgt_epi64
-#define _simd_cmpgt_epi8 _simdemu_cmpgt_epi8
-#define _simd_cmpeq_epi8 _simdemu_cmpeq_epi8
-#define _simd_cmpgt_epi16 _simdemu_cmpgt_epi16
-#define _simd_cmpeq_epi16 _simdemu_cmpeq_epi16
-#define _simd_movemask_epi8 _simdemu_movemask_epi8
-#define _simd_permute_ps _simdemu_permute_ps
-#define _simd_permute_epi32 _simdemu_permute_epi32
-#define _simd_srlv_epi32 _simdemu_srlv_epi32
-#define _simd_sllv_epi32 _simdemu_sllv_epi32
-
-SIMD_EMU_EPI(_simdemu_mul_epi32, _mm_mul_epi32)
-SIMD_EMU_EPI(_simdemu_mullo_epi32, _mm_mullo_epi32)
-SIMD_EMU_EPI(_simdemu_sub_epi32, _mm_sub_epi32)
-SIMD_EMU_EPI(_simdemu_sub_epi64, _mm_sub_epi64)
-SIMD_EMU_EPI(_simdemu_min_epi32, _mm_min_epi32)
-SIMD_EMU_EPI(_simdemu_min_epu32, _mm_min_epu32)
-SIMD_EMU_EPI(_simdemu_max_epi32, _mm_max_epi32)
-SIMD_EMU_EPI(_simdemu_max_epu32, _mm_max_epu32)
-SIMD_EMU_EPI(_simdemu_add_epi32, _mm_add_epi32)
-SIMD_EMU_EPI(_simdemu_and_si, _mm_and_si128)
-SIMD_EMU_EPI(_simdemu_andnot_si, _mm_andnot_si128)
-SIMD_EMU_EPI(_simdemu_cmpeq_epi32, _mm_cmpeq_epi32)
-SIMD_EMU_EPI(_simdemu_cmplt_epi32, _mm_cmplt_epi32)
-SIMD_EMU_EPI(_simdemu_cmpgt_epi32, _mm_cmpgt_epi32)
-SIMD_EMU_EPI(_simdemu_or_si, _mm_or_si128)
-SIMD_EMU_EPI(_simdemu_xor_si, _mm_xor_si128)
-SIMD_EMU_EPI(_simdemu_adds_epu8, _mm_adds_epu8)
-SIMD_EMU_EPI(_simdemu_subs_epu8, _mm_subs_epu8)
-SIMD_EMU_EPI(_simdemu_add_epi8, _mm_add_epi8)
-SIMD_EMU_EPI(_simdemu_cmpeq_epi64, _mm_cmpeq_epi64)
-SIMD_EMU_EPI(_simdemu_cmpgt_epi64, _mm_cmpgt_epi64)
-SIMD_EMU_EPI(_simdemu_cmpgt_epi8, _mm_cmpgt_epi8)
-SIMD_EMU_EPI(_simdemu_cmpeq_epi8, _mm_cmpeq_epi8)
-SIMD_EMU_EPI(_simdemu_cmpgt_epi16, _mm_cmpgt_epi16)
-SIMD_EMU_EPI(_simdemu_cmpeq_epi16, _mm_cmpeq_epi16)
-SIMD_EMU_EPI(_simdemu_unpacklo_epi8, _mm_unpacklo_epi8)
-SIMD_EMU_EPI(_simdemu_unpackhi_epi8, _mm_unpackhi_epi8)
-SIMD_EMU_EPI(_simdemu_unpacklo_epi16, _mm_unpacklo_epi16)
-SIMD_EMU_EPI(_simdemu_unpackhi_epi16, _mm_unpackhi_epi16)
-
-#define _simd_unpacklo_epi8 _simdemu_unpacklo_epi8
-#define _simd_unpackhi_epi8 _simdemu_unpackhi_epi8
-#define _simd_unpacklo_epi16 _simdemu_unpacklo_epi16
-#define _simd_unpackhi_epi16 _simdemu_unpackhi_epi16
-#define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
-#define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
-#define _simd_unpacklo_epi64(a, b) _mm256_castpd_si256(_mm256_unpacklo_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)))
-#define _simd_unpackhi_epi64(a, b) _mm256_castpd_si256(_mm256_unpackhi_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)))
-
-#define _simd_slli_epi32(a,i) _simdemu_slli_epi32(a,i)
-#define _simd_srai_epi32(a,i) _simdemu_srai_epi32(a,i)
-#define _simd_srli_epi32(a,i) _simdemu_srli_epi32(a,i)
-#define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a)))
-
-#define _simd128_fmadd_ps _mm_fmaddemu_ps
-#define _simd_fmadd_ps _mm_fmaddemu256_ps
-#define _simd_fmsub_ps _mm_fmsubemu256_ps
-#define _simd_shuffle_epi8 _simdemu_shuffle_epi8
-SIMD_EMU_EPI(_simdemu_shuffle_epi8, _mm_shuffle_epi8)
-
-INLINE
-__m128 _mm_fmaddemu_ps(__m128 a, __m128 b, __m128 c)
-{
- __m128 res = _mm_mul_ps(a, b);
- res = _mm_add_ps(res, c);
- return res;
-}
-
-INLINE
-__m256 _mm_fmaddemu256_ps(__m256 a, __m256 b, __m256 c)
-{
- __m256 res = _mm256_mul_ps(a, b);
- res = _mm256_add_ps(res, c);
- return res;
-}
-
-INLINE
-__m256 _mm_fmsubemu256_ps(__m256 a, __m256 b, __m256 c)
-{
- __m256 res = _mm256_mul_ps(a, b);
- res = _mm256_sub_ps(res, c);
- return res;
-}
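
// Note: the emulated forms round twice (after the multiply, then after the
// add/subtract), whereas a hardware FMA rounds once, so results can differ
// from _mm256_fmadd_ps / _mm256_fmsub_ps in the last ulp.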
-
-INLINE
-__m256 _simd_i32gather_ps(const float* pBase, __m256i vOffsets, const int scale)
-{
- uint32_t *pOffsets = (uint32_t*)&vOffsets;
- simdscalar vResult;
- float* pResult = (float*)&vResult;
- for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
- {
- uint32_t offset = pOffsets[i];
- offset = offset * scale;
- pResult[i] = *(float*)(((const uint8_t*)pBase + offset));
- }
-
- return vResult;
-}
-
-INLINE
-__m256 _simd_mask_i32gather_ps(__m256 vSrc, const float* pBase, __m256i vOffsets, __m256 vMask, const int scale)
-{
- uint32_t *pOffsets = (uint32_t*)&vOffsets;
- simdscalar vResult = vSrc;
- float* pResult = (float*)&vResult;
- DWORD index;
- uint32_t mask = _simd_movemask_ps(vMask);
- while (_BitScanForward(&index, mask))
- {
- mask &= ~(1 << index);
- uint32_t offset = pOffsets[index];
- offset = offset * scale;
- pResult[index] = *(float*)(((const uint8_t*)pBase + offset));
- }
-
- return vResult;
-}
-
-INLINE
-__m256i _simd_abs_epi32(__m256i a)
-{
- __m128i aHi = _mm256_extractf128_si256(a, 1);
- __m128i aLo = _mm256_castsi256_si128(a);
- __m128i absLo = _mm_abs_epi32(aLo);
- __m128i absHi = _mm_abs_epi32(aHi);
- __m256i result = _mm256_castsi128_si256(absLo);
- result = _mm256_insertf128_si256(result, absHi, 1);
- return result;
-}
-
-INLINE
-int _simdemu_movemask_epi8(__m256i a)
-{
- __m128i aHi = _mm256_extractf128_si256(a, 1);
- __m128i aLo = _mm256_castsi256_si128(a);
-
- int resHi = _mm_movemask_epi8(aHi);
- int resLo = _mm_movemask_epi8(aLo);
-
- return (resHi << 16) | resLo;
-}
-
-INLINE
-__m256i _simd_cvtepu8_epi16(__m128i a)
-{
- __m128i resultlo = _mm_cvtepu8_epi16(a);
- __m128i resulthi = _mm_cvtepu8_epi16(_mm_srli_si128(a, 8));
-
- __m256i result = _mm256_castsi128_si256(resultlo);
-
- return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
-INLINE
-__m256i _simd_cvtepu8_epi32(__m128i a)
-{
- __m128i resultlo = _mm_cvtepu8_epi32(a);
- __m128i resulthi = _mm_cvtepu8_epi32(_mm_srli_si128(a, 4));
-
- __m256i result = _mm256_castsi128_si256(resultlo);
-
- return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
-INLINE
-__m256i _simd_cvtepu16_epi32(__m128i a)
-{
- __m128i resultlo = _mm_cvtepu16_epi32(a);
- __m128i resulthi = _mm_cvtepu16_epi32(_mm_srli_si128(a, 8));
-
- __m256i result = _mm256_castsi128_si256(resultlo);
-
- return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
-INLINE
-__m256i _simd_cvtepu16_epi64(__m128i a)
-{
- __m128i resultlo = _mm_cvtepu16_epi64(a);
- __m128i resulthi = _mm_cvtepu16_epi64(_mm_srli_si128(a, 4));
-
- __m256i result = _mm256_castsi128_si256(resultlo);
-
- return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
-INLINE
-__m256i _simd_cvtepu32_epi64(__m128i a)
-{
- __m128i resultlo = _mm_cvtepu32_epi64(a);
- __m128i resulthi = _mm_cvtepu32_epi64(_mm_srli_si128(a, 8));
-
- __m256i result = _mm256_castsi128_si256(resultlo);
-
- return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
-INLINE
-__m256i _simd_packus_epi16(__m256i a, __m256i b)
-{
- __m128i alo = _mm256_extractf128_si256(a, 0);
- __m128i ahi = _mm256_extractf128_si256(a, 1);
-
- __m128i blo = _mm256_extractf128_si256(b, 0);
- __m128i bhi = _mm256_extractf128_si256(b, 1);
-
- __m128i resultlo = _mm_packus_epi16(alo, blo);
- __m128i resulthi = _mm_packus_epi16(ahi, bhi);
-
- __m256i result = _mm256_castsi128_si256(resultlo);
-
- return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
-INLINE
-__m256i _simd_packs_epi16(__m256i a, __m256i b)
-{
- __m128i alo = _mm256_extractf128_si256(a, 0);
- __m128i ahi = _mm256_extractf128_si256(a, 1);
-
- __m128i blo = _mm256_extractf128_si256(b, 0);
- __m128i bhi = _mm256_extractf128_si256(b, 1);
-
- __m128i resultlo = _mm_packs_epi16(alo, blo);
- __m128i resulthi = _mm_packs_epi16(ahi, bhi);
-
- __m256i result = _mm256_castsi128_si256(resultlo);
-
- return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
-INLINE
-__m256i _simd_packus_epi32(__m256i a, __m256i b)
-{
- __m128i alo = _mm256_extractf128_si256(a, 0);
- __m128i ahi = _mm256_extractf128_si256(a, 1);
-
- __m128i blo = _mm256_extractf128_si256(b, 0);
- __m128i bhi = _mm256_extractf128_si256(b, 1);
-
- __m128i resultlo = _mm_packus_epi32(alo, blo);
- __m128i resulthi = _mm_packus_epi32(ahi, bhi);
-
- __m256i result = _mm256_castsi128_si256(resultlo);
-
- return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
-INLINE
-__m256i _simd_packs_epi32(__m256i a, __m256i b)
-{
- __m128i alo = _mm256_extractf128_si256(a, 0);
- __m128i ahi = _mm256_extractf128_si256(a, 1);
-
- __m128i blo = _mm256_extractf128_si256(b, 0);
- __m128i bhi = _mm256_extractf128_si256(b, 1);
-
- __m128i resultlo = _mm_packs_epi32(alo, blo);
- __m128i resulthi = _mm_packs_epi32(ahi, bhi);
-
- __m256i result = _mm256_castsi128_si256(resultlo);
-
- return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
+typedef SIMD256 SIMD;
#else
-
-#define _simd_mul_epi32 _mm256_mul_epi32
-#define _simd_mullo_epi32 _mm256_mullo_epi32
-#define _simd_sub_epi32 _mm256_sub_epi32
-#define _simd_sub_epi64 _mm256_sub_epi64
-#define _simd_min_epi32 _mm256_min_epi32
-#define _simd_max_epi32 _mm256_max_epi32
-#define _simd_min_epu32 _mm256_min_epu32
-#define _simd_max_epu32 _mm256_max_epu32
-#define _simd_add_epi32 _mm256_add_epi32
-#define _simd_and_si _mm256_and_si256
-#define _simd_andnot_si _mm256_andnot_si256
-#define _simd_cmpeq_epi32 _mm256_cmpeq_epi32
-#define _simd_cmplt_epi32(a,b) _mm256_cmpgt_epi32(b,a)
-#define _simd_cmpgt_epi32(a,b) _mm256_cmpgt_epi32(a,b)
-#define _simd_or_si _mm256_or_si256
-#define _simd_xor_si _mm256_xor_si256
-#define _simd_castps_si _mm256_castps_si256
-
-#define _simd_unpacklo_epi8 _mm256_unpacklo_epi8
-#define _simd_unpackhi_epi8 _mm256_unpackhi_epi8
-#define _simd_unpacklo_epi16 _mm256_unpacklo_epi16
-#define _simd_unpackhi_epi16 _mm256_unpackhi_epi16
-#define _simd_unpacklo_epi32 _mm256_unpacklo_epi32
-#define _simd_unpackhi_epi32 _mm256_unpackhi_epi32
-#define _simd_unpacklo_epi64 _mm256_unpacklo_epi64
-#define _simd_unpackhi_epi64 _mm256_unpackhi_epi64
-
-#define _simd_srli_si(a,i) _simdemu_srli_si128<i>(a)
-#define _simd_slli_epi32 _mm256_slli_epi32
-#define _simd_srai_epi32 _mm256_srai_epi32
-#define _simd_srli_epi32 _mm256_srli_epi32
-#define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a)))
-#define _simd128_fmadd_ps _mm_fmadd_ps
-#define _simd_fmadd_ps _mm256_fmadd_ps
-#define _simd_fmsub_ps _mm256_fmsub_ps
-#define _simd_shuffle_epi8 _mm256_shuffle_epi8
-#define _simd_adds_epu8 _mm256_adds_epu8
-#define _simd_subs_epu8 _mm256_subs_epu8
-#define _simd_add_epi8 _mm256_add_epi8
-#define _simd_i32gather_ps _mm256_i32gather_ps
-#define _simd_mask_i32gather_ps _mm256_mask_i32gather_ps
-#define _simd_abs_epi32 _mm256_abs_epi32
-
-#define _simd_cmpeq_epi64 _mm256_cmpeq_epi64
-#define _simd_cmpgt_epi64 _mm256_cmpgt_epi64
-#define _simd_cmpgt_epi8 _mm256_cmpgt_epi8
-#define _simd_cmpeq_epi8 _mm256_cmpeq_epi8
-#define _simd_cmpgt_epi16 _mm256_cmpgt_epi16
-#define _simd_cmpeq_epi16 _mm256_cmpeq_epi16
-#define _simd_movemask_epi8 _mm256_movemask_epi8
-#define _simd_permute_ps _mm256_permutevar8x32_ps
-#define _simd_permute_epi32 _mm256_permutevar8x32_epi32
-#define _simd_srlv_epi32 _mm256_srlv_epi32
-#define _simd_sllv_epi32 _mm256_sllv_epi32
-#define _simd_cvtepu8_epi16 _mm256_cvtepu8_epi16
-#define _simd_cvtepu8_epi32 _mm256_cvtepu8_epi32
-#define _simd_cvtepu16_epi32 _mm256_cvtepu16_epi32
-#define _simd_cvtepu16_epi64 _mm256_cvtepu16_epi64
-#define _simd_cvtepu32_epi64 _mm256_cvtepu32_epi64
-#define _simd_packus_epi16 _mm256_packus_epi16
-#define _simd_packs_epi16 _mm256_packs_epi16
-#define _simd_packus_epi32 _mm256_packus_epi32
-#define _simd_packs_epi32 _mm256_packs_epi32
-
-#endif
-
-#define _simd_unpacklo_ps _mm256_unpacklo_ps
-#define _simd_unpackhi_ps _mm256_unpackhi_ps
-#define _simd_unpacklo_pd _mm256_unpacklo_pd
-#define _simd_unpackhi_pd _mm256_unpackhi_pd
-#define _simd_insertf128_ps _mm256_insertf128_ps
-#define _simd_insertf128_pd _mm256_insertf128_pd
-#define _simd_insertf128_si _mm256_insertf128_si256
-#define _simd_extractf128_ps _mm256_extractf128_ps
-#define _simd_extractf128_pd _mm256_extractf128_pd
-#define _simd_extractf128_si _mm256_extractf128_si256
-#define _simd_permute2f128_ps _mm256_permute2f128_ps
-#define _simd_permute2f128_pd _mm256_permute2f128_pd
-#define _simd_permute2f128_si _mm256_permute2f128_si256
-#define _simd_shuffle_ps _mm256_shuffle_ps
-#define _simd_shuffle_pd _mm256_shuffle_pd
-#define _simd_shuffle_epi32(a, b, imm8) _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), imm8))
-#define _simd_shuffle_epi64(a, b, imm8) _mm256_castps_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b), imm8))
-#define _simd_set1_epi32 _mm256_set1_epi32
-#define _simd_set_epi32 _mm256_set_epi32
-#define _simd_set1_epi8 _mm256_set1_epi8
-#define _simd_setzero_si _mm256_setzero_si256
-#define _simd_cvttps_epi32 _mm256_cvttps_epi32
-#define _simd_store_si _mm256_store_si256
-#define _simd_broadcast_ss _mm256_broadcast_ss
-#define _simd_maskstore_ps _mm256_maskstore_ps
-#define _simd_load_si _mm256_load_si256
-#define _simd_loadu_si _mm256_loadu_si256
-#define _simd_sub_ps _mm256_sub_ps
-#define _simd_testz_ps _mm256_testz_ps
-#define _simd_testz_si _mm256_testz_si256
-#define _simd_xor_ps _mm256_xor_ps
-
-INLINE
-simdscalari _simd_loadu2_si(const __m128i *hiaddr, const __m128i *loaddr)
-{
- __m128i lo = _mm_loadu_si128(loaddr);
- __m128i hi = _mm_loadu_si128(hiaddr);
-
- return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
-}
-
-INLINE
-void _simd_storeu2_si(__m128i *hiaddr, __m128i *loaddr, simdscalari a)
-{
- _mm_storeu_si128(loaddr, _mm256_castsi256_si128(a));
- _mm_storeu_si128(hiaddr, _mm256_extractf128_si256(a, 1));
-}
-
-INLINE
-simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalar mask)
-{
- return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), mask));
-}
-
-INLINE
-simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalari mask)
-{
- return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), _simd_castsi_ps(mask)));
-}
-
-template<int mask>
-INLINE
-__m128i _simd_blend4_epi32(__m128i a, __m128i b)
-{
- return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), mask));
+#error Unsupported vector width
+#endif//KNOB_SIMD_WIDTH == 8
+
+
+#define _simd128_maskstore_ps SIMD128::maskstore_ps
+#define _simd128_fmadd_ps SIMD128::fmadd_ps
+
+#define _simd_load_ps SIMD::load_ps
+#define _simd_load1_ps SIMD::broadcast_ss
+#define _simd_loadu_ps SIMD::loadu_ps
+#define _simd_setzero_ps SIMD::setzero_ps
+#define _simd_set1_ps SIMD::set1_ps
+#define _simd_blend_ps(a, b, i) SIMD::blend_ps<i>(a, b)
+#define _simd_blend_epi32(a, b, i) SIMD::blend_epi32<i>(a, b)
+#define _simd_blendv_ps SIMD::blendv_ps
+#define _simd_store_ps SIMD::store_ps
+#define _simd_mul_ps SIMD::mul_ps
+#define _simd_add_ps SIMD::add_ps
+#define _simd_sub_ps SIMD::sub_ps
+#define _simd_rsqrt_ps SIMD::rsqrt_ps
+#define _simd_min_ps SIMD::min_ps
+#define _simd_max_ps SIMD::max_ps
+#define _simd_movemask_ps SIMD::movemask_ps
+#define _simd_cvtps_epi32 SIMD::cvtps_epi32
+#define _simd_cvttps_epi32 SIMD::cvttps_epi32
+#define _simd_cvtepi32_ps SIMD::cvtepi32_ps
+#define _simd_cmplt_ps SIMD::cmplt_ps
+#define _simd_cmpgt_ps SIMD::cmpgt_ps
+#define _simd_cmpneq_ps SIMD::cmpneq_ps
+#define _simd_cmpeq_ps SIMD::cmpeq_ps
+#define _simd_cmpge_ps SIMD::cmpge_ps
+#define _simd_cmple_ps SIMD::cmple_ps
+#define _simd_cmp_ps(a, b, imm) SIMD::cmp_ps<SIMD::CompareType(imm)>(a, b)
+#define _simd_and_ps SIMD::and_ps
+#define _simd_or_ps SIMD::or_ps
+#define _simd_rcp_ps SIMD::rcp_ps
+#define _simd_div_ps SIMD::div_ps
+#define _simd_castsi_ps SIMD::castsi_ps
+#define _simd_castps_pd SIMD::castps_pd
+#define _simd_castpd_ps SIMD::castpd_ps
+#define _simd_andnot_ps SIMD::andnot_ps
+#define _simd_round_ps(a, i) SIMD::round_ps<SIMD::RoundMode(i)>(a)
+#define _simd_broadcast_ps(a) SIMD::broadcast_ps((SIMD128::Float const *)(a))
+#define _simd_stream_ps SIMD::stream_ps
+
+#define _simd_movemask_pd SIMD::movemask_pd
+#define _simd_castsi_pd SIMD::castsi_pd
+
+#define _simd_mul_epi32 SIMD::mul_epi32
+#define _simd_mullo_epi32 SIMD::mullo_epi32
+#define _simd_sub_epi32 SIMD::sub_epi32
+#define _simd_sub_epi64 SIMD::sub_epi64
+#define _simd_min_epi32 SIMD::min_epi32
+#define _simd_min_epu32 SIMD::min_epu32
+#define _simd_max_epi32 SIMD::max_epi32
+#define _simd_max_epu32 SIMD::max_epu32
+#define _simd_add_epi32 SIMD::add_epi32
+#define _simd_and_si SIMD::and_si
+#define _simd_andnot_si SIMD::andnot_si
+#define _simd_cmpeq_epi32 SIMD::cmpeq_epi32
+#define _simd_cmplt_epi32 SIMD::cmplt_epi32
+#define _simd_cmpgt_epi32 SIMD::cmpgt_epi32
+#define _simd_or_si SIMD::or_si
+#define _simd_xor_si SIMD::xor_si
+#define _simd_castps_si SIMD::castps_si
+#define _simd_adds_epu8 SIMD::adds_epu8
+#define _simd_subs_epu8 SIMD::subs_epu8
+#define _simd_add_epi8 SIMD::add_epi8
+#define _simd_cmpeq_epi64 SIMD::cmpeq_epi64
+#define _simd_cmpgt_epi64 SIMD::cmpgt_epi64
+#define _simd_cmpgt_epi8 SIMD::cmpgt_epi8
+#define _simd_cmpeq_epi8 SIMD::cmpeq_epi8
+#define _simd_cmpgt_epi16 SIMD::cmpgt_epi16
+#define _simd_cmpeq_epi16 SIMD::cmpeq_epi16
+#define _simd_movemask_epi8 SIMD::movemask_epi8
+#define _simd_permute_ps SIMD::permute_ps
+#define _simd_permute_epi32 SIMD::permute_epi32
+#define _simd_srlv_epi32 SIMD::srlv_epi32
+#define _simd_sllv_epi32 SIMD::sllv_epi32
+
+#define _simd_unpacklo_epi8 SIMD::unpacklo_epi8
+#define _simd_unpackhi_epi8 SIMD::unpackhi_epi8
+#define _simd_unpacklo_epi16 SIMD::unpacklo_epi16
+#define _simd_unpackhi_epi16 SIMD::unpackhi_epi16
+#define _simd_unpacklo_epi32 SIMD::unpacklo_epi32
+#define _simd_unpackhi_epi32 SIMD::unpackhi_epi32
+#define _simd_unpacklo_epi64 SIMD::unpacklo_epi64
+#define _simd_unpackhi_epi64 SIMD::unpackhi_epi64
+
+#define _simd_slli_epi32(a,i) SIMD::slli_epi32<i>(a)
+#define _simd_srai_epi32(a,i) SIMD::srai_epi32<i>(a)
+#define _simd_srli_epi32(a,i) SIMD::srli_epi32<i>(a)
+#define _simd_srlisi_ps(a,i) SIMD::srlisi_ps<i>(a)
+
+#define _simd_fmadd_ps SIMD::fmadd_ps
+#define _simd_fmsub_ps SIMD::fmsub_ps
+#define _simd_shuffle_epi8 SIMD::shuffle_epi8
+
+#define _simd_i32gather_ps(p, o, s) SIMD::i32gather_ps<SIMD::ScaleFactor(s)>(p, o)
+#define _simd_mask_i32gather_ps(r, p, o, m, s) SIMD::mask_i32gather_ps<SIMD::ScaleFactor(s)>(r, p, o, m)
+#define _simd_abs_epi32 SIMD::abs_epi32
+
+#define _simd_cvtepu8_epi16 SIMD::cvtepu8_epi16
+#define _simd_cvtepu8_epi32 SIMD::cvtepu8_epi32
+#define _simd_cvtepu16_epi32 SIMD::cvtepu16_epi32
+#define _simd_cvtepu16_epi64 SIMD::cvtepu16_epi64
+#define _simd_cvtepu32_epi64 SIMD::cvtepu32_epi64
+
+#define _simd_packus_epi16 SIMD::packus_epi16
+#define _simd_packs_epi16 SIMD::packs_epi16
+#define _simd_packus_epi32 SIMD::packus_epi32
+#define _simd_packs_epi32 SIMD::packs_epi32
+
+#define _simd_unpacklo_ps SIMD::unpacklo_ps
+#define _simd_unpackhi_ps SIMD::unpackhi_ps
+#define _simd_unpacklo_pd SIMD::unpacklo_pd
+#define _simd_unpackhi_pd SIMD::unpackhi_pd
+#define _simd_insertf128_ps SIMD::insertf128_ps
+#define _simd_insertf128_pd SIMD::insertf128_pd
+#define _simd_insertf128_si(a, b, i) SIMD::insertf128_si<i>(a, b)
+#define _simd_extractf128_ps(a, i) SIMD::extractf128_ps<i>(a)
+#define _simd_extractf128_pd(a, i) SIMD::extractf128_pd<i>(a)
+#define _simd_extractf128_si(a, i) SIMD::extractf128_si<i>(a)
+#define _simd_permute2f128_ps(a, b, i) SIMD::permute2f128_ps<i>(a, b)
+#define _simd_permute2f128_pd(a, b, i) SIMD::permute2f128_pd<i>(a, b)
+#define _simd_permute2f128_si(a, b, i) SIMD::permute2f128_si<i>(a, b)
+#define _simd_shuffle_ps(a, b, i) SIMD::shuffle_ps<i>(a, b)
+#define _simd_shuffle_pd(a, b, i) SIMD::shuffle_pd<i>(a, b)
+#define _simd_shuffle_epi32(a, b, imm8) SIMD::shuffle_epi32<imm8>(a, b)
+#define _simd_shuffle_epi64(a, b, imm8) SIMD::shuffle_epi64<imm8>(a, b)
+#define _simd_set1_epi32 SIMD::set1_epi32
+#define _simd_set_epi32 SIMD::set_epi32
+#define _simd_set_ps SIMD::set_ps
+#define _simd_set1_epi8 SIMD::set1_epi8
+#define _simd_setzero_si SIMD::setzero_si
+#define _simd_store_si SIMD::store_si
+#define _simd_broadcast_ss SIMD::broadcast_ss
+#define _simd_maskstore_ps SIMD::maskstore_ps
+#define _simd_load_si SIMD::load_si
+#define _simd_loadu_si SIMD::loadu_si
+#define _simd_testz_ps SIMD::testz_ps
+#define _simd_testz_si SIMD::testz_si
+#define _simd_xor_ps SIMD::xor_ps
+
+#define _simd_loadu2_si SIMD::loadu2_si
+#define _simd_storeu2_si SIMD::storeu2_si
+
+#define _simd_blendv_epi32 SIMD::blendv_epi32
+
+template<int mask> SIMDINLINE
+SIMD128::Integer _simd_blend4_epi32(SIMD128::Integer a, SIMD128::Integer b)
+{
+ return SIMD128::castps_si(SIMD128::blend_ps<mask>(SIMD128::castsi_ps(a), SIMD128::castsi_ps(b)));
}
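+// Usage sketch (illustrative, hypothetical operands): the immediate mask picks
+// lanes from b where the corresponding bit is set, e.g.
+//   SIMD128::Integer r = _simd_blend4_epi32<0xC>(a, b); // r = { a.x, a.y, b.z, b.w }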
// convert bitmask to vector mask
-INLINE
-simdscalar vMask(int32_t mask)
+SIMDINLINE
+SIMD256::Float vMask(int32_t mask)
{
- __m256i vec = _mm256_set1_epi32(mask);
- const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
- vec = _simd_and_si(vec, bit);
- vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
- return _simd_castsi_ps(vec);
+ SIMD256::Integer vec = SIMD256::set1_epi32(mask);
+ const SIMD256::Integer bit = SIMD256::set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
+ vec = SIMD256::and_si(vec, bit);
+ vec = SIMD256::cmplt_epi32(SIMD256::setzero_si(), vec);
+ return SIMD256::castsi_ps(vec);
}
-INLINE
-simdscalari vMaski(int32_t mask)
+SIMDINLINE
+SIMD256::Integer vMaski(int32_t mask)
{
- __m256i vec = _mm256_set1_epi32(mask);
- const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
- vec = _simd_and_si(vec, bit);
- return _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
+ SIMD256::Integer vec = SIMD256::set1_epi32(mask);
+ const SIMD256::Integer bit = SIMD256::set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
+ vec = SIMD256::and_si(vec, bit);
+ return SIMD256::cmplt_epi32(SIMD256::setzero_si(), vec);
}
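+// Usage sketch (illustrative): vMask expands the low KNOB_SIMD_WIDTH bits of a
+// coverage bitmask into per-lane all-ones/all-zeros, ready for blendv/maskstore:
+//   simdscalar lanes = vMask(0x05);           // lanes 0 and 2 enabled
+//   assert(_simd_movemask_ps(lanes) == 0x05); // movemask round-trips the bits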
-INLINE
+SIMDINLINE
void _simd_mov(simdscalar &r, unsigned int rlane, simdscalar& s, unsigned int slane)
{
OSALIGNSIMD(float) rArray[KNOB_SIMD_WIDTH], sArray[KNOB_SIMD_WIDTH];
- _mm256_store_ps(rArray, r);
- _mm256_store_ps(sArray, s);
+ SIMD256::store_ps(rArray, r);
+ SIMD256::store_ps(sArray, s);
rArray[rlane] = sArray[slane];
- r = _mm256_load_ps(rArray);
+ r = SIMD256::load_ps(rArray);
}
-INLINE __m256i _simdemu_slli_epi32(__m256i a, uint32_t i)
-{
- __m128i aHi = _mm256_extractf128_si256(a, 1);
- __m128i aLo = _mm256_castsi256_si128(a);
-
- __m128i resHi = _mm_slli_epi32(aHi, i);
- __m128i resLo = _mm_slli_epi32(aLo, i);
-
- __m256i result = _mm256_castsi128_si256(resLo);
- result = _mm256_insertf128_si256(result, resHi, 1);
-
- return result;
-}
-
-INLINE __m256i _simdemu_srai_epi32(__m256i a, uint32_t i)
-{
- __m128i aHi = _mm256_extractf128_si256(a, 1);
- __m128i aLo = _mm256_castsi256_si128(a);
-
- __m128i resHi = _mm_srai_epi32(aHi, i);
- __m128i resLo = _mm_srai_epi32(aLo, i);
-
- __m256i result = _mm256_castsi128_si256(resLo);
- result = _mm256_insertf128_si256(result, resHi, 1);
-
- return result;
-}
-
-INLINE __m256i _simdemu_srli_epi32(__m256i a, uint32_t i)
-{
- __m128i aHi = _mm256_extractf128_si256(a, 1);
- __m128i aLo = _mm256_castsi256_si128(a);
-
- __m128i resHi = _mm_srli_epi32(aHi, i);
- __m128i resLo = _mm_srli_epi32(aLo, i);
-
- __m256i result = _mm256_castsi128_si256(resLo);
- result = _mm256_insertf128_si256(result, resHi, 1);
-
- return result;
-}
-
-INLINE
-void _simdvec_transpose(simdvector &v)
-{
- SWR_INVALID("Need to implement 8 wide version");
-}
-
-#else
-#error Unsupported vector width
-#endif
-
// Populates a simdvector from a vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
-INLINE
-void _simdvec_load_ps(simdvector& r, const float *p)
-{
- r[0] = _simd_set1_ps(p[0]);
- r[1] = _simd_set1_ps(p[1]);
- r[2] = _simd_set1_ps(p[2]);
- r[3] = _simd_set1_ps(p[3]);
-}
+#define _simdvec_load_ps SIMD::vec4_load1_ps
-INLINE
+SIMDINLINE
void _simdvec_mov(simdvector& r, const simdscalar& s)
{
- r[0] = s;
- r[1] = s;
- r[2] = s;
- r[3] = s;
+ SIMD::vec4_set1_vps(r, s);
}
-INLINE
+SIMDINLINE
void _simdvec_mov(simdvector& r, const simdvector& v)
{
- r[0] = v[0];
- r[1] = v[1];
- r[2] = v[2];
- r[3] = v[3];
+ r = v;
}
#if 0
// just move a lane from the source simdvector to dest simdvector
-INLINE
+SIMDINLINE
void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane)
{
_simd_mov(r[0], rlane, s[0], slane);
}
#endif
-INLINE
-void _simdvec_dp3_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
-{
- simdscalar tmp;
- r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x)
-
- tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y)
- r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y)
- tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z)
- r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
-}
-
-INLINE
-void _simdvec_dp4_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
-{
- simdscalar tmp;
- r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x)
-
- tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y)
- r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y)
-
- tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z)
- r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
-
- tmp = _simd_mul_ps(v0[3], v1[3]); // (v0.w*v1.w)
- r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
-}
-
-INLINE
-simdscalar _simdvec_rcp_length_ps(const simdvector& v)
-{
- simdscalar length;
- _simdvec_dp4_ps(length, v, v);
- return _simd_rsqrt_ps(length);
-}
-
-INLINE
-void _simdvec_normalize_ps(simdvector& r, const simdvector& v)
-{
- simdscalar vecLength;
- vecLength = _simdvec_rcp_length_ps(v);
-
- r[0] = _simd_mul_ps(v[0], vecLength);
- r[1] = _simd_mul_ps(v[1], vecLength);
- r[2] = _simd_mul_ps(v[2], vecLength);
- r[3] = _simd_mul_ps(v[3], vecLength);
-}
-
-INLINE
-void _simdvec_mul_ps(simdvector& r, const simdvector& v, const simdscalar& s)
-{
- r[0] = _simd_mul_ps(v[0], s);
- r[1] = _simd_mul_ps(v[1], s);
- r[2] = _simd_mul_ps(v[2], s);
- r[3] = _simd_mul_ps(v[3], s);
-}
-
-INLINE
-void _simdvec_mul_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
-{
- r[0] = _simd_mul_ps(v0[0], v1[0]);
- r[1] = _simd_mul_ps(v0[1], v1[1]);
- r[2] = _simd_mul_ps(v0[2], v1[2]);
- r[3] = _simd_mul_ps(v0[3], v1[3]);
-}
-
-INLINE
-void _simdvec_add_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
-{
- r[0] = _simd_add_ps(v0[0], v1[0]);
- r[1] = _simd_add_ps(v0[1], v1[1]);
- r[2] = _simd_add_ps(v0[2], v1[2]);
- r[3] = _simd_add_ps(v0[3], v1[3]);
-}
-
-INLINE
-void _simdvec_min_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
-{
- r[0] = _simd_min_ps(v0[0], s);
- r[1] = _simd_min_ps(v0[1], s);
- r[2] = _simd_min_ps(v0[2], s);
- r[3] = _simd_min_ps(v0[3], s);
-}
-
-INLINE
-void _simdvec_max_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
-{
- r[0] = _simd_max_ps(v0[0], s);
- r[1] = _simd_max_ps(v0[1], s);
- r[2] = _simd_max_ps(v0[2], s);
- r[3] = _simd_max_ps(v0[3], s);
-}
-
-// Matrix4x4 * Vector4
-// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
-// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
-// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
-// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
-INLINE
-void _simd_mat4x4_vec4_multiply(
- simdvector& result,
- const float *pMatrix,
- const simdvector& v)
-{
- simdscalar m;
- simdscalar r0;
- simdscalar r1;
-
- m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3]
- r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
- result[0] = r0;
-
- m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3]
- r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
- result[1] = r0;
-
- m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3]
- r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
- result[2] = r0;
-
- m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3]
- r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
- result[3] = r0;
-}
-
-// Matrix4x4 * Vector3 - Direction Vector where w = 0.
-// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0)
-// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0)
-// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0)
-// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0)
-INLINE
-void _simd_mat3x3_vec3_w0_multiply(
- simdvector& result,
- const float *pMatrix,
- const simdvector& v)
-{
- simdscalar m;
- simdscalar r0;
- simdscalar r1;
-
- m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- result[0] = r0;
-
- m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- result[1] = r0;
-
- m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- result[2] = r0;
-
- result[3] = _simd_setzero_ps();
-}
-
-// Matrix4x4 * Vector3 - Position vector where w = 1.
-// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
-// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
-// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
-// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
-INLINE
-void _simd_mat4x4_vec3_w1_multiply(
- simdvector& result,
- const float *pMatrix,
- const simdvector& v)
-{
- simdscalar m;
- simdscalar r0;
- simdscalar r1;
-
- m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3]
- r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
- result[0] = r0;
-
- m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3]
- r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
- result[1] = r0;
-
- m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3]
- r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
- result[2] = r0;
-
- m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3]
- result[3] = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-}
-
-INLINE
-void _simd_mat4x3_vec3_w1_multiply(
- simdvector& result,
- const float *pMatrix,
- const simdvector& v)
-{
- simdscalar m;
- simdscalar r0;
- simdscalar r1;
-
- m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3]
- r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
- result[0] = r0;
-
- m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3]
- r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
- result[1] = r0;
-
- m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3]
- r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
- result[2] = r0;
- result[3] = _simd_set1_ps(1.0f);
-}
+#define _simdvec_dp3_ps SIMD::vec4_dp3_ps
+#define _simdvec_dp4_ps SIMD::vec4_dp4_ps
+#define _simdvec_rcp_length_ps SIMD::vec4_rcp_length_ps
+#define _simdvec_normalize_ps SIMD::vec4_normalize_ps
+#define _simdvec_mul_ps SIMD::vec4_mul_ps
+#define _simdvec_add_ps SIMD::vec4_add_ps
+#define _simdvec_min_ps SIMD::vec4_min_ps
+#define _simdvec_max_ps SIMD::vec4_max_ps
+#define _simd_mat4x4_vec4_multiply SIMD::mat4x4_vec4_multiply
+#define _simd_mat3x3_vec3_w0_multiply SIMD::mat3x3_vec3_w0_multiply
+#define _simd_mat4x4_vec3_w1_multiply SIMD::mat4x4_vec3_w1_multiply
+#define _simd_mat4x3_vec3_w1_multiply SIMD::mat4x3_vec3_w1_multiply
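+// Usage sketch (illustrative, 'pos' is a hypothetical SOA-form simdvector):
+// transform KNOB_SIMD_WIDTH positions by a row-major 4x4 matrix, where
+// pMatrix points at 16 consecutive floats:
+//   simdvector clip;
+//   _simd_mat4x4_vec4_multiply(clip, pMatrix, pos);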
//////////////////////////////////////////////////////////////////////////
/// @brief Compute plane equation vA * vX + vB * vY + vC
-INLINE simdscalar vplaneps(simdscalar vA, simdscalar vB, simdscalar vC, simdscalar &vX, simdscalar &vY)
+SIMDINLINE simdscalar vplaneps(simdscalar vA, simdscalar vB, simdscalar vC, simdscalar &vX, simdscalar &vY)
{
simdscalar vOut = _simd_fmadd_ps(vA, vX, vC);
vOut = _simd_fmadd_ps(vB, vY, vOut);
//////////////////////////////////////////////////////////////////////////
/// @brief Compute plane equation vA * vX + vB * vY + vC
-INLINE __m128 vplaneps128(__m128 vA, __m128 vB, __m128 vC, __m128 &vX, __m128 &vY)
+SIMDINLINE simd4scalar vplaneps(simd4scalar vA, simd4scalar vB, simd4scalar vC, simd4scalar &vX, simd4scalar &vY)
{
- __m128 vOut = _simd128_fmadd_ps(vA, vX, vC);
+ simd4scalar vOut = _simd128_fmadd_ps(vA, vX, vC);
vOut = _simd128_fmadd_ps(vB, vY, vOut);
return vOut;
}
/// @param vJ - barycentric J
/// @param pInterpBuffer - pointer to attribute barycentric coeffs
template<UINT Attrib, UINT Comp, UINT numComponents = 4>
-static INLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, const float *pInterpBuffer)
+static SIMDINLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, const float *pInterpBuffer)
{
const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
const float *pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
/// @brief Interpolates a single component (flat shade).
/// @param pInterpBuffer - pointer to attribute barycentric coeffs
template<UINT Attrib, UINT Comp, UINT numComponents = 4>
-static INLINE simdscalar InterpolateComponentFlat(const float *pInterpBuffer)
+static SIMDINLINE simdscalar InterpolateComponentFlat(const float *pInterpBuffer)
{
const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
/// @param vJ - barycentric J
/// @param pInterpBuffer - pointer to attribute barycentric coeffs
template<UINT Attrib, UINT Comp, UINT numComponents = 4>
-static INLINE __m128 InterpolateComponent(__m128 vI, __m128 vJ, const float *pInterpBuffer)
+static SIMDINLINE simd4scalar InterpolateComponent(simd4scalar vI, simd4scalar vJ, const float *pInterpBuffer)
{
const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
const float *pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
const float *pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp];
- __m128 vA = _mm_broadcast_ss(pInterpA);
- __m128 vB = _mm_broadcast_ss(pInterpB);
- __m128 vC = _mm_broadcast_ss(pInterpC);
+ simd4scalar vA = SIMD128::broadcast_ss(pInterpA);
+ simd4scalar vB = SIMD128::broadcast_ss(pInterpB);
+ simd4scalar vC = SIMD128::broadcast_ss(pInterpC);
- __m128 vk = _mm_sub_ps(_mm_sub_ps(_mm_set1_ps(1.0f), vI), vJ);
- vC = _mm_mul_ps(vk, vC);
+ simd4scalar vk = SIMD128::sub_ps(SIMD128::sub_ps(SIMD128::set1_ps(1.0f), vI), vJ);
+ vC = SIMD128::mul_ps(vk, vC);
- return vplaneps128(vA, vB, vC, vI, vJ);
+ return vplaneps(vA, vB, vC, vI, vJ);
}
-static INLINE __m128 _simd128_abs_ps(__m128 a)
+static SIMDINLINE simd4scalar _simd128_abs_ps(simd4scalar a)
{
- __m128i ai = _mm_castps_si128(a);
- return _mm_castsi128_ps(_mm_and_si128(ai, _mm_set1_epi32(0x7fffffff)));
+ simd4scalari ai = SIMD128::castps_si(a);
+ return SIMD128::castsi_ps(SIMD128::and_si(ai, SIMD128::set1_epi32(0x7fffffff)));
}
-static INLINE simdscalar _simd_abs_ps(simdscalar a)
+static SIMDINLINE simdscalar _simd_abs_ps(simdscalar a)
{
simdscalari ai = _simd_castps_si(a);
return _simd_castsi_ps(_simd_and_si(ai, _simd_set1_epi32(0x7fffffff)));
}
+
#if ENABLE_AVX512_SIMD16
#include "simd16intrin.h"
#endif//ENABLE_AVX512_SIMD16
--- /dev/null
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#pragma once
+
+#include "simdlib_types.hpp"
+
+// For documentation, please see the following include...
+// #include "simdlib_interface.hpp"
+
+namespace SIMDImpl
+{
+ namespace SIMD128Impl
+ {
+#if SIMD_ARCH >= SIMD_ARCH_AVX
+ struct AVXImpl
+ {
+#define __SIMD_LIB_AVX_HPP__
+#include "simdlib_128_avx.inl"
+#undef __SIMD_LIB_AVX_HPP__
+ }; // struct AVXImpl
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
+
+
+#if SIMD_ARCH >= SIMD_ARCH_AVX2
+ struct AVX2Impl : AVXImpl
+ {
+#define __SIMD_LIB_AVX2_HPP__
+#include "simdlib_128_avx2.inl"
+#undef __SIMD_LIB_AVX2_HPP__
+ }; // struct AVX2Impl
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
+
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+ struct AVX512Impl : AVX2Impl
+ {
+#define __SIMD_LIB_AVX512_HPP__
+#include "simdlib_128_avx512.inl"
+#undef __SIMD_LIB_AVX512_HPP__
+    }; // struct AVX512Impl
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
+
+ struct Traits : SIMDImpl::Traits
+ {
+#if SIMD_ARCH == SIMD_ARCH_AVX
+ using IsaImpl = AVXImpl;
+#elif SIMD_ARCH == SIMD_ARCH_AVX2
+ using IsaImpl = AVX2Impl;
+#elif SIMD_ARCH == SIMD_ARCH_AVX512
+ using IsaImpl = AVX512Impl;
+#else
+#error Invalid value for SIMD_ARCH
+#endif
+
+ using Float = SIMD128Impl::Float;
+ using Double = SIMD128Impl::Double;
+ using Integer = SIMD128Impl::Integer;
+ using Vec4 = SIMD128Impl::Vec4;
+ using Mask = SIMD128Impl::Mask;
+ };
+ } // ns SIMD128Impl
+
+ namespace SIMD256Impl
+ {
+#if SIMD_ARCH >= SIMD_ARCH_AVX
+ struct AVXImpl
+ {
+#define __SIMD_LIB_AVX_HPP__
+#include "simdlib_256_avx.inl"
+#undef __SIMD_LIB_AVX_HPP__
+ }; // struct AVXImpl
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
+
+
+#if SIMD_ARCH >= SIMD_ARCH_AVX2
+ struct AVX2Impl : AVXImpl
+ {
+#define __SIMD_LIB_AVX2_HPP__
+#include "simdlib_256_avx2.inl"
+#undef __SIMD_LIB_AVX2_HPP__
+ }; // struct AVX2Impl
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
+
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+ struct AVX512Impl : AVX2Impl
+ {
+#define __SIMD_LIB_AVX512_HPP__
+#include "simdlib_256_avx512.inl"
+#undef __SIMD_LIB_AVX512_HPP__
+    }; // struct AVX512Impl
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
+
+ struct Traits : SIMDImpl::Traits
+ {
+#if SIMD_ARCH == SIMD_ARCH_AVX
+ using IsaImpl = AVXImpl;
+#elif SIMD_ARCH == SIMD_ARCH_AVX2
+ using IsaImpl = AVX2Impl;
+#elif SIMD_ARCH == SIMD_ARCH_AVX512
+ using IsaImpl = AVX512Impl;
+#else
+#error Invalid value for SIMD_ARCH
+#endif
+
+ using Float = SIMD256Impl::Float;
+ using Double = SIMD256Impl::Double;
+ using Integer = SIMD256Impl::Integer;
+ using Vec4 = SIMD256Impl::Vec4;
+ using Mask = SIMD256Impl::Mask;
+ };
+ } // ns SIMD256Impl
+
+ namespace SIMD512Impl
+ {
+#if SIMD_ARCH >= SIMD_ARCH_AVX
+ template<typename SIMD256T>
+ struct AVXImplBase
+ {
+#define __SIMD_LIB_AVX_HPP__
+#include "simdlib_512_emu.inl"
+#include "simdlib_512_emu_masks.inl"
+#undef __SIMD_LIB_AVX_HPP__
+ }; // struct AVXImplBase
+ using AVXImpl = AVXImplBase<SIMD256Impl::AVXImpl>;
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
+
+
+#if SIMD_ARCH >= SIMD_ARCH_AVX2
+ using AVX2Impl = AVXImplBase<SIMD256Impl::AVX2Impl>;
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
+
+
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+ struct AVX512Impl
+ {
+#define __SIMD_LIB_AVX512_HPP__
+#include "simdlib_512_avx512.inl"
+#include "simdlib_512_avx512_masks.inl"
+#undef __SIMD_LIB_AVX512_HPP__
+ }; // struct AVX512Impl
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
+
+ struct Traits : SIMDImpl::Traits
+ {
+#if SIMD_ARCH == SIMD_ARCH_AVX
+ using IsaImpl = AVXImpl;
+#elif SIMD_ARCH == SIMD_ARCH_AVX2
+ using IsaImpl = AVX2Impl;
+#elif SIMD_ARCH == SIMD_ARCH_AVX512
+ using IsaImpl = AVX512Impl;
+#else
+#error Invalid value for SIMD_ARCH
+#endif
+
+ using Float = SIMD512Impl::Float;
+ using Double = SIMD512Impl::Double;
+ using Integer = SIMD512Impl::Integer;
+ using Vec4 = SIMD512Impl::Vec4;
+ using Mask = SIMD512Impl::Mask;
+ };
+ } // ns SIMD512Impl
+} // ns SIMDImpl
+
+template <typename Traits>
+struct SIMDBase : Traits::IsaImpl
+{
+ using CompareType = typename Traits::CompareType;
+ using ScaleFactor = typename Traits::ScaleFactor;
+ using RoundMode = typename Traits::RoundMode;
+ using SIMD = typename Traits::IsaImpl;
+ using Float = typename Traits::Float;
+ using Double = typename Traits::Double;
+ using Integer = typename Traits::Integer;
+ using Vec4 = typename Traits::Vec4;
+ using Mask = typename Traits::Mask;
+
+ // Populates a SIMD Vec4 from a non-simd vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
+ static SIMDINLINE
+ void vec4_load1_ps(Vec4& r, const float *p)
+ {
+ r[0] = SIMD::set1_ps(p[0]);
+ r[1] = SIMD::set1_ps(p[1]);
+ r[2] = SIMD::set1_ps(p[2]);
+ r[3] = SIMD::set1_ps(p[3]);
+ }
+
+ static SIMDINLINE
+ void vec4_set1_vps(Vec4& r, Float s)
+ {
+ r[0] = s;
+ r[1] = s;
+ r[2] = s;
+ r[3] = s;
+ }
+
+ static SIMDINLINE
+ Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1)
+ {
+ Float tmp, r;
+ r = SIMD::mul_ps(v0[0], v1[0]); // (v0.x*v1.x)
+
+ tmp = SIMD::mul_ps(v0[1], v1[1]); // (v0.y*v1.y)
+ r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y)
+
+ tmp = SIMD::mul_ps(v0[2], v1[2]); // (v0.z*v1.z)
+ r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
+
+ return r;
+ }
+
+ static SIMDINLINE
+ Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1)
+ {
+ Float tmp, r;
+ r = SIMD::mul_ps(v0[0], v1[0]); // (v0.x*v1.x)
+
+ tmp = SIMD::mul_ps(v0[1], v1[1]); // (v0.y*v1.y)
+ r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y)
+
+ tmp = SIMD::mul_ps(v0[2], v1[2]); // (v0.z*v1.z)
+ r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
+
+ tmp = SIMD::mul_ps(v0[3], v1[3]); // (v0.w*v1.w)
+        r = SIMD::add_ps(r, tmp);       // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + (v0.w*v1.w)
+
+ return r;
+ }
+
+ static SIMDINLINE
+ Float vec4_rcp_length_ps(const Vec4& v)
+ {
+        Float lengthSq = vec4_dp4_ps(v, v); // |v|^2
+        return SIMD::rsqrt_ps(lengthSq);    // ~= 1 / |v| (approximate)
+ }
+
+ static SIMDINLINE
+ void vec4_normalize_ps(Vec4& r, const Vec4& v)
+ {
+ Float rcpLength = vec4_rcp_length_ps(v);
+
+ r[0] = SIMD::mul_ps(v[0], rcpLength);
+ r[1] = SIMD::mul_ps(v[1], rcpLength);
+ r[2] = SIMD::mul_ps(v[2], rcpLength);
+ r[3] = SIMD::mul_ps(v[3], rcpLength);
+ }
+
+ static SIMDINLINE
+ void vec4_mul_ps(Vec4& r, const Vec4& v, Float s)
+ {
+ r[0] = SIMD::mul_ps(v[0], s);
+ r[1] = SIMD::mul_ps(v[1], s);
+ r[2] = SIMD::mul_ps(v[2], s);
+ r[3] = SIMD::mul_ps(v[3], s);
+ }
+
+ static SIMDINLINE
+ void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
+ {
+ r[0] = SIMD::mul_ps(v0[0], v1[0]);
+ r[1] = SIMD::mul_ps(v0[1], v1[1]);
+ r[2] = SIMD::mul_ps(v0[2], v1[2]);
+ r[3] = SIMD::mul_ps(v0[3], v1[3]);
+ }
+
+ static SIMDINLINE
+ void vec4_add_ps(Vec4& r, const Vec4& v0, Float s)
+ {
+ r[0] = SIMD::add_ps(v0[0], s);
+ r[1] = SIMD::add_ps(v0[1], s);
+ r[2] = SIMD::add_ps(v0[2], s);
+ r[3] = SIMD::add_ps(v0[3], s);
+ }
+
+ static SIMDINLINE
+ void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
+ {
+ r[0] = SIMD::add_ps(v0[0], v1[0]);
+ r[1] = SIMD::add_ps(v0[1], v1[1]);
+ r[2] = SIMD::add_ps(v0[2], v1[2]);
+ r[3] = SIMD::add_ps(v0[3], v1[3]);
+ }
+
+ static SIMDINLINE
+ void vec4_min_ps(Vec4& r, const Vec4& v0, Float s)
+ {
+ r[0] = SIMD::min_ps(v0[0], s);
+ r[1] = SIMD::min_ps(v0[1], s);
+ r[2] = SIMD::min_ps(v0[2], s);
+ r[3] = SIMD::min_ps(v0[3], s);
+ }
+
+ static SIMDINLINE
+ void vec4_max_ps(Vec4& r, const Vec4& v0, Float s)
+ {
+ r[0] = SIMD::max_ps(v0[0], s);
+ r[1] = SIMD::max_ps(v0[1], s);
+ r[2] = SIMD::max_ps(v0[2], s);
+ r[3] = SIMD::max_ps(v0[3], s);
+ }
+
+ // Matrix4x4 * Vector4
+ // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
+ // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
+ // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
+ // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
+ static SIMDINLINE
+ void SIMDCALL mat4x4_vec4_multiply(
+ Vec4& result,
+ const float *pMatrix,
+ const Vec4& v)
+ {
+ Float m;
+ Float r0;
+ Float r1;
+
+ m = SIMD::load1_ps(pMatrix + 0*4 + 0); // m[row][0]
+ r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
+ m = SIMD::load1_ps(pMatrix + 0*4 + 1); // m[row][1]
+ r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = SIMD::load1_ps(pMatrix + 0*4 + 2); // m[row][2]
+ r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = SIMD::load1_ps(pMatrix + 0*4 + 3); // m[row][3]
+        r1 = SIMD::mul_ps(m, v[3]);     // (m3 * v.w)
+        r0 = SIMD::add_ps(r0, r1);      // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
+ result[0] = r0;
+
+ m = SIMD::load1_ps(pMatrix + 1*4 + 0); // m[row][0]
+ r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
+ m = SIMD::load1_ps(pMatrix + 1*4 + 1); // m[row][1]
+ r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = SIMD::load1_ps(pMatrix + 1*4 + 2); // m[row][2]
+ r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = SIMD::load1_ps(pMatrix + 1*4 + 3); // m[row][3]
+        r1 = SIMD::mul_ps(m, v[3]);     // (m3 * v.w)
+        r0 = SIMD::add_ps(r0, r1);      // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
+ result[1] = r0;
+
+ m = SIMD::load1_ps(pMatrix + 2*4 + 0); // m[row][0]
+ r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
+ m = SIMD::load1_ps(pMatrix + 2*4 + 1); // m[row][1]
+ r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = SIMD::load1_ps(pMatrix + 2*4 + 2); // m[row][2]
+ r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = SIMD::load1_ps(pMatrix + 2*4 + 3); // m[row][3]
+        r1 = SIMD::mul_ps(m, v[3]);     // (m3 * v.w)
+        r0 = SIMD::add_ps(r0, r1);      // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
+ result[2] = r0;
+
+ m = SIMD::load1_ps(pMatrix + 3*4 + 0); // m[row][0]
+ r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
+ m = SIMD::load1_ps(pMatrix + 3*4 + 1); // m[row][1]
+ r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = SIMD::load1_ps(pMatrix + 3*4 + 2); // m[row][2]
+ r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = SIMD::load1_ps(pMatrix + 3*4 + 3); // m[row][3]
+        r1 = SIMD::mul_ps(m, v[3]);     // (m3 * v.w)
+        r0 = SIMD::add_ps(r0, r1);      // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
+ result[3] = r0;
+ }
+
+ // Matrix4x4 * Vector3 - Direction Vector where w = 0.
+ // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0)
+ // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0)
+ // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0)
+ // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0)
+ static SIMDINLINE
+ void SIMDCALL mat3x3_vec3_w0_multiply(
+ Vec4& result,
+ const float *pMatrix,
+ const Vec4& v)
+ {
+ Float m;
+ Float r0;
+ Float r1;
+
+ m = SIMD::load1_ps(pMatrix + 0*4 + 0); // m[row][0]
+ r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
+ m = SIMD::load1_ps(pMatrix + 0*4 + 1); // m[row][1]
+ r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = SIMD::load1_ps(pMatrix + 0*4 + 2); // m[row][2]
+ r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ result[0] = r0;
+
+ m = SIMD::load1_ps(pMatrix + 1*4 + 0); // m[row][0]
+ r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
+ m = SIMD::load1_ps(pMatrix + 1*4 + 1); // m[row][1]
+ r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = SIMD::load1_ps(pMatrix + 1*4 + 2); // m[row][2]
+ r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ result[1] = r0;
+
+ m = SIMD::load1_ps(pMatrix + 2*4 + 0); // m[row][0]
+ r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
+ m = SIMD::load1_ps(pMatrix + 2*4 + 1); // m[row][1]
+ r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = SIMD::load1_ps(pMatrix + 2*4 + 2); // m[row][2]
+ r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ result[2] = r0;
+
+ result[3] = SIMD::setzero_ps();
+ }
+
+ // Matrix4x4 * Vector3 - Position vector where w = 1.
+ // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
+ // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
+ // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
+ // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
+ static SIMDINLINE
+ void SIMDCALL mat4x4_vec3_w1_multiply(
+ Vec4& result,
+ const float *pMatrix,
+ const Vec4& v)
+ {
+ Float m;
+ Float r0;
+ Float r1;
+
+ m = SIMD::load1_ps(pMatrix + 0*4 + 0); // m[row][0]
+ r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
+ m = SIMD::load1_ps(pMatrix + 0*4 + 1); // m[row][1]
+ r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = SIMD::load1_ps(pMatrix + 0*4 + 2); // m[row][2]
+ r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = SIMD::load1_ps(pMatrix + 0*4 + 3); // m[row][3]
+        r0 = SIMD::add_ps(r0, m);       // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+ result[0] = r0;
+
+ m = SIMD::load1_ps(pMatrix + 1*4 + 0); // m[row][0]
+ r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
+ m = SIMD::load1_ps(pMatrix + 1*4 + 1); // m[row][1]
+ r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = SIMD::load1_ps(pMatrix + 1*4 + 2); // m[row][2]
+ r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = SIMD::load1_ps(pMatrix + 1*4 + 3); // m[row][3]
+        r0 = SIMD::add_ps(r0, m);       // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+ result[1] = r0;
+
+ m = SIMD::load1_ps(pMatrix + 2*4 + 0); // m[row][0]
+ r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
+ m = SIMD::load1_ps(pMatrix + 2*4 + 1); // m[row][1]
+ r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = SIMD::load1_ps(pMatrix + 2*4 + 2); // m[row][2]
+ r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = SIMD::load1_ps(pMatrix + 2*4 + 3); // m[row][3]
+        r0 = SIMD::add_ps(r0, m);       // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+ result[2] = r0;
+
+ m = SIMD::load1_ps(pMatrix + 3*4 + 0); // m[row][0]
+ r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
+ m = SIMD::load1_ps(pMatrix + 3*4 + 1); // m[row][1]
+ r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = SIMD::load1_ps(pMatrix + 3*4 + 2); // m[row][2]
+ r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = SIMD::load1_ps(pMatrix + 3*4 + 3); // m[row][3]
+        result[3] = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+ }
+
+ static SIMDINLINE
+ void SIMDCALL mat4x3_vec3_w1_multiply(
+ Vec4& result,
+ const float *pMatrix,
+ const Vec4& v)
+ {
+ Float m;
+ Float r0;
+ Float r1;
+
+ m = SIMD::load1_ps(pMatrix + 0*4 + 0); // m[row][0]
+ r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
+ m = SIMD::load1_ps(pMatrix + 0*4 + 1); // m[row][1]
+ r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = SIMD::load1_ps(pMatrix + 0*4 + 2); // m[row][2]
+ r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = SIMD::load1_ps(pMatrix + 0*4 + 3); // m[row][3]
+        r0 = SIMD::add_ps(r0, m);       // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+ result[0] = r0;
+
+ m = SIMD::load1_ps(pMatrix + 1*4 + 0); // m[row][0]
+ r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
+ m = SIMD::load1_ps(pMatrix + 1*4 + 1); // m[row][1]
+ r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = SIMD::load1_ps(pMatrix + 1*4 + 2); // m[row][2]
+ r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = SIMD::load1_ps(pMatrix + 1*4 + 3); // m[row][3]
+        r0 = SIMD::add_ps(r0, m);       // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+ result[1] = r0;
+
+ m = SIMD::load1_ps(pMatrix + 2*4 + 0); // m[row][0]
+ r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x)
+ m = SIMD::load1_ps(pMatrix + 2*4 + 1); // m[row][1]
+ r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = SIMD::load1_ps(pMatrix + 2*4 + 2); // m[row][2]
+ r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = SIMD::load1_ps(pMatrix + 2*4 + 3); // m[row][3]
+        r0 = SIMD::add_ps(r0, m);       // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+ result[2] = r0;
+ result[3] = SIMD::set1_ps(1.0f);
+ }
+}; // struct SIMDBase
+
+using SIMD128 = SIMDBase<SIMDImpl::SIMD128Impl::Traits>;
+using SIMD256 = SIMDBase<SIMDImpl::SIMD256Impl::Traits>;
+using SIMD512 = SIMDBase<SIMDImpl::SIMD512Impl::Traits>;
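+
+// Usage sketch (illustrative): callers bind a width through these aliases and
+// invoke static members, e.g. with 256-bit vectors:
+//   SIMD256::Float   a = SIMD256::set1_ps(1.5f);
+//   SIMD256::Float   b = SIMD256::fmadd_ps(a, a, a); // 3.75f in all 8 lanes
+//   SIMD256::Integer i = SIMD256::cvttps_epi32(b);   // truncates to 3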
--- /dev/null
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD128 AVX (1) implementation
+//============================================================================
+
+#define SIMD_WRAPPER_1(op) \
+ static SIMDINLINE Float SIMDCALL op(Float a) \
+ {\
+ return _mm_##op(a);\
+ }
+
+#define SIMD_WRAPPER_2(op) \
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+ {\
+ return _mm_##op(a, b);\
+ }
+
+#define SIMD_DWRAPPER_2(op) \
+ static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
+ {\
+ return _mm_##op(a, b);\
+ }
+
+#define SIMD_WRAPPER_2I(op) \
+ template<int ImmT>\
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+ {\
+ return _mm_##op(a, b, ImmT);\
+ }
+
+#define SIMD_DWRAPPER_2I(op) \
+ template<int ImmT>\
+ static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
+ {\
+ return _mm_##op(a, b, ImmT);\
+ }
+
+#define SIMD_WRAPPER_3(op) \
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
+ {\
+ return _mm_##op(a, b, c);\
+ }
+
+#define SIMD_IWRAPPER_1(op) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a) \
+ {\
+ return _mm_##op(a);\
+ }
+
+#define SIMD_IWRAPPER_1I_(op, intrin) \
+ template<int ImmT> \
+ static SIMDINLINE Integer SIMDCALL op(Integer a) \
+ {\
+ return intrin(a, ImmT);\
+ }
+#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm_##op)
+
+#define SIMD_IWRAPPER_2_(op, intrin) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return intrin(a, b);\
+ }
+
+#define SIMD_IWRAPPER_2(op) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return _mm_##op(a, b);\
+ }
+
+#define SIMD_IFWRAPPER_2(op, intrin) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return castps_si( intrin(castsi_ps(a), castsi_ps(b)) );\
+ }
+
+#define SIMD_IWRAPPER_2I(op) \
+ template<int ImmT>\
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return _mm_##op(a, b, ImmT);\
+ }
+
+//-----------------------------------------------------------------------
+// Single precision floating point arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(add_ps); // return a + b
+SIMD_WRAPPER_2(div_ps); // return a / b
+SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps); // return a * b
+SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a
+SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
+SIMD_WRAPPER_2(sub_ps); // return a - b
+
+static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c) // return (a * b) + c
+{
+ return add_ps(mul_ps(a, b), c);
+}
+static SIMDINLINE Float SIMDCALL fmsub_ps(Float a, Float b, Float c) // return (a * b) - c
+{
+ return sub_ps(mul_ps(a, b), c);
+}
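+
+// NOTE: the emulated fmadd_ps/fmsub_ps above round twice (after the multiply
+// and again after the add/sub), so results can differ in the last ULP from
+// the fused implementations that override them in the AVX2/AVX512 files.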
+
+template <RoundMode RMT>
+static SIMDINLINE Float SIMDCALL round_ps(Float a)
+{
+ return _mm_round_ps(a, static_cast<int>(RMT));
+}
+
+static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
+static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
+
+//-----------------------------------------------------------------------
+// Integer (various width) arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
+SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
+SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
+SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
+SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
+SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
+SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
+SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
+SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
+
+// return (a * b) & 0xFFFFFFFF
+//
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+// and store the low 32 bits of the intermediate integers in dst.
+SIMD_IWRAPPER_2(mullo_epi32);
+SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
+SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
+SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
+
+//-----------------------------------------------------------------------
+// Logical operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
+SIMD_IWRAPPER_2_(and_si, _mm_and_si128); // return a & b (int)
+SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
+SIMD_IWRAPPER_2_(andnot_si, _mm_andnot_si128); // return (~a) & b (int)
+SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
+SIMD_IWRAPPER_2_(or_si, _mm_or_si128); // return a | b (int)
+SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
+SIMD_IWRAPPER_2_(xor_si, _mm_xor_si128); // return a ^ b (int)
+
+
+//-----------------------------------------------------------------------
+// Shift operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT
+
+static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b (uint32)
+{
+ int32_t a, count;
+ a = _mm_extract_epi32(vA, 0);
+ count = _mm_extract_epi32(vB, 0);
+ a <<= count;
+ vA = _mm_insert_epi32(vA, a, 0);
+
+ a = _mm_extract_epi32(vA, 1);
+ count = _mm_extract_epi32(vB, 1);
+ a <<= count;
+ vA = _mm_insert_epi32(vA, a, 1);
+
+ a = _mm_extract_epi32(vA, 2);
+ count = _mm_extract_epi32(vB, 2);
+ a <<= count;
+ vA = _mm_insert_epi32(vA, a, 2);
+
+ a = _mm_extract_epi32(vA, 3);
+ count = _mm_extract_epi32(vB, 3);
+ a <<= count;
+ vA = _mm_insert_epi32(vA, a, 3);
+
+ return vA;
+}
+
+SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32)
+SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32)
+SIMD_IWRAPPER_1I_(srli_si, _mm_srli_si128); // return a >> (ImmT*8) (uint)
+
+template<int ImmT> // same as srli_si, but with Float cast to int
+static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
+{
+ return castsi_ps(srli_si<ImmT>(castps_si(a)));
+}
+
+static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return a >> b (uint32)
+{
+ uint32_t a; // unsigned, so >>= below is a logical shift per the uint32 contract
+ int32_t count;
+ a = _mm_extract_epi32(vA, 0);
+ count = _mm_extract_epi32(vB, 0);
+ a >>= count;
+ vA = _mm_insert_epi32(vA, a, 0);
+
+ a = _mm_extract_epi32(vA, 1);
+ count = _mm_extract_epi32(vB, 1);
+ a >>= count;
+ vA = _mm_insert_epi32(vA, a, 1);
+
+ a = _mm_extract_epi32(vA, 2);
+ count = _mm_extract_epi32(vB, 2);
+ a >>= count;
+ vA = _mm_insert_epi32(vA, a, 2);
+
+ a = _mm_extract_epi32(vA, 3);
+ count = _mm_extract_epi32(vB, 3);
+ a >>= count;
+ vA = _mm_insert_epi32(vA, a, 3);
+
+ return vA;
+}
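+
+// NOTE: the sllv_epi32/srlv_epi32 emulations above assume shift counts in
+// [0, 31]. The AVX2 vpsllvd/vpsrlvd instructions they stand in for return 0
+// for larger counts, whereas a C++ shift by 32 or more is undefined behavior.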
+
+
+
+//-----------------------------------------------------------------------
+// Conversion operations
+//-----------------------------------------------------------------------
+static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)
+{
+ return _mm_castpd_ps(a);
+}
+
+static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)
+{
+ return _mm_castps_si128(a);
+}
+
+static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)
+{
+ return _mm_castsi128_pd(a);
+}
+
+static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)
+{
+ return _mm_castps_pd(a);
+}
+
+static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)
+{
+ return _mm_castsi128_ps(a);
+}
+
+static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (int32 --> float)
+{
+ return _mm_cvtepi32_ps(a);
+}
+
+SIMD_IWRAPPER_1(cvtepu8_epi16); // return (int16)a (uint8 --> int16)
+SIMD_IWRAPPER_1(cvtepu8_epi32); // return (int32)a (uint8 --> int32)
+SIMD_IWRAPPER_1(cvtepu16_epi32); // return (int32)a (uint16 --> int32)
+SIMD_IWRAPPER_1(cvtepu16_epi64); // return (int64)a (uint16 --> int64)
+SIMD_IWRAPPER_1(cvtepu32_epi64); // return (int64)a (uint32 --> int64)
+
+static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32)
+{
+ return _mm_cvtps_epi32(a);
+}
+
+static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32)
+{
+ return _mm_cvttps_epi32(a);
+}
+
+//-----------------------------------------------------------------------
+// Comparison operations
+//-----------------------------------------------------------------------
+template<CompareType CmpTypeT>
+static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
+{
+ return _mm_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
+}
+static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
+
+SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
+SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
+SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
+SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
+SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
+SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
+SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
+SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
+SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
+
+static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
+{
+ return 0 != _mm_testz_ps(a, b);
+}
+
+static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
+{
+ return 0 != _mm_testz_si128(a, b);
+}
+
+//-----------------------------------------------------------------------
+// Blend / shuffle / permute operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float)
+SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float)
+
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
+{
+ return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
+}
+
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
+{
+ return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
+}
+
+static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p) // return *p (all elements in vector get same value)
+{
+ return _mm_broadcast_ss(p);
+}
+
+SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm_packus_epi32 and _mm512_packus_epi32
+
+static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
+{
+ return castps_si(_mm_permutevar_ps(castsi_ps(a), swiz));
+}
+
+static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
+{
+ return _mm_permutevar_ps(a, swiz);
+}
+
+SIMD_IWRAPPER_1I(shuffle_epi32);
+
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b) = delete;
+
+SIMD_IWRAPPER_2(shuffle_epi8);
+SIMD_DWRAPPER_2I(shuffle_pd);
+SIMD_WRAPPER_2I(shuffle_ps);
+SIMD_IWRAPPER_2(unpackhi_epi16);
+
+//SIMD_IFWRAPPER_2(unpackhi_epi32, _mm_unpackhi_ps);
+static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
+{
+ return castps_si(_mm_unpackhi_ps(castsi_ps(a), castsi_ps(b)));
+}
+
+SIMD_IWRAPPER_2(unpackhi_epi64);
+SIMD_IWRAPPER_2(unpackhi_epi8);
+SIMD_DWRAPPER_2(unpackhi_pd);
+SIMD_WRAPPER_2(unpackhi_ps);
+SIMD_IWRAPPER_2(unpacklo_epi16);
+SIMD_IFWRAPPER_2(unpacklo_epi32, _mm_unpacklo_ps);
+SIMD_IWRAPPER_2(unpacklo_epi64);
+SIMD_IWRAPPER_2(unpacklo_epi8);
+SIMD_DWRAPPER_2(unpacklo_pd);
+SIMD_WRAPPER_2(unpacklo_ps);
+
+//-----------------------------------------------------------------------
+// Load / store operations
+//-----------------------------------------------------------------------
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+ uint32_t *pOffsets = (uint32_t*)&idx;
+ Float vResult;
+ float* pResult = (float*)&vResult;
+ for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
+ {
+ uint32_t offset = pOffsets[i];
+ offset = offset * static_cast<uint32_t>(ScaleT);
+ pResult[i] = *(float const*)(((uint8_t const*)p + offset));
+ }
+
+ return vResult;
+}
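+
+// (Gather instructions first appear in AVX2, so the loop above emulates
+// vgatherdps one lane at a time.)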
+
+static SIMDINLINE Float SIMDCALL load1_ps(float const *p) // return *p (broadcast 1 value to all elements)
+{
+ return broadcast_ss(p);
+}
+
+static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
+{
+ return _mm_load_ps(p);
+}
+
+static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
+{
+ return _mm_load_si128(&p->v);
+}
+
+static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
+{
+ return _mm_loadu_ps(p);
+}
+
+static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
+{
+ return _mm_lddqu_si128(&p->v);
+}
+
+// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+ uint32_t *pOffsets = (uint32_t*)&idx;
+ Float vResult = old;
+ float* pResult = (float*)&vResult;
+ DWORD index;
+ uint32_t umask = movemask_ps(mask);
+ while (_BitScanForward(&index, umask))
+ {
+ umask &= ~(1 << index);
+ uint32_t offset = pOffsets[index];
+ offset = offset * static_cast<uint32_t>(ScaleT);
+ pResult[index] = *(float const *)(((uint8_t const *)p + offset));
+ }
+
+ return vResult;
+}
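+
+// NOTE: DWORD and _BitScanForward above are assumed to be provided by os.h on
+// non-MSVC builds as shims over the equivalent compiler builtins.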
+
+static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
+{
+ _mm_maskstore_ps(p, mask, src);
+}
+
+static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
+{
+ return static_cast<uint32_t>(_mm_movemask_epi8(a));
+}
+
+static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
+{
+ return static_cast<uint32_t>(_mm_movemask_pd(a));
+}
+static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
+{
+ return static_cast<uint32_t>(_mm_movemask_ps(a));
+}
+
+static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
+{
+ return _mm_set1_epi32(i);
+}
+
+static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
+{
+ return _mm_set1_epi8(i);
+}
+
+static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
+{
+ return _mm_set1_ps(f);
+}
+
+static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
+{
+ return _mm_setzero_ps();
+}
+
+static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
+{
+ return _mm_setzero_si128();
+}
+
+static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
+{
+ _mm_store_ps(p, a);
+}
+
+static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
+{
+ _mm_store_si128(&p->v, a);
+}
+
+static SIMDINLINE void SIMDCALL storeu_si(Integer *p, Integer a) // *p = a (same as store_si but allows for unaligned mem)
+{
+ _mm_storeu_si128(&p->v, a);
+}
+
+static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache)
+{
+ _mm_stream_ps(p, a);
+}
+
+static SIMDINLINE Float SIMDCALL set_ps(float in3, float in2, float in1, float in0)
+{
+ return _mm_set_ps(in3, in2, in1, in0);
+}
+
+template <int ImmT>
+static SIMDINLINE float SIMDCALL extract_ps(Float a)
+{
+ int tmp = _mm_extract_ps(a, ImmT);
+ return *reinterpret_cast<float*>(&tmp);
+}
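+
+// NOTE: extract_ps reinterprets the extracted int bits as float via a pointer
+// cast; a strictly conforming alternative would be to memcpy into a float.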
+
+#undef SIMD_WRAPPER_1
+#undef SIMD_WRAPPER_2
+#undef SIMD_DWRAPPER_2
+#undef SIMD_DWRAPPER_2I
+#undef SIMD_WRAPPER_2I
+#undef SIMD_WRAPPER_3
+#undef SIMD_IWRAPPER_1
+#undef SIMD_IWRAPPER_1I
+#undef SIMD_IWRAPPER_1I_
+#undef SIMD_IWRAPPER_2
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IFWRAPPER_2
+#undef SIMD_IWRAPPER_2I
+
--- /dev/null
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX2_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD128 AVX (2) implementation
+//
+// Since this implementation inherits from the AVX (1) implementation,
+// the only operations below are those that replace their AVX (1)
+// counterparts: the two variable shifts and two gathers introduced with
+// AVX2, plus native support for the FMA operations.
+//============================================================================
+#define SIMD_WRAPPER_3(op) \
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
+ {\
+ return _mm_##op(a, b, c);\
+ }
+
+SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
+SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
+
+static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b (uint32)
+{
+ return _mm_sllv_epi32(vA, vB);
+}
+
+static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return a >> b (uint32)
+{
+ return _mm_srlv_epi32(vA, vB);
+}
+
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+ return _mm_i32gather_ps(p, idx, static_cast<const int>(ScaleT));
+}
+
+// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+ return _mm_mask_i32gather_ps(old, p, idx, mask, static_cast<const int>(ScaleT));
+}
+
+#undef SIMD_WRAPPER_3
+
--- /dev/null
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD128 AVX (512) implementation
+//
+// Since this implementation inherits from the AVX (2) implementation,
+// the only operations below are those that replace AVX (2) operations.
+// These use native AVX512 instructions with masking to enable a larger
+// register set.
+//============================================================================
+
+private:
+ static SIMDINLINE __m512 __conv(Float r) { return _mm512_castps128_ps512(r.v); }
+ static SIMDINLINE __m512d __conv(Double r) { return _mm512_castpd128_pd512(r.v); }
+ static SIMDINLINE __m512i __conv(Integer r) { return _mm512_castsi128_si512(r.v); }
+ static SIMDINLINE Float __conv(__m512 r) { return _mm512_castps512_ps128(r); }
+ static SIMDINLINE Double __conv(__m512d r) { return _mm512_castpd512_pd128(r); }
+ static SIMDINLINE Integer __conv(__m512i r) { return _mm512_castsi512_si128(r); }
+public:
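+
+// The __conv casts above are free: the _mm512_cast* intrinsics generate no
+// instructions; they merely retag the low 128 bits of a zmm register (upper
+// bits undefined) so the masked 512-bit operations below can act on 128-bit
+// data.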
+
+#define SIMD_WRAPPER_1_(op, intrin, mask) \
+ static SIMDINLINE Float SIMDCALL op(Float a) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+ }
+#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xf))
+
+#define SIMD_WRAPPER_1I_(op, intrin, mask) \
+ template<int ImmT> \
+ static SIMDINLINE Float SIMDCALL op(Float a) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+ }
+#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xf))
+
+#define SIMD_WRAPPER_2_(op, intrin, mask) \
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+ }
+#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xf))
+
+#define SIMD_WRAPPER_2I(op) \
+ template<int ImmT>\
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+ {\
+ return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
+ }
+
+#define SIMD_WRAPPER_3_(op, intrin, mask) \
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c)));\
+ }
+#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xf))
+
+#define SIMD_DWRAPPER_1_(op, intrin, mask) \
+ static SIMDINLINE Double SIMDCALL op(Double a) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+ }
+#if !defined(AVX512F_STRICT)
+#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0x3))
+#endif
+
+#define SIMD_DWRAPPER_1I_(op, intrin, mask) \
+ template<int ImmT> \
+ static SIMDINLINE Double SIMDCALL op(Double a) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+ }
+#if !defined(AVX512F_STRICT)
+#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0x3))
+#endif
+
+#define SIMD_DWRAPPER_2_(op, intrin, mask) \
+ static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+ }
+#if !defined(AVX512F_STRICT)
+#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0x3))
+#endif
+
+#define SIMD_DWRAPPER_2I(op) \
+ template<int ImmT>\
+ static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
+ {\
+ return __conv(_mm512_maskz_##op(0x3, __conv(a), __conv(b), ImmT));\
+ }
+
+#define SIMD_IWRAPPER_1_(op, intrin, mask) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+ }
+#define SIMD_IWRAPPER_1_32(op) SIMD_IWRAPPER_1_(op, op, __mmask16(0xf))
+#if !defined(AVX512F_STRICT)
+#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffull))
+#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xff))
+#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0x3))
+#endif
+
+#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
+ template<int ImmT> \
+ static SIMDINLINE Integer SIMDCALL op(Integer a) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+ }
+#define SIMD_IWRAPPER_1I_32(op) SIMD_IWRAPPER_1I_(op, op, __mmask16(0xf))
+#if !defined(AVX512F_STRICT)
+#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffull))
+#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xff))
+#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0x3))
+#endif
+
+#define SIMD_IWRAPPER_2_(op, intrin, mask) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+ }
+#define SIMD_IWRAPPER_2_32(op) SIMD_IWRAPPER_2_(op, op, __mmask16(0xf))
+#if !defined(AVX512F_STRICT)
+#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffull))
+#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xff))
+#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0x3))
+#endif
+
+#define SIMD_IWRAPPER_2I(op) \
+ template<int ImmT>\
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
+ }
+
+//-----------------------------------------------------------------------
+// Single precision floating point arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(add_ps); // return a + b
+SIMD_WRAPPER_2(div_ps); // return a / b
+SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
+SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
+SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps); // return a * b
+SIMD_WRAPPER_1_(rcp_ps, rcp28_ps, __mmask16(0xf)); // return 1.0f / a
+SIMD_WRAPPER_1_(rsqrt_ps, rsqrt28_ps, __mmask16(0xf)); // return 1.0f / sqrt(a)
+SIMD_WRAPPER_2(sub_ps); // return a - b
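+// (rcp28_ps/rsqrt28_ps above are AVX512ER instructions; on AVX512F-only parts
+// the rcp14_ps/rsqrt14_ps forms would be the available equivalents.)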
+
+//-----------------------------------------------------------------------
+// Integer (various width) arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1_32(abs_epi32); // return absolute_value(a) (int32)
+SIMD_IWRAPPER_2_32(add_epi32); // return a + b (int32)
+SIMD_IWRAPPER_2_32(max_epi32); // return (a > b) ? a : b (int32)
+SIMD_IWRAPPER_2_32(max_epu32); // return (a > b) ? a : b (uint32)
+SIMD_IWRAPPER_2_32(min_epi32); // return (a < b) ? a : b (int32)
+SIMD_IWRAPPER_2_32(min_epu32); // return (a < b) ? a : b (uint32)
+SIMD_IWRAPPER_2_32(mul_epi32); // return a * b (int32)
+
+#if !defined(AVX512F_STRICT)
+
+SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
+SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
+
+#endif
+
+// return (a * b) & 0xFFFFFFFF
+//
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+// and store the low 32 bits of the intermediate integers in dst.
+SIMD_IWRAPPER_2_32(mullo_epi32);
+SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32)
+
+#if !defined(AVX512F_STRICT)
+
+SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
+SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
+
+#endif
+
+//-----------------------------------------------------------------------
+// Logical operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_2_(and_si, and_epi32, __mmask16(0xf)); // return a & b (int)
+SIMD_IWRAPPER_2_(andnot_si, andnot_epi32, __mmask16(0xf)); // return (~a) & b (int)
+SIMD_IWRAPPER_2_(or_si, or_epi32, __mmask16(0xf)); // return a | b (int)
+SIMD_IWRAPPER_2_(xor_si, xor_epi32, __mmask16(0xf)); // return a ^ b (int)
+
+
+//-----------------------------------------------------------------------
+// Shift operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1I_32(slli_epi32); // return a << ImmT
+SIMD_IWRAPPER_2_32(sllv_epi32); // return a << b (uint32)
+SIMD_IWRAPPER_1I_32(srai_epi32); // return a >> ImmT (int32)
+SIMD_IWRAPPER_1I_32(srli_epi32); // return a >> ImmT (uint32)
+SIMD_IWRAPPER_2_32(srlv_epi32); // return a >> b (uint32)
+
+// use AVX2 version
+//SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint)
+
+//-----------------------------------------------------------------------
+// Conversion operations (Use AVX2 versions)
+//-----------------------------------------------------------------------
+// SIMD_IWRAPPER_1L(cvtepu8_epi16, 0xffff); // return (int16)a (uint8 --> int16)
+// SIMD_IWRAPPER_1L(cvtepu8_epi32, 0xff); // return (int32)a (uint8 --> int32)
+// SIMD_IWRAPPER_1L(cvtepu16_epi32, 0xff); // return (int32)a (uint16 --> int32)
+// SIMD_IWRAPPER_1L(cvtepu16_epi64, 0xf); // return (int64)a (uint16 --> int64)
+// SIMD_IWRAPPER_1L(cvtepu32_epi64, 0xf); // return (int64)a (uint32 --> int64)
+
+//-----------------------------------------------------------------------
+// Comparison operations (Use AVX2 versions)
+//-----------------------------------------------------------------------
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi8); // return a == b (int8)
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi16); // return a == b (int16)
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi32); // return a == b (int32)
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi64); // return a == b (int64)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi8); // return a > b (int8)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi16); // return a > b (int16)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi32); // return a > b (int32)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi64); // return a > b (int64)
+//
+//static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b) // return a < b (int32)
+//{
+// return cmpgt_epi32(b, a);
+//}
+
+//-----------------------------------------------------------------------
+// Blend / shuffle / permute operations
+//-----------------------------------------------------------------------
+#if !defined(AVX512F_STRICT)
+
+SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+
+#endif
+// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
+
+//static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
+//{
+// return _mm256_permutevar8x32_ps(a, swiz);
+//}
+
+SIMD_IWRAPPER_1I_32(shuffle_epi32);
+//template<int ImmT>
+//static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
+//{
+// return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
+//}
+//SIMD_IWRAPPER_2(shuffle_epi8);
+SIMD_IWRAPPER_2_32(unpackhi_epi32);
+SIMD_IWRAPPER_2_32(unpacklo_epi32);
+
+#if !defined(AVX512F_STRICT)
+
+SIMD_IWRAPPER_2_16(unpackhi_epi16);
+SIMD_IWRAPPER_2_64(unpackhi_epi64);
+SIMD_IWRAPPER_2_8(unpackhi_epi8);
+SIMD_IWRAPPER_2_16(unpacklo_epi16);
+SIMD_IWRAPPER_2_64(unpacklo_epi64);
+SIMD_IWRAPPER_2_8(unpacklo_epi8);
+
+#endif
+
+//-----------------------------------------------------------------------
+// Load / store operations
+//-----------------------------------------------------------------------
+static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
+{
+ return __conv(_mm512_maskz_load_ps(__mmask16(0xf), p));
+}
+
+static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
+{
+ return __conv(_mm512_maskz_load_epi32(__mmask16(0xf), p));
+}
+
+static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
+{
+ return __conv(_mm512_maskz_loadu_ps(__mmask16(0xf), p));
+}
+
+static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
+{
+ return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xf), p));
+}
+
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+ return __conv(_mm512_mask_i32gather_ps(
+ _mm512_setzero_ps(),
+ __mmask16(0xf),
+ __conv(idx),
+ p,
+ static_cast<int>(ScaleT)));
+}
+
+// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+ __mmask16 m = 0xf;
+ m = _mm512_mask_test_epi32_mask(m, _mm512_castps_si512(__conv(mask)),
+ _mm512_set1_epi32(0x80000000)); // 1 << 31: test each element's sign bit
+ return __conv(_mm512_mask_i32gather_ps(
+ __conv(old),
+ m,
+ __conv(idx),
+ p,
+ static_cast<int>(ScaleT)));
+}
+
+#if !defined(AVX512F_STRICT)
+
+static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
+{
+ __mmask64 m = 0xffffull;
+ return static_cast<uint32_t>(
+ _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
+}
+
+#endif
+
+static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
+{
+ __mmask16 m = 0xf;
+ m = _mm512_mask_test_epi32_mask(m, __conv(mask), _mm512_set1_epi32(0x80000000));
+ _mm512_mask_store_ps(p, m, __conv(src));
+}
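+
+// (The _mm512_mask_test_epi32_mask(..., _mm512_set1_epi32(0x80000000)) pattern
+// used here and in mask_i32gather_ps converts the sign bits of a vector mask
+// into a __mmask16, matching how the AVX blendv/maskstore forms interpret
+// per-element masks.)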
+
+static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
+{
+ _mm512_mask_store_ps(p, __mmask16(0xf), __conv(a));
+}
+
+static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
+{
+ _mm512_mask_store_epi32(p, __mmask16(0xf), __conv(a));
+}
+
+//=======================================================================
+// Legacy interface (available only in SIMD256 width)
+//=======================================================================
+
+#undef SIMD_WRAPPER_1_
+#undef SIMD_WRAPPER_1
+#undef SIMD_WRAPPER_1I_
+#undef SIMD_WRAPPER_1I
+#undef SIMD_WRAPPER_2_
+#undef SIMD_WRAPPER_2
+#undef SIMD_WRAPPER_2I
+#undef SIMD_WRAPPER_3_
+#undef SIMD_WRAPPER_3
+#undef SIMD_DWRAPPER_1_
+#undef SIMD_DWRAPPER_1
+#undef SIMD_DWRAPPER_1I_
+#undef SIMD_DWRAPPER_1I
+#undef SIMD_DWRAPPER_2_
+#undef SIMD_DWRAPPER_2
+#undef SIMD_DWRAPPER_2I
+#undef SIMD_IWRAPPER_1_
+#undef SIMD_IWRAPPER_1_8
+#undef SIMD_IWRAPPER_1_16
+#undef SIMD_IWRAPPER_1_32
+#undef SIMD_IWRAPPER_1_64
+#undef SIMD_IWRAPPER_1I_
+#undef SIMD_IWRAPPER_1I_8
+#undef SIMD_IWRAPPER_1I_16
+#undef SIMD_IWRAPPER_1I_32
+#undef SIMD_IWRAPPER_1I_64
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IWRAPPER_2_8
+#undef SIMD_IWRAPPER_2_16
+#undef SIMD_IWRAPPER_2_32
+#undef SIMD_IWRAPPER_2_64
+#undef SIMD_IWRAPPER_2I
+//#undef SIMD_IWRAPPER_2I_8
+//#undef SIMD_IWRAPPER_2I_16
+//#undef SIMD_IWRAPPER_2I_32
+//#undef SIMD_IWRAPPER_2I_64
--- /dev/null
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+using SIMD128T = SIMD128Impl::AVXImpl;
+
+//============================================================================
+// SIMD256 AVX (1) implementation
+//============================================================================
+
+#define SIMD_WRAPPER_1(op) \
+ static SIMDINLINE Float SIMDCALL op(Float a) \
+ {\
+ return _mm256_##op(a);\
+ }
+
+#define SIMD_WRAPPER_2(op) \
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+ {\
+ return _mm256_##op(a, b);\
+ }
+
+#define SIMD_DWRAPPER_2(op) \
+ static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
+ {\
+ return _mm256_##op(a, b);\
+ }
+
+#define SIMD_WRAPPER_2I(op) \
+ template<int ImmT>\
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+ {\
+ return _mm256_##op(a, b, ImmT);\
+ }
+
+#define SIMD_DWRAPPER_2I(op) \
+ template<int ImmT>\
+ static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
+ {\
+ return _mm256_##op(a, b, ImmT);\
+ }
+
+#define SIMD_WRAPPER_3(op) \
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
+ {\
+ return _mm256_##op(a, b, c);\
+ }
+
+#define SIMD_IWRAPPER_1(op) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a) \
+ {\
+ return _mm256_##op(a);\
+ }
+
+#define SIMD_IWRAPPER_2(op) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return _mm256_##op(a, b);\
+ }
+
+#define SIMD_IFWRAPPER_2(op, intrin) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return castps_si( intrin(castsi_ps(a), castsi_ps(b)) );\
+ }
+
+#define SIMD_IFWRAPPER_2I(op, intrin) \
+ template<int ImmT> \
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return castps_si( intrin(castsi_ps(a), castsi_ps(b), ImmT) );\
+ }
+
+#define SIMD_IWRAPPER_2I_(op, intrin) \
+ template<int ImmT>\
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return _mm256_##intrin(a, b, ImmT);\
+ }
+#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
+
+#define SIMD_IWRAPPER_3(op) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b, Integer c) \
+ {\
+ return _mm256_##op(a, b, c);\
+ }
+
+// emulated integer simd
+#define SIMD_EMU_IWRAPPER_1(op) \
+ static SIMDINLINE \
+ Integer SIMDCALL op(Integer a)\
+ {\
+ return Integer\
+ {\
+ SIMD128T::op(a.v4[0]),\
+ SIMD128T::op(a.v4[1]),\
+ };\
+ }
+#define SIMD_EMU_IWRAPPER_1L(op, shift) \
+ static SIMDINLINE \
+ Integer SIMDCALL op(Integer a)\
+ {\
+ return Integer \
+ {\
+ SIMD128T::op(a.v4[0]), \
+ SIMD128T::op(SIMD128T::template srli_si<shift>(a.v4[0])), \
+ };\
+ }\
+ static SIMDINLINE \
+ Integer SIMDCALL op(SIMD128Impl::Integer a)\
+ {\
+ return Integer \
+ {\
+ SIMD128T::op(a), \
+ SIMD128T::op(SIMD128T::template srli_si<shift>(a)), \
+ };\
+ }
+
+#define SIMD_EMU_IWRAPPER_1I(op) \
+ template <int ImmT> static SIMDINLINE \
+ Integer SIMDCALL op(Integer a)\
+ {\
+ return Integer\
+ {\
+ SIMD128T::template op<ImmT>(a.v4[0]),\
+ SIMD128T::template op<ImmT>(a.v4[1]),\
+ };\
+ }
+
+#define SIMD_EMU_IWRAPPER_2(op) \
+ static SIMDINLINE \
+ Integer SIMDCALL op(Integer a, Integer b)\
+ {\
+ return Integer\
+ {\
+ SIMD128T::op(a.v4[0], b.v4[0]),\
+ SIMD128T::op(a.v4[1], b.v4[1]),\
+ };\
+ }
+
+#define SIMD_EMU_IWRAPPER_2I(op) \
+ template <int ImmT> static SIMDINLINE \
+ Integer SIMDCALL op(Integer a, Integer b)\
+ {\
+ return Integer\
+ {\
+ SIMD128T::template op<ImmT>(a.v4[0], b.v4[0]),\
+ SIMD128T::template op<ImmT>(a.v4[1], b.v4[1]),\
+ };\
+ }
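+
+// The SIMD_EMU_* wrappers above work around AVX1's lack of 256-bit integer
+// instructions: each operation runs separately on the two 128-bit halves
+// (v4[0], v4[1]) through the SIMD128 implementation, and the halves are
+// recombined into the 256-bit result.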
+
+//-----------------------------------------------------------------------
+// Single precision floating point arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(add_ps); // return a + b
+SIMD_WRAPPER_2(div_ps); // return a / b
+
+static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c) // return (a * b) + c
+{
+ return add_ps(mul_ps(a, b), c);
+}
+
+static SIMDINLINE Float SIMDCALL fmsub_ps(Float a, Float b, Float c) // return (a * b) - c
+{
+    // emulated like fmadd_ps above; plain AVX lacks the FMA instructions
+    return sub_ps(mul_ps(a, b), c);
+}
+SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps); // return a * b
+SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a
+SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
+SIMD_WRAPPER_2(sub_ps); // return a - b
+
+template <RoundMode RMT>
+static SIMDINLINE Float SIMDCALL round_ps(Float a)
+{
+ return _mm256_round_ps(a, static_cast<int>(RMT));
+}
+
+static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
+static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
+
+//-----------------------------------------------------------------------
+// Integer (various width) arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_EMU_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
+SIMD_EMU_IWRAPPER_2(add_epi32); // return a + b (int32)
+SIMD_EMU_IWRAPPER_2(add_epi8); // return a + b (int8)
+SIMD_EMU_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
+SIMD_EMU_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
+SIMD_EMU_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
+SIMD_EMU_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
+SIMD_EMU_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
+SIMD_EMU_IWRAPPER_2(mul_epi32); // return a * b (int32)
+
+// return (a * b) & 0xFFFFFFFF
+//
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+// and store the low 32 bits of the intermediate integers in dst.
+SIMD_EMU_IWRAPPER_2(mullo_epi32);
+SIMD_EMU_IWRAPPER_2(sub_epi32); // return a - b (int32)
+SIMD_EMU_IWRAPPER_2(sub_epi64); // return a - b (int64)
+SIMD_EMU_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
+
+//-----------------------------------------------------------------------
+// Logical operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
+SIMD_EMU_IWRAPPER_2(and_si); // return a & b (int)
+SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
+SIMD_EMU_IWRAPPER_2(andnot_si); // return (~a) & b (int)
+SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
+SIMD_EMU_IWRAPPER_2(or_si); // return a | b (int)
+SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
+SIMD_EMU_IWRAPPER_2(xor_si); // return a ^ b (int)
+
+
+//-----------------------------------------------------------------------
+// Shift operations
+//-----------------------------------------------------------------------
+SIMD_EMU_IWRAPPER_1I(slli_epi32); // return a << ImmT
+
+static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vCount) // return a << b (uint32)
+{
+ int32_t aHi, aLow, countHi, countLow;
+ __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
+ __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
+ __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
+ __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
+
+ aHi = _mm_extract_epi32(vAHi, 0);
+ countHi = _mm_extract_epi32(vCountHi, 0);
+ aHi <<= countHi;
+ vAHi = _mm_insert_epi32(vAHi, aHi, 0);
+
+ aLow = _mm_extract_epi32(vALow, 0);
+ countLow = _mm_extract_epi32(vCountLow, 0);
+ aLow <<= countLow;
+ vALow = _mm_insert_epi32(vALow, aLow, 0);
+
+ aHi = _mm_extract_epi32(vAHi, 1);
+ countHi = _mm_extract_epi32(vCountHi, 1);
+ aHi <<= countHi;
+ vAHi = _mm_insert_epi32(vAHi, aHi, 1);
+
+ aLow = _mm_extract_epi32(vALow, 1);
+ countLow = _mm_extract_epi32(vCountLow, 1);
+ aLow <<= countLow;
+ vALow = _mm_insert_epi32(vALow, aLow, 1);
+
+ aHi = _mm_extract_epi32(vAHi, 2);
+ countHi = _mm_extract_epi32(vCountHi, 2);
+ aHi <<= countHi;
+ vAHi = _mm_insert_epi32(vAHi, aHi, 2);
+
+ aLow = _mm_extract_epi32(vALow, 2);
+ countLow = _mm_extract_epi32(vCountLow, 2);
+ aLow <<= countLow;
+ vALow = _mm_insert_epi32(vALow, aLow, 2);
+
+ aHi = _mm_extract_epi32(vAHi, 3);
+ countHi = _mm_extract_epi32(vCountHi, 3);
+ aHi <<= countHi;
+ vAHi = _mm_insert_epi32(vAHi, aHi, 3);
+
+ aLow = _mm_extract_epi32(vALow, 3);
+ countLow = _mm_extract_epi32(vCountLow, 3);
+ aLow <<= countLow;
+ vALow = _mm_insert_epi32(vALow, aLow, 3);
+
+ __m256i ret = _mm256_set1_epi32(0);
+ ret = _mm256_insertf128_si256(ret, vAHi, 1);
+ ret = _mm256_insertf128_si256(ret, vALow, 0);
+ return ret;
+}
+
+SIMD_EMU_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32)
+SIMD_EMU_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32)
+SIMD_EMU_IWRAPPER_1I(srli_si); // return a >> (ImmT*8) (uint)
+
+template<int ImmT> // same as srli_si, but with Float cast to int
+static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
+{
+ return castsi_ps(srli_si<ImmT>(castps_si(a)));
+}
+
+static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vCount) // return a >> b (uint32)
+{
+ uint32_t aHi, aLow; // unsigned, so >>= below is a logical shift per the uint32 contract
+ int32_t countHi, countLow;
+ __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
+ __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
+ __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
+ __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
+
+ aHi = _mm_extract_epi32(vAHi, 0);
+ countHi = _mm_extract_epi32(vCountHi, 0);
+ aHi >>= countHi;
+ vAHi = _mm_insert_epi32(vAHi, aHi, 0);
+
+ aLow = _mm_extract_epi32(vALow, 0);
+ countLow = _mm_extract_epi32(vCountLow, 0);
+ aLow >>= countLow;
+ vALow = _mm_insert_epi32(vALow, aLow, 0);
+
+ aHi = _mm_extract_epi32(vAHi, 1);
+ countHi = _mm_extract_epi32(vCountHi, 1);
+ aHi >>= countHi;
+ vAHi = _mm_insert_epi32(vAHi, aHi, 1);
+
+ aLow = _mm_extract_epi32(vALow, 1);
+ countLow = _mm_extract_epi32(vCountLow, 1);
+ aLow >>= countLow;
+ vALow = _mm_insert_epi32(vALow, aLow, 1);
+
+ aHi = _mm_extract_epi32(vAHi, 2);
+ countHi = _mm_extract_epi32(vCountHi, 2);
+ aHi >>= countHi;
+ vAHi = _mm_insert_epi32(vAHi, aHi, 2);
+
+ aLow = _mm_extract_epi32(vALow, 2);
+ countLow = _mm_extract_epi32(vCountLow, 2);
+ aLow >>= countLow;
+ vALow = _mm_insert_epi32(vALow, aLow, 2);
+
+ aHi = _mm_extract_epi32(vAHi, 3);
+ countHi = _mm_extract_epi32(vCountHi, 3);
+ aHi >>= countHi;
+ vAHi = _mm_insert_epi32(vAHi, aHi, 3);
+
+ aLow = _mm_extract_epi32(vALow, 3);
+ countLow = _mm_extract_epi32(vCountLow, 3);
+ aLow >>= countLow;
+ vALow = _mm_insert_epi32(vALow, aLow, 3);
+
+ __m256i ret = _mm256_set1_epi32(0);
+ ret = _mm256_insertf128_si256(ret, vAHi, 1);
+ ret = _mm256_insertf128_si256(ret, vALow, 0);
+ return ret;
+}
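+
+// NOTE: vpsllvd/vpsrlvd only exist from AVX2 onward, hence the lane-by-lane
+// extract/shift/insert emulations above; as in the SIMD128 implementation,
+// shift counts are assumed to lie in [0, 31].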
+
+
+
+//-----------------------------------------------------------------------
+// Conversion operations
+//-----------------------------------------------------------------------
+static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)
+{
+ return _mm256_castpd_ps(a);
+}
+
+static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)
+{
+ return _mm256_castps_si256(a);
+}
+
+static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)
+{
+ return _mm256_castsi256_pd(a);
+}
+
+static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)
+{
+ return _mm256_castps_pd(a);
+}
+
+static SIMDINLINE Integer SIMDCALL castpd_si(Double a) // return *(Integer*)(&a)
+{
+ return _mm256_castpd_si256(a);
+}
+
+static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)
+{
+ return _mm256_castsi256_ps(a);
+}
+
+static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (int32 --> float)
+{
+ return _mm256_cvtepi32_ps(a);
+}
+
+SIMD_EMU_IWRAPPER_1L(cvtepu8_epi16, 8); // return (int16)a (uint8 --> int16)
+SIMD_EMU_IWRAPPER_1L(cvtepu8_epi32, 4); // return (int32)a (uint8 --> int32)
+SIMD_EMU_IWRAPPER_1L(cvtepu16_epi32, 8); // return (int32)a (uint16 --> int32)
+SIMD_EMU_IWRAPPER_1L(cvtepu16_epi64, 4); // return (int64)a (uint16 --> int64)
+SIMD_EMU_IWRAPPER_1L(cvtepu32_epi64, 8); // return (int64)a (uint32 --> int64)
+
+static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32)
+{
+ return _mm256_cvtps_epi32(a);
+}
+
+static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32)
+{
+ return _mm256_cvttps_epi32(a);
+}
+
+//-----------------------------------------------------------------------
+// Comparison operations
+//-----------------------------------------------------------------------
+template<CompareType CmpTypeT>
+static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
+{
+ return _mm256_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
+}
+static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
+
+SIMD_EMU_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
+SIMD_EMU_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
+SIMD_EMU_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
+SIMD_EMU_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
+SIMD_EMU_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
+SIMD_EMU_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
+SIMD_EMU_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
+SIMD_EMU_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
+SIMD_EMU_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
+
+static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
+{
+ return 0 != _mm256_testz_ps(a, b);
+}
+
+static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
+{
+ return 0 != _mm256_testz_si256(a, b);
+}
+
+//-----------------------------------------------------------------------
+// Blend / shuffle / permute operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float)
+SIMD_IFWRAPPER_2I(blend_epi32, _mm256_blend_ps); // return ImmT ? b : a (int32)
+SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float)
+
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
+{
+ return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
+}
+
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
+{
+ return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
+}
+
+static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p) // return *p (all elements in vector get same value)
+{
+ return _mm256_broadcast_ss(p);
+}
+
+SIMD_EMU_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+SIMD_EMU_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+SIMD_EMU_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+SIMD_EMU_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+
+static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
+{
+ Integer result;
+
+ // Ugly slow implementation
+ uint32_t const *pA = reinterpret_cast<uint32_t const*>(&a);
+ uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
+ uint32_t *pResult = reinterpret_cast<uint32_t *>(&result);
+
+ for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
+ {
+ pResult[i] = pA[0x7 & pSwiz[i]]; // mask to the 8 valid lane indices
+ }
+
+ return result;
+}
+
+static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
+{
+ Float result;
+
+ // Ugly slow implementation
+ float const *pA = reinterpret_cast<float const*>(&a);
+ uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
+ float *pResult = reinterpret_cast<float *>(&result);
+
+ for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
+ {
+ pResult[i] = pA[0x7 & pSwiz[i]]; // mask to the 8 valid lane indices
+ }
+
+ return result;
+}
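+
+// (Cross-lane variable permutes -- vpermd/vpermps -- arrive with AVX2; AVX1
+// has no equivalent, hence the scalar fallbacks above.)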
+
+SIMD_WRAPPER_2I(permute2f128_ps);
+SIMD_DWRAPPER_2I(permute2f128_pd);
+SIMD_IWRAPPER_2I_(permute2f128_si, permute2f128_si256);
+
+
+SIMD_EMU_IWRAPPER_1I(shuffle_epi32);
+
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
+{
+ return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
+}
+SIMD_EMU_IWRAPPER_2(shuffle_epi8);
+SIMD_DWRAPPER_2I(shuffle_pd);
+SIMD_WRAPPER_2I(shuffle_ps);
+SIMD_EMU_IWRAPPER_2(unpackhi_epi16);
+SIMD_IFWRAPPER_2(unpackhi_epi32, _mm256_unpackhi_ps);
+SIMD_EMU_IWRAPPER_2(unpackhi_epi64);
+SIMD_EMU_IWRAPPER_2(unpackhi_epi8);
+SIMD_DWRAPPER_2(unpackhi_pd);
+SIMD_WRAPPER_2(unpackhi_ps);
+SIMD_EMU_IWRAPPER_2(unpacklo_epi16);
+SIMD_IFWRAPPER_2(unpacklo_epi32, _mm256_unpacklo_ps);
+SIMD_EMU_IWRAPPER_2(unpacklo_epi64);
+SIMD_EMU_IWRAPPER_2(unpacklo_epi8);
+SIMD_DWRAPPER_2(unpacklo_pd);
+SIMD_WRAPPER_2(unpacklo_ps);
+
+//-----------------------------------------------------------------------
+// Load / store operations
+//-----------------------------------------------------------------------
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+ uint32_t *pOffsets = (uint32_t*)&idx;
+ Float vResult;
+ float* pResult = (float*)&vResult;
+ for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
+ {
+ uint32_t offset = pOffsets[i];
+ offset = offset * static_cast<uint32_t>(ScaleT);
+ pResult[i] = *(float const*)(((uint8_t const*)p + offset));
+ }
+
+ return vResult;
+}
+
+static SIMDINLINE Float SIMDCALL load1_ps(float const *p) // return *p (broadcast 1 value to all elements)
+{
+ return broadcast_ss(p);
+}
+
+static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
+{
+ return _mm256_load_ps(p);
+}
+
+static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
+{
+ return _mm256_load_si256(&p->v);
+}
+
+static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
+{
+ return _mm256_loadu_ps(p);
+}
+
+static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
+{
+ return _mm256_lddqu_si256(&p->v);
+}
+
+// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+ uint32_t *pOffsets = (uint32_t*)&idx;
+ Float vResult = old;
+ float* pResult = (float*)&vResult;
+ DWORD index;
+ uint32_t umask = movemask_ps(mask);
+ while (_BitScanForward(&index, umask))
+ {
+ umask &= ~(1 << index);
+ uint32_t offset = pOffsets[index];
+ offset = offset * static_cast<uint32_t>(ScaleT);
+ pResult[index] = *(float const *)(((uint8_t const *)p + offset));
+ }
+
+ return vResult;
+}
+
+static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
+{
+ _mm256_maskstore_ps(p, mask, src);
+}
+
+static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
+{
+ return SIMD128T::movemask_epi8(a.v4[0]) |
+ (SIMD128T::movemask_epi8(a.v4[1]) << 16);
+}
+
+static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
+{
+ return static_cast<uint32_t>(_mm256_movemask_pd(a));
+}
+static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
+{
+ return static_cast<uint32_t>(_mm256_movemask_ps(a));
+}
+
+static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
+{
+ return _mm256_set1_epi32(i);
+}
+
+static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
+{
+ return _mm256_set1_epi8(i);
+}
+
+static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
+{
+ return _mm256_set1_ps(f);
+}
+
+static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
+{
+ return _mm256_setzero_ps();
+}
+
+static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
+{
+ return _mm256_setzero_si256();
+}
+
+static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
+{
+ _mm256_store_ps(p, a);
+}
+
+static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
+{
+ _mm256_store_si256(&p->v, a);
+}
+
+static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache)
+{
+ _mm256_stream_ps(p, a);
+}
+
+//=======================================================================
+// Legacy interface (available only in SIMD256 width)
+//=======================================================================
+
+static SIMDINLINE Float SIMDCALL broadcast_ps(SIMD128Impl::Float const *p)
+{
+ return _mm256_broadcast_ps(&p->v);
+}
+
+template<int ImmT>
+static SIMDINLINE SIMD128Impl::Double SIMDCALL extractf128_pd(Double a)
+{
+ return _mm256_extractf128_pd(a, ImmT);
+}
+
+template<int ImmT>
+static SIMDINLINE SIMD128Impl::Float SIMDCALL extractf128_ps(Float a)
+{
+ return _mm256_extractf128_ps(a, ImmT);
+}
+
+template<int ImmT>
+static SIMDINLINE SIMD128Impl::Integer SIMDCALL extractf128_si(Integer a)
+{
+ return _mm256_extractf128_si256(a, ImmT);
+}
+
+template<int ImmT>
+static SIMDINLINE Double SIMDCALL insertf128_pd(Double a, SIMD128Impl::Double b)
+{
+ return _mm256_insertf128_pd(a, b, ImmT);
+}
+
+template<int ImmT>
+static SIMDINLINE Float SIMDCALL insertf128_ps(Float a, SIMD128Impl::Float b)
+{
+ return _mm256_insertf128_ps(a, b, ImmT);
+}
+
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL insertf128_si(Integer a, SIMD128Impl::Integer b)
+{
+ return _mm256_insertf128_si256(a, b, ImmT);
+}
+
+#ifndef _mm256_set_m128i
+#define _mm256_set_m128i(/* SIMD128Impl::Integer */ hi, /* SIMD128Impl::Integer */ lo) \
+ _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
+#endif
+
+#ifndef _mm256_loadu2_m128i
+#define _mm256_loadu2_m128i(/* SIMD128Impl::Integer const* */ hiaddr, \
+ /* SIMD128Impl::Integer const* */ loaddr) \
+ _mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr))
+#endif
+
+static SIMDINLINE Integer SIMDCALL loadu2_si(SIMD128Impl::Integer const* phi, SIMD128Impl::Integer const* plo)
+{
+ return _mm256_loadu2_m128i(&phi->v, &plo->v);
+}
+
+static SIMDINLINE Integer SIMDCALL set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
+{
+ return _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0);
+}
+
+static SIMDINLINE Float SIMDCALL set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
+{
+ return _mm256_set_ps(i7, i6, i5, i4, i3, i2, i1, i0);
+}
+
+static SIMDINLINE void SIMDCALL storeu2_si(SIMD128Impl::Integer *phi, SIMD128Impl::Integer *plo, Integer src)
+{
+ _mm256_storeu2_m128i(&phi->v, &plo->v, src);
+}
+
+#undef SIMD_WRAPPER_1
+#undef SIMD_WRAPPER_2
+#undef SIMD_DWRAPPER_2
+#undef SIMD_DWRAPPER_2I
+#undef SIMD_WRAPPER_2I
+#undef SIMD_WRAPPER_3
+#undef SIMD_IWRAPPER_1
+#undef SIMD_IWRAPPER_2
+#undef SIMD_IFWRAPPER_2
+#undef SIMD_IFWRAPPER_2I
+#undef SIMD_IWRAPPER_2I
+#undef SIMD_IWRAPPER_2I_
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IWRAPPER_3
+#undef SIMD_EMU_IWRAPPER_1
+#undef SIMD_EMU_IWRAPPER_1I
+#undef SIMD_EMU_IWRAPPER_2
+#undef SIMD_EMU_IWRAPPER_2I
--- /dev/null
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX2_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD256 AVX (2) implementation
+//
+// Since this implementation inherits from the AVX (1) implementation,
+// the only operations below are those that replace AVX (1) operations.
+// These are mostly integer operations that no longer need SSE emulation.
+//============================================================================
+
+#define SIMD_IWRAPPER_1(op) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a) \
+ {\
+ return _mm256_##op(a);\
+ }
+
+#define SIMD_IWRAPPER_1L(op) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a) \
+ {\
+ return _mm256_##op(_mm256_castsi256_si128(a));\
+ }
+
+#define SIMD_IWRAPPER_1I(op) \
+ template<int ImmT> \
+ static SIMDINLINE Integer SIMDCALL op(Integer a) \
+ {\
+ return _mm256_##op(a, ImmT);\
+ }
+
+#define SIMD_IWRAPPER_1I_(op, intrin) \
+ template<int ImmT> \
+ static SIMDINLINE Integer SIMDCALL op(Integer a) \
+ {\
+ return _mm256_##intrin(a, ImmT);\
+ }
+
+#define SIMD_IWRAPPER_2_(op, intrin) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return _mm256_##intrin(a, b);\
+ }
+
+#define SIMD_IWRAPPER_2(op) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return _mm256_##op(a, b);\
+ }
+
+#define SIMD_IWRAPPER_2I(op) \
+ template<int ImmT> \
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return _mm256_##op(a, b, ImmT);\
+ }
+
+
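+// Expansion sketch (illustrative): SIMD_IWRAPPER_2(add_epi32) generates,
+// in effect,
+//   static SIMDINLINE Integer SIMDCALL add_epi32(Integer a, Integer b)
+//   {
+//       return _mm256_add_epi32(a, b);
+//   }
+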
+//-----------------------------------------------------------------------
+// Floating point arithmetic operations
+//-----------------------------------------------------------------------
+static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c) // return (a * b) + c
+{
+ return _mm256_fmadd_ps(a, b, c);
+}
+
+//-----------------------------------------------------------------------
+// Integer (various width) arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
+SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
+SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
+SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
+SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
+SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
+SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
+SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
+SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
+
+// return (a * b) & 0xFFFFFFFF
+//
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+// and store the low 32 bits of the intermediate integers in dst.
+SIMD_IWRAPPER_2(mullo_epi32);
+SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
+SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
+SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
+
+//-----------------------------------------------------------------------
+// Logical operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_2_(and_si, and_si256); // return a & b (int)
+SIMD_IWRAPPER_2_(andnot_si, andnot_si256); // return (~a) & b (int)
+SIMD_IWRAPPER_2_(or_si, or_si256); // return a | b (int)
+SIMD_IWRAPPER_2_(xor_si, xor_si256); // return a ^ b (int)
+
+
+//-----------------------------------------------------------------------
+// Shift operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT
+SIMD_IWRAPPER_2(sllv_epi32); // return a << b (uint32)
+SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32)
+SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32)
+SIMD_IWRAPPER_2(srlv_epi32); // return a >> b (uint32)
+SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint)
+
+template<int ImmT> // same as srli_si, but with Float cast to int
+static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
+{
+ return castsi_ps(srli_si<ImmT>(castps_si(a)));
+}
+
+
+//-----------------------------------------------------------------------
+// Conversion operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1L(cvtepu8_epi16); // return (int16)a (uint8 --> int16)
+SIMD_IWRAPPER_1L(cvtepu8_epi32); // return (int32)a (uint8 --> int32)
+SIMD_IWRAPPER_1L(cvtepu16_epi32); // return (int32)a (uint16 --> int32)
+SIMD_IWRAPPER_1L(cvtepu16_epi64); // return (int64)a (uint16 --> int64)
+SIMD_IWRAPPER_1L(cvtepu32_epi64); // return (int64)a (uint32 --> int64)
+
+//-----------------------------------------------------------------------
+// Comparison operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
+SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
+SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
+SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
+SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
+SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
+SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
+SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
+
+static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b) // return a < b (int32)
+{
+ return cmpgt_epi32(b, a);
+}
+
+//-----------------------------------------------------------------------
+// Blend / shuffle / permute operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a (int32)
+SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
+
+static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
+{
+ return _mm256_permutevar8x32_ps(a, swiz);
+}
+
+SIMD_IWRAPPER_1I(shuffle_epi32);
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
+{
+ return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
+}
+SIMD_IWRAPPER_2(shuffle_epi8);
+SIMD_IWRAPPER_2(unpackhi_epi16);
+SIMD_IWRAPPER_2(unpackhi_epi32);
+SIMD_IWRAPPER_2(unpackhi_epi64);
+SIMD_IWRAPPER_2(unpackhi_epi8);
+SIMD_IWRAPPER_2(unpacklo_epi16);
+SIMD_IWRAPPER_2(unpacklo_epi32);
+SIMD_IWRAPPER_2(unpacklo_epi64);
+SIMD_IWRAPPER_2(unpacklo_epi8);
+
+//-----------------------------------------------------------------------
+// Load / store operations
+//-----------------------------------------------------------------------
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+ return _mm256_i32gather_ps(p, idx, static_cast<int>(ScaleT));
+}
+
+// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+ // g++ in debug mode needs the explicit .v suffix instead of relying on operator __m256()
+ // Only for this intrinsic - not sure why. :(
+ return _mm256_mask_i32gather_ps(old.v, p, idx.v, mask.v, static_cast<int>(ScaleT));
+}
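+
+// Usage sketch (illustrative; 'defaults', 'pBase', and 'idx' are hypothetical):
+//   Float m = castsi_ps(set1_epi32(0x80000000)); // sign bit set -> gather every lane
+//   Float v = mask_i32gather_ps<ScaleFactor::SF_4>(defaults, pBase, idx, m);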
+
+static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
+{
+ return static_cast<uint32_t>(_mm256_movemask_epi8(a));
+}
+
+//=======================================================================
+// Legacy interface (available only in SIMD256 width)
+//=======================================================================
+
+#undef SIMD_IWRAPPER_1
+#undef SIMD_IWRAPPER_1L
+#undef SIMD_IWRAPPER_1I
+#undef SIMD_IWRAPPER_1I_
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IWRAPPER_2
+#undef SIMD_IWRAPPER_2I
--- /dev/null
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD256 AVX (512) implementation
+//
+// Since this implementation inherits from the AVX (2) implementation,
+// the only operations below are those that replace AVX (2) operations.
+// These use native AVX512 instructions, masked down to 256-bit operation,
+// which also makes the larger AVX512 register set available.
+//============================================================================
+
+private:
+ static SIMDINLINE __m512 __conv(Float r) { return _mm512_castps256_ps512(r.v); }
+ static SIMDINLINE __m512d __conv(Double r) { return _mm512_castpd256_pd512(r.v); }
+ static SIMDINLINE __m512i __conv(Integer r) { return _mm512_castsi256_si512(r.v); }
+ static SIMDINLINE Float __conv(__m512 r) { return _mm512_castps512_ps256(r); }
+ static SIMDINLINE Double __conv(__m512d r) { return _mm512_castpd512_pd256(r); }
+ static SIMDINLINE Integer __conv(__m512i r) { return _mm512_castsi512_si256(r); }
+public:
+
+#define SIMD_WRAPPER_1_(op, intrin, mask) \
+ static SIMDINLINE Float SIMDCALL op(Float a) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+ }
+#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xff))
+
+#define SIMD_WRAPPER_1I_(op, intrin, mask) \
+ template<int ImmT> \
+ static SIMDINLINE Float SIMDCALL op(Float a) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+ }
+#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xff))
+
+#define SIMD_WRAPPER_2_(op, intrin, mask) \
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+ }
+#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xff))
+
+#define SIMD_WRAPPER_2I(op) \
+ template<int ImmT>\
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+ {\
+ return __conv(_mm512_maskz_##op(0xff, __conv(a), __conv(b), ImmT));\
+ }
+
+#define SIMD_WRAPPER_3_(op, intrin, mask) \
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c)));\
+ }
+#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xff))
+
+#define SIMD_DWRAPPER_1_(op, intrin, mask) \
+ static SIMDINLINE Double SIMDCALL op(Double a) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+ }
+#if !defined(AVX512F_STRICT)
+#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0xf))
+#endif
+
+#define SIMD_DWRAPPER_1I_(op, intrin, mask) \
+ template<int ImmT> \
+ static SIMDINLINE Double SIMDCALL op(Double a) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+ }
+#if !defined(AVX512F_STRICT)
+#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0xf))
+#endif
+
+#define SIMD_DWRAPPER_2_(op, intrin, mask) \
+ static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+ }
+#if !defined(AVX512F_STRICT)
+#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0xf))
+#endif
+
+#define SIMD_DWRAPPER_2I(op) \
+ template<int ImmT>\
+ static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
+ {\
+ return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
+ }
+
+#define SIMD_IWRAPPER_1_(op, intrin, mask) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+ }
+#define SIMD_IWRAPPER_1_32(op) SIMD_IWRAPPER_1_(op, op, __mmask16(0xff))
+#if !defined(AVX512F_STRICT)
+#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffffffull))
+#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xffff))
+#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0xf))
+#endif
+
+#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
+ template<int ImmT> \
+ static SIMDINLINE Integer SIMDCALL op(Integer a) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+ }
+#define SIMD_IWRAPPER_1I_32(op) SIMD_IWRAPPER_1I_(op, op, __mmask16(0xff))
+#if !defined(AVX512F_STRICT)
+#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffffffull))
+#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xffff))
+#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0xf))
+#endif
+
+#define SIMD_IWRAPPER_2_(op, intrin, mask) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+ }
+#define SIMD_IWRAPPER_2_32(op) SIMD_IWRAPPER_2_(op, op, __mmask16(0xff))
+#if !defined(AVX512F_STRICT)
+#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffffffull))
+#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xffff))
+#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0xf))
+#endif
+
+#define SIMD_IWRAPPER_2I(op) \
+ template<int ImmT>\
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return __conv(_mm512_maskz_##op(0xff, __conv(a), __conv(b), ImmT));\
+ }
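+
+// Expansion sketch (illustrative): SIMD_WRAPPER_2(add_ps) generates, in effect,
+//   static SIMDINLINE Float SIMDCALL add_ps(Float a, Float b)
+//   {
+//       return __conv(_mm512_maskz_add_ps(__mmask16(0xff), __conv(a), __conv(b)));
+//   }
+// i.e. the 256-bit inputs are widened to 512 bits, the op runs on the low
+// eight lanes (mask 0xff), and the result is narrowed back to 256 bits.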
+
+//-----------------------------------------------------------------------
+// Single precision floating point arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(add_ps); // return a + b
+SIMD_WRAPPER_2(div_ps); // return a / b
+SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
+SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
+SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps); // return a * b
+//SIMD_WRAPPER_1_(rcp_ps, rcp28_ps, __mmask16(0xff)); // return 1.0f / a
+//SIMD_WRAPPER_1_(rsqrt_ps, rsqrt28_ps, __mmask16(0xff)); // return 1.0f / sqrt(a)
+SIMD_WRAPPER_2(sub_ps); // return a - b
+
+//-----------------------------------------------------------------------
+// Integer (various width) arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1_32(abs_epi32); // return absolute_value(a) (int32)
+SIMD_IWRAPPER_2_32(add_epi32); // return a + b (int32)
+SIMD_IWRAPPER_2_32(max_epi32); // return (a > b) ? a : b (int32)
+SIMD_IWRAPPER_2_32(max_epu32); // return (a > b) ? a : b (uint32)
+SIMD_IWRAPPER_2_32(min_epi32); // return (a < b) ? a : b (int32)
+SIMD_IWRAPPER_2_32(min_epu32); // return (a < b) ? a : b (uint32)
+SIMD_IWRAPPER_2_32(mul_epi32); // return a * b (int32)
+
+#if !defined(AVX512F_STRICT)
+
+SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
+SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
+
+#endif
+
+// return (a * b) & 0xFFFFFFFF
+//
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+// and store the low 32 bits of the intermediate integers in dst.
+SIMD_IWRAPPER_2_32(mullo_epi32);
+SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32)
+
+#if !defined(AVX512F_STRICT)
+
+SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
+SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
+
+#endif
+
+//-----------------------------------------------------------------------
+// Logical operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_2_(and_si, and_epi32, __mmask16(0xff)); // return a & b (int)
+SIMD_IWRAPPER_2_(andnot_si, andnot_epi32, __mmask16(0xff)); // return (~a) & b (int)
+SIMD_IWRAPPER_2_(or_si, or_epi32, __mmask16(0xff)); // return a | b (int)
+SIMD_IWRAPPER_2_(xor_si, xor_epi32, __mmask16(0xff)); // return a ^ b (int)
+
+
+//-----------------------------------------------------------------------
+// Shift operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1I_32(slli_epi32); // return a << ImmT
+SIMD_IWRAPPER_2_32(sllv_epi32); // return a << b (uint32)
+SIMD_IWRAPPER_1I_32(srai_epi32); // return a >> ImmT (int32)
+SIMD_IWRAPPER_1I_32(srli_epi32); // return a >> ImmT (uint32)
+SIMD_IWRAPPER_2_32(srlv_epi32); // return a >> b (uint32)
+
+// use AVX2 version
+//SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint)
+
+//-----------------------------------------------------------------------
+// Conversion operations (Use AVX2 versions)
+//-----------------------------------------------------------------------
+// SIMD_IWRAPPER_1L(cvtepu8_epi16, 0xffff); // return (int16)a (uint8 --> int16)
+// SIMD_IWRAPPER_1L(cvtepu8_epi32, 0xff); // return (int32)a (uint8 --> int32)
+// SIMD_IWRAPPER_1L(cvtepu16_epi32, 0xff); // return (int32)a (uint16 --> int32)
+// SIMD_IWRAPPER_1L(cvtepu16_epi64, 0xf); // return (int64)a (uint16 --> int64)
+// SIMD_IWRAPPER_1L(cvtepu32_epi64, 0xf); // return (int64)a (uint32 --> int64)
+
+//-----------------------------------------------------------------------
+// Comparison operations (use AVX2 versions)
+//-----------------------------------------------------------------------
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi8); // return a == b (int8)
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi16); // return a == b (int16)
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi32); // return a == b (int32)
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi64); // return a == b (int64)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi8,); // return a > b (int8)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi16); // return a > b (int16)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi32); // return a > b (int32)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi64); // return a > b (int64)
+//
+//static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b) // return a < b (int32)
+//{
+// return cmpgt_epi32(b, a);
+//}
+
+//-----------------------------------------------------------------------
+// Blend / shuffle / permute operations
+//-----------------------------------------------------------------------
+#if !defined(AVX512F_STRICT)
+
+SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+
+#endif
+
+// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
+
+//static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
+//{
+// return _mm256_permutevar8x32_ps(a, swiz);
+//}
+
+SIMD_IWRAPPER_1I_32(shuffle_epi32);
+//template<int ImmT>
+//static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
+//{
+// return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
+//}
+//SIMD_IWRAPPER_2(shuffle_epi8);
+SIMD_IWRAPPER_2_32(unpackhi_epi32);
+SIMD_IWRAPPER_2_32(unpacklo_epi32);
+
+#if !defined(AVX512F_STRICT)
+
+SIMD_IWRAPPER_2_16(unpackhi_epi16);
+SIMD_IWRAPPER_2_64(unpackhi_epi64);
+SIMD_IWRAPPER_2_8(unpackhi_epi8);
+SIMD_IWRAPPER_2_16(unpacklo_epi16);
+SIMD_IWRAPPER_2_64(unpacklo_epi64);
+SIMD_IWRAPPER_2_8(unpacklo_epi8);
+
+#endif
+
+//-----------------------------------------------------------------------
+// Load / store operations
+//-----------------------------------------------------------------------
+static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
+{
+ return __conv(_mm512_maskz_load_ps(__mmask16(0xff), p));
+}
+
+static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
+{
+ return __conv(_mm512_maskz_load_epi32(__mmask16(0xff), p));
+}
+
+static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
+{
+ return __conv(_mm512_maskz_loadu_ps(__mmask16(0xff), p));
+}
+
+static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
+{
+ return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xff), p));
+}
+
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+ return __conv(_mm512_mask_i32gather_ps(
+ _mm512_setzero_ps(),
+ __mmask16(0xff),
+ __conv(idx),
+ p,
+ static_cast<int>(ScaleT)));
+}
+
+// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+ __mmask16 m = 0xff;
+ m = _mm512_mask_test_epi32_mask(m, _mm512_castps_si512(__conv(mask)),
+ _mm512_set1_epi32(0x80000000)); // per-lane sign bit (1 << 31)
+ return __conv(_mm512_mask_i32gather_ps(
+ __conv(old),
+ m,
+ __conv(idx),
+ p,
+ static_cast<int>(ScaleT)));
+}
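+
+// Note: the legacy vector mask (sign bit per 32-bit lane) is converted to an
+// AVX512 k-register with mask_test_epi32_mask before the hardware gather.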
+
+#if !defined(AVX512F_STRICT)
+
+static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
+{
+ __mmask64 m = 0xffffffffull;
+ return static_cast<uint32_t>(
+ _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
+}
+
+#endif
+
+static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
+{
+ __mmask16 m = 0xff;
+ m = _mm512_mask_test_epi32_mask(m, __conv(mask), _mm512_set1_epi32(0x80000000));
+ _mm512_mask_store_ps(p, m, __conv(src));
+}
+
+static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
+{
+ _mm512_mask_store_ps(p, __mmask16(0xff), __conv(a));
+}
+
+static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
+{
+ _mm512_mask_store_epi32(p, __mmask16(0xff), __conv(a));
+}
+
+//=======================================================================
+// Legacy interface (available only in SIMD256 width)
+//=======================================================================
+
+#undef SIMD_WRAPPER_1_
+#undef SIMD_WRAPPER_1
+#undef SIMD_WRAPPER_1I_
+#undef SIMD_WRAPPER_1I
+#undef SIMD_WRAPPER_2_
+#undef SIMD_WRAPPER_2
+#undef SIMD_WRAPPER_2I
+#undef SIMD_WRAPPER_3_
+#undef SIMD_WRAPPER_3
+#undef SIMD_DWRAPPER_1_
+#undef SIMD_DWRAPPER_1
+#undef SIMD_DWRAPPER_1I_
+#undef SIMD_DWRAPPER_1I
+#undef SIMD_DWRAPPER_2_
+#undef SIMD_DWRAPPER_2
+#undef SIMD_DWRAPPER_2I
+#undef SIMD_IWRAPPER_1_
+#undef SIMD_IWRAPPER_1_8
+#undef SIMD_IWRAPPER_1_16
+#undef SIMD_IWRAPPER_1_32
+#undef SIMD_IWRAPPER_1_64
+#undef SIMD_IWRAPPER_1I_
+#undef SIMD_IWRAPPER_1I_8
+#undef SIMD_IWRAPPER_1I_16
+#undef SIMD_IWRAPPER_1I_32
+#undef SIMD_IWRAPPER_1I_64
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IWRAPPER_2_8
+#undef SIMD_IWRAPPER_2_16
+#undef SIMD_IWRAPPER_2_32
+#undef SIMD_IWRAPPER_2_64
+#undef SIMD_IWRAPPER_2I
+//#undef SIMD_IWRAPPER_2I_8
+//#undef SIMD_IWRAPPER_2I_16
+//#undef SIMD_IWRAPPER_2I_32
+//#undef SIMD_IWRAPPER_2I_64
--- /dev/null
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD16 AVX512 (F) implementation
+//
+// TODO: Optimize for KNL / KNH or for SKX?
+// For now this optimizes primarily for KNL, as that's where the
+// immediate customers are.
+//============================================================================
+
+static const int TARGET_SIMD_WIDTH = 16;
+using SIMD256T = SIMD256Impl::AVX2Impl;
+
+#define SIMD_WRAPPER_1_(op, intrin) \
+ static SIMDINLINE Float SIMDCALL op(Float a) \
+ {\
+ return intrin(a);\
+ }
+
+#define SIMD_WRAPPER_1(op) \
+ SIMD_WRAPPER_1_(op, _mm512_##op)
+
+#define SIMD_WRAPPER_2_(op, intrin) \
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+ {\
+ return _mm512_##intrin(a, b);\
+ }
+#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)
+
+#define SIMD_WRAPPERI_2_(op, intrin) \
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+ {\
+ return _mm512_castsi512_ps(_mm512_##intrin(\
+ _mm512_castps_si512(a), _mm512_castps_si512(b)));\
+ }
+
+#define SIMD_DWRAPPER_2(op) \
+ static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
+ {\
+ return _mm512_##op(a, b);\
+ }
+
+#define SIMD_WRAPPER_2I_(op, intrin) \
+ template<int ImmT>\
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+ {\
+ return _mm512_##intrin(a, b, ImmT);\
+ }
+#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op)
+
+#define SIMD_DWRAPPER_2I_(op, intrin) \
+ template<int ImmT>\
+ static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
+ {\
+ return _mm512_##intrin(a, b, ImmT);\
+ }
+#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op)
+
+#define SIMD_WRAPPER_3(op) \
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
+ {\
+ return _mm512_##op(a, b, c);\
+ }
+
+#define SIMD_IWRAPPER_1(op) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a) \
+ {\
+ return _mm512_##op(a);\
+ }
+#define SIMD_IWRAPPER_1_8(op) \
+ static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) \
+ {\
+ return _mm512_##op(a);\
+ }
+
+#define SIMD_IWRAPPER_1_4(op) \
+ static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) \
+ {\
+ return _mm512_##op(a);\
+ }
+
+#define SIMD_IWRAPPER_1I_(op, intrin) \
+ template<int ImmT> \
+ static SIMDINLINE Integer SIMDCALL op(Integer a) \
+ {\
+ return intrin(a, ImmT);\
+ }
+#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)
+
+#define SIMD_IWRAPPER_2_(op, intrin) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return _mm512_##intrin(a, b);\
+ }
+#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op)
+
+#define SIMD_IWRAPPER_2_CMP(op, cmp) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return cmp(a, b);\
+ }
+
+#define SIMD_IFWRAPPER_2(op, intrin) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b)) );\
+ }
+
+#define SIMD_IWRAPPER_2I_(op, intrin) \
+ template<int ImmT>\
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return _mm512_##intrin(a, b, ImmT);\
+ }
+#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
+
+private:
+ static SIMDINLINE Integer vmask(__mmask8 m)
+ {
+ return _mm512_maskz_set1_epi64(m, -1LL);
+ }
+ static SIMDINLINE Integer vmask(__mmask16 m)
+ {
+ return _mm512_maskz_set1_epi32(m, -1);
+ }
+ static SIMDINLINE Integer vmask(__mmask32 m)
+ {
+ return _mm512_maskz_set1_epi16(m, -1);
+ }
+ static SIMDINLINE Integer vmask(__mmask64 m)
+ {
+ return _mm512_maskz_set1_epi8(m, -1);
+ }
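+
+ // Illustrative example: vmask(__mmask16(0x0005)) yields { -1, 0, -1, 0, ... };
+ // each set bit of the k-mask becomes an all-ones 32-bit lane, bridging
+ // AVX512 k-masks back to the legacy full-width vector-mask convention.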
+
+public:
+//-----------------------------------------------------------------------
+// Single precision floating point arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(add_ps); // return a + b
+SIMD_WRAPPER_2(div_ps); // return a / b
+SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
+SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
+SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps); // return a * b
+SIMD_WRAPPER_1_(rcp_ps, _mm512_rcp28_ps); // return 1.0f / a
+SIMD_WRAPPER_1_(rsqrt_ps, _mm512_rsqrt28_ps); // return 1.0f / sqrt(a)
+SIMD_WRAPPER_2(sub_ps); // return a - b
+
+template <RoundMode RMT>
+static SIMDINLINE Float SIMDCALL round_ps(Float a)
+{
+ return _mm512_roundscale_ps(a, static_cast<int>(RMT));
+}
+
+static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
+static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
+
+//-----------------------------------------------------------------------
+// Integer (various width) arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
+SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
+SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
+SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
+SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
+SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
+SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
+SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
+SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
+
+// return (a * b) & 0xFFFFFFFF
+//
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+// and store the low 32 bits of the intermediate integers in dst.
+SIMD_IWRAPPER_2(mullo_epi32);
+SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
+SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
+SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
+
+//-----------------------------------------------------------------------
+// Logical operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_2_(and_si, and_si512); // return a & b (int)
+SIMD_IWRAPPER_2_(andnot_si, andnot_si512); // return (~a) & b (int)
+SIMD_IWRAPPER_2_(or_si, or_si512); // return a | b (int)
+SIMD_IWRAPPER_2_(xor_si, xor_si512); // return a ^ b (int)
+
+#if defined(AVX512F_STRICT)
+
+SIMD_WRAPPERI_2_(and_ps, and_epi32); // return a & b (float treated as int)
+SIMD_WRAPPERI_2_(andnot_ps, andnot_epi32); // return (~a) & b (float treated as int)
+SIMD_WRAPPERI_2_(or_ps, or_epi32); // return a | b (float treated as int)
+SIMD_WRAPPERI_2_(xor_ps, xor_epi32); // return a ^ b (float treated as int)
+
+#else
+
+SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
+SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
+SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
+SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
+
+#endif
+
+
+//-----------------------------------------------------------------------
+// Shift operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT
+SIMD_IWRAPPER_2(sllv_epi32);
+SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32)
+SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32)
+SIMD_IWRAPPER_1I_(srli_si, srli_si512); // return a >> (ImmT*8) (uint)
+
+template<int ImmT> // same as srli_si, but with Float cast to int
+static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
+{
+ return castsi_ps(srli_si<ImmT>(castps_si(a)));
+}
+
+SIMD_IWRAPPER_2(srlv_epi32);
+
+//-----------------------------------------------------------------------
+// Conversion operations
+//-----------------------------------------------------------------------
+static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)
+{
+ return _mm512_castpd_ps(a);
+}
+
+static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)
+{
+ return _mm512_castps_si512(a);
+}
+
+static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)
+{
+ return _mm512_castsi512_pd(a);
+}
+
+static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)
+{
+ return _mm512_castps_pd(a);
+}
+
+static SIMDINLINE Integer SIMDCALL castpd_si(Double a) // return *(Integer*)(&a)
+{
+ return _mm512_castpd_si512(a);
+}
+
+static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)
+{
+ return _mm512_castsi512_ps(a);
+}
+
+static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (int32 --> float)
+{
+ return _mm512_cvtepi32_ps(a);
+}
+
+SIMD_IWRAPPER_1_8(cvtepu8_epi16); // return (int16)a (uint8 --> int16)
+SIMD_IWRAPPER_1_4(cvtepu8_epi32); // return (int32)a (uint8 --> int32)
+SIMD_IWRAPPER_1_8(cvtepu16_epi32); // return (int32)a (uint16 --> int32)
+SIMD_IWRAPPER_1_4(cvtepu16_epi64); // return (int64)a (uint16 --> int64)
+SIMD_IWRAPPER_1_8(cvtepu32_epi64); // return (int64)a (uint32 --> int64)
+
+static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32)
+{
+ return _mm512_cvtps_epi32(a);
+}
+
+static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32)
+{
+ return _mm512_cvttps_epi32(a);
+}
+
+//-----------------------------------------------------------------------
+// Comparison operations
+//-----------------------------------------------------------------------
+template<CompareType CmpTypeT>
+static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b)
+{
+ return _mm512_cmp_ps_mask(a, b, static_cast<const int>(CmpTypeT));
+}
+
+template<CompareType CmpTypeT>
+static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
+{
+ // Legacy vector mask generator
+ __mmask16 result = cmp_ps_mask<CmpTypeT>(a, b);
+ return castsi_ps(vmask(result));
+}
+
+static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
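+
+// Illustrative example: cmplt_ps(set1_ps(1.0f), set1_ps(2.0f)) sets every
+// 32-bit lane to 0xFFFFFFFF (the legacy all-ones "true" encoding),
+// reconstructed from the AVX512 k-mask via vmask().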
+
+template<CompareTypeInt CmpTypeT>
+static SIMDINLINE Integer SIMDCALL cmp_epi8(Integer a, Integer b)
+{
+ // Legacy vector mask generator
+ __mmask64 result = _mm512_cmp_epi8_mask(a, b, static_cast<const int>(CmpTypeT));
+ return vmask(result);
+}
+template<CompareTypeInt CmpTypeT>
+static SIMDINLINE Integer SIMDCALL cmp_epi16(Integer a, Integer b)
+{
+ // Legacy vector mask generator
+ __mmask32 result = _mm512_cmp_epi16_mask(a, b, static_cast<const int>(CmpTypeT));
+ return vmask(result);
+}
+template<CompareTypeInt CmpTypeT>
+static SIMDINLINE Integer SIMDCALL cmp_epi32(Integer a, Integer b)
+{
+ // Legacy vector mask generator
+ __mmask16 result = _mm512_cmp_epi32_mask(a, b, static_cast<const int>(CmpTypeT));
+ return vmask(result);
+}
+template<CompareTypeInt CmpTypeT>
+static SIMDINLINE Integer SIMDCALL cmp_epi64(Integer a, Integer b)
+{
+ // Legacy vector mask generator
+ __mmask8 result = _mm512_cmp_epi64_mask(a, b, static_cast<const int>(CmpTypeT));
+ return vmask(result);
+}
+
+SIMD_IWRAPPER_2_CMP(cmpeq_epi8, cmp_epi8<CompareTypeInt::EQ>); // return a == b (int8)
+SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16<CompareTypeInt::EQ>); // return a == b (int16)
+SIMD_IWRAPPER_2_CMP(cmpeq_epi32, cmp_epi32<CompareTypeInt::EQ>); // return a == b (int32)
+SIMD_IWRAPPER_2_CMP(cmpeq_epi64, cmp_epi64<CompareTypeInt::EQ>); // return a == b (int64)
+SIMD_IWRAPPER_2_CMP(cmpgt_epi8, cmp_epi8<CompareTypeInt::GT>); // return a > b (int8)
+SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16<CompareTypeInt::GT>); // return a > b (int16)
+SIMD_IWRAPPER_2_CMP(cmpgt_epi32, cmp_epi32<CompareTypeInt::GT>); // return a > b (int32)
+SIMD_IWRAPPER_2_CMP(cmpgt_epi64, cmp_epi64<CompareTypeInt::GT>); // return a > b (int64)
+SIMD_IWRAPPER_2_CMP(cmplt_epi32, cmp_epi32<CompareTypeInt::LT>); // return a < b (int32)
+
+static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
+{
+ return (0 == static_cast<int>(_mm512_test_epi32_mask(castps_si(a), castps_si(b))));
+}
+
+static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
+{
+ return (0 == static_cast<int>(_mm512_test_epi32_mask(a, b)));
+}
+
+//-----------------------------------------------------------------------
+// Blend / shuffle / permute operations
+//-----------------------------------------------------------------------
+template <int ImmT>
+static SIMDINLINE Float SIMDCALL blend_ps(Float a, Float b) // return ImmT ? b : a (float)
+{
+ return _mm512_mask_blend_ps(__mmask16(ImmT), a, b);
+}
+
+template <int ImmT>
+static SIMDINLINE Integer SIMDCALL blend_epi32(Integer a, Integer b) // return ImmT ? b : a (int32)
+{
+ return _mm512_mask_blend_epi32(__mmask16(ImmT), a, b);
+}
+
+static SIMDINLINE Float SIMDCALL blendv_ps(Float a, Float b, Float mask) // return mask ? b : a (float)
+{
+ return _mm512_mask_blend_ps(__mmask16(movemask_ps(mask)), a, b);
+}
+
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
+{
+ return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
+}
+
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
+{
+ return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
+}
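+
+// Illustrative example: blendv_ps(a, b, mask) selects b in each lane whose
+// sign bit is set in 'mask' and a elsewhere, matching the legacy AVX
+// blendv semantics.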
+
+static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p) // return *p (all elements in vector get same value)
+{
+ return _mm512_set1_ps(*p);
+}
+
+template<int imm>
+static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a)
+{
+ return _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(a), imm));
+}
+
+template<int imm>
+static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a)
+{
+ return _mm512_extractf64x4_pd(a, imm);
+}
+
+template<int imm>
+static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a)
+{
+ return _mm512_extracti64x4_epi64(a, imm);
+}
+
+template<int imm>
+static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b)
+{
+ return _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(a), _mm256_castps_pd(b), imm));
+}
+
+template<int imm>
+static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b)
+{
+ return _mm512_insertf64x4(a, b, imm);
+}
+
+template<int imm>
+static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)
+{
+ return _mm512_inserti64x4(a, b, imm);
+}
+
+SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm512_packs_epi16
+SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm512_packs_epi32
+SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm512_packus_epi16
+SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm512_packus_epi32
+
+static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
+{
+ return _mm512_permutexvar_epi32(swiz, a);
+}
+
+static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
+{
+ return _mm512_permutexvar_ps(swiz, a);
+}
+
+SIMD_WRAPPER_2I_(permute2f128_ps, shuffle_f32x4);
+SIMD_DWRAPPER_2I_(permute2f128_pd, shuffle_f64x2);
+SIMD_IWRAPPER_2I_(permute2f128_si, shuffle_i32x4);
+
+SIMD_IWRAPPER_1I(shuffle_epi32);
+
+SIMD_IWRAPPER_2(shuffle_epi8);
+SIMD_DWRAPPER_2I(shuffle_pd);
+SIMD_WRAPPER_2I(shuffle_ps);
+
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
+{
+ return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
+}
+
+SIMD_IWRAPPER_2(unpackhi_epi16);
+
+//SIMD_IFWRAPPER_2(unpackhi_epi32, _mm512_unpackhi_ps);
+static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
+{
+ return castps_si(_mm512_unpackhi_ps(castsi_ps(a), castsi_ps(b)));
+}
+
+SIMD_IWRAPPER_2(unpackhi_epi64);
+SIMD_IWRAPPER_2(unpackhi_epi8);
+SIMD_DWRAPPER_2(unpackhi_pd);
+SIMD_WRAPPER_2(unpackhi_ps);
+SIMD_IWRAPPER_2(unpacklo_epi16);
+SIMD_IFWRAPPER_2(unpacklo_epi32, unpacklo_ps);
+SIMD_IWRAPPER_2(unpacklo_epi64);
+SIMD_IWRAPPER_2(unpacklo_epi8);
+SIMD_DWRAPPER_2(unpacklo_pd);
+SIMD_WRAPPER_2(unpacklo_ps);
+
+//-----------------------------------------------------------------------
+// Load / store operations
+//-----------------------------------------------------------------------
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+ uint32_t *pOffsets = (uint32_t*)&idx;
+ Float vResult;
+ float* pResult = (float*)&vResult;
+ for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
+ {
+ uint32_t offset = pOffsets[i];
+ offset = offset * static_cast<uint32_t>(ScaleT);
+ pResult[i] = *(float const*)(((uint8_t const*)p + offset));
+ }
+
+ return vResult;
+}
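+
+// Alternative sketch (not the author's code): AVX512F has a native gather
+// that could replace the scalar loop above,
+//   return _mm512_i32gather_ps(idx, p, static_cast<int>(ScaleT));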
+
+static SIMDINLINE Float SIMDCALL load1_ps(float const *p) // return *p (broadcast 1 value to all elements)
+{
+ return broadcast_ss(p);
+}
+
+static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
+{
+ return _mm512_load_ps(p);
+}
+
+static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
+{
+ return _mm512_load_si512(&p->v);
+}
+
+static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
+{
+ return _mm512_loadu_ps(p);
+}
+
+static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
+{
+ return _mm512_loadu_si512(p);
+}
+
+// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+ __mmask16 k = _mm512_cmpneq_ps_mask(mask, setzero_ps());
+
+ return _mm512_mask_i32gather_ps(old, k, idx, p, static_cast<int>(ScaleT));
+}
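+
+// Note: cmpneq treats -0.0f as equal to zero, so a lane whose mask bits are
+// exactly 0x80000000 would not gather; this assumes callers pass all-ones or
+// all-zeros mask lanes, as the legacy comparison ops produce.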
+
+static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
+{
+ Mask m = _mm512_cmplt_epi32_mask(mask, setzero_si());
+ _mm512_mask_store_ps(p, m, src);
+}
+
+static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
+{
+ __mmask64 m = _mm512_cmplt_epi8_mask(a, setzero_si());
+ return static_cast<uint64_t>(m);
+}
+
+static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
+{
+ // Test the sign bit with a signed-integer compare; a floating-point
+ // compare would treat all-ones (NaN) mask lanes as false.
+ __mmask8 m = _mm512_cmplt_epi64_mask(castpd_si(a), setzero_si());
+ return static_cast<uint32_t>(m);
+}
+static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
+{
+ __mmask16 m = _mm512_cmplt_epi32_mask(castps_si(a), setzero_si());
+ return static_cast<uint32_t>(m);
+}
+
+static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
+{
+ return _mm512_set1_epi32(i);
+}
+
+static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
+{
+ return _mm512_set1_epi8(i);
+}
+
+static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
+{
+ return _mm512_set1_ps(f);
+}
+
+static SIMDINLINE Double SIMDCALL setzero_pd() // return 0 (double)
+{
+ return _mm512_setzero_pd();
+}
+
+static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
+{
+ return _mm512_setzero_ps();
+}
+
+static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
+{
+ return _mm512_setzero_si512();
+}
+
+static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
+{
+ _mm512_store_ps(p, a);
+}
+
+static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
+{
+ _mm512_store_si512(&p->v, a);
+}
+
+static SIMDINLINE void SIMDCALL storeu_si(Integer *p, Integer a) // *p = a (same as store_si but allows for unaligned mem)
+{
+ _mm512_storeu_si512(&p->v, a);
+}
+
+static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache)
+{
+ _mm512_stream_ps(p, a);
+}
+
+static SIMDINLINE Integer SIMDCALL set_epi32(
+ int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
+ int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
+{
+ return _mm512_set_epi32(
+ i15, i14, i13, i12, i11, i10, i9, i8,
+ i7, i6, i5, i4, i3, i2, i1, i0);
+}
+
+static SIMDINLINE Integer SIMDCALL set_epi32(
+ int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
+{
+ return set_epi32(
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ i7, i6, i5, i4, i3, i2, i1, i0);
+}
+
+static SIMDINLINE Float SIMDCALL set_ps(
+ float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8,
+ float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
+{
+ return _mm512_set_ps(
+ i15, i14, i13, i12, i11, i10, i9, i8,
+ i7, i6, i5, i4, i3, i2, i1, i0);
+}
+
+static SIMDINLINE Float SIMDCALL set_ps(
+ float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
+{
+ return set_ps(
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ i7, i6, i5, i4, i3, i2, i1, i0);
+}
+
+static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
+{
+ return castsi_ps(_mm512_maskz_mov_epi32(__mmask16(mask), set1_epi32(-1)));
+}
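+
+// Illustrative example: vmask_ps(0x3) sets lanes 0 and 1 to all-ones and the
+// remaining lanes to zero.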
+
+#undef SIMD_WRAPPER_1_
+#undef SIMD_WRAPPER_1
+#undef SIMD_WRAPPER_2_
+#undef SIMD_WRAPPER_2
+#undef SIMD_WRAPPERI_2_
+#undef SIMD_DWRAPPER_2
+#undef SIMD_DWRAPPER_2I_
+#undef SIMD_DWRAPPER_2I
+#undef SIMD_WRAPPER_2I_
+#undef SIMD_WRAPPER_2I
+#undef SIMD_WRAPPER_3
+#undef SIMD_IWRAPPER_1
+#undef SIMD_IWRAPPER_1_4
+#undef SIMD_IWRAPPER_1_8
+#undef SIMD_IWRAPPER_1I_
+#undef SIMD_IWRAPPER_1I
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IWRAPPER_2
+#undef SIMD_IWRAPPER_2_CMP
+#undef SIMD_IFWRAPPER_2
+#undef SIMD_IWRAPPER_2I_
+#undef SIMD_IWRAPPER_2I
+
--- /dev/null
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+// Implement mask-enabled SIMD functions
--- /dev/null
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD16 AVX (1) implementation
+//
+// Emulates the 16-wide interface by double-pumping the 8-wide AVX (1)
+// implementation across the two 256-bit halves (v8[0] / v8[1]).
+//============================================================================
+
+static const int TARGET_SIMD_WIDTH = 8;
+using SIMD128T = SIMD128Impl::AVXImpl;
+
+#define SIMD_WRAPPER_1(op) \
+ static SIMDINLINE Float SIMDCALL op(Float a) \
+ {\
+ return Float\
+ {\
+ SIMD256T::op(a.v8[0]),\
+ SIMD256T::op(a.v8[1]),\
+ };\
+ }
+
+#define SIMD_WRAPPER_2(op) \
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+ {\
+ return Float\
+ {\
+ SIMD256T::op(a.v8[0], b.v8[0]),\
+ SIMD256T::op(a.v8[1], b.v8[1]),\
+ };\
+ }
+
+#define SIMD_WRAPPER_2I(op) \
+ template<int ImmT>\
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+ {\
+ return Float\
+ {\
+ SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),\
+ SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]),\
+ };\
+ }
+
+#define SIMD_WRAPPER_2I_1(op) \
+ template<int ImmT>\
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+ {\
+ return Float\
+ {\
+ SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),\
+ SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),\
+ };\
+ }
+
+#define SIMD_WRAPPER_3(op) \
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
+ {\
+ return Float\
+ {\
+ SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),\
+ SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),\
+ };\
+ }
+
+#define SIMD_IWRAPPER_1(op) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a) \
+ {\
+ return Integer\
+ {\
+ SIMD256T::op(a.v8[0]),\
+ SIMD256T::op(a.v8[1]),\
+ };\
+ }
+
+#define SIMD_IWRAPPER_2(op) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return Integer\
+ {\
+ SIMD256T::op(a.v8[0], b.v8[0]),\
+ SIMD256T::op(a.v8[1], b.v8[1]),\
+ };\
+ }
+
+#define SIMD_IWRAPPER_2I(op) \
+ template<int ImmT>\
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return Integer\
+ {\
+ SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),\
+ SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]),\
+ };\
+ }
+
+#define SIMD_IWRAPPER_2I_1(op) \
+ template<int ImmT>\
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return Integer\
+ {\
+ SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),\
+ SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),\
+ };\
+ }
+
+#define SIMD_IWRAPPER_2I_2(op) \
+ template<int ImmT>\
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return Integer\
+ {\
+ SIMD256T::template op<0xF & ImmT>(a.v8[0], b.v8[0]),\
+ SIMD256T::template op<0xF & (ImmT >> 4)>(a.v8[1], b.v8[1]),\
+ };\
+ }
+
+#define SIMD_IWRAPPER_3(op) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b, Integer c) \
+ {\
+ return Integer\
+ {\
+ SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),\
+ SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),\
+ };\
+ }
+
+//-----------------------------------------------------------------------
+// Single precision floating point arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(add_ps); // return a + b
+SIMD_WRAPPER_2(div_ps); // return a / b
+SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
+SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
+SIMD_WRAPPER_2(max_ps); // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps); // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps); // return a * b
+SIMD_WRAPPER_1(rcp_ps); // return 1.0f / a
+SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
+SIMD_WRAPPER_2(sub_ps); // return a - b
+
+template <RoundMode RMT>
+static SIMDINLINE Float SIMDCALL round_ps(Float a)
+{
+ return Float
+ {
+ SIMD256T::template round_ps<RMT>(a.v8[0]),
+ SIMD256T::template round_ps<RMT>(a.v8[1]),
+ };
+}
+
+static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
+static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
+
+//-----------------------------------------------------------------------
+// Integer (various width) arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
+SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
+SIMD_IWRAPPER_2(add_epi8); // return a + b (int8)
+SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
+SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
+SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
+SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
+SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
+SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
+
+// return (a * b) & 0xFFFFFFFF
+//
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+// and store the low 32 bits of the intermediate integers in dst.
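+//
+// e.g. mullo_epi32(set1_epi32(0x10000), set1_epi32(0x10000)) yields 0 in
+// every lane, since the 64-bit product 2^32 has all-zero low 32 bits.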
+SIMD_IWRAPPER_2(mullo_epi32);
+SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
+SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
+SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
+
+//-----------------------------------------------------------------------
+// Logical operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
+SIMD_IWRAPPER_2(and_si); // return a & b (int)
+SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
+SIMD_IWRAPPER_2(andnot_si); // return (~a) & b (int)
+SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
+SIMD_IWRAPPER_2(or_si); // return a | b (int)
+SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
+SIMD_IWRAPPER_2(xor_si); // return a ^ b (int)
+
+
+//-----------------------------------------------------------------------
+// Shift operations
+//-----------------------------------------------------------------------
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL slli_epi32(Integer a) // return a << ImmT
+{
+ return Integer
+ {
+ SIMD256T::template slli_epi32<ImmT>(a.v8[0]),
+ SIMD256T::template slli_epi32<ImmT>(a.v8[1]),
+ };
+}
+
+SIMD_IWRAPPER_2(sllv_epi32); // return a << b (uint32)
+
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL srai_epi32(Integer a) // return a >> ImmT (int32)
+{
+ return Integer
+ {
+ SIMD256T::template srai_epi32<ImmT>(a.v8[0]),
+ SIMD256T::template srai_epi32<ImmT>(a.v8[1]),
+ };
+}
+
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL srli_epi32(Integer a) // return a >> ImmT (uint32)
+{
+ return Integer
+ {
+ SIMD256T::template srli_epi32<ImmT>(a.v8[0]),
+ SIMD256T::template srli_epi32<ImmT>(a.v8[1]),
+ };
+}
+
+template<int ImmT> // for each 128-bit lane:
+static SIMDINLINE Integer SIMDCALL srli_si(Integer a) // return a >> (ImmT*8) (uint)
+{
+ return Integer
+ {
+ SIMD256T::template srli_si<ImmT>(a.v8[0]),
+ SIMD256T::template srli_si<ImmT>(a.v8[1]),
+ };
+}
+template<int ImmT>
+static SIMDINLINE Float SIMDCALL srlisi_ps(Float a) // same as srli_si, but with Float cast to int
+{
+ return Float
+ {
+ SIMD256T::template srlisi_ps<ImmT>(a.v8[0]),
+ SIMD256T::template srlisi_ps<ImmT>(a.v8[1]),
+ };
+}
+
+SIMD_IWRAPPER_2(srlv_epi32); // return a >> b (uint32)
+
+//-----------------------------------------------------------------------
+// Conversion operations
+//-----------------------------------------------------------------------
+static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)
+{
+ return Float
+ {
+ SIMD256T::castpd_ps(a.v8[0]),
+ SIMD256T::castpd_ps(a.v8[1]),
+ };
+}
+
+static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)
+{
+ return Integer
+ {
+ SIMD256T::castps_si(a.v8[0]),
+ SIMD256T::castps_si(a.v8[1]),
+ };
+}
+
+static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)
+{
+ return Double
+ {
+ SIMD256T::castsi_pd(a.v8[0]),
+ SIMD256T::castsi_pd(a.v8[1]),
+ };
+}
+
+static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)
+{
+ return Double
+ {
+ SIMD256T::castps_pd(a.v8[0]),
+ SIMD256T::castps_pd(a.v8[1]),
+ };
+}
+
+static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)
+{
+ return Float
+ {
+ SIMD256T::castsi_ps(a.v8[0]),
+ SIMD256T::castsi_ps(a.v8[1]),
+ };
+}
+
+static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (int32 --> float)
+{
+ return Float
+ {
+ SIMD256T::cvtepi32_ps(a.v8[0]),
+ SIMD256T::cvtepi32_ps(a.v8[1]),
+ };
+}
+
+static SIMDINLINE Integer SIMDCALL cvtepu8_epi16(SIMD256Impl::Integer a) // return (int16)a (uint8 --> int16)
+{
+ return Integer
+ {
+ SIMD256T::cvtepu8_epi16(a.v4[0]),
+ SIMD256T::cvtepu8_epi16(a.v4[1]),
+ };
+}
+
+static SIMDINLINE Integer SIMDCALL cvtepu8_epi32(SIMD256Impl::Integer a) // return (int32)a (uint8 --> int32)
+{
+ return Integer
+ {
+ SIMD256T::cvtepu8_epi32(a.v4[0]),
+ SIMD256T::cvtepu8_epi32(SIMD128T::template srli_si<8>(a.v4[0])),
+ };
+}
+
+static SIMDINLINE Integer SIMDCALL cvtepu16_epi32(SIMD256Impl::Integer a) // return (int32)a (uint16 --> int32)
+{
+ return Integer
+ {
+ SIMD256T::cvtepu16_epi32(a.v4[0]),
+ SIMD256T::cvtepu16_epi32(a.v4[1]),
+ };
+}
+
+static SIMDINLINE Integer SIMDCALL cvtepu16_epi64(SIMD256Impl::Integer a) // return (int64)a (uint16 --> int64)
+{
+ return Integer
+ {
+ SIMD256T::cvtepu16_epi64(a.v4[0]),
+ SIMD256T::cvtepu16_epi64(SIMD128T::template srli_si<8>(a.v4[0])),
+ };
+}
+
+static SIMDINLINE Integer SIMDCALL cvtepu32_epi64(SIMD256Impl::Integer a) // return (int64)a (uint32 --> int64)
+{
+ return Integer
+ {
+ SIMD256T::cvtepu32_epi64(a.v4[0]),
+ SIMD256T::cvtepu32_epi64(a.v4[1]),
+ };
+}
+
+static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32)
+{
+ return Integer
+ {
+ SIMD256T::cvtps_epi32(a.v8[0]),
+ SIMD256T::cvtps_epi32(a.v8[1]),
+ };
+}
+
+static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32)
+{
+    return Integer
+    {
+        SIMD256T::cvttps_epi32(a.v8[0]),
+        SIMD256T::cvttps_epi32(a.v8[1]),
+    };
+}
+
+//-----------------------------------------------------------------------
+// Comparison operations
+//-----------------------------------------------------------------------
+template<CompareType CmpTypeT>
+static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
+{
+ return Float
+ {
+ SIMD256T::template cmp_ps<CmpTypeT>(a.v8[0], b.v8[0]),
+ SIMD256T::template cmp_ps<CmpTypeT>(a.v8[1], b.v8[1]),
+ };
+}
+static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
+
+template<CompareType CmpTypeT>
+static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b)
+{
+ return static_cast<Mask>(movemask_ps(cmp_ps<CmpTypeT>(a, b)));
+}
+
+
+SIMD_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
+SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
+SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
+SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
+SIMD_IWRAPPER_2(cmpgt_epi8); // return a > b (int8)
+SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
+SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
+SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
+SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
+
+static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
+{
+ return 0 != (SIMD256T::testz_ps(a.v8[0], b.v8[0]) &
+ SIMD256T::testz_ps(a.v8[1], b.v8[1]));
+}
+
+static SIMDINLINE int SIMDCALL testz_si(Integer a, Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
+{
+ return 0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) &
+ SIMD256T::testz_si(a.v8[1], b.v8[1]));
+}
+
+//-----------------------------------------------------------------------
+// Blend / shuffle / permute operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float)
+SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a (int32)
+SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float)
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
+{
+ return Integer
+ {
+ SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
+ SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
+ };
+}
+
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
+{
+ return Integer
+ {
+ SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
+ SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
+ };
+}
+
+static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p) // return *p (all elements in vector get same value)
+{
+ float f = *p;
+ return Float
+ {
+ SIMD256T::set1_ps(f),
+ SIMD256T::set1_ps(f),
+ };
+}
+
+template<int imm>
+static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a)
+{
+ SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
+ return a.v8[imm];
+}
+
+template<int imm>
+static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a)
+{
+ SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
+ return a.v8[imm];
+}
+
+template<int imm>
+static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a)
+{
+ SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
+ return a.v8[imm];
+}
+
+template<int imm>
+static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b)
+{
+ SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
+ a.v8[imm] = b;
+ return a;
+}
+
+template<int imm>
+static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b)
+{
+ SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
+ a.v8[imm] = b;
+ return a;
+}
+
+template<int imm>
+static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)
+{
+ SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
+ a.v8[imm] = b;
+ return a;
+}
+
+SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+
+static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
+{
+ Integer result;
+
+ // Ugly slow implementation
+ uint32_t const *pA = reinterpret_cast<uint32_t const*>(&a);
+ uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
+ uint32_t *pResult = reinterpret_cast<uint32_t *>(&result);
+
+ for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
+ {
+ pResult[i] = pA[0xF & pSwiz[i]];
+ }
+
+ return result;
+}
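+
+// e.g. permute_epi32(a, setzero_si()) broadcasts 32-bit lane 0 of a into
+// all 16 lanes, since every swiz index then selects pA[0].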
+
+static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
+{
+ Float result;
+
+ // Ugly slow implementation
+ float const *pA = reinterpret_cast<float const*>(&a);
+ uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
+ float *pResult = reinterpret_cast<float *>(&result);
+
+ for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
+ {
+ pResult[i] = pA[0xF & pSwiz[i]];
+ }
+
+ return result;
+}
+
+// All of the 512-bit permute2f128_XX intrinsics do the following:
+//
+// SELECT4(src, control) {
+// CASE(control[1:0])
+// 0: tmp[127:0] : = src[127:0]
+// 1 : tmp[127:0] : = src[255:128]
+// 2 : tmp[127:0] : = src[383:256]
+// 3 : tmp[127:0] : = src[511:384]
+// ESAC
+// RETURN tmp[127:0]
+// }
+//
+// dst[127:0] : = SELECT4(a[511:0], imm8[1:0])
+// dst[255:128] : = SELECT4(a[511:0], imm8[3:2])
+// dst[383:256] : = SELECT4(b[511:0], imm8[5:4])
+// dst[511:384] : = SELECT4(b[511:0], imm8[7:6])
+// dst[MAX:512] : = 0
+//
+// Since the 256-bit AVX instructions use a 4-bit control field (instead
+// of 2-bit for AVX512), we need to expand the control bits sent to the
+// AVX instructions for emulation.
+//
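+// For example (illustrative): with shuf = 0x1B the result is, in 128-bit
+// lanes, { a[3], a[2], b[1], b[0] }. The low-half expansion
+// ((0x1B & 0x03) << 0) | ((0x1B & 0x0C) << 2) == 0x23 tells the AVX
+// instruction to place a's lane 3 in bits [127:0] and a's lane 2 in bits
+// [255:128] of the first 256-bit half.
+//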
+template <int shuf>
+static SIMDINLINE Float SIMDCALL permute2f128_ps(Float a, Float b)
+{
+ return Float
+ {
+ SIMD256T::template permute2f128_ps<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
+ SIMD256T::template permute2f128_ps<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
+ };
+}
+
+template <int shuf>
+static SIMDINLINE Double SIMDCALL permute2f128_pd(Double a, Double b)
+{
+ return Double
+ {
+ SIMD256T::template permute2f128_pd<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
+ SIMD256T::template permute2f128_pd<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
+ };
+}
+
+template <int shuf>
+static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer a, Integer b)
+{
+ return Integer
+ {
+ SIMD256T::template permute2f128_si<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
+ SIMD256T::template permute2f128_si<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
+ };
+}
+
+SIMD_IWRAPPER_2I_1(shuffle_epi32);
+SIMD_IWRAPPER_2I_2(shuffle_epi64);
+SIMD_IWRAPPER_2(shuffle_epi8);
+SIMD_WRAPPER_2I_1(shuffle_pd);
+SIMD_WRAPPER_2I_1(shuffle_ps);
+SIMD_IWRAPPER_2(unpackhi_epi16);
+SIMD_IWRAPPER_2(unpackhi_epi32);
+SIMD_IWRAPPER_2(unpackhi_epi64);
+SIMD_IWRAPPER_2(unpackhi_epi8);
+SIMD_WRAPPER_2(unpackhi_pd);
+SIMD_WRAPPER_2(unpackhi_ps);
+SIMD_IWRAPPER_2(unpacklo_epi16);
+SIMD_IWRAPPER_2(unpacklo_epi32);
+SIMD_IWRAPPER_2(unpacklo_epi64);
+SIMD_IWRAPPER_2(unpacklo_epi8);
+SIMD_WRAPPER_2(unpacklo_pd);
+SIMD_WRAPPER_2(unpacklo_ps);
+
+//-----------------------------------------------------------------------
+// Load / store operations
+//-----------------------------------------------------------------------
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+ return Float
+ {
+ SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[0]),
+ SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[1]),
+ };
+}
+
+static SIMDINLINE Float SIMDCALL load1_ps(float const *p) // return *p (broadcast 1 value to all elements)
+{
+ return broadcast_ss(p);
+}
+
+static SIMDINLINE Float SIMDCALL load_ps(float const *p) // return *p (loads SIMD width elements from memory)
+{
+ return Float
+ {
+ SIMD256T::load_ps(p),
+ SIMD256T::load_ps(p + TARGET_SIMD_WIDTH)
+ };
+}
+
+static SIMDINLINE Integer SIMDCALL load_si(Integer const *p) // return *p
+{
+ return Integer
+ {
+ SIMD256T::load_si(&p->v8[0]),
+ SIMD256T::load_si(&p->v8[1]),
+ };
+}
+
+static SIMDINLINE Float SIMDCALL loadu_ps(float const *p) // return *p (same as load_ps but allows for unaligned mem)
+{
+ return Float
+ {
+ SIMD256T::loadu_ps(p),
+ SIMD256T::loadu_ps(p + TARGET_SIMD_WIDTH)
+ };
+}
+
+static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p (same as load_si but allows for unaligned mem)
+{
+ return Integer
+ {
+ SIMD256T::loadu_si(&p->v8[0]),
+ SIMD256T::loadu_si(&p->v8[1]),
+ };
+}
+
+// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+ return Float
+ {
+ SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
+ SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
+ };
+}
+
+static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
+{
+ SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]);
+ SIMD256T::maskstore_ps(p + TARGET_SIMD_WIDTH, mask.v8[1], src.v8[1]);
+}
+
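+// Concatenate the byte masks of the two 256-bit halves; each half
+// contributes TARGET_SIMD_WIDTH * 4 = 32 bits of the 64-bit result.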
+static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
+{
+ uint64_t mask = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[0]));
+ mask |= static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1])) << (TARGET_SIMD_WIDTH * 4);
+
+ return mask;
+}
+
+static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
+{
+ uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[0]));
+ mask |= static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1])) << (TARGET_SIMD_WIDTH / 2);
+
+ return mask;
+}
+static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
+{
+ uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[0]));
+ mask |= static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1])) << TARGET_SIMD_WIDTH;
+
+ return mask;
+}
+
+static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
+{
+ return Integer
+ {
+ SIMD256T::set1_epi32(i),
+ SIMD256T::set1_epi32(i)
+ };
+}
+
+static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
+{
+ return Integer
+ {
+ SIMD256T::set1_epi8(i),
+ SIMD256T::set1_epi8(i)
+ };
+}
+
+static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
+{
+ return Float
+ {
+ SIMD256T::set1_ps(f),
+ SIMD256T::set1_ps(f)
+ };
+}
+
+static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
+{
+ return Float
+ {
+ SIMD256T::setzero_ps(),
+ SIMD256T::setzero_ps()
+ };
+}
+
+static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
+{
+ return Integer
+ {
+ SIMD256T::setzero_si(),
+ SIMD256T::setzero_si()
+ };
+}
+
+static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
+{
+ SIMD256T::store_ps(p, a.v8[0]);
+ SIMD256T::store_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
+}
+
+static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
+{
+ SIMD256T::store_si(&p->v8[0], a.v8[0]);
+ SIMD256T::store_si(&p->v8[1], a.v8[1]);
+}
+
+static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache)
+{
+ SIMD256T::stream_ps(p, a.v8[0]);
+ SIMD256T::stream_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
+}
+
+static SIMDINLINE Integer SIMDCALL set_epi32(
+ int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
+ int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
+{
+ return Integer
+ {
+ SIMD256T::set_epi32(
+ i7, i6, i5, i4, i3, i2, i1, i0),
+ SIMD256T::set_epi32(
+ i15, i14, i13, i12, i11, i10, i9, i8)
+ };
+}
+
+static SIMDINLINE Integer SIMDCALL set_epi32(
+ int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
+{
+ return set_epi32(
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ i7, i6, i5, i4, i3, i2, i1, i0);
+}
+
+static SIMDINLINE Float SIMDCALL set_ps(
+ float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8,
+ float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
+{
+ return Float
+ {
+ SIMD256T::set_ps(
+ i7, i6, i5, i4, i3, i2, i1, i0),
+ SIMD256T::set_ps(
+ i15, i14, i13, i12, i11, i10, i9, i8)
+ };
+}
+
+static SIMDINLINE Float SIMDCALL set_ps(
+ float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
+{
+ return set_ps(
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ i7, i6, i5, i4, i3, i2, i1, i0);
+}
+
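+// Expand a scalar bit mask into a vector lane mask: 32-bit lane i of the
+// result is all ones if bit i of mask is set, and all zeros otherwise.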
+static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
+{
+ Integer vec = set1_epi32(mask);
+ const Integer bit = set_epi32(
+ 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100,
+ 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
+ vec = and_si(vec, bit);
+ vec = cmplt_epi32(setzero_si(), vec);
+ return castsi_ps(vec);
+}
+
+#undef SIMD_WRAPPER_1
+#undef SIMD_WRAPPER_2
+#undef SIMD_WRAPPER_2I
+#undef SIMD_WRAPPER_2I_1
+#undef SIMD_WRAPPER_3
+#undef SIMD_IWRAPPER_1
+#undef SIMD_IWRAPPER_2
+#undef SIMD_IWRAPPER_2I
+#undef SIMD_IWRAPPER_2I_1
+#undef SIMD_IWRAPPER_2I_2
+#undef SIMD_IWRAPPER_3
--- /dev/null
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+// no backwards compatibility for simd mask-enabled functions
+
--- /dev/null
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#pragma once
+#if 0
+//===========================================================================
+// Placeholder name representing either SIMD4, SIMD256, or SIMD16 structures.
+//===========================================================================
+struct SIMD256 // or SIMD4 or SIMD16
+{
+ //=======================================================================
+ // SIMD Types
+ //
+    // These typedefs are examples. The SIMD4, SIMD256, and SIMD16
+    // implementations use different base types with this same naming.
+ using Float = __m256; // Packed single-precision float vector
+ using Double = __m256d; // Packed double-precision float vector
+ using Integer = __m256i; // Packed integer vector (mutable element widths)
+ using Mask = uint8_t; // Integer representing mask bits
+
+ //=======================================================================
+ // Standard interface
+ // (available in both SIMD256 and SIMD16 widths)
+ //=======================================================================
+
+ //-----------------------------------------------------------------------
+ // Single precision floating point arithmetic operations
+ //-----------------------------------------------------------------------
+ static Float add_ps(Float a, Float b); // return a + b
+ static Float div_ps(Float a, Float b); // return a / b
+ static Float fmadd_ps(Float a, Float b, Float c); // return (a * b) + c
+ static Float fmsub_ps(Float a, Float b, Float c); // return (a * b) - c
+ static Float max_ps(Float a, Float b); // return (a > b) ? a : b
+ static Float min_ps(Float a, Float b); // return (a < b) ? a : b
+ static Float mul_ps(Float a, Float b); // return a * b
+ static Float rcp_ps(Float a); // return 1.0f / a
+ static Float rsqrt_ps(Float a); // return 1.0f / sqrt(a)
+ static Float sub_ps(Float a, Float b); // return a - b
+
+ enum class RoundMode
+ {
+ TO_NEAREST_INT = 0x00, // Round to nearest integer == TRUNCATE(value + (signof(value))0.5)
+ TO_NEG_INF = 0x01, // Round to negative infinity
+ TO_POS_INF = 0x02, // Round to positive infinity
+ TO_ZERO = 0x03, // Round to 0 a.k.a. truncate
+ CUR_DIRECTION = 0x04, // Round in direction set in MXCSR register
+
+ RAISE_EXC = 0x00, // Raise exception on overflow
+ NO_EXC = 0x08, // Suppress exceptions
+
+ NINT = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(RAISE_EXC),
+ NINT_NOEXC = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(NO_EXC),
+ FLOOR = static_cast<int>(TO_NEG_INF) | static_cast<int>(RAISE_EXC),
+ FLOOR_NOEXC = static_cast<int>(TO_NEG_INF) | static_cast<int>(NO_EXC),
+ CEIL = static_cast<int>(TO_POS_INF) | static_cast<int>(RAISE_EXC),
+ CEIL_NOEXC = static_cast<int>(TO_POS_INF) | static_cast<int>(NO_EXC),
+ TRUNC = static_cast<int>(TO_ZERO) | static_cast<int>(RAISE_EXC),
+ TRUNC_NOEXC = static_cast<int>(TO_ZERO) | static_cast<int>(NO_EXC),
+ RINT = static_cast<int>(CUR_DIRECTION) | static_cast<int>(RAISE_EXC),
+ NEARBYINT = static_cast<int>(CUR_DIRECTION) | static_cast<int>(NO_EXC),
+ };
+
+ // return round_func(a)
+ //
+ // round_func is chosen on the RMT template parameter. See the documentation
+ // for the RoundMode enumeration above.
+ template <RoundMode RMT>
+ static Float round_ps(Float a); // return round(a)
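+    //
+    // e.g. round_ps<RoundMode::FLOOR_NOEXC>(a) rounds each element toward
+    // negative infinity without raising floating-point exceptions.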
+
+
+ //-----------------------------------------------------------------------
+ // Integer (various width) arithmetic operations
+ //-----------------------------------------------------------------------
+ static Integer abs_epi32(Integer a); // return absolute_value(a) (int32)
+ static Integer add_epi32(Integer a, Integer b); // return a + b (int32)
+ static Integer add_epi8(Integer a, Integer b); // return a + b (int8)
+ static Integer adds_epu8(Integer a, Integer b); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
+ static Integer max_epi32(Integer a, Integer b); // return (a > b) ? a : b (int32)
+ static Integer max_epu32(Integer a, Integer b); // return (a > b) ? a : b (uint32)
+ static Integer min_epi32(Integer a, Integer b); // return (a < b) ? a : b (int32)
+ static Integer min_epu32(Integer a, Integer b); // return (a < b) ? a : b (uint32)
+ static Integer mul_epi32(Integer a, Integer b); // return a * b (int32)
+
+ // return (a * b) & 0xFFFFFFFF
+ //
+ // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+ // and store the low 32 bits of the intermediate integers in dst.
+    static Integer mullo_epi32(Integer a, Integer b);
+
+ static Integer sub_epi32(Integer a, Integer b); // return a - b (int32)
+ static Integer sub_epi64(Integer a, Integer b); // return a - b (int64)
+ static Integer subs_epu8(Integer a, Integer b); // return (b > a) ? 0 : (a - b) (uint8)
+
+ //-----------------------------------------------------------------------
+ // Logical operations
+ //-----------------------------------------------------------------------
+ static Float and_ps(Float a, Float b); // return a & b (float treated as int)
+ static Integer and_si(Integer a, Integer b); // return a & b (int)
+ static Float andnot_ps(Float a, Float b); // return (~a) & b (float treated as int)
+ static Integer andnot_si(Integer a, Integer b); // return (~a) & b (int)
+ static Float or_ps(Float a, Float b); // return a | b (float treated as int)
+    static Integer or_si(Integer a, Integer b);            // return a | b       (int)
+ static Float xor_ps(Float a, Float b); // return a ^ b (float treated as int)
+ static Integer xor_si(Integer a, Integer b); // return a ^ b (int)
+
+ //-----------------------------------------------------------------------
+ // Shift operations
+ //-----------------------------------------------------------------------
+ template<int ImmT>
+ static Integer slli_epi32(Integer a); // return a << ImmT
+ static Integer sllv_epi32(Integer a, Integer b); // return a << b
+ template<int ImmT>
+ static Integer srai_epi32(Integer a); // return a >> ImmT (int32)
+ template<int ImmT>
+ static Integer srli_epi32(Integer a); // return a >> ImmT (uint32)
+ template<int ImmT> // for each 128-bit lane:
+ static Integer srli_si(Integer a); // return a >> (ImmT*8) (uint)
+ template<int ImmT>
+ static Float srlisi_ps(Float a); // same as srli_si, but with Float cast to int
+ static Integer srlv_epi32(Integer a, Integer b); // return a >> b (uint32)
+
+ //-----------------------------------------------------------------------
+ // Conversion operations
+ //-----------------------------------------------------------------------
+ static Float castpd_ps(Double a); // return *(Float*)(&a)
+ static Integer castps_si(Float a); // return *(Integer*)(&a)
+ static Double castsi_pd(Integer a); // return *(Double*)(&a)
+ static Double castps_pd(Float a); // return *(Double*)(&a)
+ static Float castsi_ps(Integer a); // return *(Float*)(&a)
+ static Float cvtepi32_ps(Integer a); // return (float)a (int32 --> float)
+ static Integer cvtepu8_epi16(Integer a); // return (int16)a (uint8 --> int16)
+ static Integer cvtepu8_epi32(Integer a); // return (int32)a (uint8 --> int32)
+ static Integer cvtepu16_epi32(Integer a); // return (int32)a (uint16 --> int32)
+ static Integer cvtepu16_epi64(Integer a); // return (int64)a (uint16 --> int64)
+ static Integer cvtepu32_epi64(Integer a); // return (int64)a (uint32 --> int64)
+ static Integer cvtps_epi32(Float a); // return (int32)a (float --> int32)
+ static Integer cvttps_epi32(Float a); // return (int32)a (rnd_to_zero(float) --> int32)
+
+ //-----------------------------------------------------------------------
+ // Comparison operations
+ //-----------------------------------------------------------------------
+
+ // Comparison types used with cmp_ps:
+ // - ordered comparisons are always false if either operand is NaN
+ // - unordered comparisons are always true if either operand is NaN
+ // - signaling comparisons raise an exception if either operand is NaN
+ // - non-signaling comparisons will never raise an exception
+ //
+ // Ordered: return (a != NaN) && (b != NaN) && (a cmp b)
+ // Unordered: return (a == NaN) || (b == NaN) || (a cmp b)
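+    //
+    // e.g. cmp_ps<CompareType::LT_OQ>(a, b) yields all-ones lanes where
+    // a < b and neither operand is NaN, and all-zeros lanes otherwise.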
+ enum class CompareType
+ {
+ EQ_OQ = 0x00, // Equal (ordered, nonsignaling)
+ LT_OS = 0x01, // Less-than (ordered, signaling)
+ LE_OS = 0x02, // Less-than-or-equal (ordered, signaling)
+ UNORD_Q = 0x03, // Unordered (nonsignaling)
+ NEQ_UQ = 0x04, // Not-equal (unordered, nonsignaling)
+ NLT_US = 0x05, // Not-less-than (unordered, signaling)
+ NLE_US = 0x06, // Not-less-than-or-equal (unordered, signaling)
+ ORD_Q = 0x07, // Ordered (nonsignaling)
+ EQ_UQ = 0x08, // Equal (unordered, non-signaling)
+ NGE_US = 0x09, // Not-greater-than-or-equal (unordered, signaling)
+ NGT_US = 0x0A, // Not-greater-than (unordered, signaling)
+ FALSE_OQ = 0x0B, // False (ordered, nonsignaling)
+ NEQ_OQ = 0x0C, // Not-equal (ordered, non-signaling)
+ GE_OS = 0x0D, // Greater-than-or-equal (ordered, signaling)
+ GT_OS = 0x0E, // Greater-than (ordered, signaling)
+ TRUE_UQ = 0x0F, // True (unordered, non-signaling)
+ EQ_OS = 0x10, // Equal (ordered, signaling)
+ LT_OQ = 0x11, // Less-than (ordered, nonsignaling)
+ LE_OQ = 0x12, // Less-than-or-equal (ordered, nonsignaling)
+ UNORD_S = 0x13, // Unordered (signaling)
+ NEQ_US = 0x14, // Not-equal (unordered, signaling)
+ NLT_UQ = 0x15, // Not-less-than (unordered, nonsignaling)
+ NLE_UQ = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
+ ORD_S = 0x17, // Ordered (signaling)
+ EQ_US = 0x18, // Equal (unordered, signaling)
+ NGE_UQ = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
+ NGT_UQ = 0x1A, // Not-greater-than (unordered, nonsignaling)
+ FALSE_OS = 0x1B, // False (ordered, signaling)
+ NEQ_OS = 0x1C, // Not-equal (ordered, signaling)
+ GE_OQ = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
+ GT_OQ = 0x1E, // Greater-than (ordered, nonsignaling)
+ TRUE_US = 0x1F, // True (unordered, signaling)
+ };
+
+ // return a (CmpTypeT) b (float)
+ //
+ // See documentation for CompareType above for valid values for CmpTypeT.
+ template<CompareType CmpTypeT>
+    static Float cmp_ps(Float a, Float b);                 // return a (CmpTypeT) b (see above)
+ static Float cmpgt_ps(Float a, Float b); // return cmp_ps<CompareType::GT_OQ>(a, b)
+ static Float cmple_ps(Float a, Float b); // return cmp_ps<CompareType::LE_OQ>(a, b)
+ static Float cmplt_ps(Float a, Float b); // return cmp_ps<CompareType::LT_OQ>(a, b)
+ static Float cmpneq_ps(Float a, Float b); // return cmp_ps<CompareType::NEQ_OQ>(a, b)
+ static Float cmpeq_ps(Float a, Float b); // return cmp_ps<CompareType::EQ_OQ>(a, b)
+ static Float cmpge_ps(Float a, Float b); // return cmp_ps<CompareType::GE_OQ>(a, b)
+ static Integer cmpeq_epi8(Integer a, Integer b); // return a == b (int8)
+ static Integer cmpeq_epi16(Integer a, Integer b); // return a == b (int16)
+ static Integer cmpeq_epi32(Integer a, Integer b); // return a == b (int32)
+ static Integer cmpeq_epi64(Integer a, Integer b); // return a == b (int64)
+ static Integer cmpgt_epi8(Integer a, Integer b); // return a > b (int8)
+ static Integer cmpgt_epi16(Integer a, Integer b); // return a > b (int16)
+ static Integer cmpgt_epi32(Integer a, Integer b); // return a > b (int32)
+ static Integer cmpgt_epi64(Integer a, Integer b); // return a > b (int64)
+ static Integer cmplt_epi32(Integer a, Integer b); // return a < b (int32)
+ static bool testz_ps(Float a, Float b); // return all_lanes_zero(a & b) ? 1 : 0 (float)
+ static bool testz_si(Integer a, Integer b); // return all_lanes_zero(a & b) ? 1 : 0 (int)
+
+ //-----------------------------------------------------------------------
+ // Blend / shuffle / permute operations
+ //-----------------------------------------------------------------------
+ template<int ImmT>
+ static Float blend_ps(Float a, Float b); // return ImmT ? b : a (float)
+ static Integer blendv_epi32(Integer a, Integer b, Float mask); // return mask ? b : a (int)
+ static Float blendv_ps(Float a, Float b, Float mask); // return mask ? b : a (float)
+ static Float broadcast_ss(float const *p); // return *p (all elements in vector get same value)
+ static Integer packs_epi16(Integer a, Integer b); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+ static Integer packs_epi32(Integer a, Integer b); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+ static Integer packus_epi16(Integer a, Integer b); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+ static Integer packus_epi32(Integer a, Integer b); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+    static Integer permute_epi32(Integer a, Integer swiz); // return a[swiz[i]] for each 32-bit lane i (int32)
+ static Float permute_ps(Float a, Integer swiz); // return a[swiz[i]] for each 32-bit lane i (float)
+ template<int SwizT>
+ static Integer shuffle_epi32(Integer a, Integer b);
+ template<int SwizT>
+ static Integer shuffle_epi64(Integer a, Integer b);
+ static Integer shuffle_epi8(Integer a, Integer b);
+ template<int SwizT>
+ static Float shuffle_pd(Double a, Double b);
+ template<int SwizT>
+ static Float shuffle_ps(Float a, Float b);
+ static Integer unpackhi_epi16(Integer a, Integer b);
+ static Integer unpackhi_epi32(Integer a, Integer b);
+ static Integer unpackhi_epi64(Integer a, Integer b);
+ static Integer unpackhi_epi8(Integer a, Integer b);
+ static Float unpackhi_pd(Double a, Double b);
+ static Float unpackhi_ps(Float a, Float b);
+ static Integer unpacklo_epi16(Integer a, Integer b);
+ static Integer unpacklo_epi32(Integer a, Integer b);
+ static Integer unpacklo_epi64(Integer a, Integer b);
+ static Integer unpacklo_epi8(Integer a, Integer b);
+ static Float unpacklo_pd(Double a, Double b);
+ static Float unpacklo_ps(Float a, Float b);
+
+ //-----------------------------------------------------------------------
+ // Load / store operations
+ //-----------------------------------------------------------------------
+ enum class ScaleFactor
+ {
+ SF_1, // No scaling
+ SF_2, // Scale offset by 2
+ SF_4, // Scale offset by 4
+ SF_8, // Scale offset by 8
+ };
+
+ template<ScaleFactor ScaleT>
+ static Float i32gather_ps(float const* p, Integer idx); // return *(float*)(((int8*)p) + (idx * ScaleT))
+ static Float load1_ps(float const *p); // return *p (broadcast 1 value to all elements)
+ static Float load_ps(float const *p); // return *p (loads SIMD width elements from memory)
+ static Integer load_si(Integer const *p); // return *p
+ static Float loadu_ps(float const *p); // return *p (same as load_ps but allows for unaligned mem)
+ static Integer loadu_si(Integer const *p); // return *p (same as load_si but allows for unaligned mem)
+
+ // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+    template<ScaleFactor ScaleT>
+ static Float mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask);
+
+ static void maskstore_ps(float *p, Integer mask, Float src);
+ static int movemask_epi8(Integer a);
+ static int movemask_pd(Double a);
+ static int movemask_ps(Float a);
+ static Integer set1_epi32(int i); // return i (all elements are same value)
+ static Integer set1_epi8(char i); // return i (all elements are same value)
+ static Float set1_ps(float f); // return f (all elements are same value)
+ static Float setzero_ps(); // return 0 (float)
+ static Integer setzero_si(); // return 0 (integer)
+ static void store_ps(float *p, Float a); // *p = a (stores all elements contiguously in memory)
+ static void store_si(Integer *p, Integer a); // *p = a
+ static void stream_ps(float *p, Float a); // *p = a (same as store_ps, but doesn't keep memory in cache)
+
+ //=======================================================================
+ // Legacy interface (available only in SIMD256 width)
+ //=======================================================================
+
+ static Float broadcast_ps(__m128 const *p);
+ template<int ImmT>
+ static __m128d extractf128_pd(Double a);
+ template<int ImmT>
+ static __m128 extractf128_ps(Float a);
+ template<int ImmT>
+ static __m128i extractf128_si(Integer a);
+ template<int ImmT>
+ static Double insertf128_pd(Double a, __m128d b);
+ template<int ImmT>
+ static Float insertf128_ps(Float a, __m128 b);
+ template<int ImmT>
+ static Integer insertf128_si(Integer a, __m128i b);
+    static Integer loadu2_si(__m128i const* phi, __m128i const* plo);
+ template<int ImmT>
+ static Double permute2f128_pd(Double a, Double b);
+ template<int ImmT>
+ static Float permute2f128_ps(Float a, Float b);
+ template<int ImmT>
+ static Integer permute2f128_si(Integer a, Integer b);
+ static Integer set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0);
+ static void storeu2_si(__m128i *phi, __m128i *plo, Integer src);
+
+ //=======================================================================
+ // Advanced masking interface (currently available only in SIMD16 width)
+ //=======================================================================
+
+
+ //=======================================================================
+ // Extended Utility Functions (common to SIMD256 and SIMD16)
+ //=======================================================================
+
+ //-----------------------------------------------------------------------
+ // Extended Types
+ //-----------------------------------------------------------------------
+
+ // Vec4, an SOA SIMD set of 4-dimensional vectors
+ union Vec4
+ {
+ Vec4() = default;
+ Vec4(Float in)
+ {
+ s.x = in;
+ s.y = in;
+ s.z = in;
+ s.w = in;
+ }
+ Vec4(Float x, Float y, Float z, Float w)
+ {
+ s.x = x;
+ s.y = y;
+ s.z = z;
+ s.w = w;
+ }
+
+ Float v[4];
+ Integer vi[4];
+ struct
+ {
+ Float x;
+ Float y;
+ Float z;
+ Float w;
+ } s;
+ Float& operator[] (const int i) { return v[i]; }
+ Float const & operator[] (const int i) const { return v[i]; }
+ };
+
+ //-----------------------------------------------------------------------
+ // Extended Functions
+ //-----------------------------------------------------------------------
+ static void vec4_set1_ps(Vec4& r, const float *p); // r[0] = set1(p[0]), r[1] = set1(p[1]), ...
+ static void vec4_set1_vps(Vec4& r, Float s); // r[0] = s, r[1] = s, ...
+ static Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1); // return dp3(v0, v1)
+ static Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1); // return dp4(v0, v1)
+ static Float vec4_rcp_length_ps(const Vec4& v); // return 1.0f / sqrt(dp4(v, v))
+ static void vec4_normalize_ps(Vec4& r, const Vec4& v); // r = v * rcp_length(v)
+ static void vec4_mul_ps(Vec4& r, const Vec4& v, Float s); // r = v * set1_vps(s)
+ static void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1); // r = v0 * v1
+ static void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1); // r = v0 + v1
+ static void vec4_min_ps(Vec4& r, const Vec4& v0, Float s); // r = (v0 < s) ? v0 : s
+ static void vec4_max_ps(Vec4& r, const Vec4& v0, Float s); // r = (v0 > s) ? v0 : s
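+    //
+    // e.g. vec4_normalize_ps(r, v) is equivalent to
+    // vec4_mul_ps(r, v, vec4_rcp_length_ps(v)).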
+
+ // Matrix4x4 * Vector4
+ // result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * v.s.w)
+ // result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * v.s.w)
+ // result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * v.s.w)
+ // result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * v.s.w)
+ static void mat4x4_vec4_multiply(
+ Vec4& result,
+ const float *pMatrix,
+ const Vec4& v);
+
+ // Matrix4x4 * Vector3 - Direction Vector where w = 0.
+ // result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 0)
+ // result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 0)
+ // result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 0)
+ // result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * 0)
+ static void mat3x3_vec3_w0_multiply(
+ Vec4& result,
+ const float *pMatrix,
+ const Vec4& v);
+
+ // Matrix4x4 * Vector3 - Position vector where w = 1.
+ // result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 1)
+ // result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 1)
+ // result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 1)
+ // result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * 1)
+ static void mat4x4_vec3_w1_multiply(
+ Vec4& result,
+ const float *pMatrix,
+ const Vec4& v);
+
+ // Matrix4x3 * Vector3 - Position vector where w = 1.
+ // result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 1)
+ // result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 1)
+ // result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 1)
+ // result.s.w = 1
+ static void mat4x3_vec3_w1_multiply(
+ Vec4& result,
+ const float *pMatrix,
+ const Vec4& v);
+};
+#endif // #if 0
--- /dev/null
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#pragma once
+
+#if !defined(__cplusplus)
+#error C++ compilation required
+#endif
+
+#include <immintrin.h>
+#include <inttypes.h>
+#include <stdint.h>
+
+#define SIMD_ARCH_AVX 0
+#define SIMD_ARCH_AVX2 1
+#define SIMD_ARCH_AVX512 2
+
+#if !defined(SIMD_ARCH)
+#define SIMD_ARCH SIMD_ARCH_AVX
+#endif
+
+#if defined(_MSC_VER)
+#define SIMDCALL __vectorcall
+#define SIMDINLINE __forceinline
+#define SIMDALIGN(type_, align_) __declspec(align(align_)) type_
+#else
+#define SIMDCALL
+#define SIMDINLINE inline
+#define SIMDALIGN(type_, align_) type_ __attribute__((aligned(align_)))
+#endif
+
+// For documentation, please see the following include...
+// #include "simdlib_interface.hpp"
+
+namespace SIMDImpl
+{
+ enum class CompareType
+ {
+ EQ_OQ = 0x00, // Equal (ordered, nonsignaling)
+ LT_OS = 0x01, // Less-than (ordered, signaling)
+ LE_OS = 0x02, // Less-than-or-equal (ordered, signaling)
+ UNORD_Q = 0x03, // Unordered (nonsignaling)
+ NEQ_UQ = 0x04, // Not-equal (unordered, nonsignaling)
+ NLT_US = 0x05, // Not-less-than (unordered, signaling)
+ NLE_US = 0x06, // Not-less-than-or-equal (unordered, signaling)
+ ORD_Q = 0x07, // Ordered (nonsignaling)
+ EQ_UQ = 0x08, // Equal (unordered, non-signaling)
+ NGE_US = 0x09, // Not-greater-than-or-equal (unordered, signaling)
+ NGT_US = 0x0A, // Not-greater-than (unordered, signaling)
+ FALSE_OQ = 0x0B, // False (ordered, nonsignaling)
+ NEQ_OQ = 0x0C, // Not-equal (ordered, non-signaling)
+ GE_OS = 0x0D, // Greater-than-or-equal (ordered, signaling)
+ GT_OS = 0x0E, // Greater-than (ordered, signaling)
+ TRUE_UQ = 0x0F, // True (unordered, non-signaling)
+ EQ_OS = 0x10, // Equal (ordered, signaling)
+ LT_OQ = 0x11, // Less-than (ordered, nonsignaling)
+ LE_OQ = 0x12, // Less-than-or-equal (ordered, nonsignaling)
+ UNORD_S = 0x13, // Unordered (signaling)
+ NEQ_US = 0x14, // Not-equal (unordered, signaling)
+ NLT_UQ = 0x15, // Not-less-than (unordered, nonsignaling)
+ NLE_UQ = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
+ ORD_S = 0x17, // Ordered (signaling)
+ EQ_US = 0x18, // Equal (unordered, signaling)
+ NGE_UQ = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
+ NGT_UQ = 0x1A, // Not-greater-than (unordered, nonsignaling)
+ FALSE_OS = 0x1B, // False (ordered, signaling)
+ NEQ_OS = 0x1C, // Not-equal (ordered, signaling)
+ GE_OQ = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
+ GT_OQ = 0x1E, // Greater-than (ordered, nonsignaling)
+ TRUE_US = 0x1F, // True (unordered, signaling)
+ };
+
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+ enum class CompareTypeInt
+ {
+ EQ = _MM_CMPINT_EQ, // Equal
+ LT = _MM_CMPINT_LT, // Less than
+ LE = _MM_CMPINT_LE, // Less than or Equal
+ NE = _MM_CMPINT_NE, // Not Equal
+ GE = _MM_CMPINT_GE, // Greater than or Equal
+ GT = _MM_CMPINT_GT, // Greater than
+ };
+#endif // SIMD_ARCH >= SIMD_ARCH_AVX512
+
+ enum class ScaleFactor
+ {
+ SF_1 = 1, // No scaling
+ SF_2 = 2, // Scale offset by 2
+ SF_4 = 4, // Scale offset by 4
+ SF_8 = 8, // Scale offset by 8
+ };
+
+ enum class RoundMode
+ {
+        TO_NEAREST_INT  = 0x00, // Round to nearest integer == TRUNCATE(value + (signof(value))0.5)
+ TO_NEG_INF = 0x01, // Round to negative infinity
+ TO_POS_INF = 0x02, // Round to positive infinity
+ TO_ZERO = 0x03, // Round to 0 a.k.a. truncate
+ CUR_DIRECTION = 0x04, // Round in direction set in MXCSR register
+
+ RAISE_EXC = 0x00, // Raise exception on overflow
+ NO_EXC = 0x08, // Suppress exceptions
+
+ NINT = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(RAISE_EXC),
+ NINT_NOEXC = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(NO_EXC),
+ FLOOR = static_cast<int>(TO_NEG_INF) | static_cast<int>(RAISE_EXC),
+ FLOOR_NOEXC = static_cast<int>(TO_NEG_INF) | static_cast<int>(NO_EXC),
+ CEIL = static_cast<int>(TO_POS_INF) | static_cast<int>(RAISE_EXC),
+ CEIL_NOEXC = static_cast<int>(TO_POS_INF) | static_cast<int>(NO_EXC),
+ TRUNC = static_cast<int>(TO_ZERO) | static_cast<int>(RAISE_EXC),
+ TRUNC_NOEXC = static_cast<int>(TO_ZERO) | static_cast<int>(NO_EXC),
+ RINT = static_cast<int>(CUR_DIRECTION) | static_cast<int>(RAISE_EXC),
+ NEARBYINT = static_cast<int>(CUR_DIRECTION) | static_cast<int>(NO_EXC),
+ };
+
+ struct Traits
+ {
+ using CompareType = SIMDImpl::CompareType;
+ using ScaleFactor = SIMDImpl::ScaleFactor;
+ using RoundMode = SIMDImpl::RoundMode;
+ };
+
+    // Vec4: a 4-dimensional attribute in SIMD SOA layout
+ template<typename Float, typename Integer, typename Double>
+ union Vec4
+ {
+ Float v[4];
+ Integer vi[4];
+ Double vd[4];
+ struct
+ {
+ Float x;
+ Float y;
+ Float z;
+ Float w;
+ };
+ SIMDINLINE Float& operator[] (const int i) { return v[i]; }
+ SIMDINLINE Float const & operator[] (const int i) const { return v[i]; }
+ SIMDINLINE Vec4& operator=(Vec4 const & in)
+ {
+ v[0] = in.v[0];
+ v[1] = in.v[1];
+ v[2] = in.v[2];
+ v[3] = in.v[3];
+ return *this;
+ }
+ };
+
+ namespace SIMD128Impl
+ {
+ union Float
+ {
+ SIMDINLINE Float() = default;
+ SIMDINLINE Float(__m128 in) : v(in) {}
+ SIMDINLINE Float& operator=(__m128 in) { v = in; return *this; }
+ SIMDINLINE Float& operator=(Float const & in) { v = in.v; return *this; }
+ SIMDINLINE operator __m128() const { return v; }
+
+ SIMDALIGN(__m128, 16) v;
+ };
+
+ union Integer
+ {
+ SIMDINLINE Integer() = default;
+ SIMDINLINE Integer(__m128i in) : v(in) {}
+ SIMDINLINE Integer& operator=(__m128i in) { v = in; return *this; }
+ SIMDINLINE Integer& operator=(Integer const & in) { v = in.v; return *this; }
+ SIMDINLINE operator __m128i() const { return v; }
+ SIMDALIGN(__m128i, 16) v;
+ };
+
+ union Double
+ {
+ SIMDINLINE Double() = default;
+ SIMDINLINE Double(__m128d in) : v(in) {}
+ SIMDINLINE Double& operator=(__m128d in) { v = in; return *this; }
+ SIMDINLINE Double& operator=(Double const & in) { v = in.v; return *this; }
+ SIMDINLINE operator __m128d() const { return v; }
+ SIMDALIGN(__m128d, 16) v;
+ };
+
+ using Vec4 = SIMDImpl::Vec4<Float, Integer, Double>;
+ using Mask = uint8_t;
+
+ static const uint32_t SIMD_WIDTH = 4;
+ } // ns SIMD128Impl
+
+ namespace SIMD256Impl
+ {
+ union Float
+ {
+ SIMDINLINE Float() = default;
+ SIMDINLINE Float(__m256 in) : v(in) {}
+ SIMDINLINE Float(SIMD128Impl::Float in_lo, SIMD128Impl::Float in_hi = _mm_setzero_ps())
+ {
+ v = _mm256_insertf128_ps(_mm256_castps128_ps256(in_lo), in_hi, 0x1);
+ }
+ SIMDINLINE Float& operator=(__m256 in) { v = in; return *this; }
+ SIMDINLINE Float& operator=(Float const & in) { v = in.v; return *this; }
+ SIMDINLINE operator __m256() const { return v; }
+
+ SIMDALIGN(__m256, 32) v;
+ SIMD128Impl::Float v4[2];
+ };
+
+ union Integer
+ {
+ SIMDINLINE Integer() = default;
+ SIMDINLINE Integer(__m256i in) : v(in) {}
+ SIMDINLINE Integer(SIMD128Impl::Integer in_lo, SIMD128Impl::Integer in_hi = _mm_setzero_si128())
+ {
+ v = _mm256_insertf128_si256(_mm256_castsi128_si256(in_lo), in_hi, 0x1);
+ }
+ SIMDINLINE Integer& operator=(__m256i in) { v = in; return *this; }
+ SIMDINLINE Integer& operator=(Integer const & in) { v = in.v; return *this; }
+ SIMDINLINE operator __m256i() const { return v; }
+
+ SIMDALIGN(__m256i, 32) v;
+ SIMD128Impl::Integer v4[2];
+ };
+
+ union Double
+ {
+ SIMDINLINE Double() = default;
+ SIMDINLINE Double(__m256d in) : v(in) {}
+ SIMDINLINE Double(SIMD128Impl::Double in_lo, SIMD128Impl::Double in_hi = _mm_setzero_pd())
+ {
+ v = _mm256_insertf128_pd(_mm256_castpd128_pd256(in_lo), in_hi, 0x1);
+ }
+ SIMDINLINE Double& operator=(__m256d in) { v = in; return *this; }
+ SIMDINLINE Double& operator=(Double const & in) { v = in.v; return *this; }
+ SIMDINLINE operator __m256d() const { return v; }
+
+ SIMDALIGN(__m256d, 32) v;
+ SIMD128Impl::Double v4[2];
+ };
+
+ using Vec4 = SIMDImpl::Vec4<Float, Integer, Double>;
+ using Mask = uint8_t;
+
+ static const uint32_t SIMD_WIDTH = 8;
+ } // ns SIMD256Impl
+
+ namespace SIMD512Impl
+ {
+#if !defined(_MM_K0_REG)
+ // Define AVX512 types if not included via immintrin.h.
+        // All data members of these types are ONLY to be viewed
+ // in a debugger. Do NOT access them via code!
+ union __m512
+ {
+ private:
+ float m512_f32[16];
+ };
+        union __m512d
+ {
+ private:
+ double m512d_f64[8];
+ };
+
+ union __m512i
+ {
+ private:
+ int8_t m512i_i8[64];
+ int16_t m512i_i16[32];
+ int32_t m512i_i32[16];
+ int64_t m512i_i64[8];
+ uint8_t m512i_u8[64];
+ uint16_t m512i_u16[32];
+ uint32_t m512i_u32[16];
+ uint64_t m512i_u64[8];
+ };
+
+ using __mmask16 = uint16_t;
+#endif
+
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+#define SIMD_ALIGNMENT_BYTES 64
+#else
+#define SIMD_ALIGNMENT_BYTES 32
+#endif
+
+ union Float
+ {
+ SIMDINLINE Float() = default;
+ SIMDINLINE Float(__m512 in) : v(in) {}
+ SIMDINLINE Float(SIMD256Impl::Float in_lo, SIMD256Impl::Float in_hi = _mm256_setzero_ps()) { v8[0] = in_lo; v8[1] = in_hi; }
+ SIMDINLINE Float& operator=(__m512 in) { v = in; return *this; }
+ SIMDINLINE Float& operator=(Float const & in)
+ {
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+ v = in.v;
+#else
+ v8[0] = in.v8[0];
+ v8[1] = in.v8[1];
+#endif
+ return *this;
+ }
+ SIMDINLINE operator __m512() const { return v; }
+
+ SIMDALIGN(__m512, SIMD_ALIGNMENT_BYTES) v;
+ SIMD256Impl::Float v8[2];
+ };
+
+ union Integer
+ {
+ SIMDINLINE Integer() = default;
+ SIMDINLINE Integer(__m512i in) : v(in) {}
+ SIMDINLINE Integer(SIMD256Impl::Integer in_lo, SIMD256Impl::Integer in_hi = _mm256_setzero_si256()) { v8[0] = in_lo; v8[1] = in_hi; }
+ SIMDINLINE Integer& operator=(__m512i in) { v = in; return *this; }
+ SIMDINLINE Integer& operator=(Integer const & in)
+ {
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+ v = in.v;
+#else
+ v8[0] = in.v8[0];
+ v8[1] = in.v8[1];
+#endif
+ return *this;
+ }
+
+ SIMDINLINE operator __m512i() const { return v; }
+
+ SIMDALIGN(__m512i, SIMD_ALIGNMENT_BYTES) v;
+ SIMD256Impl::Integer v8[2];
+ };
+
+ union Double
+ {
+ SIMDINLINE Double() = default;
+ SIMDINLINE Double(__m512d in) : v(in) {}
+ SIMDINLINE Double(SIMD256Impl::Double in_lo, SIMD256Impl::Double in_hi = _mm256_setzero_pd()) { v8[0] = in_lo; v8[1] = in_hi; }
+ SIMDINLINE Double& operator=(__m512d in) { v = in; return *this; }
+ SIMDINLINE Double& operator=(Double const & in)
+ {
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+ v = in.v;
+#else
+ v8[0] = in.v8[0];
+ v8[1] = in.v8[1];
+#endif
+ return *this;
+ }
+
+ SIMDINLINE operator __m512d() const { return v; }
+
+ SIMDALIGN(__m512d, SIMD_ALIGNMENT_BYTES) v;
+ SIMD256Impl::Double v8[2];
+ };
+
+ typedef SIMDImpl::Vec4<Float, Integer, Double> SIMDALIGN(Vec4, 64);
+ using Mask = __mmask16;
+
+ static const uint32_t SIMD_WIDTH = 16;
+
+#undef SIMD_ALIGNMENT_BYTES
+ } // ns SIMD512Impl
+} // ns SIMDImpl
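[editor's note — illustrative sketch, not part of the patch] On pre-AVX512 targets the 512-bit types are emulated as two 256-bit halves, which is why the assignment operators above copy v8[0]/v8[1] instead of the __m512 member (debugger-only in that configuration). A sketch of the emulated path:

    SIMDImpl::SIMD512Impl::Float x(_mm256_set1_ps(1.0f),  // low 256 bits
                                   _mm256_set1_ps(2.0f)); // high 256 bits
    SIMDImpl::SIMD512Impl::Float y;
    y = x; // on AVX/AVX2 this copies y.v8[0] = x.v8[0]; y.v8[1] = x.v8[1];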
};
#if KNOB_SIMD_WIDTH == 8
-static const simdscalar vCenterOffsetsX = __m256{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
-static const simdscalar vCenterOffsetsY = __m256{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
-static const simdscalar vULOffsetsX = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
-static const simdscalar vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
+static const __m256 vCenterOffsetsX = __m256{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
+static const __m256 vCenterOffsetsY = __m256{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
+static const __m256 vULOffsetsX = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
+static const __m256 vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
#define MASK 0xff
#endif
uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
if(T::MultisampleT::numSamples == 1)
{
- sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
+ sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
}
else if(T::MultisampleT::numSamples == 2)
{
- sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
+ sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
}
else if(T::MultisampleT::numSamples == 4)
{
- sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
+ sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
}
else if(T::MultisampleT::numSamples == 8)
{
- sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
+ sampleCoverage[0] = _simd_set1_epi32(centerCoverage);
}
else if(T::MultisampleT::numSamples == 16)
{
- sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
- sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
+ sampleCoverage[0] = _simd_set1_epi32(centerCoverage);
+ sampleCoverage[1] = _simd_set1_epi32(centerCoverage);
}
}
else
{
- __m256i src = _mm256_set1_epi32(0);
- __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
+ simdscalari src = _simd_set1_epi32(0);
+ simdscalari index0 = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
if(T::MultisampleT::numSamples == 1)
{
- mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
+ mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
}
else if(T::MultisampleT::numSamples == 2)
{
- mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
+ mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
}
else if(T::MultisampleT::numSamples == 4)
{
- mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
+ mask[0] = _simd_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
}
else if(T::MultisampleT::numSamples == 8)
{
- mask[0] = _mm256_set1_epi32(-1);
+ mask[0] = _simd_set1_epi32(-1);
}
else if(T::MultisampleT::numSamples == 16)
{
- mask[0] = _mm256_set1_epi32(-1);
- mask[1] = _mm256_set1_epi32(-1);
- index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
+ mask[0] = _simd_set1_epi32(-1);
+ mask[1] = _simd_set1_epi32(-1);
+ index1 = _simd_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
}
// gather coverage for samples 0-7
packedSampleCoverage = packedCoverage0;
}
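[editor's note — illustrative sketch, not part of the patch] Both numSamples ladders above lay out one 32-bit lane per sample: the first broadcasts the pixel's center coverage into the lanes of the covered samples, the second builds an all-ones mask over the valid sample lanes. A scalar picture of the layout:

    uint32_t lanes[8] = {0}, valid[8] = {0};
    for (uint32_t s = 0; s < T::MultisampleT::numSamples && s < 8; ++s)
    {
        lanes[s] = centerCoverage; // lane s holds sample s's coverage
        valid[s] = ~0u;            // lane s participates in the gather
    }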
#else
- simdscalari permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
+ simdscalari permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
// pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
simdscalari packedSampleCoverage;
if(T::MultisampleT::numSamples > 8)
{
- permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
+ permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
// pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
{
uint32_t inputMask[KNOB_SIMD_WIDTH];
generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
- inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
+ inputCoverage = _simd_castsi_ps(_simd_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
}
};
{
// will need to update for avx512
assert(KNOB_SIMD_WIDTH == 8);
- simdscalari vec = _mm256_set1_epi32(coverageMask[0]);
- const simdscalari bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
+ simdscalari vec = _simd_set1_epi32(coverageMask[0]);
+ const simdscalari bit = _simd_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
vec = _simd_and_si(vec, bit);
- vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
+ vec = _simd_cmplt_epi32(_simd_setzero_si(), vec);
vec = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec);
inputCoverage = _simd_castsi_ps(vec);
}
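[editor's note — illustrative sketch, not part of the patch] The and/cmplt/blendv sequence above expands one 8-bit coverage mask into eight integer lanes of 0 or 1; lane i tests bit i, since _simd_set_epi32 lists its arguments from the highest lane down. Scalar equivalent:

    uint32_t lane[8];
    for (uint32_t i = 0; i < 8; ++i)
        lane[i] = (coverageMask[0] >> i) & 1;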
(inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
// look up and set the sample offsets from UL pixel corner for first covered sample
- __m256 vXSample = _mm256_set_ps(samplePos.X(sampleNum[7]),
+ simdscalar vXSample = _simd_set_ps(samplePos.X(sampleNum[7]),
samplePos.X(sampleNum[6]),
samplePos.X(sampleNum[5]),
samplePos.X(sampleNum[4]),
samplePos.X(sampleNum[1]),
samplePos.X(sampleNum[0]));
- __m256 vYSample = _mm256_set_ps(samplePos.Y(sampleNum[7]),
+ simdscalar vYSample = _simd_set_ps(samplePos.Y(sampleNum[7]),
samplePos.Y(sampleNum[6]),
samplePos.Y(sampleNum[5]),
samplePos.Y(sampleNum[4]),
// Case (1) and case (3b) - All samples covered or not covered with full SampleMask
static const simdscalari vFullyCoveredMask = T::MultisampleT::FullSampleMask();
- simdscalari vInputCoveragei = _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
+ simdscalari vInputCoveragei = _simd_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
simdscalari vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
static const simdscalari vZero = _simd_setzero_si();
inputSlot = backendState.vertexAttribOffset + i;
}
- __m128 attrib[3]; // triangle attribs (always 4 wide)
+ simd4scalar attrib[3]; // triangle attribs (always 4 wide)
float* pAttribStart = pBuffer;
if (HasConstantInterpT::value || IsDegenerate::value)
for (uint32_t i = 0; i < NumVertsT::value; ++i)
{
- _mm_store_ps(pBuffer, attrib[vid]);
+ SIMD128::store_ps(pBuffer, attrib[vid]);
pBuffer += 4;
}
}
for (uint32_t i = 0; i < NumVertsT::value; ++i)
{
- _mm_store_ps(pBuffer, attrib[i]);
+ SIMD128::store_ps(pBuffer, attrib[i]);
pBuffer += 4;
}
}
for (uint32_t i = 0; i < NumVertsT::value; ++i)
{
- _mm_store_ps(pBuffer, attrib[i]);
+ SIMD128::store_ps(pBuffer, attrib[i]);
pBuffer += 4;
}
}
// effect of the missing vertices in the triangle interpolation.
for (uint32_t v = NumVertsT::value; v < 3; ++v)
{
- _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]);
+ SIMD128::store_ps(pBuffer, attrib[NumVertsT::value - 1]);
pBuffer += 4;
}
{
static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
simd16scalari &scisXmin, simd16scalari &scisYmin,
                           simd16scalari &scisXmax, simd16scalari &scisYmax)
    {
scisXmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
pScissorsInFixedPoint[pViewportIndex[1]].xmin,
pScissorsInFixedPoint[pViewportIndex[2]].xmin,
uint32_t clipAttribSlot = clipSlot == 0 ?
VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
- __m128 primClipDist[3];
+ simd4scalar primClipDist[3];
pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
float vertClipDist[NumVerts];
for (uint32_t e = 0; e < NumVerts; ++e)
{
OSALIGNSIMD(float) aVertClipDist[4];
- _mm_store_ps(aVertClipDist, primClipDist[e]);
+ SIMD128::store_ps(aVertClipDist, primClipDist[e]);
vertClipDist[e] = aVertClipDist[clipComp];
};
(SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
}
+ simdBBox bbox;
+
if (!triMask)
{
goto endBinTriangles;
}
// Calc bounding box of triangles
- simdBBox bbox;
calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
// determine if triangle falls between pixel centers and discard
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
// Gather the AOS effective scissor rects based on the per-prim VP index.
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
- simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
- if (state.backendState.readViewportArrayIndex)
- {
- GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
- scisXmin, scisYmin, scisXmax, scisYmax);
- }
- else // broadcast fast path for non-VPAI case.
{
- scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
- scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
- scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
- scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
- }
+ simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
+ if (state.backendState.readViewportArrayIndex)
+ {
+ GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+ scisXmin, scisYmin, scisXmax, scisYmax);
+ }
+ else // broadcast fast path for non-VPAI case.
+ {
+ scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+ scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+ scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+ scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+ }
- // Make triangle bbox inclusive
- bbox.xmax = _simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1));
- bbox.ymax = _simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1));
+ // Make triangle bbox inclusive
+ bbox.xmax = _simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1));
+ bbox.ymax = _simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1));
- bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
- bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
- bbox.xmax = _simd_min_epi32(bbox.xmax, scisXmax);
- bbox.ymax = _simd_min_epi32(bbox.ymax, scisYmax);
+ bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
+ bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
+ bbox.xmax = _simd_min_epi32(bbox.xmax, scisXmax);
+ bbox.ymax = _simd_min_epi32(bbox.ymax, scisYmax);
+ }
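[editor's note — illustrative sketch, not part of the patch] Per lane, the block above shrinks the triangle bbox (whose max edge is exclusive in x.8 fixed point) to the scissor rect. Scalar equivalent, per lane:

    xmax = std::min(xmax - 1, scisXmax); // exclusive max -> inclusive, then clamp
    ymax = std::min(ymax - 1, scisYmax);
    xmin = std::max(xmin, scisXmin);
    ymin = std::max(ymin, scisYmin);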
if (CT::IsConservativeT::value)
{
// transpose verts needed for backend
/// @todo modify BE to take non-transformed verts
- __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
+ simd4scalar vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
// store triangle vertex data
desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
- _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
- _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
- _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
- _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
+ SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
+ SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
+ SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
+ SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
// store user clip distances
if (rastState.clipDistanceMask)
#if USE_SIMD16_FRONTEND
template <typename CT>
-void SIMDAPI BinTriangles_simd16(
+void SIMDCALL BinTriangles_simd16(
DRAW_CONTEXT *pDC,
PA_STATE& pa,
uint32_t workerId,
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
// Gather the AOS effective scissor rects based on the per-prim VP index.
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
- simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
-
- if (state.backendState.readViewportArrayIndex)
- {
- GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
- scisXmin, scisYmin, scisXmax, scisYmax);
- }
- else // broadcast fast path for non-VPAI case.
{
- scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
- scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
- scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
- scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
- }
+ simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
+
+ if (state.backendState.readViewportArrayIndex)
+ {
+ GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+ scisXmin, scisYmin, scisXmax, scisYmax);
+ }
+ else // broadcast fast path for non-VPAI case.
+ {
+ scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+ scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+ scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+ scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+ }
- // Make triangle bbox inclusive
- bbox.xmax = _simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1));
- bbox.ymax = _simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1));
+ // Make triangle bbox inclusive
+ bbox.xmax = _simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1));
+ bbox.ymax = _simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1));
- bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
- bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
- bbox.xmax = _simd16_min_epi32(bbox.xmax, scisXmax);
- bbox.ymax = _simd16_min_epi32(bbox.ymax, scisYmax);
+ bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
+ bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
+ bbox.xmax = _simd16_min_epi32(bbox.xmax, scisXmax);
+ bbox.ymax = _simd16_min_epi32(bbox.ymax, scisYmax);
+ }
if (CT::IsConservativeT::value)
{
// transpose verts needed for backend
/// @todo modify BE to take non-transformed verts
- __m128 vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
- __m128 vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
- __m128 vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
- __m128 vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+ simd4scalar vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+ simd4scalar vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+ simd4scalar vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+ simd4scalar vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
vTranspose3x8(vHorizX[0], _simd16_extract_ps(tri[0].x, 0), _simd16_extract_ps(tri[1].x, 0), _simd16_extract_ps(tri[2].x, 0));
vTranspose3x8(vHorizY[0], _simd16_extract_ps(tri[0].y, 0), _simd16_extract_ps(tri[1].y, 0), _simd16_extract_ps(tri[2].y, 0));
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
// Gather the AOS effective scissor rects based on the per-prim VP index.
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
- simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
- if (state.backendState.readViewportArrayIndex)
- {
- GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
- scisXmin, scisYmin, scisXmax, scisYmax);
- }
- else // broadcast fast path for non-VPAI case.
{
- scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
- scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
- scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
- scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
- }
+ simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
+ if (state.backendState.readViewportArrayIndex)
+ {
+ GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+ scisXmin, scisYmin, scisXmax, scisYmax);
+ }
+ else // broadcast fast path for non-VPAI case.
+ {
+ scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+ scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+ scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+ scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+ }
- bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
- bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
- bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
- bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
+ bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
+ bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
+ bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
+ bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
+ }
// Cull bloated points completely outside scissor
simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
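[editor's note — illustrative sketch, not part of the patch] After the clamp, an inverted box means the bloated point lies entirely outside the scissor; combining the X test above with the corresponding Y test yields the cull mask:

    bool culled = (xmin > xmax) || (ymin > ymax); // per lane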
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
// Gather the AOS effective scissor rects based on the per-prim VP index.
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
- simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
- if (state.backendState.readViewportArrayIndex)
{
- GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
- scisXmin, scisYmin, scisXmax, scisYmax);
- }
- else // broadcast fast path for non-VPAI case.
- {
- scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
- scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
- scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
- scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
- }
+ simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
+ if (state.backendState.readViewportArrayIndex)
+ {
+ GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+ scisXmin, scisYmin, scisXmax, scisYmax);
+ }
+ else // broadcast fast path for non-VPAI case.
+ {
+ scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+ scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+ scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+ scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+ }
- bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
- bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
- bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
- bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
+ bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
+ bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
+ bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
+ bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
+ }
// Cull bloated points completely outside scissor
simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax);
AR_END(FEBinPoints, 1);
}
-void SIMDAPI BinPoints_simd16(
+void SIMDCALL BinPoints_simd16(
DRAW_CONTEXT *pDC,
PA_STATE& pa,
uint32_t workerId,
simdscalar& vRecipW0 = recipW[0];
simdscalar& vRecipW1 = recipW[1];
+ simd4scalar vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
+
// convert to fixed point
simdscalari vXi[2], vYi[2];
vXi[0] = fpToFixedPointVertical(prim[0].x);
bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
- simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
- if (state.backendState.readViewportArrayIndex)
- {
- GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
- scisXmin, scisYmin, scisXmax, scisYmax);
- }
- else // broadcast fast path for non-VPAI case.
{
- scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
- scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
- scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
- scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
- }
+ simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
+ if (state.backendState.readViewportArrayIndex)
+ {
+ GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+ scisXmin, scisYmin, scisXmax, scisYmax);
+ }
+ else // broadcast fast path for non-VPAI case.
+ {
+ scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+ scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+ scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+ scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+ }
- bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
- bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
- bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
- bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
+ bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
+ bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
+ bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
+ bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
+ }
// Cull prims completely outside scissor
{
// transpose verts needed for backend
/// @todo modify BE to take non-transformed verts
- __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
// store line vertex data
desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
- _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
- _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
- _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
- _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
+ SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
+ SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
+ SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
+ SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
// store user clip distances
if (rastState.clipDistanceMask)
bbox.ymax = _simd16_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
- simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
-
- if (state.backendState.readViewportArrayIndex)
- {
- GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
- scisXmin, scisYmin, scisXmax, scisYmax);
- }
- else // broadcast fast path for non-VPAI case.
{
- scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
- scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
- scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
- scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
- }
+ simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
+
+ if (state.backendState.readViewportArrayIndex)
+ {
+ GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+ scisXmin, scisYmin, scisXmax, scisYmax);
+ }
+ else // broadcast fast path for non-VPAI case.
+ {
+ scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+ scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+ scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+ scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+ }
- bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
- bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
- bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
- bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
+ bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
+ bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
+ bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
+ bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
+ }
// Cull prims completely outside scissor
{
// transpose verts needed for backend
/// @todo modify BE to take non-transformed verts
- __m128 vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
- __m128 vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
- __m128 vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
- __m128 vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+ simd4scalar vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+ simd4scalar vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+ simd4scalar vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+ simd4scalar vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
vTranspose3x8(vHorizX[0], _simd16_extract_ps(prim[0].x, 0), _simd16_extract_ps(prim[1].x, 0), unused);
vTranspose3x8(vHorizY[0], _simd16_extract_ps(prim[0].y, 0), _simd16_extract_ps(prim[1].y, 0), unused);
}
#if USE_SIMD16_FRONTEND
-void SIMDAPI BinLines_simd16(
+void SIMDCALL BinLines_simd16(
DRAW_CONTEXT *pDC,
PA_STATE& pa,
uint32_t workerId,
}
#if USE_SIMD16_FRONTEND
-void SIMDAPI ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
+void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
{
SWR_CONTEXT *pContext = pDC->pContext;
AR_BEGIN(FEClipTriangles, pDC->drawId);
AR_END(FEClipTriangles, 1);
}
-void SIMDAPI ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
+void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
{
SWR_CONTEXT *pContext = pDC->pContext;
AR_BEGIN(FEClipLines, pDC->drawId);
AR_END(FEClipLines, 1);
}
-void SIMDAPI ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
+void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
{
SWR_CONTEXT *pContext = pDC->pContext;
AR_BEGIN(FEClipPoints, pDC->drawId);
AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
// we have to clip tris, execute the clipper, which will also
// call the binner
- ClipSimd(vMask16(primMask), vMask16(clipMask), pa, primId);
+ ClipSimd(_simd16_vmask_ps(primMask), _simd16_vmask_ps(clipMask), pa, primId);
AR_END(FEGuardbandClip, 1);
}
else if (validMask)
{
simd16scalari vOffsets = ComputeOffsets(attrib, vIndices, component);
simd16scalar vSrc = _simd16_setzero_ps();
- return _simd16_mask_i32gather_ps(vSrc, pBuffer, vOffsets, _simd16_castps_si(vMask), 1);
+ return _simd16_mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask, 1);
}
#endif
void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
#if USE_SIMD16_FRONTEND
-void SIMDAPI ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
-void SIMDAPI ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
-void SIMDAPI ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
+void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
+void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
+void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
#endif
#if ENABLE_AVX512_SIMD16
// function signature for pipeline stages that execute after primitive assembly
-typedef void(SIMDAPI *PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[],
+typedef void(SIMDCALL *PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[],
uint32_t primMask, simd16scalari primID);
#endif
/// @param pSrc - source data in SOA form
/// @param dst - output data in SOA form
template<SWR_FORMAT SrcFormat>
-INLINE void SIMDAPI LoadSOA(const uint8_t *pSrc, simd16vector &dst)
+INLINE void SIMDCALL LoadSOA(const uint8_t *pSrc, simd16vector &dst)
{
// fast path for float32
if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<SrcFormat>::GetBPC(0) == 32))
/// @param vComp - SIMD vector of floats
/// @param Component - component
template<SWR_FORMAT Format>
-INLINE simd16scalar SIMDAPI Clamp(simd16scalar vComp, uint32_t Component)
+INLINE simd16scalar SIMDCALL Clamp(simd16scalar vComp, uint32_t Component)
{
if (FormatTraits<Format>::isNormalized(Component))
{
/// @param vComp - SIMD vector of floats
/// @param Component - component
template<SWR_FORMAT Format>
-INLINE simd16scalar SIMDAPI Normalize(simd16scalar vComp, uint32_t Component)
+INLINE simd16scalar SIMDCALL Normalize(simd16scalar vComp, uint32_t Component)
{
if (FormatTraits<Format>::isNormalized(Component))
{
/// @param src - source data in SOA form
/// @param dst - output data in SOA form
template<SWR_FORMAT DstFormat>
-INLINE void SIMDAPI StoreSOA(const simd16vector &src, uint8_t *pDst)
+INLINE void SIMDCALL StoreSOA(const simd16vector &src, uint8_t *pDst)
{
// fast path for float32
if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<DstFormat>::GetBPC(0) == 32))
static simdscalar pack(simdscalar &in) = delete;
#if ENABLE_AVX512_SIMD16
static simd16scalar loadSOA_16(const uint8_t *pSrc) = delete;
- static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src) = delete;
+ static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src) = delete;
static simd16scalar unpack(simd16scalar &in) = delete;
static simd16scalar pack(simd16scalar &in) = delete;
#endif
static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); }
#if ENABLE_AVX512_SIMD16
static simd16scalar loadSOA_16(const uint8_t *pSrc) { return _simd16_setzero_ps(); }
- static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src) { return; }
+ static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src) { return; }
static simd16scalar unpack(simd16scalar &in) { return _simd16_setzero_ps(); }
static simd16scalar pack(simd16scalar &in) { return _simd16_setzero_ps(); }
#endif
__m256i result = _mm256_castsi128_si256(resLo);
result = _mm256_insertf128_si256(result, resHi, 1);
- return _mm256_castsi256_ps(result);
+ return simdscalar{ _mm256_castsi256_ps(result) };
#else
return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
#endif
return result;
}
- static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src)
+ static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
{
// store simd16 bytes
_mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
static simd16scalar unpack(simd16scalar &in)
{
- simd16scalari result = _simd16_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0))));
+ simd4scalari tmp = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)));
+ simd16scalari result = _simd16_cvtepu8_epi32(tmp);
return _simd16_castsi_ps(result);
}
return result;
}
- static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src)
+ static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
{
// store simd16 bytes
_mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
static simd16scalar unpack(simd16scalar &in)
{
- simd16scalari result = _simd16_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0))));
+ simd4scalari tmp = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)));
+ simd16scalari result = _simd16_cvtepu8_epi32(tmp);
return _simd16_castsi_ps(result);
}
return result;
}
- static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src)
+ static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
{
_simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
}
return result;
}
- static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src)
+ static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
{
_simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
}
return _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
}
- static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src)
+ static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
{
_simd16_store_ps(reinterpret_cast<float *>(pDst), src);
}
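[editor's note — illustrative sketch, not part of the patch] For 32-bit float formats the SOA load/store specializations above reduce to straight 16-wide copies, so a hypothetical roundtrip should be bit-exact (assuming a 4-component format such as R32G32B32A32_FLOAT):

    simd16vector v;
    LoadSOA<R32G32B32A32_FLOAT>(pSrc, v);  // 4 components x 16 lanes x 4 bytes
    StoreSOA<R32G32B32A32_FLOAT>(v, pDst); // memcmp(pSrc, pDst, 256) == 0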
#if ENABLE_AVX512_SIMD16
template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
-inline static simd16scalar SIMDAPI fastpow(simd16scalar value)
+inline static simd16scalar SIMDCALL fastpow(simd16scalar value)
{
static const float factor1 = exp2(127.0f * expden / expnum - 127.0f)
* powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum);
return result;
}
-inline static simd16scalar SIMDAPI pow512_4(simd16scalar arg)
+inline static simd16scalar SIMDCALL pow512_4(simd16scalar arg)
{
// 5/12 is too small, so compute the 4th root of 20/12 instead.
// 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
return xavg;
}
-inline static simd16scalar SIMDAPI powf_wrapper(const simd16scalar base, float exp)
+inline static simd16scalar SIMDCALL powf_wrapper(const simd16scalar base, float exp)
{
const float *f = reinterpret_cast<const float *>(&base);
return TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
}
- INLINE static void SIMDAPI storeSOA(uint32_t comp, uint8_t *pDst, simd16scalar src)
+ INLINE static void SIMDCALL storeSOA(uint32_t comp, uint8_t *pDst, simd16scalar src)
{
switch (comp)
{
#include "common/simdintrin.h"
INLINE
-void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
+void vTranspose(simd4scalar &row0, simd4scalar &row1, simd4scalar &row2, simd4scalar &row3)
{
- __m128i row0i = _mm_castps_si128(row0);
- __m128i row1i = _mm_castps_si128(row1);
- __m128i row2i = _mm_castps_si128(row2);
- __m128i row3i = _mm_castps_si128(row3);
+ simd4scalari row0i = SIMD128::castps_si(row0);
+ simd4scalari row1i = SIMD128::castps_si(row1);
+ simd4scalari row2i = SIMD128::castps_si(row2);
+ simd4scalari row3i = SIMD128::castps_si(row3);
- __m128i vTemp = row2i;
- row2i = _mm_unpacklo_epi32(row2i, row3i);
- vTemp = _mm_unpackhi_epi32(vTemp, row3i);
+ simd4scalari vTemp = row2i;
+ row2i = SIMD128::unpacklo_epi32(row2i, row3i);
+ vTemp = SIMD128::unpackhi_epi32(vTemp, row3i);
row3i = row0i;
- row0i = _mm_unpacklo_epi32(row0i, row1i);
- row3i = _mm_unpackhi_epi32(row3i, row1i);
+ row0i = SIMD128::unpacklo_epi32(row0i, row1i);
+ row3i = SIMD128::unpackhi_epi32(row3i, row1i);
row1i = row0i;
- row0i = _mm_unpacklo_epi64(row0i, row2i);
- row1i = _mm_unpackhi_epi64(row1i, row2i);
+ row0i = SIMD128::unpacklo_epi64(row0i, row2i);
+ row1i = SIMD128::unpackhi_epi64(row1i, row2i);
row2i = row3i;
- row2i = _mm_unpacklo_epi64(row2i, vTemp);
- row3i = _mm_unpackhi_epi64(row3i, vTemp);
+ row2i = SIMD128::unpacklo_epi64(row2i, vTemp);
+ row3i = SIMD128::unpackhi_epi64(row3i, vTemp);
- row0 = _mm_castsi128_ps(row0i);
- row1 = _mm_castsi128_ps(row1i);
- row2 = _mm_castsi128_ps(row2i);
- row3 = _mm_castsi128_ps(row3i);
+ row0 = SIMD128::castsi_ps(row0i);
+ row1 = SIMD128::castsi_ps(row1i);
+ row2 = SIMD128::castsi_ps(row2i);
+ row3 = SIMD128::castsi_ps(row3i);
}
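[editor's note — illustrative sketch, not part of the patch] vTranspose performs an in-place 4x4 transpose via the unpack cascade above, e.g. turning four AOS xyzw vertices into SOA rows. Sketch, where verts is a hypothetical array of 4-float vertices:

    simd4scalar r0 = SIMD128::load_ps(&verts[0].x); // x0 y0 z0 w0
    simd4scalar r1 = SIMD128::load_ps(&verts[1].x); // x1 y1 z1 w1
    simd4scalar r2 = SIMD128::load_ps(&verts[2].x);
    simd4scalar r3 = SIMD128::load_ps(&verts[3].x);
    vTranspose(r0, r1, r2, r3); // r0 = x0..x3, r1 = y0..y3, ...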
INLINE
-void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
+void vTranspose(simd4scalari &row0, simd4scalari &row1, simd4scalari &row2, simd4scalari &row3)
{
- __m128i vTemp = row2;
- row2 = _mm_unpacklo_epi32(row2, row3);
- vTemp = _mm_unpackhi_epi32(vTemp, row3);
+ simd4scalari vTemp = row2;
+ row2 = SIMD128::unpacklo_epi32(row2, row3);
+ vTemp = SIMD128::unpackhi_epi32(vTemp, row3);
row3 = row0;
- row0 = _mm_unpacklo_epi32(row0, row1);
- row3 = _mm_unpackhi_epi32(row3, row1);
+ row0 = SIMD128::unpacklo_epi32(row0, row1);
+ row3 = SIMD128::unpackhi_epi32(row3, row1);
row1 = row0;
- row0 = _mm_unpacklo_epi64(row0, row2);
- row1 = _mm_unpackhi_epi64(row1, row2);
+ row0 = SIMD128::unpacklo_epi64(row0, row2);
+ row1 = SIMD128::unpackhi_epi64(row1, row2);
row2 = row3;
- row2 = _mm_unpacklo_epi64(row2, vTemp);
- row3 = _mm_unpackhi_epi64(row3, vTemp);
+ row2 = SIMD128::unpacklo_epi64(row2, vTemp);
+ row3 = SIMD128::unpackhi_epi64(row3, vTemp);
}
#if KNOB_SIMD_WIDTH == 8
INLINE
-void vTranspose3x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2)
+void vTranspose3x8(simd4scalar (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2)
{
simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
simdscalar r1rx = _simd_unpacklo_ps(vSrc1, _simd_setzero_ps()); //y0w0y1w1 y4w4y5w5
simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
- vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
- vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
- vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
- vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
+ vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);
+ vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
+ vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0);
+ vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0);
vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
}
INLINE
-void vTranspose4x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2, const simdscalar &vSrc3)
+void vTranspose4x8(simd4scalar (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2, const simdscalar &vSrc3)
{
simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
simdscalar r1rx = _simd_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5
simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6
simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7
- vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
- vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
- vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
- vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
+ vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);
+ vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
+ vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0);
+ vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0);
vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
#if KNOB_SIMD_WIDTH == 8
#if KNOB_ARCH <= KNOB_ARCH_AVX
- __m128i c0c1 = _mm256_castsi256_si128(src); // rrrrrrrrgggggggg
- __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1)); // bbbbbbbbaaaaaaaa
- __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb
- __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa
- __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg
- __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3); // babababababababa
- __m128i c0123lo = _mm_unpacklo_epi16(c01, c23); // rgbargbargbargba
- __m128i c0123hi = _mm_unpackhi_epi16(c01, c23); // rgbargbargbargba
- _mm_store_si128((__m128i*)pDst, c0123lo);
- _mm_store_si128((__m128i*)(pDst + 16), c0123hi);
+ simd4scalari c0c1 = src.v4[0]; // rrrrrrrrgggggggg
+ simd4scalari c2c3 = SIMD128::castps_si(_simd_extractf128_ps(_simd_castsi_ps(src), 1)); // bbbbbbbbaaaaaaaa
+ simd4scalari c0c2 = SIMD128::unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb
+ simd4scalari c1c3 = SIMD128::unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa
+ simd4scalari c01 = SIMD128::unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg
+ simd4scalari c23 = SIMD128::unpackhi_epi8(c0c2, c1c3); // babababababababa
+ simd4scalari c0123lo = SIMD128::unpacklo_epi16(c01, c23); // rgbargbargbargba
+ simd4scalari c0123hi = SIMD128::unpackhi_epi16(c01, c23); // rgbargbargbargba
+ SIMD128::store_si((simd4scalari*)pDst, c0123lo);
+ SIMD128::store_si((simd4scalari*)(pDst + 16), c0123hi);
#else
simdscalari dst01 = _simd_shuffle_epi8(src,
_simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
{
- __m128i src0 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc)); // rrrrrrrrrrrrrrrr
- __m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 1); // gggggggggggggggg
- __m128i src2 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
- __m128i src3 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 3); // aaaaaaaaaaaaaaaa
+ simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc)); // rrrrrrrrrrrrrrrr
+ simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 1); // gggggggggggggggg
+ simd4scalari src2 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
+ simd4scalari src3 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 3); // aaaaaaaaaaaaaaaa
simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0);
simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1);
#if KNOB_SIMD_WIDTH == 8
simdscalari src = _simd_load_si((const simdscalari*)pSrc);
- __m128i rg = _mm256_castsi256_si128(src); // rrrrrrrr gggggggg
- __m128i g = _mm_unpackhi_epi64(rg, rg); // gggggggg gggggggg
- rg = _mm_unpacklo_epi8(rg, g);
- _mm_store_si128((__m128i*)pDst, rg);
+ simd4scalari rg = src.v4[0]; // rrrrrrrr gggggggg
+ simd4scalari g = SIMD128::unpackhi_epi64(rg, rg); // gggggggg gggggggg
+ rg = SIMD128::unpacklo_epi8(rg, g);
+ SIMD128::store_si((simd4scalari*)pDst, rg);
#else
#error Unsupported vector width
#endif
INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
{
- __m128i src0 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc)); // rrrrrrrrrrrrrrrr
- __m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 1); // gggggggggggggggg
+ simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc)); // rrrrrrrrrrrrrrrr
+ simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 1); // gggggggggggggggg
simdscalari cvt0 = _simd_cvtepu8_epi16(src0);
simdscalari cvt1 = _simd_cvtepu8_epi16(src1);
simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);
- __m128 vDst[8];
+ simd4scalar vDst[8];
vTranspose4x8(vDst, src0, src1, src2, src3);
- _mm_store_ps((float*)pDst, vDst[0]);
- _mm_store_ps((float*)pDst+4, vDst[1]);
- _mm_store_ps((float*)pDst+8, vDst[2]);
- _mm_store_ps((float*)pDst+12, vDst[3]);
- _mm_store_ps((float*)pDst+16, vDst[4]);
- _mm_store_ps((float*)pDst+20, vDst[5]);
- _mm_store_ps((float*)pDst+24, vDst[6]);
- _mm_store_ps((float*)pDst+28, vDst[7]);
+ SIMD128::store_ps((float*)pDst, vDst[0]);
+ SIMD128::store_ps((float*)pDst+4, vDst[1]);
+ SIMD128::store_ps((float*)pDst+8, vDst[2]);
+ SIMD128::store_ps((float*)pDst+12, vDst[3]);
+ SIMD128::store_ps((float*)pDst+16, vDst[4]);
+ SIMD128::store_ps((float*)pDst+20, vDst[5]);
+ SIMD128::store_ps((float*)pDst+24, vDst[6]);
+ SIMD128::store_ps((float*)pDst+28, vDst[7]);
#else
#error Unsupported vector width
#endif
simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
- __m128 vDst[8];
+ simd4scalar vDst[8];
vTranspose3x8(vDst, src0, src1, src2);
- _mm_store_ps((float*)pDst, vDst[0]);
- _mm_store_ps((float*)pDst + 4, vDst[1]);
- _mm_store_ps((float*)pDst + 8, vDst[2]);
- _mm_store_ps((float*)pDst + 12, vDst[3]);
- _mm_store_ps((float*)pDst + 16, vDst[4]);
- _mm_store_ps((float*)pDst + 20, vDst[5]);
- _mm_store_ps((float*)pDst + 24, vDst[6]);
- _mm_store_ps((float*)pDst + 28, vDst[7]);
+ SIMD128::store_ps((float*)pDst, vDst[0]);
+ SIMD128::store_ps((float*)pDst + 4, vDst[1]);
+ SIMD128::store_ps((float*)pDst + 8, vDst[2]);
+ SIMD128::store_ps((float*)pDst + 12, vDst[3]);
+ SIMD128::store_ps((float*)pDst + 16, vDst[4]);
+ SIMD128::store_ps((float*)pDst + 20, vDst[5]);
+ SIMD128::store_ps((float*)pDst + 24, vDst[6]);
+ SIMD128::store_ps((float*)pDst + 28, vDst[7]);
#else
#error Unsupported vector width
#endif
{
#if KNOB_SIMD_WIDTH == 8
const float* pfSrc = (const float*)pSrc;
- __m128 src_r0 = _mm_load_ps(pfSrc + 0);
- __m128 src_r1 = _mm_load_ps(pfSrc + 4);
- __m128 src_g0 = _mm_load_ps(pfSrc + 8);
- __m128 src_g1 = _mm_load_ps(pfSrc + 12);
+ simd4scalar src_r0 = SIMD128::load_ps(pfSrc + 0);
+ simd4scalar src_r1 = SIMD128::load_ps(pfSrc + 4);
+ simd4scalar src_g0 = SIMD128::load_ps(pfSrc + 8);
+ simd4scalar src_g1 = SIMD128::load_ps(pfSrc + 12);
- __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0);
- __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0);
- __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1);
- __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1);
+ simd4scalar dst0 = SIMD128::unpacklo_ps(src_r0, src_g0);
+ simd4scalar dst1 = SIMD128::unpackhi_ps(src_r0, src_g0);
+ simd4scalar dst2 = SIMD128::unpacklo_ps(src_r1, src_g1);
+ simd4scalar dst3 = SIMD128::unpackhi_ps(src_r1, src_g1);
float* pfDst = (float*)pDst;
- _mm_store_ps(pfDst + 0, dst0);
- _mm_store_ps(pfDst + 4, dst1);
- _mm_store_ps(pfDst + 8, dst2);
- _mm_store_ps(pfDst + 12, dst3);
+ SIMD128::store_ps(pfDst + 0, dst0);
+ SIMD128::store_ps(pfDst + 4, dst1);
+ SIMD128::store_ps(pfDst + 8, dst2);
+ SIMD128::store_ps(pfDst + 12, dst3);
#else
#error Unsupported vector width
#endif
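[editor's note — illustrative sketch, not part of the patch] The unpacklo/unpackhi pairs above interleave two 8-wide float channels into AOS (r,g) pairs; with r[] and g[] as hypothetical SOA inputs, the scalar equivalent is:

    for (int i = 0; i < 8; ++i)
    {
        pfDst[2 * i + 0] = r[i];
        pfDst[2 * i + 1] = g[i];
    }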
simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));
- __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
- __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
- __m128i src_b = _mm256_extractf128_si256(src_ba, 0);
- __m128i src_a = _mm256_extractf128_si256(src_ba, 1);
-
- __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
- __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
- __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
- __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
-
- __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
- __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
- __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
- __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
-
- _mm_store_si128(((__m128i*)pDst) + 0, dst0);
- _mm_store_si128(((__m128i*)pDst) + 1, dst1);
- _mm_store_si128(((__m128i*)pDst) + 2, dst2);
- _mm_store_si128(((__m128i*)pDst) + 3, dst3);
+ simd4scalari src_r = _simd_extractf128_si(src_rg, 0);
+ simd4scalari src_g = _simd_extractf128_si(src_rg, 1);
+ simd4scalari src_b = _simd_extractf128_si(src_ba, 0);
+ simd4scalari src_a = _simd_extractf128_si(src_ba, 1);
+
+ simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g);
+ simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g);
+ simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a);
+ simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a);
+
+ simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0);
+ simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0);
+ simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1);
+ simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1);
+
+ SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0);
+ SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1);
+ SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2);
+ SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3);
#else
#error Unsupported vector width
#endif
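[editor's note — illustrative sketch, not part of the patch] The epi16/epi32 unpack cascade above converts four SOA 16-bit channels into eight AOS pixels; with r/g/b/a as hypothetical SOA inputs:

    struct Px16 { uint16_t r, g, b, a; }; // hypothetical AOS pixel
    for (int i = 0; i < 8; ++i)
        reinterpret_cast<Px16 *>(pDst)[i] = Px16{ r[i], g[i], b[i], a[i] };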
#if KNOB_SIMD_WIDTH == 8
simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
- __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
- __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
- __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari)));
- __m128i src_a = _mm_undefined_si128();
-
- __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
- __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
- __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
- __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
-
- __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
- __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
- __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
- __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
-
- _mm_store_si128(((__m128i*)pDst) + 0, dst0);
- _mm_store_si128(((__m128i*)pDst) + 1, dst1);
- _mm_store_si128(((__m128i*)pDst) + 2, dst2);
- _mm_store_si128(((__m128i*)pDst) + 3, dst3);
+ simd4scalari src_r = _simd_extractf128_si(src_rg, 0);
+ simd4scalari src_g = _simd_extractf128_si(src_rg, 1);
+ simd4scalari src_b = SIMD128::load_si((const simd4scalari*)(pSrc + sizeof(simdscalari)));
+ simd4scalari src_a = SIMD128::setzero_si();
+
+ simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g);
+ simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g);
+ simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a);
+ simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a);
+
+ simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0);
+ simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0);
+ simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1);
+ simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1);
+
+ SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0);
+ SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1);
+ SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2);
+ SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3);
#else
#error Unsupported vector width
#endif
#if KNOB_SIMD_WIDTH == 8
simdscalar src = _simd_load_ps((const float*)pSrc);
- __m128 comp0 = _mm256_castps256_ps128(src);
- __m128 comp1 = _mm256_extractf128_ps(src, 1);
+ simd4scalar comp0 = _simd_extractf128_ps(src, 0);
+ simd4scalar comp1 = _simd_extractf128_ps(src, 1);
- __m128i comp0i = _mm_castps_si128(comp0);
- __m128i comp1i = _mm_castps_si128(comp1);
+ simd4scalari comp0i = SIMD128::castps_si(comp0);
+ simd4scalari comp1i = SIMD128::castps_si(comp1);
- __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i);
- __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i);
+ simd4scalari resLo = SIMD128::unpacklo_epi16(comp0i, comp1i);
+ simd4scalari resHi = SIMD128::unpackhi_epi16(comp0i, comp1i);
- _mm_store_si128((__m128i*)pDst, resLo);
- _mm_store_si128((__m128i*)pDst + 1, resHi);
+ SIMD128::store_si((simd4scalari*)pDst, resLo);
+ SIMD128::store_si((simd4scalari*)pDst + 1, resHi);
#else
#error Unsupported vector width
#endif
// Write all entries into primitive data buffer for SOS.
while (_BitScanForward(&slot, soMask))
{
- __m128 attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
+ simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
uint32_t paSlot = slot + soState.vertexAttribOffset[streamIndex];
pa.AssembleSingle(paSlot, primIndex, attrib);
if (HasStreamOutT::value)
{
+#if ENABLE_AVX512_SIMD16
gsPa.useAlternateOffset = false;
+#endif
StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
}
{
if (HasStreamOutT::value)
{
+#if ENABLE_AVX512_SIMD16
tessPa.useAlternateOffset = false;
+#endif
StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
}
void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID);
void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID);
#if USE_SIMD16_FRONTEND
-void SIMDAPI BinPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID);
-void SIMDAPI BinLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID);
+void SIMDCALL BinPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID);
+void SIMDCALL BinLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID);
#endif
#if ENABLE_AVX512_SIMD16
virtual bool Assemble_simd16(uint32_t slot, simd16vector verts[]) = 0;
#endif
- virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0;
+ virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) = 0;
virtual bool NextPrim() = 0;
virtual SIMDVERTEX& GetNextVsOutput() = 0;
virtual bool GetNextStreamOutput() = 0;
#if ENABLE_AVX512_SIMD16
typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
- typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+ typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
PFN_PA_FUNC pfnPaFunc{ nullptr }; // PA state machine function for assembling 4 triangles.
#if ENABLE_AVX512_SIMD16
#endif
// Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
- void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
+ void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
}
}
#endif
- void AssembleSingle(uint32_t slot, uint32_t triIndex, __m128 tri[3])
+ void AssembleSingle(uint32_t slot, uint32_t triIndex, simd4scalar tri[3])
{
// move to slot
for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
_simd16_setzero_ps(),
pBase,
indices,
- mask,
+ _simd16_castsi_ps(mask),
4 /* gcc doesn't like sizeof(float) */);
verts[i].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
pBase,
indices,
_simd_castsi_ps(mask),
            4 /* gcc doesn't like sizeof(float) */);
#endif
pBase += m_attributeStrideInVectors * SIMD_WIDTH;
}
_simd16_setzero_ps(),
pBase,
indices,
- mask,
+ _simd16_castsi_ps(mask),
4 /* gcc doesn't like sizeof(float) */);
#else
simdscalar temp = _simd_mask_i32gather_ps(
}
#endif
- void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
+ void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
SWR_ASSERT(slot < m_numAttributes);
SWR_ASSERT(primIndex < PA_TESS::NumPrims());
#if (KNOB_SIMD_WIDTH == 8)
-INLINE __m128 swizzleLane0(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane0(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
}
-INLINE __m128 swizzleLane1(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane1(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
}
-INLINE __m128 swizzleLane2(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane2(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
}
-INLINE __m128 swizzleLane3(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane3(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
}
-INLINE __m128 swizzleLane4(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane4(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
}
-INLINE __m128 swizzleLane5(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane5(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
}
-INLINE __m128 swizzleLane6(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane6(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
}
-INLINE __m128 swizzleLane7(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane7(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
{
simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
}
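// Reference sketch (illustrative only, assumes direct lane indexing into
// the vector registers): each swizzleLaneN transposes lane N of the four
// 8-wide registers into one xyzw vertex, equivalent to
//
//   INLINE simd4scalar swizzleLaneN_ref(const simdvector &v, int lane)
//   {
//       const float *x = (const float *)&v.x, *y = (const float *)&v.y;
//       const float *z = (const float *)&v.z, *w = (const float *)&v.w;
//       return _mm_setr_ps(x[lane], y[lane], z[lane], w[lane]);
//   }
//
// The unpacklo/unpackhi pair interleaves lane pairs of (x,z) and (y,w),
// and the extractf128 index selects lanes 0-3 (index 0) or 4-7 (index 1).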
-INLINE __m128 swizzleLane0(const simdvector &v)
+INLINE simd4scalar swizzleLane0(const simdvector &v)
{
return swizzleLane0(v.x, v.y, v.z, v.w);
}
-INLINE __m128 swizzleLane1(const simdvector &v)
+INLINE simd4scalar swizzleLane1(const simdvector &v)
{
return swizzleLane1(v.x, v.y, v.z, v.w);
}
-INLINE __m128 swizzleLane2(const simdvector &v)
+INLINE simd4scalar swizzleLane2(const simdvector &v)
{
return swizzleLane2(v.x, v.y, v.z, v.w);
}
-INLINE __m128 swizzleLane3(const simdvector &v)
+INLINE simd4scalar swizzleLane3(const simdvector &v)
{
return swizzleLane3(v.x, v.y, v.z, v.w);
}
-INLINE __m128 swizzleLane4(const simdvector &v)
+INLINE simd4scalar swizzleLane4(const simdvector &v)
{
return swizzleLane4(v.x, v.y, v.z, v.w);
}
-INLINE __m128 swizzleLane5(const simdvector &v)
+INLINE simd4scalar swizzleLane5(const simdvector &v)
{
return swizzleLane5(v.x, v.y, v.z, v.w);
}
-INLINE __m128 swizzleLane6(const simdvector &v)
+INLINE simd4scalar swizzleLane6(const simdvector &v)
{
return swizzleLane6(v.x, v.y, v.z, v.w);
}
-INLINE __m128 swizzleLane7(const simdvector &v)
+INLINE simd4scalar swizzleLane7(const simdvector &v)
{
return swizzleLane7(v.x, v.y, v.z, v.w);
}
-INLINE __m128 swizzleLaneN(const simdvector &v, int lane)
+INLINE simd4scalar swizzleLaneN(const simdvector &v, int lane)
{
switch (lane)
{
}
#if ENABLE_AVX512_SIMD16
-INLINE __m128 swizzleLane0(const simd16vector &v)
+INLINE simd4scalar swizzleLane0(const simd16vector &v)
{
return swizzleLane0(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}
-INLINE __m128 swizzleLane1(const simd16vector &v)
+INLINE simd4scalar swizzleLane1(const simd16vector &v)
{
return swizzleLane1(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}
-INLINE __m128 swizzleLane2(const simd16vector &v)
+INLINE simd4scalar swizzleLane2(const simd16vector &v)
{
return swizzleLane2(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}
-INLINE __m128 swizzleLane3(const simd16vector &v)
+INLINE simd4scalar swizzleLane3(const simd16vector &v)
{
return swizzleLane3(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}
-INLINE __m128 swizzleLane4(const simd16vector &v)
+INLINE simd4scalar swizzleLane4(const simd16vector &v)
{
return swizzleLane4(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}
-INLINE __m128 swizzleLane5(const simd16vector &v)
+INLINE simd4scalar swizzleLane5(const simd16vector &v)
{
return swizzleLane5(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}
-INLINE __m128 swizzleLane6(const simd16vector &v)
+INLINE simd4scalar swizzleLane6(const simd16vector &v)
{
return swizzleLane6(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}
-INLINE __m128 swizzleLane7(const simd16vector &v)
+INLINE simd4scalar swizzleLane7(const simd16vector &v)
{
return swizzleLane7(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
}
-INLINE __m128 swizzleLane8(const simd16vector &v)
+INLINE simd4scalar swizzleLane8(const simd16vector &v)
{
return swizzleLane0(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}
-INLINE __m128 swizzleLane9(const simd16vector &v)
+INLINE simd4scalar swizzleLane9(const simd16vector &v)
{
return swizzleLane1(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}
-INLINE __m128 swizzleLaneA(const simd16vector &v)
+INLINE simd4scalar swizzleLaneA(const simd16vector &v)
{
return swizzleLane2(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}
-INLINE __m128 swizzleLaneB(const simd16vector &v)
+INLINE simd4scalar swizzleLaneB(const simd16vector &v)
{
return swizzleLane3(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}
-INLINE __m128 swizzleLaneC(const simd16vector &v)
+INLINE simd4scalar swizzleLaneC(const simd16vector &v)
{
return swizzleLane4(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}
-INLINE __m128 swizzleLaneD(const simd16vector &v)
+INLINE simd4scalar swizzleLaneD(const simd16vector &v)
{
return swizzleLane5(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}
-INLINE __m128 swizzleLaneE(const simd16vector &v)
+INLINE simd4scalar swizzleLaneE(const simd16vector &v)
{
return swizzleLane6(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}
-INLINE __m128 swizzleLaneF(const simd16vector &v)
+INLINE simd4scalar swizzleLaneF(const simd16vector &v)
{
return swizzleLane7(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
}
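// Lanes 8 through F (hence the hexadecimal suffixes) reuse the 8-wide
// helpers on the upper 256-bit half of each simd16 register: extract
// index 0 covers lanes 0-7, extract index 1 covers lanes 8-15.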
-INLINE __m128 swizzleLaneN(const simd16vector &v, int lane)
+INLINE simd4scalar swizzleLaneN(const simd16vector &v, int lane)
{
switch (lane)
{
bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
-void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaTriStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
-void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaTriFan0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
-void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaQuadList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaQuadList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
-void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaLineLoop0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaLineLoop1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
-void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaLineList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaLineList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
-void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaLineStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaLineStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
-void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
#if ENABLE_AVX512_SIMD16
bool PaPoints0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
-void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
bool PaRectList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
bool PaRectList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
-void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
template <uint32_t TotalControlPoints>
-void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
// We have an input of KNOB_SIMD_WIDTH * TotalControlPoints vertices, and we
// output the TotalControlPoints vertices of a single patch. This function is called once per attribute.
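// For example, with TotalControlPoints = 4 and KNOB_SIMD_WIDTH = 8 the
// input spans 32 control points (8 patches), and one call writes the 4
// control points of patch 'primIndex' for this attribute into verts[].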
}
#endif
-void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
#if USE_SIMD16_FRONTEND
const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
}
#endif
-void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
#if USE_SIMD16_FRONTEND
const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot);
}
#endif
-void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
#if USE_SIMD16_FRONTEND
const simd16vector &a = PaGetSimdVector_simd16(pa, pa.first, slot);
}
#endif
-void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
#if USE_SIMD16_FRONTEND
const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
}
#endif
-void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
PaLineStripSingle0(pa, slot, primIndex, verts);
}
#endif
-void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
#if USE_SIMD16_FRONTEND
const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
}
#endif
-void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
#if USE_SIMD16_FRONTEND
const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot);
}
#endif
-void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
{
#if USE_SIMD16_FRONTEND
const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
PA_STATE_OPT& pa,
uint32_t slot,
uint32_t primIndex,
- __m128 verts[])
+ simd4scalar verts[])
{
// We have 12 simdscalars contained within 3 simdvectors which
// hold at least 8 triangles' worth of data. We want to assemble a single
static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
{
// Each 4-pixel row is 16 bytes
- __m128i *pZRow01 = (__m128i*)pSrc;
- __m128i vQuad00 = _mm_load_si128(pZRow01);
- __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
+ simd4scalari *pZRow01 = (simd4scalari*)pSrc;
+ simd4scalari vQuad00 = SIMD128::load_si(pZRow01);
+ simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1);
- __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
- __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
+ simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01);
+ simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01);
- _mm_storeu_si128((__m128i*)ppDsts[0], vRow00);
- _mm_storeu_si128((__m128i*)ppDsts[1], vRow10);
+ SIMD128::storeu_si((simd4scalari*)ppDsts[0], vRow00);
+ SIMD128::storeu_si((simd4scalari*)ppDsts[1], vRow10);
}
};
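// Illustrative note: the hot tile stores pixels in SWR-Z (swizzled)
// order, so each 16-byte load above is one 2x2 quad of 32-bit pixels.
// Unpacking at 64-bit granularity (two pixels) splits a pair of quads
// back into the two linear 4-pixel rows behind ppDsts[0] and ppDsts[1].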
static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
{
// 4 x 16 bytes = 64 bytes, 16 pixels
- const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
+ const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
- __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
+ simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
// Unswizzle from SWR-Z order
- __m128i quad0 = _mm_load_si128(&pSrc128[0]); // 0 1 2 3
- __m128i quad1 = _mm_load_si128(&pSrc128[1]); // 4 5 6 7
- __m128i quad2 = _mm_load_si128(&pSrc128[2]); // 8 9 A B
- __m128i quad3 = _mm_load_si128(&pSrc128[3]); // C D E F
-
- _mm_storeu_si128(ppDsts128[0], _mm_unpacklo_epi64(quad0, quad1)); // 0 1 4 5
- _mm_storeu_si128(ppDsts128[1], _mm_unpackhi_epi64(quad0, quad1)); // 2 3 6 7
- _mm_storeu_si128(ppDsts128[2], _mm_unpacklo_epi64(quad2, quad3)); // 8 9 C D
- _mm_storeu_si128(ppDsts128[3], _mm_unpackhi_epi64(quad2, quad3)); // A B E F
+ simd4scalari quad0 = SIMD128::load_si(&pSrc128[0]); // 0 1 2 3
+ simd4scalari quad1 = SIMD128::load_si(&pSrc128[1]); // 4 5 6 7
+ simd4scalari quad2 = SIMD128::load_si(&pSrc128[2]); // 8 9 A B
+ simd4scalari quad3 = SIMD128::load_si(&pSrc128[3]); // C D E F
+
+ SIMD128::storeu_si(ppDsts128[0], SIMD128::unpacklo_epi64(quad0, quad1)); // 0 1 4 5
+ SIMD128::storeu_si(ppDsts128[1], SIMD128::unpackhi_epi64(quad0, quad1)); // 2 3 6 7
+ SIMD128::storeu_si(ppDsts128[2], SIMD128::unpacklo_epi64(quad2, quad3)); // 8 9 C D
+ SIMD128::storeu_si(ppDsts128[3], SIMD128::unpackhi_epi64(quad2, quad3)); // A B E F
}
};
static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
{
// Each 4-pixel row is 32 bytes.
- const __m128i* pPixSrc = (const __m128i*)pSrc;
+ const simd4scalari* pPixSrc = (const simd4scalari*)pSrc;
// order of pointers matches SWR-Z layout
- __m128i** pvDsts = (__m128i**)&ppDsts[0];
+ simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0];
*pvDsts[0] = pPixSrc[0];
*pvDsts[1] = pPixSrc[1];
*pvDsts[2] = pPixSrc[2];
static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
{
// 8 x 16 bytes = 128 bytes, 16 pixels
- const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
+ const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
- __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
+ simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
// order of pointers matches SWR-Z layout
*ppDsts128[0] = pSrc128[0]; // 0 1
static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
{
// Each 4-pixel row is 64 bytes.
- const __m128i* pPixSrc = (const __m128i*)pSrc;
+ const simd4scalari* pPixSrc = (const simd4scalari*)pSrc;
// Unswizzle from SWR-Z order
- __m128i** pvDsts = (__m128i**)&ppDsts[0];
+ simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0];
*pvDsts[0] = pPixSrc[0];
*pvDsts[1] = pPixSrc[2];
*pvDsts[2] = pPixSrc[1];
static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16])
{
// 16 x 16 bytes = 256 bytes, 16 pixels
- const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
+ const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
- __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
+ simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
for (uint32_t i = 0; i < 16; i += 4)
{
temp = _simd16_permute_epi32(temp, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
// merge/store data into destination but don't overwrite the X8 bits
- simdscalari destlo = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]));
- simdscalari desthi = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]));
+ simdscalari destlo = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]));
+ simdscalari desthi = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]));
simd16scalari dest = _simd16_setzero_si();
dest = _simd16_or_si(_simd16_andnot_si(mask, dest), _simd16_and_si(mask, temp));
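// Bitwise select: dest keeps its bits where mask is 0 (the X8 channel)
// and takes temp's bits where mask is 1 (the color channels).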
- _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]), _simd16_extract_si(dest, 0));
- _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]), _simd16_extract_si(dest, 1));
+ _simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]), _simd16_extract_si(dest, 0));
+ _simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]), _simd16_extract_si(dest, 1));
#else
static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
// Store data into destination but don't overwrite the X8 bits
// Each 4-pixel row is 16 bytes
- __m128i *pZRow01 = (__m128i*)aosTile;
- __m128i vQuad00 = _mm_load_si128(pZRow01);
- __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
+ simd4scalari *pZRow01 = (simd4scalari*)aosTile;
+ simd4scalari vQuad00 = SIMD128::load_si(pZRow01);
+ simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1);
- __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
- __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
+ simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01);
+ simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01);
- __m128i vDst0 = _mm_loadu_si128((const __m128i*)ppDsts[0]);
- __m128i vDst1 = _mm_loadu_si128((const __m128i*)ppDsts[1]);
+ simd4scalari vDst0 = SIMD128::loadu_si((const simd4scalari*)ppDsts[0]);
+ simd4scalari vDst1 = SIMD128::loadu_si((const simd4scalari*)ppDsts[1]);
- __m128i vMask = _mm_set1_epi32(0xFFFFFF);
+ simd4scalari vMask = SIMD128::set1_epi32(0xFFFFFF);
- vDst0 = _mm_andnot_si128(vMask, vDst0);
- vDst0 = _mm_or_si128(vDst0, _mm_and_si128(vRow00, vMask));
- vDst1 = _mm_andnot_si128(vMask, vDst1);
- vDst1 = _mm_or_si128(vDst1, _mm_and_si128(vRow10, vMask));
+ vDst0 = SIMD128::andnot_si(vMask, vDst0);
+ vDst0 = SIMD128::or_si(vDst0, SIMD128::and_si(vRow00, vMask));
+ vDst1 = SIMD128::andnot_si(vMask, vDst1);
+ vDst1 = SIMD128::or_si(vDst1, SIMD128::and_si(vRow10, vMask));
- _mm_storeu_si128((__m128i*)ppDsts[0], vDst0);
- _mm_storeu_si128((__m128i*)ppDsts[1], vDst1);
+ SIMD128::storeu_si((simd4scalari*)ppDsts[0], vDst0);
+ SIMD128::storeu_si((simd4scalari*)ppDsts[1], vDst1);
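// Same read-modify-write select as the SIMD16 path above, expressed with
// 128-bit ops: dst = (dst & ~mask) | (src & mask), where the 0x00FFFFFF
// mask confines the write to the 24 color bits and preserves each
// pixel's X8 byte.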
#endif
}
};
// store 8x2 memory order:
// row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
// row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
- _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
- _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
+ _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0));
+ _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
}
#endif
// splitting into two sets of 4-wide integer vector types
// because plain AVX (pre-AVX2) has no 8-wide integer instructions for this operation
- __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
- __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
- __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
- __m128i srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
+ simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
+ simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
+ simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
+ simd4scalari srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
- __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
- __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
- __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
- __m128i srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
+ simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
+ simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
+ simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
+ simd4scalari srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000
srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000
- srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
- srcLo2 = _mm_or_si128(srcLo2, srcLo3); // ab00ab00ab00ab00
+ srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr
+ srcLo2 = SIMD128::or_si(srcLo2, srcLo3); // ab00ab00ab00ab00
- srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
- srcHi2 = _mm_or_si128(srcHi2, srcHi3); // ab00ab00ab00ab00
+ srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr
+ srcHi2 = SIMD128::or_si(srcHi2, srcHi3); // ab00ab00ab00ab00
- srcLo0 = _mm_or_si128(srcLo0, srcLo2); // abgrabgrabgrabgr
- srcHi0 = _mm_or_si128(srcHi0, srcHi2); // abgrabgrabgrabgr
+ srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // abgrabgrabgrabgr
+ srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // abgrabgrabgrabgr
// unpack into rows that get the tiling order correct
- __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr
- __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
+ simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr
+ simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0);
simdscalari final = _mm256_castsi128_si256(vRow00);
final = _mm256_insertf128_si256(final, vRow10, 1);
final = _mm256_permute4x64_epi64(final, 0xD8);
#endif
- _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
+ _simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
}
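// The split/merge idiom used here: do the byte shifts and ORs at 128-bit
// width, where AVX1 has integer instructions, then rebuild a 256-bit
// register via _mm256_castsi128_si256 + _mm256_insertf128_si256 for the
// two-pointer store. (_mm256_permute4x64_epi64 above is an AVX2
// instruction, so that lane fixup presumably sits behind an AVX2 guard.)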
#if USE_8x2_TILE_BACKEND
// store 8x2 memory order:
// row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
// row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
- _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
- _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
+ _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0));
+ _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
}
#endif
// splitting into two sets of 4-wide integer vector types
// because plain AVX (pre-AVX2) has no 8-wide integer instructions for this operation
- __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
- __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
- __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
+ simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
+ simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
+ simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
- __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
- __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
- __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
+ simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
+ simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
+ simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
- srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
+ srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr
- srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
+ srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr
- srcLo0 = _mm_or_si128(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
- srcHi0 = _mm_or_si128(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
+ srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
+ srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
// unpack into rows that get the tiling order correct
- __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
- __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
+ simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
+ simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0);
simdscalari final = _mm256_castsi128_si256(vRow00);
final = _mm256_insertf128_si256(final, vRow10, 1);
#endif
- _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
+ _simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
}
template<>