From 7cd50b9e47a8ad131795da270039da87e0175143 Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Thu, 27 Jul 2017 15:33:10 -0500 Subject: [PATCH] swr/rast: fix core / knights split of AVX512 intrinsics Move AVX512BW specific intrinics to be Core-only. Move some AVX512F intrinsics back to common implementation file. Reviewed-by: Bruce Cherniak --- .../drivers/swr/rasterizer/common/simdlib.hpp | 2 + .../rasterizer/common/simdlib_512_avx512.inl | 53 ++++++++---------- .../common/simdlib_512_avx512_core.inl | 54 +++++++++++++++---- .../common/simdlib_512_avx512_knights.inl | 15 ------ 4 files changed, 69 insertions(+), 55 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp index 22d7da42d0a..500cf8a87e3 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp @@ -214,6 +214,8 @@ struct SIMDBase : Traits::IsaImpl using Vec4 = typename Traits::Vec4; using Mask = typename Traits::Mask; + static const size_t VECTOR_BYTES = sizeof(Float); + // Populates a SIMD Vec4 from a non-simd vector. So p = xyzw becomes xxxx yyyy zzzz wwww. static SIMDINLINE void vec4_load1_ps(Vec4& r, const float *p) diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl index 1dbfff8c9c1..95e4c319099 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl @@ -158,6 +158,11 @@ private: return _mm512_maskz_set1_epi32(m, -1); } + static SIMDINLINE Integer vmask(__mmask8 m) + { + return _mm512_maskz_set1_epi64(m, -1LL); + } + public: //----------------------------------------------------------------------- // Single precision floating point arithmetic operations @@ -187,8 +192,8 @@ static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps 0xff) ? 0xff : (a + b) (uint8) +//SIMD_IWRAPPER_2(add_epi8); // return a + b (int8) +//SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32) SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32) SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32) @@ -202,7 +207,7 @@ SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32) SIMD_IWRAPPER_2(mullo_epi32); SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32) SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64) -SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8) +//SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8) //----------------------------------------------------------------------- // Logical operations @@ -276,7 +281,7 @@ static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (i return _mm512_cvtepi32_ps(a); } -SIMD_IWRAPPER_1_8(cvtepu8_epi16); // return (int16)a (uint8 --> int16) +//SIMD_IWRAPPER_1_8(cvtepu8_epi16); // return (int16)a (uint8 --> int16) SIMD_IWRAPPER_1_4(cvtepu8_epi32); // return (int32)a (uint8 --> int32) SIMD_IWRAPPER_1_8(cvtepu16_epi32); // return (int32)a (uint16 --> int32) SIMD_IWRAPPER_1_4(cvtepu16_epi64); // return (int64)a (uint16 --> int64) @@ -316,20 +321,6 @@ static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps(a, b); } static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps(a, b); } -template -static SIMDINLINE Integer SIMDCALL cmp_epi8(Integer a, Integer b) -{ - // Legacy vector mask generator - __mmask64 result = _mm512_cmp_epi8_mask(a, b, static_cast(CmpTypeT)); - return vmask(result); -} -template -static SIMDINLINE Integer SIMDCALL cmp_epi16(Integer a, Integer b) -{ - // Legacy vector mask generator - __mmask32 result = _mm512_cmp_epi16_mask(a, b, static_cast(CmpTypeT)); - return vmask(result); -} template static SIMDINLINE Integer SIMDCALL cmp_epi32(Integer a, Integer b) { @@ -345,12 +336,12 @@ static SIMDINLINE Integer SIMDCALL cmp_epi64(Integer a, Integer b) return vmask(result); } -SIMD_IWRAPPER_2_CMP(cmpeq_epi8, cmp_epi8); // return a == b (int8) -SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16); // return a == b (int16) +//SIMD_IWRAPPER_2_CMP(cmpeq_epi8, cmp_epi8); // return a == b (int8) +//SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16); // return a == b (int16) SIMD_IWRAPPER_2_CMP(cmpeq_epi32, cmp_epi32); // return a == b (int32) SIMD_IWRAPPER_2_CMP(cmpeq_epi64, cmp_epi64); // return a == b (int64) -SIMD_IWRAPPER_2_CMP(cmpgt_epi8, cmp_epi8); // return a > b (int8) -SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16); // return a > b (int16) +//SIMD_IWRAPPER_2_CMP(cmpgt_epi8, cmp_epi8); // return a > b (int8) +//SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16); // return a > b (int16) SIMD_IWRAPPER_2_CMP(cmpgt_epi32, cmp_epi32); // return a > b (int32) SIMD_IWRAPPER_2_CMP(cmpgt_epi64, cmp_epi64); // return a > b (int64) SIMD_IWRAPPER_2_CMP(cmplt_epi32, cmp_epi32); // return a < b (int32) @@ -458,7 +449,7 @@ SIMD_IWRAPPER_2I_(permute2f128_si, shuffle_i32x4); SIMD_IWRAPPER_1I(shuffle_epi32); -SIMD_IWRAPPER_2(shuffle_epi8); +//SIMD_IWRAPPER_2(shuffle_epi8); SIMD_DWRAPPER_2I(shuffle_pd); SIMD_WRAPPER_2I(shuffle_ps); @@ -477,13 +468,13 @@ static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b) } SIMD_IWRAPPER_2(unpackhi_epi64); -SIMD_IWRAPPER_2(unpackhi_epi8); +//SIMD_IWRAPPER_2(unpackhi_epi8); SIMD_DWRAPPER_2(unpackhi_pd); SIMD_WRAPPER_2(unpackhi_ps); -SIMD_IWRAPPER_2(unpacklo_epi16); +//SIMD_IWRAPPER_2(unpacklo_epi16); SIMD_IFWRAPPER_2(unpacklo_epi32, unpacklo_ps); SIMD_IWRAPPER_2(unpacklo_epi64); -SIMD_IWRAPPER_2(unpacklo_epi8); +//SIMD_IWRAPPER_2(unpacklo_epi8); SIMD_DWRAPPER_2(unpacklo_pd); SIMD_WRAPPER_2(unpacklo_ps); @@ -546,11 +537,11 @@ static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src) _mm512_mask_store_ps(p, m, src); } -static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a) -{ - __mmask64 m = _mm512_cmplt_epi8_mask(a, setzero_si()); - return static_cast(m); -} +//static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a) +//{ +// __mmask64 m = _mm512_cmplt_epi8_mask(a, setzero_si()); +// return static_cast(m); +//} static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a) { diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_core.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_core.inl index 5063c529306..fed6307f4bc 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_core.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_core.inl @@ -133,10 +133,6 @@ #define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op) private: - static SIMDINLINE Integer vmask(__mmask8 m) - { - return _mm512_maskz_set1_epi64(m, -1LL); - } static SIMDINLINE Integer vmask(__mmask32 m) { return _mm512_maskz_set1_epi16(m, -1); @@ -145,17 +141,57 @@ private: { return _mm512_maskz_set1_epi8(m, -1); } - public: + +SIMD_IWRAPPER_2(add_epi8); // return a + b (int8) +SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) +SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8) + SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int) SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int) SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int) SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int) -SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm512_packs_epi16 and _mm512_packs_epi16 -SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm512_packs_epi32 and _mm512_packs_epi32 -SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm512_packus_epi16 and _mm512_packus_epi16 -SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm512_packus_epi32 and _mm512_packus_epi32 +SIMD_IWRAPPER_1_8(cvtepu8_epi16); // return (int16)a (uint8 --> int16) + +template +static SIMDINLINE Integer SIMDCALL cmp_epi8(Integer a, Integer b) +{ + // Legacy vector mask generator + __mmask64 result = _mm512_cmp_epi8_mask(a, b, static_cast(CmpTypeT)); + return vmask(result); +} +template +static SIMDINLINE Integer SIMDCALL cmp_epi16(Integer a, Integer b) +{ + // Legacy vector mask generator + __mmask32 result = _mm512_cmp_epi16_mask(a, b, static_cast(CmpTypeT)); + return vmask(result); +} + +SIMD_IWRAPPER_2_CMP(cmpeq_epi8, cmp_epi8); // return a == b (int8) +SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16); // return a == b (int16) +SIMD_IWRAPPER_2_CMP(cmpgt_epi8, cmp_epi8); // return a > b (int8) +SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16); // return a > b (int16) + +SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm512_packs_epi16 +SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm512_packs_epi32 +SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm512_packus_epi16 +SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm512_packus_epi32 + +SIMD_IWRAPPER_2(unpackhi_epi8); // See documentation for _mm512_unpackhi_epi8 +SIMD_IWRAPPER_2(unpacklo_epi16); // See documentation for _mm512_unpacklo_epi16 +SIMD_IWRAPPER_2(unpacklo_epi8); // See documentation for _mm512_unpacklo_epi8 + +SIMD_IWRAPPER_2(shuffle_epi8); + +static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a) +{ + __mmask64 m = _mm512_cmplt_epi8_mask(a, setzero_si()); + return static_cast(m); +} + + #undef SIMD_WRAPPER_1_ #undef SIMD_WRAPPER_1 diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl index 2ee7639ccfa..690ab386b46 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl @@ -132,21 +132,6 @@ } #define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op) -private: - static SIMDINLINE Integer vmask(__mmask8 m) - { - return _mm512_mask_set1_epi64(_mm512_setzero_si512(), m, -1LL); - } - static SIMDINLINE Integer vmask(__mmask32 m) - { - return _mm512_mask_set1_epi16(_mm512_setzero_si512(), m, -1); - } - static SIMDINLINE Integer vmask(__mmask64 m) - { - return _mm512_mask_set1_epi8(_mm512_setzero_si512(), m, -1); - } - -public: SIMD_WRAPPERI_2_(and_ps, and_epi32); // return a & b (float treated as int) SIMD_WRAPPERI_2_(andnot_ps, andnot_epi32); // return (~a) & b (float treated as int) SIMD_WRAPPERI_2_(or_ps, or_epi32); // return a | b (float treated as int) -- 2.30.2