From ff75c35846535baf6ff2150b18089a58fd156bbe Mon Sep 17 00:00:00 2001 From: Jan Zielinski Date: Wed, 31 Jul 2019 16:01:01 +0200 Subject: [PATCH] swr/rasterizer: improvements in simdlib 1. fix build issues with MSVC 2019 compiler The MSVC 2019 compiler seems to have an issue with optimized code-gen when using the _mm256_and_si256() intrinsic. Only disable use of integer vpand on buggy versions MSVC 2019. Otherwise allow use of integer vpand intrinsic. 2. Remove unused vec/matrix functionality Reviewed-by: Alok Hota --- .../swr/rasterizer/common/simdintrin.h | 51 --- .../drivers/swr/rasterizer/common/simdlib.hpp | 333 ------------------ .../swr/rasterizer/common/simdlib_256_avx.inl | 16 +- .../rasterizer/common/simdlib_256_avx2.inl | 15 +- .../rasterizer/common/simdlib_interface.hpp | 96 ----- .../swr/rasterizer/common/simdlib_types.hpp | 2 +- .../drivers/swr/rasterizer/core/api.cpp | 1 + src/gallium/drivers/swr/rasterizer/core/api.h | 1 + .../drivers/swr/rasterizer/core/context.h | 1 + src/gallium/drivers/swr/rasterizer/core/pa.h | 2 +- .../swr/rasterizer/jitter/JitManager.cpp | 2 +- .../drivers/swr/rasterizer/jitter/builder.cpp | 1 + .../drivers/swr/rasterizer/jitter/builder.h | 1 + .../swr/rasterizer/jitter/builder_mem.cpp | 5 + .../swr/rasterizer/jitter/builder_mem.h | 1 + .../swr/rasterizer/jitter/builder_misc.cpp | 10 + .../swr/rasterizer/jitter/builder_misc.h | 3 + 17 files changed, 49 insertions(+), 492 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h index df5c3ac6056..ebb4f4b7f11 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h +++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h @@ -191,57 +191,6 @@ SIMDINLINE SIMD128::Integer _simd_blend4_epi32(SIMD128::Integer const& a, SIMD12 SIMD128::blend_ps(SIMD128::castsi_ps(a), SIMD128::castsi_ps(b))); } -SIMDINLINE -void _simd_mov(simdscalar& r, unsigned int rlane, simdscalar& s, unsigned int slane) -{ - OSALIGNSIMD(float) rArray[KNOB_SIMD_WIDTH], sArray[KNOB_SIMD_WIDTH]; - SIMD256::store_ps(rArray, r); - SIMD256::store_ps(sArray, s); - rArray[rlane] = sArray[slane]; - r = SIMD256::load_ps(rArray); -} - -// Populates a simdvector from a vector. So p = xyzw becomes xxxx yyyy zzzz wwww. -#define _simdvec_load_ps SIMD::vec4_load1_ps - -SIMDINLINE -void _simdvec_mov(simdvector& r, const simdscalar& s) -{ - SIMD::vec4_set1_vps(r, s); -} - -SIMDINLINE -void _simdvec_mov(simdvector& r, const simdvector& v) -{ - r = v; -} - -#if 0 -// just move a lane from the source simdvector to dest simdvector -SIMDINLINE -void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane) -{ - _simd_mov(r[0], rlane, s[0], slane); - _simd_mov(r[1], rlane, s[1], slane); - _simd_mov(r[2], rlane, s[2], slane); - _simd_mov(r[3], rlane, s[3], slane); -} - -#endif - -#define _simdvec_dp3_ps SIMD::vec4_dp3_ps -#define _simdvec_dp4_ps SIMD::vec4_dp4_ps -#define _simdvec_rcp_length_ps SIMD::vec4_rcp_length_ps -#define _simdvec_normalize_ps SIMD::vec4_normalize_ps -#define _simdvec_mul_ps SIMD::vec4_mul_ps -#define _simdvec_add_ps SIMD::vec4_add_ps -#define _simdvec_min_ps SIMD::vec4_min_ps -#define _simdvec_max_ps SIMD::vec4_max_ps -#define _simd_mat4x4_vec4_multiply SIMD::mat4x4_vec4_multiply -#define _simd_mat3x3_vec3_w0_multiply SIMD::mat3x3_vec3_w0_multiply -#define _simd_mat4x4_vec3_w1_multiply SIMD::mat4x4_vec3_w1_multiply -#define _simd_mat4x3_vec3_w1_multiply SIMD::mat4x3_vec3_w1_multiply - ////////////////////////////////////////////////////////////////////////// /// @brief Compute plane equation vA * vX + vB * vY + vC SIMDINLINE simdscalar vplaneps(simdscalar const& vA, diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp index 153e2af7eae..53793ba101c 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp @@ -209,339 +209,6 @@ struct SIMDBase : Traits::IsaImpl using Integer = typename Traits::Integer; using Vec4 = typename Traits::Vec4; using Mask = typename Traits::Mask; - - static const size_t VECTOR_BYTES = sizeof(Float); - - // Populates a SIMD Vec4 from a non-simd vector. So p = xyzw becomes xxxx yyyy zzzz wwww. - static SIMDINLINE void vec4_load1_ps(Vec4& r, const float* p) - { - r[0] = SIMD::set1_ps(p[0]); - r[1] = SIMD::set1_ps(p[1]); - r[2] = SIMD::set1_ps(p[2]); - r[3] = SIMD::set1_ps(p[3]); - } - - static SIMDINLINE void vec4_set1_vps(Vec4& r, Float const& s) - { - r[0] = s; - r[1] = s; - r[2] = s; - r[3] = s; - } - - static SIMDINLINE Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1) - { - Float tmp, r; - r = SIMD::mul_ps(v0[0], v1[0]); // (v0.x*v1.x) - - tmp = SIMD::mul_ps(v0[1], v1[1]); // (v0.y*v1.y) - r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) - - tmp = SIMD::mul_ps(v0[2], v1[2]); // (v0.z*v1.z) - r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) - - return r; - } - - static SIMDINLINE Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1) - { - Float tmp, r; - r = SIMD::mul_ps(v0[0], v1[0]); // (v0.x*v1.x) - - tmp = SIMD::mul_ps(v0[1], v1[1]); // (v0.y*v1.y) - r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) - - tmp = SIMD::mul_ps(v0[2], v1[2]); // (v0.z*v1.z) - r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) - - tmp = SIMD::mul_ps(v0[3], v1[3]); // (v0.w*v1.w) - r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) - - return r; - } - - static SIMDINLINE Float vec4_rcp_length_ps(const Vec4& v) - { - Float length = vec4_dp4_ps(v, v); - return SIMD::rsqrt_ps(length); - } - - static SIMDINLINE void vec4_normalize_ps(Vec4& r, const Vec4& v) - { - Float rcpLength = vec4_rcp_length_ps(v); - - r[0] = SIMD::mul_ps(v[0], rcpLength); - r[1] = SIMD::mul_ps(v[1], rcpLength); - r[2] = SIMD::mul_ps(v[2], rcpLength); - r[3] = SIMD::mul_ps(v[3], rcpLength); - } - - static SIMDINLINE void vec4_mul_ps(Vec4& r, const Vec4& v, Float const& s) - { - r[0] = SIMD::mul_ps(v[0], s); - r[1] = SIMD::mul_ps(v[1], s); - r[2] = SIMD::mul_ps(v[2], s); - r[3] = SIMD::mul_ps(v[3], s); - } - - static SIMDINLINE void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1) - { - r[0] = SIMD::mul_ps(v0[0], v1[0]); - r[1] = SIMD::mul_ps(v0[1], v1[1]); - r[2] = SIMD::mul_ps(v0[2], v1[2]); - r[3] = SIMD::mul_ps(v0[3], v1[3]); - } - - static SIMDINLINE void vec4_add_ps(Vec4& r, const Vec4& v0, Float const& s) - { - r[0] = SIMD::add_ps(v0[0], s); - r[1] = SIMD::add_ps(v0[1], s); - r[2] = SIMD::add_ps(v0[2], s); - r[3] = SIMD::add_ps(v0[3], s); - } - - static SIMDINLINE void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1) - { - r[0] = SIMD::add_ps(v0[0], v1[0]); - r[1] = SIMD::add_ps(v0[1], v1[1]); - r[2] = SIMD::add_ps(v0[2], v1[2]); - r[3] = SIMD::add_ps(v0[3], v1[3]); - } - - static SIMDINLINE void vec4_min_ps(Vec4& r, const Vec4& v0, Float const& s) - { - r[0] = SIMD::min_ps(v0[0], s); - r[1] = SIMD::min_ps(v0[1], s); - r[2] = SIMD::min_ps(v0[2], s); - r[3] = SIMD::min_ps(v0[3], s); - } - - static SIMDINLINE void vec4_max_ps(Vec4& r, const Vec4& v0, Float const& s) - { - r[0] = SIMD::max_ps(v0[0], s); - r[1] = SIMD::max_ps(v0[1], s); - r[2] = SIMD::max_ps(v0[2], s); - r[3] = SIMD::max_ps(v0[3], s); - } - - // Matrix4x4 * Vector4 - // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w) - // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w) - // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w) - // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w) - static SIMDINLINE void SIMDCALL mat4x4_vec4_multiply(Vec4& result, - const float* pMatrix, - const Vec4& v) - { - Float m; - Float r0; - Float r1; - - m = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0] - r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) - m = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1] - r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2] - r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = SIMD::load1_ps(pMatrix + 0 * 4 + 3); // m[row][3] - r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) - result[0] = r0; - - m = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0] - r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) - m = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1] - r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2] - r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = SIMD::load1_ps(pMatrix + 1 * 4 + 3); // m[row][3] - r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) - result[1] = r0; - - m = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0] - r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) - m = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1] - r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2] - r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = SIMD::load1_ps(pMatrix + 2 * 4 + 3); // m[row][3] - r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) - result[2] = r0; - - m = SIMD::load1_ps(pMatrix + 3 * 4 + 0); // m[row][0] - r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) - m = SIMD::load1_ps(pMatrix + 3 * 4 + 1); // m[row][1] - r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = SIMD::load1_ps(pMatrix + 3 * 4 + 2); // m[row][2] - r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = SIMD::load1_ps(pMatrix + 3 * 4 + 3); // m[row][3] - r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) - result[3] = r0; - } - - // Matrix4x4 * Vector3 - Direction Vector where w = 0. - // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0) - // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0) - // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0) - // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0) - static SIMDINLINE void SIMDCALL mat3x3_vec3_w0_multiply(Vec4& result, - const float* pMatrix, - const Vec4& v) - { - Float m; - Float r0; - Float r1; - - m = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0] - r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) - m = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1] - r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2] - r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - result[0] = r0; - - m = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0] - r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) - m = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1] - r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2] - r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - result[1] = r0; - - m = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0] - r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) - m = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1] - r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2] - r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - result[2] = r0; - - result[3] = SIMD::setzero_ps(); - } - - // Matrix4x4 * Vector3 - Position vector where w = 1. - // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1) - // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1) - // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1) - // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1) - static SIMDINLINE void SIMDCALL mat4x4_vec3_w1_multiply(Vec4& result, - const float* pMatrix, - const Vec4& v) - { - Float m; - Float r0; - Float r1; - - m = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0] - r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) - m = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1] - r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2] - r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = SIMD::load1_ps(pMatrix + 0 * 4 + 3); // m[row][3] - r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) - result[0] = r0; - - m = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0] - r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) - m = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1] - r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2] - r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = SIMD::load1_ps(pMatrix + 1 * 4 + 3); // m[row][3] - r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) - result[1] = r0; - - m = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0] - r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) - m = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1] - r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2] - r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = SIMD::load1_ps(pMatrix + 2 * 4 + 3); // m[row][3] - r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) - result[2] = r0; - - m = SIMD::load1_ps(pMatrix + 3 * 4 + 0); // m[row][0] - r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) - m = SIMD::load1_ps(pMatrix + 3 * 4 + 1); // m[row][1] - r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = SIMD::load1_ps(pMatrix + 3 * 4 + 2); // m[row][2] - r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = SIMD::load1_ps(pMatrix + 3 * 4 + 3); // m[row][3] - result[3] = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) - } - - static SIMDINLINE void SIMDCALL mat4x3_vec3_w1_multiply(Vec4& result, - const float* pMatrix, - const Vec4& v) - { - Float m; - Float r0; - Float r1; - - m = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0] - r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) - m = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1] - r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2] - r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = SIMD::load1_ps(pMatrix + 0 * 4 + 3); // m[row][3] - r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) - result[0] = r0; - - m = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0] - r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) - m = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1] - r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2] - r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = SIMD::load1_ps(pMatrix + 1 * 4 + 3); // m[row][3] - r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) - result[1] = r0; - - m = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0] - r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) - m = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1] - r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2] - r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) - r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = SIMD::load1_ps(pMatrix + 2 * 4 + 3); // m[row][3] - r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) - result[2] = r0; - result[3] = SIMD::set1_ps(1.0f); - } }; // struct SIMDBase using SIMD128 = SIMDBase; diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl index 232f43faec7..b5046e48683 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl @@ -222,14 +222,14 @@ SIMD_EMU_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8) //----------------------------------------------------------------------- // Logical operations //----------------------------------------------------------------------- -SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int) -SIMD_EMU_IWRAPPER_2(and_si); // return a & b (int) -SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int) -SIMD_EMU_IWRAPPER_2(andnot_si); // return (~a) & b (int) -SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int) -SIMD_EMU_IWRAPPER_2(or_si); // return a | b (int) -SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int) -SIMD_EMU_IWRAPPER_2(xor_si); // return a ^ b (int) +SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int) +SIMD_IFWRAPPER_2(and_si, _mm256_and_ps); // return a & b (int) +SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int) +SIMD_IFWRAPPER_2(andnot_si, _mm256_andnot_ps); // return (~a) & b (int) +SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int) +SIMD_IFWRAPPER_2(or_si, _mm256_or_ps); // return a | b (int) +SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int) +SIMD_IFWRAPPER_2(xor_si, _mm256_xor_ps); // return a ^ b (int) //----------------------------------------------------------------------- // Shift operations diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl index 49650d52442..8fce96dcea4 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl @@ -81,6 +81,7 @@ return _mm256_##op(a, b, ImmT); \ } + //----------------------------------------------------------------------- // Floating point arithmetic operations //----------------------------------------------------------------------- @@ -116,7 +117,14 @@ SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8) //----------------------------------------------------------------------- // Logical operations //----------------------------------------------------------------------- -SIMD_IWRAPPER_2_(and_si, and_si256); // return a & b (int) +#if _MSC_VER >= 1920 // && _MSC_FULL_VER < [some_fixed_version] +// Some versions of MSVC 2019 don't handle constant folding of and_si() correctly. +// Using and_ps instead inhibits the compiler's constant folding and actually issues +// the and intrinsic even though both inputs are constant values. +#else +// Use native integer and intrinsic +SIMD_IWRAPPER_2_(and_si, and_si256); // return a & b (int) +#endif SIMD_IWRAPPER_2_(andnot_si, andnot_si256); // return (~a) & b (int) SIMD_IWRAPPER_2_(or_si, or_si256); // return a | b (int) SIMD_IWRAPPER_2_(xor_si, xor_si256); // return a ^ b (int) @@ -213,6 +221,10 @@ static SIMDINLINE Float SIMDCALL return _mm256_i32gather_ps(p, idx, static_cast(ScaleT)); } +#if _MSC_VER == 1920 // && _MSC_FULL_VER < [some_fixed_version] +// Don't use _mm256_mask_i32gather_ps(), the compiler doesn't preserve the mask register +// correctly in early versions of MSVC 2019 +#else // for each element: (mask & (1 << 31)) ? (i32gather_ps(p, idx), mask = 0) : old template static SIMDINLINE Float SIMDCALL @@ -222,6 +234,7 @@ static SIMDINLINE Float SIMDCALL // Only for this intrinsic - not sure why. :( return _mm256_mask_i32gather_ps(old.v, p, idx.v, mask.v, static_cast(ScaleT)); } +#endif static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const& a) { diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp index 85c722c92c0..3d31b39ee55 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp @@ -328,101 +328,5 @@ struct SIMD256 // or SIMD4 or SIMD16 //======================================================================= // Advanced masking interface (currently available only in SIMD16 width) //======================================================================= - - - //======================================================================= - // Extended Utility Functions (common to SIMD256 and SIMD16) - //======================================================================= - - //----------------------------------------------------------------------- - // Extended Types - //----------------------------------------------------------------------- - - // Vec4, an SOA SIMD set of 4-dimensional vectors - union Vec4 - { - Vec4() = default; - Vec4(Float in) - { - s.x = in; - s.y = in; - s.z = in; - s.w = in; - } - Vec4(Float x, Float y, Float z, Float w) - { - s.x = x; - s.y = y; - s.z = z; - s.w = w; - } - - Float v[4]; - Integer vi[4]; - struct - { - Float x; - Float y; - Float z; - Float w; - } s; - Float& operator[] (const int i) { return v[i]; } - Float const & operator[] (const int i) const { return v[i]; } - }; - - //----------------------------------------------------------------------- - // Extended Functions - //----------------------------------------------------------------------- - static void vec4_set1_ps(Vec4& r, const float *p); // r[0] = set1(p[0]), r[1] = set1(p[1]), ... - static void vec4_set1_vps(Vec4& r, Float s); // r[0] = s, r[1] = s, ... - static Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1); // return dp3(v0, v1) - static Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1); // return dp4(v0, v1) - static Float vec4_rcp_length_ps(const Vec4& v); // return 1.0f / sqrt(dp4(v, v)) - static void vec4_normalize_ps(Vec4& r, const Vec4& v); // r = v * rcp_length(v) - static void vec4_mul_ps(Vec4& r, const Vec4& v, Float s); // r = v * set1_vps(s) - static void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1); // r = v0 * v1 - static void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1); // r = v0 + v1 - static void vec4_min_ps(Vec4& r, const Vec4& v0, Float s); // r = (v0 < s) ? v0 : s - static void vec4_max_ps(Vec4& r, const Vec4& v0, Float s); // r = (v0 > s) ? v0 : s - - // Matrix4x4 * Vector4 - // result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * v.s.w) - // result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * v.s.w) - // result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * v.s.w) - // result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * v.s.w) - static void mat4x4_vec4_multiply( - Vec4& result, - const float *pMatrix, - const Vec4& v); - - // Matrix4x4 * Vector3 - Direction Vector where w = 0. - // result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 0) - // result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 0) - // result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 0) - // result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * 0) - static void mat3x3_vec3_w0_multiply( - Vec4& result, - const float *pMatrix, - const Vec4& v); - - // Matrix4x4 * Vector3 - Position vector where w = 1. - // result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 1) - // result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 1) - // result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 1) - // result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * 1) - static void mat4x4_vec3_w1_multiply( - Vec4& result, - const float *pMatrix, - const Vec4& v); - - // Matrix4x3 * Vector3 - Position vector where w = 1. - // result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 1) - // result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 1) - // result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 1) - // result.s.w = 1 - static void mat4x3_vec3_w1_multiply( - Vec4& result, - const float *pMatrix, - const Vec4& v); }; #endif // #if 0 diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp index 944c3c23fd3..3ef847d4ca4 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp +++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp @@ -315,7 +315,7 @@ namespace SIMDImpl namespace SIMD512Impl { -#if !(defined(__AVX512F__) || defined(_MM_K0_REG)) +#if !(defined(__AVX512F__) || defined(_ZMMINTRIN_H_INCLUDED)) // Define AVX512 types if not included via immintrin.h. // All data members of these types are ONLY to viewed // in a debugger. Do NOT access them via code! diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index 3601aa3f509..f1b0dc03352 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -1551,6 +1551,7 @@ void SwrDispatch(HANDLE hContext, pTaskData->threadGroupCountX = threadGroupCountX; pTaskData->threadGroupCountY = threadGroupCountY; pTaskData->threadGroupCountZ = threadGroupCountZ; + pTaskData->enableThreadDispatch = false; uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ; uint32_t dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT; diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h index e1ba893296e..8058defb388 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.h +++ b/src/gallium/drivers/swr/rasterizer/core/api.h @@ -588,6 +588,7 @@ SWR_FUNC(void, uint32_t threadGroupCountY, uint32_t threadGroupCountZ); + /// @note this enum needs to be kept in sync with HOTTILE_STATE! enum SWR_TILE_STATE { diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h index 5a8656dcfba..8891cc881a3 100644 --- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -140,6 +140,7 @@ struct COMPUTE_DESC uint32_t threadGroupCountX; uint32_t threadGroupCountY; uint32_t threadGroupCountZ; + bool enableThreadDispatch; }; typedef void (*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC, diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h index 635bf195e4b..c41376ae97b 100644 --- a/src/gallium/drivers/swr/rasterizer/core/pa.h +++ b/src/gallium/drivers/swr/rasterizer/core/pa.h @@ -1447,7 +1447,7 @@ struct PA_TESS : PA_STATE SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]); if (!m_SOA) { - indices = _simd16_mul_epi32(indices, _simd16_set1_epi32(vertexStride / 4)); + indices = _simd16_mullo_epi32(indices, _simd16_set1_epi32(vertexStride / 4)); } #else SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp index fc8dc46d9de..0f78bd661a5 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp @@ -584,7 +584,7 @@ struct JitCacheFileHeader uint64_t GetObjectCRC() const { return m_objCRC; } private: - static const uint64_t JC_MAGIC_NUMBER = 0xfedcba9876543210ULL + 6; + static const uint64_t JC_MAGIC_NUMBER = 0xfedcba9876543210ULL + 7; static const size_t JC_STR_MAX_LEN = 32; static const uint32_t JC_PLATFORM_KEY = (LLVM_VERSION_MAJOR << 24) | (LLVM_VERSION_MINOR << 16) | (LLVM_VERSION_PATCH << 8) | diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp index 53f11d66db1..30481b43208 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp @@ -61,6 +61,7 @@ namespace SwrJit mInt16PtrTy = PointerType::get(mInt16Ty, 0); mInt32PtrTy = PointerType::get(mInt32Ty, 0); mInt64PtrTy = PointerType::get(mInt64Ty, 0); + mHandleTy = mInt8PtrTy; mSimd4FP64Ty = VectorType::get(mDoubleTy, 4); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h index 97550fad23d..6e1d94b9e68 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h @@ -78,6 +78,7 @@ namespace SwrJit // Built in types: scalar Type* mVoidTy; + Type* mHandleTy; Type* mInt1Ty; Type* mInt8Ty; Type* mInt16Ty; diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp index b183a9e0082..2d8240187c5 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp @@ -237,6 +237,11 @@ namespace SwrJit return MASKED_GATHER(pVecSrcPtr, 4, pVecMask, pVecPassthru); } + void Builder::SCATTER_PTR(Value* pVecDstPtr, Value* pVecSrc, Value* pVecMask) + { + MASKED_SCATTER(pVecSrc, pVecDstPtr, 4, pVecMask); + } + void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets, diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h index 934a8279c2f..fe4c5dd38a4 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h @@ -148,6 +148,7 @@ void GATHER4DD(const SWR_FORMAT_INFO& info, Value* GATHERPD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1); Value* GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru); +void SCATTER_PTR(Value* pVecDstPtr, Value* pVecSrc, Value* pVecMask); virtual void SCATTERPS(Value* pDst, Value* vSrc, diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp index 5b06de352dc..6687ead02d3 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp @@ -170,6 +170,16 @@ namespace SwrJit return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0)); } + Value* Builder::VIMMED1(uint64_t i) + { + return ConstantVector::getSplat(mVWidth, cast(C(i))); + } + + Value* Builder::VIMMED1_16(uint64_t i) + { + return ConstantVector::getSplat(mVWidth16, cast(C(i))); + } + Value* Builder::VIMMED1(int i) { return ConstantVector::getSplat(mVWidth, cast(C(i))); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h index 91e2a32f1a1..3987a5f3476 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h @@ -71,6 +71,9 @@ Constant* CInc(uint32_t base, uint32_t count) Constant* PRED(bool pred); +Value* VIMMED1(uint64_t i); +Value* VIMMED1_16(uint64_t i); + Value* VIMMED1(int i); Value* VIMMED1_16(int i); -- 2.30.2