From d5157ddca4072856e0afce3d7af8929a7d387044 Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Wed, 29 Mar 2017 12:58:18 -0500 Subject: [PATCH] swr: [rasterizer core] SIMD16 Frontend WIP Implement widened binner for SIMD16 Reviewed-by: Bruce Cherniak --- .../swr/rasterizer/common/simd16intrin.h | 44 +- .../drivers/swr/rasterizer/core/binner.cpp | 1642 ++++++++++++++--- .../drivers/swr/rasterizer/core/frontend.h | 98 + .../drivers/swr/rasterizer/core/utils.h | 10 + 4 files changed, 1509 insertions(+), 285 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h index 3b43d510e68..e5c34c2c876 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h +++ b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h @@ -46,10 +46,6 @@ struct simd16scalari }; typedef uint16_t simd16mask; -#define _simd16_masklo(mask) ((mask) & 0xFF) -#define _simd16_maskhi(mask) (((mask) >> 8)) -#define _simd16_setmask(hi, lo) (((hi) << 8) | (lo)) - #else typedef __m512 simd16scalar; typedef __m512d simd16scalard; @@ -60,6 +56,10 @@ typedef __mmask16 simd16mask; #error Unsupported vector width #endif//KNOB_SIMD16_WIDTH == 16 +#define _simd16_masklo(mask) ((mask) & 0xFF) +#define _simd16_maskhi(mask) (((mask) >> 8) & 0xFF) +#define _simd16_setmask(hi, lo) (((hi) << 8) | (lo)) + OSALIGN(union, KNOB_SIMD16_BYTES) simd16vector { simd16scalar v[4]; @@ -383,32 +383,26 @@ SIMD16_EMU_AVX512_2(simd16scalar, _simd16_max_ps, _mm256_max_ps) INLINE simd16mask _simd16_movemask_ps(simd16scalar a) { - simd16mask mask; + simdmask mask_lo = _mm256_movemask_ps(a.lo); + simdmask mask_hi = _mm256_movemask_ps(a.hi); - reinterpret_cast(&mask)[0] = _mm256_movemask_ps(a.lo); - reinterpret_cast(&mask)[1] = _mm256_movemask_ps(a.hi); - - return mask; + return static_cast(mask_lo) | (static_cast(mask_hi) << 8); } INLINE simd16mask _simd16_movemask_pd(simd16scalard a) { - simd16mask mask; + simdmask mask_lo = _mm256_movemask_pd(a.lo); + simdmask mask_hi = _mm256_movemask_pd(a.hi); - reinterpret_cast(&mask)[0] = _mm256_movemask_pd(a.lo); - reinterpret_cast(&mask)[1] = _mm256_movemask_pd(a.hi); - - return mask; + return static_cast(mask_lo) | (static_cast(mask_hi) << 4); } -INLINE simd16mask _simd16_movemask_epi8(simd16scalari a) +INLINE uint64_t _simd16_movemask_epi8(simd16scalari a) { - simd16mask mask; + uint32_t mask_lo = _mm256_movemask_epi8(a.lo); + uint32_t mask_hi = _mm256_movemask_epi8(a.hi); - reinterpret_cast(&mask)[0] = _mm256_movemask_epi8(a.lo); - reinterpret_cast(&mask)[1] = _mm256_movemask_epi8(a.hi); - - return mask; + return static_cast(mask_lo) | (static_cast(mask_hi) << 32); } INLINE simd16scalari _simd16_cvtps_epi32(simd16scalar a) @@ -809,12 +803,10 @@ INLINE simd16mask _simd16_scalari2mask(simd16scalari mask) return _mm512_cmpneq_epu32_mask(mask, _mm512_setzero_epi32()); } -#if 0 INLINE simd16mask _simd16_scalard2mask(simd16scalard mask) { - return _mm512_cmpneq_epu64_mask(mask, _mm512_setzero_epi64()); + return _mm512_cmpneq_epu64_mask(_mm512_castpd_si512(mask), _mm512_setzero_si512()); } -#endif #define _simd16_setzero_ps _mm512_setzero_ps #define _simd16_setzero_si _mm512_setzero_si512 @@ -889,6 +881,7 @@ INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, cons } #define _simd16_mul_ps _mm512_mul_ps +#define _simd16_div_ps _mm512_div_ps #define _simd16_add_ps _mm512_add_ps #define _simd16_sub_ps _mm512_sub_ps #define _simd16_rsqrt_ps _mm512_rsqrt14_ps @@ -900,12 +893,10 @@ INLINE simd16mask _simd16_movemask_ps(simd16scalar a) return _simd16_scalari2mask(_mm512_castps_si512(a)); } -#if 0 INLINE simd16mask _simd16_movemask_pd(simd16scalard a) { - return _simd16_scalard2mask(_mm512i_castpd_si512(a)); + return _simd16_scalard2mask(a); } -#endif #if 0 INLINE int _simd16_movemask_epi8(simd16scalari a) @@ -1040,7 +1031,6 @@ INLINE simd16scalar _simd16_mask_i32gather_ps_temp(simd16scalar a, const float * #define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _simd16_mask_i32gather_ps_temp(a, m, index, mask) #define _simd16_abs_epi32 _mm512_abs_epi32 -#define _simd16_cmpeq_epi64 _mm512_abs_epi32 INLINE simd16scalari _simd16_cmpeq_epi64(simd16scalari a, simd16scalari b) { diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp index f00701f8192..9ec5bea76ee 100644 --- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp @@ -37,15 +37,27 @@ // Function Prototype void BinPostSetupLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], simdscalar vRecipW[2], uint32_t primMask, simdscalari primID, simdscalari viewportIdx); +#if USE_SIMD16_FRONTEND +void BinPostSetupLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], simd16scalar vRecipW[2], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx); +#endif + ////////////////////////////////////////////////////////////////////////// /// @brief Offsets added to post-viewport vertex positions based on /// raster state. static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] = { - _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER - _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL + _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER + _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL +}; + +#if USE_SIMD16_FRONTEND +static const simd16scalar g_pixelOffsets_simd16[SWR_PIXEL_LOCATION_UL + 1] = +{ + _simd16_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER + _simd16_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL }; +#endif ////////////////////////////////////////////////////////////////////////// /// @brief Convert the X,Y coords of a triangle to the requested Fixed /// Point precision from FP32. @@ -56,6 +68,15 @@ INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn) return _simd_cvtps_epi32(vFixed); } +#if USE_SIMD16_FRONTEND +template > +INLINE simd16scalari fpToFixedPointVertical(const simd16scalar vIn) +{ + simd16scalar vFixed = _simd16_mul_ps(vIn, _simd16_set1_ps(PT::ScaleT::value)); + return _simd16_cvtps_epi32(vFixed); +} + +#endif ////////////////////////////////////////////////////////////////////////// /// @brief Helper function to set the X,Y coords of a triangle to the /// requested Fixed Point precision from FP32. @@ -72,6 +93,18 @@ INLINE static void FPToFixedPoint(const simdvector * const tri, simdscalari(&vXi vYi[2] = fpToFixedPointVertical(tri[2].y); } +#if USE_SIMD16_FRONTEND +INLINE static void FPToFixedPoint(const simd16vector * const tri, simd16scalari(&vXi)[3], simd16scalari(&vYi)[3]) +{ + vXi[0] = fpToFixedPointVertical(tri[0].x); + vYi[0] = fpToFixedPointVertical(tri[0].y); + vXi[1] = fpToFixedPointVertical(tri[1].x); + vYi[1] = fpToFixedPointVertical(tri[1].y); + vXi[2] = fpToFixedPointVertical(tri[2].x); + vYi[2] = fpToFixedPointVertical(tri[2].y); +} + +#endif ////////////////////////////////////////////////////////////////////////// /// @brief Calculate bounding box for current triangle /// @tparam CT: ConservativeRastFETraits type @@ -105,6 +138,37 @@ INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari bbox.ymax = vMaxY; } +#if USE_SIMD16_FRONTEND +template +INLINE void calcBoundingBoxIntVertical(const simd16vector * const tri, simd16scalari(&vX)[3], simd16scalari(&vY)[3], simd16BBox &bbox) +{ + simd16scalari vMinX = vX[0]; + + vMinX = _simd16_min_epi32(vMinX, vX[1]); + vMinX = _simd16_min_epi32(vMinX, vX[2]); + + simd16scalari vMaxX = vX[0]; + + vMaxX = _simd16_max_epi32(vMaxX, vX[1]); + vMaxX = _simd16_max_epi32(vMaxX, vX[2]); + + simd16scalari vMinY = vY[0]; + + vMinY = _simd16_min_epi32(vMinY, vY[1]); + vMinY = _simd16_min_epi32(vMinY, vY[2]); + + simd16scalari vMaxY = vY[0]; + + vMaxY = _simd16_max_epi32(vMaxY, vY[1]); + vMaxY = _simd16_max_epi32(vMaxY, vY[2]); + + bbox.xmin = vMinX; + bbox.xmax = vMaxX; + bbox.ymin = vMinY; + bbox.ymax = vMaxY; +} + +#endif ////////////////////////////////////////////////////////////////////////// /// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical /// Offsets BBox for conservative rast @@ -356,6 +420,96 @@ struct GatherScissors<8> } }; +#if USE_SIMD16_FRONTEND +template +struct GatherScissors_simd16 +{ + static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex, + simd16scalari &scisXmin, simd16scalari &scisYmin, + simd16scalari &scisXmax, simd16scalari &scisYmax) + { + SWR_INVALID("Unhandled Simd Width in Scissor Rect Gather"); + } +}; + +template<> +struct GatherScissors_simd16<16> +{ + static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex, + simd16scalari &scisXmin, simd16scalari &scisYmin, + simd16scalari &scisXmax, simd16scalari &scisYmax) + { + scisXmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin, + pScissorsInFixedPoint[pViewportIndex[1]].xmin, + pScissorsInFixedPoint[pViewportIndex[2]].xmin, + pScissorsInFixedPoint[pViewportIndex[3]].xmin, + pScissorsInFixedPoint[pViewportIndex[4]].xmin, + pScissorsInFixedPoint[pViewportIndex[5]].xmin, + pScissorsInFixedPoint[pViewportIndex[6]].xmin, + pScissorsInFixedPoint[pViewportIndex[7]].xmin, + pScissorsInFixedPoint[pViewportIndex[8]].xmin, + pScissorsInFixedPoint[pViewportIndex[9]].xmin, + pScissorsInFixedPoint[pViewportIndex[10]].xmin, + pScissorsInFixedPoint[pViewportIndex[11]].xmin, + pScissorsInFixedPoint[pViewportIndex[12]].xmin, + pScissorsInFixedPoint[pViewportIndex[13]].xmin, + pScissorsInFixedPoint[pViewportIndex[14]].xmin, + pScissorsInFixedPoint[pViewportIndex[15]].xmin); + + scisYmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin, + pScissorsInFixedPoint[pViewportIndex[1]].ymin, + pScissorsInFixedPoint[pViewportIndex[2]].ymin, + pScissorsInFixedPoint[pViewportIndex[3]].ymin, + pScissorsInFixedPoint[pViewportIndex[4]].ymin, + pScissorsInFixedPoint[pViewportIndex[5]].ymin, + pScissorsInFixedPoint[pViewportIndex[6]].ymin, + pScissorsInFixedPoint[pViewportIndex[7]].ymin, + pScissorsInFixedPoint[pViewportIndex[8]].ymin, + pScissorsInFixedPoint[pViewportIndex[9]].ymin, + pScissorsInFixedPoint[pViewportIndex[10]].ymin, + pScissorsInFixedPoint[pViewportIndex[11]].ymin, + pScissorsInFixedPoint[pViewportIndex[12]].ymin, + pScissorsInFixedPoint[pViewportIndex[13]].ymin, + pScissorsInFixedPoint[pViewportIndex[14]].ymin, + pScissorsInFixedPoint[pViewportIndex[15]].ymin); + + scisXmax = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax, + pScissorsInFixedPoint[pViewportIndex[1]].xmax, + pScissorsInFixedPoint[pViewportIndex[2]].xmax, + pScissorsInFixedPoint[pViewportIndex[3]].xmax, + pScissorsInFixedPoint[pViewportIndex[4]].xmax, + pScissorsInFixedPoint[pViewportIndex[5]].xmax, + pScissorsInFixedPoint[pViewportIndex[6]].xmax, + pScissorsInFixedPoint[pViewportIndex[7]].xmax, + pScissorsInFixedPoint[pViewportIndex[8]].xmax, + pScissorsInFixedPoint[pViewportIndex[9]].xmax, + pScissorsInFixedPoint[pViewportIndex[10]].xmax, + pScissorsInFixedPoint[pViewportIndex[11]].xmax, + pScissorsInFixedPoint[pViewportIndex[12]].xmax, + pScissorsInFixedPoint[pViewportIndex[13]].xmax, + pScissorsInFixedPoint[pViewportIndex[14]].xmax, + pScissorsInFixedPoint[pViewportIndex[15]].xmax); + + scisYmax = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax, + pScissorsInFixedPoint[pViewportIndex[1]].ymax, + pScissorsInFixedPoint[pViewportIndex[2]].ymax, + pScissorsInFixedPoint[pViewportIndex[3]].ymax, + pScissorsInFixedPoint[pViewportIndex[4]].ymax, + pScissorsInFixedPoint[pViewportIndex[5]].ymax, + pScissorsInFixedPoint[pViewportIndex[6]].ymax, + pScissorsInFixedPoint[pViewportIndex[7]].ymax, + pScissorsInFixedPoint[pViewportIndex[8]].ymax, + pScissorsInFixedPoint[pViewportIndex[9]].ymax, + pScissorsInFixedPoint[pViewportIndex[10]].ymax, + pScissorsInFixedPoint[pViewportIndex[11]].ymax, + pScissorsInFixedPoint[pViewportIndex[12]].ymax, + pScissorsInFixedPoint[pViewportIndex[13]].ymax, + pScissorsInFixedPoint[pViewportIndex[14]].ymax, + pScissorsInFixedPoint[pViewportIndex[15]].ymax); + } +}; + +#endif typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*); struct ProcessAttributesChooser @@ -873,184 +1027,735 @@ void BinTriangles_simd16( simd16scalari primID, simd16scalari viewportIdx) { - enum { VERTS_PER_PRIM = 3 }; - - simdvector verts[VERTS_PER_PRIM]; - - for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1) - { - for (uint32_t j = 0; j < 4; j += 1) - { - verts[i][j] = _simd16_extract_ps(tri[i][j], 0); - } - } + SWR_CONTEXT *pContext = pDC->pContext; - pa.useAlternateOffset = false; - BinTriangles(pDC, pa, workerId, verts, GetPrimMaskLo(triMask), _simd16_extract_si(primID, 0), _simd16_extract_si(viewportIdx, 0)); + AR_BEGIN(FEBinTriangles, pDC->drawId); - if (GetPrimMaskHi(triMask)) - { - for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1) - { - for (uint32_t j = 0; j < 4; j += 1) - { - verts[i][j] = _simd16_extract_ps(tri[i][j], 1); - } - } + const API_STATE& state = GetApiState(pDC); + const SWR_RASTSTATE& rastState = state.rastState; + const SWR_FRONTEND_STATE& feState = state.frontendState; + const SWR_GS_STATE& gsState = state.gsState; - pa.useAlternateOffset = true; - BinTriangles(pDC, pa, workerId, verts, GetPrimMaskHi(triMask), _simd16_extract_si(primID, 1), _simd16_extract_si(viewportIdx, 1)); - } -} + MacroTileMgr *pTileMgr = pDC->pTileMgr; -#endif -struct FEBinTrianglesChooser -{ - typedef PFN_PROCESS_PRIMS FuncType; + simd16scalar vRecipW0 = _simd16_set1_ps(1.0f); + simd16scalar vRecipW1 = _simd16_set1_ps(1.0f); + simd16scalar vRecipW2 = _simd16_set1_ps(1.0f); - template - static FuncType GetFunc() + if (feState.vpTransformDisable) { - return BinTriangles>; + // RHW is passed in directly when VP transform is disabled + vRecipW0 = tri[0].v[3]; + vRecipW1 = tri[1].v[3]; + vRecipW2 = tri[2].v[3]; } -}; - -// Selector for correct templated BinTrinagles function -PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative) -{ - return TemplateArgUnroller::GetFunc(IsConservative); -} - -#if USE_SIMD16_FRONTEND -struct FEBinTrianglesChooser_simd16 -{ - typedef PFN_PROCESS_PRIMS_SIMD16 FuncType; - - template - static FuncType GetFunc() + else { - return BinTriangles_simd16>; - } -}; - -// Selector for correct templated BinTrinagles function -PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative) -{ - return TemplateArgUnroller::GetFunc(IsConservative); -} - -#endif - -////////////////////////////////////////////////////////////////////////// -/// @brief Bin SIMD points to the backend. Only supports point size of 1 -/// @param pDC - pointer to draw context. -/// @param pa - The primitive assembly object. -/// @param workerId - thread's worker id. Even thread has a unique id. -/// @param tri - Contains point position data for SIMDs worth of points. -/// @param primID - Primitive ID for each point. -void BinPoints( - DRAW_CONTEXT *pDC, - PA_STATE& pa, - uint32_t workerId, - simdvector prim[3], - uint32_t primMask, - simdscalari primID, - simdscalari viewportIdx) -{ - SWR_CONTEXT *pContext = pDC->pContext; - - AR_BEGIN(FEBinPoints, pDC->drawId); - - simdvector& primVerts = prim[0]; + // Perspective divide + vRecipW0 = _simd16_div_ps(_simd16_set1_ps(1.0f), tri[0].w); + vRecipW1 = _simd16_div_ps(_simd16_set1_ps(1.0f), tri[1].w); + vRecipW2 = _simd16_div_ps(_simd16_set1_ps(1.0f), tri[2].w); - const API_STATE& state = GetApiState(pDC); - const SWR_FRONTEND_STATE& feState = state.frontendState; - const SWR_GS_STATE& gsState = state.gsState; - const SWR_RASTSTATE& rastState = state.rastState; - const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; + tri[0].v[0] = _simd16_mul_ps(tri[0].v[0], vRecipW0); + tri[1].v[0] = _simd16_mul_ps(tri[1].v[0], vRecipW1); + tri[2].v[0] = _simd16_mul_ps(tri[2].v[0], vRecipW2); - // Select attribute processor - PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1, - state.backendState.swizzleEnable, state.backendState.constantInterpolationMask); + tri[0].v[1] = _simd16_mul_ps(tri[0].v[1], vRecipW0); + tri[1].v[1] = _simd16_mul_ps(tri[1].v[1], vRecipW1); + tri[2].v[1] = _simd16_mul_ps(tri[2].v[1], vRecipW2); - if (!feState.vpTransformDisable) - { - // perspective divide - simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w); - primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0); - primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0); - primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0); + tri[0].v[2] = _simd16_mul_ps(tri[0].v[2], vRecipW0); + tri[1].v[2] = _simd16_mul_ps(tri[1].v[2], vRecipW1); + tri[2].v[2] = _simd16_mul_ps(tri[2].v[2], vRecipW2); - // viewport transform to screen coords + // Viewport transform to screen space coords if (state.gsState.emitsViewportArrayIndex) { - viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx); + viewportTransform<3>(tri, state.vpMatrices, viewportIdx); } else { - viewportTransform<1>(&primVerts, state.vpMatrices); + viewportTransform<3>(tri, state.vpMatrices); } } - // adjust for pixel center location - simdscalar offset = g_pixelOffsets[rastState.pixelLocation]; - primVerts.x = _simd_add_ps(primVerts.x, offset); - primVerts.y = _simd_add_ps(primVerts.y, offset); + // Adjust for pixel center location + const simd16scalar offset = g_pixelOffsets_simd16[rastState.pixelLocation]; - // convert to fixed point - simdscalari vXi, vYi; - vXi = fpToFixedPointVertical(primVerts.x); - vYi = fpToFixedPointVertical(primVerts.y); + tri[0].x = _simd16_add_ps(tri[0].x, offset); + tri[0].y = _simd16_add_ps(tri[0].y, offset); - if (CanUseSimplePoints(pDC)) - { - // adjust for ymin-xmin rule - vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1)); - vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1)); + tri[1].x = _simd16_add_ps(tri[1].x, offset); + tri[1].y = _simd16_add_ps(tri[1].y, offset); - // cull points off the ymin-xmin edge of the viewport - primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi)); - primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi)); + tri[2].x = _simd16_add_ps(tri[2].x, offset); + tri[2].y = _simd16_add_ps(tri[2].y, offset); - // compute macro tile coordinates - simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); - simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); + simd16scalari vXi[3], vYi[3]; - OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH]; - _simd_store_si((simdscalari*)aMacroX, macroX); - _simd_store_si((simdscalari*)aMacroY, macroY); + // Set vXi, vYi to required fixed point precision + FPToFixedPoint(tri, vXi, vYi); - // compute raster tile coordinates - simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); - simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); + // triangle setup + simd16scalari vAi[3], vBi[3]; + triangleSetupABIntVertical(vXi, vYi, vAi, vBi); - // compute raster tile relative x,y for coverage mask - simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT); - simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT); + // determinant + simd16scalari vDet[2]; + calcDeterminantIntVertical(vAi, vBi, vDet); - simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX); - simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY); + // cull zero area + simd16mask maskLo = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpeq_epi64(vDet[0], _simd16_setzero_si()))); + simd16mask maskHi = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpeq_epi64(vDet[1], _simd16_setzero_si()))); - OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH]; - OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH]; - _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX); - _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY); + uint32_t cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD16_WIDTH / 2)); - OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH]; - OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH]; - _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX); - _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY); + // don't cull degenerate triangles if we're conservatively rasterizing + uint32_t origTriMask = triMask; + if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value) + { + triMask &= ~cullZeroAreaMask; + } - OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH]; - _simd_store_ps((float*)aZ, primVerts.z); + // determine front winding tris + // CW +det + // CCW det < 0; + // 0 area triangles are marked as backfacing regardless of winding order, + // which is required behavior for conservative rast and wireframe rendering + uint32_t frontWindingTris; + if (rastState.frontWinding == SWR_FRONTWINDING_CW) + { + maskLo = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(vDet[0], _simd16_setzero_si()))); + maskHi = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(vDet[1], _simd16_setzero_si()))); + } + else + { + maskLo = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(_simd16_setzero_si(), vDet[0]))); + maskHi = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(_simd16_setzero_si(), vDet[1]))); + } - // store render target array index - OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; - if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) + frontWindingTris = maskLo | (maskHi << (KNOB_SIMD16_WIDTH / 2)); + + // cull + uint32_t cullTris; + switch ((SWR_CULLMODE)rastState.cullMode) + { + case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break; + case SWR_CULLMODE_NONE: cullTris = 0x0; break; + case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break; + // 0 area triangles are marked as backfacing, which is required behavior for conservative rast + case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break; + default: SWR_INVALID("Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break; + } + + triMask &= ~cullTris; + + if (origTriMask ^ triMask) + { + RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0); + } + + // Simple non-conformant wireframe mode, useful for debugging + if (rastState.fillMode == SWR_FILLMODE_WIREFRAME) + { + // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD + simd16vector line[2]; + simd16scalar recipW[2]; + line[0] = tri[0]; + line[1] = tri[1]; + recipW[0] = vRecipW0; + recipW[1] = vRecipW1; + BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx); + + line[0] = tri[1]; + line[1] = tri[2]; + recipW[0] = vRecipW1; + recipW[1] = vRecipW2; + BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx); + + line[0] = tri[2]; + line[1] = tri[0]; + recipW[0] = vRecipW2; + recipW[1] = vRecipW0; + BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx); + + AR_END(FEBinTriangles, 1); + return; + } + + /// Note: these variable initializations must stay above any 'goto endBenTriangles' + // compute per tri backface + uint32_t frontFaceMask = frontWindingTris; + uint32_t *pPrimID = (uint32_t *)&primID; + const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; + DWORD triIndex = 0; + + uint32_t edgeEnable; + PFN_WORK_FUNC pfnWork; + if (CT::IsConservativeT::value) + { + // determine which edges of the degenerate tri, if any, are valid to rasterize. + // used to call the appropriate templated rasterizer function + if (cullZeroAreaMask > 0) + { + // e0 = v1-v0 + const simd16scalari x0x1Mask = _simd16_cmpeq_epi32(vXi[0], vXi[1]); + const simd16scalari y0y1Mask = _simd16_cmpeq_epi32(vYi[0], vYi[1]); + + uint32_t e0Mask = _simd16_movemask_ps(_simd16_castsi_ps(_simd16_and_si(x0x1Mask, y0y1Mask))); + + // e1 = v2-v1 + const simd16scalari x1x2Mask = _simd16_cmpeq_epi32(vXi[1], vXi[2]); + const simd16scalari y1y2Mask = _simd16_cmpeq_epi32(vYi[1], vYi[2]); + + uint32_t e1Mask = _simd16_movemask_ps(_simd16_castsi_ps(_simd16_and_si(x1x2Mask, y1y2Mask))); + + // e2 = v0-v2 + // if v0 == v1 & v1 == v2, v0 == v2 + uint32_t e2Mask = e0Mask & e1Mask; + SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512"); + + // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2 + // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001 + e0Mask = pdep_u32(e0Mask, 0x00249249); + + // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010 + e1Mask = pdep_u32(e1Mask, 0x00492492); + + // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100 + e2Mask = pdep_u32(e2Mask, 0x00924924); + + edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask))); + } + else + { + edgeEnable = 0x00FFFFFF; + } + } + else + { + // degenerate triangles won't be sent to rasterizer; just enable all edges + pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0), + (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, ALL_EDGES_VALID, (state.scissorsTileAligned == false)); + } + + if (!triMask) + { + goto endBinTriangles; + } + + // Calc bounding box of triangles + simd16BBox bbox; + calcBoundingBoxIntVertical(tri, vXi, vYi, bbox); + + // determine if triangle falls between pixel centers and discard + // only discard for non-MSAA case and when conservative rast is disabled + // (xmin + 127) & ~255 + // (xmax + 128) & ~255 + if ((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) && + (!CT::IsConservativeT::value)) + { + origTriMask = triMask; + + int cullCenterMask; + + { + simd16scalari xmin = _simd16_add_epi32(bbox.xmin, _simd16_set1_epi32(127)); + xmin = _simd16_and_si(xmin, _simd16_set1_epi32(~255)); + simd16scalari xmax = _simd16_add_epi32(bbox.xmax, _simd16_set1_epi32(128)); + xmax = _simd16_and_si(xmax, _simd16_set1_epi32(~255)); + + simd16scalari vMaskH = _simd16_cmpeq_epi32(xmin, xmax); + + simd16scalari ymin = _simd16_add_epi32(bbox.ymin, _simd16_set1_epi32(127)); + ymin = _simd16_and_si(ymin, _simd16_set1_epi32(~255)); + simd16scalari ymax = _simd16_add_epi32(bbox.ymax, _simd16_set1_epi32(128)); + ymax = _simd16_and_si(ymax, _simd16_set1_epi32(~255)); + + simd16scalari vMaskV = _simd16_cmpeq_epi32(ymin, ymax); + + vMaskV = _simd16_or_si(vMaskH, vMaskV); + cullCenterMask = _simd16_movemask_ps(_simd16_castsi_ps(vMaskV)); + } + + triMask &= ~cullCenterMask; + + if (origTriMask ^ triMask) + { + RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0); + } + } + + // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. + // Gather the AOS effective scissor rects based on the per-prim VP index. + /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. + simd16scalari scisXmin, scisYmin, scisXmax, scisYmax; + + if (state.gsState.emitsViewportArrayIndex) + { + GatherScissors_simd16::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, + scisXmin, scisYmin, scisXmax, scisYmax); + } + else // broadcast fast path for non-VPAI case. + { + scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin); + scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin); + scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax); + scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax); + } + + bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin); + bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin); + bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax); + bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax); + + if (CT::IsConservativeT::value) + { + // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has + // some area. Bump the xmax/ymax edges out + simd16scalari topEqualsBottom = _simd16_cmpeq_epi32(bbox.ymin, bbox.ymax); + bbox.ymax = _simd16_blendv_epi32(bbox.ymax, _simd16_add_epi32(bbox.ymax, _simd16_set1_epi32(1)), topEqualsBottom); + simd16scalari leftEqualsRight = _simd16_cmpeq_epi32(bbox.xmin, bbox.xmax); + bbox.xmax = _simd16_blendv_epi32(bbox.xmax, _simd16_add_epi32(bbox.xmax, _simd16_set1_epi32(1)), leftEqualsRight); + } + + // Cull tris completely outside scissor + { + simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax); + simd16scalari maskOutsideScissorY = _simd16_cmpgt_epi32(bbox.ymin, bbox.ymax); + simd16scalari maskOutsideScissorXY = _simd16_or_si(maskOutsideScissorX, maskOutsideScissorY); + uint32_t maskOutsideScissor = _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY)); + triMask = triMask & ~maskOutsideScissor; + } + + if (!triMask) + { + goto endBinTriangles; + } + + // Convert triangle bbox to macrotile units. + bbox.xmin = _simd16_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); + bbox.ymin = _simd16_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); + bbox.xmax = _simd16_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); + bbox.ymax = _simd16_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); + + OSALIGNSIMD16(uint32_t) aMTLeft[KNOB_SIMD16_WIDTH], aMTRight[KNOB_SIMD16_WIDTH], aMTTop[KNOB_SIMD16_WIDTH], aMTBottom[KNOB_SIMD16_WIDTH]; + + _simd16_store_si(reinterpret_cast(aMTLeft), bbox.xmin); + _simd16_store_si(reinterpret_cast(aMTRight), bbox.xmax); + _simd16_store_si(reinterpret_cast(aMTTop), bbox.ymin); + _simd16_store_si(reinterpret_cast(aMTBottom), bbox.ymax); + + // transpose verts needed for backend + /// @todo modify BE to take non-transformed verts + __m128 vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH + __m128 vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH + __m128 vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH + __m128 vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH + + vTranspose3x8(vHorizX[0], _simd16_extract_ps(tri[0].x, 0), _simd16_extract_ps(tri[1].x, 0), _simd16_extract_ps(tri[2].x, 0)); + vTranspose3x8(vHorizY[0], _simd16_extract_ps(tri[0].y, 0), _simd16_extract_ps(tri[1].y, 0), _simd16_extract_ps(tri[2].y, 0)); + vTranspose3x8(vHorizZ[0], _simd16_extract_ps(tri[0].z, 0), _simd16_extract_ps(tri[1].z, 0), _simd16_extract_ps(tri[2].z, 0)); + vTranspose3x8(vHorizW[0], _simd16_extract_ps(vRecipW0, 0), _simd16_extract_ps(vRecipW1, 0), _simd16_extract_ps(vRecipW2, 0)); + + vTranspose3x8(vHorizX[1], _simd16_extract_ps(tri[0].x, 1), _simd16_extract_ps(tri[1].x, 1), _simd16_extract_ps(tri[2].x, 1)); + vTranspose3x8(vHorizY[1], _simd16_extract_ps(tri[0].y, 1), _simd16_extract_ps(tri[1].y, 1), _simd16_extract_ps(tri[2].y, 1)); + vTranspose3x8(vHorizZ[1], _simd16_extract_ps(tri[0].z, 1), _simd16_extract_ps(tri[1].z, 1), _simd16_extract_ps(tri[2].z, 1)); + vTranspose3x8(vHorizW[1], _simd16_extract_ps(vRecipW0, 1), _simd16_extract_ps(vRecipW1, 1), _simd16_extract_ps(vRecipW2, 1)); + + // store render target array index + OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH]; + if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) + { + simd16vector vRtai[3]; + pa.Assemble_simd16(VERTEX_RTAI_SLOT, vRtai); + simd16scalari vRtaii; + vRtaii = _simd16_castps_si(vRtai[0].x); + _simd16_store_si(reinterpret_cast(aRTAI), vRtaii); + } + else + { + _simd16_store_si(reinterpret_cast(aRTAI), _simd16_setzero_si()); + } + +endBinTriangles: + + + // scan remaining valid triangles and bin each separately + while (_BitScanForward(&triIndex, triMask)) + { + uint32_t linkageCount = state.backendState.numAttributes; + uint32_t numScalarAttribs = linkageCount * 4; + + BE_WORK work; + work.type = DRAW; + + bool isDegenerate; + if (CT::IsConservativeT::value) + { + // only rasterize valid edges if we have a degenerate primitive + int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID; + work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0), + (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, triEdgeEnable, (state.scissorsTileAligned == false)); + + // Degenerate triangles are required to be constant interpolated + isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false; + } + else + { + isDegenerate = false; + work.pfnWork = pfnWork; + } + + // Select attribute processor + PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3, + state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate); + + TRIANGLE_WORK_DESC &desc = work.desc.tri; + + desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1); + desc.triFlags.primID = pPrimID[triIndex]; + desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex]; + desc.triFlags.viewportIndex = pViewportIndex[triIndex]; + + auto pArena = pDC->pArena; + SWR_ASSERT(pArena != nullptr); + + // store active attribs + float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); + desc.pAttribs = pAttribs; + desc.numAttribs = linkageCount; + pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs); + + // store triangle vertex data + desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16); + + { + const uint32_t i = triIndex >> 3; // triIndex / KNOB_SIMD_WIDTH + const uint32_t j = triIndex & 7; // triIndex % KNOB_SIMD_WIDTH + + _mm_store_ps(&desc.pTriBuffer[ 0], vHorizX[i][j]); + _mm_store_ps(&desc.pTriBuffer[ 4], vHorizY[i][j]); + _mm_store_ps(&desc.pTriBuffer[ 8], vHorizZ[i][j]); + _mm_store_ps(&desc.pTriBuffer[12], vHorizW[i][j]); + } + + // store user clip distances + if (rastState.clipDistanceMask) + { + uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask); + desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float)); + ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer); + } + + for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y) + { + for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x) + { +#if KNOB_ENABLE_TOSS_POINTS + if (!KNOB_TOSS_SETUP_TRIS) +#endif + { + pTileMgr->enqueue(x, y, &work); + } + } + } + + triMask &= ~(1 << triIndex); + } + + AR_END(FEBinTriangles, 1); +} + +#endif +struct FEBinTrianglesChooser +{ + typedef PFN_PROCESS_PRIMS FuncType; + + template + static FuncType GetFunc() + { + return BinTriangles>; + } +}; + +// Selector for correct templated BinTrinagles function +PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative) +{ + return TemplateArgUnroller::GetFunc(IsConservative); +} + +#if USE_SIMD16_FRONTEND +struct FEBinTrianglesChooser_simd16 +{ + typedef PFN_PROCESS_PRIMS_SIMD16 FuncType; + + template + static FuncType GetFunc() + { + return BinTriangles_simd16>; + } +}; + +// Selector for correct templated BinTrinagles function +PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative) +{ + return TemplateArgUnroller::GetFunc(IsConservative); +} + +#endif + +////////////////////////////////////////////////////////////////////////// +/// @brief Bin SIMD points to the backend. Only supports point size of 1 +/// @param pDC - pointer to draw context. +/// @param pa - The primitive assembly object. +/// @param workerId - thread's worker id. Even thread has a unique id. +/// @param tri - Contains point position data for SIMDs worth of points. +/// @param primID - Primitive ID for each point. +void BinPoints( + DRAW_CONTEXT *pDC, + PA_STATE& pa, + uint32_t workerId, + simdvector prim[3], + uint32_t primMask, + simdscalari primID, + simdscalari viewportIdx) +{ + SWR_CONTEXT *pContext = pDC->pContext; + + AR_BEGIN(FEBinPoints, pDC->drawId); + + simdvector& primVerts = prim[0]; + + const API_STATE& state = GetApiState(pDC); + const SWR_FRONTEND_STATE& feState = state.frontendState; + const SWR_GS_STATE& gsState = state.gsState; + const SWR_RASTSTATE& rastState = state.rastState; + const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; + + // Select attribute processor + PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1, + state.backendState.swizzleEnable, state.backendState.constantInterpolationMask); + + if (!feState.vpTransformDisable) + { + // perspective divide + simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w); + primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0); + primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0); + primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0); + + // viewport transform to screen coords + if (state.gsState.emitsViewportArrayIndex) + { + viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx); + } + else + { + viewportTransform<1>(&primVerts, state.vpMatrices); + } + } + + // adjust for pixel center location + simdscalar offset = g_pixelOffsets[rastState.pixelLocation]; + primVerts.x = _simd_add_ps(primVerts.x, offset); + primVerts.y = _simd_add_ps(primVerts.y, offset); + + // convert to fixed point + simdscalari vXi, vYi; + vXi = fpToFixedPointVertical(primVerts.x); + vYi = fpToFixedPointVertical(primVerts.y); + + if (CanUseSimplePoints(pDC)) + { + // adjust for ymin-xmin rule + vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1)); + vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1)); + + // cull points off the ymin-xmin edge of the viewport + primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi)); + primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi)); + + // compute macro tile coordinates + simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); + simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); + + OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH]; + _simd_store_si((simdscalari*)aMacroX, macroX); + _simd_store_si((simdscalari*)aMacroY, macroY); + + // compute raster tile coordinates + simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); + simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); + + // compute raster tile relative x,y for coverage mask + simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT); + simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT); + + simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX); + simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY); + + OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH]; + OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH]; + _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX); + _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY); + + OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH]; + OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH]; + _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX); + _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY); + + OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH]; + _simd_store_ps((float*)aZ, primVerts.z); + + // store render target array index + OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; + if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) + { + simdvector vRtai; + pa.Assemble(VERTEX_RTAI_SLOT, &vRtai); + simdscalari vRtaii = _simd_castps_si(vRtai.x); + _simd_store_si((simdscalari*)aRTAI, vRtaii); + } + else + { + _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); + } + + uint32_t *pPrimID = (uint32_t *)&primID; + DWORD primIndex = 0; + + const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState; + + // scan remaining valid triangles and bin each separately + while (_BitScanForward(&primIndex, primMask)) + { + uint32_t linkageCount = backendState.numAttributes; + uint32_t numScalarAttribs = linkageCount * 4; + + BE_WORK work; + work.type = DRAW; + + TRIANGLE_WORK_DESC &desc = work.desc.tri; + + // points are always front facing + desc.triFlags.frontFacing = 1; + desc.triFlags.primID = pPrimID[primIndex]; + desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; + desc.triFlags.viewportIndex = pViewportIndex[primIndex]; + + work.pfnWork = RasterizeSimplePoint; + + auto pArena = pDC->pArena; + SWR_ASSERT(pArena != nullptr); + + // store attributes + float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16); + desc.pAttribs = pAttribs; + desc.numAttribs = linkageCount; + + pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs); + + // store raster tile aligned x, y, perspective correct z + float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16); + desc.pTriBuffer = pTriBuffer; + *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex]; + *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex]; + *pTriBuffer = aZ[primIndex]; + + uint32_t tX = aTileRelativeX[primIndex]; + uint32_t tY = aTileRelativeY[primIndex]; + + // pack the relative x,y into the coverageMask, the rasterizer will + // generate the true coverage mask from it + work.desc.tri.triFlags.coverageMask = tX | (tY << 4); + + // bin it + MacroTileMgr *pTileMgr = pDC->pTileMgr; +#if KNOB_ENABLE_TOSS_POINTS + if (!KNOB_TOSS_SETUP_TRIS) +#endif + { + pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work); + } + primMask &= ~(1 << primIndex); + } + } + else + { + // non simple points need to be potentially binned to multiple macro tiles + simdscalar vPointSize; + if (rastState.pointParam) { - simdvector vRtai; - pa.Assemble(VERTEX_RTAI_SLOT, &vRtai); - simdscalari vRtaii = _simd_castps_si(vRtai.x); + simdvector size[3]; + pa.Assemble(VERTEX_POINT_SIZE_SLOT, size); + vPointSize = size[0].x; + } + else + { + vPointSize = _simd_set1_ps(rastState.pointSize); + } + + // bloat point to bbox + simdBBox bbox; + bbox.xmin = bbox.xmax = vXi; + bbox.ymin = bbox.ymax = vYi; + + simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f)); + simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth); + bbox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi); + bbox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi); + bbox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi); + bbox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi); + + // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. + // Gather the AOS effective scissor rects based on the per-prim VP index. + /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. + simdscalari scisXmin, scisYmin, scisXmax, scisYmax; + if (state.gsState.emitsViewportArrayIndex) + { + GatherScissors::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, + scisXmin, scisYmin, scisXmax, scisYmax); + } + else // broadcast fast path for non-VPAI case. + { + scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin); + scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin); + scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax); + scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax); + } + + bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin); + bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin); + bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax); + bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax); + + // Cull bloated points completely outside scissor + simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax); + simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax); + simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY); + uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY)); + primMask = primMask & ~maskOutsideScissor; + + // Convert bbox to macrotile units. + bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); + bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); + bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); + bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); + + OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH]; + _simd_store_si((simdscalari*)aMTLeft, bbox.xmin); + _simd_store_si((simdscalari*)aMTRight, bbox.xmax); + _simd_store_si((simdscalari*)aMTTop, bbox.ymin); + _simd_store_si((simdscalari*)aMTBottom, bbox.ymax); + + // store render target array index + OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; + if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) + { + simdvector vRtai[2]; + pa.Assemble(VERTEX_RTAI_SLOT, vRtai); + simdscalari vRtaii = _simd_castps_si(vRtai[0].x); _simd_store_si((simdscalari*)aRTAI, vRtaii); } else @@ -1058,6 +1763,207 @@ void BinPoints( _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); } + OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH]; + _simd_store_ps((float*)aPointSize, vPointSize); + + uint32_t *pPrimID = (uint32_t *)&primID; + + OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH]; + OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH]; + OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH]; + + _simd_store_ps((float*)aPrimVertsX, primVerts.x); + _simd_store_ps((float*)aPrimVertsY, primVerts.y); + _simd_store_ps((float*)aPrimVertsZ, primVerts.z); + + // scan remaining valid prims and bin each separately + const SWR_BACKEND_STATE& backendState = state.backendState; + DWORD primIndex; + while (_BitScanForward(&primIndex, primMask)) + { + uint32_t linkageCount = backendState.numAttributes; + uint32_t numScalarAttribs = linkageCount * 4; + + BE_WORK work; + work.type = DRAW; + + TRIANGLE_WORK_DESC &desc = work.desc.tri; + + desc.triFlags.frontFacing = 1; + desc.triFlags.primID = pPrimID[primIndex]; + desc.triFlags.pointSize = aPointSize[primIndex]; + desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; + desc.triFlags.viewportIndex = pViewportIndex[primIndex]; + + work.pfnWork = RasterizeTriPoint; + + auto pArena = pDC->pArena; + SWR_ASSERT(pArena != nullptr); + + // store active attribs + desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); + desc.numAttribs = linkageCount; + pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs); + + // store point vertex data + float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16); + desc.pTriBuffer = pTriBuffer; + *pTriBuffer++ = aPrimVertsX[primIndex]; + *pTriBuffer++ = aPrimVertsY[primIndex]; + *pTriBuffer = aPrimVertsZ[primIndex]; + + // store user clip distances + if (rastState.clipDistanceMask) + { + uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask); + desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float)); + float dists[8]; + float one = 1.0f; + ProcessUserClipDist<1>(pa, primIndex, rastState.clipDistanceMask, &one, dists); + for (uint32_t i = 0; i < numClipDist; i++) { + desc.pUserClipBuffer[3*i + 0] = 0.0f; + desc.pUserClipBuffer[3*i + 1] = 0.0f; + desc.pUserClipBuffer[3*i + 2] = dists[i]; + } + } + + MacroTileMgr *pTileMgr = pDC->pTileMgr; + for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y) + { + for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x) + { +#if KNOB_ENABLE_TOSS_POINTS + if (!KNOB_TOSS_SETUP_TRIS) +#endif + { + pTileMgr->enqueue(x, y, &work); + } + } + } + + primMask &= ~(1 << primIndex); + } + } + + AR_END(FEBinPoints, 1); +} + +#if USE_SIMD16_FRONTEND +void BinPoints_simd16( + DRAW_CONTEXT *pDC, + PA_STATE& pa, + uint32_t workerId, + simd16vector prim[3], + uint32_t primMask, + simd16scalari primID, + simd16scalari viewportIdx) +{ + SWR_CONTEXT *pContext = pDC->pContext; + + AR_BEGIN(FEBinPoints, pDC->drawId); + + simd16vector& primVerts = prim[0]; + + const API_STATE& state = GetApiState(pDC); + const SWR_FRONTEND_STATE& feState = state.frontendState; + const SWR_GS_STATE& gsState = state.gsState; + const SWR_RASTSTATE& rastState = state.rastState; + const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; + + // Select attribute processor + PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1, + state.backendState.swizzleEnable, state.backendState.constantInterpolationMask); + + if (!feState.vpTransformDisable) + { + // perspective divide + simd16scalar vRecipW0 = _simd16_div_ps(_simd16_set1_ps(1.0f), primVerts.w); + + primVerts.x = _simd16_mul_ps(primVerts.x, vRecipW0); + primVerts.y = _simd16_mul_ps(primVerts.y, vRecipW0); + primVerts.z = _simd16_mul_ps(primVerts.z, vRecipW0); + + // viewport transform to screen coords + if (state.gsState.emitsViewportArrayIndex) + { + viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx); + } + else + { + viewportTransform<1>(&primVerts, state.vpMatrices); + } + } + + const simd16scalar offset = g_pixelOffsets_simd16[rastState.pixelLocation]; + + primVerts.x = _simd16_add_ps(primVerts.x, offset); + primVerts.y = _simd16_add_ps(primVerts.y, offset); + + // convert to fixed point + simd16scalari vXi, vYi; + + vXi = fpToFixedPointVertical(primVerts.x); + vYi = fpToFixedPointVertical(primVerts.y); + + if (CanUseSimplePoints(pDC)) + { + // adjust for ymin-xmin rule + vXi = _simd16_sub_epi32(vXi, _simd16_set1_epi32(1)); + vYi = _simd16_sub_epi32(vYi, _simd16_set1_epi32(1)); + + // cull points off the ymin-xmin edge of the viewport + primMask &= ~_simd16_movemask_ps(_simd16_castsi_ps(vXi)); + primMask &= ~_simd16_movemask_ps(_simd16_castsi_ps(vYi)); + + // compute macro tile coordinates + simd16scalari macroX = _simd16_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); + simd16scalari macroY = _simd16_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); + + OSALIGNSIMD16(uint32_t) aMacroX[KNOB_SIMD16_WIDTH], aMacroY[KNOB_SIMD16_WIDTH]; + + _simd16_store_si(reinterpret_cast(aMacroX), macroX); + _simd16_store_si(reinterpret_cast(aMacroY), macroY); + + // compute raster tile coordinates + simd16scalari rasterX = _simd16_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); + simd16scalari rasterY = _simd16_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); + + // compute raster tile relative x,y for coverage mask + simd16scalari tileAlignedX = _simd16_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT); + simd16scalari tileAlignedY = _simd16_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT); + + simd16scalari tileRelativeX = _simd16_sub_epi32(_simd16_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX); + simd16scalari tileRelativeY = _simd16_sub_epi32(_simd16_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY); + + OSALIGNSIMD16(uint32_t) aTileRelativeX[KNOB_SIMD16_WIDTH]; + OSALIGNSIMD16(uint32_t) aTileRelativeY[KNOB_SIMD16_WIDTH]; + + _simd16_store_si(reinterpret_cast(aTileRelativeX), tileRelativeX); + _simd16_store_si(reinterpret_cast(aTileRelativeY), tileRelativeY); + + OSALIGNSIMD16(uint32_t) aTileAlignedX[KNOB_SIMD16_WIDTH]; + OSALIGNSIMD16(uint32_t) aTileAlignedY[KNOB_SIMD16_WIDTH]; + + _simd16_store_si(reinterpret_cast(aTileAlignedX), tileAlignedX); + _simd16_store_si(reinterpret_cast(aTileAlignedY), tileAlignedY); + + OSALIGNSIMD16(float) aZ[KNOB_SIMD16_WIDTH]; + _simd16_store_ps(reinterpret_cast(aZ), primVerts.z); + + // store render target array index + OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH]; + if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) + { + simd16vector vRtai; + pa.Assemble_simd16(VERTEX_RTAI_SLOT, &vRtai); + simd16scalari vRtaii = _simd16_castps_si(vRtai.x); + _simd16_store_si(reinterpret_cast(aRTAI), vRtaii); + } + else + { + _simd16_store_si(reinterpret_cast(aRTAI), _simd16_setzero_si()); + } + uint32_t *pPrimID = (uint32_t *)&primID; DWORD primIndex = 0; @@ -1114,103 +2020,108 @@ void BinPoints( { pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work); } + primMask &= ~(1 << primIndex); } } else { // non simple points need to be potentially binned to multiple macro tiles - simdscalar vPointSize; + simd16scalar vPointSize; + if (rastState.pointParam) { - simdvector size[3]; - pa.Assemble(VERTEX_POINT_SIZE_SLOT, size); + simd16vector size[3]; + pa.Assemble_simd16(VERTEX_POINT_SIZE_SLOT, size); vPointSize = size[0].x; } else { - vPointSize = _simd_set1_ps(rastState.pointSize); + vPointSize = _simd16_set1_ps(rastState.pointSize); } // bloat point to bbox - simdBBox bbox; + simd16BBox bbox; + bbox.xmin = bbox.xmax = vXi; bbox.ymin = bbox.ymax = vYi; - simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f)); - simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth); - bbox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi); - bbox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi); - bbox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi); - bbox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi); + simd16scalar vHalfWidth = _simd16_mul_ps(vPointSize, _simd16_set1_ps(0.5f)); + simd16scalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth); + + bbox.xmin = _simd16_sub_epi32(bbox.xmin, vHalfWidthi); + bbox.xmax = _simd16_add_epi32(bbox.xmax, vHalfWidthi); + bbox.ymin = _simd16_sub_epi32(bbox.ymin, vHalfWidthi); + bbox.ymax = _simd16_add_epi32(bbox.ymax, vHalfWidthi); // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. // Gather the AOS effective scissor rects based on the per-prim VP index. /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. - simdscalari scisXmin, scisYmin, scisXmax, scisYmax; + simd16scalari scisXmin, scisYmin, scisXmax, scisYmax; if (state.gsState.emitsViewportArrayIndex) { - GatherScissors::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, + GatherScissors_simd16::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax); } else // broadcast fast path for non-VPAI case. { - scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin); - scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin); - scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax); - scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax); + scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin); + scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin); + scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax); + scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax); } - bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin); - bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin); - bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax); - bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax); + bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin); + bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin); + bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax); + bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax); // Cull bloated points completely outside scissor - simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax); - simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax); - simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY); - uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY)); + simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax); + simd16scalari maskOutsideScissorY = _simd16_cmpgt_epi32(bbox.ymin, bbox.ymax); + simd16scalari maskOutsideScissorXY = _simd16_or_si(maskOutsideScissorX, maskOutsideScissorY); + uint32_t maskOutsideScissor = _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY)); primMask = primMask & ~maskOutsideScissor; // Convert bbox to macrotile units. - bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); - bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); - bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); - bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); + bbox.xmin = _simd16_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); + bbox.ymin = _simd16_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); + bbox.xmax = _simd16_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); + bbox.ymax = _simd16_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); - OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH]; - _simd_store_si((simdscalari*)aMTLeft, bbox.xmin); - _simd_store_si((simdscalari*)aMTRight, bbox.xmax); - _simd_store_si((simdscalari*)aMTTop, bbox.ymin); - _simd_store_si((simdscalari*)aMTBottom, bbox.ymax); + OSALIGNSIMD16(uint32_t) aMTLeft[KNOB_SIMD16_WIDTH], aMTRight[KNOB_SIMD16_WIDTH], aMTTop[KNOB_SIMD16_WIDTH], aMTBottom[KNOB_SIMD16_WIDTH]; + + _simd16_store_si(reinterpret_cast(aMTLeft), bbox.xmin); + _simd16_store_si(reinterpret_cast(aMTRight), bbox.xmax); + _simd16_store_si(reinterpret_cast(aMTTop), bbox.ymin); + _simd16_store_si(reinterpret_cast(aMTBottom), bbox.ymax); // store render target array index - OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; + OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH]; if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) { - simdvector vRtai[2]; - pa.Assemble(VERTEX_RTAI_SLOT, vRtai); - simdscalari vRtaii = _simd_castps_si(vRtai[0].x); - _simd_store_si((simdscalari*)aRTAI, vRtaii); + simd16vector vRtai[2]; + pa.Assemble_simd16(VERTEX_RTAI_SLOT, vRtai); + simd16scalari vRtaii = _simd16_castps_si(vRtai[0].x); + _simd16_store_si(reinterpret_cast(aRTAI), vRtaii); } else { - _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); + _simd16_store_si(reinterpret_cast(aRTAI), _simd16_setzero_si()); } - OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH]; - _simd_store_ps((float*)aPointSize, vPointSize); + OSALIGNSIMD16(float) aPointSize[KNOB_SIMD16_WIDTH]; + _simd16_store_ps(reinterpret_cast(aPointSize), vPointSize); uint32_t *pPrimID = (uint32_t *)&primID; - OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH]; - OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH]; - OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH]; + OSALIGNSIMD16(float) aPrimVertsX[KNOB_SIMD16_WIDTH]; + OSALIGNSIMD16(float) aPrimVertsY[KNOB_SIMD16_WIDTH]; + OSALIGNSIMD16(float) aPrimVertsZ[KNOB_SIMD16_WIDTH]; - _simd_store_ps((float*)aPrimVertsX, primVerts.x); - _simd_store_ps((float*)aPrimVertsY, primVerts.y); - _simd_store_ps((float*)aPrimVertsZ, primVerts.z); + _simd16_store_ps(reinterpret_cast(aPrimVertsX), primVerts.x); + _simd16_store_ps(reinterpret_cast(aPrimVertsY), primVerts.y); + _simd16_store_ps(reinterpret_cast(aPrimVertsZ), primVerts.z); // scan remaining valid prims and bin each separately const SWR_BACKEND_STATE& backendState = state.backendState; @@ -1257,9 +2168,9 @@ void BinPoints( float one = 1.0f; ProcessUserClipDist<1>(pa, primIndex, rastState.clipDistanceMask, &one, dists); for (uint32_t i = 0; i < numClipDist; i++) { - desc.pUserClipBuffer[3*i + 0] = 0.0f; - desc.pUserClipBuffer[3*i + 1] = 0.0f; - desc.pUserClipBuffer[3*i + 2] = dists[i]; + desc.pUserClipBuffer[3 * i + 0] = 0.0f; + desc.pUserClipBuffer[3 * i + 1] = 0.0f; + desc.pUserClipBuffer[3 * i + 2] = dists[i]; } } @@ -1284,46 +2195,6 @@ void BinPoints( AR_END(FEBinPoints, 1); } -#if USE_SIMD16_FRONTEND -void BinPoints_simd16( - DRAW_CONTEXT *pDC, - PA_STATE& pa, - uint32_t workerId, - simd16vector prim[3], - uint32_t primMask, - simd16scalari primID, - simd16scalari viewportIdx) -{ - enum { VERTS_PER_PRIM = 1 }; - - simdvector verts[VERTS_PER_PRIM]; - - for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1) - { - for (uint32_t j = 0; j < 4; j += 1) - { - verts[i][j] = _simd16_extract_ps(prim[i][j], 0); - } - } - - pa.useAlternateOffset = false; - BinPoints(pDC, pa, workerId, verts, GetPrimMaskLo(primMask), _simd16_extract_si(primID, 0), _simd16_extract_si(viewportIdx, 0)); - - if (GetPrimMaskHi(primMask)) - { - for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1) - { - for (uint32_t j = 0; j < 4; j += 1) - { - verts[i][j] = _simd16_extract_ps(prim[i][j], 1); - } - } - - pa.useAlternateOffset = true; - BinPoints(pDC, pa, workerId, verts, GetPrimMaskHi(primMask), _simd16_extract_si(primID, 1), _simd16_extract_si(viewportIdx, 1)); - } -} - #endif ////////////////////////////////////////////////////////////////////////// /// @brief Bin SIMD lines to the backend. @@ -1537,6 +2408,232 @@ endBinLines: AR_END(FEBinLines, 1); } +#if USE_SIMD16_FRONTEND +void BinPostSetupLines_simd16( + DRAW_CONTEXT *pDC, + PA_STATE& pa, + uint32_t workerId, + simd16vector prim[], + simd16scalar recipW[], + uint32_t primMask, + simd16scalari primID, + simd16scalari viewportIdx) +{ + SWR_CONTEXT *pContext = pDC->pContext; + + AR_BEGIN(FEBinLines, pDC->drawId); + + const API_STATE& state = GetApiState(pDC); + const SWR_RASTSTATE& rastState = state.rastState; + const SWR_FRONTEND_STATE& feState = state.frontendState; + const SWR_GS_STATE& gsState = state.gsState; + + // Select attribute processor + PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2, + state.backendState.swizzleEnable, state.backendState.constantInterpolationMask); + + simd16scalar& vRecipW0 = recipW[0]; + simd16scalar& vRecipW1 = recipW[1]; + + // convert to fixed point + simd16scalari vXi[2], vYi[2]; + + vXi[0] = fpToFixedPointVertical(prim[0].x); + vYi[0] = fpToFixedPointVertical(prim[0].y); + vXi[1] = fpToFixedPointVertical(prim[1].x); + vYi[1] = fpToFixedPointVertical(prim[1].y); + + // compute x-major vs y-major mask + simd16scalari xLength = _simd16_abs_epi32(_simd16_sub_epi32(vXi[0], vXi[1])); + simd16scalari yLength = _simd16_abs_epi32(_simd16_sub_epi32(vYi[0], vYi[1])); + simd16scalar vYmajorMask = _simd16_castsi_ps(_simd16_cmpgt_epi32(yLength, xLength)); + uint32_t yMajorMask = _simd16_movemask_ps(vYmajorMask); + + // cull zero-length lines + simd16scalari vZeroLengthMask = _simd16_cmpeq_epi32(xLength, _simd16_setzero_si()); + vZeroLengthMask = _simd16_and_si(vZeroLengthMask, _simd16_cmpeq_epi32(yLength, _simd16_setzero_si())); + + primMask &= ~_simd16_movemask_ps(_simd16_castsi_ps(vZeroLengthMask)); + + uint32_t *pPrimID = (uint32_t *)&primID; + const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; + + // Calc bounding box of lines + simd16BBox bbox; + bbox.xmin = _simd16_min_epi32(vXi[0], vXi[1]); + bbox.xmax = _simd16_max_epi32(vXi[0], vXi[1]); + bbox.ymin = _simd16_min_epi32(vYi[0], vYi[1]); + bbox.ymax = _simd16_max_epi32(vYi[0], vYi[1]); + + // bloat bbox by line width along minor axis + simd16scalar vHalfWidth = _simd16_set1_ps(rastState.lineWidth / 2.0f); + simd16scalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth); + + simd16BBox bloatBox; + + bloatBox.xmin = _simd16_sub_epi32(bbox.xmin, vHalfWidthi); + bloatBox.xmax = _simd16_add_epi32(bbox.xmax, vHalfWidthi); + bloatBox.ymin = _simd16_sub_epi32(bbox.ymin, vHalfWidthi); + bloatBox.ymax = _simd16_add_epi32(bbox.ymax, vHalfWidthi); + + bbox.xmin = _simd16_blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask); + bbox.xmax = _simd16_blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask); + bbox.ymin = _simd16_blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask); + bbox.ymax = _simd16_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask); + + // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. + simd16scalari scisXmin, scisYmin, scisXmax, scisYmax; + + if (state.gsState.emitsViewportArrayIndex) + { + GatherScissors_simd16::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, + scisXmin, scisYmin, scisXmax, scisYmax); + } + else // broadcast fast path for non-VPAI case. + { + scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin); + scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin); + scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax); + scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax); + } + + bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin); + bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin); + bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax); + bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax); + + // Cull prims completely outside scissor + { + simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax); + simd16scalari maskOutsideScissorY = _simd16_cmpgt_epi32(bbox.ymin, bbox.ymax); + simd16scalari maskOutsideScissorXY = _simd16_or_si(maskOutsideScissorX, maskOutsideScissorY); + uint32_t maskOutsideScissor = _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY)); + primMask = primMask & ~maskOutsideScissor; + } + + if (!primMask) + { + goto endBinLines; + } + + // Convert triangle bbox to macrotile units. + bbox.xmin = _simd16_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); + bbox.ymin = _simd16_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); + bbox.xmax = _simd16_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); + bbox.ymax = _simd16_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); + + OSALIGNSIMD16(uint32_t) aMTLeft[KNOB_SIMD16_WIDTH], aMTRight[KNOB_SIMD16_WIDTH], aMTTop[KNOB_SIMD16_WIDTH], aMTBottom[KNOB_SIMD16_WIDTH]; + + _simd16_store_si(reinterpret_cast(aMTLeft), bbox.xmin); + _simd16_store_si(reinterpret_cast(aMTRight), bbox.xmax); + _simd16_store_si(reinterpret_cast(aMTTop), bbox.ymin); + _simd16_store_si(reinterpret_cast(aMTBottom), bbox.ymax); + + // transpose verts needed for backend + /// @todo modify BE to take non-transformed verts + __m128 vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH + __m128 vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH + __m128 vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH + __m128 vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH + + const simdscalar unused = _simd_setzero_ps(); + + vTranspose3x8(vHorizX[0], _simd16_extract_ps(prim[0].x, 0), _simd16_extract_ps(prim[1].x, 0), unused); + vTranspose3x8(vHorizY[0], _simd16_extract_ps(prim[0].y, 0), _simd16_extract_ps(prim[1].y, 0), unused); + vTranspose3x8(vHorizZ[0], _simd16_extract_ps(prim[0].z, 0), _simd16_extract_ps(prim[1].z, 0), unused); + vTranspose3x8(vHorizW[0], _simd16_extract_ps(vRecipW0, 0), _simd16_extract_ps(vRecipW1, 0), unused); + + vTranspose3x8(vHorizX[1], _simd16_extract_ps(prim[0].x, 1), _simd16_extract_ps(prim[1].x, 1), unused); + vTranspose3x8(vHorizY[1], _simd16_extract_ps(prim[0].y, 1), _simd16_extract_ps(prim[1].y, 1), unused); + vTranspose3x8(vHorizZ[1], _simd16_extract_ps(prim[0].z, 1), _simd16_extract_ps(prim[1].z, 1), unused); + vTranspose3x8(vHorizW[1], _simd16_extract_ps(vRecipW0, 1), _simd16_extract_ps(vRecipW1, 1), unused); + + // store render target array index + OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH]; + if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) + { + simd16vector vRtai[2]; + pa.Assemble_simd16(VERTEX_RTAI_SLOT, vRtai); + simd16scalari vRtaii = _simd16_castps_si(vRtai[0].x); + _simd16_store_si(reinterpret_cast(aRTAI), vRtaii); + } + else + { + _simd16_store_si(reinterpret_cast(aRTAI), _simd16_setzero_si()); + } + + // scan remaining valid prims and bin each separately + DWORD primIndex; + while (_BitScanForward(&primIndex, primMask)) + { + uint32_t linkageCount = state.backendState.numAttributes; + uint32_t numScalarAttribs = linkageCount * 4; + + BE_WORK work; + work.type = DRAW; + + TRIANGLE_WORK_DESC &desc = work.desc.tri; + + desc.triFlags.frontFacing = 1; + desc.triFlags.primID = pPrimID[primIndex]; + desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1; + desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; + desc.triFlags.viewportIndex = pViewportIndex[primIndex]; + + work.pfnWork = RasterizeLine; + + auto pArena = pDC->pArena; + SWR_ASSERT(pArena != nullptr); + + // store active attribs + desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); + desc.numAttribs = linkageCount; + pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs); + + // store line vertex data + desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16); + + { + const uint32_t i = primIndex >> 3; // triIndex / KNOB_SIMD_WIDTH + const uint32_t j = primIndex & 7; // triIndex % KNOB_SIMD_WIDTH + + _mm_store_ps(&desc.pTriBuffer[ 0], vHorizX[i][j]); + _mm_store_ps(&desc.pTriBuffer[ 4], vHorizY[i][j]); + _mm_store_ps(&desc.pTriBuffer[ 8], vHorizZ[i][j]); + _mm_store_ps(&desc.pTriBuffer[12], vHorizW[i][j]); + } + + // store user clip distances + if (rastState.clipDistanceMask) + { + uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask); + desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float)); + ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer); + } + + MacroTileMgr *pTileMgr = pDC->pTileMgr; + for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y) + { + for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x) + { +#if KNOB_ENABLE_TOSS_POINTS + if (!KNOB_TOSS_SETUP_TRIS) +#endif + { + pTileMgr->enqueue(x, y, &work); + } + } + } + + primMask &= ~(1 << primIndex); + } + +endBinLines: + + AR_END(FEBinLines, 1); +} + +#endif ////////////////////////////////////////////////////////////////////////// /// @brief Bin SIMD lines to the backend. /// @param pDC - pointer to draw context. @@ -1622,34 +2719,63 @@ void BinLines_simd16( simd16scalari primID, simd16scalari viewportIdx) { - enum { VERTS_PER_PRIM = 2 }; + SWR_CONTEXT *pContext = pDC->pContext; - simdvector verts[VERTS_PER_PRIM]; + const API_STATE& state = GetApiState(pDC); + const SWR_RASTSTATE& rastState = state.rastState; + const SWR_FRONTEND_STATE& feState = state.frontendState; + const SWR_GS_STATE& gsState = state.gsState; - for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1) - { - for (uint32_t j = 0; j < 4; j += 1) - { - verts[i][j] = _simd16_extract_ps(prim[i][j], 0); - } - } + // Select attribute processor + PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2, + state.backendState.swizzleEnable, state.backendState.constantInterpolationMask); - pa.useAlternateOffset = false; - BinLines(pDC, pa, workerId, verts, GetPrimMaskLo(primMask), _simd16_extract_si(primID, 0), _simd16_extract_si(viewportIdx, 0)); + simd16scalar vRecipW[2] = { _simd16_set1_ps(1.0f), _simd16_set1_ps(1.0f) }; - if (GetPrimMaskHi(primMask)) + if (!feState.vpTransformDisable) { - for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1) + // perspective divide + vRecipW[0] = _simd16_div_ps(_simd16_set1_ps(1.0f), prim[0].w); + vRecipW[1] = _simd16_div_ps(_simd16_set1_ps(1.0f), prim[1].w); + + prim[0].v[0] = _simd16_mul_ps(prim[0].v[0], vRecipW[0]); + prim[1].v[0] = _simd16_mul_ps(prim[1].v[0], vRecipW[1]); + + prim[0].v[1] = _simd16_mul_ps(prim[0].v[1], vRecipW[0]); + prim[1].v[1] = _simd16_mul_ps(prim[1].v[1], vRecipW[1]); + + prim[0].v[2] = _simd16_mul_ps(prim[0].v[2], vRecipW[0]); + prim[1].v[2] = _simd16_mul_ps(prim[1].v[2], vRecipW[1]); + + // viewport transform to screen coords + if (state.gsState.emitsViewportArrayIndex) { - for (uint32_t j = 0; j < 4; j += 1) - { - verts[i][j] = _simd16_extract_ps(prim[i][j], 1); - } + viewportTransform<2>(prim, state.vpMatrices, viewportIdx); + } + else + { + viewportTransform<2>(prim, state.vpMatrices); } +} - pa.useAlternateOffset = true; - BinLines(pDC, pa, workerId, verts, GetPrimMaskHi(primMask), _simd16_extract_si(primID, 1), _simd16_extract_si(viewportIdx, 1)); - } + // adjust for pixel center location + simd16scalar offset = g_pixelOffsets_simd16[rastState.pixelLocation]; + + prim[0].x = _simd16_add_ps(prim[0].x, offset); + prim[0].y = _simd16_add_ps(prim[0].y, offset); + + prim[1].x = _simd16_add_ps(prim[1].x, offset); + prim[1].y = _simd16_add_ps(prim[1].y, offset); + + BinPostSetupLines_simd16( + pDC, + pa, + workerId, + prim, + vRecipW, + primMask, + primID, + viewportIdx); } #endif diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h index 37b7215c516..e880ead71dc 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.h +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h @@ -112,6 +112,23 @@ void triangleSetupABIntVertical(const simdscalari vX[3], const simdscalari vY[3] vB[1] = _simd_sub_epi32(vX[2], vX[1]); vB[2] = _simd_sub_epi32(vX[0], vX[2]); } + +#if ENABLE_AVX512_SIMD16 +INLINE +void triangleSetupABIntVertical(const simd16scalari vX[3], const simd16scalari vY[3], simd16scalari(&vA)[3], simd16scalari(&vB)[3]) +{ + // A = y0 - y1 + // B = x1 - x0 + vA[0] = _simd16_sub_epi32(vY[0], vY[1]); + vA[1] = _simd16_sub_epi32(vY[1], vY[2]); + vA[2] = _simd16_sub_epi32(vY[2], vY[0]); + + vB[0] = _simd16_sub_epi32(vX[1], vX[0]); + vB[1] = _simd16_sub_epi32(vX[2], vX[1]); + vB[2] = _simd16_sub_epi32(vX[0], vX[2]); +} + +#endif // Calculate the determinant of the triangle // 2 vectors between the 3 points: P, Q // Px = x0-x2, Py = y0-y2 @@ -185,6 +202,44 @@ void calcDeterminantIntVertical(const simdscalari vA[3], const simdscalari vB[3] pvDet[1] = vResultHi; } +#if ENABLE_AVX512_SIMD16 +INLINE +void calcDeterminantIntVertical(const simd16scalari vA[3], const simd16scalari vB[3], simd16scalari *pvDet) +{ + // refer to calcDeterminantInt comment for calculation explanation + // A1*B2 + simd16scalari vA1Lo = _simd16_unpacklo_epi32(vA[1], vA[1]); // 0 0 1 1 4 4 5 5 + simd16scalari vA1Hi = _simd16_unpackhi_epi32(vA[1], vA[1]); // 2 2 3 3 6 6 7 7 + + simd16scalari vB2Lo = _simd16_unpacklo_epi32(vB[2], vB[2]); + simd16scalari vB2Hi = _simd16_unpackhi_epi32(vB[2], vB[2]); + + simd16scalari vA1B2Lo = _simd16_mul_epi32(vA1Lo, vB2Lo); // 0 1 4 5 + simd16scalari vA1B2Hi = _simd16_mul_epi32(vA1Hi, vB2Hi); // 2 3 6 7 + + // B1*A2 + simd16scalari vA2Lo = _simd16_unpacklo_epi32(vA[2], vA[2]); + simd16scalari vA2Hi = _simd16_unpackhi_epi32(vA[2], vA[2]); + + simd16scalari vB1Lo = _simd16_unpacklo_epi32(vB[1], vB[1]); + simd16scalari vB1Hi = _simd16_unpackhi_epi32(vB[1], vB[1]); + + simd16scalari vA2B1Lo = _simd16_mul_epi32(vA2Lo, vB1Lo); + simd16scalari vA2B1Hi = _simd16_mul_epi32(vA2Hi, vB1Hi); + + // A1*B2 - A2*B1 + simd16scalari detLo = _simd16_sub_epi64(vA1B2Lo, vA2B1Lo); + simd16scalari detHi = _simd16_sub_epi64(vA1B2Hi, vA2B1Hi); + + // shuffle 0 1 4 5 -> 0 1 2 3 + simd16scalari vResultLo = _simd16_permute2f128_si(detLo, detHi, 0x20); + simd16scalari vResultHi = _simd16_permute2f128_si(detLo, detHi, 0x31); + + pvDet[0] = vResultLo; + pvDet[1] = vResultHi; +} + +#endif INLINE void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128 &vB, __m128 &vC) { @@ -227,6 +282,27 @@ void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices) } } +#if USE_SIMD16_FRONTEND +template +INLINE +void viewportTransform(simd16vector *v, const SWR_VIEWPORT_MATRICES & vpMatrices) +{ + const simd16scalar m00 = _simd16_broadcast_ss(&vpMatrices.m00[0]); + const simd16scalar m30 = _simd16_broadcast_ss(&vpMatrices.m30[0]); + const simd16scalar m11 = _simd16_broadcast_ss(&vpMatrices.m11[0]); + const simd16scalar m31 = _simd16_broadcast_ss(&vpMatrices.m31[0]); + const simd16scalar m22 = _simd16_broadcast_ss(&vpMatrices.m22[0]); + const simd16scalar m32 = _simd16_broadcast_ss(&vpMatrices.m32[0]); + + for (uint32_t i = 0; i < NumVerts; ++i) + { + v[i].x = _simd16_fmadd_ps(v[i].x, m00, m30); + v[i].y = _simd16_fmadd_ps(v[i].y, m11, m31); + v[i].z = _simd16_fmadd_ps(v[i].z, m22, m32); + } +} + +#endif template INLINE void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simdscalari vViewportIdx) @@ -247,6 +323,28 @@ void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, } } +#if USE_SIMD16_FRONTEND +template +INLINE +void viewportTransform(simd16vector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simd16scalari vViewportIdx) +{ + // perform a gather of each matrix element based on the viewport array indexes + const simd16scalar m00 = _simd16_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4); + const simd16scalar m30 = _simd16_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 4); + const simd16scalar m11 = _simd16_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 4); + const simd16scalar m31 = _simd16_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 4); + const simd16scalar m22 = _simd16_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 4); + const simd16scalar m32 = _simd16_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 4); + + for (uint32_t i = 0; i < NumVerts; ++i) + { + v[i].x = _simd16_fmadd_ps(v[i].x, m00, m30); + v[i].y = _simd16_fmadd_ps(v[i].y, m11, m31); + v[i].z = _simd16_fmadd_ps(v[i].z, m22, m32); + } +} + +#endif INLINE void calcBoundingBoxInt(const __m128i &vX, const __m128i &vY, SWR_RECT &bbox) { diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h index 9dfa16a529e..660a63fe4e3 100644 --- a/src/gallium/drivers/swr/rasterizer/core/utils.h +++ b/src/gallium/drivers/swr/rasterizer/core/utils.h @@ -83,6 +83,16 @@ struct simdBBox simdscalari xmax; }; +#if ENABLE_AVX512_SIMD16 +struct simd16BBox +{ + simd16scalari ymin; + simd16scalari ymax; + simd16scalari xmin; + simd16scalari xmax; +}; + +#endif INLINE void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3) { -- 2.30.2