From: Tim Rowley Date: Fri, 4 Aug 2017 23:07:01 +0000 (-0500) Subject: swr/rast: FE/Binner - unify SIMD8/16 functions using simdlib types X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=6cb20c9f3a327fe3c1a99d6824632aea238d7d72;p=mesa.git swr/rast: FE/Binner - unify SIMD8/16 functions using simdlib types Reviewed-by: Bruce Cherniak --- diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp index 9fe1b017117..e09ff7a3995 100644 --- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp @@ -36,13 +36,26 @@ #include "tilemgr.h" // Function Prototype -void BinPostSetupLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], simdscalar vRecipW[2], uint32_t primMask, simdscalari const &primID, simdscalari const &viewportIdx); -void BinPostSetupPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primID, simdscalari const &viewportIdx); +template +void BinPostSetupLinesImpl( + DRAW_CONTEXT *pDC, + PA_STATE &pa, + uint32_t workerId, + typename SIMD_T::Vec4 prim[], + typename SIMD_T::Float recipW[], + uint32_t primMask, + typename SIMD_T::Integer const &primID, + typename SIMD_T::Integer const &viewportIdx); -#if USE_SIMD16_FRONTEND -void BinPostSetupLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], simd16scalar vRecipW[2], uint32_t primMask, simd16scalari const &primID, simd16scalari const &viewportIdx); -void BinPostSetupPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primID, simd16scalari const &viewportIdx); -#endif +template +void BinPostSetupPointsImpl( + DRAW_CONTEXT *pDC, + PA_STATE &pa, + uint32_t workerId, + typename SIMD_T::Vec4 prim[], + uint32_t primMask, + typename SIMD_T::Integer const &primID, + typename SIMD_T::Integer const 
&viewportIdx); ////////////////////////////////////////////////////////////////////////// /// @brief Processes attributes for the backend based on linkage mask and @@ -209,148 +222,123 @@ INLINE void ProcessAttributes( /// @param scisYmax - output vector of per-prmitive scissor rect Ymax data. // /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. -template -struct GatherScissors -{ - static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex, - simdscalari &scisXmin, simdscalari &scisYmin, - simdscalari &scisXmax, simdscalari &scisYmax) - { - SWR_INVALID("Unhandled Simd Width in Scissor Rect Gather"); - } -}; - -template<> -struct GatherScissors<8> -{ - static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex, - simdscalari &scisXmin, simdscalari &scisYmin, - simdscalari &scisXmax, simdscalari &scisYmax) - { - scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin, - pScissorsInFixedPoint[pViewportIndex[1]].xmin, - pScissorsInFixedPoint[pViewportIndex[2]].xmin, - pScissorsInFixedPoint[pViewportIndex[3]].xmin, - pScissorsInFixedPoint[pViewportIndex[4]].xmin, - pScissorsInFixedPoint[pViewportIndex[5]].xmin, - pScissorsInFixedPoint[pViewportIndex[6]].xmin, - pScissorsInFixedPoint[pViewportIndex[7]].xmin); - scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin, - pScissorsInFixedPoint[pViewportIndex[1]].ymin, - pScissorsInFixedPoint[pViewportIndex[2]].ymin, - pScissorsInFixedPoint[pViewportIndex[3]].ymin, - pScissorsInFixedPoint[pViewportIndex[4]].ymin, - pScissorsInFixedPoint[pViewportIndex[5]].ymin, - pScissorsInFixedPoint[pViewportIndex[6]].ymin, - pScissorsInFixedPoint[pViewportIndex[7]].ymin); - scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax, - pScissorsInFixedPoint[pViewportIndex[1]].xmax, - pScissorsInFixedPoint[pViewportIndex[2]].xmax, - pScissorsInFixedPoint[pViewportIndex[3]].xmax, - 
pScissorsInFixedPoint[pViewportIndex[4]].xmax, - pScissorsInFixedPoint[pViewportIndex[5]].xmax, - pScissorsInFixedPoint[pViewportIndex[6]].xmax, - pScissorsInFixedPoint[pViewportIndex[7]].xmax); - scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax, - pScissorsInFixedPoint[pViewportIndex[1]].ymax, - pScissorsInFixedPoint[pViewportIndex[2]].ymax, - pScissorsInFixedPoint[pViewportIndex[3]].ymax, - pScissorsInFixedPoint[pViewportIndex[4]].ymax, - pScissorsInFixedPoint[pViewportIndex[5]].ymax, - pScissorsInFixedPoint[pViewportIndex[6]].ymax, - pScissorsInFixedPoint[pViewportIndex[7]].ymax); - } -}; - -#if USE_SIMD16_FRONTEND -template -struct GatherScissors_simd16 +static void GatherScissors(const SWR_RECT *pScissorsInFixedPoint, const uint32_t *pViewportIndex, + simdscalari &scisXmin, simdscalari &scisYmin, simdscalari &scisXmax, simdscalari &scisYmax) { - static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex, - simd16scalari &scisXmin, simd16scalari &scisYmin, - simd16scalari &scisXmax, simd16scalari &scisYmax) - { - SWR_INVALID("Unhandled Simd Width in Scissor Rect Gather"); - } -}; + scisXmin = _simd_set_epi32( + pScissorsInFixedPoint[pViewportIndex[0]].xmin, + pScissorsInFixedPoint[pViewportIndex[1]].xmin, + pScissorsInFixedPoint[pViewportIndex[2]].xmin, + pScissorsInFixedPoint[pViewportIndex[3]].xmin, + pScissorsInFixedPoint[pViewportIndex[4]].xmin, + pScissorsInFixedPoint[pViewportIndex[5]].xmin, + pScissorsInFixedPoint[pViewportIndex[6]].xmin, + pScissorsInFixedPoint[pViewportIndex[7]].xmin); + scisYmin = _simd_set_epi32( + pScissorsInFixedPoint[pViewportIndex[0]].ymin, + pScissorsInFixedPoint[pViewportIndex[1]].ymin, + pScissorsInFixedPoint[pViewportIndex[2]].ymin, + pScissorsInFixedPoint[pViewportIndex[3]].ymin, + pScissorsInFixedPoint[pViewportIndex[4]].ymin, + pScissorsInFixedPoint[pViewportIndex[5]].ymin, + pScissorsInFixedPoint[pViewportIndex[6]].ymin, + 
pScissorsInFixedPoint[pViewportIndex[7]].ymin); + scisXmax = _simd_set_epi32( + pScissorsInFixedPoint[pViewportIndex[0]].xmax, + pScissorsInFixedPoint[pViewportIndex[1]].xmax, + pScissorsInFixedPoint[pViewportIndex[2]].xmax, + pScissorsInFixedPoint[pViewportIndex[3]].xmax, + pScissorsInFixedPoint[pViewportIndex[4]].xmax, + pScissorsInFixedPoint[pViewportIndex[5]].xmax, + pScissorsInFixedPoint[pViewportIndex[6]].xmax, + pScissorsInFixedPoint[pViewportIndex[7]].xmax); + scisYmax = _simd_set_epi32( + pScissorsInFixedPoint[pViewportIndex[0]].ymax, + pScissorsInFixedPoint[pViewportIndex[1]].ymax, + pScissorsInFixedPoint[pViewportIndex[2]].ymax, + pScissorsInFixedPoint[pViewportIndex[3]].ymax, + pScissorsInFixedPoint[pViewportIndex[4]].ymax, + pScissorsInFixedPoint[pViewportIndex[5]].ymax, + pScissorsInFixedPoint[pViewportIndex[6]].ymax, + pScissorsInFixedPoint[pViewportIndex[7]].ymax); +} -template<> -struct GatherScissors_simd16<16> +static void GatherScissors(const SWR_RECT *pScissorsInFixedPoint, const uint32_t *pViewportIndex, + simd16scalari &scisXmin, simd16scalari &scisYmin, simd16scalari &scisXmax, simd16scalari &scisYmax) { - static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex, - simd16scalari &scisXmin, simd16scalari &scisYmin, - simd16scalari &scisXmax, simd16scalari &scisYmax) { - scisXmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin, - pScissorsInFixedPoint[pViewportIndex[1]].xmin, - pScissorsInFixedPoint[pViewportIndex[2]].xmin, - pScissorsInFixedPoint[pViewportIndex[3]].xmin, - pScissorsInFixedPoint[pViewportIndex[4]].xmin, - pScissorsInFixedPoint[pViewportIndex[5]].xmin, - pScissorsInFixedPoint[pViewportIndex[6]].xmin, - pScissorsInFixedPoint[pViewportIndex[7]].xmin, - pScissorsInFixedPoint[pViewportIndex[8]].xmin, - pScissorsInFixedPoint[pViewportIndex[9]].xmin, - pScissorsInFixedPoint[pViewportIndex[10]].xmin, - pScissorsInFixedPoint[pViewportIndex[11]].xmin, - 
pScissorsInFixedPoint[pViewportIndex[12]].xmin, - pScissorsInFixedPoint[pViewportIndex[13]].xmin, - pScissorsInFixedPoint[pViewportIndex[14]].xmin, - pScissorsInFixedPoint[pViewportIndex[15]].xmin); - - scisYmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin, - pScissorsInFixedPoint[pViewportIndex[1]].ymin, - pScissorsInFixedPoint[pViewportIndex[2]].ymin, - pScissorsInFixedPoint[pViewportIndex[3]].ymin, - pScissorsInFixedPoint[pViewportIndex[4]].ymin, - pScissorsInFixedPoint[pViewportIndex[5]].ymin, - pScissorsInFixedPoint[pViewportIndex[6]].ymin, - pScissorsInFixedPoint[pViewportIndex[7]].ymin, - pScissorsInFixedPoint[pViewportIndex[8]].ymin, - pScissorsInFixedPoint[pViewportIndex[9]].ymin, - pScissorsInFixedPoint[pViewportIndex[10]].ymin, - pScissorsInFixedPoint[pViewportIndex[11]].ymin, - pScissorsInFixedPoint[pViewportIndex[12]].ymin, - pScissorsInFixedPoint[pViewportIndex[13]].ymin, - pScissorsInFixedPoint[pViewportIndex[14]].ymin, - pScissorsInFixedPoint[pViewportIndex[15]].ymin); - - scisXmax = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax, - pScissorsInFixedPoint[pViewportIndex[1]].xmax, - pScissorsInFixedPoint[pViewportIndex[2]].xmax, - pScissorsInFixedPoint[pViewportIndex[3]].xmax, - pScissorsInFixedPoint[pViewportIndex[4]].xmax, - pScissorsInFixedPoint[pViewportIndex[5]].xmax, - pScissorsInFixedPoint[pViewportIndex[6]].xmax, - pScissorsInFixedPoint[pViewportIndex[7]].xmax, - pScissorsInFixedPoint[pViewportIndex[8]].xmax, - pScissorsInFixedPoint[pViewportIndex[9]].xmax, - pScissorsInFixedPoint[pViewportIndex[10]].xmax, - pScissorsInFixedPoint[pViewportIndex[11]].xmax, - pScissorsInFixedPoint[pViewportIndex[12]].xmax, - pScissorsInFixedPoint[pViewportIndex[13]].xmax, - pScissorsInFixedPoint[pViewportIndex[14]].xmax, - pScissorsInFixedPoint[pViewportIndex[15]].xmax); - - scisYmax = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax, - pScissorsInFixedPoint[pViewportIndex[1]].ymax, - 
pScissorsInFixedPoint[pViewportIndex[2]].ymax, - pScissorsInFixedPoint[pViewportIndex[3]].ymax, - pScissorsInFixedPoint[pViewportIndex[4]].ymax, - pScissorsInFixedPoint[pViewportIndex[5]].ymax, - pScissorsInFixedPoint[pViewportIndex[6]].ymax, - pScissorsInFixedPoint[pViewportIndex[7]].ymax, - pScissorsInFixedPoint[pViewportIndex[8]].ymax, - pScissorsInFixedPoint[pViewportIndex[9]].ymax, - pScissorsInFixedPoint[pViewportIndex[10]].ymax, - pScissorsInFixedPoint[pViewportIndex[11]].ymax, - pScissorsInFixedPoint[pViewportIndex[12]].ymax, - pScissorsInFixedPoint[pViewportIndex[13]].ymax, - pScissorsInFixedPoint[pViewportIndex[14]].ymax, - pScissorsInFixedPoint[pViewportIndex[15]].ymax); - } -}; + scisXmin = _simd16_set_epi32( + pScissorsInFixedPoint[pViewportIndex[0]].xmin, + pScissorsInFixedPoint[pViewportIndex[1]].xmin, + pScissorsInFixedPoint[pViewportIndex[2]].xmin, + pScissorsInFixedPoint[pViewportIndex[3]].xmin, + pScissorsInFixedPoint[pViewportIndex[4]].xmin, + pScissorsInFixedPoint[pViewportIndex[5]].xmin, + pScissorsInFixedPoint[pViewportIndex[6]].xmin, + pScissorsInFixedPoint[pViewportIndex[7]].xmin, + pScissorsInFixedPoint[pViewportIndex[8]].xmin, + pScissorsInFixedPoint[pViewportIndex[9]].xmin, + pScissorsInFixedPoint[pViewportIndex[10]].xmin, + pScissorsInFixedPoint[pViewportIndex[11]].xmin, + pScissorsInFixedPoint[pViewportIndex[12]].xmin, + pScissorsInFixedPoint[pViewportIndex[13]].xmin, + pScissorsInFixedPoint[pViewportIndex[14]].xmin, + pScissorsInFixedPoint[pViewportIndex[15]].xmin); + + scisYmin = _simd16_set_epi32( + pScissorsInFixedPoint[pViewportIndex[0]].ymin, + pScissorsInFixedPoint[pViewportIndex[1]].ymin, + pScissorsInFixedPoint[pViewportIndex[2]].ymin, + pScissorsInFixedPoint[pViewportIndex[3]].ymin, + pScissorsInFixedPoint[pViewportIndex[4]].ymin, + pScissorsInFixedPoint[pViewportIndex[5]].ymin, + pScissorsInFixedPoint[pViewportIndex[6]].ymin, + pScissorsInFixedPoint[pViewportIndex[7]].ymin, + pScissorsInFixedPoint[pViewportIndex[8]].ymin, + 
pScissorsInFixedPoint[pViewportIndex[9]].ymin, + pScissorsInFixedPoint[pViewportIndex[10]].ymin, + pScissorsInFixedPoint[pViewportIndex[11]].ymin, + pScissorsInFixedPoint[pViewportIndex[12]].ymin, + pScissorsInFixedPoint[pViewportIndex[13]].ymin, + pScissorsInFixedPoint[pViewportIndex[14]].ymin, + pScissorsInFixedPoint[pViewportIndex[15]].ymin); + + scisXmax = _simd16_set_epi32( + pScissorsInFixedPoint[pViewportIndex[0]].xmax, + pScissorsInFixedPoint[pViewportIndex[1]].xmax, + pScissorsInFixedPoint[pViewportIndex[2]].xmax, + pScissorsInFixedPoint[pViewportIndex[3]].xmax, + pScissorsInFixedPoint[pViewportIndex[4]].xmax, + pScissorsInFixedPoint[pViewportIndex[5]].xmax, + pScissorsInFixedPoint[pViewportIndex[6]].xmax, + pScissorsInFixedPoint[pViewportIndex[7]].xmax, + pScissorsInFixedPoint[pViewportIndex[8]].xmax, + pScissorsInFixedPoint[pViewportIndex[9]].xmax, + pScissorsInFixedPoint[pViewportIndex[10]].xmax, + pScissorsInFixedPoint[pViewportIndex[11]].xmax, + pScissorsInFixedPoint[pViewportIndex[12]].xmax, + pScissorsInFixedPoint[pViewportIndex[13]].xmax, + pScissorsInFixedPoint[pViewportIndex[14]].xmax, + pScissorsInFixedPoint[pViewportIndex[15]].xmax); + + scisYmax = _simd16_set_epi32( + pScissorsInFixedPoint[pViewportIndex[0]].ymax, + pScissorsInFixedPoint[pViewportIndex[1]].ymax, + pScissorsInFixedPoint[pViewportIndex[2]].ymax, + pScissorsInFixedPoint[pViewportIndex[3]].ymax, + pScissorsInFixedPoint[pViewportIndex[4]].ymax, + pScissorsInFixedPoint[pViewportIndex[5]].ymax, + pScissorsInFixedPoint[pViewportIndex[6]].ymax, + pScissorsInFixedPoint[pViewportIndex[7]].ymax, + pScissorsInFixedPoint[pViewportIndex[8]].ymax, + pScissorsInFixedPoint[pViewportIndex[9]].ymax, + pScissorsInFixedPoint[pViewportIndex[10]].ymax, + pScissorsInFixedPoint[pViewportIndex[11]].ymax, + pScissorsInFixedPoint[pViewportIndex[12]].ymax, + pScissorsInFixedPoint[pViewportIndex[13]].ymax, + pScissorsInFixedPoint[pViewportIndex[14]].ymax, + pScissorsInFixedPoint[pViewportIndex[15]].ymax); 
+} -#endif typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*); struct ProcessAttributesChooser @@ -416,6 +404,47 @@ void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, } } +// WA linux compiler issue with SIMDLIB and shift immediates +#define SIMD_WA_SXXI_EPI32 1 + +#if SIMD_WA_SXXI_EPI32 +template +simdscalari simd_wa_slli_epi32(simdscalari a) +{ + return SIMD256::slli_epi32(a); +} + +template +simd16scalari simd_wa_slli_epi32(simd16scalari a) +{ + return SIMD512::slli_epi32(a); +} + +template +simdscalari simd_wa_srai_epi32(simdscalari a) +{ + return SIMD256::srai_epi32(a); +} + +template +simd16scalari simd_wa_srai_epi32(simd16scalari a) +{ + return SIMD512::srai_epi32(a); +} + +#endif +INLINE +void TransposeVertices(simd4scalar(&dst)[8], const simdscalar &src0, const simdscalar &src1, const simdscalar &src2) +{ + vTranspose3x8(dst, src0, src1, src2); +} + +INLINE +void TransposeVertices(simd4scalar(&dst)[16], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2) +{ + vTranspose4x16(reinterpret_cast(dst), src0, src1, src2, _simd16_setzero_ps()); +} + ////////////////////////////////////////////////////////////////////////// /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping /// culling, viewport transform, etc. @@ -426,14 +455,14 @@ void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, /// @param primID - Primitive ID for each triangle. /// @param viewportIdx - viewport array index for each triangle. 
/// @tparam CT - ConservativeRastFETraits -template -void BinTriangles( +template +void SIMDCALL BinTrianglesImpl( DRAW_CONTEXT *pDC, - PA_STATE& pa, + PA_STATE &pa, uint32_t workerId, - simdvector tri[3], + typename SIMD_T::Vec4 tri[3], uint32_t triMask, - simdscalari const &primID) + typename SIMD_T::Integer const &primID) { SWR_CONTEXT *pContext = pDC->pContext; @@ -442,25 +471,26 @@ void BinTriangles( const API_STATE& state = GetApiState(pDC); const SWR_RASTSTATE& rastState = state.rastState; const SWR_FRONTEND_STATE& feState = state.frontendState; + MacroTileMgr *pTileMgr = pDC->pTileMgr; - simdscalar vRecipW0 = _simd_set1_ps(1.0f); - simdscalar vRecipW1 = _simd_set1_ps(1.0f); - simdscalar vRecipW2 = _simd_set1_ps(1.0f); + typename SIMD_T::Float vRecipW0 = SIMD_T::set1_ps(1.0f); + typename SIMD_T::Float vRecipW1 = SIMD_T::set1_ps(1.0f); + typename SIMD_T::Float vRecipW2 = SIMD_T::set1_ps(1.0f); + + typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0); - // Read viewport array index if needed - simdscalari viewportIdx = _simd_set1_epi32(0); if (state.backendState.readViewportArrayIndex) { - simdvector vpiAttrib[3]; + typename SIMD_T::Vec4 vpiAttrib[3]; pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib); // OOB indices => forced to zero. 
- simdscalari vpai = _simd_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); - vpai = _simd_max_epi32(_simd_setzero_si(), vpai); - simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); - simdscalari vClearMask = _simd_cmplt_epi32(vpai, vNumViewports); - viewportIdx = _simd_and_si(vClearMask, vpai); + typename SIMD_T::Integer vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); + vpai = SIMD_T::max_epi32(SIMD_T::setzero_si(), vpai); + typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); + typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports); + viewportIdx = SIMD_T::and_si(vClearMask, vpai); } if (feState.vpTransformDisable) @@ -473,21 +503,21 @@ void BinTriangles( else { // Perspective divide - vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w); - vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w); - vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w); + vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[0].w); + vRecipW1 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[1].w); + vRecipW2 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[2].w); - tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0); - tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1); - tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2); + tri[0].v[0] = SIMD_T::mul_ps(tri[0].v[0], vRecipW0); + tri[1].v[0] = SIMD_T::mul_ps(tri[1].v[0], vRecipW1); + tri[2].v[0] = SIMD_T::mul_ps(tri[2].v[0], vRecipW2); - tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0); - tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1); - tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2); + tri[0].v[1] = SIMD_T::mul_ps(tri[0].v[1], vRecipW0); + tri[1].v[1] = SIMD_T::mul_ps(tri[1].v[1], vRecipW1); + tri[2].v[1] = SIMD_T::mul_ps(tri[2].v[1], vRecipW2); - tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0); - tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1); - tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2); + tri[0].v[2] = SIMD_T::mul_ps(tri[0].v[2], 
vRecipW0); + tri[1].v[2] = SIMD_T::mul_ps(tri[1].v[2], vRecipW1); + tri[2].v[2] = SIMD_T::mul_ps(tri[2].v[2], vRecipW2); // Viewport transform to screen space coords if (state.backendState.readViewportArrayIndex) @@ -501,36 +531,37 @@ void BinTriangles( } // Adjust for pixel center location - simdscalar offset = g_pixelOffsets[rastState.pixelLocation]; - tri[0].x = _simd_add_ps(tri[0].x, offset); - tri[0].y = _simd_add_ps(tri[0].y, offset); + typename SIMD_T::Float offset = g_pixelOffsets[rastState.pixelLocation]; + + tri[0].x = SIMD_T::add_ps(tri[0].x, offset); + tri[0].y = SIMD_T::add_ps(tri[0].y, offset); - tri[1].x = _simd_add_ps(tri[1].x, offset); - tri[1].y = _simd_add_ps(tri[1].y, offset); + tri[1].x = SIMD_T::add_ps(tri[1].x, offset); + tri[1].y = SIMD_T::add_ps(tri[1].y, offset); - tri[2].x = _simd_add_ps(tri[2].x, offset); - tri[2].y = _simd_add_ps(tri[2].y, offset); + tri[2].x = SIMD_T::add_ps(tri[2].x, offset); + tri[2].y = SIMD_T::add_ps(tri[2].y, offset); - simdscalari vXi[3], vYi[3]; // Set vXi, vYi to required fixed point precision - FPToFixedPoint(tri, vXi, vYi); + typename SIMD_T::Integer vXi[3], vYi[3]; + FPToFixedPoint(tri, vXi, vYi); // triangle setup - simdscalari vAi[3], vBi[3]; + typename SIMD_T::Integer vAi[3], vBi[3]; triangleSetupABIntVertical(vXi, vYi, vAi, vBi); // determinant - simdscalari vDet[2]; + typename SIMD_T::Integer vDet[2]; calcDeterminantIntVertical(vAi, vBi, vDet); // cull zero area - int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si()))); - int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si()))); + uint32_t maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[0], SIMD_T::setzero_si()))); + uint32_t maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[1], SIMD_T::setzero_si()))); - int cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2)); + uint32_t cullZeroAreaMask = maskLo | (maskHi << 
(SIMD_WIDTH / 2)); - uint32_t origTriMask = triMask; // don't cull degenerate triangles if we're conservatively rasterizing + uint32_t origTriMask = triMask; if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value) { triMask &= ~cullZeroAreaMask; @@ -544,15 +575,15 @@ void BinTriangles( uint32_t frontWindingTris; if (rastState.frontWinding == SWR_FRONTWINDING_CW) { - maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si()))); - maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si()))); + maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si()))); + maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si()))); } else { - maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(_simd_setzero_si(), vDet[0]))); - maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(_simd_setzero_si(), vDet[1]))); + maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[0]))); + maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[1]))); } - frontWindingTris = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2)); + frontWindingTris = maskLo | (maskHi << (SIMD_WIDTH / 2)); // cull uint32_t cullTris; @@ -579,6 +610,7 @@ void BinTriangles( uint32_t *pPrimID = (uint32_t *)&primID; const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; DWORD triIndex = 0; + uint32_t edgeEnable; PFN_WORK_FUNC pfnWork; if (CT::IsConservativeT::value) @@ -588,14 +620,16 @@ void BinTriangles( if (cullZeroAreaMask > 0) { // e0 = v1-v0 - simdscalari x0x1Mask = _simd_cmpeq_epi32(vXi[0], vXi[1]); - simdscalari y0y1Mask = _simd_cmpeq_epi32(vYi[0], vYi[1]); - uint32_t e0Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask, y0y1Mask))); + const typename SIMD_T::Integer x0x1Mask = SIMD_T::cmpeq_epi32(vXi[0], vXi[1]); + const typename SIMD_T::Integer y0y1Mask = 
SIMD_T::cmpeq_epi32(vYi[0], vYi[1]); + + uint32_t e0Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x0x1Mask, y0y1Mask))); // e1 = v2-v1 - simdscalari x1x2Mask = _simd_cmpeq_epi32(vXi[1], vXi[2]); - simdscalari y1y2Mask = _simd_cmpeq_epi32(vYi[1], vYi[2]); - uint32_t e1Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask, y1y2Mask))); + const typename SIMD_T::Integer x1x2Mask = SIMD_T::cmpeq_epi32(vXi[1], vXi[2]); + const typename SIMD_T::Integer y1y2Mask = SIMD_T::cmpeq_epi32(vYi[1], vYi[2]); + + uint32_t e1Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x1x2Mask, y1y2Mask))); // e2 = v0-v2 // if v0 == v1 & v1 == v2, v0 == v2 @@ -605,8 +639,10 @@ void BinTriangles( // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001 e0Mask = pdep_u32(e0Mask, 0x00249249); + // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010 e1Mask = pdep_u32(e1Mask, 0x00492492); + // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100 e2Mask = pdep_u32(e2Mask, 0x00924924); @@ -620,11 +656,11 @@ void BinTriangles( else { // degenerate triangles won't be sent to rasterizer; just enable all edges - pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0), + pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0), (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false)); } - simdBBox bbox; + SIMDBBOX_T bbox; if (!triMask) { @@ -632,34 +668,36 @@ void BinTriangles( } // Calc bounding box of triangles - calcBoundingBoxIntVertical(tri, vXi, vYi, bbox); + calcBoundingBoxIntVertical(vXi, vYi, bbox); // determine if triangle falls between pixel centers and discard // only discard for non-MSAA case and when conservative rast is disabled // (xmin + 127) & ~255 // (xmax + 128) & ~255 - if((rastState.sampleCount == 
SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) && + if ((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) && (!CT::IsConservativeT::value)) { origTriMask = triMask; int cullCenterMask; + { - simdscalari xmin = _simd_add_epi32(bbox.xmin, _simd_set1_epi32(127)); - xmin = _simd_and_si(xmin, _simd_set1_epi32(~255)); - simdscalari xmax = _simd_add_epi32(bbox.xmax, _simd_set1_epi32(128)); - xmax = _simd_and_si(xmax, _simd_set1_epi32(~255)); + typename SIMD_T::Integer xmin = SIMD_T::add_epi32(bbox.xmin, SIMD_T::set1_epi32(127)); + xmin = SIMD_T::and_si(xmin, SIMD_T::set1_epi32(~255)); + typename SIMD_T::Integer xmax = SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(128)); + xmax = SIMD_T::and_si(xmax, SIMD_T::set1_epi32(~255)); - simdscalari vMaskH = _simd_cmpeq_epi32(xmin, xmax); + typename SIMD_T::Integer vMaskH = SIMD_T::cmpeq_epi32(xmin, xmax); - simdscalari ymin = _simd_add_epi32(bbox.ymin, _simd_set1_epi32(127)); - ymin = _simd_and_si(ymin, _simd_set1_epi32(~255)); - simdscalari ymax = _simd_add_epi32(bbox.ymax, _simd_set1_epi32(128)); - ymax = _simd_and_si(ymax, _simd_set1_epi32(~255)); + typename SIMD_T::Integer ymin = SIMD_T::add_epi32(bbox.ymin, SIMD_T::set1_epi32(127)); + ymin = SIMD_T::and_si(ymin, SIMD_T::set1_epi32(~255)); + typename SIMD_T::Integer ymax = SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(128)); + ymax = SIMD_T::and_si(ymax, SIMD_T::set1_epi32(~255)); - simdscalari vMaskV = _simd_cmpeq_epi32(ymin, ymax); - vMaskV = _simd_or_si(vMaskH, vMaskV); - cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV)); + typename SIMD_T::Integer vMaskV = SIMD_T::cmpeq_epi32(ymin, ymax); + + vMaskV = SIMD_T::or_si(vMaskH, vMaskV); + cullCenterMask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vMaskV)); } triMask &= ~cullCenterMask; @@ -674,75 +712,82 @@ void BinTriangles( // Gather the AOS effective scissor rects based on the per-prim VP index. /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. 
{ - simdscalari scisXmin, scisYmin, scisXmax, scisYmax; + typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax; + if (state.backendState.readViewportArrayIndex) { - GatherScissors::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, - scisXmin, scisYmin, scisXmax, scisYmax); + GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax); } else // broadcast fast path for non-VPAI case. { - scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin); - scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin); - scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax); - scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax); + scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin); + scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin); + scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax); + scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax); } // Make triangle bbox inclusive - bbox.xmax = _simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)); - bbox.ymax = _simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)); + bbox.xmax = SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)); + bbox.ymax = SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)); - bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin); - bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin); - bbox.xmax = _simd_min_epi32(bbox.xmax, scisXmax); - bbox.ymax = _simd_min_epi32(bbox.ymax, scisYmax); + bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin); + bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin); + bbox.xmax = SIMD_T::min_epi32(bbox.xmax, scisXmax); + bbox.ymax = SIMD_T::min_epi32(bbox.ymax, scisYmax); } if (CT::IsConservativeT::value) { // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has // some area. 
Bump the xmax/ymax edges out - simdscalari topEqualsBottom = _simd_cmpeq_epi32(bbox.ymin, bbox.ymax); - bbox.ymax = _simd_blendv_epi32(bbox.ymax, _simd_add_epi32(bbox.ymax, _simd_set1_epi32(1)), topEqualsBottom); - simdscalari leftEqualsRight = _simd_cmpeq_epi32(bbox.xmin, bbox.xmax); - bbox.xmax = _simd_blendv_epi32(bbox.xmax, _simd_add_epi32(bbox.xmax, _simd_set1_epi32(1)), leftEqualsRight); + + typename SIMD_T::Integer topEqualsBottom = SIMD_T::cmpeq_epi32(bbox.ymin, bbox.ymax); + bbox.ymax = SIMD_T::blendv_epi32(bbox.ymax, SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), topEqualsBottom); + + typename SIMD_T::Integer leftEqualsRight = SIMD_T::cmpeq_epi32(bbox.xmin, bbox.xmax); + bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), leftEqualsRight); } // Cull tris completely outside scissor { - simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax); - simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax); - simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY); - uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY)); + typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax); + typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax); + typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY); + uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY)); triMask = triMask & ~maskOutsideScissor; } endBinTriangles: + // Send surviving triangles to the line or point binner based on fill mode if (rastState.fillMode == SWR_FILLMODE_WIREFRAME) { - // Simple non-conformant wireframe mode, useful for debugging. 
- // Construct 3 SIMD lines out of the triangle and call the line binner for each SIMD - simdvector line[2]; - simdscalar recipW[2]; + // Simple non-conformant wireframe mode, useful for debugging + // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD + typename SIMD_T::Vec4 line[2]; + typename SIMD_T::Float recipW[2]; + line[0] = tri[0]; line[1] = tri[1]; recipW[0] = vRecipW0; recipW[1] = vRecipW1; - BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx); + + BinPostSetupLinesImpl(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx); line[0] = tri[1]; line[1] = tri[2]; recipW[0] = vRecipW1; recipW[1] = vRecipW2; - BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx); + + BinPostSetupLinesImpl(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx); line[0] = tri[2]; line[1] = tri[0]; recipW[0] = vRecipW2; recipW[1] = vRecipW0; - BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx); + + BinPostSetupLinesImpl(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx); AR_END(FEBinTriangles, 1); return; @@ -750,45 +795,59 @@ endBinTriangles: else if (rastState.fillMode == SWR_FILLMODE_POINT) { // Bin 3 points - BinPostSetupPoints(pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx); - BinPostSetupPoints(pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx); - BinPostSetupPoints(pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx); + BinPostSetupPointsImpl(pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx); + BinPostSetupPointsImpl(pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx); + BinPostSetupPointsImpl(pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx); + + AR_END(FEBinTriangles, 1); return; } // Convert triangle bbox to macrotile units. 
- bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); - bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); - bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); - bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); +#if SIMD_WA_SXXI_EPI32 + bbox.xmin = simd_wa_srai_epi32(bbox.xmin); + bbox.ymin = simd_wa_srai_epi32(bbox.ymin); + bbox.xmax = simd_wa_srai_epi32(bbox.xmax); + bbox.ymax = simd_wa_srai_epi32(bbox.ymax); +#else + bbox.xmin = SIMD_T::srai_epi32(bbox.xmin); + bbox.ymin = SIMD_T::srai_epi32(bbox.ymin); + bbox.xmax = SIMD_T::srai_epi32(bbox.xmax); + bbox.ymax = SIMD_T::srai_epi32(bbox.ymax); +#endif - OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH]; - _simd_store_si((simdscalari*)aMTLeft, bbox.xmin); - _simd_store_si((simdscalari*)aMTRight, bbox.xmax); - _simd_store_si((simdscalari*)aMTTop, bbox.ymin); - _simd_store_si((simdscalari*)aMTBottom, bbox.ymax); + OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH]; + + SIMD_T::store_si(reinterpret_cast(aMTLeft), bbox.xmin); + SIMD_T::store_si(reinterpret_cast(aMTRight), bbox.xmax); + SIMD_T::store_si(reinterpret_cast(aMTTop), bbox.ymin); + SIMD_T::store_si(reinterpret_cast(aMTBottom), bbox.ymax); // transpose verts needed for backend /// @todo modify BE to take non-transformed verts - simd4scalar vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8]; - vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x); - vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y); - vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z); - vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2); + simd4scalar vHorizX[SIMD_WIDTH]; + simd4scalar vHorizY[SIMD_WIDTH]; + simd4scalar vHorizZ[SIMD_WIDTH]; + simd4scalar vHorizW[SIMD_WIDTH]; + + TransposeVertices(vHorizX, tri[0].x, tri[1].x, tri[2].x); + TransposeVertices(vHorizY, 
tri[0].y, tri[1].y, tri[2].y); + TransposeVertices(vHorizZ, tri[0].z, tri[1].z, tri[2].z); + TransposeVertices(vHorizW, vRecipW0, vRecipW1, vRecipW2); // store render target array index - OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; + OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH]; if (state.backendState.readRenderTargetArrayIndex) { - simdvector vRtai[3]; + typename SIMD_T::Vec4 vRtai[3]; pa.Assemble(VERTEX_SGV_SLOT, vRtai); - simdscalari vRtaii; - vRtaii = _simd_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]); - _simd_store_si((simdscalari*)aRTAI, vRtaii); + typename SIMD_T::Integer vRtaii; + vRtaii = SIMD_T::castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]); + SIMD_T::store_si(reinterpret_cast(aRTAI), vRtaii); } else { - _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); + SIMD_T::store_si(reinterpret_cast(aRTAI), SIMD_T::setzero_si()); } // scan remaining valid triangles and bin each separately @@ -805,7 +864,7 @@ endBinTriangles: { // only rasterize valid edges if we have a degenerate primitive int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID; - work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0), + work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0), (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(triEdgeEnable), (state.scissorsTileAligned == false)); // Degenerate triangles are required to be constant interpolated @@ -839,9 +898,9 @@ endBinTriangles: // store triangle vertex data desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16); - SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]); - SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]); - SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]); + SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]); + SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]); + 
SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]); SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]); // store user clip distances @@ -864,989 +923,183 @@ endBinTriangles: } } } + triMask &= ~(1 << triIndex); } AR_END(FEBinTriangles, 1); } +template +void BinTriangles( + DRAW_CONTEXT *pDC, + PA_STATE &pa, + uint32_t workerId, + simdvector tri[3], + uint32_t triMask, + simdscalari const &primID) +{ + BinTrianglesImpl(pDC, pa, workerId, tri, triMask, primID); +} + #if USE_SIMD16_FRONTEND template void SIMDCALL BinTriangles_simd16( DRAW_CONTEXT *pDC, - PA_STATE& pa, + PA_STATE &pa, uint32_t workerId, simd16vector tri[3], uint32_t triMask, simd16scalari const &primID) { - SWR_CONTEXT *pContext = pDC->pContext; - - AR_BEGIN(FEBinTriangles, pDC->drawId); - - const API_STATE& state = GetApiState(pDC); - const SWR_RASTSTATE& rastState = state.rastState; - const SWR_FRONTEND_STATE& feState = state.frontendState; - - MacroTileMgr *pTileMgr = pDC->pTileMgr; - - simd16scalar vRecipW0 = _simd16_set1_ps(1.0f); - simd16scalar vRecipW1 = _simd16_set1_ps(1.0f); - simd16scalar vRecipW2 = _simd16_set1_ps(1.0f); - - simd16scalari viewportIdx = _simd16_set1_epi32(0); - if (state.backendState.readViewportArrayIndex) - { - simd16vector vpiAttrib[3]; - pa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib); + BinTrianglesImpl(pDC, pa, workerId, tri, triMask, primID); +} - // OOB indices => forced to zero. 
- simd16scalari vpai = _simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); - vpai = _simd16_max_epi32(_simd16_setzero_si(), vpai); - simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); - simd16scalari vClearMask = _simd16_cmplt_epi32(vpai, vNumViewports); - viewportIdx = _simd16_and_si(vClearMask, vpai); - } +#endif +struct FEBinTrianglesChooser +{ + typedef PFN_PROCESS_PRIMS FuncType; - if (feState.vpTransformDisable) + template + static FuncType GetFunc() { - // RHW is passed in directly when VP transform is disabled - vRecipW0 = tri[0].v[3]; - vRecipW1 = tri[1].v[3]; - vRecipW2 = tri[2].v[3]; + return BinTriangles>; } - else - { - // Perspective divide - vRecipW0 = _simd16_div_ps(_simd16_set1_ps(1.0f), tri[0].w); - vRecipW1 = _simd16_div_ps(_simd16_set1_ps(1.0f), tri[1].w); - vRecipW2 = _simd16_div_ps(_simd16_set1_ps(1.0f), tri[2].w); - - tri[0].v[0] = _simd16_mul_ps(tri[0].v[0], vRecipW0); - tri[1].v[0] = _simd16_mul_ps(tri[1].v[0], vRecipW1); - tri[2].v[0] = _simd16_mul_ps(tri[2].v[0], vRecipW2); +}; - tri[0].v[1] = _simd16_mul_ps(tri[0].v[1], vRecipW0); - tri[1].v[1] = _simd16_mul_ps(tri[1].v[1], vRecipW1); - tri[2].v[1] = _simd16_mul_ps(tri[2].v[1], vRecipW2); +// Selector for correct templated BinTrinagles function +PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative) +{ + return TemplateArgUnroller::GetFunc(IsConservative); +} - tri[0].v[2] = _simd16_mul_ps(tri[0].v[2], vRecipW0); - tri[1].v[2] = _simd16_mul_ps(tri[1].v[2], vRecipW1); - tri[2].v[2] = _simd16_mul_ps(tri[2].v[2], vRecipW2); +#if USE_SIMD16_FRONTEND +struct FEBinTrianglesChooser_simd16 +{ + typedef PFN_PROCESS_PRIMS_SIMD16 FuncType; - // Viewport transform to screen space coords - if (state.backendState.readViewportArrayIndex) - { - viewportTransform<3>(tri, state.vpMatrices, viewportIdx); - } - else - { - viewportTransform<3>(tri, state.vpMatrices); - } + template + static FuncType GetFunc() + { + return BinTriangles_simd16>; } +}; - // Adjust for pixel 
center location - const simd16scalar offset = g_pixelOffsets_simd16[rastState.pixelLocation]; - - tri[0].x = _simd16_add_ps(tri[0].x, offset); - tri[0].y = _simd16_add_ps(tri[0].y, offset); - - tri[1].x = _simd16_add_ps(tri[1].x, offset); - tri[1].y = _simd16_add_ps(tri[1].y, offset); +// Selector for correct templated BinTrinagles function +PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative) +{ + return TemplateArgUnroller::GetFunc(IsConservative); +} - tri[2].x = _simd16_add_ps(tri[2].x, offset); - tri[2].y = _simd16_add_ps(tri[2].y, offset); +#endif - simd16scalari vXi[3], vYi[3]; +template +void BinPostSetupPointsImpl( + DRAW_CONTEXT *pDC, + PA_STATE &pa, + uint32_t workerId, + typename SIMD_T::Vec4 prim[], + uint32_t primMask, + typename SIMD_T::Integer const &primID, + typename SIMD_T::Integer const &viewportIdx) +{ + SWR_CONTEXT *pContext = pDC->pContext; - // Set vXi, vYi to required fixed point precision - FPToFixedPoint(tri, vXi, vYi); + AR_BEGIN(FEBinPoints, pDC->drawId); - // triangle setup - simd16scalari vAi[3], vBi[3]; - triangleSetupABIntVertical(vXi, vYi, vAi, vBi); + typename SIMD_T::Vec4 &primVerts = prim[0]; - // determinant - simd16scalari vDet[2]; - calcDeterminantIntVertical(vAi, vBi, vDet); + const API_STATE& state = GetApiState(pDC); + const SWR_RASTSTATE& rastState = state.rastState; + const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; - // cull zero area - uint32_t maskLo = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpeq_epi64(vDet[0], _simd16_setzero_si()))); - uint32_t maskHi = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpeq_epi64(vDet[1], _simd16_setzero_si()))); + // Select attribute processor + PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1, + state.backendState.swizzleEnable, state.backendState.constantInterpolationMask); - uint32_t cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD16_WIDTH / 2)); + // convert to fixed point + typename SIMD_T::Integer vXi, vYi; - // don't 
cull degenerate triangles if we're conservatively rasterizing - uint32_t origTriMask = triMask; - if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value) - { - triMask &= ~cullZeroAreaMask; - } + vXi = fpToFixedPointVertical(primVerts.x); + vYi = fpToFixedPointVertical(primVerts.y); - // determine front winding tris - // CW +det - // CCW det < 0; - // 0 area triangles are marked as backfacing regardless of winding order, - // which is required behavior for conservative rast and wireframe rendering - uint32_t frontWindingTris; - if (rastState.frontWinding == SWR_FRONTWINDING_CW) - { - maskLo = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(vDet[0], _simd16_setzero_si()))); - maskHi = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(vDet[1], _simd16_setzero_si()))); - } - else + if (CanUseSimplePoints(pDC)) { - maskLo = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(_simd16_setzero_si(), vDet[0]))); - maskHi = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(_simd16_setzero_si(), vDet[1]))); - } - frontWindingTris = maskLo | (maskHi << (KNOB_SIMD16_WIDTH / 2)); + // adjust for ymin-xmin rule + vXi = SIMD_T::sub_epi32(vXi, SIMD_T::set1_epi32(1)); + vYi = SIMD_T::sub_epi32(vYi, SIMD_T::set1_epi32(1)); - // cull - uint32_t cullTris; - switch ((SWR_CULLMODE)rastState.cullMode) - { - case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break; - case SWR_CULLMODE_NONE: cullTris = 0x0; break; - case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break; - // 0 area triangles are marked as backfacing, which is required behavior for conservative rast - case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break; - default: SWR_INVALID("Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break; - } + // cull points off the ymin-xmin edge of the viewport + primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vXi)); + primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vYi)); - triMask &= ~cullTris; + // compute macro 
tile coordinates +#if SIMD_WA_SXXI_EPI32 + typename SIMD_T::Integer macroX = simd_wa_srai_epi32(vXi); + typename SIMD_T::Integer macroY = simd_wa_srai_epi32(vYi); +#else + typename SIMD_T::Integer macroX = SIMD_T::srai_epi32(vXi); + typename SIMD_T::Integer macroY = SIMD_T::srai_epi32(vYi); +#endif - if (origTriMask ^ triMask) - { - RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0); - } + OSALIGNSIMD16(uint32_t) aMacroX[SIMD_WIDTH], aMacroY[SIMD_WIDTH]; - /// Note: these variable initializations must stay above any 'goto endBenTriangles' - // compute per tri backface - uint32_t frontFaceMask = frontWindingTris; - uint32_t *pPrimID = (uint32_t *)&primID; - const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; - DWORD triIndex = 0; + SIMD_T::store_si(reinterpret_cast(aMacroX), macroX); + SIMD_T::store_si(reinterpret_cast(aMacroY), macroY); - uint32_t edgeEnable; - PFN_WORK_FUNC pfnWork; - if (CT::IsConservativeT::value) - { - // determine which edges of the degenerate tri, if any, are valid to rasterize. 
- // used to call the appropriate templated rasterizer function - if (cullZeroAreaMask > 0) - { - // e0 = v1-v0 - const simd16scalari x0x1Mask = _simd16_cmpeq_epi32(vXi[0], vXi[1]); - const simd16scalari y0y1Mask = _simd16_cmpeq_epi32(vYi[0], vYi[1]); + // compute raster tile coordinates +#if SIMD_WA_SXXI_EPI32 + typename SIMD_T::Integer rasterX = simd_wa_srai_epi32(vXi); + typename SIMD_T::Integer rasterY = simd_wa_srai_epi32(vYi); +#else + typename SIMD_T::Integer rasterX = SIMD_T::srai_epi32(vXi); + typename SIMD_T::Integer rasterY = SIMD_T::srai_epi32(vYi); +#endif - uint32_t e0Mask = _simd16_movemask_ps(_simd16_castsi_ps(_simd16_and_si(x0x1Mask, y0y1Mask))); + // compute raster tile relative x,y for coverage mask +#if SIMD_WA_SXXI_EPI32 + typename SIMD_T::Integer tileAlignedX = simd_wa_slli_epi32(rasterX); + typename SIMD_T::Integer tileAlignedY = simd_wa_slli_epi32(rasterY); +#else + typename SIMD_T::Integer tileAlignedX = SIMD_T::slli_epi32(rasterX); + typename SIMD_T::Integer tileAlignedY = SIMD_T::slli_epi32(rasterY); +#endif - // e1 = v2-v1 - const simd16scalari x1x2Mask = _simd16_cmpeq_epi32(vXi[1], vXi[2]); - const simd16scalari y1y2Mask = _simd16_cmpeq_epi32(vYi[1], vYi[2]); +#if SIMD_WA_SXXI_EPI32 + typename SIMD_T::Integer tileRelativeX = SIMD_T::sub_epi32(simd_wa_srai_epi32(vXi), tileAlignedX); + typename SIMD_T::Integer tileRelativeY = SIMD_T::sub_epi32(simd_wa_srai_epi32(vYi), tileAlignedY); +#else + typename SIMD_T::Integer tileRelativeX = SIMD_T::sub_epi32(SIMD_T::srai_epi32(vXi), tileAlignedX); + typename SIMD_T::Integer tileRelativeY = SIMD_T::sub_epi32(SIMD_T::srai_epi32(vYi), tileAlignedY); +#endif - uint32_t e1Mask = _simd16_movemask_ps(_simd16_castsi_ps(_simd16_and_si(x1x2Mask, y1y2Mask))); + OSALIGNSIMD16(uint32_t) aTileRelativeX[SIMD_WIDTH]; + OSALIGNSIMD16(uint32_t) aTileRelativeY[SIMD_WIDTH]; - // e2 = v0-v2 - // if v0 == v1 & v1 == v2, v0 == v2 - uint32_t e2Mask = e0Mask & e1Mask; - SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update 
degenerate mask code for avx512"); + SIMD_T::store_si(reinterpret_cast(aTileRelativeX), tileRelativeX); + SIMD_T::store_si(reinterpret_cast(aTileRelativeY), tileRelativeY); - // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2 - // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001 - e0Mask = pdep_u32(e0Mask, 0x00249249); + OSALIGNSIMD16(uint32_t) aTileAlignedX[SIMD_WIDTH]; + OSALIGNSIMD16(uint32_t) aTileAlignedY[SIMD_WIDTH]; - // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010 - e1Mask = pdep_u32(e1Mask, 0x00492492); + SIMD_T::store_si(reinterpret_cast(aTileAlignedX), tileAlignedX); + SIMD_T::store_si(reinterpret_cast(aTileAlignedY), tileAlignedY); - // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100 - e2Mask = pdep_u32(e2Mask, 0x00924924); + OSALIGNSIMD16(float) aZ[SIMD_WIDTH]; + SIMD_T::store_ps(reinterpret_cast(aZ), primVerts.z); - edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask))); + // store render target array index + OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH]; + if (state.backendState.readRenderTargetArrayIndex) + { + typename SIMD_T::Vec4 vRtai; + pa.Assemble(VERTEX_SGV_SLOT, &vRtai); + typename SIMD_T::Integer vRtaii = SIMD_T::castps_si(vRtai[VERTEX_SGV_RTAI_COMP]); + SIMD_T::store_si(reinterpret_cast(aRTAI), vRtaii); } else { - edgeEnable = 0x00FFFFFF; - } - } - else - { - // degenerate triangles won't be sent to rasterizer; just enable all edges - pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0), - (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false)); - } - - simd16BBox bbox; - - if (!triMask) - { - goto endBinTriangles; - } - - // Calc bounding box of triangles - calcBoundingBoxIntVertical(tri, vXi, vYi, bbox); - - // determine if triangle falls between pixel centers and discard - // only discard for non-MSAA case and when conservative rast is disabled - // (xmin + 127) & ~255 - // (xmax + 
128) & ~255 - if ((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) && - (!CT::IsConservativeT::value)) - { - origTriMask = triMask; - - int cullCenterMask; - - { - simd16scalari xmin = _simd16_add_epi32(bbox.xmin, _simd16_set1_epi32(127)); - xmin = _simd16_and_si(xmin, _simd16_set1_epi32(~255)); - simd16scalari xmax = _simd16_add_epi32(bbox.xmax, _simd16_set1_epi32(128)); - xmax = _simd16_and_si(xmax, _simd16_set1_epi32(~255)); - - simd16scalari vMaskH = _simd16_cmpeq_epi32(xmin, xmax); - - simd16scalari ymin = _simd16_add_epi32(bbox.ymin, _simd16_set1_epi32(127)); - ymin = _simd16_and_si(ymin, _simd16_set1_epi32(~255)); - simd16scalari ymax = _simd16_add_epi32(bbox.ymax, _simd16_set1_epi32(128)); - ymax = _simd16_and_si(ymax, _simd16_set1_epi32(~255)); - - simd16scalari vMaskV = _simd16_cmpeq_epi32(ymin, ymax); - - vMaskV = _simd16_or_si(vMaskH, vMaskV); - cullCenterMask = _simd16_movemask_ps(_simd16_castsi_ps(vMaskV)); - } - - triMask &= ~cullCenterMask; - - if (origTriMask ^ triMask) - { - RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0); - } - } - - // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. - // Gather the AOS effective scissor rects based on the per-prim VP index. - /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. - { - simd16scalari scisXmin, scisYmin, scisXmax, scisYmax; - - if (state.backendState.readViewportArrayIndex) - { - GatherScissors_simd16::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, - scisXmin, scisYmin, scisXmax, scisYmax); - } - else // broadcast fast path for non-VPAI case. 
- { - scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin); - scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin); - scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax); - scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax); - } - - // Make triangle bbox inclusive - bbox.xmax = _simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)); - bbox.ymax = _simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)); - - bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin); - bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin); - bbox.xmax = _simd16_min_epi32(bbox.xmax, scisXmax); - bbox.ymax = _simd16_min_epi32(bbox.ymax, scisYmax); - } - - if (CT::IsConservativeT::value) - { - // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has - // some area. Bump the xmax/ymax edges out - simd16scalari topEqualsBottom = _simd16_cmpeq_epi32(bbox.ymin, bbox.ymax); - bbox.ymax = _simd16_blendv_epi32(bbox.ymax, _simd16_add_epi32(bbox.ymax, _simd16_set1_epi32(1)), topEqualsBottom); - simd16scalari leftEqualsRight = _simd16_cmpeq_epi32(bbox.xmin, bbox.xmax); - bbox.xmax = _simd16_blendv_epi32(bbox.xmax, _simd16_add_epi32(bbox.xmax, _simd16_set1_epi32(1)), leftEqualsRight); - } - - // Cull tris completely outside scissor - { - simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax); - simd16scalari maskOutsideScissorY = _simd16_cmpgt_epi32(bbox.ymin, bbox.ymax); - simd16scalari maskOutsideScissorXY = _simd16_or_si(maskOutsideScissorX, maskOutsideScissorY); - uint32_t maskOutsideScissor = _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY)); - triMask = triMask & ~maskOutsideScissor; - } - -endBinTriangles: - - // Send surviving triangles to the line or point binner based on fill mode - if (rastState.fillMode == SWR_FILLMODE_WIREFRAME) - { - // Simple non-conformant wireframe mode, useful for debugging - // construct 3 SIMD lines out of the triangle and call the line 
binner for each SIMD - simd16vector line[2]; - simd16scalar recipW[2]; - line[0] = tri[0]; - line[1] = tri[1]; - recipW[0] = vRecipW0; - recipW[1] = vRecipW1; - BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx); - - line[0] = tri[1]; - line[1] = tri[2]; - recipW[0] = vRecipW1; - recipW[1] = vRecipW2; - BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx); - - line[0] = tri[2]; - line[1] = tri[0]; - recipW[0] = vRecipW2; - recipW[1] = vRecipW0; - BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx); - - AR_END(FEBinTriangles, 1); - return; - } - else if (rastState.fillMode == SWR_FILLMODE_POINT) - { - // Bin 3 points - BinPostSetupPoints_simd16(pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx); - BinPostSetupPoints_simd16(pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx); - BinPostSetupPoints_simd16(pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx); - return; - } - - // Convert triangle bbox to macrotile units. 
- bbox.xmin = _simd16_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); - bbox.ymin = _simd16_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); - bbox.xmax = _simd16_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); - bbox.ymax = _simd16_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); - - OSALIGNSIMD16(uint32_t) aMTLeft[KNOB_SIMD16_WIDTH], aMTRight[KNOB_SIMD16_WIDTH], aMTTop[KNOB_SIMD16_WIDTH], aMTBottom[KNOB_SIMD16_WIDTH]; - - _simd16_store_si(reinterpret_cast(aMTLeft), bbox.xmin); - _simd16_store_si(reinterpret_cast(aMTRight), bbox.xmax); - _simd16_store_si(reinterpret_cast(aMTTop), bbox.ymin); - _simd16_store_si(reinterpret_cast(aMTBottom), bbox.ymax); - - // transpose verts needed for backend - /// @todo modify BE to take non-transformed verts - simd4scalar vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH - simd4scalar vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH - simd4scalar vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH - simd4scalar vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH - - vTranspose3x8(vHorizX[0], _simd16_extract_ps(tri[0].x, 0), _simd16_extract_ps(tri[1].x, 0), _simd16_extract_ps(tri[2].x, 0)); - vTranspose3x8(vHorizY[0], _simd16_extract_ps(tri[0].y, 0), _simd16_extract_ps(tri[1].y, 0), _simd16_extract_ps(tri[2].y, 0)); - vTranspose3x8(vHorizZ[0], _simd16_extract_ps(tri[0].z, 0), _simd16_extract_ps(tri[1].z, 0), _simd16_extract_ps(tri[2].z, 0)); - vTranspose3x8(vHorizW[0], _simd16_extract_ps(vRecipW0, 0), _simd16_extract_ps(vRecipW1, 0), _simd16_extract_ps(vRecipW2, 0)); - - vTranspose3x8(vHorizX[1], _simd16_extract_ps(tri[0].x, 1), _simd16_extract_ps(tri[1].x, 1), _simd16_extract_ps(tri[2].x, 1)); - vTranspose3x8(vHorizY[1], _simd16_extract_ps(tri[0].y, 1), _simd16_extract_ps(tri[1].y, 1), _simd16_extract_ps(tri[2].y, 1)); - vTranspose3x8(vHorizZ[1], _simd16_extract_ps(tri[0].z, 1), _simd16_extract_ps(tri[1].z, 1), _simd16_extract_ps(tri[2].z, 1)); - vTranspose3x8(vHorizW[1], 
_simd16_extract_ps(vRecipW0, 1), _simd16_extract_ps(vRecipW1, 1), _simd16_extract_ps(vRecipW2, 1)); - - // store render target array index - OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH]; - if (state.backendState.readRenderTargetArrayIndex) - { - simd16vector vRtai[3]; - pa.Assemble_simd16(VERTEX_SGV_SLOT, vRtai); - simd16scalari vRtaii; - vRtaii = _simd16_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]); - _simd16_store_si(reinterpret_cast(aRTAI), vRtaii); - } - else - { - _simd16_store_si(reinterpret_cast(aRTAI), _simd16_setzero_si()); - } - - - // scan remaining valid triangles and bin each separately - while (_BitScanForward(&triIndex, triMask)) - { - uint32_t linkageCount = state.backendState.numAttributes; - uint32_t numScalarAttribs = linkageCount * 4; - - BE_WORK work; - work.type = DRAW; - - bool isDegenerate; - if (CT::IsConservativeT::value) - { - // only rasterize valid edges if we have a degenerate primitive - int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID; - work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0), - (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(triEdgeEnable), (state.scissorsTileAligned == false)); - - // Degenerate triangles are required to be constant interpolated - isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false; - } - else - { - isDegenerate = false; - work.pfnWork = pfnWork; - } - - // Select attribute processor - PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3, - state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate); - - TRIANGLE_WORK_DESC &desc = work.desc.tri; - - desc.triFlags.frontFacing = state.forceFront ? 
1 : ((frontFaceMask >> triIndex) & 1); - desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex]; - desc.triFlags.viewportIndex = pViewportIndex[triIndex]; - - auto pArena = pDC->pArena; - SWR_ASSERT(pArena != nullptr); - - // store active attribs - float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); - desc.pAttribs = pAttribs; - desc.numAttribs = linkageCount; - pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs); - - // store triangle vertex data - desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16); - - { - const uint32_t i = triIndex >> 3; // triIndex / KNOB_SIMD_WIDTH - const uint32_t j = triIndex & 7; // triIndex % KNOB_SIMD_WIDTH - - _mm_store_ps(&desc.pTriBuffer[ 0], vHorizX[i][j]); - _mm_store_ps(&desc.pTriBuffer[ 4], vHorizY[i][j]); - _mm_store_ps(&desc.pTriBuffer[ 8], vHorizZ[i][j]); - _mm_store_ps(&desc.pTriBuffer[12], vHorizW[i][j]); - } - - // store user clip distances - if (rastState.clipDistanceMask) - { - uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask); - desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float)); - ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer); - } - - for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y) - { - for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x) - { -#if KNOB_ENABLE_TOSS_POINTS - if (!KNOB_TOSS_SETUP_TRIS) -#endif - { - pTileMgr->enqueue(x, y, &work); - } - } - } - - triMask &= ~(1 << triIndex); - } - - AR_END(FEBinTriangles, 1); -} - -#endif -struct FEBinTrianglesChooser -{ - typedef PFN_PROCESS_PRIMS FuncType; - - template - static FuncType GetFunc() - { - return BinTriangles>; - } -}; - -// Selector for correct templated BinTrinagles function -PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative) -{ - return TemplateArgUnroller::GetFunc(IsConservative); -} - -#if USE_SIMD16_FRONTEND -struct 
FEBinTrianglesChooser_simd16 -{ - typedef PFN_PROCESS_PRIMS_SIMD16 FuncType; - - template - static FuncType GetFunc() - { - return BinTriangles_simd16>; - } -}; - -// Selector for correct templated BinTrinagles function -PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative) -{ - return TemplateArgUnroller::GetFunc(IsConservative); -} - -#endif - -void BinPostSetupPoints( - DRAW_CONTEXT *pDC, - PA_STATE& pa, - uint32_t workerId, - simdvector prim[], - uint32_t primMask, - simdscalari const &primID, - simdscalari const &viewportIdx) -{ - SWR_CONTEXT *pContext = pDC->pContext; - - AR_BEGIN(FEBinPoints, pDC->drawId); - - simdvector& primVerts = prim[0]; - - const API_STATE& state = GetApiState(pDC); - const SWR_RASTSTATE& rastState = state.rastState; - const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; - - // Select attribute processor - PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1, - state.backendState.swizzleEnable, state.backendState.constantInterpolationMask); - - // convert to fixed point - simdscalari vXi, vYi; - vXi = fpToFixedPointVertical(primVerts.x); - vYi = fpToFixedPointVertical(primVerts.y); - - if (CanUseSimplePoints(pDC)) - { - // adjust for ymin-xmin rule - vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1)); - vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1)); - - // cull points off the ymin-xmin edge of the viewport - primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi)); - primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi)); - - // compute macro tile coordinates - simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); - simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); - - OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH]; - _simd_store_si((simdscalari*)aMacroX, macroX); - _simd_store_si((simdscalari*)aMacroY, macroY); - - // compute raster tile coordinates - simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + 
FIXED_POINT_SHIFT); - simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); - - // compute raster tile relative x,y for coverage mask - simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT); - simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT); - - simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX); - simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY); - - OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH]; - OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH]; - _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX); - _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY); - - OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH]; - OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH]; - _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX); - _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY); - - OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH]; - _simd_store_ps((float*)aZ, primVerts.z); - - // store render target array index - OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; - if (state.backendState.readRenderTargetArrayIndex) - { - simdvector vRtai; - pa.Assemble(VERTEX_SGV_SLOT, &vRtai); - simdscalari vRtaii = _simd_castps_si(vRtai[VERTEX_SGV_RTAI_COMP]); - _simd_store_si((simdscalari*)aRTAI, vRtaii); - } - else - { - _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); - } - - uint32_t *pPrimID = (uint32_t *)&primID; - DWORD primIndex = 0; - - const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState; - - // scan remaining valid triangles and bin each separately - while (_BitScanForward(&primIndex, primMask)) - { - uint32_t linkageCount = backendState.numAttributes; - uint32_t numScalarAttribs = linkageCount * 4; - - BE_WORK work; - work.type = DRAW; - - TRIANGLE_WORK_DESC &desc = work.desc.tri; - - // points are always front facing - 
desc.triFlags.frontFacing = 1; - desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; - desc.triFlags.viewportIndex = pViewportIndex[primIndex]; - - work.pfnWork = RasterizeSimplePoint; - - auto pArena = pDC->pArena; - SWR_ASSERT(pArena != nullptr); - - // store attributes - float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16); - desc.pAttribs = pAttribs; - desc.numAttribs = linkageCount; - - pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs); - - // store raster tile aligned x, y, perspective correct z - float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16); - desc.pTriBuffer = pTriBuffer; - *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex]; - *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex]; - *pTriBuffer = aZ[primIndex]; - - uint32_t tX = aTileRelativeX[primIndex]; - uint32_t tY = aTileRelativeY[primIndex]; - - // pack the relative x,y into the coverageMask, the rasterizer will - // generate the true coverage mask from it - work.desc.tri.triFlags.coverageMask = tX | (tY << 4); - - // bin it - MacroTileMgr *pTileMgr = pDC->pTileMgr; -#if KNOB_ENABLE_TOSS_POINTS - if (!KNOB_TOSS_SETUP_TRIS) -#endif - { - pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work); - } - primMask &= ~(1 << primIndex); - } - } - else - { - // non simple points need to be potentially binned to multiple macro tiles - simdscalar vPointSize; - if (rastState.pointParam) - { - simdvector size[3]; - pa.Assemble(VERTEX_SGV_SLOT, size); - vPointSize = size[0][VERTEX_SGV_POINT_SIZE_COMP]; - } - else - { - vPointSize = _simd_set1_ps(rastState.pointSize); - } - - // bloat point to bbox - simdBBox bbox; - bbox.xmin = bbox.xmax = vXi; - bbox.ymin = bbox.ymax = vYi; - - simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f)); - simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth); - bbox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi); - bbox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi); - 
bbox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi); - bbox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi); - - // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. - // Gather the AOS effective scissor rects based on the per-prim VP index. - /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. - { - simdscalari scisXmin, scisYmin, scisXmax, scisYmax; - if (state.backendState.readViewportArrayIndex) - { - GatherScissors::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, - scisXmin, scisYmin, scisXmax, scisYmax); - } - else // broadcast fast path for non-VPAI case. - { - scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin); - scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin); - scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax); - scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax); - } - - bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin); - bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin); - bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax); - bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax); - } - - // Cull bloated points completely outside scissor - simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax); - simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax); - simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY); - uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY)); - primMask = primMask & ~maskOutsideScissor; - - // Convert bbox to macrotile units. 
- bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); - bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); - bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); - bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); - - OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH]; - _simd_store_si((simdscalari*)aMTLeft, bbox.xmin); - _simd_store_si((simdscalari*)aMTRight, bbox.xmax); - _simd_store_si((simdscalari*)aMTTop, bbox.ymin); - _simd_store_si((simdscalari*)aMTBottom, bbox.ymax); - - // store render target array index - OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; - if (state.backendState.readRenderTargetArrayIndex) - { - simdvector vRtai[2]; - pa.Assemble(VERTEX_SGV_SLOT, vRtai); - simdscalari vRtaii = _simd_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]); - _simd_store_si((simdscalari*)aRTAI, vRtaii); - } - else - { - _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); - } - - OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH]; - _simd_store_ps((float*)aPointSize, vPointSize); - - uint32_t *pPrimID = (uint32_t *)&primID; - - OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH]; - OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH]; - OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH]; - - _simd_store_ps((float*)aPrimVertsX, primVerts.x); - _simd_store_ps((float*)aPrimVertsY, primVerts.y); - _simd_store_ps((float*)aPrimVertsZ, primVerts.z); - - // scan remaining valid prims and bin each separately - const SWR_BACKEND_STATE& backendState = state.backendState; - DWORD primIndex; - while (_BitScanForward(&primIndex, primMask)) - { - uint32_t linkageCount = backendState.numAttributes; - uint32_t numScalarAttribs = linkageCount * 4; - - BE_WORK work; - work.type = DRAW; - - TRIANGLE_WORK_DESC &desc = work.desc.tri; - - desc.triFlags.frontFacing = 1; - desc.triFlags.pointSize = aPointSize[primIndex]; - 
desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; - desc.triFlags.viewportIndex = pViewportIndex[primIndex]; - - work.pfnWork = RasterizeTriPoint; - - auto pArena = pDC->pArena; - SWR_ASSERT(pArena != nullptr); - - // store active attribs - desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); - desc.numAttribs = linkageCount; - pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs); - - // store point vertex data - float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16); - desc.pTriBuffer = pTriBuffer; - *pTriBuffer++ = aPrimVertsX[primIndex]; - *pTriBuffer++ = aPrimVertsY[primIndex]; - *pTriBuffer = aPrimVertsZ[primIndex]; - - // store user clip distances - if (rastState.clipDistanceMask) - { - uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask); - desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float)); - float dists[8]; - float one = 1.0f; - ProcessUserClipDist<1>(pa, primIndex, rastState.clipDistanceMask, &one, dists); - for (uint32_t i = 0; i < numClipDist; i++) { - desc.pUserClipBuffer[3*i + 0] = 0.0f; - desc.pUserClipBuffer[3*i + 1] = 0.0f; - desc.pUserClipBuffer[3*i + 2] = dists[i]; - } - } - - MacroTileMgr *pTileMgr = pDC->pTileMgr; - for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y) - { - for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x) - { -#if KNOB_ENABLE_TOSS_POINTS - if (!KNOB_TOSS_SETUP_TRIS) -#endif - { - pTileMgr->enqueue(x, y, &work); - } - } - } - - primMask &= ~(1 << primIndex); - } - } - - AR_END(FEBinPoints, 1); -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Bin SIMD points to the backend. Only supports point size of 1 -/// @param pDC - pointer to draw context. -/// @param pa - The primitive assembly object. -/// @param workerId - thread's worker id. Even thread has a unique id. 
-/// @param tri - Contains point position data for SIMDs worth of points. -/// @param primID - Primitive ID for each point. -void BinPoints( - DRAW_CONTEXT *pDC, - PA_STATE& pa, - uint32_t workerId, - simdvector prim[3], - uint32_t primMask, - simdscalari const &primID) -{ - simdvector& primVerts = prim[0]; - - const API_STATE& state = GetApiState(pDC); - const SWR_FRONTEND_STATE& feState = state.frontendState; - const SWR_RASTSTATE& rastState = state.rastState; - - // Read back viewport index if required - simdscalari viewportIdx = _simd_set1_epi32(0); - if (state.backendState.readViewportArrayIndex) - { - simdvector vpiAttrib[1]; - pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib); - simdscalari vpai = _simd_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); - - // OOB indices => forced to zero. - vpai = _simd_max_epi32(_simd_setzero_si(), vpai); - simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); - simdscalari vClearMask = _simd_cmplt_epi32(vpai, vNumViewports); - viewportIdx = _simd_and_si(vClearMask, vpai); - } - - if (!feState.vpTransformDisable) - { - // perspective divide - simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w); - primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0); - primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0); - primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0); - - // viewport transform to screen coords - if (state.backendState.readViewportArrayIndex) - { - viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx); - } - else - { - viewportTransform<1>(&primVerts, state.vpMatrices); - } - } - - // adjust for pixel center location - simdscalar offset = g_pixelOffsets[rastState.pixelLocation]; - primVerts.x = _simd_add_ps(primVerts.x, offset); - primVerts.y = _simd_add_ps(primVerts.y, offset); - - BinPostSetupPoints( - pDC, - pa, - workerId, - prim, - primMask, - primID, - viewportIdx); -} - -#if USE_SIMD16_FRONTEND -void BinPostSetupPoints_simd16( - DRAW_CONTEXT *pDC, - PA_STATE& pa, - uint32_t workerId, 
- simd16vector prim[], - uint32_t primMask, - simd16scalari const &primID, - simd16scalari const &viewportIdx) -{ - SWR_CONTEXT *pContext = pDC->pContext; - - AR_BEGIN(FEBinPoints, pDC->drawId); - - simd16vector& primVerts = prim[0]; - - const API_STATE& state = GetApiState(pDC); - const SWR_RASTSTATE& rastState = state.rastState; - const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; - - // Select attribute processor - PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1, - state.backendState.swizzleEnable, state.backendState.constantInterpolationMask); - - // convert to fixed point - simd16scalari vXi, vYi; - - vXi = fpToFixedPointVertical(primVerts.x); - vYi = fpToFixedPointVertical(primVerts.y); - - if (CanUseSimplePoints(pDC)) - { - // adjust for ymin-xmin rule - vXi = _simd16_sub_epi32(vXi, _simd16_set1_epi32(1)); - vYi = _simd16_sub_epi32(vYi, _simd16_set1_epi32(1)); - - // cull points off the ymin-xmin edge of the viewport - primMask &= ~_simd16_movemask_ps(_simd16_castsi_ps(vXi)); - primMask &= ~_simd16_movemask_ps(_simd16_castsi_ps(vYi)); - - // compute macro tile coordinates - simd16scalari macroX = _simd16_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); - simd16scalari macroY = _simd16_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); - - OSALIGNSIMD16(uint32_t) aMacroX[KNOB_SIMD16_WIDTH], aMacroY[KNOB_SIMD16_WIDTH]; - - _simd16_store_si(reinterpret_cast(aMacroX), macroX); - _simd16_store_si(reinterpret_cast(aMacroY), macroY); - - // compute raster tile coordinates - simd16scalari rasterX = _simd16_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); - simd16scalari rasterY = _simd16_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); - - // compute raster tile relative x,y for coverage mask - simd16scalari tileAlignedX = _simd16_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT); - simd16scalari tileAlignedY = _simd16_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT); - - simd16scalari tileRelativeX = 
_simd16_sub_epi32(_simd16_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX); - simd16scalari tileRelativeY = _simd16_sub_epi32(_simd16_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY); - - OSALIGNSIMD16(uint32_t) aTileRelativeX[KNOB_SIMD16_WIDTH]; - OSALIGNSIMD16(uint32_t) aTileRelativeY[KNOB_SIMD16_WIDTH]; - - _simd16_store_si(reinterpret_cast(aTileRelativeX), tileRelativeX); - _simd16_store_si(reinterpret_cast(aTileRelativeY), tileRelativeY); - - OSALIGNSIMD16(uint32_t) aTileAlignedX[KNOB_SIMD16_WIDTH]; - OSALIGNSIMD16(uint32_t) aTileAlignedY[KNOB_SIMD16_WIDTH]; - - _simd16_store_si(reinterpret_cast(aTileAlignedX), tileAlignedX); - _simd16_store_si(reinterpret_cast(aTileAlignedY), tileAlignedY); - - OSALIGNSIMD16(float) aZ[KNOB_SIMD16_WIDTH]; - _simd16_store_ps(reinterpret_cast(aZ), primVerts.z); - - // store render target array index - OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH]; - if (state.backendState.readRenderTargetArrayIndex) - { - simd16vector vRtai; - pa.Assemble_simd16(VERTEX_SGV_SLOT, &vRtai); - simd16scalari vRtaii = _simd16_castps_si(vRtai[VERTEX_SGV_RTAI_COMP]); - _simd16_store_si(reinterpret_cast(aRTAI), vRtaii); - } - else - { - _simd16_store_si(reinterpret_cast(aRTAI), _simd16_setzero_si()); + SIMD_T::store_si(reinterpret_cast(aRTAI), SIMD_T::setzero_si()); } uint32_t *pPrimID = (uint32_t *)&primID; @@ -1911,103 +1164,110 @@ void BinPostSetupPoints_simd16( else { // non simple points need to be potentially binned to multiple macro tiles - simd16scalar vPointSize; + typename SIMD_T::Float vPointSize; if (rastState.pointParam) { - simd16vector size[3]; - pa.Assemble_simd16(VERTEX_SGV_SLOT, size); + typename SIMD_T::Vec4 size[3]; + pa.Assemble(VERTEX_SGV_SLOT, size); vPointSize = size[0][VERTEX_SGV_POINT_SIZE_COMP]; } else { - vPointSize = _simd16_set1_ps(rastState.pointSize); + vPointSize = SIMD_T::set1_ps(rastState.pointSize); } // bloat point to bbox - simd16BBox bbox; + SIMDBBOX_T bbox; bbox.xmin = bbox.xmax = vXi; bbox.ymin = bbox.ymax = 
vYi; - simd16scalar vHalfWidth = _simd16_mul_ps(vPointSize, _simd16_set1_ps(0.5f)); - simd16scalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth); + typename SIMD_T::Float vHalfWidth = SIMD_T::mul_ps(vPointSize, SIMD_T::set1_ps(0.5f)); + typename SIMD_T::Integer vHalfWidthi = fpToFixedPointVertical(vHalfWidth); - bbox.xmin = _simd16_sub_epi32(bbox.xmin, vHalfWidthi); - bbox.xmax = _simd16_add_epi32(bbox.xmax, vHalfWidthi); - bbox.ymin = _simd16_sub_epi32(bbox.ymin, vHalfWidthi); - bbox.ymax = _simd16_add_epi32(bbox.ymax, vHalfWidthi); + bbox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi); + bbox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi); + bbox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi); + bbox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi); // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. // Gather the AOS effective scissor rects based on the per-prim VP index. /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. { - simd16scalari scisXmin, scisYmin, scisXmax, scisYmax; + typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax; + if (state.backendState.readViewportArrayIndex) { - GatherScissors_simd16::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, - scisXmin, scisYmin, scisXmax, scisYmax); + GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax); } else // broadcast fast path for non-VPAI case. 
{ - scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin); - scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin); - scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax); - scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax); + scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin); + scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin); + scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax); + scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax); } - bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin); - bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin); - bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax); - bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax); + bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin); + bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin); + bbox.xmax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax); + bbox.ymax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax); } // Cull bloated points completely outside scissor - simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax); - simd16scalari maskOutsideScissorY = _simd16_cmpgt_epi32(bbox.ymin, bbox.ymax); - simd16scalari maskOutsideScissorXY = _simd16_or_si(maskOutsideScissorX, maskOutsideScissorY); - uint32_t maskOutsideScissor = _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY)); + typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax); + typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax); + typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY); + uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY)); primMask = primMask & ~maskOutsideScissor; // Convert 
bbox to macrotile units. - bbox.xmin = _simd16_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); - bbox.ymin = _simd16_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); - bbox.xmax = _simd16_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); - bbox.ymax = _simd16_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); +#if SIMD_WA_SXXI_EPI32 + bbox.xmin = simd_wa_srai_epi32(bbox.xmin); + bbox.ymin = simd_wa_srai_epi32(bbox.ymin); + bbox.xmax = simd_wa_srai_epi32(bbox.xmax); + bbox.ymax = simd_wa_srai_epi32(bbox.ymax); +#else + bbox.xmin = SIMD_T::srai_epi32(bbox.xmin); + bbox.ymin = SIMD_T::srai_epi32(bbox.ymin); + bbox.xmax = SIMD_T::srai_epi32(bbox.xmax); + bbox.ymax = SIMD_T::srai_epi32(bbox.ymax); +#endif - OSALIGNSIMD16(uint32_t) aMTLeft[KNOB_SIMD16_WIDTH], aMTRight[KNOB_SIMD16_WIDTH], aMTTop[KNOB_SIMD16_WIDTH], aMTBottom[KNOB_SIMD16_WIDTH]; + OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH]; - _simd16_store_si(reinterpret_cast(aMTLeft), bbox.xmin); - _simd16_store_si(reinterpret_cast(aMTRight), bbox.xmax); - _simd16_store_si(reinterpret_cast(aMTTop), bbox.ymin); - _simd16_store_si(reinterpret_cast(aMTBottom), bbox.ymax); + SIMD_T::store_si(reinterpret_cast(aMTLeft), bbox.xmin); + SIMD_T::store_si(reinterpret_cast(aMTRight), bbox.xmax); + SIMD_T::store_si(reinterpret_cast(aMTTop), bbox.ymin); + SIMD_T::store_si(reinterpret_cast(aMTBottom), bbox.ymax); // store render target array index - OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH]; + OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH]; if (state.backendState.readRenderTargetArrayIndex) { - simd16vector vRtai[2]; - pa.Assemble_simd16(VERTEX_SGV_SLOT, vRtai); - simd16scalari vRtaii = _simd16_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]); - _simd16_store_si(reinterpret_cast(aRTAI), vRtaii); + typename SIMD_T::Vec4 vRtai[2]; + pa.Assemble(VERTEX_SGV_SLOT, vRtai); + typename SIMD_T::Integer vRtaii = 
SIMD_T::castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]); + SIMD_T::store_si(reinterpret_cast(aRTAI), vRtaii); } else { - _simd16_store_si(reinterpret_cast(aRTAI), _simd16_setzero_si()); + SIMD_T::store_si(reinterpret_cast(aRTAI), SIMD_T::setzero_si()); } - OSALIGNSIMD16(float) aPointSize[KNOB_SIMD16_WIDTH]; + OSALIGNSIMD16(float) aPointSize[SIMD_WIDTH]; _simd16_store_ps(reinterpret_cast(aPointSize), vPointSize); uint32_t *pPrimID = (uint32_t *)&primID; - OSALIGNSIMD16(float) aPrimVertsX[KNOB_SIMD16_WIDTH]; - OSALIGNSIMD16(float) aPrimVertsY[KNOB_SIMD16_WIDTH]; - OSALIGNSIMD16(float) aPrimVertsZ[KNOB_SIMD16_WIDTH]; + OSALIGNSIMD16(float) aPrimVertsX[SIMD_WIDTH]; + OSALIGNSIMD16(float) aPrimVertsY[SIMD_WIDTH]; + OSALIGNSIMD16(float) aPrimVertsZ[SIMD_WIDTH]; - _simd16_store_ps(reinterpret_cast(aPrimVertsX), primVerts.x); - _simd16_store_ps(reinterpret_cast(aPrimVertsY), primVerts.y); - _simd16_store_ps(reinterpret_cast(aPrimVertsZ), primVerts.z); + SIMD_T::store_ps(reinterpret_cast(aPrimVertsX), primVerts.x); + SIMD_T::store_ps(reinterpret_cast(aPrimVertsY), primVerts.y); + SIMD_T::store_ps(reinterpret_cast(aPrimVertsZ), primVerts.z); // scan remaining valid prims and bin each separately const SWR_BACKEND_STATE& backendState = state.backendState; @@ -2080,61 +1340,67 @@ void BinPostSetupPoints_simd16( AR_END(FEBinPoints, 1); } -void SIMDCALL BinPoints_simd16( +////////////////////////////////////////////////////////////////////////// +/// @brief Bin SIMD points to the backend. Only supports point size of 1 +/// @param pDC - pointer to draw context. +/// @param pa - The primitive assembly object. +/// @param workerId - thread's worker id. Even thread has a unique id. +/// @param tri - Contains point position data for SIMDs worth of points. +/// @param primID - Primitive ID for each point. 
+template +void BinPointsImpl( DRAW_CONTEXT *pDC, - PA_STATE& pa, + PA_STATE &pa, uint32_t workerId, - simd16vector prim[3], + typename SIMD_T::Vec4 prim[3], uint32_t primMask, - simd16scalari const &primID) + typename SIMD_T::Integer const &primID) { - simd16vector& primVerts = prim[0]; - const API_STATE& state = GetApiState(pDC); const SWR_FRONTEND_STATE& feState = state.frontendState; const SWR_RASTSTATE& rastState = state.rastState; // Read back viewport index if required - simd16scalari viewportIdx = _simd16_set1_epi32(0); + typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0); if (state.backendState.readViewportArrayIndex) { - simd16vector vpiAttrib[1]; - pa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib); + typename SIMD_T::Vec4 vpiAttrib[1]; + pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib); // OOB indices => forced to zero. - simd16scalari vpai = _simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); - vpai = _simd16_max_epi32(_simd16_setzero_si(), vpai); - simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); - simd16scalari vClearMask = _simd16_cmplt_epi32(vpai, vNumViewports); - viewportIdx = _simd16_and_si(vClearMask, vpai); + typename SIMD_T::Integer vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); + vpai = SIMD_T::max_epi32(SIMD_T::setzero_si(), vpai); + typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); + typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports); + viewportIdx = SIMD_T::and_si(vClearMask, vpai); } if (!feState.vpTransformDisable) { // perspective divide - simd16scalar vRecipW0 = _simd16_div_ps(_simd16_set1_ps(1.0f), primVerts.w); + typename SIMD_T::Float vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w); - primVerts.x = _simd16_mul_ps(primVerts.x, vRecipW0); - primVerts.y = _simd16_mul_ps(primVerts.y, vRecipW0); - primVerts.z = _simd16_mul_ps(primVerts.z, vRecipW0); + prim[0].x = SIMD_T::mul_ps(prim[0].x, vRecipW0); + prim[0].y = 
SIMD_T::mul_ps(prim[0].y, vRecipW0); + prim[0].z = SIMD_T::mul_ps(prim[0].z, vRecipW0); // viewport transform to screen coords if (state.backendState.readViewportArrayIndex) { - viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx); + viewportTransform<1>(prim, state.vpMatrices, viewportIdx); } else { - viewportTransform<1>(&primVerts, state.vpMatrices); + viewportTransform<1>(prim, state.vpMatrices); } } - const simd16scalar offset = g_pixelOffsets_simd16[rastState.pixelLocation]; + typename SIMD_T::Float offset = g_pixelOffsets[rastState.pixelLocation]; - primVerts.x = _simd16_add_ps(primVerts.x, offset); - primVerts.y = _simd16_add_ps(primVerts.y, offset); + prim[0].x = SIMD_T::add_ps(prim[0].x, offset); + prim[0].y = SIMD_T::add_ps(prim[0].y, offset); - BinPostSetupPoints_simd16( + BinPostSetupPointsImpl( pDC, pa, workerId, @@ -2144,6 +1410,41 @@ void SIMDCALL BinPoints_simd16( viewportIdx); } +void BinPoints( + DRAW_CONTEXT *pDC, + PA_STATE &pa, + uint32_t workerId, + simdvector prim[3], + uint32_t primMask, + simdscalari const &primID) +{ + BinPointsImpl( + pDC, + pa, + workerId, + prim, + primMask, + primID); +} + +#if USE_SIMD16_FRONTEND +void SIMDCALL BinPoints_simd16( + DRAW_CONTEXT *pDC, + PA_STATE &pa, + uint32_t workerId, + simd16vector prim[3], + uint32_t primMask, + simd16scalari const &primID) +{ + BinPointsImpl( + pDC, + pa, + workerId, + prim, + primMask, + primID); +} + #endif ////////////////////////////////////////////////////////////////////////// /// @brief Bin SIMD lines to the backend. @@ -2153,321 +1454,114 @@ void SIMDCALL BinPoints_simd16( /// @param tri - Contains line position data for SIMDs worth of points. /// @param primID - Primitive ID for each line. /// @param viewportIdx - Viewport Array Index for each line. 
-void BinPostSetupLines( - DRAW_CONTEXT *pDC, - PA_STATE& pa, - uint32_t workerId, - simdvector prim[], - simdscalar recipW[], - uint32_t primMask, - simdscalari const &primID, - simdscalari const &viewportIdx) -{ - SWR_CONTEXT *pContext = pDC->pContext; - - AR_BEGIN(FEBinLines, pDC->drawId); - - const API_STATE& state = GetApiState(pDC); - const SWR_RASTSTATE& rastState = state.rastState; - - // Select attribute processor - PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2, - state.backendState.swizzleEnable, state.backendState.constantInterpolationMask); - - simdscalar& vRecipW0 = recipW[0]; - simdscalar& vRecipW1 = recipW[1]; - - simd4scalar vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8]; - - // convert to fixed point - simdscalari vXi[2], vYi[2]; - vXi[0] = fpToFixedPointVertical(prim[0].x); - vYi[0] = fpToFixedPointVertical(prim[0].y); - vXi[1] = fpToFixedPointVertical(prim[1].x); - vYi[1] = fpToFixedPointVertical(prim[1].y); - - // compute x-major vs y-major mask - simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1])); - simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1])); - simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength)); - uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask); - - // cull zero-length lines - simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si()); - vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si())); - - primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask)); - - uint32_t *pPrimID = (uint32_t *)&primID; - const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; - - simdscalar vUnused = _simd_setzero_ps(); - - // Calc bounding box of lines - simdBBox bbox; - bbox.xmin = _simd_min_epi32(vXi[0], vXi[1]); - bbox.xmax = _simd_max_epi32(vXi[0], vXi[1]); - bbox.ymin = _simd_min_epi32(vYi[0], vYi[1]); - bbox.ymax = _simd_max_epi32(vYi[0], vYi[1]); - - // bloat bbox by line width along 
minor axis - simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f); - simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth); - simdBBox bloatBox; - bloatBox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi); - bloatBox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi); - bloatBox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi); - bloatBox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi); - - bbox.xmin = _simd_blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask); - bbox.xmax = _simd_blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask); - bbox.ymin = _simd_blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask); - bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask); - - // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. - { - simdscalari scisXmin, scisYmin, scisXmax, scisYmax; - if (state.backendState.readViewportArrayIndex) - { - GatherScissors::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, - scisXmin, scisYmin, scisXmax, scisYmax); - } - else // broadcast fast path for non-VPAI case. 
- { - scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin); - scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin); - scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax); - scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax); - } - - bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin); - bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin); - bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax); - bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax); - } - - // Cull prims completely outside scissor - { - simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax); - simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax); - simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY); - uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY)); - primMask = primMask & ~maskOutsideScissor; - } - - if (!primMask) - { - goto endBinLines; - } - - // Convert triangle bbox to macrotile units. 
- bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); - bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); - bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); - bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); - - OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH]; - _simd_store_si((simdscalari*)aMTLeft, bbox.xmin); - _simd_store_si((simdscalari*)aMTRight, bbox.xmax); - _simd_store_si((simdscalari*)aMTTop, bbox.ymin); - _simd_store_si((simdscalari*)aMTBottom, bbox.ymax); - - // transpose verts needed for backend - /// @todo modify BE to take non-transformed verts - vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused); - vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused); - vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused); - vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused); - - // store render target array index - OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; - if (state.backendState.readRenderTargetArrayIndex) - { - simdvector vRtai[2]; - pa.Assemble(VERTEX_SGV_SLOT, vRtai); - simdscalari vRtaii = _simd_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]); - _simd_store_si((simdscalari*)aRTAI, vRtaii); - } - else - { - _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); - } - - // scan remaining valid prims and bin each separately - DWORD primIndex; - while (_BitScanForward(&primIndex, primMask)) - { - uint32_t linkageCount = state.backendState.numAttributes; - uint32_t numScalarAttribs = linkageCount * 4; - - BE_WORK work; - work.type = DRAW; - - TRIANGLE_WORK_DESC &desc = work.desc.tri; - - desc.triFlags.frontFacing = 1; - desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1; - desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; - desc.triFlags.viewportIndex = pViewportIndex[primIndex]; - - work.pfnWork = RasterizeLine; - - auto pArena = pDC->pArena; - SWR_ASSERT(pArena != 
nullptr); - - // store active attribs - desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); - desc.numAttribs = linkageCount; - pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs); - - // store line vertex data - desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16); - SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]); - SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]); - SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]); - SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]); - - // store user clip distances - if (rastState.clipDistanceMask) - { - uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask); - desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float)); - ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer); - } - - MacroTileMgr *pTileMgr = pDC->pTileMgr; - for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y) - { - for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x) - { -#if KNOB_ENABLE_TOSS_POINTS - if (!KNOB_TOSS_SETUP_TRIS) -#endif - { - pTileMgr->enqueue(x, y, &work); - } - } - } - - primMask &= ~(1 << primIndex); - } - -endBinLines: - - AR_END(FEBinLines, 1); -} - -#if USE_SIMD16_FRONTEND -void BinPostSetupLines_simd16( +template +void BinPostSetupLinesImpl( DRAW_CONTEXT *pDC, - PA_STATE& pa, + PA_STATE &pa, uint32_t workerId, - simd16vector prim[], - simd16scalar recipW[], + typename SIMD_T::Vec4 prim[], + typename SIMD_T::Float recipW[], uint32_t primMask, - simd16scalari const &primID, - simd16scalari const &viewportIdx) + typename SIMD_T::Integer const &primID, + typename SIMD_T::Integer const &viewportIdx) { SWR_CONTEXT *pContext = pDC->pContext; AR_BEGIN(FEBinLines, pDC->drawId); - const API_STATE& state = GetApiState(pDC); - const SWR_RASTSTATE& rastState = state.rastState; + const API_STATE &state = 
GetApiState(pDC); + const SWR_RASTSTATE &rastState = state.rastState; // Select attribute processor PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2, state.backendState.swizzleEnable, state.backendState.constantInterpolationMask); - simd16scalar& vRecipW0 = recipW[0]; - simd16scalar& vRecipW1 = recipW[1]; + typename SIMD_T::Float &vRecipW0 = recipW[0]; + typename SIMD_T::Float &vRecipW1 = recipW[1]; // convert to fixed point - simd16scalari vXi[2], vYi[2]; + typename SIMD_T::Integer vXi[2], vYi[2]; - vXi[0] = fpToFixedPointVertical(prim[0].x); - vYi[0] = fpToFixedPointVertical(prim[0].y); - vXi[1] = fpToFixedPointVertical(prim[1].x); - vYi[1] = fpToFixedPointVertical(prim[1].y); + vXi[0] = fpToFixedPointVertical(prim[0].x); + vYi[0] = fpToFixedPointVertical(prim[0].y); + vXi[1] = fpToFixedPointVertical(prim[1].x); + vYi[1] = fpToFixedPointVertical(prim[1].y); // compute x-major vs y-major mask - simd16scalari xLength = _simd16_abs_epi32(_simd16_sub_epi32(vXi[0], vXi[1])); - simd16scalari yLength = _simd16_abs_epi32(_simd16_sub_epi32(vYi[0], vYi[1])); - simd16scalar vYmajorMask = _simd16_castsi_ps(_simd16_cmpgt_epi32(yLength, xLength)); - uint32_t yMajorMask = _simd16_movemask_ps(vYmajorMask); + typename SIMD_T::Integer xLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vXi[0], vXi[1])); + typename SIMD_T::Integer yLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vYi[0], vYi[1])); + typename SIMD_T::Float vYmajorMask = SIMD_T::castsi_ps(SIMD_T::cmpgt_epi32(yLength, xLength)); + uint32_t yMajorMask = SIMD_T::movemask_ps(vYmajorMask); // cull zero-length lines - simd16scalari vZeroLengthMask = _simd16_cmpeq_epi32(xLength, _simd16_setzero_si()); - vZeroLengthMask = _simd16_and_si(vZeroLengthMask, _simd16_cmpeq_epi32(yLength, _simd16_setzero_si())); + typename SIMD_T::Integer vZeroLengthMask = SIMD_T::cmpeq_epi32(xLength, SIMD_T::setzero_si()); + vZeroLengthMask = SIMD_T::and_si(vZeroLengthMask, SIMD_T::cmpeq_epi32(yLength, SIMD_T::setzero_si())); - primMask 
&= ~_simd16_movemask_ps(_simd16_castsi_ps(vZeroLengthMask)); + primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vZeroLengthMask)); uint32_t *pPrimID = (uint32_t *)&primID; const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; // Calc bounding box of lines - simd16BBox bbox; - bbox.xmin = _simd16_min_epi32(vXi[0], vXi[1]); - bbox.xmax = _simd16_max_epi32(vXi[0], vXi[1]); - bbox.ymin = _simd16_min_epi32(vYi[0], vYi[1]); - bbox.ymax = _simd16_max_epi32(vYi[0], vYi[1]); + SIMDBBOX_T bbox; + bbox.xmin = SIMD_T::min_epi32(vXi[0], vXi[1]); + bbox.xmax = SIMD_T::max_epi32(vXi[0], vXi[1]); + bbox.ymin = SIMD_T::min_epi32(vYi[0], vYi[1]); + bbox.ymax = SIMD_T::max_epi32(vYi[0], vYi[1]); // bloat bbox by line width along minor axis - simd16scalar vHalfWidth = _simd16_set1_ps(rastState.lineWidth / 2.0f); - simd16scalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth); + typename SIMD_T::Float vHalfWidth = SIMD_T::set1_ps(rastState.lineWidth / 2.0f); + typename SIMD_T::Integer vHalfWidthi = fpToFixedPointVertical(vHalfWidth); - simd16BBox bloatBox; + SIMDBBOX_T bloatBox; - bloatBox.xmin = _simd16_sub_epi32(bbox.xmin, vHalfWidthi); - bloatBox.xmax = _simd16_add_epi32(bbox.xmax, vHalfWidthi); - bloatBox.ymin = _simd16_sub_epi32(bbox.ymin, vHalfWidthi); - bloatBox.ymax = _simd16_add_epi32(bbox.ymax, vHalfWidthi); + bloatBox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi); + bloatBox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi); + bloatBox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi); + bloatBox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi); - bbox.xmin = _simd16_blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask); - bbox.xmax = _simd16_blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask); - bbox.ymin = _simd16_blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask); - bbox.ymax = _simd16_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask); + bbox.xmin = SIMD_T::blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask); + bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, 
bloatBox.xmax, vYmajorMask); + bbox.ymin = SIMD_T::blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask); + bbox.ymax = SIMD_T::blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask); // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. { - simd16scalari scisXmin, scisYmin, scisXmax, scisYmax; + typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax; if (state.backendState.readViewportArrayIndex) { - GatherScissors_simd16::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, - scisXmin, scisYmin, scisXmax, scisYmax); + GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax); } else // broadcast fast path for non-VPAI case. { - scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin); - scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin); - scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax); - scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax); + scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin); + scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin); + scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax); + scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax); } - bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin); - bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin); - bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax); - bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax); + bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin); + bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin); + bbox.xmax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax); + bbox.ymax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax); } // Cull prims completely outside scissor { - simd16scalari maskOutsideScissorX = 
_simd16_cmpgt_epi32(bbox.xmin, bbox.xmax); - simd16scalari maskOutsideScissorY = _simd16_cmpgt_epi32(bbox.ymin, bbox.ymax); - simd16scalari maskOutsideScissorXY = _simd16_or_si(maskOutsideScissorX, maskOutsideScissorY); - uint32_t maskOutsideScissor = _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY)); + typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax); + typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax); + typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY); + uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY)); primMask = primMask & ~maskOutsideScissor; } - const simdscalar unused = _simd_setzero_ps(); - // transpose verts needed for backend /// @todo modify BE to take non-transformed verts - simd4scalar vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH - simd4scalar vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH - simd4scalar vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH - simd4scalar vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH + simd4scalar vHorizX[SIMD_WIDTH]; + simd4scalar vHorizY[SIMD_WIDTH]; + simd4scalar vHorizZ[SIMD_WIDTH]; + simd4scalar vHorizW[SIMD_WIDTH]; if (!primMask) { @@ -2475,40 +1569,42 @@ void BinPostSetupLines_simd16( } // Convert triangle bbox to macrotile units. 
- bbox.xmin = _simd16_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); - bbox.ymin = _simd16_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); - bbox.xmax = _simd16_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); - bbox.ymax = _simd16_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); - - OSALIGNSIMD16(uint32_t) aMTLeft[KNOB_SIMD16_WIDTH], aMTRight[KNOB_SIMD16_WIDTH], aMTTop[KNOB_SIMD16_WIDTH], aMTBottom[KNOB_SIMD16_WIDTH]; +#if SIMD_WA_SXXI_EPI32 + bbox.xmin = simd_wa_srai_epi32(bbox.xmin); + bbox.ymin = simd_wa_srai_epi32(bbox.ymin); + bbox.xmax = simd_wa_srai_epi32(bbox.xmax); + bbox.ymax = simd_wa_srai_epi32(bbox.ymax); +#else + bbox.xmin = SIMD_T::srai_epi32(bbox.xmin); + bbox.ymin = SIMD_T::srai_epi32(bbox.ymin); + bbox.xmax = SIMD_T::srai_epi32(bbox.xmax); + bbox.ymax = SIMD_T::srai_epi32(bbox.ymax); +#endif - _simd16_store_si(reinterpret_cast(aMTLeft), bbox.xmin); - _simd16_store_si(reinterpret_cast(aMTRight), bbox.xmax); - _simd16_store_si(reinterpret_cast(aMTTop), bbox.ymin); - _simd16_store_si(reinterpret_cast(aMTBottom), bbox.ymax); + OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH]; - vTranspose3x8(vHorizX[0], _simd16_extract_ps(prim[0].x, 0), _simd16_extract_ps(prim[1].x, 0), unused); - vTranspose3x8(vHorizY[0], _simd16_extract_ps(prim[0].y, 0), _simd16_extract_ps(prim[1].y, 0), unused); - vTranspose3x8(vHorizZ[0], _simd16_extract_ps(prim[0].z, 0), _simd16_extract_ps(prim[1].z, 0), unused); - vTranspose3x8(vHorizW[0], _simd16_extract_ps(vRecipW0, 0), _simd16_extract_ps(vRecipW1, 0), unused); + SIMD_T::store_si(reinterpret_cast(aMTLeft), bbox.xmin); + SIMD_T::store_si(reinterpret_cast(aMTRight), bbox.xmax); + SIMD_T::store_si(reinterpret_cast(aMTTop), bbox.ymin); + SIMD_T::store_si(reinterpret_cast(aMTBottom), bbox.ymax); - vTranspose3x8(vHorizX[1], _simd16_extract_ps(prim[0].x, 1), _simd16_extract_ps(prim[1].x, 1), unused); - vTranspose3x8(vHorizY[1], 
_simd16_extract_ps(prim[0].y, 1), _simd16_extract_ps(prim[1].y, 1), unused); - vTranspose3x8(vHorizZ[1], _simd16_extract_ps(prim[0].z, 1), _simd16_extract_ps(prim[1].z, 1), unused); - vTranspose3x8(vHorizW[1], _simd16_extract_ps(vRecipW0, 1), _simd16_extract_ps(vRecipW1, 1), unused); + TransposeVertices(vHorizX, prim[0].x, prim[1].x, SIMD_T::setzero_ps()); + TransposeVertices(vHorizY, prim[0].y, prim[1].y, SIMD_T::setzero_ps()); + TransposeVertices(vHorizZ, prim[0].z, prim[1].z, SIMD_T::setzero_ps()); + TransposeVertices(vHorizW, vRecipW0, vRecipW1, SIMD_T::setzero_ps()); // store render target array index - OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH]; + OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH]; if (state.backendState.readRenderTargetArrayIndex) { - simd16vector vRtai[2]; - pa.Assemble_simd16(VERTEX_SGV_SLOT, vRtai); - simd16scalari vRtaii = _simd16_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]); - _simd16_store_si(reinterpret_cast(aRTAI), vRtaii); + typename SIMD_T::Vec4 vRtai[2]; + pa.Assemble(VERTEX_SGV_SLOT, vRtai); + typename SIMD_T::Integer vRtaii = SIMD_T::castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]); + SIMD_T::store_si(reinterpret_cast(aRTAI), vRtaii); } else { - _simd16_store_si(reinterpret_cast(aRTAI), _simd16_setzero_si()); + SIMD_T::store_si(reinterpret_cast(aRTAI), SIMD_T::setzero_si()); } // scan remaining valid prims and bin each separately @@ -2541,15 +1637,10 @@ void BinPostSetupLines_simd16( // store line vertex data desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16); - { - const uint32_t i = primIndex >> 3; // triIndex / KNOB_SIMD_WIDTH - const uint32_t j = primIndex & 7; // triIndex % KNOB_SIMD_WIDTH - - _mm_store_ps(&desc.pTriBuffer[ 0], vHorizX[i][j]); - _mm_store_ps(&desc.pTriBuffer[ 4], vHorizY[i][j]); - _mm_store_ps(&desc.pTriBuffer[ 8], vHorizZ[i][j]); - _mm_store_ps(&desc.pTriBuffer[12], vHorizW[i][j]); - } + _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]); + _mm_store_ps(&desc.pTriBuffer[4], 
vHorizY[primIndex]); + _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]); + _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]); // store user clip distances if (rastState.clipDistanceMask) @@ -2581,7 +1672,6 @@ endBinLines: AR_END(FEBinLines, 1); } -#endif ////////////////////////////////////////////////////////////////////////// /// @brief Bin SIMD lines to the backend. /// @param pDC - pointer to draw context. @@ -2590,48 +1680,49 @@ endBinLines: /// @param tri - Contains line position data for SIMDs worth of points. /// @param primID - Primitive ID for each line. /// @param viewportIdx - Viewport Array Index for each line. -void BinLines( +template +void SIMDCALL BinLinesImpl( DRAW_CONTEXT *pDC, - PA_STATE& pa, + PA_STATE &pa, uint32_t workerId, - simdvector prim[], + typename SIMD_T::Vec4 prim[3], uint32_t primMask, - simdscalari const &primID) + typename SIMD_T::Integer const &primID) { const API_STATE& state = GetApiState(pDC); const SWR_RASTSTATE& rastState = state.rastState; const SWR_FRONTEND_STATE& feState = state.frontendState; - simdscalar vRecipW[2] = { _simd_set1_ps(1.0f), _simd_set1_ps(1.0f) }; + typename SIMD_T::Float vRecipW[2] = { SIMD_T::set1_ps(1.0f), SIMD_T::set1_ps(1.0f) }; - simdscalari viewportIdx = _simd_set1_epi32(0); + typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0); if (state.backendState.readViewportArrayIndex) { - simdvector vpiAttrib[2]; + typename SIMD_T::Vec4 vpiAttrib[2]; pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib); - simdscalari vpai = _simd_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); - vpai = _simd_max_epi32(_simd_setzero_si(), vpai); // OOB indices => forced to zero. 
- simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); - simdscalari vClearMask = _simd_cmplt_epi32(vpai, vNumViewports); - viewportIdx = _simd_and_si(vClearMask, vpai); + typename SIMD_T::Integer vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); + vpai = SIMD_T::max_epi32(SIMD_T::setzero_si(), vpai); + typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); + typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports); + viewportIdx = SIMD_T::and_si(vClearMask, vpai); } if (!feState.vpTransformDisable) { // perspective divide - vRecipW[0] = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w); - vRecipW[1] = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w); + vRecipW[0] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w); + vRecipW[1] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[1].w); - prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW[0]); - prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW[1]); + prim[0].v[0] = SIMD_T::mul_ps(prim[0].v[0], vRecipW[0]); + prim[1].v[0] = SIMD_T::mul_ps(prim[1].v[0], vRecipW[1]); - prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW[0]); - prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW[1]); + prim[0].v[1] = SIMD_T::mul_ps(prim[0].v[1], vRecipW[0]); + prim[1].v[1] = SIMD_T::mul_ps(prim[1].v[1], vRecipW[1]); - prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW[0]); - prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW[1]); + prim[0].v[2] = SIMD_T::mul_ps(prim[0].v[2], vRecipW[0]); + prim[1].v[2] = SIMD_T::mul_ps(prim[1].v[2], vRecipW[1]); // viewport transform to screen coords if (state.backendState.readViewportArrayIndex) @@ -2645,14 +1736,15 @@ void BinLines( } // adjust for pixel center location - simdscalar offset = g_pixelOffsets[rastState.pixelLocation]; - prim[0].x = _simd_add_ps(prim[0].x, offset); - prim[0].y = _simd_add_ps(prim[0].y, offset); + typename SIMD_T::Float offset = g_pixelOffsets[rastState.pixelLocation]; + + prim[0].x = 
SIMD_T::add_ps(prim[0].x, offset); + prim[0].y = SIMD_T::add_ps(prim[0].y, offset); - prim[1].x = _simd_add_ps(prim[1].x, offset); - prim[1].y = _simd_add_ps(prim[1].y, offset); + prim[1].x = SIMD_T::add_ps(prim[1].x, offset); + prim[1].y = SIMD_T::add_ps(prim[1].y, offset); - BinPostSetupLines( + BinPostSetupLinesImpl( pDC, pa, workerId, @@ -2663,79 +1755,27 @@ void BinLines( viewportIdx); } +void BinLines( + DRAW_CONTEXT *pDC, + PA_STATE &pa, + uint32_t workerId, + simdvector prim[], + uint32_t primMask, + simdscalari const &primID) +{ + BinLinesImpl(pDC, pa, workerId, prim, primMask, primID); +} + #if USE_SIMD16_FRONTEND void SIMDCALL BinLines_simd16( DRAW_CONTEXT *pDC, - PA_STATE& pa, + PA_STATE &pa, uint32_t workerId, simd16vector prim[3], uint32_t primMask, simd16scalari const &primID) { - const API_STATE& state = GetApiState(pDC); - const SWR_RASTSTATE& rastState = state.rastState; - const SWR_FRONTEND_STATE& feState = state.frontendState; - - simd16scalar vRecipW[2] = { _simd16_set1_ps(1.0f), _simd16_set1_ps(1.0f) }; - - simd16scalari viewportIdx = _simd16_set1_epi32(0); - if (state.backendState.readViewportArrayIndex) - { - simd16vector vpiAttrib[2]; - pa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib); - - // OOB indices => forced to zero. 
- simd16scalari vpai = _simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); - vpai = _simd16_max_epi32(_simd16_setzero_si(), vpai); - simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); - simd16scalari vClearMask = _simd16_cmplt_epi32(vpai, vNumViewports); - viewportIdx = _simd16_and_si(vClearMask, vpai); - } - - if (!feState.vpTransformDisable) - { - // perspective divide - vRecipW[0] = _simd16_div_ps(_simd16_set1_ps(1.0f), prim[0].w); - vRecipW[1] = _simd16_div_ps(_simd16_set1_ps(1.0f), prim[1].w); - - prim[0].v[0] = _simd16_mul_ps(prim[0].v[0], vRecipW[0]); - prim[1].v[0] = _simd16_mul_ps(prim[1].v[0], vRecipW[1]); - - prim[0].v[1] = _simd16_mul_ps(prim[0].v[1], vRecipW[0]); - prim[1].v[1] = _simd16_mul_ps(prim[1].v[1], vRecipW[1]); - - prim[0].v[2] = _simd16_mul_ps(prim[0].v[2], vRecipW[0]); - prim[1].v[2] = _simd16_mul_ps(prim[1].v[2], vRecipW[1]); - - // viewport transform to screen coords - if (state.backendState.readViewportArrayIndex) - { - viewportTransform<2>(prim, state.vpMatrices, viewportIdx); - } - else - { - viewportTransform<2>(prim, state.vpMatrices); - } - } - - // adjust for pixel center location - simd16scalar offset = g_pixelOffsets_simd16[rastState.pixelLocation]; - - prim[0].x = _simd16_add_ps(prim[0].x, offset); - prim[0].y = _simd16_add_ps(prim[0].y, offset); - - prim[1].x = _simd16_add_ps(prim[1].x, offset); - prim[1].y = _simd16_add_ps(prim[1].y, offset); - - BinPostSetupLines_simd16( - pDC, - pa, - workerId, - prim, - vRecipW, - primMask, - primID, - viewportIdx); + BinLinesImpl(pDC, pa, workerId, prim, primMask, primID); } #endif diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.h b/src/gallium/drivers/swr/rasterizer/core/binner.h index 16161431d14..e842aa663b2 100644 --- a/src/gallium/drivers/swr/rasterizer/core/binner.h +++ b/src/gallium/drivers/swr/rasterizer/core/binner.h @@ -31,67 +31,39 @@ ////////////////////////////////////////////////////////////////////////// /// @brief Offsets added to 
post-viewport vertex positions based on /// raster state. -static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] = +template +static const typename SIMD_T::Float g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] = { - _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER - _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL + SIMD_T::set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER + SIMD_T::set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL }; -#if USE_SIMD16_FRONTEND -static const simd16scalar g_pixelOffsets_simd16[SWR_PIXEL_LOCATION_UL + 1] = -{ - _simd16_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER - _simd16_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL -}; - -#endif ////////////////////////////////////////////////////////////////////////// /// @brief Convert the X,Y coords of a triangle to the requested Fixed /// Point precision from FP32. -template > -INLINE simdscalari fpToFixedPointVertical(const simdscalar &vIn) +template > +INLINE typename SIMD_T::Integer fpToFixedPointVertical(const typename SIMD_T::Float &vIn) { - simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::ScaleT::value)); - return _simd_cvtps_epi32(vFixed); + return SIMD_T::cvtps_epi32(SIMD_T::mul_ps(vIn, SIMD_T::set1_ps(PT::ScaleT::value))); } -#if USE_SIMD16_FRONTEND -template > -INLINE simd16scalari fpToFixedPointVertical(const simd16scalar &vIn) -{ - simd16scalar vFixed = _simd16_mul_ps(vIn, _simd16_set1_ps(PT::ScaleT::value)); - return _simd16_cvtps_epi32(vFixed); -} - -#endif ////////////////////////////////////////////////////////////////////////// /// @brief Helper function to set the X,Y coords of a triangle to the /// requested Fixed Point precision from FP32. 
/// @param tri: simdvector[3] of FP triangle verts /// @param vXi: fixed point X coords of tri verts /// @param vYi: fixed point Y coords of tri verts -INLINE static void FPToFixedPoint(const simdvector * const tri, simdscalari(&vXi)[3], simdscalari(&vYi)[3]) -{ - vXi[0] = fpToFixedPointVertical(tri[0].x); - vYi[0] = fpToFixedPointVertical(tri[0].y); - vXi[1] = fpToFixedPointVertical(tri[1].x); - vYi[1] = fpToFixedPointVertical(tri[1].y); - vXi[2] = fpToFixedPointVertical(tri[2].x); - vYi[2] = fpToFixedPointVertical(tri[2].y); -} - -#if USE_SIMD16_FRONTEND -INLINE static void FPToFixedPoint(const simd16vector * const tri, simd16scalari(&vXi)[3], simd16scalari(&vYi)[3]) +template <typename SIMD_T> +INLINE static void FPToFixedPoint(const typename SIMD_T::Vec4 *const tri, typename SIMD_T::Integer(&vXi)[3], typename SIMD_T::Integer(&vYi)[3]) { - vXi[0] = fpToFixedPointVertical(tri[0].x); - vYi[0] = fpToFixedPointVertical(tri[0].y); - vXi[1] = fpToFixedPointVertical(tri[1].x); - vYi[1] = fpToFixedPointVertical(tri[1].y); - vXi[2] = fpToFixedPointVertical(tri[2].x); - vYi[2] = fpToFixedPointVertical(tri[2].y); + vXi[0] = fpToFixedPointVertical<SIMD_T>(tri[0].x); + vYi[0] = fpToFixedPointVertical<SIMD_T>(tri[0].y); + vXi[1] = fpToFixedPointVertical<SIMD_T>(tri[1].x); + vYi[1] = fpToFixedPointVertical<SIMD_T>(tri[1].y); + vXi[2] = fpToFixedPointVertical<SIMD_T>(tri[2].x); + vYi[2] = fpToFixedPointVertical<SIMD_T>(tri[2].y); } -#endif ////////////////////////////////////////////////////////////////////////// /// @brief Calculate bounding box for current triangle /// @tparam CT: ConservativeRastFETraits type @@ -100,124 +72,44 @@ INLINE static void FPToFixedPoint(const simd16vector * const tri, simd16scalari( /// @param bbox: fixed point bbox /// *Note*: expects vX, vY to be in the correct precision for the type /// of rasterization. This avoids unnecessary FP->fixed conversions. 
-template -INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari(&vX)[3], simdscalari(&vY)[3], simdBBox &bbox) +template +INLINE void calcBoundingBoxIntVertical(const typename SIMD_T::Integer(&vX)[3], const typename SIMD_T::Integer(&vY)[3], SIMDBBOX_T &bbox) { - simdscalari vMinX = vX[0]; - vMinX = _simd_min_epi32(vMinX, vX[1]); - vMinX = _simd_min_epi32(vMinX, vX[2]); + typename SIMD_T::Integer vMinX = vX[0]; - simdscalari vMaxX = vX[0]; - vMaxX = _simd_max_epi32(vMaxX, vX[1]); - vMaxX = _simd_max_epi32(vMaxX, vX[2]); + vMinX = SIMD_T::min_epi32(vMinX, vX[1]); + vMinX = SIMD_T::min_epi32(vMinX, vX[2]); - simdscalari vMinY = vY[0]; - vMinY = _simd_min_epi32(vMinY, vY[1]); - vMinY = _simd_min_epi32(vMinY, vY[2]); + typename SIMD_T::Integer vMaxX = vX[0]; - simdscalari vMaxY = vY[0]; - vMaxY = _simd_max_epi32(vMaxY, vY[1]); - vMaxY = _simd_max_epi32(vMaxY, vY[2]); + vMaxX = SIMD_T::max_epi32(vMaxX, vX[1]); + vMaxX = SIMD_T::max_epi32(vMaxX, vX[2]); - bbox.xmin = vMinX; - bbox.xmax = vMaxX; - bbox.ymin = vMinY; - bbox.ymax = vMaxY; -} + typename SIMD_T::Integer vMinY = vY[0]; -#if USE_SIMD16_FRONTEND -template -INLINE void calcBoundingBoxIntVertical(const simd16vector * const tri, simd16scalari(&vX)[3], simd16scalari(&vY)[3], simd16BBox &bbox) -{ - simd16scalari vMinX = vX[0]; - - vMinX = _simd16_min_epi32(vMinX, vX[1]); - vMinX = _simd16_min_epi32(vMinX, vX[2]); + vMinY = SIMD_T::min_epi32(vMinY, vY[1]); + vMinY = SIMD_T::min_epi32(vMinY, vY[2]); - simd16scalari vMaxX = vX[0]; + typename SIMD_T::Integer vMaxY = vY[0]; - vMaxX = _simd16_max_epi32(vMaxX, vX[1]); - vMaxX = _simd16_max_epi32(vMaxX, vX[2]); + vMaxY = SIMD_T::max_epi32(vMaxY, vY[1]); + vMaxY = SIMD_T::max_epi32(vMaxY, vY[2]); - simd16scalari vMinY = vY[0]; + if (CT::BoundingBoxOffsetT::value != 0) + { + /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization + /// expand bbox by 1/256; coverage will be correctly handled in the 
rasterizer. - vMinY = _simd16_min_epi32(vMinY, vY[1]); - vMinY = _simd16_min_epi32(vMinY, vY[2]); + const typename SIMD_T::Integer value = SIMD_T::set1_epi32(CT::BoundingBoxOffsetT::value); - simd16scalari vMaxY = vY[0]; - - vMaxY = _simd16_max_epi32(vMaxY, vY[1]); - vMaxY = _simd16_max_epi32(vMaxY, vY[2]); + vMinX = SIMD_T::sub_epi32(vMinX, value); + vMaxX = SIMD_T::add_epi32(vMaxX, value); + vMinY = SIMD_T::sub_epi32(vMinY, value); + vMaxY = SIMD_T::add_epi32(vMaxY, value); + } bbox.xmin = vMinX; bbox.xmax = vMaxX; bbox.ymin = vMinY; bbox.ymax = vMaxY; } - -#endif -////////////////////////////////////////////////////////////////////////// -/// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical -/// Offsets BBox for conservative rast -template <> -INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari(&vX)[3], simdscalari(&vY)[3], simdBBox &bbox) -{ - // FE conservative rast traits - typedef FEConservativeRastT CT; - - simdscalari vMinX = vX[0]; - vMinX = _simd_min_epi32(vMinX, vX[1]); - vMinX = _simd_min_epi32(vMinX, vX[2]); - - simdscalari vMaxX = vX[0]; - vMaxX = _simd_max_epi32(vMaxX, vX[1]); - vMaxX = _simd_max_epi32(vMaxX, vX[2]); - - simdscalari vMinY = vY[0]; - vMinY = _simd_min_epi32(vMinY, vY[1]); - vMinY = _simd_min_epi32(vMinY, vY[2]); - - simdscalari vMaxY = vY[0]; - vMaxY = _simd_max_epi32(vMaxY, vY[1]); - vMaxY = _simd_max_epi32(vMaxY, vY[2]); - - /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization - /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer. 
- bbox.xmin = _simd_sub_epi32(vMinX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value)); - bbox.xmax = _simd_add_epi32(vMaxX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value)); - bbox.ymin = _simd_sub_epi32(vMinY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value)); - bbox.ymax = _simd_add_epi32(vMaxY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value)); -} - -#if USE_SIMD16_FRONTEND -template <> -INLINE void calcBoundingBoxIntVertical(const simd16vector * const tri, simd16scalari(&vX)[3], simd16scalari(&vY)[3], simd16BBox &bbox) -{ - // FE conservative rast traits - typedef FEConservativeRastT CT; - - simd16scalari vMinX = vX[0]; - vMinX = _simd16_min_epi32(vMinX, vX[1]); - vMinX = _simd16_min_epi32(vMinX, vX[2]); - - simd16scalari vMaxX = vX[0]; - vMaxX = _simd16_max_epi32(vMaxX, vX[1]); - vMaxX = _simd16_max_epi32(vMaxX, vX[2]); - - simd16scalari vMinY = vY[0]; - vMinY = _simd16_min_epi32(vMinY, vY[1]); - vMinY = _simd16_min_epi32(vMinY, vY[2]); - - simd16scalari vMaxY = vY[0]; - vMaxY = _simd16_max_epi32(vMaxY, vY[1]); - vMaxY = _simd16_max_epi32(vMaxY, vY[2]); - - /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization - /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer. 
- bbox.xmin = _simd16_sub_epi32(vMinX, _simd16_set1_epi32(CT::BoundingBoxOffsetT::value)); - bbox.xmax = _simd16_add_epi32(vMaxX, _simd16_set1_epi32(CT::BoundingBoxOffsetT::value)); - bbox.ymin = _simd16_sub_epi32(vMinY, _simd16_set1_epi32(CT::BoundingBoxOffsetT::value)); - bbox.ymax = _simd16_add_epi32(vMaxY, _simd16_set1_epi32(CT::BoundingBoxOffsetT::value)); -} - -#endif diff --git a/src/gallium/drivers/swr/rasterizer/core/conservativeRast.h b/src/gallium/drivers/swr/rasterizer/core/conservativeRast.h index 1d8546959f5..00c3a87c188 100644 --- a/src/gallium/drivers/swr/rasterizer/core/conservativeRast.h +++ b/src/gallium/drivers/swr/rasterizer/core/conservativeRast.h @@ -109,6 +109,7 @@ template <> struct ConservativeRastFETraits { typedef std::false_type IsConservativeT; + typedef std::integral_constant BoundingBoxOffsetT; }; ////////////////////////////////////////////////////////////////////////// diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h index 87dba22bf80..b2ae8dafb97 100644 --- a/src/gallium/drivers/swr/rasterizer/core/pa.h +++ b/src/gallium/drivers/swr/rasterizer/core/pa.h @@ -91,6 +91,7 @@ struct PA_STATE virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0; #if ENABLE_AVX512_SIMD16 virtual bool Assemble_simd16(uint32_t slot, simd16vector verts[]) = 0; + virtual bool Assemble(uint32_t slot, simd16vector verts[]) = 0; #endif virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) = 0; virtual bool NextPrim() = 0; @@ -203,6 +204,11 @@ struct PA_STATE_OPT : public PA_STATE return this->pfnPaFunc_simd16(*this, slot, verts); } + bool Assemble(uint32_t slot, simd16vector verts[]) + { + return Assemble_simd16(slot, verts); + } + #endif // Assembles 1 primitive. Each simdscalar is a vertex (xyzw). 
void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) @@ -765,6 +771,11 @@ struct PA_STATE_CUT : public PA_STATE return true; } + bool Assemble(uint32_t slot, simd16vector verts[]) + { + return Assemble_simd16(slot, verts); + } + #endif void AssembleSingle(uint32_t slot, uint32_t triIndex, simd4scalar tri[3]) { @@ -1319,6 +1330,11 @@ struct PA_TESS : PA_STATE return true; } + bool Assemble(uint32_t slot, simd16vector verts[]) + { + return Assemble_simd16(slot, verts); + } + #endif void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) { diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h index 392ee4ba94a..b096d2120cb 100644 --- a/src/gallium/drivers/swr/rasterizer/core/utils.h +++ b/src/gallium/drivers/swr/rasterizer/core/utils.h @@ -53,6 +53,14 @@ struct simd16BBox }; #endif +template <typename SIMD_T> +struct SIMDBBOX_T +{ + typename SIMD_T::Integer ymin; + typename SIMD_T::Integer ymax; + typename SIMD_T::Integer xmin; + typename SIMD_T::Integer xmax; +}; // helper function to unroll loops template