#include "tilemgr.h"
// Function Prototype
-void BinPostSetupLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], simdscalar vRecipW[2], uint32_t primMask, simdscalari const &primID, simdscalari const &viewportIdx);
-void BinPostSetupPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primID, simdscalari const &viewportIdx);
/// @brief Forward declaration of the SIMD-width-generic line binner.
///        The definition is not part of this excerpt. The wireframe fill-mode
///        path of BinTrianglesImpl calls it once per triangle edge, passing a
///        two-element vertex array and the matching pair of reciprocal-W values.
/// @tparam SIMD_T - SIMD traits type supplying the Vec4 / Float / Integer types.
/// @tparam SIMD_WIDTH - lane count paired with SIMD_T at the call sites.
/// @param prim - line vertices (two Vec4 entries at the visible call sites).
/// @param recipW - reciprocal-W per line vertex (two entries at the visible call sites).
/// @param primMask - per-lane mask; the visible callers pass their surviving-triangle mask.
/// @param primID - primitive ID per lane.
/// @param viewportIdx - viewport array index per lane.
+template <typename SIMD_T, uint32_t SIMD_WIDTH>
+void BinPostSetupLinesImpl(
+ DRAW_CONTEXT *pDC,
+ PA_STATE &pa,
+ uint32_t workerId,
+ typename SIMD_T::Vec4 prim[],
+ typename SIMD_T::Float recipW[],
+ uint32_t primMask,
+ typename SIMD_T::Integer const &primID,
+ typename SIMD_T::Integer const &viewportIdx);
-#if USE_SIMD16_FRONTEND
-void BinPostSetupLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], simd16scalar vRecipW[2], uint32_t primMask, simd16scalari const &primID, simd16scalari const &viewportIdx);
-void BinPostSetupPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primID, simd16scalari const &viewportIdx);
-#endif
/// @brief Forward declaration of the SIMD-width-generic point binner.
///        The definition is not part of this excerpt. The point fill-mode path
///        of BinTrianglesImpl calls it three times, once with the address of
///        each triangle vertex.
/// @tparam SIMD_T - SIMD traits type supplying the Vec4 / Integer types.
/// @tparam SIMD_WIDTH - lane count paired with SIMD_T at the call sites.
/// @param prim - point position(s); the visible callers pass a pointer to a single Vec4.
/// @param primMask - per-lane mask; the visible callers pass their surviving-triangle mask.
/// @param primID - primitive ID per lane.
/// @param viewportIdx - viewport array index per lane.
+template <typename SIMD_T, uint32_t SIMD_WIDTH>
+void BinPostSetupPointsImpl(
+ DRAW_CONTEXT *pDC,
+ PA_STATE &pa,
+ uint32_t workerId,
+ typename SIMD_T::Vec4 prim[],
+ uint32_t primMask,
+ typename SIMD_T::Integer const &primID,
+ typename SIMD_T::Integer const &viewportIdx);
//////////////////////////////////////////////////////////////////////////
/// @brief Processes attributes for the backend based on linkage mask and
/// @param scisYmax - output vector of per-primitive scissor rect Ymax data.
//
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
-template<size_t SimdWidth>
-struct GatherScissors
-{
- static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
- simdscalari &scisXmin, simdscalari &scisYmin,
- simdscalari &scisXmax, simdscalari &scisYmax)
- {
- SWR_INVALID("Unhandled Simd Width in Scissor Rect Gather");
- }
-};
-
-template<>
-struct GatherScissors<8>
-{
- static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
- simdscalari &scisXmin, simdscalari &scisYmin,
- simdscalari &scisXmax, simdscalari &scisYmax)
- {
- scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
- pScissorsInFixedPoint[pViewportIndex[1]].xmin,
- pScissorsInFixedPoint[pViewportIndex[2]].xmin,
- pScissorsInFixedPoint[pViewportIndex[3]].xmin,
- pScissorsInFixedPoint[pViewportIndex[4]].xmin,
- pScissorsInFixedPoint[pViewportIndex[5]].xmin,
- pScissorsInFixedPoint[pViewportIndex[6]].xmin,
- pScissorsInFixedPoint[pViewportIndex[7]].xmin);
- scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin,
- pScissorsInFixedPoint[pViewportIndex[1]].ymin,
- pScissorsInFixedPoint[pViewportIndex[2]].ymin,
- pScissorsInFixedPoint[pViewportIndex[3]].ymin,
- pScissorsInFixedPoint[pViewportIndex[4]].ymin,
- pScissorsInFixedPoint[pViewportIndex[5]].ymin,
- pScissorsInFixedPoint[pViewportIndex[6]].ymin,
- pScissorsInFixedPoint[pViewportIndex[7]].ymin);
- scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax,
- pScissorsInFixedPoint[pViewportIndex[1]].xmax,
- pScissorsInFixedPoint[pViewportIndex[2]].xmax,
- pScissorsInFixedPoint[pViewportIndex[3]].xmax,
- pScissorsInFixedPoint[pViewportIndex[4]].xmax,
- pScissorsInFixedPoint[pViewportIndex[5]].xmax,
- pScissorsInFixedPoint[pViewportIndex[6]].xmax,
- pScissorsInFixedPoint[pViewportIndex[7]].xmax);
- scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax,
- pScissorsInFixedPoint[pViewportIndex[1]].ymax,
- pScissorsInFixedPoint[pViewportIndex[2]].ymax,
- pScissorsInFixedPoint[pViewportIndex[3]].ymax,
- pScissorsInFixedPoint[pViewportIndex[4]].ymax,
- pScissorsInFixedPoint[pViewportIndex[5]].ymax,
- pScissorsInFixedPoint[pViewportIndex[6]].ymax,
- pScissorsInFixedPoint[pViewportIndex[7]].ymax);
- }
-};
-
-#if USE_SIMD16_FRONTEND
-template<size_t SimdWidth>
-struct GatherScissors_simd16
/// @brief 8-wide scissor gather. Builds four integer vectors (xmin, ymin, xmax,
///        ymax) by looking up pScissorsInFixedPoint[pViewportIndex[k]] for
///        k = 0..7 and packing the corresponding rect field of each entry.
/// @param pScissorsInFixedPoint - array of scissor rects indexed by viewport index.
/// @param pViewportIndex - per-primitive viewport indices; entries [0..7] are read.
/// @param scisXmin - output vector of gathered xmin values.
/// @param scisYmin - output vector of gathered ymin values.
/// @param scisXmax - output vector of gathered xmax values.
/// @param scisYmax - output vector of gathered ymax values.
/// NOTE(review): pViewportIndex[0] is passed as the FIRST argument of
/// _simd_set_epi32. If that helper follows the _mm256_set_epi32 convention
/// (first argument lands in the highest lane), lane k receives the rect for
/// primitive 7-k rather than k -- confirm the intended lane order.
+static void GatherScissors(const SWR_RECT *pScissorsInFixedPoint, const uint32_t *pViewportIndex,
+ simdscalari &scisXmin, simdscalari &scisYmin, simdscalari &scisXmax, simdscalari &scisYmax)
{
- static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
- simd16scalari &scisXmin, simd16scalari &scisYmin,
- simd16scalari &scisXmax, simd16scalari &scisYmax)
- {
- SWR_INVALID("Unhandled Simd Width in Scissor Rect Gather");
- }
-};
+ scisXmin = _simd_set_epi32(
+ pScissorsInFixedPoint[pViewportIndex[0]].xmin,
+ pScissorsInFixedPoint[pViewportIndex[1]].xmin,
+ pScissorsInFixedPoint[pViewportIndex[2]].xmin,
+ pScissorsInFixedPoint[pViewportIndex[3]].xmin,
+ pScissorsInFixedPoint[pViewportIndex[4]].xmin,
+ pScissorsInFixedPoint[pViewportIndex[5]].xmin,
+ pScissorsInFixedPoint[pViewportIndex[6]].xmin,
+ pScissorsInFixedPoint[pViewportIndex[7]].xmin);
+ scisYmin = _simd_set_epi32(
+ pScissorsInFixedPoint[pViewportIndex[0]].ymin,
+ pScissorsInFixedPoint[pViewportIndex[1]].ymin,
+ pScissorsInFixedPoint[pViewportIndex[2]].ymin,
+ pScissorsInFixedPoint[pViewportIndex[3]].ymin,
+ pScissorsInFixedPoint[pViewportIndex[4]].ymin,
+ pScissorsInFixedPoint[pViewportIndex[5]].ymin,
+ pScissorsInFixedPoint[pViewportIndex[6]].ymin,
+ pScissorsInFixedPoint[pViewportIndex[7]].ymin);
+ scisXmax = _simd_set_epi32(
+ pScissorsInFixedPoint[pViewportIndex[0]].xmax,
+ pScissorsInFixedPoint[pViewportIndex[1]].xmax,
+ pScissorsInFixedPoint[pViewportIndex[2]].xmax,
+ pScissorsInFixedPoint[pViewportIndex[3]].xmax,
+ pScissorsInFixedPoint[pViewportIndex[4]].xmax,
+ pScissorsInFixedPoint[pViewportIndex[5]].xmax,
+ pScissorsInFixedPoint[pViewportIndex[6]].xmax,
+ pScissorsInFixedPoint[pViewportIndex[7]].xmax);
+ scisYmax = _simd_set_epi32(
+ pScissorsInFixedPoint[pViewportIndex[0]].ymax,
+ pScissorsInFixedPoint[pViewportIndex[1]].ymax,
+ pScissorsInFixedPoint[pViewportIndex[2]].ymax,
+ pScissorsInFixedPoint[pViewportIndex[3]].ymax,
+ pScissorsInFixedPoint[pViewportIndex[4]].ymax,
+ pScissorsInFixedPoint[pViewportIndex[5]].ymax,
+ pScissorsInFixedPoint[pViewportIndex[6]].ymax,
+ pScissorsInFixedPoint[pViewportIndex[7]].ymax);
+}
-template<>
-struct GatherScissors_simd16<16>
/// @brief 16-wide scissor gather. Same operation as the 8-wide overload, but
///        reads pViewportIndex[0..15] and packs the rect fields into
///        simd16scalari vectors via _simd16_set_epi32.
/// @param pScissorsInFixedPoint - array of scissor rects indexed by viewport index.
/// @param pViewportIndex - per-primitive viewport indices; entries [0..15] are read.
/// @param scisXmin - output vector of gathered xmin values.
/// @param scisYmin - output vector of gathered ymin values.
/// @param scisXmax - output vector of gathered xmax values.
/// @param scisYmax - output vector of gathered ymax values.
/// NOTE(review): pViewportIndex[0] is passed as the FIRST argument of
/// _simd16_set_epi32. If that helper follows the _mm512_set_epi32 convention
/// (first argument lands in the highest lane), lane k receives the rect for
/// primitive 15-k rather than k -- confirm the intended lane order.
+static void GatherScissors(const SWR_RECT *pScissorsInFixedPoint, const uint32_t *pViewportIndex,
+ simd16scalari &scisXmin, simd16scalari &scisYmin, simd16scalari &scisXmax, simd16scalari &scisYmax)
{
- static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
- simd16scalari &scisXmin, simd16scalari &scisYmin,
- simd16scalari &scisXmax, simd16scalari &scisYmax) {
- scisXmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
- pScissorsInFixedPoint[pViewportIndex[1]].xmin,
- pScissorsInFixedPoint[pViewportIndex[2]].xmin,
- pScissorsInFixedPoint[pViewportIndex[3]].xmin,
- pScissorsInFixedPoint[pViewportIndex[4]].xmin,
- pScissorsInFixedPoint[pViewportIndex[5]].xmin,
- pScissorsInFixedPoint[pViewportIndex[6]].xmin,
- pScissorsInFixedPoint[pViewportIndex[7]].xmin,
- pScissorsInFixedPoint[pViewportIndex[8]].xmin,
- pScissorsInFixedPoint[pViewportIndex[9]].xmin,
- pScissorsInFixedPoint[pViewportIndex[10]].xmin,
- pScissorsInFixedPoint[pViewportIndex[11]].xmin,
- pScissorsInFixedPoint[pViewportIndex[12]].xmin,
- pScissorsInFixedPoint[pViewportIndex[13]].xmin,
- pScissorsInFixedPoint[pViewportIndex[14]].xmin,
- pScissorsInFixedPoint[pViewportIndex[15]].xmin);
-
- scisYmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin,
- pScissorsInFixedPoint[pViewportIndex[1]].ymin,
- pScissorsInFixedPoint[pViewportIndex[2]].ymin,
- pScissorsInFixedPoint[pViewportIndex[3]].ymin,
- pScissorsInFixedPoint[pViewportIndex[4]].ymin,
- pScissorsInFixedPoint[pViewportIndex[5]].ymin,
- pScissorsInFixedPoint[pViewportIndex[6]].ymin,
- pScissorsInFixedPoint[pViewportIndex[7]].ymin,
- pScissorsInFixedPoint[pViewportIndex[8]].ymin,
- pScissorsInFixedPoint[pViewportIndex[9]].ymin,
- pScissorsInFixedPoint[pViewportIndex[10]].ymin,
- pScissorsInFixedPoint[pViewportIndex[11]].ymin,
- pScissorsInFixedPoint[pViewportIndex[12]].ymin,
- pScissorsInFixedPoint[pViewportIndex[13]].ymin,
- pScissorsInFixedPoint[pViewportIndex[14]].ymin,
- pScissorsInFixedPoint[pViewportIndex[15]].ymin);
-
- scisXmax = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax,
- pScissorsInFixedPoint[pViewportIndex[1]].xmax,
- pScissorsInFixedPoint[pViewportIndex[2]].xmax,
- pScissorsInFixedPoint[pViewportIndex[3]].xmax,
- pScissorsInFixedPoint[pViewportIndex[4]].xmax,
- pScissorsInFixedPoint[pViewportIndex[5]].xmax,
- pScissorsInFixedPoint[pViewportIndex[6]].xmax,
- pScissorsInFixedPoint[pViewportIndex[7]].xmax,
- pScissorsInFixedPoint[pViewportIndex[8]].xmax,
- pScissorsInFixedPoint[pViewportIndex[9]].xmax,
- pScissorsInFixedPoint[pViewportIndex[10]].xmax,
- pScissorsInFixedPoint[pViewportIndex[11]].xmax,
- pScissorsInFixedPoint[pViewportIndex[12]].xmax,
- pScissorsInFixedPoint[pViewportIndex[13]].xmax,
- pScissorsInFixedPoint[pViewportIndex[14]].xmax,
- pScissorsInFixedPoint[pViewportIndex[15]].xmax);
-
- scisYmax = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax,
- pScissorsInFixedPoint[pViewportIndex[1]].ymax,
- pScissorsInFixedPoint[pViewportIndex[2]].ymax,
- pScissorsInFixedPoint[pViewportIndex[3]].ymax,
- pScissorsInFixedPoint[pViewportIndex[4]].ymax,
- pScissorsInFixedPoint[pViewportIndex[5]].ymax,
- pScissorsInFixedPoint[pViewportIndex[6]].ymax,
- pScissorsInFixedPoint[pViewportIndex[7]].ymax,
- pScissorsInFixedPoint[pViewportIndex[8]].ymax,
- pScissorsInFixedPoint[pViewportIndex[9]].ymax,
- pScissorsInFixedPoint[pViewportIndex[10]].ymax,
- pScissorsInFixedPoint[pViewportIndex[11]].ymax,
- pScissorsInFixedPoint[pViewportIndex[12]].ymax,
- pScissorsInFixedPoint[pViewportIndex[13]].ymax,
- pScissorsInFixedPoint[pViewportIndex[14]].ymax,
- pScissorsInFixedPoint[pViewportIndex[15]].ymax);
- }
-};
+ scisXmin = _simd16_set_epi32(
+ pScissorsInFixedPoint[pViewportIndex[0]].xmin,
+ pScissorsInFixedPoint[pViewportIndex[1]].xmin,
+ pScissorsInFixedPoint[pViewportIndex[2]].xmin,
+ pScissorsInFixedPoint[pViewportIndex[3]].xmin,
+ pScissorsInFixedPoint[pViewportIndex[4]].xmin,
+ pScissorsInFixedPoint[pViewportIndex[5]].xmin,
+ pScissorsInFixedPoint[pViewportIndex[6]].xmin,
+ pScissorsInFixedPoint[pViewportIndex[7]].xmin,
+ pScissorsInFixedPoint[pViewportIndex[8]].xmin,
+ pScissorsInFixedPoint[pViewportIndex[9]].xmin,
+ pScissorsInFixedPoint[pViewportIndex[10]].xmin,
+ pScissorsInFixedPoint[pViewportIndex[11]].xmin,
+ pScissorsInFixedPoint[pViewportIndex[12]].xmin,
+ pScissorsInFixedPoint[pViewportIndex[13]].xmin,
+ pScissorsInFixedPoint[pViewportIndex[14]].xmin,
+ pScissorsInFixedPoint[pViewportIndex[15]].xmin);
+
+ scisYmin = _simd16_set_epi32(
+ pScissorsInFixedPoint[pViewportIndex[0]].ymin,
+ pScissorsInFixedPoint[pViewportIndex[1]].ymin,
+ pScissorsInFixedPoint[pViewportIndex[2]].ymin,
+ pScissorsInFixedPoint[pViewportIndex[3]].ymin,
+ pScissorsInFixedPoint[pViewportIndex[4]].ymin,
+ pScissorsInFixedPoint[pViewportIndex[5]].ymin,
+ pScissorsInFixedPoint[pViewportIndex[6]].ymin,
+ pScissorsInFixedPoint[pViewportIndex[7]].ymin,
+ pScissorsInFixedPoint[pViewportIndex[8]].ymin,
+ pScissorsInFixedPoint[pViewportIndex[9]].ymin,
+ pScissorsInFixedPoint[pViewportIndex[10]].ymin,
+ pScissorsInFixedPoint[pViewportIndex[11]].ymin,
+ pScissorsInFixedPoint[pViewportIndex[12]].ymin,
+ pScissorsInFixedPoint[pViewportIndex[13]].ymin,
+ pScissorsInFixedPoint[pViewportIndex[14]].ymin,
+ pScissorsInFixedPoint[pViewportIndex[15]].ymin);
+
+ scisXmax = _simd16_set_epi32(
+ pScissorsInFixedPoint[pViewportIndex[0]].xmax,
+ pScissorsInFixedPoint[pViewportIndex[1]].xmax,
+ pScissorsInFixedPoint[pViewportIndex[2]].xmax,
+ pScissorsInFixedPoint[pViewportIndex[3]].xmax,
+ pScissorsInFixedPoint[pViewportIndex[4]].xmax,
+ pScissorsInFixedPoint[pViewportIndex[5]].xmax,
+ pScissorsInFixedPoint[pViewportIndex[6]].xmax,
+ pScissorsInFixedPoint[pViewportIndex[7]].xmax,
+ pScissorsInFixedPoint[pViewportIndex[8]].xmax,
+ pScissorsInFixedPoint[pViewportIndex[9]].xmax,
+ pScissorsInFixedPoint[pViewportIndex[10]].xmax,
+ pScissorsInFixedPoint[pViewportIndex[11]].xmax,
+ pScissorsInFixedPoint[pViewportIndex[12]].xmax,
+ pScissorsInFixedPoint[pViewportIndex[13]].xmax,
+ pScissorsInFixedPoint[pViewportIndex[14]].xmax,
+ pScissorsInFixedPoint[pViewportIndex[15]].xmax);
+
+ scisYmax = _simd16_set_epi32(
+ pScissorsInFixedPoint[pViewportIndex[0]].ymax,
+ pScissorsInFixedPoint[pViewportIndex[1]].ymax,
+ pScissorsInFixedPoint[pViewportIndex[2]].ymax,
+ pScissorsInFixedPoint[pViewportIndex[3]].ymax,
+ pScissorsInFixedPoint[pViewportIndex[4]].ymax,
+ pScissorsInFixedPoint[pViewportIndex[5]].ymax,
+ pScissorsInFixedPoint[pViewportIndex[6]].ymax,
+ pScissorsInFixedPoint[pViewportIndex[7]].ymax,
+ pScissorsInFixedPoint[pViewportIndex[8]].ymax,
+ pScissorsInFixedPoint[pViewportIndex[9]].ymax,
+ pScissorsInFixedPoint[pViewportIndex[10]].ymax,
+ pScissorsInFixedPoint[pViewportIndex[11]].ymax,
+ pScissorsInFixedPoint[pViewportIndex[12]].ymax,
+ pScissorsInFixedPoint[pViewportIndex[13]].ymax,
+ pScissorsInFixedPoint[pViewportIndex[14]].ymax,
+ pScissorsInFixedPoint[pViewportIndex[15]].ymax);
+}
-#endif
typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);
struct ProcessAttributesChooser
}
}
+// WA linux compiler issue with SIMDLIB and shift immediates
+#define SIMD_WA_SXXI_EPI32 1
+
+#if SIMD_WA_SXXI_EPI32
/// @brief 8-wide overload: forwards to SIMD256::slli_epi32 with the shift count
///        i supplied as a template argument. Overload selection is by argument
///        type, so a caller only has to name the shift count.
/// @tparam i - shift count passed through as an immediate.
/// @param a - input integer vector.
/// @return result of SIMD256::slli_epi32<i>(a).
+template<int i>
+simdscalari simd_wa_slli_epi32(simdscalari a)
+{
+ return SIMD256::slli_epi32<i>(a);
+}
+
/// @brief 16-wide overload: forwards to SIMD512::slli_epi32 with the shift
///        count i supplied as a template argument.
/// @tparam i - shift count passed through as an immediate.
/// @param a - input integer vector.
/// @return result of SIMD512::slli_epi32<i>(a).
+template<int i>
+simd16scalari simd_wa_slli_epi32(simd16scalari a)
+{
+ return SIMD512::slli_epi32<i>(a);
+}
+
/// @brief 8-wide overload: forwards to SIMD256::srai_epi32 with the shift count
///        i supplied as a template argument.
/// @tparam i - shift count passed through as an immediate.
/// @param a - input integer vector.
/// @return result of SIMD256::srai_epi32<i>(a).
/// NOTE(review): templated callers use simd_wa_srai_epi32<K>(x) where the
/// disabled alternative spells SIMD_T::srai_epi32<K>(x). The latter is a
/// dependent name that standard C++ only parses as a template call when written
/// with the `template` disambiguator, which is presumably the compiler issue
/// this workaround sidesteps -- confirm.
+template<int i>
+simdscalari simd_wa_srai_epi32(simdscalari a)
+{
+ return SIMD256::srai_epi32<i>(a);
+}
+
/// @brief 16-wide overload: forwards to SIMD512::srai_epi32 with the shift
///        count i supplied as a template argument.
/// @tparam i - shift count passed through as an immediate.
/// @param a - input integer vector.
/// @return result of SIMD512::srai_epi32<i>(a).
+template<int i>
+simd16scalari simd_wa_srai_epi32(simd16scalari a)
+{
+ return SIMD512::srai_epi32<i>(a);
+}
+
+#endif
/// @brief 8-wide overload: transposes three SIMD source vectors into an array
///        of eight simd4scalar values by delegating to vTranspose3x8.
/// @param dst - output array of 8 simd4scalar values.
/// @param src0 - first source vector.
/// @param src1 - second source vector.
/// @param src2 - third source vector.
+INLINE
+void TransposeVertices(simd4scalar(&dst)[8], const simdscalar &src0, const simdscalar &src1, const simdscalar &src2)
+{
+ vTranspose3x8(dst, src0, src1, src2);
+}
+
/// @brief 16-wide overload: delegates to vTranspose4x16, passing a zero vector
///        as the fourth source because only three inputs are supplied.
/// @param dst - output array of 16 simd4scalar values.
/// @param src0 - first source vector.
/// @param src1 - second source vector.
/// @param src2 - third source vector.
/// NOTE(review): dst is reinterpreted as simd16scalar[4]; this assumes that
/// 16 simd4scalar values occupy exactly the same storage as 4 simd16scalar
/// values, with compatible alignment -- confirm.
+INLINE
+void TransposeVertices(simd4scalar(&dst)[16], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2)
+{
+ vTranspose4x16(reinterpret_cast<simd16scalar(&)[4]>(dst), src0, src1, src2, _simd16_setzero_ps());
+}
+
//////////////////////////////////////////////////////////////////////////
/// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
/// culling, viewport transform, etc.
/// @param primID - Primitive ID for each triangle.
/// @param viewportIdx - viewport array index for each triangle.
/// @tparam CT - ConservativeRastFETraits
-template <typename CT>
-void BinTriangles(
+template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT>
+void SIMDCALL BinTrianglesImpl(
DRAW_CONTEXT *pDC,
- PA_STATE& pa,
+ PA_STATE &pa,
uint32_t workerId,
- simdvector tri[3],
+ typename SIMD_T::Vec4 tri[3],
uint32_t triMask,
- simdscalari const &primID)
+ typename SIMD_T::Integer const &primID)
{
SWR_CONTEXT *pContext = pDC->pContext;
const API_STATE& state = GetApiState(pDC);
const SWR_RASTSTATE& rastState = state.rastState;
const SWR_FRONTEND_STATE& feState = state.frontendState;
+
MacroTileMgr *pTileMgr = pDC->pTileMgr;
- simdscalar vRecipW0 = _simd_set1_ps(1.0f);
- simdscalar vRecipW1 = _simd_set1_ps(1.0f);
- simdscalar vRecipW2 = _simd_set1_ps(1.0f);
+ typename SIMD_T::Float vRecipW0 = SIMD_T::set1_ps(1.0f);
+ typename SIMD_T::Float vRecipW1 = SIMD_T::set1_ps(1.0f);
+ typename SIMD_T::Float vRecipW2 = SIMD_T::set1_ps(1.0f);
+
+ typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0);
- // Read viewport array index if needed
- simdscalari viewportIdx = _simd_set1_epi32(0);
if (state.backendState.readViewportArrayIndex)
{
- simdvector vpiAttrib[3];
+ typename SIMD_T::Vec4 vpiAttrib[3];
pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
// OOB indices => forced to zero.
- simdscalari vpai = _simd_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
- vpai = _simd_max_epi32(_simd_setzero_si(), vpai);
- simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
- simdscalari vClearMask = _simd_cmplt_epi32(vpai, vNumViewports);
- viewportIdx = _simd_and_si(vClearMask, vpai);
+ typename SIMD_T::Integer vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+ vpai = SIMD_T::max_epi32(SIMD_T::setzero_si(), vpai);
+ typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+ typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports);
+ viewportIdx = SIMD_T::and_si(vClearMask, vpai);
}
if (feState.vpTransformDisable)
else
{
// Perspective divide
- vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
- vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
- vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);
+ vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[0].w);
+ vRecipW1 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[1].w);
+ vRecipW2 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[2].w);
- tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
- tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
- tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);
+ tri[0].v[0] = SIMD_T::mul_ps(tri[0].v[0], vRecipW0);
+ tri[1].v[0] = SIMD_T::mul_ps(tri[1].v[0], vRecipW1);
+ tri[2].v[0] = SIMD_T::mul_ps(tri[2].v[0], vRecipW2);
- tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
- tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
- tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);
+ tri[0].v[1] = SIMD_T::mul_ps(tri[0].v[1], vRecipW0);
+ tri[1].v[1] = SIMD_T::mul_ps(tri[1].v[1], vRecipW1);
+ tri[2].v[1] = SIMD_T::mul_ps(tri[2].v[1], vRecipW2);
- tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
- tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
- tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);
+ tri[0].v[2] = SIMD_T::mul_ps(tri[0].v[2], vRecipW0);
+ tri[1].v[2] = SIMD_T::mul_ps(tri[1].v[2], vRecipW1);
+ tri[2].v[2] = SIMD_T::mul_ps(tri[2].v[2], vRecipW2);
// Viewport transform to screen space coords
if (state.backendState.readViewportArrayIndex)
}
// Adjust for pixel center location
- simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
- tri[0].x = _simd_add_ps(tri[0].x, offset);
- tri[0].y = _simd_add_ps(tri[0].y, offset);
+ typename SIMD_T::Float offset = g_pixelOffsets<SIMD_T>[rastState.pixelLocation];
+
+ tri[0].x = SIMD_T::add_ps(tri[0].x, offset);
+ tri[0].y = SIMD_T::add_ps(tri[0].y, offset);
- tri[1].x = _simd_add_ps(tri[1].x, offset);
- tri[1].y = _simd_add_ps(tri[1].y, offset);
+ tri[1].x = SIMD_T::add_ps(tri[1].x, offset);
+ tri[1].y = SIMD_T::add_ps(tri[1].y, offset);
- tri[2].x = _simd_add_ps(tri[2].x, offset);
- tri[2].y = _simd_add_ps(tri[2].y, offset);
+ tri[2].x = SIMD_T::add_ps(tri[2].x, offset);
+ tri[2].y = SIMD_T::add_ps(tri[2].y, offset);
- simdscalari vXi[3], vYi[3];
// Set vXi, vYi to required fixed point precision
- FPToFixedPoint(tri, vXi, vYi);
+ typename SIMD_T::Integer vXi[3], vYi[3];
+ FPToFixedPoint<SIMD_T>(tri, vXi, vYi);
// triangle setup
- simdscalari vAi[3], vBi[3];
+ typename SIMD_T::Integer vAi[3], vBi[3];
triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
// determinant
- simdscalari vDet[2];
+ typename SIMD_T::Integer vDet[2];
calcDeterminantIntVertical(vAi, vBi, vDet);
// cull zero area
- int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
- int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));
+ uint32_t maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[0], SIMD_T::setzero_si())));
+ uint32_t maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[1], SIMD_T::setzero_si())));
- int cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));
+ uint32_t cullZeroAreaMask = maskLo | (maskHi << (SIMD_WIDTH / 2));
- uint32_t origTriMask = triMask;
// don't cull degenerate triangles if we're conservatively rasterizing
+ uint32_t origTriMask = triMask;
if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value)
{
triMask &= ~cullZeroAreaMask;
uint32_t frontWindingTris;
if (rastState.frontWinding == SWR_FRONTWINDING_CW)
{
- maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
- maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
+ maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si())));
+ maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si())));
}
else
{
- maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(_simd_setzero_si(), vDet[0])));
- maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(_simd_setzero_si(), vDet[1])));
+ maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[0])));
+ maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[1])));
}
- frontWindingTris = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));
+ frontWindingTris = maskLo | (maskHi << (SIMD_WIDTH / 2));
// cull
uint32_t cullTris;
uint32_t *pPrimID = (uint32_t *)&primID;
const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
DWORD triIndex = 0;
+
uint32_t edgeEnable;
PFN_WORK_FUNC pfnWork;
if (CT::IsConservativeT::value)
if (cullZeroAreaMask > 0)
{
// e0 = v1-v0
- simdscalari x0x1Mask = _simd_cmpeq_epi32(vXi[0], vXi[1]);
- simdscalari y0y1Mask = _simd_cmpeq_epi32(vYi[0], vYi[1]);
- uint32_t e0Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask, y0y1Mask)));
+ const typename SIMD_T::Integer x0x1Mask = SIMD_T::cmpeq_epi32(vXi[0], vXi[1]);
+ const typename SIMD_T::Integer y0y1Mask = SIMD_T::cmpeq_epi32(vYi[0], vYi[1]);
+
+ uint32_t e0Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x0x1Mask, y0y1Mask)));
// e1 = v2-v1
- simdscalari x1x2Mask = _simd_cmpeq_epi32(vXi[1], vXi[2]);
- simdscalari y1y2Mask = _simd_cmpeq_epi32(vYi[1], vYi[2]);
- uint32_t e1Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask, y1y2Mask)));
+ const typename SIMD_T::Integer x1x2Mask = SIMD_T::cmpeq_epi32(vXi[1], vXi[2]);
+ const typename SIMD_T::Integer y1y2Mask = SIMD_T::cmpeq_epi32(vYi[1], vYi[2]);
+
+ uint32_t e1Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x1x2Mask, y1y2Mask)));
// e2 = v0-v2
// if v0 == v1 & v1 == v2, v0 == v2
// edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
// 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
e0Mask = pdep_u32(e0Mask, 0x00249249);
+
// 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
e1Mask = pdep_u32(e1Mask, 0x00492492);
+
// 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
e2Mask = pdep_u32(e2Mask, 0x00924924);
else
{
// degenerate triangles won't be sent to rasterizer; just enable all edges
- pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
+ pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
(SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
}
- simdBBox bbox;
+ SIMDBBOX_T<SIMD_T> bbox;
if (!triMask)
{
}
// Calc bounding box of triangles
- calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
+ calcBoundingBoxIntVertical<SIMD_T, CT>(vXi, vYi, bbox);
// determine if triangle falls between pixel centers and discard
// only discard for non-MSAA case and when conservative rast is disabled
// (xmin + 127) & ~255
// (xmax + 128) & ~255
- if((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) &&
+ if ((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) &&
(!CT::IsConservativeT::value))
{
origTriMask = triMask;
int cullCenterMask;
+
{
- simdscalari xmin = _simd_add_epi32(bbox.xmin, _simd_set1_epi32(127));
- xmin = _simd_and_si(xmin, _simd_set1_epi32(~255));
- simdscalari xmax = _simd_add_epi32(bbox.xmax, _simd_set1_epi32(128));
- xmax = _simd_and_si(xmax, _simd_set1_epi32(~255));
+ typename SIMD_T::Integer xmin = SIMD_T::add_epi32(bbox.xmin, SIMD_T::set1_epi32(127));
+ xmin = SIMD_T::and_si(xmin, SIMD_T::set1_epi32(~255));
+ typename SIMD_T::Integer xmax = SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(128));
+ xmax = SIMD_T::and_si(xmax, SIMD_T::set1_epi32(~255));
- simdscalari vMaskH = _simd_cmpeq_epi32(xmin, xmax);
+ typename SIMD_T::Integer vMaskH = SIMD_T::cmpeq_epi32(xmin, xmax);
- simdscalari ymin = _simd_add_epi32(bbox.ymin, _simd_set1_epi32(127));
- ymin = _simd_and_si(ymin, _simd_set1_epi32(~255));
- simdscalari ymax = _simd_add_epi32(bbox.ymax, _simd_set1_epi32(128));
- ymax = _simd_and_si(ymax, _simd_set1_epi32(~255));
+ typename SIMD_T::Integer ymin = SIMD_T::add_epi32(bbox.ymin, SIMD_T::set1_epi32(127));
+ ymin = SIMD_T::and_si(ymin, SIMD_T::set1_epi32(~255));
+ typename SIMD_T::Integer ymax = SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(128));
+ ymax = SIMD_T::and_si(ymax, SIMD_T::set1_epi32(~255));
- simdscalari vMaskV = _simd_cmpeq_epi32(ymin, ymax);
- vMaskV = _simd_or_si(vMaskH, vMaskV);
- cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
+ typename SIMD_T::Integer vMaskV = SIMD_T::cmpeq_epi32(ymin, ymax);
+
+ vMaskV = SIMD_T::or_si(vMaskH, vMaskV);
+ cullCenterMask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vMaskV));
}
triMask &= ~cullCenterMask;
// Gather the AOS effective scissor rects based on the per-prim VP index.
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
{
- simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
+ typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
+
if (state.backendState.readViewportArrayIndex)
{
- GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
- scisXmin, scisYmin, scisXmax, scisYmax);
+ GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
- scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
- scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
- scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
- scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+ scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
+ scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
+ scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
+ scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
// Make triangle bbox inclusive
- bbox.xmax = _simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1));
- bbox.ymax = _simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1));
+ bbox.xmax = SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1));
+ bbox.ymax = SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1));
- bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
- bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
- bbox.xmax = _simd_min_epi32(bbox.xmax, scisXmax);
- bbox.ymax = _simd_min_epi32(bbox.ymax, scisYmax);
+ bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
+ bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
+ bbox.xmax = SIMD_T::min_epi32(bbox.xmax, scisXmax);
+ bbox.ymax = SIMD_T::min_epi32(bbox.ymax, scisYmax);
}
if (CT::IsConservativeT::value)
{
// in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
// some area. Bump the xmax/ymax edges out
- simdscalari topEqualsBottom = _simd_cmpeq_epi32(bbox.ymin, bbox.ymax);
- bbox.ymax = _simd_blendv_epi32(bbox.ymax, _simd_add_epi32(bbox.ymax, _simd_set1_epi32(1)), topEqualsBottom);
- simdscalari leftEqualsRight = _simd_cmpeq_epi32(bbox.xmin, bbox.xmax);
- bbox.xmax = _simd_blendv_epi32(bbox.xmax, _simd_add_epi32(bbox.xmax, _simd_set1_epi32(1)), leftEqualsRight);
+
+ typename SIMD_T::Integer topEqualsBottom = SIMD_T::cmpeq_epi32(bbox.ymin, bbox.ymax);
+ bbox.ymax = SIMD_T::blendv_epi32(bbox.ymax, SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), topEqualsBottom);
+
+ typename SIMD_T::Integer leftEqualsRight = SIMD_T::cmpeq_epi32(bbox.xmin, bbox.xmax);
+ bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), leftEqualsRight);
}
// Cull tris completely outside scissor
{
- simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
- simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
- simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
- uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
+ typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
+ typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
+ typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
+ uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
triMask = triMask & ~maskOutsideScissor;
}
endBinTriangles:
+
// Send surviving triangles to the line or point binner based on fill mode
if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
{
- // Simple non-conformant wireframe mode, useful for debugging.
- // Construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
- simdvector line[2];
- simdscalar recipW[2];
+ // Simple non-conformant wireframe mode, useful for debugging
+ // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
+ typename SIMD_T::Vec4 line[2];
+ typename SIMD_T::Float recipW[2];
+
line[0] = tri[0];
line[1] = tri[1];
recipW[0] = vRecipW0;
recipW[1] = vRecipW1;
- BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
+
+ BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
line[0] = tri[1];
line[1] = tri[2];
recipW[0] = vRecipW1;
recipW[1] = vRecipW2;
- BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
+
+ BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
line[0] = tri[2];
line[1] = tri[0];
recipW[0] = vRecipW2;
recipW[1] = vRecipW0;
- BinPostSetupLines(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
+
+ BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
AR_END(FEBinTriangles, 1);
return;
else if (rastState.fillMode == SWR_FILLMODE_POINT)
{
// Bin 3 points
- BinPostSetupPoints(pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx);
- BinPostSetupPoints(pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx);
- BinPostSetupPoints(pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx);
+ BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx);
+ BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx);
+ BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx);
+
+ AR_END(FEBinTriangles, 1);
return;
}
// Convert triangle bbox to macrotile units.
- bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
- bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
+#if SIMD_WA_SXXI_EPI32
+ bbox.xmin = simd_wa_srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
+ bbox.ymin = simd_wa_srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
+ bbox.xmax = simd_wa_srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
+ bbox.ymax = simd_wa_srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
+#else
+ bbox.xmin = SIMD_T::srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
+ bbox.ymin = SIMD_T::srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
+ bbox.xmax = SIMD_T::srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
+ bbox.ymax = SIMD_T::srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
+#endif
- OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
- _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
- _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
- _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
- _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
+ OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
+
+ SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft), bbox.xmin);
+ SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight), bbox.xmax);
+ SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop), bbox.ymin);
+ SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
// transpose verts needed for backend
/// @todo modify BE to take non-transformed verts
- simd4scalar vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
- vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
- vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
- vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
- vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);
+ simd4scalar vHorizX[SIMD_WIDTH];
+ simd4scalar vHorizY[SIMD_WIDTH];
+ simd4scalar vHorizZ[SIMD_WIDTH];
+ simd4scalar vHorizW[SIMD_WIDTH];
+
+ TransposeVertices(vHorizX, tri[0].x, tri[1].x, tri[2].x);
+ TransposeVertices(vHorizY, tri[0].y, tri[1].y, tri[2].y);
+ TransposeVertices(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
+ TransposeVertices(vHorizW, vRecipW0, vRecipW1, vRecipW2);
// store render target array index
- OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
+ OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
if (state.backendState.readRenderTargetArrayIndex)
{
- simdvector vRtai[3];
+ typename SIMD_T::Vec4 vRtai[3];
pa.Assemble(VERTEX_SGV_SLOT, vRtai);
- simdscalari vRtaii;
- vRtaii = _simd_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
- _simd_store_si((simdscalari*)aRTAI, vRtaii);
+ typename SIMD_T::Integer vRtaii;
+ vRtaii = SIMD_T::castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
+ SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), vRtaii);
}
else
{
- _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
+ SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), SIMD_T::setzero_si());
}
// scan remaining valid triangles and bin each separately
{
// only rasterize valid edges if we have a degenerate primitive
int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
- work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
+ work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
(SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(triEdgeEnable), (state.scissorsTileAligned == false));
// Degenerate triangles are required to be constant interpolated
// store triangle vertex data
desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
- SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
- SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
- SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
+ SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
+ SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
+ SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
// store user clip distances
}
}
}
+
triMask &= ~(1 << triIndex);
}
AR_END(FEBinTriangles, 1);
}
+template <typename CT>
+void BinTriangles(
+ DRAW_CONTEXT *pDC,
+ PA_STATE &pa,
+ uint32_t workerId,
+ simdvector tri[3],
+ uint32_t triMask,
+ simdscalari const &primID)
+{
+ BinTrianglesImpl<SIMD256, KNOB_SIMD_WIDTH, CT>(pDC, pa, workerId, tri, triMask, primID);
+}
+
#if USE_SIMD16_FRONTEND
template <typename CT>
void SIMDCALL BinTriangles_simd16(
DRAW_CONTEXT *pDC,
- PA_STATE& pa,
+ PA_STATE &pa,
uint32_t workerId,
simd16vector tri[3],
uint32_t triMask,
simd16scalari const &primID)
{
- SWR_CONTEXT *pContext = pDC->pContext;
-
- AR_BEGIN(FEBinTriangles, pDC->drawId);
-
- const API_STATE& state = GetApiState(pDC);
- const SWR_RASTSTATE& rastState = state.rastState;
- const SWR_FRONTEND_STATE& feState = state.frontendState;
-
- MacroTileMgr *pTileMgr = pDC->pTileMgr;
-
- simd16scalar vRecipW0 = _simd16_set1_ps(1.0f);
- simd16scalar vRecipW1 = _simd16_set1_ps(1.0f);
- simd16scalar vRecipW2 = _simd16_set1_ps(1.0f);
-
- simd16scalari viewportIdx = _simd16_set1_epi32(0);
- if (state.backendState.readViewportArrayIndex)
- {
- simd16vector vpiAttrib[3];
- pa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib);
+ BinTrianglesImpl<SIMD512, KNOB_SIMD16_WIDTH, CT>(pDC, pa, workerId, tri, triMask, primID);
+}
- // OOB indices => forced to zero.
- simd16scalari vpai = _simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
- vpai = _simd16_max_epi32(_simd16_setzero_si(), vpai);
- simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
- simd16scalari vClearMask = _simd16_cmplt_epi32(vpai, vNumViewports);
- viewportIdx = _simd16_and_si(vClearMask, vpai);
- }
+#endif
+struct FEBinTrianglesChooser
+{
+ typedef PFN_PROCESS_PRIMS FuncType;
- if (feState.vpTransformDisable)
+ template <typename... ArgsB>
+ static FuncType GetFunc()
{
- // RHW is passed in directly when VP transform is disabled
- vRecipW0 = tri[0].v[3];
- vRecipW1 = tri[1].v[3];
- vRecipW2 = tri[2].v[3];
+ return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
}
- else
- {
- // Perspective divide
- vRecipW0 = _simd16_div_ps(_simd16_set1_ps(1.0f), tri[0].w);
- vRecipW1 = _simd16_div_ps(_simd16_set1_ps(1.0f), tri[1].w);
- vRecipW2 = _simd16_div_ps(_simd16_set1_ps(1.0f), tri[2].w);
-
- tri[0].v[0] = _simd16_mul_ps(tri[0].v[0], vRecipW0);
- tri[1].v[0] = _simd16_mul_ps(tri[1].v[0], vRecipW1);
- tri[2].v[0] = _simd16_mul_ps(tri[2].v[0], vRecipW2);
+};
- tri[0].v[1] = _simd16_mul_ps(tri[0].v[1], vRecipW0);
- tri[1].v[1] = _simd16_mul_ps(tri[1].v[1], vRecipW1);
- tri[2].v[1] = _simd16_mul_ps(tri[2].v[1], vRecipW2);
+// Selector for correct templated BinTriangles function
+PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
+{
+ return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
+}
- tri[0].v[2] = _simd16_mul_ps(tri[0].v[2], vRecipW0);
- tri[1].v[2] = _simd16_mul_ps(tri[1].v[2], vRecipW1);
- tri[2].v[2] = _simd16_mul_ps(tri[2].v[2], vRecipW2);
+#if USE_SIMD16_FRONTEND
+struct FEBinTrianglesChooser_simd16
+{
+ typedef PFN_PROCESS_PRIMS_SIMD16 FuncType;
- // Viewport transform to screen space coords
- if (state.backendState.readViewportArrayIndex)
- {
- viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
- }
- else
- {
- viewportTransform<3>(tri, state.vpMatrices);
- }
+ template <typename... ArgsB>
+ static FuncType GetFunc()
+ {
+ return BinTriangles_simd16<ConservativeRastFETraits<ArgsB...>>;
}
+};
- // Adjust for pixel center location
- const simd16scalar offset = g_pixelOffsets_simd16[rastState.pixelLocation];
-
- tri[0].x = _simd16_add_ps(tri[0].x, offset);
- tri[0].y = _simd16_add_ps(tri[0].y, offset);
-
- tri[1].x = _simd16_add_ps(tri[1].x, offset);
- tri[1].y = _simd16_add_ps(tri[1].y, offset);
+// Selector for correct templated BinTriangles function
+PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative)
+{
+ return TemplateArgUnroller<FEBinTrianglesChooser_simd16>::GetFunc(IsConservative);
+}
- tri[2].x = _simd16_add_ps(tri[2].x, offset);
- tri[2].y = _simd16_add_ps(tri[2].y, offset);
+#endif
- simd16scalari vXi[3], vYi[3];
+template <typename SIMD_T, uint32_t SIMD_WIDTH>
+void BinPostSetupPointsImpl(
+ DRAW_CONTEXT *pDC,
+ PA_STATE &pa,
+ uint32_t workerId,
+ typename SIMD_T::Vec4 prim[],
+ uint32_t primMask,
+ typename SIMD_T::Integer const &primID,
+ typename SIMD_T::Integer const &viewportIdx)
+{
+ SWR_CONTEXT *pContext = pDC->pContext;
- // Set vXi, vYi to required fixed point precision
- FPToFixedPoint(tri, vXi, vYi);
+ AR_BEGIN(FEBinPoints, pDC->drawId);
- // triangle setup
- simd16scalari vAi[3], vBi[3];
- triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
+ typename SIMD_T::Vec4 &primVerts = prim[0];
- // determinant
- simd16scalari vDet[2];
- calcDeterminantIntVertical(vAi, vBi, vDet);
+ const API_STATE& state = GetApiState(pDC);
+ const SWR_RASTSTATE& rastState = state.rastState;
+ const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
- // cull zero area
- uint32_t maskLo = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpeq_epi64(vDet[0], _simd16_setzero_si())));
- uint32_t maskHi = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpeq_epi64(vDet[1], _simd16_setzero_si())));
+ // Select attribute processor
+ PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
+ state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
- uint32_t cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD16_WIDTH / 2));
+ // convert to fixed point
+ typename SIMD_T::Integer vXi, vYi;
- // don't cull degenerate triangles if we're conservatively rasterizing
- uint32_t origTriMask = triMask;
- if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value)
- {
- triMask &= ~cullZeroAreaMask;
- }
+ vXi = fpToFixedPointVertical<SIMD_T>(primVerts.x);
+ vYi = fpToFixedPointVertical<SIMD_T>(primVerts.y);
- // determine front winding tris
- // CW +det
- // CCW det < 0;
- // 0 area triangles are marked as backfacing regardless of winding order,
- // which is required behavior for conservative rast and wireframe rendering
- uint32_t frontWindingTris;
- if (rastState.frontWinding == SWR_FRONTWINDING_CW)
- {
- maskLo = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(vDet[0], _simd16_setzero_si())));
- maskHi = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(vDet[1], _simd16_setzero_si())));
- }
- else
+ if (CanUseSimplePoints(pDC))
{
- maskLo = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(_simd16_setzero_si(), vDet[0])));
- maskHi = _simd16_movemask_pd(_simd16_castsi_pd(_simd16_cmpgt_epi64(_simd16_setzero_si(), vDet[1])));
- }
- frontWindingTris = maskLo | (maskHi << (KNOB_SIMD16_WIDTH / 2));
+ // adjust for ymin-xmin rule
+ vXi = SIMD_T::sub_epi32(vXi, SIMD_T::set1_epi32(1));
+ vYi = SIMD_T::sub_epi32(vYi, SIMD_T::set1_epi32(1));
- // cull
- uint32_t cullTris;
- switch ((SWR_CULLMODE)rastState.cullMode)
- {
- case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break;
- case SWR_CULLMODE_NONE: cullTris = 0x0; break;
- case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
- // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
- case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break;
- default: SWR_INVALID("Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
- }
+ // cull points off the ymin-xmin edge of the viewport
+ primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vXi));
+ primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vYi));
- triMask &= ~cullTris;
+ // compute macro tile coordinates
+#if SIMD_WA_SXXI_EPI32
+ typename SIMD_T::Integer macroX = simd_wa_srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(vXi);
+ typename SIMD_T::Integer macroY = simd_wa_srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(vYi);
+#else
+ typename SIMD_T::Integer macroX = SIMD_T::srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(vXi);
+ typename SIMD_T::Integer macroY = SIMD_T::srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(vYi);
+#endif
- if (origTriMask ^ triMask)
- {
- RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
- }
+ OSALIGNSIMD16(uint32_t) aMacroX[SIMD_WIDTH], aMacroY[SIMD_WIDTH];
- /// Note: these variable initializations must stay above any 'goto endBenTriangles'
- // compute per tri backface
- uint32_t frontFaceMask = frontWindingTris;
- uint32_t *pPrimID = (uint32_t *)&primID;
- const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
- DWORD triIndex = 0;
+ SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMacroX), macroX);
+ SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMacroY), macroY);
- uint32_t edgeEnable;
- PFN_WORK_FUNC pfnWork;
- if (CT::IsConservativeT::value)
- {
- // determine which edges of the degenerate tri, if any, are valid to rasterize.
- // used to call the appropriate templated rasterizer function
- if (cullZeroAreaMask > 0)
- {
- // e0 = v1-v0
- const simd16scalari x0x1Mask = _simd16_cmpeq_epi32(vXi[0], vXi[1]);
- const simd16scalari y0y1Mask = _simd16_cmpeq_epi32(vYi[0], vYi[1]);
+ // compute raster tile coordinates
+#if SIMD_WA_SXXI_EPI32
+ typename SIMD_T::Integer rasterX = simd_wa_srai_epi32<KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT>(vXi);
+ typename SIMD_T::Integer rasterY = simd_wa_srai_epi32<KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT>(vYi);
+#else
+ typename SIMD_T::Integer rasterX = SIMD_T::srai_epi32<KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT>(vXi);
+ typename SIMD_T::Integer rasterY = SIMD_T::srai_epi32<KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT>(vYi);
+#endif
- uint32_t e0Mask = _simd16_movemask_ps(_simd16_castsi_ps(_simd16_and_si(x0x1Mask, y0y1Mask)));
+ // compute raster tile relative x,y for coverage mask
+#if SIMD_WA_SXXI_EPI32
+ typename SIMD_T::Integer tileAlignedX = simd_wa_slli_epi32<KNOB_TILE_X_DIM_SHIFT>(rasterX);
+ typename SIMD_T::Integer tileAlignedY = simd_wa_slli_epi32<KNOB_TILE_Y_DIM_SHIFT>(rasterY);
+#else
+ typename SIMD_T::Integer tileAlignedX = SIMD_T::slli_epi32<KNOB_TILE_X_DIM_SHIFT>(rasterX);
+ typename SIMD_T::Integer tileAlignedY = SIMD_T::slli_epi32<KNOB_TILE_Y_DIM_SHIFT>(rasterY);
+#endif
- // e1 = v2-v1
- const simd16scalari x1x2Mask = _simd16_cmpeq_epi32(vXi[1], vXi[2]);
- const simd16scalari y1y2Mask = _simd16_cmpeq_epi32(vYi[1], vYi[2]);
+#if SIMD_WA_SXXI_EPI32
+ typename SIMD_T::Integer tileRelativeX = SIMD_T::sub_epi32(simd_wa_srai_epi32<FIXED_POINT_SHIFT>(vXi), tileAlignedX);
+ typename SIMD_T::Integer tileRelativeY = SIMD_T::sub_epi32(simd_wa_srai_epi32<FIXED_POINT_SHIFT>(vYi), tileAlignedY);
+#else
+ typename SIMD_T::Integer tileRelativeX = SIMD_T::sub_epi32(SIMD_T::srai_epi32<FIXED_POINT_SHIFT>(vXi), tileAlignedX);
+ typename SIMD_T::Integer tileRelativeY = SIMD_T::sub_epi32(SIMD_T::srai_epi32<FIXED_POINT_SHIFT>(vYi), tileAlignedY);
+#endif
- uint32_t e1Mask = _simd16_movemask_ps(_simd16_castsi_ps(_simd16_and_si(x1x2Mask, y1y2Mask)));
+ OSALIGNSIMD16(uint32_t) aTileRelativeX[SIMD_WIDTH];
+ OSALIGNSIMD16(uint32_t) aTileRelativeY[SIMD_WIDTH];
- // e2 = v0-v2
- // if v0 == v1 & v1 == v2, v0 == v2
- uint32_t e2Mask = e0Mask & e1Mask;
- SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
+ SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileRelativeX), tileRelativeX);
+ SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileRelativeY), tileRelativeY);
- // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
- // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
- e0Mask = pdep_u32(e0Mask, 0x00249249);
+ OSALIGNSIMD16(uint32_t) aTileAlignedX[SIMD_WIDTH];
+ OSALIGNSIMD16(uint32_t) aTileAlignedY[SIMD_WIDTH];
- // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
- e1Mask = pdep_u32(e1Mask, 0x00492492);
+ SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileAlignedX), tileAlignedX);
+ SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileAlignedY), tileAlignedY);
- // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
- e2Mask = pdep_u32(e2Mask, 0x00924924);
+ OSALIGNSIMD16(float) aZ[SIMD_WIDTH];
+ SIMD_T::store_ps(reinterpret_cast<float *>(aZ), primVerts.z);
- edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
+ // store render target array index
+ OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
+ if (state.backendState.readRenderTargetArrayIndex)
+ {
+ typename SIMD_T::Vec4 vRtai;
+ pa.Assemble(VERTEX_SGV_SLOT, &vRtai);
+ typename SIMD_T::Integer vRtaii = SIMD_T::castps_si(vRtai[VERTEX_SGV_RTAI_COMP]);
+ SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), vRtaii);
}
else
{
- edgeEnable = 0x00FFFFFF;
- }
- }
- else
- {
- // degenerate triangles won't be sent to rasterizer; just enable all edges
- pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
- (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
- }
-
- simd16BBox bbox;
-
- if (!triMask)
- {
- goto endBinTriangles;
- }
-
- // Calc bounding box of triangles
- calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
-
- // determine if triangle falls between pixel centers and discard
- // only discard for non-MSAA case and when conservative rast is disabled
- // (xmin + 127) & ~255
- // (xmax + 128) & ~255
- if ((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) &&
- (!CT::IsConservativeT::value))
- {
- origTriMask = triMask;
-
- int cullCenterMask;
-
- {
- simd16scalari xmin = _simd16_add_epi32(bbox.xmin, _simd16_set1_epi32(127));
- xmin = _simd16_and_si(xmin, _simd16_set1_epi32(~255));
- simd16scalari xmax = _simd16_add_epi32(bbox.xmax, _simd16_set1_epi32(128));
- xmax = _simd16_and_si(xmax, _simd16_set1_epi32(~255));
-
- simd16scalari vMaskH = _simd16_cmpeq_epi32(xmin, xmax);
-
- simd16scalari ymin = _simd16_add_epi32(bbox.ymin, _simd16_set1_epi32(127));
- ymin = _simd16_and_si(ymin, _simd16_set1_epi32(~255));
- simd16scalari ymax = _simd16_add_epi32(bbox.ymax, _simd16_set1_epi32(128));
- ymax = _simd16_and_si(ymax, _simd16_set1_epi32(~255));
-
- simd16scalari vMaskV = _simd16_cmpeq_epi32(ymin, ymax);
-
- vMaskV = _simd16_or_si(vMaskH, vMaskV);
- cullCenterMask = _simd16_movemask_ps(_simd16_castsi_ps(vMaskV));
- }
-
- triMask &= ~cullCenterMask;
-
- if (origTriMask ^ triMask)
- {
- RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
- }
- }
-
- // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
- // Gather the AOS effective scissor rects based on the per-prim VP index.
- /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
- {
- simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
-
- if (state.backendState.readViewportArrayIndex)
- {
- GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
- scisXmin, scisYmin, scisXmax, scisYmax);
- }
- else // broadcast fast path for non-VPAI case.
- {
- scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
- scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
- scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
- scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
- }
-
- // Make triangle bbox inclusive
- bbox.xmax = _simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1));
- bbox.ymax = _simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1));
-
- bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
- bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
- bbox.xmax = _simd16_min_epi32(bbox.xmax, scisXmax);
- bbox.ymax = _simd16_min_epi32(bbox.ymax, scisYmax);
- }
-
- if (CT::IsConservativeT::value)
- {
- // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
- // some area. Bump the xmax/ymax edges out
- simd16scalari topEqualsBottom = _simd16_cmpeq_epi32(bbox.ymin, bbox.ymax);
- bbox.ymax = _simd16_blendv_epi32(bbox.ymax, _simd16_add_epi32(bbox.ymax, _simd16_set1_epi32(1)), topEqualsBottom);
- simd16scalari leftEqualsRight = _simd16_cmpeq_epi32(bbox.xmin, bbox.xmax);
- bbox.xmax = _simd16_blendv_epi32(bbox.xmax, _simd16_add_epi32(bbox.xmax, _simd16_set1_epi32(1)), leftEqualsRight);
- }
-
- // Cull tris completely outside scissor
- {
- simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax);
- simd16scalari maskOutsideScissorY = _simd16_cmpgt_epi32(bbox.ymin, bbox.ymax);
- simd16scalari maskOutsideScissorXY = _simd16_or_si(maskOutsideScissorX, maskOutsideScissorY);
- uint32_t maskOutsideScissor = _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY));
- triMask = triMask & ~maskOutsideScissor;
- }
-
-endBinTriangles:
-
- // Send surviving triangles to the line or point binner based on fill mode
- if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
- {
- // Simple non-conformant wireframe mode, useful for debugging
- // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
- simd16vector line[2];
- simd16scalar recipW[2];
- line[0] = tri[0];
- line[1] = tri[1];
- recipW[0] = vRecipW0;
- recipW[1] = vRecipW1;
- BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
-
- line[0] = tri[1];
- line[1] = tri[2];
- recipW[0] = vRecipW1;
- recipW[1] = vRecipW2;
- BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
-
- line[0] = tri[2];
- line[1] = tri[0];
- recipW[0] = vRecipW2;
- recipW[1] = vRecipW0;
- BinPostSetupLines_simd16(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
-
- AR_END(FEBinTriangles, 1);
- return;
- }
- else if (rastState.fillMode == SWR_FILLMODE_POINT)
- {
- // Bin 3 points
- BinPostSetupPoints_simd16(pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx);
- BinPostSetupPoints_simd16(pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx);
- BinPostSetupPoints_simd16(pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx);
- return;
- }
-
- // Convert triangle bbox to macrotile units.
- bbox.xmin = _simd16_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- bbox.ymin = _simd16_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
- bbox.xmax = _simd16_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- bbox.ymax = _simd16_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
-
- OSALIGNSIMD16(uint32_t) aMTLeft[KNOB_SIMD16_WIDTH], aMTRight[KNOB_SIMD16_WIDTH], aMTTop[KNOB_SIMD16_WIDTH], aMTBottom[KNOB_SIMD16_WIDTH];
-
- _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTLeft), bbox.xmin);
- _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTRight), bbox.xmax);
- _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTTop), bbox.ymin);
- _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTBottom), bbox.ymax);
-
- // transpose verts needed for backend
- /// @todo modify BE to take non-transformed verts
- simd4scalar vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
- simd4scalar vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
- simd4scalar vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
- simd4scalar vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
-
- vTranspose3x8(vHorizX[0], _simd16_extract_ps(tri[0].x, 0), _simd16_extract_ps(tri[1].x, 0), _simd16_extract_ps(tri[2].x, 0));
- vTranspose3x8(vHorizY[0], _simd16_extract_ps(tri[0].y, 0), _simd16_extract_ps(tri[1].y, 0), _simd16_extract_ps(tri[2].y, 0));
- vTranspose3x8(vHorizZ[0], _simd16_extract_ps(tri[0].z, 0), _simd16_extract_ps(tri[1].z, 0), _simd16_extract_ps(tri[2].z, 0));
- vTranspose3x8(vHorizW[0], _simd16_extract_ps(vRecipW0, 0), _simd16_extract_ps(vRecipW1, 0), _simd16_extract_ps(vRecipW2, 0));
-
- vTranspose3x8(vHorizX[1], _simd16_extract_ps(tri[0].x, 1), _simd16_extract_ps(tri[1].x, 1), _simd16_extract_ps(tri[2].x, 1));
- vTranspose3x8(vHorizY[1], _simd16_extract_ps(tri[0].y, 1), _simd16_extract_ps(tri[1].y, 1), _simd16_extract_ps(tri[2].y, 1));
- vTranspose3x8(vHorizZ[1], _simd16_extract_ps(tri[0].z, 1), _simd16_extract_ps(tri[1].z, 1), _simd16_extract_ps(tri[2].z, 1));
- vTranspose3x8(vHorizW[1], _simd16_extract_ps(vRecipW0, 1), _simd16_extract_ps(vRecipW1, 1), _simd16_extract_ps(vRecipW2, 1));
-
- // store render target array index
- OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH];
- if (state.backendState.readRenderTargetArrayIndex)
- {
- simd16vector vRtai[3];
- pa.Assemble_simd16(VERTEX_SGV_SLOT, vRtai);
- simd16scalari vRtaii;
- vRtaii = _simd16_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
- _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
- }
- else
- {
- _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), _simd16_setzero_si());
- }
-
-
- // scan remaining valid triangles and bin each separately
- while (_BitScanForward(&triIndex, triMask))
- {
- uint32_t linkageCount = state.backendState.numAttributes;
- uint32_t numScalarAttribs = linkageCount * 4;
-
- BE_WORK work;
- work.type = DRAW;
-
- bool isDegenerate;
- if (CT::IsConservativeT::value)
- {
- // only rasterize valid edges if we have a degenerate primitive
- int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
- work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
- (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(triEdgeEnable), (state.scissorsTileAligned == false));
-
- // Degenerate triangles are required to be constant interpolated
- isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
- }
- else
- {
- isDegenerate = false;
- work.pfnWork = pfnWork;
- }
-
- // Select attribute processor
- PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
- state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
-
- TRIANGLE_WORK_DESC &desc = work.desc.tri;
-
- desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
- desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
- desc.triFlags.viewportIndex = pViewportIndex[triIndex];
-
- auto pArena = pDC->pArena;
- SWR_ASSERT(pArena != nullptr);
-
- // store active attribs
- float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
- desc.pAttribs = pAttribs;
- desc.numAttribs = linkageCount;
- pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
-
- // store triangle vertex data
- desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
-
- {
- const uint32_t i = triIndex >> 3; // triIndex / KNOB_SIMD_WIDTH
- const uint32_t j = triIndex & 7; // triIndex % KNOB_SIMD_WIDTH
-
- _mm_store_ps(&desc.pTriBuffer[ 0], vHorizX[i][j]);
- _mm_store_ps(&desc.pTriBuffer[ 4], vHorizY[i][j]);
- _mm_store_ps(&desc.pTriBuffer[ 8], vHorizZ[i][j]);
- _mm_store_ps(&desc.pTriBuffer[12], vHorizW[i][j]);
- }
-
- // store user clip distances
- if (rastState.clipDistanceMask)
- {
- uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
- desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
- ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer);
- }
-
- for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
- {
- for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
- {
-#if KNOB_ENABLE_TOSS_POINTS
- if (!KNOB_TOSS_SETUP_TRIS)
-#endif
- {
- pTileMgr->enqueue(x, y, &work);
- }
- }
- }
-
- triMask &= ~(1 << triIndex);
- }
-
- AR_END(FEBinTriangles, 1);
-}
-
-#endif
-struct FEBinTrianglesChooser
-{
- typedef PFN_PROCESS_PRIMS FuncType;
-
- template <typename... ArgsB>
- static FuncType GetFunc()
- {
- return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
- }
-};
-
-// Selector for correct templated BinTrinagles function
-PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
-{
- return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
-}
-
-#if USE_SIMD16_FRONTEND
-struct FEBinTrianglesChooser_simd16
-{
- typedef PFN_PROCESS_PRIMS_SIMD16 FuncType;
-
- template <typename... ArgsB>
- static FuncType GetFunc()
- {
- return BinTriangles_simd16<ConservativeRastFETraits<ArgsB...>>;
- }
-};
-
-// Selector for correct templated BinTrinagles function
-PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative)
-{
- return TemplateArgUnroller<FEBinTrianglesChooser_simd16>::GetFunc(IsConservative);
-}
-
-#endif
-
-void BinPostSetupPoints(
- DRAW_CONTEXT *pDC,
- PA_STATE& pa,
- uint32_t workerId,
- simdvector prim[],
- uint32_t primMask,
- simdscalari const &primID,
- simdscalari const &viewportIdx)
-{
- SWR_CONTEXT *pContext = pDC->pContext;
-
- AR_BEGIN(FEBinPoints, pDC->drawId);
-
- simdvector& primVerts = prim[0];
-
- const API_STATE& state = GetApiState(pDC);
- const SWR_RASTSTATE& rastState = state.rastState;
- const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
-
- // Select attribute processor
- PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
- state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
-
- // convert to fixed point
- simdscalari vXi, vYi;
- vXi = fpToFixedPointVertical(primVerts.x);
- vYi = fpToFixedPointVertical(primVerts.y);
-
- if (CanUseSimplePoints(pDC))
- {
- // adjust for ymin-xmin rule
- vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
- vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));
-
- // cull points off the ymin-xmin edge of the viewport
- primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi));
- primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi));
-
- // compute macro tile coordinates
- simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
-
- OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH];
- _simd_store_si((simdscalari*)aMacroX, macroX);
- _simd_store_si((simdscalari*)aMacroY, macroY);
-
- // compute raster tile coordinates
- simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
- simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
-
- // compute raster tile relative x,y for coverage mask
- simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
- simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);
-
- simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
- simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);
-
- OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH];
- OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH];
- _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX);
- _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY);
-
- OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH];
- OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH];
- _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX);
- _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY);
-
- OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH];
- _simd_store_ps((float*)aZ, primVerts.z);
-
- // store render target array index
- OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
- if (state.backendState.readRenderTargetArrayIndex)
- {
- simdvector vRtai;
- pa.Assemble(VERTEX_SGV_SLOT, &vRtai);
- simdscalari vRtaii = _simd_castps_si(vRtai[VERTEX_SGV_RTAI_COMP]);
- _simd_store_si((simdscalari*)aRTAI, vRtaii);
- }
- else
- {
- _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
- }
-
- uint32_t *pPrimID = (uint32_t *)&primID;
- DWORD primIndex = 0;
-
- const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
-
- // scan remaining valid triangles and bin each separately
- while (_BitScanForward(&primIndex, primMask))
- {
- uint32_t linkageCount = backendState.numAttributes;
- uint32_t numScalarAttribs = linkageCount * 4;
-
- BE_WORK work;
- work.type = DRAW;
-
- TRIANGLE_WORK_DESC &desc = work.desc.tri;
-
- // points are always front facing
- desc.triFlags.frontFacing = 1;
- desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
- desc.triFlags.viewportIndex = pViewportIndex[primIndex];
-
- work.pfnWork = RasterizeSimplePoint;
-
- auto pArena = pDC->pArena;
- SWR_ASSERT(pArena != nullptr);
-
- // store attributes
- float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
- desc.pAttribs = pAttribs;
- desc.numAttribs = linkageCount;
-
- pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
-
- // store raster tile aligned x, y, perspective correct z
- float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
- desc.pTriBuffer = pTriBuffer;
- *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
- *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
- *pTriBuffer = aZ[primIndex];
-
- uint32_t tX = aTileRelativeX[primIndex];
- uint32_t tY = aTileRelativeY[primIndex];
-
- // pack the relative x,y into the coverageMask, the rasterizer will
- // generate the true coverage mask from it
- work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
-
- // bin it
- MacroTileMgr *pTileMgr = pDC->pTileMgr;
-#if KNOB_ENABLE_TOSS_POINTS
- if (!KNOB_TOSS_SETUP_TRIS)
-#endif
- {
- pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
- }
- primMask &= ~(1 << primIndex);
- }
- }
- else
- {
- // non simple points need to be potentially binned to multiple macro tiles
- simdscalar vPointSize;
- if (rastState.pointParam)
- {
- simdvector size[3];
- pa.Assemble(VERTEX_SGV_SLOT, size);
- vPointSize = size[0][VERTEX_SGV_POINT_SIZE_COMP];
- }
- else
- {
- vPointSize = _simd_set1_ps(rastState.pointSize);
- }
-
- // bloat point to bbox
- simdBBox bbox;
- bbox.xmin = bbox.xmax = vXi;
- bbox.ymin = bbox.ymax = vYi;
-
- simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f));
- simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
- bbox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
- bbox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
- bbox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
- bbox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);
-
- // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
- // Gather the AOS effective scissor rects based on the per-prim VP index.
- /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
- {
- simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
- if (state.backendState.readViewportArrayIndex)
- {
- GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
- scisXmin, scisYmin, scisXmax, scisYmax);
- }
- else // broadcast fast path for non-VPAI case.
- {
- scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
- scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
- scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
- scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
- }
-
- bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
- bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
- bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
- bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
- }
-
- // Cull bloated points completely outside scissor
- simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
- simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
- simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
- uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
- primMask = primMask & ~maskOutsideScissor;
-
- // Convert bbox to macrotile units.
- bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
- bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
-
- OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
- _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
- _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
- _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
- _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
-
- // store render target array index
- OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
- if (state.backendState.readRenderTargetArrayIndex)
- {
- simdvector vRtai[2];
- pa.Assemble(VERTEX_SGV_SLOT, vRtai);
- simdscalari vRtaii = _simd_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
- _simd_store_si((simdscalari*)aRTAI, vRtaii);
- }
- else
- {
- _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
- }
-
- OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH];
- _simd_store_ps((float*)aPointSize, vPointSize);
-
- uint32_t *pPrimID = (uint32_t *)&primID;
-
- OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH];
- OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH];
- OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH];
-
- _simd_store_ps((float*)aPrimVertsX, primVerts.x);
- _simd_store_ps((float*)aPrimVertsY, primVerts.y);
- _simd_store_ps((float*)aPrimVertsZ, primVerts.z);
-
- // scan remaining valid prims and bin each separately
- const SWR_BACKEND_STATE& backendState = state.backendState;
- DWORD primIndex;
- while (_BitScanForward(&primIndex, primMask))
- {
- uint32_t linkageCount = backendState.numAttributes;
- uint32_t numScalarAttribs = linkageCount * 4;
-
- BE_WORK work;
- work.type = DRAW;
-
- TRIANGLE_WORK_DESC &desc = work.desc.tri;
-
- desc.triFlags.frontFacing = 1;
- desc.triFlags.pointSize = aPointSize[primIndex];
- desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
- desc.triFlags.viewportIndex = pViewportIndex[primIndex];
-
- work.pfnWork = RasterizeTriPoint;
-
- auto pArena = pDC->pArena;
- SWR_ASSERT(pArena != nullptr);
-
- // store active attribs
- desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
- desc.numAttribs = linkageCount;
- pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
-
- // store point vertex data
- float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
- desc.pTriBuffer = pTriBuffer;
- *pTriBuffer++ = aPrimVertsX[primIndex];
- *pTriBuffer++ = aPrimVertsY[primIndex];
- *pTriBuffer = aPrimVertsZ[primIndex];
-
- // store user clip distances
- if (rastState.clipDistanceMask)
- {
- uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
- desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
- float dists[8];
- float one = 1.0f;
- ProcessUserClipDist<1>(pa, primIndex, rastState.clipDistanceMask, &one, dists);
- for (uint32_t i = 0; i < numClipDist; i++) {
- desc.pUserClipBuffer[3*i + 0] = 0.0f;
- desc.pUserClipBuffer[3*i + 1] = 0.0f;
- desc.pUserClipBuffer[3*i + 2] = dists[i];
- }
- }
-
- MacroTileMgr *pTileMgr = pDC->pTileMgr;
- for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
- {
- for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
- {
-#if KNOB_ENABLE_TOSS_POINTS
- if (!KNOB_TOSS_SETUP_TRIS)
-#endif
- {
- pTileMgr->enqueue(x, y, &work);
- }
- }
- }
-
- primMask &= ~(1 << primIndex);
- }
- }
-
- AR_END(FEBinPoints, 1);
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Bin SIMD points to the backend. Only supports point size of 1
-/// @param pDC - pointer to draw context.
-/// @param pa - The primitive assembly object.
-/// @param workerId - thread's worker id. Even thread has a unique id.
-/// @param tri - Contains point position data for SIMDs worth of points.
-/// @param primID - Primitive ID for each point.
-void BinPoints(
- DRAW_CONTEXT *pDC,
- PA_STATE& pa,
- uint32_t workerId,
- simdvector prim[3],
- uint32_t primMask,
- simdscalari const &primID)
-{
- simdvector& primVerts = prim[0];
-
- const API_STATE& state = GetApiState(pDC);
- const SWR_FRONTEND_STATE& feState = state.frontendState;
- const SWR_RASTSTATE& rastState = state.rastState;
-
- // Read back viewport index if required
- simdscalari viewportIdx = _simd_set1_epi32(0);
- if (state.backendState.readViewportArrayIndex)
- {
- simdvector vpiAttrib[1];
- pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
- simdscalari vpai = _simd_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
-
- // OOB indices => forced to zero.
- vpai = _simd_max_epi32(_simd_setzero_si(), vpai);
- simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
- simdscalari vClearMask = _simd_cmplt_epi32(vpai, vNumViewports);
- viewportIdx = _simd_and_si(vClearMask, vpai);
- }
-
- if (!feState.vpTransformDisable)
- {
- // perspective divide
- simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
- primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
- primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
- primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);
-
- // viewport transform to screen coords
- if (state.backendState.readViewportArrayIndex)
- {
- viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
- }
- else
- {
- viewportTransform<1>(&primVerts, state.vpMatrices);
- }
- }
-
- // adjust for pixel center location
- simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
- primVerts.x = _simd_add_ps(primVerts.x, offset);
- primVerts.y = _simd_add_ps(primVerts.y, offset);
-
- BinPostSetupPoints(
- pDC,
- pa,
- workerId,
- prim,
- primMask,
- primID,
- viewportIdx);
-}
-
-#if USE_SIMD16_FRONTEND
-void BinPostSetupPoints_simd16(
- DRAW_CONTEXT *pDC,
- PA_STATE& pa,
- uint32_t workerId,
- simd16vector prim[],
- uint32_t primMask,
- simd16scalari const &primID,
- simd16scalari const &viewportIdx)
-{
- SWR_CONTEXT *pContext = pDC->pContext;
-
- AR_BEGIN(FEBinPoints, pDC->drawId);
-
- simd16vector& primVerts = prim[0];
-
- const API_STATE& state = GetApiState(pDC);
- const SWR_RASTSTATE& rastState = state.rastState;
- const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
-
- // Select attribute processor
- PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
- state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
-
- // convert to fixed point
- simd16scalari vXi, vYi;
-
- vXi = fpToFixedPointVertical(primVerts.x);
- vYi = fpToFixedPointVertical(primVerts.y);
-
- if (CanUseSimplePoints(pDC))
- {
- // adjust for ymin-xmin rule
- vXi = _simd16_sub_epi32(vXi, _simd16_set1_epi32(1));
- vYi = _simd16_sub_epi32(vYi, _simd16_set1_epi32(1));
-
- // cull points off the ymin-xmin edge of the viewport
- primMask &= ~_simd16_movemask_ps(_simd16_castsi_ps(vXi));
- primMask &= ~_simd16_movemask_ps(_simd16_castsi_ps(vYi));
-
- // compute macro tile coordinates
- simd16scalari macroX = _simd16_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- simd16scalari macroY = _simd16_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
-
- OSALIGNSIMD16(uint32_t) aMacroX[KNOB_SIMD16_WIDTH], aMacroY[KNOB_SIMD16_WIDTH];
-
- _simd16_store_si(reinterpret_cast<simd16scalari *>(aMacroX), macroX);
- _simd16_store_si(reinterpret_cast<simd16scalari *>(aMacroY), macroY);
-
- // compute raster tile coordinates
- simd16scalari rasterX = _simd16_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
- simd16scalari rasterY = _simd16_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
-
- // compute raster tile relative x,y for coverage mask
- simd16scalari tileAlignedX = _simd16_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
- simd16scalari tileAlignedY = _simd16_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);
-
- simd16scalari tileRelativeX = _simd16_sub_epi32(_simd16_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
- simd16scalari tileRelativeY = _simd16_sub_epi32(_simd16_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);
-
- OSALIGNSIMD16(uint32_t) aTileRelativeX[KNOB_SIMD16_WIDTH];
- OSALIGNSIMD16(uint32_t) aTileRelativeY[KNOB_SIMD16_WIDTH];
-
- _simd16_store_si(reinterpret_cast<simd16scalari *>(aTileRelativeX), tileRelativeX);
- _simd16_store_si(reinterpret_cast<simd16scalari *>(aTileRelativeY), tileRelativeY);
-
- OSALIGNSIMD16(uint32_t) aTileAlignedX[KNOB_SIMD16_WIDTH];
- OSALIGNSIMD16(uint32_t) aTileAlignedY[KNOB_SIMD16_WIDTH];
-
- _simd16_store_si(reinterpret_cast<simd16scalari *>(aTileAlignedX), tileAlignedX);
- _simd16_store_si(reinterpret_cast<simd16scalari *>(aTileAlignedY), tileAlignedY);
-
- OSALIGNSIMD16(float) aZ[KNOB_SIMD16_WIDTH];
- _simd16_store_ps(reinterpret_cast<float *>(aZ), primVerts.z);
-
- // store render target array index
- OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH];
- if (state.backendState.readRenderTargetArrayIndex)
- {
- simd16vector vRtai;
- pa.Assemble_simd16(VERTEX_SGV_SLOT, &vRtai);
- simd16scalari vRtaii = _simd16_castps_si(vRtai[VERTEX_SGV_RTAI_COMP]);
- _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
- }
- else
- {
- _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), _simd16_setzero_si());
+ SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), SIMD_T::setzero_si());
}
uint32_t *pPrimID = (uint32_t *)&primID;
else
{
// non simple points need to be potentially binned to multiple macro tiles
- simd16scalar vPointSize;
+ typename SIMD_T::Float vPointSize;
if (rastState.pointParam)
{
- simd16vector size[3];
- pa.Assemble_simd16(VERTEX_SGV_SLOT, size);
+ typename SIMD_T::Vec4 size[3];
+ pa.Assemble(VERTEX_SGV_SLOT, size);
vPointSize = size[0][VERTEX_SGV_POINT_SIZE_COMP];
}
else
{
- vPointSize = _simd16_set1_ps(rastState.pointSize);
+ vPointSize = SIMD_T::set1_ps(rastState.pointSize);
}
// bloat point to bbox
- simd16BBox bbox;
+ SIMDBBOX_T<SIMD_T> bbox;
bbox.xmin = bbox.xmax = vXi;
bbox.ymin = bbox.ymax = vYi;
- simd16scalar vHalfWidth = _simd16_mul_ps(vPointSize, _simd16_set1_ps(0.5f));
- simd16scalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
+ typename SIMD_T::Float vHalfWidth = SIMD_T::mul_ps(vPointSize, SIMD_T::set1_ps(0.5f));
+ typename SIMD_T::Integer vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
- bbox.xmin = _simd16_sub_epi32(bbox.xmin, vHalfWidthi);
- bbox.xmax = _simd16_add_epi32(bbox.xmax, vHalfWidthi);
- bbox.ymin = _simd16_sub_epi32(bbox.ymin, vHalfWidthi);
- bbox.ymax = _simd16_add_epi32(bbox.ymax, vHalfWidthi);
+ bbox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
+ bbox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi);
+ bbox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi);
+ bbox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi);
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
// Gather the AOS effective scissor rects based on the per-prim VP index.
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
{
- simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
+ typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
+
if (state.backendState.readViewportArrayIndex)
{
- GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
- scisXmin, scisYmin, scisXmax, scisYmax);
+ GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
- scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
- scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
- scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
- scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+ scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
+ scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
+ scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
+ scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
- bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
- bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
- bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
- bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
+ bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
+ bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
+ bbox.xmax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
+ bbox.ymax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
}
// Cull bloated points completely outside scissor
- simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax);
- simd16scalari maskOutsideScissorY = _simd16_cmpgt_epi32(bbox.ymin, bbox.ymax);
- simd16scalari maskOutsideScissorXY = _simd16_or_si(maskOutsideScissorX, maskOutsideScissorY);
- uint32_t maskOutsideScissor = _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY));
+ typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
+ typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
+ typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
+ uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
primMask = primMask & ~maskOutsideScissor;
// Convert bbox to macrotile units.
- bbox.xmin = _simd16_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- bbox.ymin = _simd16_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
- bbox.xmax = _simd16_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- bbox.ymax = _simd16_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
+#if SIMD_WA_SXXI_EPI32
+ bbox.xmin = simd_wa_srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
+ bbox.ymin = simd_wa_srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
+ bbox.xmax = simd_wa_srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
+ bbox.ymax = simd_wa_srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
+#else
+ bbox.xmin = SIMD_T::srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
+ bbox.ymin = SIMD_T::srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
+ bbox.xmax = SIMD_T::srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
+ bbox.ymax = SIMD_T::srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
+#endif
- OSALIGNSIMD16(uint32_t) aMTLeft[KNOB_SIMD16_WIDTH], aMTRight[KNOB_SIMD16_WIDTH], aMTTop[KNOB_SIMD16_WIDTH], aMTBottom[KNOB_SIMD16_WIDTH];
+ OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
- _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTLeft), bbox.xmin);
- _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTRight), bbox.xmax);
- _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTTop), bbox.ymin);
- _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTBottom), bbox.ymax);
+ SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft), bbox.xmin);
+ SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight), bbox.xmax);
+ SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop), bbox.ymin);
+ SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
// store render target array index
- OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH];
+ OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
if (state.backendState.readRenderTargetArrayIndex)
{
- simd16vector vRtai[2];
- pa.Assemble_simd16(VERTEX_SGV_SLOT, vRtai);
- simd16scalari vRtaii = _simd16_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
- _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
+ typename SIMD_T::Vec4 vRtai[2];
+ pa.Assemble(VERTEX_SGV_SLOT, vRtai);
+ typename SIMD_T::Integer vRtaii = SIMD_T::castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
+ SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), vRtaii);
}
else
{
- _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), _simd16_setzero_si());
+ SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), SIMD_T::setzero_si());
}
- OSALIGNSIMD16(float) aPointSize[KNOB_SIMD16_WIDTH];
+ OSALIGNSIMD16(float) aPointSize[SIMD_WIDTH];
_simd16_store_ps(reinterpret_cast<float *>(aPointSize), vPointSize);
uint32_t *pPrimID = (uint32_t *)&primID;
- OSALIGNSIMD16(float) aPrimVertsX[KNOB_SIMD16_WIDTH];
- OSALIGNSIMD16(float) aPrimVertsY[KNOB_SIMD16_WIDTH];
- OSALIGNSIMD16(float) aPrimVertsZ[KNOB_SIMD16_WIDTH];
+ OSALIGNSIMD16(float) aPrimVertsX[SIMD_WIDTH];
+ OSALIGNSIMD16(float) aPrimVertsY[SIMD_WIDTH];
+ OSALIGNSIMD16(float) aPrimVertsZ[SIMD_WIDTH];
- _simd16_store_ps(reinterpret_cast<float *>(aPrimVertsX), primVerts.x);
- _simd16_store_ps(reinterpret_cast<float *>(aPrimVertsY), primVerts.y);
- _simd16_store_ps(reinterpret_cast<float *>(aPrimVertsZ), primVerts.z);
+ SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsX), primVerts.x);
+ SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsY), primVerts.y);
+ SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsZ), primVerts.z);
// scan remaining valid prims and bin each separately
const SWR_BACKEND_STATE& backendState = state.backendState;
AR_END(FEBinPoints, 1);
}
-void SIMDCALL BinPoints_simd16(
+//////////////////////////////////////////////////////////////////////////
+/// @brief Bin SIMD points to the backend. Only supports point size of 1
+/// @param pDC - pointer to draw context.
+/// @param pa - The primitive assembly object.
+/// @param workerId - thread's worker id. Each thread has a unique id.
+/// @param prim - Contains point position data for SIMDs worth of points.
+/// @param primID - Primitive ID for each point.
+template <typename SIMD_T, uint32_t SIMD_WIDTH>
+void BinPointsImpl(
DRAW_CONTEXT *pDC,
- PA_STATE& pa,
+ PA_STATE &pa,
uint32_t workerId,
- simd16vector prim[3],
+ typename SIMD_T::Vec4 prim[3],
uint32_t primMask,
- simd16scalari const &primID)
+ typename SIMD_T::Integer const &primID)
{
- simd16vector& primVerts = prim[0];
-
const API_STATE& state = GetApiState(pDC);
const SWR_FRONTEND_STATE& feState = state.frontendState;
const SWR_RASTSTATE& rastState = state.rastState;
// Read back viewport index if required
- simd16scalari viewportIdx = _simd16_set1_epi32(0);
+ typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0);
if (state.backendState.readViewportArrayIndex)
{
- simd16vector vpiAttrib[1];
- pa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib);
+ typename SIMD_T::Vec4 vpiAttrib[1];
+ pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
// OOB indices => forced to zero.
- simd16scalari vpai = _simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
- vpai = _simd16_max_epi32(_simd16_setzero_si(), vpai);
- simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
- simd16scalari vClearMask = _simd16_cmplt_epi32(vpai, vNumViewports);
- viewportIdx = _simd16_and_si(vClearMask, vpai);
+ typename SIMD_T::Integer vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+ vpai = SIMD_T::max_epi32(SIMD_T::setzero_si(), vpai);
+ typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+ typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports);
+ viewportIdx = SIMD_T::and_si(vClearMask, vpai);
}
if (!feState.vpTransformDisable)
{
// perspective divide
- simd16scalar vRecipW0 = _simd16_div_ps(_simd16_set1_ps(1.0f), primVerts.w);
+ typename SIMD_T::Float vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
- primVerts.x = _simd16_mul_ps(primVerts.x, vRecipW0);
- primVerts.y = _simd16_mul_ps(primVerts.y, vRecipW0);
- primVerts.z = _simd16_mul_ps(primVerts.z, vRecipW0);
+ prim[0].x = SIMD_T::mul_ps(prim[0].x, vRecipW0);
+ prim[0].y = SIMD_T::mul_ps(prim[0].y, vRecipW0);
+ prim[0].z = SIMD_T::mul_ps(prim[0].z, vRecipW0);
// viewport transform to screen coords
if (state.backendState.readViewportArrayIndex)
{
- viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
+ viewportTransform<1>(prim, state.vpMatrices, viewportIdx);
}
else
{
- viewportTransform<1>(&primVerts, state.vpMatrices);
+ viewportTransform<1>(prim, state.vpMatrices);
}
}
- const simd16scalar offset = g_pixelOffsets_simd16[rastState.pixelLocation];
+ typename SIMD_T::Float offset = g_pixelOffsets<SIMD_T>[rastState.pixelLocation];
- primVerts.x = _simd16_add_ps(primVerts.x, offset);
- primVerts.y = _simd16_add_ps(primVerts.y, offset);
+ prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
+ prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
- BinPostSetupPoints_simd16(
+ BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(
pDC,
pa,
workerId,
viewportIdx);
}
+void BinPoints( // Non-template entry point for binning a SIMD batch of points; it only forwards to BinPointsImpl.
+ DRAW_CONTEXT *pDC, // pointer to draw context
+ PA_STATE &pa, // the primitive assembly object
+ uint32_t workerId, // thread's worker id
+ simdvector prim[3], // point position data for a SIMD's worth of points
+ uint32_t primMask, // one bit per SIMD lane; the binning code clears a lane's bit to drop that point
+ simdscalari const &primID) // primitive ID for each point
+{
+ BinPointsImpl<SIMD256, KNOB_SIMD_WIDTH>( // shared implementation instantiated with the SIMD256 types and KNOB_SIMD_WIDTH; arguments pass through unchanged
+ pDC,
+ pa,
+ workerId,
+ prim,
+ primMask,
+ primID);
+}
+
+#if USE_SIMD16_FRONTEND
+void SIMDCALL BinPoints_simd16( // SIMD16 counterpart of BinPoints; compiled only under USE_SIMD16_FRONTEND and only forwards to BinPointsImpl.
+ DRAW_CONTEXT *pDC, // pointer to draw context
+ PA_STATE &pa, // the primitive assembly object
+ uint32_t workerId, // thread's worker id
+ simd16vector prim[3], // point position data for a SIMD16's worth of points
+ uint32_t primMask, // one bit per SIMD lane; the binning code clears a lane's bit to drop that point
+ simd16scalari const &primID) // primitive ID for each point
+{
+ BinPointsImpl<SIMD512, KNOB_SIMD16_WIDTH>( // shared implementation instantiated with the SIMD512 types and KNOB_SIMD16_WIDTH; arguments pass through unchanged
+ pDC,
+ pa,
+ workerId,
+ prim,
+ primMask,
+ primID);
+}
+
#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD lines to the backend.
/// @param tri - Contains line position data for SIMDs worth of points.
/// @param primID - Primitive ID for each line.
/// @param viewportIdx - Viewport Array Index for each line.
-void BinPostSetupLines(
- DRAW_CONTEXT *pDC,
- PA_STATE& pa,
- uint32_t workerId,
- simdvector prim[],
- simdscalar recipW[],
- uint32_t primMask,
- simdscalari const &primID,
- simdscalari const &viewportIdx)
-{
- SWR_CONTEXT *pContext = pDC->pContext;
-
- AR_BEGIN(FEBinLines, pDC->drawId);
-
- const API_STATE& state = GetApiState(pDC);
- const SWR_RASTSTATE& rastState = state.rastState;
-
- // Select attribute processor
- PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
- state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
-
- simdscalar& vRecipW0 = recipW[0];
- simdscalar& vRecipW1 = recipW[1];
-
- simd4scalar vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
-
- // convert to fixed point
- simdscalari vXi[2], vYi[2];
- vXi[0] = fpToFixedPointVertical(prim[0].x);
- vYi[0] = fpToFixedPointVertical(prim[0].y);
- vXi[1] = fpToFixedPointVertical(prim[1].x);
- vYi[1] = fpToFixedPointVertical(prim[1].y);
-
- // compute x-major vs y-major mask
- simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1]));
- simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1]));
- simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength));
- uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask);
-
- // cull zero-length lines
- simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si());
- vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si()));
-
- primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));
-
- uint32_t *pPrimID = (uint32_t *)&primID;
- const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
-
- simdscalar vUnused = _simd_setzero_ps();
-
- // Calc bounding box of lines
- simdBBox bbox;
- bbox.xmin = _simd_min_epi32(vXi[0], vXi[1]);
- bbox.xmax = _simd_max_epi32(vXi[0], vXi[1]);
- bbox.ymin = _simd_min_epi32(vYi[0], vYi[1]);
- bbox.ymax = _simd_max_epi32(vYi[0], vYi[1]);
-
- // bloat bbox by line width along minor axis
- simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f);
- simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
- simdBBox bloatBox;
- bloatBox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
- bloatBox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
- bloatBox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
- bloatBox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);
-
- bbox.xmin = _simd_blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
- bbox.xmax = _simd_blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
- bbox.ymin = _simd_blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
- bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
-
- // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
- {
- simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
- if (state.backendState.readViewportArrayIndex)
- {
- GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
- scisXmin, scisYmin, scisXmax, scisYmax);
- }
- else // broadcast fast path for non-VPAI case.
- {
- scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
- scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
- scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
- scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
- }
-
- bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
- bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
- bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
- bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
- }
-
- // Cull prims completely outside scissor
- {
- simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
- simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
- simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
- uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
- primMask = primMask & ~maskOutsideScissor;
- }
-
- if (!primMask)
- {
- goto endBinLines;
- }
-
- // Convert triangle bbox to macrotile units.
- bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
- bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
-
- OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
- _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
- _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
- _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
- _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
-
- // transpose verts needed for backend
- /// @todo modify BE to take non-transformed verts
- vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
- vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
- vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
- vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused);
-
- // store render target array index
- OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
- if (state.backendState.readRenderTargetArrayIndex)
- {
- simdvector vRtai[2];
- pa.Assemble(VERTEX_SGV_SLOT, vRtai);
- simdscalari vRtaii = _simd_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
- _simd_store_si((simdscalari*)aRTAI, vRtaii);
- }
- else
- {
- _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
- }
-
- // scan remaining valid prims and bin each separately
- DWORD primIndex;
- while (_BitScanForward(&primIndex, primMask))
- {
- uint32_t linkageCount = state.backendState.numAttributes;
- uint32_t numScalarAttribs = linkageCount * 4;
-
- BE_WORK work;
- work.type = DRAW;
-
- TRIANGLE_WORK_DESC &desc = work.desc.tri;
-
- desc.triFlags.frontFacing = 1;
- desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
- desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
- desc.triFlags.viewportIndex = pViewportIndex[primIndex];
-
- work.pfnWork = RasterizeLine;
-
- auto pArena = pDC->pArena;
- SWR_ASSERT(pArena != nullptr);
-
- // store active attribs
- desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
- desc.numAttribs = linkageCount;
- pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
-
- // store line vertex data
- desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
- SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
- SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
- SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
- SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
-
- // store user clip distances
- if (rastState.clipDistanceMask)
- {
- uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
- desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
- ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, &desc.pTriBuffer[12], desc.pUserClipBuffer);
- }
-
- MacroTileMgr *pTileMgr = pDC->pTileMgr;
- for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
- {
- for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
- {
-#if KNOB_ENABLE_TOSS_POINTS
- if (!KNOB_TOSS_SETUP_TRIS)
-#endif
- {
- pTileMgr->enqueue(x, y, &work);
- }
- }
- }
-
- primMask &= ~(1 << primIndex);
- }
-
-endBinLines:
-
- AR_END(FEBinLines, 1);
-}
-
-#if USE_SIMD16_FRONTEND
-void BinPostSetupLines_simd16(
+template <typename SIMD_T, uint32_t SIMD_WIDTH>
+void BinPostSetupLinesImpl(
DRAW_CONTEXT *pDC,
- PA_STATE& pa,
+ PA_STATE &pa,
uint32_t workerId,
- simd16vector prim[],
- simd16scalar recipW[],
+ typename SIMD_T::Vec4 prim[],
+ typename SIMD_T::Float recipW[],
uint32_t primMask,
- simd16scalari const &primID,
- simd16scalari const &viewportIdx)
+ typename SIMD_T::Integer const &primID,
+ typename SIMD_T::Integer const &viewportIdx)
{
SWR_CONTEXT *pContext = pDC->pContext;
AR_BEGIN(FEBinLines, pDC->drawId);
- const API_STATE& state = GetApiState(pDC);
- const SWR_RASTSTATE& rastState = state.rastState;
+ const API_STATE &state = GetApiState(pDC);
+ const SWR_RASTSTATE &rastState = state.rastState;
// Select attribute processor
PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
- simd16scalar& vRecipW0 = recipW[0];
- simd16scalar& vRecipW1 = recipW[1];
+ typename SIMD_T::Float &vRecipW0 = recipW[0];
+ typename SIMD_T::Float &vRecipW1 = recipW[1];
// convert to fixed point
- simd16scalari vXi[2], vYi[2];
+ typename SIMD_T::Integer vXi[2], vYi[2];
- vXi[0] = fpToFixedPointVertical(prim[0].x);
- vYi[0] = fpToFixedPointVertical(prim[0].y);
- vXi[1] = fpToFixedPointVertical(prim[1].x);
- vYi[1] = fpToFixedPointVertical(prim[1].y);
+ vXi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].x);
+ vYi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].y);
+ vXi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].x);
+ vYi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].y);
// compute x-major vs y-major mask
- simd16scalari xLength = _simd16_abs_epi32(_simd16_sub_epi32(vXi[0], vXi[1]));
- simd16scalari yLength = _simd16_abs_epi32(_simd16_sub_epi32(vYi[0], vYi[1]));
- simd16scalar vYmajorMask = _simd16_castsi_ps(_simd16_cmpgt_epi32(yLength, xLength));
- uint32_t yMajorMask = _simd16_movemask_ps(vYmajorMask);
+ typename SIMD_T::Integer xLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vXi[0], vXi[1]));
+ typename SIMD_T::Integer yLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vYi[0], vYi[1]));
+ typename SIMD_T::Float vYmajorMask = SIMD_T::castsi_ps(SIMD_T::cmpgt_epi32(yLength, xLength));
+ uint32_t yMajorMask = SIMD_T::movemask_ps(vYmajorMask);
// cull zero-length lines
- simd16scalari vZeroLengthMask = _simd16_cmpeq_epi32(xLength, _simd16_setzero_si());
- vZeroLengthMask = _simd16_and_si(vZeroLengthMask, _simd16_cmpeq_epi32(yLength, _simd16_setzero_si()));
+ typename SIMD_T::Integer vZeroLengthMask = SIMD_T::cmpeq_epi32(xLength, SIMD_T::setzero_si());
+ vZeroLengthMask = SIMD_T::and_si(vZeroLengthMask, SIMD_T::cmpeq_epi32(yLength, SIMD_T::setzero_si()));
- primMask &= ~_simd16_movemask_ps(_simd16_castsi_ps(vZeroLengthMask));
+ primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vZeroLengthMask));
uint32_t *pPrimID = (uint32_t *)&primID;
const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
// Calc bounding box of lines
- simd16BBox bbox;
- bbox.xmin = _simd16_min_epi32(vXi[0], vXi[1]);
- bbox.xmax = _simd16_max_epi32(vXi[0], vXi[1]);
- bbox.ymin = _simd16_min_epi32(vYi[0], vYi[1]);
- bbox.ymax = _simd16_max_epi32(vYi[0], vYi[1]);
+ SIMDBBOX_T<SIMD_T> bbox;
+ bbox.xmin = SIMD_T::min_epi32(vXi[0], vXi[1]);
+ bbox.xmax = SIMD_T::max_epi32(vXi[0], vXi[1]);
+ bbox.ymin = SIMD_T::min_epi32(vYi[0], vYi[1]);
+ bbox.ymax = SIMD_T::max_epi32(vYi[0], vYi[1]);
// bloat bbox by line width along minor axis
- simd16scalar vHalfWidth = _simd16_set1_ps(rastState.lineWidth / 2.0f);
- simd16scalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
+ typename SIMD_T::Float vHalfWidth = SIMD_T::set1_ps(rastState.lineWidth / 2.0f);
+ typename SIMD_T::Integer vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
- simd16BBox bloatBox;
+ SIMDBBOX_T<SIMD_T> bloatBox;
- bloatBox.xmin = _simd16_sub_epi32(bbox.xmin, vHalfWidthi);
- bloatBox.xmax = _simd16_add_epi32(bbox.xmax, vHalfWidthi);
- bloatBox.ymin = _simd16_sub_epi32(bbox.ymin, vHalfWidthi);
- bloatBox.ymax = _simd16_add_epi32(bbox.ymax, vHalfWidthi);
+ bloatBox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
+ bloatBox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi);
+ bloatBox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi);
+ bloatBox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi);
- bbox.xmin = _simd16_blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
- bbox.xmax = _simd16_blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
- bbox.ymin = _simd16_blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
- bbox.ymax = _simd16_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
+ bbox.xmin = SIMD_T::blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
+ bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
+ bbox.ymin = SIMD_T::blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
+ bbox.ymax = SIMD_T::blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
{
- simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
+ typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
if (state.backendState.readViewportArrayIndex)
{
- GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
- scisXmin, scisYmin, scisXmax, scisYmax);
+ GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
- scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
- scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
- scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
- scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+ scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
+ scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
+ scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
+ scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
}
- bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
- bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
- bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
- bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
+ bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
+ bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
+ bbox.xmax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
+ bbox.ymax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
}
// Cull prims completely outside scissor
{
- simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax);
- simd16scalari maskOutsideScissorY = _simd16_cmpgt_epi32(bbox.ymin, bbox.ymax);
- simd16scalari maskOutsideScissorXY = _simd16_or_si(maskOutsideScissorX, maskOutsideScissorY);
- uint32_t maskOutsideScissor = _simd16_movemask_ps(_simd16_castsi_ps(maskOutsideScissorXY));
+ typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
+ typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
+ typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
+ uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
primMask = primMask & ~maskOutsideScissor;
}
- const simdscalar unused = _simd_setzero_ps();
-
// transpose verts needed for backend
/// @todo modify BE to take non-transformed verts
- simd4scalar vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
- simd4scalar vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
- simd4scalar vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
- simd4scalar vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+ simd4scalar vHorizX[SIMD_WIDTH];
+ simd4scalar vHorizY[SIMD_WIDTH];
+ simd4scalar vHorizZ[SIMD_WIDTH];
+ simd4scalar vHorizW[SIMD_WIDTH];
if (!primMask)
{
}
// Convert line bbox to macrotile units.
- bbox.xmin = _simd16_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- bbox.ymin = _simd16_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
- bbox.xmax = _simd16_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
- bbox.ymax = _simd16_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
-
- OSALIGNSIMD16(uint32_t) aMTLeft[KNOB_SIMD16_WIDTH], aMTRight[KNOB_SIMD16_WIDTH], aMTTop[KNOB_SIMD16_WIDTH], aMTBottom[KNOB_SIMD16_WIDTH];
+#if SIMD_WA_SXXI_EPI32
+ bbox.xmin = simd_wa_srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
+ bbox.ymin = simd_wa_srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
+ bbox.xmax = simd_wa_srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
+ bbox.ymax = simd_wa_srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
+#else
+ bbox.xmin = SIMD_T::srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
+ bbox.ymin = SIMD_T::srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
+ bbox.xmax = SIMD_T::srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
+ bbox.ymax = SIMD_T::srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
+#endif
- _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTLeft), bbox.xmin);
- _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTRight), bbox.xmax);
- _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTTop), bbox.ymin);
- _simd16_store_si(reinterpret_cast<simd16scalari *>(aMTBottom), bbox.ymax);
+ OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
- vTranspose3x8(vHorizX[0], _simd16_extract_ps(prim[0].x, 0), _simd16_extract_ps(prim[1].x, 0), unused);
- vTranspose3x8(vHorizY[0], _simd16_extract_ps(prim[0].y, 0), _simd16_extract_ps(prim[1].y, 0), unused);
- vTranspose3x8(vHorizZ[0], _simd16_extract_ps(prim[0].z, 0), _simd16_extract_ps(prim[1].z, 0), unused);
- vTranspose3x8(vHorizW[0], _simd16_extract_ps(vRecipW0, 0), _simd16_extract_ps(vRecipW1, 0), unused);
+ SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft), bbox.xmin);
+ SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight), bbox.xmax);
+ SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop), bbox.ymin);
+ SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
- vTranspose3x8(vHorizX[1], _simd16_extract_ps(prim[0].x, 1), _simd16_extract_ps(prim[1].x, 1), unused);
- vTranspose3x8(vHorizY[1], _simd16_extract_ps(prim[0].y, 1), _simd16_extract_ps(prim[1].y, 1), unused);
- vTranspose3x8(vHorizZ[1], _simd16_extract_ps(prim[0].z, 1), _simd16_extract_ps(prim[1].z, 1), unused);
- vTranspose3x8(vHorizW[1], _simd16_extract_ps(vRecipW0, 1), _simd16_extract_ps(vRecipW1, 1), unused);
+ TransposeVertices(vHorizX, prim[0].x, prim[1].x, SIMD_T::setzero_ps());
+ TransposeVertices(vHorizY, prim[0].y, prim[1].y, SIMD_T::setzero_ps());
+ TransposeVertices(vHorizZ, prim[0].z, prim[1].z, SIMD_T::setzero_ps());
+ TransposeVertices(vHorizW, vRecipW0, vRecipW1, SIMD_T::setzero_ps());
// store render target array index
- OSALIGNSIMD16(uint32_t) aRTAI[KNOB_SIMD16_WIDTH];
+ OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
if (state.backendState.readRenderTargetArrayIndex)
{
- simd16vector vRtai[2];
- pa.Assemble_simd16(VERTEX_SGV_SLOT, vRtai);
- simd16scalari vRtaii = _simd16_castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
- _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), vRtaii);
+ typename SIMD_T::Vec4 vRtai[2];
+ pa.Assemble(VERTEX_SGV_SLOT, vRtai);
+ typename SIMD_T::Integer vRtaii = SIMD_T::castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
+ SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), vRtaii);
}
else
{
- _simd16_store_si(reinterpret_cast<simd16scalari *>(aRTAI), _simd16_setzero_si());
+ SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), SIMD_T::setzero_si());
}
// scan remaining valid prims and bin each separately
// store line vertex data
desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
- {
- const uint32_t i = primIndex >> 3; // triIndex / KNOB_SIMD_WIDTH
- const uint32_t j = primIndex & 7; // triIndex % KNOB_SIMD_WIDTH
-
- _mm_store_ps(&desc.pTriBuffer[ 0], vHorizX[i][j]);
- _mm_store_ps(&desc.pTriBuffer[ 4], vHorizY[i][j]);
- _mm_store_ps(&desc.pTriBuffer[ 8], vHorizZ[i][j]);
- _mm_store_ps(&desc.pTriBuffer[12], vHorizW[i][j]);
- }
+ _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
+ _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
+ _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
+ _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
// store user clip distances
if (rastState.clipDistanceMask)
AR_END(FEBinLines, 1);
}
-#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD lines to the backend.
/// @param pDC - pointer to draw context.
/// @param prim - Contains line position data for a SIMD's worth of lines.
/// @param primID - Primitive ID for each line.
/// @param viewportIdx - Viewport Array Index for each line.
-void BinLines(
+template <typename SIMD_T, uint32_t SIMD_WIDTH>
+void SIMDCALL BinLinesImpl(
DRAW_CONTEXT *pDC,
- PA_STATE& pa,
+ PA_STATE &pa,
uint32_t workerId,
- simdvector prim[],
+ typename SIMD_T::Vec4 prim[3],
uint32_t primMask,
- simdscalari const &primID)
+ typename SIMD_T::Integer const &primID)
{
const API_STATE& state = GetApiState(pDC);
const SWR_RASTSTATE& rastState = state.rastState;
const SWR_FRONTEND_STATE& feState = state.frontendState;
- simdscalar vRecipW[2] = { _simd_set1_ps(1.0f), _simd_set1_ps(1.0f) };
+ typename SIMD_T::Float vRecipW[2] = { SIMD_T::set1_ps(1.0f), SIMD_T::set1_ps(1.0f) };
- simdscalari viewportIdx = _simd_set1_epi32(0);
+ typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0);
if (state.backendState.readViewportArrayIndex)
{
- simdvector vpiAttrib[2];
+ typename SIMD_T::Vec4 vpiAttrib[2];
pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
- simdscalari vpai = _simd_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
- vpai = _simd_max_epi32(_simd_setzero_si(), vpai);
// OOB indices => forced to zero.
- simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
- simdscalari vClearMask = _simd_cmplt_epi32(vpai, vNumViewports);
- viewportIdx = _simd_and_si(vClearMask, vpai);
+ typename SIMD_T::Integer vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+ vpai = SIMD_T::max_epi32(SIMD_T::setzero_si(), vpai);
+ typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+ typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports);
+ viewportIdx = SIMD_T::and_si(vClearMask, vpai);
}
if (!feState.vpTransformDisable)
{
// perspective divide
- vRecipW[0] = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w);
- vRecipW[1] = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w);
+ vRecipW[0] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
+ vRecipW[1] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[1].w);
- prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW[0]);
- prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW[1]);
+ prim[0].v[0] = SIMD_T::mul_ps(prim[0].v[0], vRecipW[0]);
+ prim[1].v[0] = SIMD_T::mul_ps(prim[1].v[0], vRecipW[1]);
- prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW[0]);
- prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW[1]);
+ prim[0].v[1] = SIMD_T::mul_ps(prim[0].v[1], vRecipW[0]);
+ prim[1].v[1] = SIMD_T::mul_ps(prim[1].v[1], vRecipW[1]);
- prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW[0]);
- prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW[1]);
+ prim[0].v[2] = SIMD_T::mul_ps(prim[0].v[2], vRecipW[0]);
+ prim[1].v[2] = SIMD_T::mul_ps(prim[1].v[2], vRecipW[1]);
// viewport transform to screen coords
if (state.backendState.readViewportArrayIndex)
}
// adjust for pixel center location
- simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
- prim[0].x = _simd_add_ps(prim[0].x, offset);
- prim[0].y = _simd_add_ps(prim[0].y, offset);
+ typename SIMD_T::Float offset = g_pixelOffsets<SIMD_T>[rastState.pixelLocation];
+
+ prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
+ prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
- prim[1].x = _simd_add_ps(prim[1].x, offset);
- prim[1].y = _simd_add_ps(prim[1].y, offset);
+ prim[1].x = SIMD_T::add_ps(prim[1].x, offset);
+ prim[1].y = SIMD_T::add_ps(prim[1].y, offset);
- BinPostSetupLines(
+ BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(
pDC,
pa,
workerId,
viewportIdx);
}
+void BinLines(
+ DRAW_CONTEXT *pDC,
+ PA_STATE &pa,
+ uint32_t workerId,
+ simdvector prim[],
+ uint32_t primMask,
+ simdscalari const &primID)
+{
+ BinLinesImpl<SIMD256, KNOB_SIMD_WIDTH>(pDC, pa, workerId, prim, primMask, primID);
+}
+
#if USE_SIMD16_FRONTEND
void SIMDCALL BinLines_simd16(
DRAW_CONTEXT *pDC,
- PA_STATE& pa,
+ PA_STATE &pa,
uint32_t workerId,
simd16vector prim[3],
uint32_t primMask,
simd16scalari const &primID)
{
- const API_STATE& state = GetApiState(pDC);
- const SWR_RASTSTATE& rastState = state.rastState;
- const SWR_FRONTEND_STATE& feState = state.frontendState;
-
- simd16scalar vRecipW[2] = { _simd16_set1_ps(1.0f), _simd16_set1_ps(1.0f) };
-
- simd16scalari viewportIdx = _simd16_set1_epi32(0);
- if (state.backendState.readViewportArrayIndex)
- {
- simd16vector vpiAttrib[2];
- pa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib);
-
- // OOB indices => forced to zero.
- simd16scalari vpai = _simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
- vpai = _simd16_max_epi32(_simd16_setzero_si(), vpai);
- simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
- simd16scalari vClearMask = _simd16_cmplt_epi32(vpai, vNumViewports);
- viewportIdx = _simd16_and_si(vClearMask, vpai);
- }
-
- if (!feState.vpTransformDisable)
- {
- // perspective divide
- vRecipW[0] = _simd16_div_ps(_simd16_set1_ps(1.0f), prim[0].w);
- vRecipW[1] = _simd16_div_ps(_simd16_set1_ps(1.0f), prim[1].w);
-
- prim[0].v[0] = _simd16_mul_ps(prim[0].v[0], vRecipW[0]);
- prim[1].v[0] = _simd16_mul_ps(prim[1].v[0], vRecipW[1]);
-
- prim[0].v[1] = _simd16_mul_ps(prim[0].v[1], vRecipW[0]);
- prim[1].v[1] = _simd16_mul_ps(prim[1].v[1], vRecipW[1]);
-
- prim[0].v[2] = _simd16_mul_ps(prim[0].v[2], vRecipW[0]);
- prim[1].v[2] = _simd16_mul_ps(prim[1].v[2], vRecipW[1]);
-
- // viewport transform to screen coords
- if (state.backendState.readViewportArrayIndex)
- {
- viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
- }
- else
- {
- viewportTransform<2>(prim, state.vpMatrices);
- }
- }
-
- // adjust for pixel center location
- simd16scalar offset = g_pixelOffsets_simd16[rastState.pixelLocation];
-
- prim[0].x = _simd16_add_ps(prim[0].x, offset);
- prim[0].y = _simd16_add_ps(prim[0].y, offset);
-
- prim[1].x = _simd16_add_ps(prim[1].x, offset);
- prim[1].y = _simd16_add_ps(prim[1].y, offset);
-
- BinPostSetupLines_simd16(
- pDC,
- pa,
- workerId,
- prim,
- vRecipW,
- primMask,
- primID,
- viewportIdx);
+ BinLinesImpl<SIMD512, KNOB_SIMD16_WIDTH>(pDC, pa, workerId, prim, primMask, primID);
}
#endif