#include "rdtsc_core.h"
// Temp storage used by the clipper
-extern THREAD simdvertex tlsTempVertices[7];
+extern THREAD SIMDVERTEX_T<SIMD256> tlsTempVertices[7];
#if USE_SIMD16_FRONTEND
-extern THREAD simd16vertex tlsTempVertices_simd16[7];
+extern THREAD SIMDVERTEX_T<SIMD512> tlsTempVertices_simd16[7];
#endif
enum SWR_CLIPCODES
#define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW)
-INLINE
-void ComputeClipCodes(const API_STATE& state, const simdvector& vertex, simdscalar& clipCodes, simdscalari const &viewportIndexes)
+template<typename SIMD_T>
+void ComputeClipCodes(const API_STATE &state, const typename SIMD_T::Vec4 &vertex, typename SIMD_T::Float &clipCodes, typename SIMD_T::Integer const &viewportIndexes)
{
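+ // Accumulate, per SIMD lane, the SWR_CLIPCODES plane bits the vertex lies outside of.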
- clipCodes = _simd_setzero_ps();
+ clipCodes = SIMD_T::setzero_ps();
// -w
- simdscalar vNegW = _simd_mul_ps(vertex.w, _simd_set1_ps(-1.0f));
+ typename SIMD_T::Float vNegW = SIMD_T::mul_ps(vertex.w, SIMD_T::set1_ps(-1.0f));
// FRUSTUM_LEFT
- simdscalar vRes = _simd_cmplt_ps(vertex.x, vNegW);
- clipCodes = _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_LEFT)));
+ typename SIMD_T::Float vRes = SIMD_T::cmplt_ps(vertex.x, vNegW);
+ clipCodes = SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT)));
// FRUSTUM_TOP
- vRes = _simd_cmplt_ps(vertex.y, vNegW);
- clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_TOP))));
+ vRes = SIMD_T::cmplt_ps(vertex.y, vNegW);
+ clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP))));
// FRUSTUM_RIGHT
- vRes = _simd_cmpgt_ps(vertex.x, vertex.w);
- clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_RIGHT))));
+ vRes = SIMD_T::cmpgt_ps(vertex.x, vertex.w);
+ clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_RIGHT))));
// FRUSTUM_BOTTOM
- vRes = _simd_cmpgt_ps(vertex.y, vertex.w);
- clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_BOTTOM))));
+ vRes = SIMD_T::cmpgt_ps(vertex.y, vertex.w);
+ clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_BOTTOM))));
if (state.rastState.depthClipEnable)
{
// DX clips depth [0..w], GL clips [-w..w]
if (state.rastState.clipHalfZ)
{
- vRes = _simd_cmplt_ps(vertex.z, _simd_setzero_ps());
+ vRes = SIMD_T::cmplt_ps(vertex.z, SIMD_T::setzero_ps());
}
else
{
- vRes = _simd_cmplt_ps(vertex.z, vNegW);
+ vRes = SIMD_T::cmplt_ps(vertex.z, vNegW);
}
- clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_NEAR))));
+ clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_NEAR))));
// FRUSTUM_FAR
- vRes = _simd_cmpgt_ps(vertex.z, vertex.w);
- clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_FAR))));
+ vRes = SIMD_T::cmpgt_ps(vertex.z, vertex.w);
+ clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_FAR))));
}
// NEGW
- vRes = _simd_cmple_ps(vertex.w, _simd_setzero_ps());
- clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(NEGW))));
+ vRes = SIMD_T::cmple_ps(vertex.w, SIMD_T::setzero_ps());
+ clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW))));
// GUARDBAND_LEFT
- simdscalar gbMult = _simd_mul_ps(vNegW, _simd_i32gather_ps(&state.gbState.left[0], viewportIndexes, 4));
- vRes = _simd_cmplt_ps(vertex.x, gbMult);
- clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_LEFT))));
+ typename SIMD_T::Float gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.left[0], viewportIndexes));
+ vRes = SIMD_T::cmplt_ps(vertex.x, gbMult);
+ clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT))));
// GUARDBAND_TOP
- gbMult = _simd_mul_ps(vNegW, _simd_i32gather_ps(&state.gbState.top[0], viewportIndexes, 4));
- vRes = _simd_cmplt_ps(vertex.y, gbMult);
- clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_TOP))));
+ gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.top[0], viewportIndexes));
+ vRes = SIMD_T::cmplt_ps(vertex.y, gbMult);
+ clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP))));
// GUARDBAND_RIGHT
- gbMult = _simd_mul_ps(vertex.w, _simd_i32gather_ps(&state.gbState.right[0], viewportIndexes, 4));
- vRes = _simd_cmpgt_ps(vertex.x, gbMult);
- clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_RIGHT))));
+ gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.right[0], viewportIndexes));
+ vRes = SIMD_T::cmpgt_ps(vertex.x, gbMult);
+ clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT))));
// GUARDBAND_BOTTOM
- gbMult = _simd_mul_ps(vertex.w, _simd_i32gather_ps(&state.gbState.bottom[0], viewportIndexes, 4));
- vRes = _simd_cmpgt_ps(vertex.y, gbMult);
- clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_BOTTOM))));
+ gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.bottom[0], viewportIndexes));
+ vRes = SIMD_T::cmpgt_ps(vertex.y, gbMult);
+ clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM))));
}
-#if USE_SIMD16_FRONTEND
-INLINE
-void ComputeClipCodes(const API_STATE& state, const simd16vector& vertex, simd16scalar& clipCodes, simd16scalari const &viewportIndexes)
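+// BinnerChooser selects the binning function (points, lines, or triangles with
+// optional conservative rasterization) for the SIMD width given by SIMD_T.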
+template<typename SIMD_T>
+struct BinnerChooser
{
- clipCodes = _simd16_setzero_ps();
-
- // -w
- simd16scalar vNegW = _simd16_mul_ps(vertex.w, _simd16_set1_ps(-1.0f));
-
- // FRUSTUM_LEFT
- simd16scalar vRes = _simd16_cmplt_ps(vertex.x, vNegW);
- clipCodes = _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(FRUSTUM_LEFT)));
-
- // FRUSTUM_TOP
- vRes = _simd16_cmplt_ps(vertex.y, vNegW);
- clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(FRUSTUM_TOP))));
-
- // FRUSTUM_RIGHT
- vRes = _simd16_cmpgt_ps(vertex.x, vertex.w);
- clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(FRUSTUM_RIGHT))));
+};
- // FRUSTUM_BOTTOM
- vRes = _simd16_cmpgt_ps(vertex.y, vertex.w);
- clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(FRUSTUM_BOTTOM))));
+template<>
+struct BinnerChooser<SIMD256>
+{
+ PFN_PROCESS_PRIMS pfnBinFunc;
- if (state.rastState.depthClipEnable)
+ BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
+ :pfnBinFunc(nullptr)
{
- // FRUSTUM_NEAR
- // DX clips depth [0..w], GL clips [-w..w]
- if (state.rastState.clipHalfZ)
+ if (numVertsPerPrim == 3)
{
- vRes = _simd16_cmplt_ps(vertex.z, _simd16_setzero_ps());
+ pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
+
+ }
+ else if (numVertsPerPrim == 2)
+ {
+ pfnBinFunc = BinLines;
}
else
{
- vRes = _simd16_cmplt_ps(vertex.z, vNegW);
+ SWR_ASSERT(0 && "Unexpected points in clipper.");
}
- clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(FRUSTUM_NEAR))));
-
- // FRUSTUM_FAR
- vRes = _simd16_cmpgt_ps(vertex.z, vertex.w);
- clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(FRUSTUM_FAR))));
}
- // NEGW
- vRes = _simd16_cmple_ps(vertex.w, _simd16_setzero_ps());
- clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(NEGW))));
-
- // GUARDBAND_LEFT
- simd16scalar gbMult = _simd16_mul_ps(vNegW, _simd16_i32gather_ps(&state.gbState.left[0], viewportIndexes, 4));
- vRes = _simd16_cmplt_ps(vertex.x, gbMult);
- clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(GUARDBAND_LEFT))));
-
- // GUARDBAND_TOP
- gbMult = _simd16_mul_ps(vNegW, _simd16_i32gather_ps(&state.gbState.top[0], viewportIndexes, 4));
- vRes = _simd16_cmplt_ps(vertex.y, gbMult);
- clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(GUARDBAND_TOP))));
-
- // GUARDBAND_RIGHT
- gbMult = _simd16_mul_ps(vertex.w, _simd16_i32gather_ps(&state.gbState.right[0], viewportIndexes, 4));
- vRes = _simd16_cmpgt_ps(vertex.x, gbMult);
- clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(GUARDBAND_RIGHT))));
-
- // GUARDBAND_BOTTOM
- gbMult = _simd16_mul_ps(vertex.w, _simd16_i32gather_ps(&state.gbState.bottom[0], viewportIndexes, 4));
- vRes = _simd16_cmpgt_ps(vertex.y, gbMult);
- clipCodes = _simd16_or_ps(clipCodes, _simd16_and_ps(vRes, _simd16_castsi_ps(_simd16_set1_epi32(GUARDBAND_BOTTOM))));
-}
-
-#endif
-template<uint32_t NumVertsPerPrim>
-class Clipper
-{
-public:
- INLINE Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
- workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC))
+ BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
+ :pfnBinFunc(nullptr)
{
- static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
+ switch (topology)
+ {
+ case TOP_POINT_LIST:
+ pfnBinFunc = BinPoints;
+ break;
+ case TOP_LINE_LIST:
+ case TOP_LINE_STRIP:
+ case TOP_LINE_LOOP:
+ case TOP_LINE_LIST_ADJ:
+ case TOP_LISTSTRIP_ADJ:
+ pfnBinFunc = BinLines;
+ break;
+ default:
+ pfnBinFunc = GetBinTrianglesFunc(conservativeRast > 0);
+ break;
+ };
}
- INLINE void ComputeClipCodes(simdvector vertex[], simdscalari const &viewportIndexes)
+ void BinFunc(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, SIMD256::Vec4 prims[], uint32_t primMask, SIMD256::Integer const &primID)
{
- for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
- {
- ::ComputeClipCodes(this->state, vertex[i], this->clipCodes[i], viewportIndexes);
- }
+ SWR_ASSERT(pfnBinFunc != nullptr);
+
+ pfnBinFunc(pDC, pa, workerId, prims, primMask, primID);
}
+};
#if USE_SIMD16_FRONTEND
- INLINE void ComputeClipCodes(simd16vector vertex[], simd16scalari const &viewportIndexes)
+template<>
+struct BinnerChooser<SIMD512>
+{
+ PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc;
+
+ BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
+ :pfnBinFunc(nullptr)
{
- for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
+ if (numVertsPerPrim == 3)
+ {
+ pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
+
+ }
+ else if (numVertsPerPrim == 2)
+ {
+ pfnBinFunc = BinLines_simd16;
+ }
+ else
{
- ::ComputeClipCodes(this->state, vertex[i], this->clipCodes_simd16[i], viewportIndexes);
+ SWR_ASSERT(0 && "Unexpected points in clipper.");
}
}
-#endif
- INLINE simdscalar ComputeClipCodeIntersection()
+ BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
+ :pfnBinFunc(nullptr)
{
- simdscalar result = this->clipCodes[0];
- for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
+ switch (topology)
{
- result = _simd_and_ps(result, this->clipCodes[i]);
- }
- return result;
+ case TOP_POINT_LIST:
+ pfnBinFunc = BinPoints_simd16;
+ break;
+ case TOP_LINE_LIST:
+ case TOP_LINE_STRIP:
+ case TOP_LINE_LOOP:
+ case TOP_LINE_LIST_ADJ:
+ case TOP_LISTSTRIP_ADJ:
+ pfnBinFunc = BinLines_simd16;
+ break;
+ default:
+ pfnBinFunc = GetBinTrianglesFunc_simd16(conservativeRast > 0);
+ break;
+ };
}
-#if USE_SIMD16_FRONTEND
- INLINE simd16scalar ComputeClipCodeIntersection_simd16()
+ void BinFunc(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, SIMD512::Vec4 prims[], uint32_t primMask, SIMD512::Integer const &primID)
{
- simd16scalar result = this->clipCodes_simd16[0];
- for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
- {
- result = _simd16_and_ps(result, this->clipCodes_simd16[i]);
- }
- return result;
+ SWR_ASSERT(pfnBinFunc != nullptr);
+
+ pfnBinFunc(pDC, pa, workerId, prims, primMask, primID);
}
+};
#endif
- INLINE simdscalar ComputeClipCodeUnion()
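+// SimdHelper papers over width differences: insert_lo_ps widens a SIMD256
+// gather result to the native register (a no-op for SIMD256), and
+// cmpeq_ps_mask returns a lane mask from a float equality compare.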
+template<typename SIMD_T>
+struct SimdHelper
+{
+};
+
+template<>
+struct SimdHelper<SIMD256>
+{
+ static SIMD256::Float insert_lo_ps(SIMD256::Float a)
{
- simdscalar result = this->clipCodes[0];
- for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
- {
- result = _simd_or_ps(result, this->clipCodes[i]);
- }
- return result;
+ return a;
+ }
+
+ static SIMD256::Mask cmpeq_ps_mask(SIMD256::Float a, SIMD256::Float b)
+ {
+ return SIMD256::movemask_ps(SIMD256::cmpeq_ps(a, b));
}
+};
#if USE_SIMD16_FRONTEND
- INLINE simd16scalar ComputeClipCodeUnion_simd16()
+template<>
+struct SimdHelper<SIMD512>
+{
+ static SIMD512::Float insert_lo_ps(SIMD256::Float a)
{
- simd16scalar result = this->clipCodes_simd16[0];
- for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
- {
- result = _simd16_or_ps(result, this->clipCodes_simd16[i]);
- }
- return result;
+ return SIMD512::insert_ps<0>(SIMD512::setzero_ps(), a);
}
-#endif
- INLINE int ComputeNegWMask()
+ static SIMD512::Mask cmpeq_ps_mask(SIMD512::Float a, SIMD512::Float b)
{
- simdscalar clipCodeUnion = ComputeClipCodeUnion();
- clipCodeUnion = _simd_and_ps(clipCodeUnion, _simd_castsi_ps(_simd_set1_epi32(NEGW)));
- return _simd_movemask_ps(_simd_cmpneq_ps(clipCodeUnion, _simd_setzero_ps()));
+ return SIMD512::cmp_ps_mask<SIMD512::CompareType::EQ_OQ>(a, b);
}
+};
- INLINE int ComputeClipMask()
+#endif
+// Temp storage used by the clipper
+template<typename SIMD_T>
+struct ClipHelper
+{
+};
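+// The specializations below return the thread-local scratch vertices matching the SIMD width.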
+
+template<>
+struct ClipHelper<SIMD256>
+{
+ static SIMDVERTEX_T<SIMD256> *GetTempVertices()
{
- simdscalar clipUnion = ComputeClipCodeUnion();
- clipUnion = _simd_and_ps(clipUnion, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_CLIP_MASK)));
- return _simd_movemask_ps(_simd_cmpneq_ps(clipUnion, _simd_setzero_ps()));
+ return tlsTempVertices;
}
+};
#if USE_SIMD16_FRONTEND
- INLINE int ComputeClipMask_simd16()
+template<>
+struct ClipHelper<SIMD512>
+{
+ static SIMDVERTEX_T<SIMD512> *GetTempVertices()
{
- simd16scalar clipUnion = ComputeClipCodeUnion_simd16();
- clipUnion = _simd16_and_ps(clipUnion, _simd16_castsi_ps(_simd16_set1_epi32(GUARDBAND_CLIP_MASK)));
- return _simd16_movemask_ps(_simd16_cmpneq_ps(clipUnion, _simd16_setzero_ps()));
+ return tlsTempVertices_simd16;
}
+};
#endif
- // clipper is responsible for culling any prims with NAN coordinates
- INLINE int ComputeNaNMask(simdvector prim[])
+template<typename SIMD_T, uint32_t NumVertsPerPrim>
+class Clipper
+{
+public:
+ INLINE Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
+ workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC))
{
- simdscalar vNanMask = _simd_setzero_ps();
- for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
+ static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
+ }
+
+ void ComputeClipCodes(typename SIMD_T::Vec4 vertex[], const typename SIMD_T::Integer &viewportIndexes)
+ {
+ for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
{
- simdscalar vNan01 = _simd_cmp_ps(prim[e].v[0], prim[e].v[1], _CMP_UNORD_Q);
- vNanMask = _simd_or_ps(vNanMask, vNan01);
- simdscalar vNan23 = _simd_cmp_ps(prim[e].v[2], prim[e].v[3], _CMP_UNORD_Q);
- vNanMask = _simd_or_ps(vNanMask, vNan23);
+ ::ComputeClipCodes<SIMD_T>(state, vertex[i], clipCodes[i], viewportIndexes);
}
-
- return _simd_movemask_ps(vNanMask);
}
-#if USE_SIMD16_FRONTEND
- INLINE int ComputeNaNMask(simd16vector prim[])
+ typename SIMD_T::Float ComputeClipCodeIntersection()
{
- simd16scalar vNanMask = _simd16_setzero_ps();
- for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
+ typename SIMD_T::Float result = clipCodes[0];
+
+ for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
{
- simd16scalar vNan01 = _simd16_cmp_ps(prim[e].v[0], prim[e].v[1], _CMP_UNORD_Q);
- vNanMask = _simd16_or_ps(vNanMask, vNan01);
- simd16scalar vNan23 = _simd16_cmp_ps(prim[e].v[2], prim[e].v[3], _CMP_UNORD_Q);
- vNanMask = _simd16_or_ps(vNanMask, vNan23);
+ result = SIMD_T::and_ps(result, clipCodes[i]);
}
- return _simd16_movemask_ps(vNanMask);
+ return result;
}
-#endif
- INLINE int ComputeUserClipCullMask(PA_STATE& pa, simdvector prim[])
+ typename SIMD_T::Float ComputeClipCodeUnion()
{
- uint8_t cullMask = this->state.rastState.cullDistanceMask;
- simdscalar vClipCullMask = _simd_setzero_ps();
- DWORD index;
-
- simdvector vClipCullDistLo[3];
- simdvector vClipCullDistHi[3];
+ typename SIMD_T::Float result = clipCodes[0];
- pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo);
- pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi);
- while (_BitScanForward(&index, cullMask))
+ for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
{
- cullMask &= ~(1 << index);
- uint32_t slot = index >> 2;
- uint32_t component = index & 0x3;
+ result = SIMD_T::or_ps(result, clipCodes[i]);
+ }
- simdscalar vCullMaskElem = _simd_set1_ps(-1.0f);
- for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
- {
- simdscalar vCullComp;
- if (slot == 0)
- {
- vCullComp = vClipCullDistLo[e][component];
- }
- else
- {
- vCullComp = vClipCullDistHi[e][component];
- }
+ return result;
+ }
- // cull if cull distance < 0 || NAN
- simdscalar vCull = _simd_cmp_ps(_mm256_setzero_ps(), vCullComp, _CMP_NLE_UQ);
- vCullMaskElem = _simd_and_ps(vCullMaskElem, vCull);
- }
- vClipCullMask = _simd_or_ps(vClipCullMask, vCullMaskElem);
- }
+ int ComputeClipMask()
+ {
+ typename SIMD_T::Float clipUnion = ComputeClipCodeUnion();
- // clipper should also discard any primitive with NAN clip distance
- uint8_t clipMask = this->state.rastState.clipDistanceMask;
- while (_BitScanForward(&index, clipMask))
- {
- clipMask &= ~(1 << index);
- uint32_t slot = index >> 2;
- uint32_t component = index & 0x3;
+ clipUnion = SIMD_T::and_ps(clipUnion, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK)));
- for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
- {
- simdscalar vClipComp;
- if (slot == 0)
- {
- vClipComp = vClipCullDistLo[e][component];
- }
- else
- {
- vClipComp = vClipCullDistHi[e][component];
- }
+ return SIMD_T::movemask_ps(SIMD_T::cmpneq_ps(clipUnion, SIMD_T::setzero_ps()));
+ }
- simdscalar vClip = _simd_cmp_ps(vClipComp, vClipComp, _CMP_UNORD_Q);
- vClipCullMask = _simd_or_ps(vClipCullMask, vClip);
- }
+ // clipper is responsible for culling any prims with NAN coordinates
+ int ComputeNaNMask(typename SIMD_T::Vec4 prim[])
+ {
+ typename SIMD_T::Float vNanMask = SIMD_T::setzero_ps();
+
+ for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
+ {
+ typename SIMD_T::Float vNan01 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[0], prim[e].v[1]);
+ vNanMask = SIMD_T::or_ps(vNanMask, vNan01);
+
+ typename SIMD_T::Float vNan23 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[2], prim[e].v[3]);
+ vNanMask = SIMD_T::or_ps(vNanMask, vNan23);
}
- return _simd_movemask_ps(vClipCullMask);
+ return SIMD_T::movemask_ps(vNanMask);
}
-#if USE_SIMD16_FRONTEND
- INLINE int ComputeUserClipCullMask(PA_STATE& pa, simd16vector prim[])
+ int ComputeUserClipCullMask(PA_STATE &pa, typename SIMD_T::Vec4 prim[])
{
- uint8_t cullMask = this->state.rastState.cullDistanceMask;
- simd16scalar vClipCullMask = _simd16_setzero_ps();
+ uint8_t cullMask = state.rastState.cullDistanceMask;
+ typename SIMD_T::Float vClipCullMask = SIMD_T::setzero_ps();
- simd16vector vClipCullDistLo[3];
- simd16vector vClipCullDistHi[3];
+ typename SIMD_T::Vec4 vClipCullDistLo[3];
+ typename SIMD_T::Vec4 vClipCullDistHi[3];
pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo);
pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi);
uint32_t slot = index >> 2;
uint32_t component = index & 0x3;
- simd16scalar vCullMaskElem = _simd16_set1_ps(-1.0f);
+ typename SIMD_T::Float vCullMaskElem = SIMD_T::set1_ps(-1.0f);
for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
{
- simd16scalar vCullComp;
+ typename SIMD_T::Float vCullComp;
if (slot == 0)
{
vCullComp = vClipCullDistLo[e][component];
}
// cull if cull distance < 0 || NAN
- simd16scalar vCull = _simd16_cmp_ps(_simd16_setzero_ps(), vCullComp, _CMP_NLE_UQ);
- vCullMaskElem = _simd16_and_ps(vCullMaskElem, vCull);
+ typename SIMD_T::Float vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(SIMD_T::setzero_ps(), vCullComp);
+ vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull);
}
- vClipCullMask = _simd16_or_ps(vClipCullMask, vCullMaskElem);
+ vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem);
}
// clipper should also discard any primitive with NAN clip distance
- uint8_t clipMask = this->state.rastState.clipDistanceMask;
+ uint8_t clipMask = state.rastState.clipDistanceMask;
while (_BitScanForward(&index, clipMask))
{
clipMask &= ~(1 << index);
for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
{
- simd16scalar vClipComp;
+ typename SIMD_T::Float vClipComp;
if (slot == 0)
{
vClipComp = vClipCullDistLo[e][component];
vClipComp = vClipCullDistHi[e][component];
}
- simd16scalar vClip = _simd16_cmp_ps(vClipComp, vClipComp, _CMP_UNORD_Q);
- vClipCullMask = _simd16_or_ps(vClipCullMask, vClip);
+ typename SIMD_T::Float vClip = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(vClipComp, vClipComp);
+ vClipCullMask = SIMD_T::or_ps(vClipCullMask, vClip);
}
}
- return _simd16_movemask_ps(vClipCullMask);
+ return SIMD_T::movemask_ps(vClipCullMask);
}
-#endif
- // clip SIMD primitives
- INLINE void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId)
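+ // clip SIMD primitives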
+ void ClipSimd(const typename SIMD_T::Float &vPrimMask, const typename SIMD_T::Float &vClipMask, PA_STATE &pa, const typename SIMD_T::Integer &vPrimId)
{
// input/output vertex store for clipper
- simdvertex vertices[7]; // maximum 7 verts generated per triangle
+ SIMDVERTEX_T<SIMD_T> vertices[7]; // maximum 7 verts generated per triangle
- uint32_t constantInterpMask = this->state.backendState.constantInterpolationMask;
+ uint32_t constantInterpMask = state.backendState.constantInterpolationMask;
uint32_t provokingVertex = 0;
- if(pa.binTopology == TOP_TRIANGLE_FAN)
+ if (pa.binTopology == TOP_TRIANGLE_FAN)
{
- provokingVertex = this->state.frontendState.provokingVertex.triFan;
+ provokingVertex = state.frontendState.provokingVertex.triFan;
}
///@todo: line topology for wireframe?
// assemble pos
- simdvector tmpVector[NumVertsPerPrim];
+ typename SIMD_T::Vec4 tmpVector[NumVertsPerPrim];
pa.Assemble(VERTEX_POSITION_SLOT, tmpVector);
for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
{
}
// assemble attribs
- const SWR_BACKEND_STATE& backendState = this->state.backendState;
+ const SWR_BACKEND_STATE& backendState = state.backendState;
int32_t maxSlot = -1;
for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot)
}
// assemble user clip distances if enabled
- if (this->state.rastState.clipDistanceMask & 0xf)
+ if (state.rastState.clipDistanceMask & 0xf)
{
pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector);
for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
}
}
- if (this->state.rastState.clipDistanceMask & 0xf0)
+ if (state.rastState.clipDistanceMask & 0xf0)
{
pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector);
for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
uint32_t numAttribs = maxSlot + 1;
- simdscalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
+ typename SIMD_T::Integer vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
+
+ BinnerChooser<SIMD_T> binner(NumVertsPerPrim, pa.pDC->pState->state.rastState.conservativeRast);
// set up new PA for binning clipped primitives
- PFN_PROCESS_PRIMS pfnBinFunc = nullptr;
PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
if (NumVertsPerPrim == 3)
{
- pfnBinFunc = GetBinTrianglesFunc((pa.pDC->pState->state.rastState.conservativeRast > 0));
clipTopology = TOP_TRIANGLE_FAN;
// so that the binner knows to bloat wide points later
if (pa.binTopology == TOP_POINT_LIST)
+ {
clipTopology = TOP_POINT_LIST;
-
+ }
}
else if (NumVertsPerPrim == 2)
{
- pfnBinFunc = BinLines;
clipTopology = TOP_LINE_LIST;
}
else
{
SWR_ASSERT(0 && "Unexpected points in clipper.");
}
-
- uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts;
- uint32_t* pPrimitiveId = (uint32_t*)&vPrimId;
-
- const simdscalari vOffsets = _mm256_set_epi32(
- 0 * sizeof(simdvertex), // unused lane
- 6 * sizeof(simdvertex),
- 5 * sizeof(simdvertex),
- 4 * sizeof(simdvertex),
- 3 * sizeof(simdvertex),
- 2 * sizeof(simdvertex),
- 1 * sizeof(simdvertex),
- 0 * sizeof(simdvertex));
+
+ const uint32_t *pVertexCount = reinterpret_cast<const uint32_t *>(&vNumClippedVerts);
+ const uint32_t *pPrimitiveId = reinterpret_cast<const uint32_t *>(&vPrimId);
+
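+ // Byte offsets to each of the 7 possible clipper output vertices; the masked gathers below use them to collect one input lane's vertices into SIMD order.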
+ const SIMD256::Integer vOffsets = SIMD256::set_epi32(
+ 0 * sizeof(SIMDVERTEX_T<SIMD_T>), // unused lane
+ 6 * sizeof(SIMDVERTEX_T<SIMD_T>),
+ 5 * sizeof(SIMDVERTEX_T<SIMD_T>),
+ 4 * sizeof(SIMDVERTEX_T<SIMD_T>),
+ 3 * sizeof(SIMDVERTEX_T<SIMD_T>),
+ 2 * sizeof(SIMDVERTEX_T<SIMD_T>),
+ 1 * sizeof(SIMDVERTEX_T<SIMD_T>),
+ 0 * sizeof(SIMDVERTEX_T<SIMD_T>));
// only need to gather 7 verts
// @todo dynamic mask based on actual # of verts generated per lane
- const simdscalar vMask = _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1);
+ const SIMD256::Float vMask = SIMD256::set_ps(0, -1, -1, -1, -1, -1, -1, -1);
uint32_t numClippedPrims = 0;
-#if USE_SIMD16_FRONTEND
- const uint32_t numPrims = pa.NumPrims();
- const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
- SWR_ASSERT(numPrims <= numPrims_lo);
+ // transpose clipper output so that each lane's vertices are in SIMD order
+ // set aside space for 2 vertices, as the PA will try to read up to 16 verts
+ // for triangle fan
+
+#if defined(_DEBUG)
+ // TODO: need to increase stack size, allocating SIMD16-widened transposedPrims causes stack overflow in debug builds
+ SIMDVERTEX_T<SIMD_T> *transposedPrims = reinterpret_cast<SIMDVERTEX_T<SIMD_T> *>(malloc(sizeof(SIMDVERTEX_T<SIMD_T>) * 2));
- for (uint32_t inputPrim = 0; inputPrim < numPrims_lo; ++inputPrim)
#else
- for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim)
+ SIMDVERTEX_T<SIMD_T> transposedPrims[2];
+
#endif
+ for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim)
{
uint32_t numEmittedVerts = pVertexCount[inputPrim];
if (numEmittedVerts < NumVertsPerPrim)
SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");
uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
+ SWR_ASSERT(numEmittedPrims <= 7, "Unexpected primitive count from clipper.");
+
numClippedPrims += numEmittedPrims;
// transpose clipper output so that each lane's vertices are in SIMD order
// set aside space for 2 vertices, as the PA will try to read up to 16 verts
// for triangle fan
-#if USE_SIMD16_FRONTEND
- simd16vertex transposedPrims[2];
-#else
- simdvertex transposedPrims[2];
-#endif
// transpose pos
- uint8_t* pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;
+ uint8_t *pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;
-#if USE_SIMD16_FRONTEND
+#if 0
// TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug
static const float *dummy = reinterpret_cast<const float *>(pBase);
-#endif
+#endif
for (uint32_t c = 0; c < 4; ++c)
{
-#if USE_SIMD16_FRONTEND
- simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
- transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
-#else
- transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
-#endif
- pBase += sizeof(simdscalar);
+ SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
+ transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
+ pBase += sizeof(typename SIMD_T::Float);
}
// transpose attribs
- pBase = (uint8_t*)(&vertices[0].attrib[backendState.vertexAttribOffset]) + sizeof(float) * inputPrim;
+ pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[backendState.vertexAttribOffset]) + sizeof(float) * inputPrim;
+
for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
{
uint32_t attribSlot = backendState.vertexAttribOffset + attrib;
+
for (uint32_t c = 0; c < 4; ++c)
{
-#if USE_SIMD16_FRONTEND
- simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
- transposedPrims[0].attrib[attribSlot][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
-#else
- transposedPrims[0].attrib[attribSlot][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
-#endif
- pBase += sizeof(simdscalar);
+ SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
+ transposedPrims[0].attrib[attribSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
+ pBase += sizeof(typename SIMD_T::Float);
}
}
// transpose user clip distances if enabled
- if (this->state.rastState.clipDistanceMask & 0xf)
+ if (state.rastState.clipDistanceMask & 0x0f)
{
- pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * inputPrim;
+ pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * inputPrim;
+
for (uint32_t c = 0; c < 4; ++c)
{
-#if USE_SIMD16_FRONTEND
- simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
- transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
-#else
- transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
-#endif
- pBase += sizeof(simdscalar);
+ SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
+ transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
+ pBase += sizeof(typename SIMD_T::Float);
}
}
- if (this->state.rastState.clipDistanceMask & 0xf0)
+ if (state.rastState.clipDistanceMask & 0xf0)
{
- pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * inputPrim;
+ pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * inputPrim;
+
for (uint32_t c = 0; c < 4; ++c)
{
-#if USE_SIMD16_FRONTEND
- simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
- transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
-#else
- transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
-#endif
- pBase += sizeof(simdscalar);
+ SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
+ transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
+ pBase += sizeof(typename SIMD_T::Float);
}
}
- PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, SWR_VTX_NUM_SLOTS, true, clipTopology);
+ PA_STATE_OPT clipPA(pDC, numEmittedPrims, reinterpret_cast<uint8_t *>(&transposedPrims[0]), numEmittedVerts, SWR_VTX_NUM_SLOTS, true, clipTopology);
- while (clipPa.GetNextStreamOutput())
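+ // Lane mask with one bit per emitted primitive (the clipper emits at most 7).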
+ static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f };
+
+ const uint32_t primMask = primMaskMap[numEmittedPrims];
+
+ const typename SIMD_T::Integer primID = SIMD_T::set1_epi32(pPrimitiveId[inputPrim]);
+
+ while (clipPA.GetNextStreamOutput())
{
do
{
-#if USE_SIMD16_FRONTEND
- simd16vector attrib_simd16[NumVertsPerPrim];
- bool assemble = clipPa.Assemble(VERTEX_POSITION_SLOT, attrib_simd16);
+ typename SIMD_T::Vec4 attrib[NumVertsPerPrim];
+
+ bool assemble = clipPA.Assemble(VERTEX_POSITION_SLOT, attrib);
if (assemble)
{
- static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff };
-
- simdvector attrib[NumVertsPerPrim];
- for (uint32_t i = 0; i < NumVertsPerPrim; i += 1)
- {
- for (uint32_t j = 0; j < 4; j += 1)
- {
- attrib[i][j] = _simd16_extract_ps(attrib_simd16[i][j], 0);
- }
- }
-
- clipPa.useAlternateOffset = false;
- pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]));
- }
-#else
- simdvector attrib[NumVertsPerPrim];
- bool assemble = clipPa.Assemble(VERTEX_POSITION_SLOT, attrib);
- if (assemble)
- {
- static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff };
- pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]));
+ binner.pfnBinFunc(pDC, clipPA, workerId, attrib, primMask, primID);
}
-#endif
- } while (clipPa.NextPrim());
+
+ } while (clipPA.NextPrim());
}
}
+#if defined(_DEBUG)
+ free(transposedPrims);
+
+#endif
// update global pipeline stat
UPDATE_STAT_FE(CPrimitives, numClippedPrims);
}
-
-#if USE_SIMD16_FRONTEND
- void ClipSimd(const simd16scalar& vPrimMask, const simd16scalar& vClipMask, PA_STATE& pa, const simd16scalari& vPrimId)
+
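+ // execute the clipper stage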
+ void ExecuteStage(PA_STATE &pa, typename SIMD_T::Vec4 prim[], uint32_t primMask, typename SIMD_T::Integer const &primId)
{
- // input/output vertex store for clipper
- simd16vertex vertices[7]; // maximum 7 verts generated per triangle
+ SWR_ASSERT(pa.pDC != nullptr);
- uint32_t constantInterpMask = this->state.backendState.constantInterpolationMask;
- uint32_t provokingVertex = 0;
- if (pa.binTopology == TOP_TRIANGLE_FAN)
- {
- provokingVertex = this->state.frontendState.provokingVertex.triFan;
- }
- ///@todo: line topology for wireframe?
-
- // assemble pos
- simd16vector tmpVector[NumVertsPerPrim];
- pa.Assemble(VERTEX_POSITION_SLOT, tmpVector);
- for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
- {
- vertices[i].attrib[VERTEX_POSITION_SLOT] = tmpVector[i];
- }
-
- // assemble attribs
- const SWR_BACKEND_STATE& backendState = this->state.backendState;
-
- int32_t maxSlot = -1;
- for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot)
- {
- // Compute absolute attrib slot in vertex array
- uint32_t mapSlot = backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
- maxSlot = std::max<int32_t>(maxSlot, mapSlot);
- uint32_t inputSlot = backendState.vertexAttribOffset + mapSlot;
-
- pa.Assemble(inputSlot, tmpVector);
-
- // if constant interpolation enabled for this attribute, assign the provoking
- // vertex values to all edges
- if (CheckBit(constantInterpMask, slot))
- {
- for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
- {
- vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
- }
- }
- else
- {
- for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
- {
- vertices[i].attrib[inputSlot] = tmpVector[i];
- }
- }
- }
-
- // assemble user clip distances if enabled
- if (this->state.rastState.clipDistanceMask & 0xf)
- {
- pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector);
- for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
- {
- vertices[i].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT] = tmpVector[i];
- }
- }
-
- if (this->state.rastState.clipDistanceMask & 0xf0)
- {
- pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector);
- for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
- {
- vertices[i].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT] = tmpVector[i];
- }
- }
-
- uint32_t numAttribs = maxSlot + 1;
-
- simd16scalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
-
- // set up new PA for binning clipped primitives
- PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc = nullptr;
- PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
- if (NumVertsPerPrim == 3)
- {
- pfnBinFunc = GetBinTrianglesFunc_simd16((pa.pDC->pState->state.rastState.conservativeRast > 0));
- clipTopology = TOP_TRIANGLE_FAN;
-
- // so that the binner knows to bloat wide points later
- if (pa.binTopology == TOP_POINT_LIST)
- clipTopology = TOP_POINT_LIST;
-
- }
- else if (NumVertsPerPrim == 2)
- {
- pfnBinFunc = BinLines_simd16;
- clipTopology = TOP_LINE_LIST;
- }
- else
- {
- SWR_ASSERT(0 && "Unexpected points in clipper.");
- }
-
- uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts;
- uint32_t* pPrimitiveId = (uint32_t*)&vPrimId;
-
- const simdscalari vOffsets = _simd_set_epi32(
- 0 * sizeof(simd16vertex), // unused lane
- 6 * sizeof(simd16vertex),
- 5 * sizeof(simd16vertex),
- 4 * sizeof(simd16vertex),
- 3 * sizeof(simd16vertex),
- 2 * sizeof(simd16vertex),
- 1 * sizeof(simd16vertex),
- 0 * sizeof(simd16vertex));
-
- // only need to gather 7 verts
- // @todo dynamic mask based on actual # of verts generated per lane
- const simdscalar vMask = _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1);
-
- uint32_t numClippedPrims = 0;
-
- // tranpose clipper output so that each lane's vertices are in SIMD order
- // set aside space for 2 vertices, as the PA will try to read up to 16 verts
- // for triangle fan
-
-#if defined(_DEBUG)
- // TODO: need to increase stack size, allocating SIMD16-widened transposedPrims causes stack overflow in debug builds
- simd16vertex *transposedPrims = reinterpret_cast<simd16vertex *>(malloc(sizeof(simd16vertex) * 2));
-
-#else
- simd16vertex transposedPrims[2];
-
-#endif
- for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim)
- {
- uint32_t numEmittedVerts = pVertexCount[inputPrim];
- if (numEmittedVerts < NumVertsPerPrim)
- {
- continue;
- }
- SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper.");
-
- uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts);
- numClippedPrims += numEmittedPrims;
-
- // tranpose clipper output so that each lane's vertices are in SIMD order
- // set aside space for 2 vertices, as the PA will try to read up to 16 verts
- // for triangle fan
-
- // transpose pos
- uint8_t* pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;
-
-#if 0
- // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug
- static const float *dummy = reinterpret_cast<const float *>(pBase);
-#endif
-
- for (uint32_t c = 0; c < 4; ++c)
- {
- simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
- transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
- pBase += sizeof(simd16scalar);
- }
-
- // transpose attribs
- pBase = (uint8_t*)(&vertices[0].attrib[backendState.vertexAttribOffset]) + sizeof(float) * inputPrim;
- for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
- {
- uint32_t attribSlot = backendState.vertexAttribOffset + attrib;
- for (uint32_t c = 0; c < 4; ++c)
- {
- simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
- transposedPrims[0].attrib[attribSlot][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
- pBase += sizeof(simd16scalar);
- }
- }
-
- // transpose user clip distances if enabled
- if (this->state.rastState.clipDistanceMask & 0xf)
- {
- pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * inputPrim;
- for (uint32_t c = 0; c < 4; ++c)
- {
- simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
- transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
- pBase += sizeof(simd16scalar);
- }
- }
-
- if (this->state.rastState.clipDistanceMask & 0xf0)
- {
- pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * inputPrim;
- for (uint32_t c = 0; c < 4; ++c)
- {
- simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1);
- transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
- pBase += sizeof(simd16scalar);
- }
- }
-
- PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, SWR_VTX_NUM_SLOTS, true, clipTopology);
-
- while (clipPa.GetNextStreamOutput())
- {
- do
- {
- simd16vector attrib[NumVertsPerPrim];
- bool assemble = clipPa.Assemble(VERTEX_POSITION_SLOT, attrib);
-
- if (assemble)
- {
- static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff, 0x7ff, 0xfff, 0x1fff, 0x3fff, 0x7fff, 0xffff };
-
- clipPa.useAlternateOffset = false;
- pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd16_set1_epi32(pPrimitiveId[inputPrim]));
- }
-
- } while (clipPa.NextPrim());
- }
- }
-
-#if defined(_DEBUG)
- free(transposedPrims);
-
-#endif
- // update global pipeline stat
- UPDATE_STAT_FE(CPrimitives, numClippedPrims);
- }
-
-#endif
- // execute the clipper stage
- void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari const &primId)
- {
- SWR_ASSERT(this->pDC != nullptr);
- SWR_CONTEXT* pContext = this->pDC->pContext;
- const API_STATE& apiState = this->pDC->pState->state;
+ SWR_CONTEXT *pContext = pa.pDC->pContext;
- // set up binner based on PA state
- PFN_PROCESS_PRIMS pfnBinner;
- switch (pa.binTopology)
- {
- case TOP_POINT_LIST:
- pfnBinner = BinPoints;
- break;
- case TOP_LINE_LIST:
- case TOP_LINE_STRIP:
- case TOP_LINE_LOOP:
- case TOP_LINE_LIST_ADJ:
- case TOP_LISTSTRIP_ADJ:
- pfnBinner = BinLines;
- break;
- default:
- pfnBinner = GetBinTrianglesFunc((apiState.rastState.conservativeRast > 0));
- break;
- };
+ BinnerChooser<SIMD_T> binner(pa.binTopology, pa.pDC->pState->state.rastState.conservativeRast);
// update clipper invocations pipeline stat
uint32_t numInvoc = _mm_popcnt_u32(primMask);
UPDATE_STAT_FE(CInvocations, numInvoc);
-
+
// Read back viewport index if required
- simdscalari viewportIdx = _simd_set1_epi32(0);
+ typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0);
+
if (state.backendState.readViewportArrayIndex)
{
- simdvector vpiAttrib[NumVertsPerPrim];
+ typename SIMD_T::Vec4 vpiAttrib[NumVertsPerPrim];
pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
- simdscalari vpai = _simd_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
// OOB indices => forced to zero.
- simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
- simdscalari vClearMask = _simd_cmplt_epi32(vpai, vNumViewports);
- viewportIdx = _simd_and_si(vClearMask, vpai);
+ typename SIMD_T::Integer vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+ typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+ typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports);
+ viewportIdx = SIMD_T::and_si(vClearMask, vpai);
}
ComputeClipCodes(prim, viewportIdx);
}
// cull prims outside view frustum
- simdscalar clipIntersection = ComputeClipCodeIntersection();
- int validMask = primMask & _simd_movemask_ps(_simd_cmpeq_ps(clipIntersection, _simd_setzero_ps()));
+ typename SIMD_T::Float clipIntersection = ComputeClipCodeIntersection();
+ int validMask = primMask & SimdHelper<SIMD_T>::cmpeq_ps_mask(clipIntersection, SIMD_T::setzero_ps());
// skip clipping for points
uint32_t clipMask = 0;
AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
// we have to clip tris, execute the clipper, which will also
// call the binner
- ClipSimd(_simd_vmask_ps(primMask), _simd_vmask_ps(clipMask), pa, primId);
+ ClipSimd(SIMD_T::vmask_ps(primMask), SIMD_T::vmask_ps(clipMask), pa, primId);
AR_END(FEGuardbandClip, 1);
}
else if (validMask)
UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
// forward valid prims directly to binner
- pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId);
+ binner.pfnBinFunc(this->pDC, pa, this->workerId, prim, validMask, primId);
}
}
-#if USE_SIMD16_FRONTEND
- void ExecuteStage(PA_STATE& pa, simd16vector prim[], uint32_t primMask, simd16scalari const &primId)
- {
- SWR_ASSERT(pa.pDC != nullptr);
- SWR_CONTEXT* pContext = pa.pDC->pContext;
-
- // set up binner based on PA state
- PFN_PROCESS_PRIMS_SIMD16 pfnBinner;
- switch (pa.binTopology)
- {
- case TOP_POINT_LIST:
- pfnBinner = BinPoints_simd16;
- break;
- case TOP_LINE_LIST:
- case TOP_LINE_STRIP:
- case TOP_LINE_LOOP:
- case TOP_LINE_LIST_ADJ:
- case TOP_LISTSTRIP_ADJ:
- pfnBinner = BinLines_simd16;
- break;
- default:
- pfnBinner = GetBinTrianglesFunc_simd16((pa.pDC->pState->state.rastState.conservativeRast > 0));
- break;
- };
-
- // update clipper invocations pipeline stat
- uint32_t numInvoc = _mm_popcnt_u32(primMask);
- UPDATE_STAT_FE(CInvocations, numInvoc);
-
- // Read back viewport index if required
- simd16scalari viewportIdx = _simd16_set1_epi32(0);
- if (state.backendState.readViewportArrayIndex)
- {
- simd16vector vpiAttrib[NumVertsPerPrim];
- pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
-
- // OOB indices => forced to zero.
- simd16scalari vpai = _simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
- simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
- simd16scalari vClearMask = _simd16_cmplt_epi32(vpai, vNumViewports);
- viewportIdx = _simd16_and_si(vClearMask, vpai);
- }
- ComputeClipCodes(prim, viewportIdx);
-
- // cull prims with NAN coords
- primMask &= ~ComputeNaNMask(prim);
-
- // user cull distance cull
- if (this->state.rastState.cullDistanceMask)
- {
- primMask &= ~ComputeUserClipCullMask(pa, prim);
- }
-
- // cull prims outside view frustum
- simd16scalar clipIntersection = ComputeClipCodeIntersection_simd16();
- int validMask = primMask & _simd16_cmpeq_ps_mask(clipIntersection, _simd16_setzero_ps());
-
- // skip clipping for points
- uint32_t clipMask = 0;
- if (NumVertsPerPrim != 1)
- {
- clipMask = primMask & ComputeClipMask_simd16();
- }
-
- if (clipMask)
- {
- AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
- // we have to clip tris, execute the clipper, which will also
- // call the binner
- ClipSimd(_simd16_vmask_ps(primMask), _simd16_vmask_ps(clipMask), pa, primId);
- AR_END(FEGuardbandClip, 1);
- }
- else if (validMask)
- {
- // update CPrimitives pipeline state
- UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
-
- // forward valid prims directly to binner
- pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId);
- }
- }
-
-#endif
private:
- inline simdscalar ComputeInterpFactor(simdscalar const &boundaryCoord0, simdscalar const &boundaryCoord1)
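+ // interpolation factor t = d0 / (d0 - d1) along the edge between the two boundary distances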
+ typename SIMD_T::Float ComputeInterpFactor(typename SIMD_T::Float const &boundaryCoord0, typename SIMD_T::Float const &boundaryCoord1)
{
- return _simd_div_ps(boundaryCoord0, _simd_sub_ps(boundaryCoord0, boundaryCoord1));
+ return SIMD_T::div_ps(boundaryCoord0, SIMD_T::sub_ps(boundaryCoord0, boundaryCoord1));
}
-#if USE_SIMD16_FRONTEND
- inline simd16scalar ComputeInterpFactor(simd16scalar const &boundaryCoord0, simd16scalar const &boundaryCoord1)
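+ // computes per-lane byte offsets into the vertex array for a given attribute and component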
+ typename SIMD_T::Integer ComputeOffsets(uint32_t attrib, typename SIMD_T::Integer const &vIndices, uint32_t component)
{
- return _simd16_div_ps(boundaryCoord0, _simd16_sub_ps(boundaryCoord0, boundaryCoord1));
- }
-
-#endif
- inline simdscalari ComputeOffsets(uint32_t attrib, simdscalari const &vIndices, uint32_t component)
- {
- const uint32_t simdVertexStride = sizeof(simdvertex);
- const uint32_t componentStride = sizeof(simdscalar);
- const uint32_t attribStride = sizeof(simdvector);
- const __m256i vElemOffset = _mm256_set_epi32(7 * sizeof(float), 6 * sizeof(float), 5 * sizeof(float), 4 * sizeof(float),
- 3 * sizeof(float), 2 * sizeof(float), 1 * sizeof(float), 0 * sizeof(float));
-
- // step to the simdvertex
- simdscalari vOffsets = _simd_mullo_epi32(vIndices, _simd_set1_epi32(simdVertexStride));
-
- // step to the attribute and component
- vOffsets = _simd_add_epi32(vOffsets, _simd_set1_epi32(attribStride * attrib + componentStride * component));
+ const uint32_t simdVertexStride = sizeof(SIMDVERTEX_T<SIMD_T>);
+ const uint32_t componentStride = sizeof(typename SIMD_T::Float);
+ const uint32_t attribStride = sizeof(typename SIMD_T::Vec4);
+
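+ // Per-lane byte offsets of the float elements within one SIMD register; only the first SIMD-width entries are loaded below.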
+ static const OSALIGNSIMD16(uint32_t) elemOffset[16] =
+ {
+ 0 * sizeof(float),
+ 1 * sizeof(float),
+ 2 * sizeof(float),
+ 3 * sizeof(float),
+ 4 * sizeof(float),
+ 5 * sizeof(float),
+ 6 * sizeof(float),
+ 7 * sizeof(float),
+ 8 * sizeof(float),
+ 9 * sizeof(float),
+ 10 * sizeof(float),
+ 11 * sizeof(float),
+ 12 * sizeof(float),
+ 13 * sizeof(float),
+ 14 * sizeof(float),
+ 15 * sizeof(float),
+ };
- // step to the lane
- vOffsets = _simd_add_epi32(vOffsets, vElemOffset);
+ static_assert(sizeof(typename SIMD_T::Integer) <= sizeof(elemOffset), "Clipper::ComputeOffsets, Increase number of element offsets.");
- return vOffsets;
- }
-
-#if USE_SIMD16_FRONTEND
- inline simd16scalari ComputeOffsets(uint32_t attrib, simd16scalari const &vIndices, uint32_t component)
- {
- const uint32_t simdVertexStride = sizeof(simd16vertex);
- const uint32_t componentStride = sizeof(simd16scalar);
- const uint32_t attribStride = sizeof(simd16vector);
- const simd16scalari vElemOffset = _simd16_set_epi32(
- 15 * sizeof(float), 14 * sizeof(float), 13 * sizeof(float), 12 * sizeof(float),
- 11 * sizeof(float), 10 * sizeof(float), 9 * sizeof(float), 8 * sizeof(float),
- 7 * sizeof(float), 6 * sizeof(float), 5 * sizeof(float), 4 * sizeof(float),
- 3 * sizeof(float), 2 * sizeof(float), 1 * sizeof(float), 0 * sizeof(float));
+ typename SIMD_T::Integer vElemOffset = SIMD_T::loadu_si(reinterpret_cast<const typename SIMD_T::Integer *>(elemOffset));
// step to the simdvertex
- simd16scalari vOffsets = _simd16_mullo_epi32(vIndices, _simd16_set1_epi32(simdVertexStride));
+ typename SIMD_T::Integer vOffsets = SIMD_T::mullo_epi32(vIndices, SIMD_T::set1_epi32(simdVertexStride));
// step to the attribute and component
- vOffsets = _simd16_add_epi32(vOffsets, _simd16_set1_epi32(attribStride * attrib + componentStride * component));
+ vOffsets = SIMD_T::add_epi32(vOffsets, SIMD_T::set1_epi32(attribStride * attrib + componentStride * component));
// step to the lane
- vOffsets = _simd16_add_epi32(vOffsets, vElemOffset);
-
- return vOffsets;
- }
-
-#endif
- // gathers a single component for a given attribute for each SIMD lane
- inline simdscalar GatherComponent(const float* pBuffer, uint32_t attrib, simdscalar const &vMask, simdscalari const &vIndices, uint32_t component)
- {
- simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component);
- simdscalar vSrc = _mm256_undefined_ps();
- return _simd_mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask, 1);
- }
-
-#if USE_SIMD16_FRONTEND
- inline simd16scalar GatherComponent(const float* pBuffer, uint32_t attrib, simd16scalar const &vMask, simd16scalari const &vIndices, uint32_t component)
- {
- simd16scalari vOffsets = ComputeOffsets(attrib, vIndices, component);
- simd16scalar vSrc = _simd16_setzero_ps();
- return _simd16_mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask, 1);
+ vOffsets = SIMD_T::add_epi32(vOffsets, vElemOffset);
+
+ return vOffsets;
}
-#endif
- inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simdscalar const &vMask, simdscalari const &vIndices, uint32_t component, simdscalar const &vSrc)
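+ // gathers a single component for a given attribute for each SIMD lane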
+ typename SIMD_T::Float GatherComponent(const float* pBuffer, uint32_t attrib, typename SIMD_T::Float const &vMask, typename SIMD_T::Integer const &vIndices, uint32_t component)
{
- simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component);
+ typename SIMD_T::Integer vOffsets = ComputeOffsets(attrib, vIndices, component);
+ typename SIMD_T::Float vSrc = SIMD_T::setzero_ps();
- uint32_t* pOffsets = (uint32_t*)&vOffsets;
- float* pSrc = (float*)&vSrc;
- uint32_t mask = _simd_movemask_ps(vMask);
- DWORD lane;
- while (_BitScanForward(&lane, mask))
- {
- mask &= ~(1 << lane);
- uint8_t* pBuf = (uint8_t*)pBuffer + pOffsets[lane];
- *(float*)pBuf = pSrc[lane];
- }
+ return SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(vSrc, pBuffer, vOffsets, vMask);
}
-#if USE_SIMD16_FRONTEND
- inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simd16scalar const &vMask, simd16scalari const &vIndices, uint32_t component, simd16scalar const &vSrc)
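+ // scatters a single component for a given attribute to each active SIMD lane's output vertex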
+ void ScatterComponent(const float* pBuffer, uint32_t attrib, typename SIMD_T::Float const &vMask, typename SIMD_T::Integer const &vIndices, uint32_t component, typename SIMD_T::Float const &vSrc)
{
- simd16scalari vOffsets = ComputeOffsets(attrib, vIndices, component);
+ typename SIMD_T::Integer vOffsets = ComputeOffsets(attrib, vIndices, component);
- uint32_t* pOffsets = (uint32_t*)&vOffsets;
- float* pSrc = (float*)&vSrc;
- uint32_t mask = _simd16_movemask_ps(vMask);
+ const uint32_t *pOffsets = reinterpret_cast<const uint32_t *>(&vOffsets);
+ const float *pSrc = reinterpret_cast<const float *>(&vSrc);
+ uint32_t mask = SIMD_T::movemask_ps(vMask);
DWORD lane;
while (_BitScanForward(&lane, mask))
{
mask &= ~(1 << lane);
- uint8_t* pBuf = (uint8_t*)pBuffer + pOffsets[lane];
- *(float*)pBuf = pSrc[lane];
- }
- }
-
-#endif
- template<SWR_CLIPCODES ClippingPlane>
- inline void intersect(
- const simdscalar& vActiveMask, // active lanes to operate on
- const simdscalari& s, // index to first edge vertex v0 in pInPts.
- const simdscalari& p, // index to second edge vertex v1 in pInPts.
- const simdvector& v1, // vertex 0 position
- const simdvector& v2, // vertex 1 position
- simdscalari& outIndex, // output index.
- const float *pInVerts, // array of all the input positions.
- uint32_t numInAttribs, // number of attributes per vertex.
- float *pOutVerts) // array of output positions. We'll write our new intersection point at i*4.
- {
- uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
-
- // compute interpolation factor
- simdscalar t;
- switch (ClippingPlane)
- {
- case FRUSTUM_LEFT: t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[0]), _simd_add_ps(v2[3], v2[0])); break;
- case FRUSTUM_RIGHT: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[0]), _simd_sub_ps(v2[3], v2[0])); break;
- case FRUSTUM_TOP: t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[1]), _simd_add_ps(v2[3], v2[1])); break;
- case FRUSTUM_BOTTOM: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[1]), _simd_sub_ps(v2[3], v2[1])); break;
- case FRUSTUM_NEAR:
- // DX Znear plane is 0, GL is -w
- if (this->state.rastState.clipHalfZ)
- {
- t = ComputeInterpFactor(v1[2], v2[2]);
- }
- else
- {
- t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[2]), _simd_add_ps(v2[3], v2[2]));
- }
- break;
- case FRUSTUM_FAR: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[2]), _simd_sub_ps(v2[3], v2[2])); break;
- default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
- };
-
- // interpolate position and store
- for (uint32_t c = 0; c < 4; ++c)
- {
- simdscalar vOutPos = _simd_fmadd_ps(_simd_sub_ps(v2[c], v1[c]), t, v1[c]);
- ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
- }
-
- // interpolate attributes and store
- for (uint32_t a = 0; a < numInAttribs; ++a)
- {
- uint32_t attribSlot = vertexAttribOffset + a;
- for (uint32_t c = 0; c < 4; ++c)
- {
- simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
- simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
- simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
- ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
- }
- }
-
- // interpolate clip distance if enabled
- if (this->state.rastState.clipDistanceMask & 0xf)
- {
- uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
- for (uint32_t c = 0; c < 4; ++c)
- {
- simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
- simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
- simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
- ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
- }
- }
-
- if (this->state.rastState.clipDistanceMask & 0xf0)
- {
- uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
- for (uint32_t c = 0; c < 4; ++c)
- {
- simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
- simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
- simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
- ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
- }
+ const uint8_t *pBuf = reinterpret_cast<const uint8_t *>(pBuffer) + pOffsets[lane];
+ *(float *)pBuf = pSrc[lane];
}
}
-#if USE_SIMD16_FRONTEND
template<SWR_CLIPCODES ClippingPlane>
- inline void intersect(
- const simd16scalar& vActiveMask,// active lanes to operate on
- const simd16scalari& s, // index to first edge vertex v0 in pInPts.
- const simd16scalari& p, // index to second edge vertex v1 in pInPts.
- const simd16vector& v1, // vertex 0 position
- const simd16vector& v2, // vertex 1 position
- simd16scalari& outIndex, // output index.
- const float *pInVerts, // array of all the input positions.
- uint32_t numInAttribs, // number of attributes per vertex.
- float *pOutVerts) // array of output positions. We'll write our new intersection point at i*4.
+ void intersect(
+ const typename SIMD_T::Float &vActiveMask, // active lanes to operate on
+ const typename SIMD_T::Integer &s, // index to first edge vertex v0 in pInPts.
+ const typename SIMD_T::Integer &p, // index to second edge vertex v1 in pInPts.
+ const typename SIMD_T::Vec4 &v1, // vertex 0 position
+ const typename SIMD_T::Vec4 &v2, // vertex 1 position
+ typename SIMD_T::Integer &outIndex, // output index.
+ const float *pInVerts, // array of all the input positions.
+ uint32_t numInAttribs, // number of attributes per vertex.
+ float *pOutVerts) // array of output positions. We'll write our new intersection point at i*4.
{
uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
// compute interpolation factor
- simd16scalar t;
+ typename SIMD_T::Float t;
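+ // The operands handed to ComputeInterpFactor are the signed distances of the two edge
+ // endpoints to the clip boundary (e.g. w + x for FRUSTUM_LEFT, w - x for FRUSTUM_RIGHT);
+ // from those it derives the parametric factor t of the crossing point along the edge v1 -> v2.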
switch (ClippingPlane)
{
- case FRUSTUM_LEFT: t = ComputeInterpFactor(_simd16_add_ps(v1[3], v1[0]), _simd16_add_ps(v2[3], v2[0])); break;
- case FRUSTUM_RIGHT: t = ComputeInterpFactor(_simd16_sub_ps(v1[3], v1[0]), _simd16_sub_ps(v2[3], v2[0])); break;
- case FRUSTUM_TOP: t = ComputeInterpFactor(_simd16_add_ps(v1[3], v1[1]), _simd16_add_ps(v2[3], v2[1])); break;
- case FRUSTUM_BOTTOM: t = ComputeInterpFactor(_simd16_sub_ps(v1[3], v1[1]), _simd16_sub_ps(v2[3], v2[1])); break;
+ case FRUSTUM_LEFT: t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[0]), SIMD_T::add_ps(v2[3], v2[0])); break;
+ case FRUSTUM_RIGHT: t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[0]), SIMD_T::sub_ps(v2[3], v2[0])); break;
+ case FRUSTUM_TOP: t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[1]), SIMD_T::add_ps(v2[3], v2[1])); break;
+ case FRUSTUM_BOTTOM: t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[1]), SIMD_T::sub_ps(v2[3], v2[1])); break;
case FRUSTUM_NEAR:
// DX Znear plane is 0, GL is -w
if (this->state.rastState.clipHalfZ)
{
t = ComputeInterpFactor(v1[2], v2[2]);
}
else
{
- t = ComputeInterpFactor(_simd16_add_ps(v1[3], v1[2]), _simd16_add_ps(v2[3], v2[2]));
+ t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[2]), SIMD_T::add_ps(v2[3], v2[2]));
}
break;
- case FRUSTUM_FAR: t = ComputeInterpFactor(_simd16_sub_ps(v1[3], v1[2]), _simd16_sub_ps(v2[3], v2[2])); break;
+ case FRUSTUM_FAR: t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[2]), SIMD_T::sub_ps(v2[3], v2[2])); break;
default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
};
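+ // With t computed, every output below is the same lerp, out = a + t * (b - a), evaluated
+ // with fmadd and written only for lanes selected by vActiveMask via the masked scatter.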
// interpolate position and store
for (uint32_t c = 0; c < 4; ++c)
{
- simd16scalar vOutPos = _simd16_fmadd_ps(_simd16_sub_ps(v2[c], v1[c]), t, v1[c]);
+ typename SIMD_T::Float vOutPos = SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2[c], v1[c]), t, v1[c]);
ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
}
// interpolate attributes and store
for (uint32_t a = 0; a < numInAttribs; ++a)
{
uint32_t attribSlot = vertexAttribOffset + a;
for (uint32_t c = 0; c < 4; ++c)
{
- simd16scalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
- simd16scalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
- simd16scalar vOutAttrib = _simd16_fmadd_ps(_simd16_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
+ typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
+ typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
+ typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
}
}
// interpolate clip distance if enabled
if (this->state.rastState.clipDistanceMask & 0xf)
{
uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
for (uint32_t c = 0; c < 4; ++c)
{
- simd16scalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
- simd16scalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
- simd16scalar vOutAttrib = _simd16_fmadd_ps(_simd16_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
+ typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
+ typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
+ typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
}
}
if (this->state.rastState.clipDistanceMask & 0xf0)
{
uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
for (uint32_t c = 0; c < 4; ++c)
{
- simd16scalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
- simd16scalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
- simd16scalar vOutAttrib = _simd16_fmadd_ps(_simd16_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
+ typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
+ typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
+ typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
}
}
}
-#endif
- template<SWR_CLIPCODES ClippingPlane>
- inline simdscalar inside(const simdvector& v)
- {
- switch (ClippingPlane)
- {
- case FRUSTUM_LEFT: return _simd_cmpge_ps(v[0], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
- case FRUSTUM_RIGHT: return _simd_cmple_ps(v[0], v[3]);
- case FRUSTUM_TOP: return _simd_cmpge_ps(v[1], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
- case FRUSTUM_BOTTOM: return _simd_cmple_ps(v[1], v[3]);
- case FRUSTUM_NEAR: return _simd_cmpge_ps(v[2], this->state.rastState.clipHalfZ ? _simd_setzero_ps() : _simd_mul_ps(v[3], _simd_set1_ps(-1.0f)));
- case FRUSTUM_FAR: return _simd_cmple_ps(v[2], v[3]);
- default:
- SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
- return _simd_setzero_ps();
- }
- }
-
-#if USE_SIMD16_FRONTEND
template<SWR_CLIPCODES ClippingPlane>
- inline simd16scalar inside(const simd16vector& v)
+ typename SIMD_T::Float inside(const typename SIMD_T::Vec4 &v)
{
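+ // Per-lane inside test in homogeneous clip space: lanes whose vertex satisfies the
+ // plane's inequality (-w <= x,y,z <= w, or z >= 0 for the near plane when clipHalfZ
+ // is set) return an all-ones mask, all others return zero.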
switch (ClippingPlane)
{
- case FRUSTUM_LEFT: return _simd16_cmpge_ps(v[0], _simd16_mul_ps(v[3], _simd16_set1_ps(-1.0f)));
- case FRUSTUM_RIGHT: return _simd16_cmple_ps(v[0], v[3]);
- case FRUSTUM_TOP: return _simd16_cmpge_ps(v[1], _simd16_mul_ps(v[3], _simd16_set1_ps(-1.0f)));
- case FRUSTUM_BOTTOM: return _simd16_cmple_ps(v[1], v[3]);
- case FRUSTUM_NEAR: return _simd16_cmpge_ps(v[2], this->state.rastState.clipHalfZ ? _simd16_setzero_ps() : _simd16_mul_ps(v[3], _simd16_set1_ps(-1.0f)));
- case FRUSTUM_FAR: return _simd16_cmple_ps(v[2], v[3]);
+ case FRUSTUM_LEFT: return SIMD_T::cmpge_ps(v[0], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
+ case FRUSTUM_RIGHT: return SIMD_T::cmple_ps(v[0], v[3]);
+ case FRUSTUM_TOP: return SIMD_T::cmpge_ps(v[1], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
+ case FRUSTUM_BOTTOM: return SIMD_T::cmple_ps(v[1], v[3]);
+ case FRUSTUM_NEAR: return SIMD_T::cmpge_ps(v[2], this->state.rastState.clipHalfZ ? SIMD_T::setzero_ps() : SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
+ case FRUSTUM_FAR: return SIMD_T::cmple_ps(v[2], v[3]);
default:
SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
- return _simd16_setzero_ps();
- }
- }
-
-#endif
- template<SWR_CLIPCODES ClippingPlane>
- simdscalari ClipTriToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
- {
- uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
-
- simdscalari vCurIndex = _simd_setzero_si();
- simdscalari vOutIndex = _simd_setzero_si();
- simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
-
- while (!_simd_testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
- {
- simdscalari s = vCurIndex;
- simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1));
- simdscalari underFlowMask = _simd_cmpgt_epi32(vNumInPts, p);
- p = _simd_castps_si(_simd_blendv_ps(_simd_setzero_ps(), _simd_castsi_ps(p), _simd_castsi_ps(underFlowMask)));
-
- // gather position
- simdvector vInPos0, vInPos1;
- for (uint32_t c = 0; c < 4; ++c)
- {
- vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
- vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
- }
-
- // compute inside mask
- simdscalar s_in = inside<ClippingPlane>(vInPos0);
- simdscalar p_in = inside<ClippingPlane>(vInPos1);
-
- // compute intersection mask (s_in != p_in)
- simdscalar intersectMask = _simd_xor_ps(s_in, p_in);
- intersectMask = _simd_and_ps(intersectMask, vActiveMask);
-
- // store s if inside
- s_in = _simd_and_ps(s_in, vActiveMask);
- if (!_simd_testz_ps(s_in, s_in))
- {
- // store position
- for (uint32_t c = 0; c < 4; ++c)
- {
- ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
- }
-
- // store attribs
- for (uint32_t a = 0; a < numInAttribs; ++a)
- {
- uint32_t attribSlot = vertexAttribOffset + a;
- for (uint32_t c = 0; c < 4; ++c)
- {
- simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
- ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
- }
- }
-
- // store clip distance if enabled
- if (this->state.rastState.clipDistanceMask & 0xf)
- {
- uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
- for (uint32_t c = 0; c < 4; ++c)
- {
- simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
- ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
- }
- }
-
- if (this->state.rastState.clipDistanceMask & 0xf0)
- {
- uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
- for (uint32_t c = 0; c < 4; ++c)
- {
- simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
- ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
- }
- }
-
- // increment outIndex
- vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
- }
-
- // compute and store intersection
- if (!_simd_testz_ps(intersectMask, intersectMask))
- {
- intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
-
- // increment outIndex for active lanes
- vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask);
- }
-
- // increment loop index and update active mask
- vCurIndex = _simd_add_epi32(vCurIndex, _simd_set1_epi32(1));
- vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
+ return SIMD_T::setzero_ps();
}
-
- return vOutIndex;
}
-#if USE_SIMD16_FRONTEND
template<SWR_CLIPCODES ClippingPlane>
- simd16scalari ClipTriToPlane(const float* pInVerts, const simd16scalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
+ typename SIMD_T::Integer ClipTriToPlane(const float *pInVerts, const typename SIMD_T::Integer &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
{
uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
- simd16scalari vCurIndex = _simd16_setzero_si();
- simd16scalari vOutIndex = _simd16_setzero_si();
- simd16scalar vActiveMask = _simd16_castsi_ps(_simd16_cmplt_epi32(vCurIndex, vNumInPts));
+ typename SIMD_T::Integer vCurIndex = SIMD_T::setzero_si();
+ typename SIMD_T::Integer vOutIndex = SIMD_T::setzero_si();
+ typename SIMD_T::Float vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
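+ // Sutherland-Hodgman clip of one polygon per SIMD lane: s and p index the current
+ // edge (s = current vertex, p = its successor). Each iteration emits s when it lies
+ // inside the plane and the edge/plane intersection when the edge crosses the plane,
+ // with vOutIndex tracking how many vertices each lane has written to pOutVerts.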
- while (!_simd16_testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
+ while (!SIMD_T::testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
{
- simd16scalari s = vCurIndex;
- simd16scalari p = _simd16_add_epi32(s, _simd16_set1_epi32(1));
- simd16scalari underFlowMask = _simd16_cmpgt_epi32(vNumInPts, p);
- p = _simd16_castps_si(_simd16_blendv_ps(_simd16_setzero_ps(), _simd16_castsi_ps(p), _simd16_castsi_ps(underFlowMask)));
+ typename SIMD_T::Integer s = vCurIndex;
+ typename SIMD_T::Integer p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
+ typename SIMD_T::Integer underFlowMask = SIMD_T::cmpgt_epi32(vNumInPts, p);
+ p = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p), SIMD_T::castsi_ps(underFlowMask)));
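+ // For lanes where s is the last input vertex, p wraps back to vertex 0 to close the polygon.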
// gather position
- simd16vector vInPos0, vInPos1;
+ typename SIMD_T::Vec4 vInPos0, vInPos1;
for (uint32_t c = 0; c < 4; ++c)
{
vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
}
// compute inside mask
- simd16scalar s_in = inside<ClippingPlane>(vInPos0);
- simd16scalar p_in = inside<ClippingPlane>(vInPos1);
+ typename SIMD_T::Float s_in = inside<ClippingPlane>(vInPos0);
+ typename SIMD_T::Float p_in = inside<ClippingPlane>(vInPos1);
// compute intersection mask (s_in != p_in)
- simd16scalar intersectMask = _simd16_xor_ps(s_in, p_in);
- intersectMask = _simd16_and_ps(intersectMask, vActiveMask);
+ typename SIMD_T::Float intersectMask = SIMD_T::xor_ps(s_in, p_in);
+ intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
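+ // s_in and p_in are all-ones or all-zero per lane, so their XOR marks exactly the
+ // lanes whose edge has one endpoint inside the plane and the other outside.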
// store s if inside
- s_in = _simd16_and_ps(s_in, vActiveMask);
- if (!_simd16_testz_ps(s_in, s_in))
+ s_in = SIMD_T::and_ps(s_in, vActiveMask);
+ if (!SIMD_T::testz_ps(s_in, s_in))
{
// store position
for (uint32_t c = 0; c < 4; ++c)
{
ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
}
// store attribs
for (uint32_t a = 0; a < numInAttribs; ++a)
{
uint32_t attribSlot = vertexAttribOffset + a;
for (uint32_t c = 0; c < 4; ++c)
{
- simd16scalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
+ typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
}
}
// store clip distance if enabled
if (this->state.rastState.clipDistanceMask & 0xf)
{
uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
for (uint32_t c = 0; c < 4; ++c)
{
- simd16scalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
+ typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
}
}
if (this->state.rastState.clipDistanceMask & 0xf0)
{
uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
for (uint32_t c = 0; c < 4; ++c)
{
- simd16scalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
+ typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
}
}
// increment outIndex
- vOutIndex = _simd16_blendv_epi32(vOutIndex, _simd16_add_epi32(vOutIndex, _simd16_set1_epi32(1)), s_in);
+ vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
}
// compute and store intersection
- if (!_simd16_testz_ps(intersectMask, intersectMask))
+ if (!SIMD_T::testz_ps(intersectMask, intersectMask))
{
intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
// increment outIndex for active lanes
- vOutIndex = _simd16_blendv_epi32(vOutIndex, _simd16_add_epi32(vOutIndex, _simd16_set1_epi32(1)), intersectMask);
+ vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
}
// increment loop index and update active mask
- vCurIndex = _simd16_add_epi32(vCurIndex, _simd16_set1_epi32(1));
- vActiveMask = _simd16_castsi_ps(_simd16_cmplt_epi32(vCurIndex, vNumInPts));
- }
-
- return vOutIndex;
- }
-
-#endif
- template<SWR_CLIPCODES ClippingPlane>
- simdscalari ClipLineToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
- {
- uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
-
- simdscalari vCurIndex = _simd_setzero_si();
- simdscalari vOutIndex = _simd_setzero_si();
- simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts));
-
- if (!_simd_testz_ps(vActiveMask, vActiveMask))
- {
- simdscalari s = vCurIndex;
- simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1));
-
- // gather position
- simdvector vInPos0, vInPos1;
- for (uint32_t c = 0; c < 4; ++c)
- {
- vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
- vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
- }
-
- // compute inside mask
- simdscalar s_in = inside<ClippingPlane>(vInPos0);
- simdscalar p_in = inside<ClippingPlane>(vInPos1);
-
- // compute intersection mask (s_in != p_in)
- simdscalar intersectMask = _simd_xor_ps(s_in, p_in);
- intersectMask = _simd_and_ps(intersectMask, vActiveMask);
-
- // store s if inside
- s_in = _simd_and_ps(s_in, vActiveMask);
- if (!_simd_testz_ps(s_in, s_in))
- {
- for (uint32_t c = 0; c < 4; ++c)
- {
- ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
- }
-
- // interpolate attributes and store
- for (uint32_t a = 0; a < numInAttribs; ++a)
- {
- uint32_t attribSlot = vertexAttribOffset + a;
- for (uint32_t c = 0; c < 4; ++c)
- {
- simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
- ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
- }
- }
-
- // increment outIndex
- vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
- }
-
- // compute and store intersection
- if (!_simd_testz_ps(intersectMask, intersectMask))
- {
- intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
-
- // increment outIndex for active lanes
- vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask);
- }
-
- // store p if inside
- p_in = _simd_and_ps(p_in, vActiveMask);
- if (!_simd_testz_ps(p_in, p_in))
- {
- for (uint32_t c = 0; c < 4; ++c)
- {
- ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
- }
-
- // interpolate attributes and store
- for (uint32_t a = 0; a < numInAttribs; ++a)
- {
- uint32_t attribSlot = vertexAttribOffset + a;
- for (uint32_t c = 0; c < 4; ++c)
- {
- simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
- ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
- }
- }
-
- // increment outIndex
- vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), p_in);
- }
+ vCurIndex = SIMD_T::add_epi32(vCurIndex, SIMD_T::set1_epi32(1));
+ vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
}
return vOutIndex;
}
-#if USE_SIMD16_FRONTEND
template<SWR_CLIPCODES ClippingPlane>
- simd16scalari ClipLineToPlane(const float* pInVerts, const simd16scalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts)
+ typename SIMD_T::Integer ClipLineToPlane(const float *pInVerts, const typename SIMD_T::Integer &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
{
uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
- simd16scalari vCurIndex = _simd16_setzero_si();
- simd16scalari vOutIndex = _simd16_setzero_si();
- simd16scalar vActiveMask = _simd16_castsi_ps(_simd16_cmplt_epi32(vCurIndex, vNumInPts));
+ typename SIMD_T::Integer vCurIndex = SIMD_T::setzero_si();
+ typename SIMD_T::Integer vOutIndex = SIMD_T::setzero_si();
+ typename SIMD_T::Float vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
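+ // A line has a single segment, so no edge loop is needed: emit s if inside, the
+ // intersection if the segment crosses the plane, then p if inside, giving at most
+ // two output vertices per lane.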
- if (!_simd16_testz_ps(vActiveMask, vActiveMask))
+ if (!SIMD_T::testz_ps(vActiveMask, vActiveMask))
{
- simd16scalari s = vCurIndex;
- simd16scalari p = _simd16_add_epi32(s, _simd16_set1_epi32(1));
+ typename SIMD_T::Integer s = vCurIndex;
+ typename SIMD_T::Integer p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
// gather position
- simd16vector vInPos0, vInPos1;
+ typename SIMD_T::Vec4 vInPos0, vInPos1;
for (uint32_t c = 0; c < 4; ++c)
{
vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c);
}
// compute inside mask
- simd16scalar s_in = inside<ClippingPlane>(vInPos0);
- simd16scalar p_in = inside<ClippingPlane>(vInPos1);
+ typename SIMD_T::Float s_in = inside<ClippingPlane>(vInPos0);
+ typename SIMD_T::Float p_in = inside<ClippingPlane>(vInPos1);
// compute intersection mask (s_in != p_in)
- simd16scalar intersectMask = _simd16_xor_ps(s_in, p_in);
- intersectMask = _simd16_and_ps(intersectMask, vActiveMask);
+ typename SIMD_T::Float intersectMask = SIMD_T::xor_ps(s_in, p_in);
+ intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
// store s if inside
- s_in = _simd16_and_ps(s_in, vActiveMask);
- if (!_simd16_testz_ps(s_in, s_in))
+ s_in = SIMD_T::and_ps(s_in, vActiveMask);
+ if (!SIMD_T::testz_ps(s_in, s_in))
{
for (uint32_t c = 0; c < 4; ++c)
{
ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
}
// interpolate attributes and store
for (uint32_t a = 0; a < numInAttribs; ++a)
{
uint32_t attribSlot = vertexAttribOffset + a;
for (uint32_t c = 0; c < 4; ++c)
{
- simd16scalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
+ typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
}
}
// increment outIndex
- vOutIndex = _simd16_blendv_epi32(vOutIndex, _simd16_add_epi32(vOutIndex, _simd16_set1_epi32(1)), s_in);
+ vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
}
// compute and store intersection
- if (!_simd16_testz_ps(intersectMask, intersectMask))
+ if (!SIMD_T::testz_ps(intersectMask, intersectMask))
{
intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
// increment outIndex for active lanes
- vOutIndex = _simd16_blendv_epi32(vOutIndex, _simd16_add_epi32(vOutIndex, _simd16_set1_epi32(1)), intersectMask);
+ vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
}
// store p if inside
- p_in = _simd16_and_ps(p_in, vActiveMask);
- if (!_simd16_testz_ps(p_in, p_in))
+ p_in = SIMD_T::and_ps(p_in, vActiveMask);
+ if (!SIMD_T::testz_ps(p_in, p_in))
{
for (uint32_t c = 0; c < 4; ++c)
{
ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
}
// interpolate attributes and store
for (uint32_t a = 0; a < numInAttribs; ++a)
{
uint32_t attribSlot = vertexAttribOffset + a;
for (uint32_t c = 0; c < 4; ++c)
{
- simd16scalar vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
+ typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
}
}
// increment outIndex
- vOutIndex = _simd16_blendv_epi32(vOutIndex, _simd16_add_epi32(vOutIndex, _simd16_set1_epi32(1)), p_in);
+ vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), p_in);
}
}
return vOutIndex;
}
-#endif
- //////////////////////////////////////////////////////////////////////////
- /// @brief Vertical clipper. Clips SIMD primitives at a time
- /// @param pVertices - pointer to vertices in SOA form. Clipper will read input and write results to this buffer
- /// @param vPrimMask - mask of valid input primitives, including non-clipped prims
- /// @param numAttribs - number of valid input attribs, including position
- simdscalari ClipPrims(float* pVertices, const simdscalar& vPrimMask, const simdscalar& vClipMask, int numAttribs)
- {
- // temp storage
- float* pTempVerts = (float*)&tlsTempVertices[0];
-
- // zero out num input verts for non-active lanes
- simdscalari vNumInPts = _simd_set1_epi32(NumVertsPerPrim);
- vNumInPts = _simd_blendv_epi32(_simd_setzero_si(), vNumInPts, vClipMask);
-
- // clip prims to frustum
- simdscalari vNumOutPts;
- if (NumVertsPerPrim == 3)
- {
- vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
- vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
- vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
- vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
- vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
- vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
- }
- else
- {
- SWR_ASSERT(NumVertsPerPrim == 2);
- vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
- vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
- vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
- vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
- vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
- vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
- }
-
- // restore num verts for non-clipped, active lanes
- simdscalar vNonClippedMask = _simd_andnot_ps(vClipMask, vPrimMask);
- vNumOutPts = _simd_blendv_epi32(vNumOutPts, _simd_set1_epi32(NumVertsPerPrim), vNonClippedMask);
-
- return vNumOutPts;
- }
-#if USE_SIMD16_FRONTEND
- simd16scalari ClipPrims(float* pVertices, const simd16scalar& vPrimMask, const simd16scalar& vClipMask, int numAttribs)
+ typename SIMD_T::Integer ClipPrims(float *pVertices, const typename SIMD_T::Float &vPrimMask, const typename SIMD_T::Float &vClipMask, int numAttribs)
{
// temp storage
- float* pTempVerts = (float*)&tlsTempVertices_simd16[0];
+ float *pTempVerts = reinterpret_cast<float *>(ClipHelper<SIMD_T>::GetTempVertices());
// zero out num input verts for non-active lanes
- simd16scalari vNumInPts = _simd16_set1_epi32(NumVertsPerPrim);
- vNumInPts = _simd16_blendv_epi32(_simd16_setzero_si(), vNumInPts, vClipMask);
+ typename SIMD_T::Integer vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrim);
+ vNumInPts = SIMD_T::blendv_epi32(SIMD_T::setzero_si(), vNumInPts, vClipMask);
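+ // Lanes that don't need clipping get an input count of zero, so the per-plane passes
+ // below leave them untouched; their vertex count is restored after clipping.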
// clip prims to frustum
- simd16scalari vNumOutPts;
+ typename SIMD_T::Integer vNumOutPts;
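+ // Clip against one frustum plane at a time, ping-ponging between pVertices and the
+ // temp buffer so the six passes leave the final vertices back in pVertices.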
if (NumVertsPerPrim == 3)
{
vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
}
else
{
SWR_ASSERT(NumVertsPerPrim == 2);
vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
}
// restore num verts for non-clipped, active lanes
- simd16scalar vNonClippedMask = _simd16_andnot_ps(vClipMask, vPrimMask);
- vNumOutPts = _simd16_blendv_epi32(vNumOutPts, _simd16_set1_epi32(NumVertsPerPrim), vNonClippedMask);
+ typename SIMD_T::Float vNonClippedMask = SIMD_T::andnot_ps(vClipMask, vPrimMask);
+ vNumOutPts = SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrim), vNonClippedMask);
return vNumOutPts;
}
-#endif
const uint32_t workerId{ 0 };
- DRAW_CONTEXT* pDC{ nullptr };
- const API_STATE& state;
- simdscalar clipCodes[NumVertsPerPrim];
-#if USE_SIMD16_FRONTEND
- simd16scalar clipCodes_simd16[NumVertsPerPrim];
-#endif
+ DRAW_CONTEXT *pDC{ nullptr };
+ const API_STATE &state;
+ typename SIMD_T::Float clipCodes[NumVertsPerPrim];
};