DRAW_CONTEXT *pDC,
PA_STATE &pa,
uint32_t workerId,
- typename SIMD_T::Vec4 prim[],
- typename SIMD_T::Float recipW[],
+ Vec4<SIMD_T> prim[],
+ Float<SIMD_T> recipW[],
uint32_t primMask,
- typename SIMD_T::Integer const &primID,
- typename SIMD_T::Integer const &viewportIdx,
- typename SIMD_T::Integer const &rtIdx);
+ Integer<SIMD_T> const &primID,
+ Integer<SIMD_T> const &viewportIdx,
+ Integer<SIMD_T> const &rtIdx);
//////////////////////////////////////////////////////////////////////////
/// @brief Forward declaration of BinPostSetupPointsImpl (terminated by
///        ';' -- no body is provided at this point).
/// @tparam SIMD_T     - SIMD abstraction type; it parameterizes the
///                      Vec4<>/Integer<> vector types of the arguments.
/// @tparam SIMD_WIDTH - compile-time width constant
///                      (presumably the SIMD lane count -- TODO confirm).
/// @param pDC         - pointer to the draw context.
/// @param pa          - primitive assembly state, passed by reference.
/// @param workerId    - worker identifier.
/// @param prim        - array of SIMD Vec4 values for the primitive(s).
/// @param primMask    - 32-bit primitive mask
///                      (presumably one bit per SIMD lane -- TODO confirm).
/// @param primID      - per-lane primitive IDs (read-only).
/// @param viewportIdx - per-lane viewport indices (read-only).
/// @param rtIdx       - per-lane index vector (read-only); presumably the
///                      render target array index -- TODO confirm.
template <typename SIMD_T, uint32_t SIMD_WIDTH>
void BinPostSetupPointsImpl(
DRAW_CONTEXT *pDC,
PA_STATE &pa,
uint32_t workerId,
- typename SIMD_T::Vec4 prim[],
+ Vec4<SIMD_T> prim[],
uint32_t primMask,
- typename SIMD_T::Integer const &primID,
- typename SIMD_T::Integer const &viewportIdx,
- typename SIMD_T::Integer const &rtIdx);
+ Integer<SIMD_T> const &primID,
+ Integer<SIMD_T> const &viewportIdx,
+ Integer<SIMD_T> const &rtIdx);
//////////////////////////////////////////////////////////////////////////
/// @brief Processes attributes for the backend based on linkage mask and
template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT>
uint32_t SIMDCALL EarlyRasterizer(
SIMDBBOX_T<SIMD_T> &er_bbox,
- typename SIMD_T::Integer (&vAi)[3],
- typename SIMD_T::Integer (&vBi)[3],
- typename SIMD_T::Integer (&vXi)[3],
- typename SIMD_T::Integer (&vYi)[3],
+ Integer<SIMD_T> (&vAi)[3],
+ Integer<SIMD_T> (&vBi)[3],
+ Integer<SIMD_T> (&vXi)[3],
+ Integer<SIMD_T> (&vYi)[3],
uint32_t cwTrisMask,
uint32_t triMask,
uint32_t oneTileMask)
{
// step to pixel center of top-left pixel of the triangle bbox
- typename SIMD_T::Integer vTopLeftX = SIMD_T::template slli_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(er_bbox.xmin);
+ Integer<SIMD_T> vTopLeftX = SIMD_T::template slli_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(er_bbox.xmin);
vTopLeftX = SIMD_T::add_epi32(vTopLeftX, SIMD_T::set1_epi32(FIXED_POINT_SCALE / 2));
- typename SIMD_T::Integer vTopLeftY = SIMD_T::template slli_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(er_bbox.ymin);
+ Integer<SIMD_T> vTopLeftY = SIMD_T::template slli_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(er_bbox.ymin);
vTopLeftY = SIMD_T::add_epi32(vTopLeftY, SIMD_T::set1_epi32(FIXED_POINT_SCALE / 2));
// negate A and B for CW tris
- typename SIMD_T::Integer vNegA0 = SIMD_T::mullo_epi32(vAi[0], SIMD_T::set1_epi32(-1));
- typename SIMD_T::Integer vNegA1 = SIMD_T::mullo_epi32(vAi[1], SIMD_T::set1_epi32(-1));
- typename SIMD_T::Integer vNegA2 = SIMD_T::mullo_epi32(vAi[2], SIMD_T::set1_epi32(-1));
- typename SIMD_T::Integer vNegB0 = SIMD_T::mullo_epi32(vBi[0], SIMD_T::set1_epi32(-1));
- typename SIMD_T::Integer vNegB1 = SIMD_T::mullo_epi32(vBi[1], SIMD_T::set1_epi32(-1));
- typename SIMD_T::Integer vNegB2 = SIMD_T::mullo_epi32(vBi[2], SIMD_T::set1_epi32(-1));
+ Integer<SIMD_T> vNegA0 = SIMD_T::mullo_epi32(vAi[0], SIMD_T::set1_epi32(-1));
+ Integer<SIMD_T> vNegA1 = SIMD_T::mullo_epi32(vAi[1], SIMD_T::set1_epi32(-1));
+ Integer<SIMD_T> vNegA2 = SIMD_T::mullo_epi32(vAi[2], SIMD_T::set1_epi32(-1));
+ Integer<SIMD_T> vNegB0 = SIMD_T::mullo_epi32(vBi[0], SIMD_T::set1_epi32(-1));
+ Integer<SIMD_T> vNegB1 = SIMD_T::mullo_epi32(vBi[1], SIMD_T::set1_epi32(-1));
+ Integer<SIMD_T> vNegB2 = SIMD_T::mullo_epi32(vBi[2], SIMD_T::set1_epi32(-1));
RDTSC_EVENT(FEEarlyRastEnter, _mm_popcnt_u32(oneTileMask & triMask), 0);
- typename SIMD_T::Integer vShiftCntrl = EarlyRastHelper <SIMD_T>::InitShiftCntrl();
- typename SIMD_T::Integer vCwTris = SIMD_T::set1_epi32(cwTrisMask);
- typename SIMD_T::Integer vMask = SIMD_T::sllv_epi32(vCwTris, vShiftCntrl);
+ Integer<SIMD_T> vShiftCntrl = EarlyRastHelper <SIMD_T>::InitShiftCntrl();
+ Integer<SIMD_T> vCwTris = SIMD_T::set1_epi32(cwTrisMask);
+ Integer<SIMD_T> vMask = SIMD_T::sllv_epi32(vCwTris, vShiftCntrl);
vAi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vAi[0]), SIMD_T::castsi_ps(vNegA0), SIMD_T::castsi_ps(vMask)));
vAi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vAi[1]), SIMD_T::castsi_ps(vNegA1), SIMD_T::castsi_ps(vMask)));
vBi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vBi[2]), SIMD_T::castsi_ps(vNegB2), SIMD_T::castsi_ps(vMask)));
// evaluate edge equations at top-left pixel
- typename SIMD_T::Integer vDeltaX0 = SIMD_T::sub_epi32(vTopLeftX, vXi[0]);
- typename SIMD_T::Integer vDeltaX1 = SIMD_T::sub_epi32(vTopLeftX, vXi[1]);
- typename SIMD_T::Integer vDeltaX2 = SIMD_T::sub_epi32(vTopLeftX, vXi[2]);
+ Integer<SIMD_T> vDeltaX0 = SIMD_T::sub_epi32(vTopLeftX, vXi[0]);
+ Integer<SIMD_T> vDeltaX1 = SIMD_T::sub_epi32(vTopLeftX, vXi[1]);
+ Integer<SIMD_T> vDeltaX2 = SIMD_T::sub_epi32(vTopLeftX, vXi[2]);
- typename SIMD_T::Integer vDeltaY0 = SIMD_T::sub_epi32(vTopLeftY, vYi[0]);
- typename SIMD_T::Integer vDeltaY1 = SIMD_T::sub_epi32(vTopLeftY, vYi[1]);
- typename SIMD_T::Integer vDeltaY2 = SIMD_T::sub_epi32(vTopLeftY, vYi[2]);
+ Integer<SIMD_T> vDeltaY0 = SIMD_T::sub_epi32(vTopLeftY, vYi[0]);
+ Integer<SIMD_T> vDeltaY1 = SIMD_T::sub_epi32(vTopLeftY, vYi[1]);
+ Integer<SIMD_T> vDeltaY2 = SIMD_T::sub_epi32(vTopLeftY, vYi[2]);
- typename SIMD_T::Integer vAX0 = SIMD_T::mullo_epi32(vAi[0], vDeltaX0);
- typename SIMD_T::Integer vAX1 = SIMD_T::mullo_epi32(vAi[1], vDeltaX1);
- typename SIMD_T::Integer vAX2 = SIMD_T::mullo_epi32(vAi[2], vDeltaX2);
+ Integer<SIMD_T> vAX0 = SIMD_T::mullo_epi32(vAi[0], vDeltaX0);
+ Integer<SIMD_T> vAX1 = SIMD_T::mullo_epi32(vAi[1], vDeltaX1);
+ Integer<SIMD_T> vAX2 = SIMD_T::mullo_epi32(vAi[2], vDeltaX2);
- typename SIMD_T::Integer vBY0 = SIMD_T::mullo_epi32(vBi[0], vDeltaY0);
- typename SIMD_T::Integer vBY1 = SIMD_T::mullo_epi32(vBi[1], vDeltaY1);
- typename SIMD_T::Integer vBY2 = SIMD_T::mullo_epi32(vBi[2], vDeltaY2);
+ Integer<SIMD_T> vBY0 = SIMD_T::mullo_epi32(vBi[0], vDeltaY0);
+ Integer<SIMD_T> vBY1 = SIMD_T::mullo_epi32(vBi[1], vDeltaY1);
+ Integer<SIMD_T> vBY2 = SIMD_T::mullo_epi32(vBi[2], vDeltaY2);
- typename SIMD_T::Integer vEdge0 = SIMD_T::add_epi32(vAX0, vBY0);
- typename SIMD_T::Integer vEdge1 = SIMD_T::add_epi32(vAX1, vBY1);
- typename SIMD_T::Integer vEdge2 = SIMD_T::add_epi32(vAX2, vBY2);
+ Integer<SIMD_T> vEdge0 = SIMD_T::add_epi32(vAX0, vBY0);
+ Integer<SIMD_T> vEdge1 = SIMD_T::add_epi32(vAX1, vBY1);
+ Integer<SIMD_T> vEdge2 = SIMD_T::add_epi32(vAX2, vBY2);
vEdge0 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge0);
vEdge1 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge1);
vEdge2 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge2);
// top left rule
- typename SIMD_T::Integer vEdgeAdjust0 = SIMD_T::sub_epi32(vEdge0, SIMD_T::set1_epi32(1));
- typename SIMD_T::Integer vEdgeAdjust1 = SIMD_T::sub_epi32(vEdge1, SIMD_T::set1_epi32(1));
- typename SIMD_T::Integer vEdgeAdjust2 = SIMD_T::sub_epi32(vEdge2, SIMD_T::set1_epi32(1));
+ Integer<SIMD_T> vEdgeAdjust0 = SIMD_T::sub_epi32(vEdge0, SIMD_T::set1_epi32(1));
+ Integer<SIMD_T> vEdgeAdjust1 = SIMD_T::sub_epi32(vEdge1, SIMD_T::set1_epi32(1));
+ Integer<SIMD_T> vEdgeAdjust2 = SIMD_T::sub_epi32(vEdge2, SIMD_T::set1_epi32(1));
// vA < 0
vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vAi[0])));
vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vAi[2])));
// vA == 0 && vB < 0
- typename SIMD_T::Integer vCmp0 = SIMD_T::cmpeq_epi32(vAi[0], SIMD_T::setzero_si());
- typename SIMD_T::Integer vCmp1 = SIMD_T::cmpeq_epi32(vAi[1], SIMD_T::setzero_si());
- typename SIMD_T::Integer vCmp2 = SIMD_T::cmpeq_epi32(vAi[2], SIMD_T::setzero_si());
+ Integer<SIMD_T> vCmp0 = SIMD_T::cmpeq_epi32(vAi[0], SIMD_T::setzero_si());
+ Integer<SIMD_T> vCmp1 = SIMD_T::cmpeq_epi32(vAi[1], SIMD_T::setzero_si());
+ Integer<SIMD_T> vCmp2 = SIMD_T::cmpeq_epi32(vAi[2], SIMD_T::setzero_si());
vCmp0 = SIMD_T::and_si(vCmp0, vBi[0]);
vCmp1 = SIMD_T::and_si(vCmp1, vBi[1]);
#if ER_SIMD_TILE_X_DIM == 4 && ER_SIMD_TILE_Y_DIM == 4
// Go down
// coverage pixel 0
- typename SIMD_T::Integer vMask0 = SIMD_T::and_si(vEdge0, vEdge1);
+ Integer<SIMD_T> vMask0 = SIMD_T::and_si(vEdge0, vEdge1);
vMask0 = SIMD_T::and_si(vMask0, vEdge2);
// coverage pixel 1
- typename SIMD_T::Integer vEdge0N = SIMD_T::add_epi32(vEdge0, vBi[0]);
- typename SIMD_T::Integer vEdge1N = SIMD_T::add_epi32(vEdge1, vBi[1]);
- typename SIMD_T::Integer vEdge2N = SIMD_T::add_epi32(vEdge2, vBi[2]);
- typename SIMD_T::Integer vMask1 = SIMD_T::and_si(vEdge0N, vEdge1N);
+ Integer<SIMD_T> vEdge0N = SIMD_T::add_epi32(vEdge0, vBi[0]);
+ Integer<SIMD_T> vEdge1N = SIMD_T::add_epi32(vEdge1, vBi[1]);
+ Integer<SIMD_T> vEdge2N = SIMD_T::add_epi32(vEdge2, vBi[2]);
+ Integer<SIMD_T> vMask1 = SIMD_T::and_si(vEdge0N, vEdge1N);
vMask1 = SIMD_T::and_si(vMask1, vEdge2N);
// coverage pixel 2
vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
- typename SIMD_T::Integer vMask2 = SIMD_T::and_si(vEdge0N, vEdge1N);
+ Integer<SIMD_T> vMask2 = SIMD_T::and_si(vEdge0N, vEdge1N);
vMask2 = SIMD_T::and_si(vMask2, vEdge2N);
// coverage pixel 3
vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
- typename SIMD_T::Integer vMask3 = SIMD_T::and_si(vEdge0N, vEdge1N);
+ Integer<SIMD_T> vMask3 = SIMD_T::and_si(vEdge0N, vEdge1N);
vMask3 = SIMD_T::and_si(vMask3, vEdge2N);
// One step to the right and then up
vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
- typename SIMD_T::Integer vMask4 = SIMD_T::and_si(vEdge0N, vEdge1N);
+ Integer<SIMD_T> vMask4 = SIMD_T::and_si(vEdge0N, vEdge1N);
vMask4 = SIMD_T::and_si(vMask4, vEdge2N);
// coverage pixel 5
vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
- typename SIMD_T::Integer vMask5 = SIMD_T::and_si(vEdge0N, vEdge1N);
+ Integer<SIMD_T> vMask5 = SIMD_T::and_si(vEdge0N, vEdge1N);
vMask5 = SIMD_T::and_si(vMask5, vEdge2N);
// coverage pixel 6
vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
- typename SIMD_T::Integer vMask6 = SIMD_T::and_si(vEdge0N, vEdge1N);
+ Integer<SIMD_T> vMask6 = SIMD_T::and_si(vEdge0N, vEdge1N);
vMask6 = SIMD_T::and_si(vMask6, vEdge2N);
// coverage pixel 7
vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
- typename SIMD_T::Integer vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N);
+ Integer<SIMD_T> vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N);
vMask7 = SIMD_T::and_si(vMask7, vEdge2N);
- typename SIMD_T::Integer vLit1 = SIMD_T::or_si(vMask0, vMask1);
+ Integer<SIMD_T> vLit1 = SIMD_T::or_si(vMask0, vMask1);
vLit1 = SIMD_T::or_si(vLit1, vMask2);
vLit1 = SIMD_T::or_si(vLit1, vMask3);
vLit1 = SIMD_T::or_si(vLit1, vMask4);
vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N);
vMask7 = SIMD_T::and_si(vMask7, vEdge2N);
- typename SIMD_T::Integer vLit2 = SIMD_T::or_si(vMask0, vMask1);
+ Integer<SIMD_T> vLit2 = SIMD_T::or_si(vMask0, vMask1);
vLit2 = SIMD_T::or_si(vLit2, vMask2);
vLit2 = SIMD_T::or_si(vLit2, vMask3);
vLit2 = SIMD_T::or_si(vLit2, vMask4);
vLit2 = SIMD_T::or_si(vLit2, vMask6);
vLit2 = SIMD_T::or_si(vLit2, vMask7);
- typename SIMD_T::Integer vLit = SIMD_T::or_si(vLit1, vLit2);
+ Integer<SIMD_T> vLit = SIMD_T::or_si(vLit1, vLit2);
#else
// Generic algorithm sweeping in row by row order
- typename SIMD_T::Integer vRowMask[ER_SIMD_TILE_Y_DIM];
+ Integer<SIMD_T> vRowMask[ER_SIMD_TILE_Y_DIM];
- typename SIMD_T::Integer vEdge0N = vEdge0;
- typename SIMD_T::Integer vEdge1N = vEdge1;
- typename SIMD_T::Integer vEdge2N = vEdge2;
+ Integer<SIMD_T> vEdge0N = vEdge0;
+ Integer<SIMD_T> vEdge1N = vEdge1;
+ Integer<SIMD_T> vEdge2N = vEdge2;
for (uint32_t row = 0; row < ER_SIMD_TILE_Y_DIM; row++)
{
// Store edge values at the beginning of the row
- typename SIMD_T::Integer vRowEdge0 = vEdge0N;
- typename SIMD_T::Integer vRowEdge1 = vEdge1N;
- typename SIMD_T::Integer vRowEdge2 = vEdge2N;
+ Integer<SIMD_T> vRowEdge0 = vEdge0N;
+ Integer<SIMD_T> vRowEdge1 = vEdge1N;
+ Integer<SIMD_T> vRowEdge2 = vEdge2N;
- typename SIMD_T::Integer vColMask[ER_SIMD_TILE_X_DIM];
+ Integer<SIMD_T> vColMask[ER_SIMD_TILE_X_DIM];
for (uint32_t col = 0; col < ER_SIMD_TILE_X_DIM; col++)
{
}
// compress all masks
- typename SIMD_T::Integer vLit = vRowMask[0];
+ Integer<SIMD_T> vLit = vRowMask[0];
for (uint32_t row = 1; row < ER_SIMD_TILE_Y_DIM; row++)
{
vLit = SIMD_T::or_si(vLit, vRowMask[row]);
DRAW_CONTEXT *pDC,
PA_STATE &pa,
uint32_t workerId,
- typename SIMD_T::Vec4 tri[3],
+ Vec4<SIMD_T> tri[3],
uint32_t triMask,
- typename SIMD_T::Integer const &primID,
- typename SIMD_T::Integer const &viewportIdx,
- typename SIMD_T::Integer const &rtIdx)
+ Integer<SIMD_T> const &primID,
+ Integer<SIMD_T> const &viewportIdx,
+ Integer<SIMD_T> const &rtIdx)
{
const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx);
MacroTileMgr *pTileMgr = pDC->pTileMgr;
- typename SIMD_T::Float vRecipW0 = SIMD_T::set1_ps(1.0f);
- typename SIMD_T::Float vRecipW1 = SIMD_T::set1_ps(1.0f);
- typename SIMD_T::Float vRecipW2 = SIMD_T::set1_ps(1.0f);
+ Float<SIMD_T> vRecipW0 = SIMD_T::set1_ps(1.0f);
+ Float<SIMD_T> vRecipW1 = SIMD_T::set1_ps(1.0f);
+ Float<SIMD_T> vRecipW2 = SIMD_T::set1_ps(1.0f);
if (feState.vpTransformDisable)
{
}
// Adjust for pixel center location
- typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
+ Float<SIMD_T> offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
tri[0].x = SIMD_T::add_ps(tri[0].x, offset);
tri[0].y = SIMD_T::add_ps(tri[0].y, offset);
tri[2].y = SIMD_T::add_ps(tri[2].y, offset);
// Set vXi, vYi to required fixed point precision
- typename SIMD_T::Integer vXi[3], vYi[3];
+ Integer<SIMD_T> vXi[3], vYi[3];
FPToFixedPoint<SIMD_T>(tri, vXi, vYi);
// triangle setup
- typename SIMD_T::Integer vAi[3], vBi[3];
+ Integer<SIMD_T> vAi[3], vBi[3];
triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
// determinant
- typename SIMD_T::Integer vDet[2];
+ Integer<SIMD_T> vDet[2];
calcDeterminantIntVertical(vAi, vBi, vDet);
// cull zero area
if (cullZeroAreaMask > 0)
{
// e0 = v1-v0
- const typename SIMD_T::Integer x0x1Mask = SIMD_T::cmpeq_epi32(vXi[0], vXi[1]);
- const typename SIMD_T::Integer y0y1Mask = SIMD_T::cmpeq_epi32(vYi[0], vYi[1]);
+ const Integer<SIMD_T> x0x1Mask = SIMD_T::cmpeq_epi32(vXi[0], vXi[1]);
+ const Integer<SIMD_T> y0y1Mask = SIMD_T::cmpeq_epi32(vYi[0], vYi[1]);
uint32_t e0Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x0x1Mask, y0y1Mask)));
// e1 = v2-v1
- const typename SIMD_T::Integer x1x2Mask = SIMD_T::cmpeq_epi32(vXi[1], vXi[2]);
- const typename SIMD_T::Integer y1y2Mask = SIMD_T::cmpeq_epi32(vYi[1], vYi[2]);
+ const Integer<SIMD_T> x1x2Mask = SIMD_T::cmpeq_epi32(vXi[1], vXi[2]);
+ const Integer<SIMD_T> y1y2Mask = SIMD_T::cmpeq_epi32(vYi[1], vYi[2]);
uint32_t e1Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x1x2Mask, y1y2Mask)));
int cullCenterMask;
{
- typename SIMD_T::Integer xmin = SIMD_T::add_epi32(bbox.xmin, SIMD_T::set1_epi32(127));
+ Integer<SIMD_T> xmin = SIMD_T::add_epi32(bbox.xmin, SIMD_T::set1_epi32(127));
xmin = SIMD_T::and_si(xmin, SIMD_T::set1_epi32(~255));
- typename SIMD_T::Integer xmax = SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(128));
+ Integer<SIMD_T> xmax = SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(128));
xmax = SIMD_T::and_si(xmax, SIMD_T::set1_epi32(~255));
- typename SIMD_T::Integer vMaskH = SIMD_T::cmpeq_epi32(xmin, xmax);
+ Integer<SIMD_T> vMaskH = SIMD_T::cmpeq_epi32(xmin, xmax);
- typename SIMD_T::Integer ymin = SIMD_T::add_epi32(bbox.ymin, SIMD_T::set1_epi32(127));
+ Integer<SIMD_T> ymin = SIMD_T::add_epi32(bbox.ymin, SIMD_T::set1_epi32(127));
ymin = SIMD_T::and_si(ymin, SIMD_T::set1_epi32(~255));
- typename SIMD_T::Integer ymax = SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(128));
+ Integer<SIMD_T> ymax = SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(128));
ymax = SIMD_T::and_si(ymax, SIMD_T::set1_epi32(~255));
- typename SIMD_T::Integer vMaskV = SIMD_T::cmpeq_epi32(ymin, ymax);
+ Integer<SIMD_T> vMaskV = SIMD_T::cmpeq_epi32(ymin, ymax);
vMaskV = SIMD_T::or_si(vMaskH, vMaskV);
cullCenterMask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vMaskV));
// Gather the AOS effective scissor rects based on the per-prim VP index.
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
{
- typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
+ Integer<SIMD_T> scisXmin, scisYmin, scisXmax, scisYmax;
if (pa.viewportArrayActive)
{
// in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
// some area. Bump the xmax/ymax edges out
- typename SIMD_T::Integer topEqualsBottom = SIMD_T::cmpeq_epi32(bbox.ymin, bbox.ymax);
+ Integer<SIMD_T> topEqualsBottom = SIMD_T::cmpeq_epi32(bbox.ymin, bbox.ymax);
bbox.ymax = SIMD_T::blendv_epi32(bbox.ymax, SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), topEqualsBottom);
- typename SIMD_T::Integer leftEqualsRight = SIMD_T::cmpeq_epi32(bbox.xmin, bbox.xmax);
+ Integer<SIMD_T> leftEqualsRight = SIMD_T::cmpeq_epi32(bbox.xmin, bbox.xmax);
bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), leftEqualsRight);
}
// Cull tris completely outside scissor
{
- typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
- typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
- typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
+ Integer<SIMD_T> maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
+ Integer<SIMD_T> maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
+ Integer<SIMD_T> maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
triMask = triMask & ~maskOutsideScissor;
}
er_bbox.ymin = SIMD_T::template srai_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(bbox.ymin);
er_bbox.ymax = SIMD_T::template srai_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(bbox.ymax);
- typename SIMD_T::Integer vTileX = SIMD_T::cmpeq_epi32(er_bbox.xmin, er_bbox.xmax);
- typename SIMD_T::Integer vTileY = SIMD_T::cmpeq_epi32(er_bbox.ymin, er_bbox.ymax);
+ Integer<SIMD_T> vTileX = SIMD_T::cmpeq_epi32(er_bbox.xmin, er_bbox.xmax);
+ Integer<SIMD_T> vTileY = SIMD_T::cmpeq_epi32(er_bbox.ymin, er_bbox.ymax);
// Take only triangles that fit into ER tile
uint32_t oneTileMask = triMask & SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(vTileX, vTileY)));
{
// Simple non-conformant wireframe mode, useful for debugging
// construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
- typename SIMD_T::Vec4 line[2];
- typename SIMD_T::Float recipW[2];
+ Vec4<SIMD_T> line[2];
+ Float<SIMD_T> recipW[2];
line[0] = tri[0];
line[1] = tri[1];
OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
- SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft), bbox.xmin);
- SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight), bbox.xmax);
- SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop), bbox.ymin);
- SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
+ SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTLeft), bbox.xmin);
+ SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTRight), bbox.xmax);
+ SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTTop), bbox.ymin);
+ SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTBottom), bbox.ymax);
// transpose verts needed for backend
/// @todo modify BE to take non-transformed verts
DRAW_CONTEXT *pDC,
PA_STATE &pa,
uint32_t workerId,
- typename SIMD_T::Vec4 prim[],
+ Vec4<SIMD_T> prim[],
uint32_t primMask,
- typename SIMD_T::Integer const &primID,
- typename SIMD_T::Integer const &viewportIdx,
- typename SIMD_T::Integer const &rtIdx)
+ Integer<SIMD_T> const &primID,
+ Integer<SIMD_T> const &viewportIdx,
+ Integer<SIMD_T> const &rtIdx)
{
RDTSC_BEGIN(FEBinPoints, pDC->drawId);
- typename SIMD_T::Vec4 &primVerts = prim[0];
+ Vec4<SIMD_T> &primVerts = prim[0];
const API_STATE& state = GetApiState(pDC);
const SWR_RASTSTATE& rastState = state.rastState;
state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
// convert to fixed point
- typename SIMD_T::Integer vXi, vYi;
+ Integer<SIMD_T> vXi, vYi;
vXi = fpToFixedPointVertical<SIMD_T>(primVerts.x);
vYi = fpToFixedPointVertical<SIMD_T>(primVerts.y);
primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vYi));
// compute macro tile coordinates
- typename SIMD_T::Integer macroX = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(vXi);
- typename SIMD_T::Integer macroY = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(vYi);
+ Integer<SIMD_T> macroX = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(vXi);
+ Integer<SIMD_T> macroY = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(vYi);
OSALIGNSIMD16(uint32_t) aMacroX[SIMD_WIDTH], aMacroY[SIMD_WIDTH];
- SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMacroX), macroX);
- SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMacroY), macroY);
+ SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMacroX), macroX);
+ SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMacroY), macroY);
// compute raster tile coordinates
- typename SIMD_T::Integer rasterX = SIMD_T::template srai_epi32<KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT>(vXi);
- typename SIMD_T::Integer rasterY = SIMD_T::template srai_epi32<KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT>(vYi);
+ Integer<SIMD_T> rasterX = SIMD_T::template srai_epi32<KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT>(vXi);
+ Integer<SIMD_T> rasterY = SIMD_T::template srai_epi32<KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT>(vYi);
// compute raster tile relative x,y for coverage mask
- typename SIMD_T::Integer tileAlignedX = SIMD_T::template slli_epi32<KNOB_TILE_X_DIM_SHIFT>(rasterX);
- typename SIMD_T::Integer tileAlignedY = SIMD_T::template slli_epi32<KNOB_TILE_Y_DIM_SHIFT>(rasterY);
+ Integer<SIMD_T> tileAlignedX = SIMD_T::template slli_epi32<KNOB_TILE_X_DIM_SHIFT>(rasterX);
+ Integer<SIMD_T> tileAlignedY = SIMD_T::template slli_epi32<KNOB_TILE_Y_DIM_SHIFT>(rasterY);
- typename SIMD_T::Integer tileRelativeX = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vXi), tileAlignedX);
- typename SIMD_T::Integer tileRelativeY = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vYi), tileAlignedY);
+ Integer<SIMD_T> tileRelativeX = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vXi), tileAlignedX);
+ Integer<SIMD_T> tileRelativeY = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vYi), tileAlignedY);
OSALIGNSIMD16(uint32_t) aTileRelativeX[SIMD_WIDTH];
OSALIGNSIMD16(uint32_t) aTileRelativeY[SIMD_WIDTH];
- SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileRelativeX), tileRelativeX);
- SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileRelativeY), tileRelativeY);
+ SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aTileRelativeX), tileRelativeX);
+ SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aTileRelativeY), tileRelativeY);
OSALIGNSIMD16(uint32_t) aTileAlignedX[SIMD_WIDTH];
OSALIGNSIMD16(uint32_t) aTileAlignedY[SIMD_WIDTH];
- SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileAlignedX), tileAlignedX);
- SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileAlignedY), tileAlignedY);
+ SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aTileAlignedX), tileAlignedX);
+ SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aTileAlignedY), tileAlignedY);
OSALIGNSIMD16(float) aZ[SIMD_WIDTH];
SIMD_T::store_ps(reinterpret_cast<float *>(aZ), primVerts.z);
else
{
// non simple points need to be potentially binned to multiple macro tiles
- typename SIMD_T::Float vPointSize;
+ Float<SIMD_T> vPointSize;
if (rastState.pointParam)
{
- typename SIMD_T::Vec4 size[3];
+ Vec4<SIMD_T> size[3];
pa.Assemble(VERTEX_SGV_SLOT, size);
vPointSize = size[0][VERTEX_SGV_POINT_SIZE_COMP];
}
bbox.xmin = bbox.xmax = vXi;
bbox.ymin = bbox.ymax = vYi;
- typename SIMD_T::Float vHalfWidth = SIMD_T::mul_ps(vPointSize, SIMD_T::set1_ps(0.5f));
- typename SIMD_T::Integer vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
+ Float<SIMD_T> vHalfWidth = SIMD_T::mul_ps(vPointSize, SIMD_T::set1_ps(0.5f));
+ Integer<SIMD_T> vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
bbox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
bbox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi);
// Gather the AOS effective scissor rects based on the per-prim VP index.
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
{
- typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
+ Integer<SIMD_T> scisXmin, scisYmin, scisXmax, scisYmax;
if (pa.viewportArrayActive)
{
}
// Cull bloated points completely outside scissor
- typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
- typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
- typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
+ Integer<SIMD_T> maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
+ Integer<SIMD_T> maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
+ Integer<SIMD_T> maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
primMask = primMask & ~maskOutsideScissor;
OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
- SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft), bbox.xmin);
- SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight), bbox.xmax);
- SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop), bbox.ymin);
- SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
+ SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTLeft), bbox.xmin);
+ SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTRight), bbox.xmax);
+ SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTTop), bbox.ymin);
+ SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTBottom), bbox.ymax);
// store render target array index
const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx);
DRAW_CONTEXT *pDC,
PA_STATE &pa,
uint32_t workerId,
- typename SIMD_T::Vec4 prim[3],
+ Vec4<SIMD_T> prim[3],
uint32_t primMask,
- typename SIMD_T::Integer const &primID,
- typename SIMD_T::Integer const &viewportIdx,
- typename SIMD_T::Integer const &rtIdx)
+ Integer<SIMD_T> const &primID,
+ Integer<SIMD_T> const &viewportIdx,
+ Integer<SIMD_T> const &rtIdx)
{
const API_STATE& state = GetApiState(pDC);
const SWR_FRONTEND_STATE& feState = state.frontendState;
if (!feState.vpTransformDisable)
{
// perspective divide
- typename SIMD_T::Float vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
+ Float<SIMD_T> vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
prim[0].x = SIMD_T::mul_ps(prim[0].x, vRecipW0);
prim[0].y = SIMD_T::mul_ps(prim[0].y, vRecipW0);
}
}
- typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
+ Float<SIMD_T> offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
DRAW_CONTEXT *pDC,
PA_STATE &pa,
uint32_t workerId,
- typename SIMD_T::Vec4 prim[],
- typename SIMD_T::Float recipW[],
+ Vec4<SIMD_T> prim[],
+ Float<SIMD_T> recipW[],
uint32_t primMask,
- typename SIMD_T::Integer const &primID,
- typename SIMD_T::Integer const &viewportIdx,
- typename SIMD_T::Integer const &rtIdx)
+ Integer<SIMD_T> const &primID,
+ Integer<SIMD_T> const &viewportIdx,
+ Integer<SIMD_T> const &rtIdx)
{
const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx);
PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
- typename SIMD_T::Float &vRecipW0 = recipW[0];
- typename SIMD_T::Float &vRecipW1 = recipW[1];
+ Float<SIMD_T> &vRecipW0 = recipW[0];
+ Float<SIMD_T> &vRecipW1 = recipW[1];
// convert to fixed point
- typename SIMD_T::Integer vXi[2], vYi[2];
+ Integer<SIMD_T> vXi[2], vYi[2];
vXi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].x);
vYi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].y);
vYi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].y);
// compute x-major vs y-major mask
- typename SIMD_T::Integer xLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vXi[0], vXi[1]));
- typename SIMD_T::Integer yLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vYi[0], vYi[1]));
- typename SIMD_T::Float vYmajorMask = SIMD_T::castsi_ps(SIMD_T::cmpgt_epi32(yLength, xLength));
+ Integer<SIMD_T> xLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vXi[0], vXi[1]));
+ Integer<SIMD_T> yLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vYi[0], vYi[1]));
+ Float<SIMD_T> vYmajorMask = SIMD_T::castsi_ps(SIMD_T::cmpgt_epi32(yLength, xLength));
uint32_t yMajorMask = SIMD_T::movemask_ps(vYmajorMask);
// cull zero-length lines
- typename SIMD_T::Integer vZeroLengthMask = SIMD_T::cmpeq_epi32(xLength, SIMD_T::setzero_si());
+ Integer<SIMD_T> vZeroLengthMask = SIMD_T::cmpeq_epi32(xLength, SIMD_T::setzero_si());
vZeroLengthMask = SIMD_T::and_si(vZeroLengthMask, SIMD_T::cmpeq_epi32(yLength, SIMD_T::setzero_si()));
primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vZeroLengthMask));
bbox.ymax = SIMD_T::max_epi32(vYi[0], vYi[1]);
// bloat bbox by line width along minor axis
- typename SIMD_T::Float vHalfWidth = SIMD_T::set1_ps(rastState.lineWidth / 2.0f);
- typename SIMD_T::Integer vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
+ Float<SIMD_T> vHalfWidth = SIMD_T::set1_ps(rastState.lineWidth / 2.0f);
+ Integer<SIMD_T> vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
SIMDBBOX_T<SIMD_T> bloatBox;
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
{
- typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
+ Integer<SIMD_T> scisXmin, scisYmin, scisXmax, scisYmax;
if (pa.viewportArrayActive)
{
// Cull prims completely outside scissor
{
- typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
- typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
- typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
+ Integer<SIMD_T> maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
+ Integer<SIMD_T> maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
+ Integer<SIMD_T> maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
primMask = primMask & ~maskOutsideScissor;
}
OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
- SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft), bbox.xmin);
- SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight), bbox.xmax);
- SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop), bbox.ymin);
- SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
+ SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTLeft), bbox.xmin);
+ SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTRight), bbox.xmax);
+ SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTTop), bbox.ymin);
+ SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTBottom), bbox.ymax);
TransposeVertices(vHorizX, prim[0].x, prim[1].x, SIMD_T::setzero_ps());
TransposeVertices(vHorizY, prim[0].y, prim[1].y, SIMD_T::setzero_ps());
DRAW_CONTEXT *pDC,
PA_STATE &pa,
uint32_t workerId,
- typename SIMD_T::Vec4 prim[3],
+ Vec4<SIMD_T> prim[3],
uint32_t primMask,
- typename SIMD_T::Integer const &primID,
- typename SIMD_T::Integer const &viewportIdx,
- typename SIMD_T::Integer const & rtIdx)
+ Integer<SIMD_T> const &primID,
+ Integer<SIMD_T> const &viewportIdx,
+ Integer<SIMD_T> const & rtIdx)
{
const API_STATE& state = GetApiState(pDC);
const SWR_RASTSTATE& rastState = state.rastState;
const SWR_FRONTEND_STATE& feState = state.frontendState;
- typename SIMD_T::Float vRecipW[2] = { SIMD_T::set1_ps(1.0f), SIMD_T::set1_ps(1.0f) };
+ Float<SIMD_T> vRecipW[2] = { SIMD_T::set1_ps(1.0f), SIMD_T::set1_ps(1.0f) };
if (!feState.vpTransformDisable)
{
}
// adjust for pixel center location
- typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
+ Float<SIMD_T> offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
#define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW)
template<typename SIMD_T>
-void ComputeClipCodes(const API_STATE &state, const typename SIMD_T::Vec4 &vertex, typename SIMD_T::Float &clipCodes, typename SIMD_T::Integer const &viewportIndexes)
+void ComputeClipCodes(const API_STATE &state, const Vec4<SIMD_T> &vertex, Float<SIMD_T> &clipCodes, Integer<SIMD_T> const &viewportIndexes)
{
clipCodes = SIMD_T::setzero_ps();
// -w
- typename SIMD_T::Float vNegW = SIMD_T::mul_ps(vertex.w,SIMD_T::set1_ps(-1.0f));
+ Float<SIMD_T> vNegW = SIMD_T::mul_ps(vertex.w,SIMD_T::set1_ps(-1.0f));
// FRUSTUM_LEFT
- typename SIMD_T::Float vRes = SIMD_T::cmplt_ps(vertex.x, vNegW);
+ Float<SIMD_T> vRes = SIMD_T::cmplt_ps(vertex.x, vNegW);
clipCodes = SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT)));
// FRUSTUM_TOP
clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW))));
// GUARDBAND_LEFT
- typename SIMD_T::Float gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.left[0], viewportIndexes));
+ Float<SIMD_T> gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(&state.gbState.left[0], viewportIndexes));
vRes = SIMD_T::cmplt_ps(vertex.x, gbMult);
clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT))));
// GUARDBAND_TOP
- gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.top[0], viewportIndexes));
+ gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(&state.gbState.top[0], viewportIndexes));
vRes = SIMD_T::cmplt_ps(vertex.y, gbMult);
clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP))));
// GUARDBAND_RIGHT
- gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.right[0], viewportIndexes));
+ gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(&state.gbState.right[0], viewportIndexes));
vRes = SIMD_T::cmpgt_ps(vertex.x, gbMult);
clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT))));
// GUARDBAND_BOTTOM
- gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.bottom[0], viewportIndexes));
+ gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(&state.gbState.bottom[0], viewportIndexes));
vRes = SIMD_T::cmpgt_ps(vertex.y, gbMult);
clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM))));
}
static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
}
- void ComputeClipCodes(typename SIMD_T::Vec4 vertex[], const typename SIMD_T::Integer &viewportIndexes)
+ void ComputeClipCodes(Vec4<SIMD_T> vertex[], const Integer<SIMD_T> &viewportIndexes)
{
for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
{
}
}
- typename SIMD_T::Float ComputeClipCodeIntersection()
+ Float<SIMD_T> ComputeClipCodeIntersection()
{
- typename SIMD_T::Float result = clipCodes[0];
+ Float<SIMD_T> result = clipCodes[0];
for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
{
return result;
}
- typename SIMD_T::Float ComputeClipCodeUnion()
+ Float<SIMD_T> ComputeClipCodeUnion()
{
- typename SIMD_T::Float result = clipCodes[0];
+ Float<SIMD_T> result = clipCodes[0];
for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
{
int ComputeClipMask()
{
- typename SIMD_T::Float clipUnion = ComputeClipCodeUnion();
+ Float<SIMD_T> clipUnion = ComputeClipCodeUnion();
clipUnion = SIMD_T::and_ps(clipUnion, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK)));
}
// clipper is responsible for culling any prims with NAN coordinates
- int ComputeNaNMask(typename SIMD_T::Vec4 prim[])
+ int ComputeNaNMask(Vec4<SIMD_T> prim[])
{
- typename SIMD_T::Float vNanMask = SIMD_T::setzero_ps();
+ Float<SIMD_T> vNanMask = SIMD_T::setzero_ps();
for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
{
- typename SIMD_T::Float vNan01 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[0], prim[e].v[1]);
+ Float<SIMD_T> vNan01 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[0], prim[e].v[1]);
vNanMask = SIMD_T::or_ps(vNanMask, vNan01);
- typename SIMD_T::Float vNan23 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[2], prim[e].v[3]);
+ Float<SIMD_T> vNan23 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[2], prim[e].v[3]);
vNanMask = SIMD_T::or_ps(vNanMask, vNan23);
}
return SIMD_T::movemask_ps(vNanMask);
}
- int ComputeUserClipCullMask(PA_STATE &pa, typename SIMD_T::Vec4 prim[])
+ int ComputeUserClipCullMask(PA_STATE &pa, Vec4<SIMD_T> prim[])
{
uint8_t cullMask = state.backendState.cullDistanceMask;
uint32_t vertexClipCullOffset = state.backendState.vertexClipCullOffset;
- typename SIMD_T::Float vClipCullMask = SIMD_T::setzero_ps();
+ Float<SIMD_T> vClipCullMask = SIMD_T::setzero_ps();
- typename SIMD_T::Vec4 vClipCullDistLo[3];
- typename SIMD_T::Vec4 vClipCullDistHi[3];
+ Vec4<SIMD_T> vClipCullDistLo[3];
+ Vec4<SIMD_T> vClipCullDistHi[3];
pa.Assemble(vertexClipCullOffset, vClipCullDistLo);
pa.Assemble(vertexClipCullOffset + 1, vClipCullDistHi);
uint32_t slot = index >> 2;
uint32_t component = index & 0x3;
- typename SIMD_T::Float vCullMaskElem = SIMD_T::set1_ps(-1.0f);
+ Float<SIMD_T> vCullMaskElem = SIMD_T::set1_ps(-1.0f);
for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
{
- typename SIMD_T::Float vCullComp;
+ Float<SIMD_T> vCullComp;
if (slot == 0)
{
vCullComp = vClipCullDistLo[e][component];
}
// cull if cull distance < 0 || NAN
- typename SIMD_T::Float vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(SIMD_T::setzero_ps(), vCullComp);
+ Float<SIMD_T> vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(SIMD_T::setzero_ps(), vCullComp);
vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull);
}
vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem);
uint32_t slot = index >> 2;
uint32_t component = index & 0x3;
- typename SIMD_T::Float vCullMaskElem = SIMD_T::set1_ps(-1.0f);
+ Float<SIMD_T> vCullMaskElem = SIMD_T::set1_ps(-1.0f);
for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
{
- typename SIMD_T::Float vClipComp;
+ Float<SIMD_T> vClipComp;
if (slot == 0)
{
vClipComp = vClipCullDistLo[e][component];
vClipComp = vClipCullDistHi[e][component];
}
- typename SIMD_T::Float vClip = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(vClipComp, vClipComp);
- typename SIMD_T::Float vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(SIMD_T::setzero_ps(), vClipComp);
+ Float<SIMD_T> vClip = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(vClipComp, vClipComp);
+ Float<SIMD_T> vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(SIMD_T::setzero_ps(), vClipComp);
vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull);
vClipCullMask = SIMD_T::or_ps(vClipCullMask, vClip);
}
return SIMD_T::movemask_ps(vClipCullMask);
}
- void ClipSimd(const typename SIMD_T::Vec4 prim[], const typename SIMD_T::Float &vPrimMask, const typename SIMD_T::Float &vClipMask, PA_STATE &pa,
- const typename SIMD_T::Integer &vPrimId, const typename SIMD_T::Integer &vViewportIdx, const typename SIMD_T::Integer &vRtIdx)
+ void ClipSimd(const Vec4<SIMD_T> prim[], const Float<SIMD_T> &vPrimMask, const Float<SIMD_T> &vClipMask, PA_STATE &pa,
+ const Integer<SIMD_T> &vPrimId, const Integer<SIMD_T> &vViewportIdx, const Integer<SIMD_T> &vRtIdx)
{
// input/output vertex store for clipper
SIMDVERTEX_T<SIMD_T> vertices[7]; // maximum 7 verts generated per triangle
///@todo: line topology for wireframe?
// assemble pos
- typename SIMD_T::Vec4 tmpVector[NumVertsPerPrim];
+ Vec4<SIMD_T> tmpVector[NumVertsPerPrim];
for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
{
vertices[i].attrib[VERTEX_POSITION_SLOT] = prim[i];
uint32_t numAttribs = maxSlot + 1;
- typename SIMD_T::Integer vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
+ Integer<SIMD_T> vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
BinnerChooser<SIMD_T> binner(NumVertsPerPrim, pa.pDC->pState->state.rastState.conservativeRast);
#endif
for (uint32_t c = 0; c < 4; ++c)
{
- SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
+ SIMD256::Float temp = SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
- pBase += sizeof(typename SIMD_T::Float);
+ pBase += sizeof(Float<SIMD_T>);
}
// transpose attribs
for (uint32_t c = 0; c < 4; ++c)
{
- SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
+ SIMD256::Float temp = SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
transposedPrims[0].attrib[attribSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
- pBase += sizeof(typename SIMD_T::Float);
+ pBase += sizeof(Float<SIMD_T>);
}
}
for (uint32_t c = 0; c < 4; ++c)
{
- SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
+ SIMD256::Float temp = SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
transposedPrims[0].attrib[vertexClipCullSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
- pBase += sizeof(typename SIMD_T::Float);
+ pBase += sizeof(Float<SIMD_T>);
}
}
for (uint32_t c = 0; c < 4; ++c)
{
- SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
+ SIMD256::Float temp = SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
transposedPrims[0].attrib[vertexClipCullSlot + 1][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
- pBase += sizeof(typename SIMD_T::Float);
+ pBase += sizeof(Float<SIMD_T>);
}
}
const uint32_t primMask = primMaskMap[numEmittedPrims];
- const typename SIMD_T::Integer primID = SIMD_T::set1_epi32(pPrimitiveId[inputPrim]);
- const typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(pViewportIdx[inputPrim]);
- const typename SIMD_T::Integer rtIdx = SIMD_T::set1_epi32(pRtIdx[inputPrim]);
+ const Integer<SIMD_T> primID = SIMD_T::set1_epi32(pPrimitiveId[inputPrim]);
+ const Integer<SIMD_T> viewportIdx = SIMD_T::set1_epi32(pViewportIdx[inputPrim]);
+ const Integer<SIMD_T> rtIdx = SIMD_T::set1_epi32(pRtIdx[inputPrim]);
while (clipPA.GetNextStreamOutput())
{
do
{
- typename SIMD_T::Vec4 attrib[NumVertsPerPrim];
+ Vec4<SIMD_T> attrib[NumVertsPerPrim];
bool assemble = clipPA.Assemble(VERTEX_POSITION_SLOT, attrib);
UPDATE_STAT_FE(CPrimitives, numClippedPrims);
}
- void ExecuteStage(PA_STATE &pa, typename SIMD_T::Vec4 prim[], uint32_t primMask,
- typename SIMD_T::Integer const &primId, typename SIMD_T::Integer const &viewportIdx, typename SIMD_T::Integer const &rtIdx)
+ void ExecuteStage(PA_STATE &pa, Vec4<SIMD_T> prim[], uint32_t primMask,
+ Integer<SIMD_T> const &primId, Integer<SIMD_T> const &viewportIdx, Integer<SIMD_T> const &rtIdx)
{
SWR_ASSERT(pa.pDC != nullptr);
}
// cull prims outside view frustum
- typename SIMD_T::Float clipIntersection = ComputeClipCodeIntersection();
+ Float<SIMD_T> clipIntersection = ComputeClipCodeIntersection();
int validMask = primMask & SimdHelper<SIMD_T>::cmpeq_ps_mask(clipIntersection, SIMD_T::setzero_ps());
// skip clipping for points
}
private:
- typename SIMD_T::Float ComputeInterpFactor(typename SIMD_T::Float const &boundaryCoord0, typename SIMD_T::Float const &boundaryCoord1)
+ Float<SIMD_T> ComputeInterpFactor(Float<SIMD_T> const &boundaryCoord0, Float<SIMD_T> const &boundaryCoord1)
{
return SIMD_T::div_ps(boundaryCoord0, SIMD_T::sub_ps(boundaryCoord0, boundaryCoord1));
}
- typename SIMD_T::Integer ComputeOffsets(uint32_t attrib, typename SIMD_T::Integer const &vIndices, uint32_t component)
+ Integer<SIMD_T> ComputeOffsets(uint32_t attrib, Integer<SIMD_T> const &vIndices, uint32_t component)
{
const uint32_t simdVertexStride = sizeof(SIMDVERTEX_T<SIMD_T>);
- const uint32_t componentStride = sizeof(typename SIMD_T::Float);
- const uint32_t attribStride = sizeof(typename SIMD_T::Vec4);
+ const uint32_t componentStride = sizeof(Float<SIMD_T>);
+ const uint32_t attribStride = sizeof(Vec4<SIMD_T>);
static const OSALIGNSIMD16(uint32_t) elemOffset[16] =
{
15 * sizeof(float),
};
- static_assert(sizeof(typename SIMD_T::Integer) <= sizeof(elemOffset), "Clipper::ComputeOffsets, Increase number of element offsets.");
+ static_assert(sizeof(Integer<SIMD_T>) <= sizeof(elemOffset), "Clipper::ComputeOffsets, Increase number of element offsets.");
- typename SIMD_T::Integer vElemOffset = SIMD_T::loadu_si(reinterpret_cast<const typename SIMD_T::Integer *>(elemOffset));
+ Integer<SIMD_T> vElemOffset = SIMD_T::loadu_si(reinterpret_cast<const Integer<SIMD_T> *>(elemOffset));
// step to the simdvertex
- typename SIMD_T::Integer vOffsets = SIMD_T::mullo_epi32(vIndices, SIMD_T::set1_epi32(simdVertexStride));
+ Integer<SIMD_T> vOffsets = SIMD_T::mullo_epi32(vIndices, SIMD_T::set1_epi32(simdVertexStride));
// step to the attribute and component
vOffsets = SIMD_T::add_epi32(vOffsets, SIMD_T::set1_epi32(attribStride * attrib + componentStride * component));
return vOffsets;
}
- typename SIMD_T::Float GatherComponent(const float* pBuffer, uint32_t attrib, typename SIMD_T::Float const &vMask, typename SIMD_T::Integer const &vIndices, uint32_t component)
+ Float<SIMD_T> GatherComponent(const float* pBuffer, uint32_t attrib, Float<SIMD_T> const &vMask, Integer<SIMD_T> const &vIndices, uint32_t component)
{
- typename SIMD_T::Integer vOffsets = ComputeOffsets(attrib, vIndices, component);
- typename SIMD_T::Float vSrc = SIMD_T::setzero_ps();
+ Integer<SIMD_T> vOffsets = ComputeOffsets(attrib, vIndices, component);
+ Float<SIMD_T> vSrc = SIMD_T::setzero_ps();
- return SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(vSrc, pBuffer, vOffsets, vMask);
+ return SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(vSrc, pBuffer, vOffsets, vMask);
}
- void ScatterComponent(const float* pBuffer, uint32_t attrib, typename SIMD_T::Float const &vMask, typename SIMD_T::Integer const &vIndices, uint32_t component, typename SIMD_T::Float const &vSrc)
+ void ScatterComponent(const float* pBuffer, uint32_t attrib, Float<SIMD_T> const &vMask, Integer<SIMD_T> const &vIndices, uint32_t component, Float<SIMD_T> const &vSrc)
{
- typename SIMD_T::Integer vOffsets = ComputeOffsets(attrib, vIndices, component);
+ Integer<SIMD_T> vOffsets = ComputeOffsets(attrib, vIndices, component);
const uint32_t *pOffsets = reinterpret_cast<const uint32_t *>(&vOffsets);
const float *pSrc = reinterpret_cast<const float *>(&vSrc);
template<SWR_CLIPCODES ClippingPlane>
void intersect(
- const typename SIMD_T::Float &vActiveMask, // active lanes to operate on
- const typename SIMD_T::Integer &s, // index to first edge vertex v0 in pInPts.
- const typename SIMD_T::Integer &p, // index to second edge vertex v1 in pInPts.
- const typename SIMD_T::Vec4 &v1, // vertex 0 position
- const typename SIMD_T::Vec4 &v2, // vertex 1 position
- typename SIMD_T::Integer &outIndex, // output index.
+ const Float<SIMD_T> &vActiveMask, // active lanes to operate on
+ const Integer<SIMD_T> &s, // index to first edge vertex v0 in pInPts.
+ const Integer<SIMD_T> &p, // index to second edge vertex v1 in pInPts.
+ const Vec4<SIMD_T> &v1, // vertex 0 position
+ const Vec4<SIMD_T> &v2, // vertex 1 position
+ Integer<SIMD_T> &outIndex, // output index.
const float *pInVerts, // array of all the input positions.
uint32_t numInAttribs, // number of attributes per vertex.
float *pOutVerts) // array of output positions. We'll write our new intersection point at i*4.
uint32_t vertexClipCullOffset = this->state.backendState.vertexClipCullOffset;
// compute interpolation factor
- typename SIMD_T::Float t;
+ Float<SIMD_T> t;
switch (ClippingPlane)
{
case FRUSTUM_LEFT: t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[0]), SIMD_T::add_ps(v2[3], v2[0])); break;
// interpolate position and store
for (uint32_t c = 0; c < 4; ++c)
{
- typename SIMD_T::Float vOutPos = SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2[c], v1[c]), t, v1[c]);
+ Float<SIMD_T> vOutPos = SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2[c], v1[c]), t, v1[c]);
ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
}
uint32_t attribSlot = vertexAttribOffset + a;
for (uint32_t c = 0; c < 4; ++c)
{
- typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
- typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
- typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
+ Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
+ Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
+ Float<SIMD_T> vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
}
}
uint32_t attribSlot = vertexClipCullOffset;
for (uint32_t c = 0; c < 4; ++c)
{
- typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
- typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
- typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
+ Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
+ Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
+ Float<SIMD_T> vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
}
}
uint32_t attribSlot = vertexClipCullOffset + 1;
for (uint32_t c = 0; c < 4; ++c)
{
- typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
- typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
- typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
+ Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
+ Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
+ Float<SIMD_T> vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
}
}
}
template<SWR_CLIPCODES ClippingPlane>
- typename SIMD_T::Float inside(const typename SIMD_T::Vec4 &v)
+ Float<SIMD_T> inside(const Vec4<SIMD_T> &v)
{
switch (ClippingPlane)
{
}
template<SWR_CLIPCODES ClippingPlane>
- typename SIMD_T::Integer ClipTriToPlane(const float *pInVerts, const typename SIMD_T::Integer &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
+ Integer<SIMD_T> ClipTriToPlane(const float *pInVerts, const Integer<SIMD_T> &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
{
uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
- typename SIMD_T::Integer vCurIndex = SIMD_T::setzero_si();
- typename SIMD_T::Integer vOutIndex = SIMD_T::setzero_si();
- typename SIMD_T::Float vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
+ Integer<SIMD_T> vCurIndex = SIMD_T::setzero_si();
+ Integer<SIMD_T> vOutIndex = SIMD_T::setzero_si();
+ Float<SIMD_T> vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
while (!SIMD_T::testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
{
- typename SIMD_T::Integer s = vCurIndex;
- typename SIMD_T::Integer p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
- typename SIMD_T::Integer underFlowMask = SIMD_T::cmpgt_epi32(vNumInPts, p);
+ Integer<SIMD_T> s = vCurIndex;
+ Integer<SIMD_T> p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
+ Integer<SIMD_T> underFlowMask = SIMD_T::cmpgt_epi32(vNumInPts, p);
p = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p), SIMD_T::castsi_ps(underFlowMask)));
// gather position
- typename SIMD_T::Vec4 vInPos0, vInPos1;
+ Vec4<SIMD_T> vInPos0, vInPos1;
for (uint32_t c = 0; c < 4; ++c)
{
vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
}
// compute inside mask
- typename SIMD_T::Float s_in = inside<ClippingPlane>(vInPos0);
- typename SIMD_T::Float p_in = inside<ClippingPlane>(vInPos1);
+ Float<SIMD_T> s_in = inside<ClippingPlane>(vInPos0);
+ Float<SIMD_T> p_in = inside<ClippingPlane>(vInPos1);
// compute intersection mask (s_in != p_in)
- typename SIMD_T::Float intersectMask = SIMD_T::xor_ps(s_in, p_in);
+ Float<SIMD_T> intersectMask = SIMD_T::xor_ps(s_in, p_in);
intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
// store s if inside
uint32_t attribSlot = vertexAttribOffset + a;
for (uint32_t c = 0; c < 4; ++c)
{
- typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
+ Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
}
}
uint32_t attribSlot = vertexClipCullSlot;
for (uint32_t c = 0; c < 4; ++c)
{
- typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
+ Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
}
}
uint32_t attribSlot = vertexClipCullSlot + 1;
for (uint32_t c = 0; c < 4; ++c)
{
- typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
+ Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
}
}
}
template<SWR_CLIPCODES ClippingPlane>
- typename SIMD_T::Integer ClipLineToPlane(const float *pInVerts, const typename SIMD_T::Integer &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
+ Integer<SIMD_T> ClipLineToPlane(const float *pInVerts, const Integer<SIMD_T> &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
{
uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
- typename SIMD_T::Integer vCurIndex = SIMD_T::setzero_si();
- typename SIMD_T::Integer vOutIndex = SIMD_T::setzero_si();
- typename SIMD_T::Float vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
+ Integer<SIMD_T> vCurIndex = SIMD_T::setzero_si();
+ Integer<SIMD_T> vOutIndex = SIMD_T::setzero_si();
+ Float<SIMD_T> vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
if (!SIMD_T::testz_ps(vActiveMask, vActiveMask))
{
- typename SIMD_T::Integer s = vCurIndex;
- typename SIMD_T::Integer p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
+ Integer<SIMD_T> s = vCurIndex;
+ Integer<SIMD_T> p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
// gather position
- typename SIMD_T::Vec4 vInPos0, vInPos1;
+ Vec4<SIMD_T> vInPos0, vInPos1;
for (uint32_t c = 0; c < 4; ++c)
{
vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
}
// compute inside mask
- typename SIMD_T::Float s_in = inside<ClippingPlane>(vInPos0);
- typename SIMD_T::Float p_in = inside<ClippingPlane>(vInPos1);
+ Float<SIMD_T> s_in = inside<ClippingPlane>(vInPos0);
+ Float<SIMD_T> p_in = inside<ClippingPlane>(vInPos1);
// compute intersection mask (s_in != p_in)
- typename SIMD_T::Float intersectMask = SIMD_T::xor_ps(s_in, p_in);
+ Float<SIMD_T> intersectMask = SIMD_T::xor_ps(s_in, p_in);
intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
// store s if inside
uint32_t attribSlot = vertexAttribOffset + a;
for (uint32_t c = 0; c < 4; ++c)
{
- typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
+ Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
}
}
uint32_t attribSlot = vertexAttribOffset + a;
for (uint32_t c = 0; c < 4; ++c)
{
- typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
+ Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
}
}
return vOutIndex;
}
- typename SIMD_T::Integer ClipPrims(float *pVertices, const typename SIMD_T::Float &vPrimMask, const typename SIMD_T::Float &vClipMask, int numAttribs)
+ Integer<SIMD_T> ClipPrims(float *pVertices, const Float<SIMD_T> &vPrimMask, const Float<SIMD_T> &vClipMask, int numAttribs)
{
// temp storage
float *pTempVerts = reinterpret_cast<float *>(ClipHelper<SIMD_T>::GetTempVertices());
// zero out num input verts for non-active lanes
- typename SIMD_T::Integer vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrim);
+ Integer<SIMD_T> vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrim);
vNumInPts = SIMD_T::blendv_epi32(SIMD_T::setzero_si(), vNumInPts, vClipMask);
// clip prims to frustum
- typename SIMD_T::Integer vNumOutPts;
+ Integer<SIMD_T> vNumOutPts;
if (NumVertsPerPrim == 3)
{
vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
}
// restore num verts for non-clipped, active lanes
- typename SIMD_T::Float vNonClippedMask = SIMD_T::andnot_ps(vClipMask, vPrimMask);
+ Float<SIMD_T> vNonClippedMask = SIMD_T::andnot_ps(vClipMask, vPrimMask);
vNumOutPts = SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrim), vNonClippedMask);
return vNumOutPts;
const uint32_t workerId{ 0 };
DRAW_CONTEXT *pDC{ nullptr };
const API_STATE &state;
- typename SIMD_T::Float clipCodes[NumVertsPerPrim];
+ Float<SIMD_T> clipCodes[NumVertsPerPrim];
};