inputSlot = backendState.vertexAttribOffset + i;
}
- __m128 attrib[3]; // triangle attribs (always 4 wide)
+ simd4scalar attrib[3]; // triangle attribs (always 4 wide)
float* pAttribStart = pBuffer;
if (HasConstantInterpT::value || IsDegenerate::value)
for (uint32_t i = 0; i < NumVertsT::value; ++i)
{
- _mm_store_ps(pBuffer, attrib[vid]);
+ SIMD128::store_ps(pBuffer, attrib[vid]);
pBuffer += 4;
}
}
for (uint32_t i = 0; i < NumVertsT::value; ++i)
{
- _mm_store_ps(pBuffer, attrib[i]);
+ SIMD128::store_ps(pBuffer, attrib[i]);
pBuffer += 4;
}
}
for (uint32_t i = 0; i < NumVertsT::value; ++i)
{
- _mm_store_ps(pBuffer, attrib[i]);
+ SIMD128::store_ps(pBuffer, attrib[i]);
pBuffer += 4;
}
}
// effect of the missing vertices in the triangle interpolation.
for (uint32_t v = NumVertsT::value; v < 3; ++v)
{
- _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]);
+ SIMD128::store_ps(pBuffer, attrib[NumVertsT::value - 1]);
pBuffer += 4;
}
{
static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
simd16scalari &scisXmin, simd16scalari &scisYmin,
- simd16scalari &scisXmax, simd16scalari &scisYmax)
- {
+ simd16scalari &scisXmax, simd16scalari &scisYmax) {
scisXmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
pScissorsInFixedPoint[pViewportIndex[1]].xmin,
pScissorsInFixedPoint[pViewportIndex[2]].xmin,
uint32_t clipAttribSlot = clipSlot == 0 ?
VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
- __m128 primClipDist[3];
+ simd4scalar primClipDist[3];
pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
float vertClipDist[NumVerts];
for (uint32_t e = 0; e < NumVerts; ++e)
{
OSALIGNSIMD(float) aVertClipDist[4];
- _mm_store_ps(aVertClipDist, primClipDist[e]);
+ SIMD128::store_ps(aVertClipDist, primClipDist[e]);
vertClipDist[e] = aVertClipDist[clipComp];
};
(SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
}
+ simdBBox bbox;
+
if (!triMask)
{
goto endBinTriangles;
}
// Calc bounding box of triangles
- simdBBox bbox;
calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
// determine if triangle falls between pixel centers and discard
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
// Gather the AOS effective scissor rects based on the per-prim VP index.
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
- simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
- if (state.backendState.readViewportArrayIndex)
- {
- GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
- scisXmin, scisYmin, scisXmax, scisYmax);
- }
- else // broadcast fast path for non-VPAI case.
{
- scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
- scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
- scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
- scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
- }
+ simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
+ if (state.backendState.readViewportArrayIndex)
+ {
+ GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+ scisXmin, scisYmin, scisXmax, scisYmax);
+ }
+ else // broadcast fast path for non-VPAI case.
+ {
+ scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+ scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+ scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+ scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+ }
- // Make triangle bbox inclusive
- bbox.xmax = _simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1));
- bbox.ymax = _simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1));
+ // Make triangle bbox inclusive
+ bbox.xmax = _simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1));
+ bbox.ymax = _simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1));
- bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
- bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
- bbox.xmax = _simd_min_epi32(bbox.xmax, scisXmax);
- bbox.ymax = _simd_min_epi32(bbox.ymax, scisYmax);
+ bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
+ bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
+ bbox.xmax = _simd_min_epi32(bbox.xmax, scisXmax);
+ bbox.ymax = _simd_min_epi32(bbox.ymax, scisYmax);
+ }
if (CT::IsConservativeT::value)
{
// transpose verts needed for backend
/// @todo modify BE to take non-transformed verts
- __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
+ simd4scalar vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
// store triangle vertex data
desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
- _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
- _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
- _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
- _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
+ SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
+ SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
+ SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
+ SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
// store user clip distances
if (rastState.clipDistanceMask)
#if USE_SIMD16_FRONTEND
template <typename CT>
-void SIMDAPI BinTriangles_simd16(
+void SIMDCALL BinTriangles_simd16(
DRAW_CONTEXT *pDC,
PA_STATE& pa,
uint32_t workerId,
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
// Gather the AOS effective scissor rects based on the per-prim VP index.
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
- simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
-
- if (state.backendState.readViewportArrayIndex)
- {
- GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
- scisXmin, scisYmin, scisXmax, scisYmax);
- }
- else // broadcast fast path for non-VPAI case.
{
- scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
- scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
- scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
- scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
- }
+ simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
+
+ if (state.backendState.readViewportArrayIndex)
+ {
+ GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+ scisXmin, scisYmin, scisXmax, scisYmax);
+ }
+ else // broadcast fast path for non-VPAI case.
+ {
+ scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+ scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+ scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+ scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+ }
- // Make triangle bbox inclusive
- bbox.xmax = _simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1));
- bbox.ymax = _simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1));
+ // Make triangle bbox inclusive
+ bbox.xmax = _simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1));
+ bbox.ymax = _simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1));
- bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
- bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
- bbox.xmax = _simd16_min_epi32(bbox.xmax, scisXmax);
- bbox.ymax = _simd16_min_epi32(bbox.ymax, scisYmax);
+ bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
+ bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
+ bbox.xmax = _simd16_min_epi32(bbox.xmax, scisXmax);
+ bbox.ymax = _simd16_min_epi32(bbox.ymax, scisYmax);
+ }
if (CT::IsConservativeT::value)
{
// transpose verts needed for backend
/// @todo modify BE to take non-transformed verts
- __m128 vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
- __m128 vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
- __m128 vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
- __m128 vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+ simd4scalar vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+ simd4scalar vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+ simd4scalar vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+ simd4scalar vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
vTranspose3x8(vHorizX[0], _simd16_extract_ps(tri[0].x, 0), _simd16_extract_ps(tri[1].x, 0), _simd16_extract_ps(tri[2].x, 0));
vTranspose3x8(vHorizY[0], _simd16_extract_ps(tri[0].y, 0), _simd16_extract_ps(tri[1].y, 0), _simd16_extract_ps(tri[2].y, 0));
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
// Gather the AOS effective scissor rects based on the per-prim VP index.
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
- simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
- if (state.backendState.readViewportArrayIndex)
- {
- GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
- scisXmin, scisYmin, scisXmax, scisYmax);
- }
- else // broadcast fast path for non-VPAI case.
{
- scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
- scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
- scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
- scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
- }
+ simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
+ if (state.backendState.readViewportArrayIndex)
+ {
+ GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+ scisXmin, scisYmin, scisXmax, scisYmax);
+ }
+ else // broadcast fast path for non-VPAI case.
+ {
+ scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+ scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+ scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+ scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+ }
- bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
- bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
- bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
- bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
+ bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
+ bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
+ bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
+ bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
+ }
// Cull bloated points completely outside scissor
simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
// Gather the AOS effective scissor rects based on the per-prim VP index.
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
- simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
- if (state.backendState.readViewportArrayIndex)
{
- GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
- scisXmin, scisYmin, scisXmax, scisYmax);
- }
- else // broadcast fast path for non-VPAI case.
- {
- scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
- scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
- scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
- scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
- }
+ simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
+ if (state.backendState.readViewportArrayIndex)
+ {
+ GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+ scisXmin, scisYmin, scisXmax, scisYmax);
+ }
+ else // broadcast fast path for non-VPAI case.
+ {
+ scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+ scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+ scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+ scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+ }
- bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
- bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
- bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
- bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
+ bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
+ bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
+ bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
+ bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
+ }
// Cull bloated points completely outside scissor
simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax);
AR_END(FEBinPoints, 1);
}
-void SIMDAPI BinPoints_simd16(
+void SIMDCALL BinPoints_simd16(
DRAW_CONTEXT *pDC,
PA_STATE& pa,
uint32_t workerId,
simdscalar& vRecipW0 = recipW[0];
simdscalar& vRecipW1 = recipW[1];
+ simd4scalar vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
+
// convert to fixed point
simdscalari vXi[2], vYi[2];
vXi[0] = fpToFixedPointVertical(prim[0].x);
bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
- simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
- if (state.backendState.readViewportArrayIndex)
- {
- GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
- scisXmin, scisYmin, scisXmax, scisYmax);
- }
- else // broadcast fast path for non-VPAI case.
{
- scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
- scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
- scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
- scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
- }
+ simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
+ if (state.backendState.readViewportArrayIndex)
+ {
+ GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+ scisXmin, scisYmin, scisXmax, scisYmax);
+ }
+ else // broadcast fast path for non-VPAI case.
+ {
+ scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+ scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+ scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+ scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+ }
- bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
- bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
- bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
- bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
+ bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
+ bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
+ bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
+ bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
+ }
// Cull prims completely outside scissor
{
// transpose verts needed for backend
/// @todo modify BE to take non-transformed verts
- __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
// store line vertex data
desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
- _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
- _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
- _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
- _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
+ SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
+ SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
+ SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
+ SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
// store user clip distances
if (rastState.clipDistanceMask)
bbox.ymax = _simd16_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
- simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
-
- if (state.backendState.readViewportArrayIndex)
- {
- GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
- scisXmin, scisYmin, scisXmax, scisYmax);
- }
- else // broadcast fast path for non-VPAI case.
{
- scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
- scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
- scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
- scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
- }
+ simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
+
+ if (state.backendState.readViewportArrayIndex)
+ {
+ GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+ scisXmin, scisYmin, scisXmax, scisYmax);
+ }
+ else // broadcast fast path for non-VPAI case.
+ {
+ scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+ scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+ scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+ scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+ }
- bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
- bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
- bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
- bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
+ bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
+ bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
+ bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
+ bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
+ }
// Cull prims completely outside scissor
{
// transpose verts needed for backend
/// @todo modify BE to take non-transformed verts
- __m128 vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
- __m128 vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
- __m128 vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
- __m128 vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+ simd4scalar vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+ simd4scalar vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+ simd4scalar vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+ simd4scalar vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
vTranspose3x8(vHorizX[0], _simd16_extract_ps(prim[0].x, 0), _simd16_extract_ps(prim[1].x, 0), unused);
vTranspose3x8(vHorizY[0], _simd16_extract_ps(prim[0].y, 0), _simd16_extract_ps(prim[1].y, 0), unused);
}
#if USE_SIMD16_FRONTEND
-void SIMDAPI BinLines_simd16(
+void SIMDCALL BinLines_simd16(
DRAW_CONTEXT *pDC,
PA_STATE& pa,
uint32_t workerId,