Implement widened clipper and binner interfaces for SIMD16.
Reviewed-by: George Kyriazis <george.kyriazis@intel.com>
}
PFN_PROCESS_PRIMS pfnBinner;
+#if USE_SIMD16_FRONTEND
+ PFN_PROCESS_PRIMS_SIMD16 pfnBinner_simd16;
+#endif
switch (pState->state.topology)
{
case TOP_POINT_LIST:
pState->pfnProcessPrims = ClipPoints;
pfnBinner = BinPoints;
+#if USE_SIMD16_FRONTEND
+ pState->pfnProcessPrims_simd16 = ClipPoints_simd16;
+ pfnBinner_simd16 = BinPoints_simd16;
+#endif
break;
case TOP_LINE_LIST:
case TOP_LINE_STRIP:
case TOP_LISTSTRIP_ADJ:
pState->pfnProcessPrims = ClipLines;
pfnBinner = BinLines;
+#if USE_SIMD16_FRONTEND
+ pState->pfnProcessPrims_simd16 = ClipLines_simd16;
+ pfnBinner_simd16 = BinLines_simd16;
+#endif
break;
default:
pState->pfnProcessPrims = ClipTriangles;
pfnBinner = GetBinTrianglesFunc((rastState.conservativeRast > 0));
+#if USE_SIMD16_FRONTEND
+ pState->pfnProcessPrims_simd16 = ClipTriangles_simd16;
+ pfnBinner_simd16 = GetBinTrianglesFunc_simd16((rastState.conservativeRast > 0));
+#endif
break;
};
if (pState->state.frontendState.vpTransformDisable)
{
pState->pfnProcessPrims = pfnBinner;
+#if USE_SIMD16_FRONTEND
+ pState->pfnProcessPrims_simd16 = pfnBinner_simd16;
+#endif
}
if ((pState->state.psState.pfnPixelShader == nullptr) &&
(pState->state.backendState.numAttributes == 0))
{
pState->pfnProcessPrims = nullptr;
+#if USE_SIMD16_FRONTEND
+ pState->pfnProcessPrims_simd16 = nullptr;
+#endif
}
if (pState->state.soState.rasterizerDisable == true)
{
pState->pfnProcessPrims = nullptr;
+#if USE_SIMD16_FRONTEND
+ pState->pfnProcessPrims_simd16 = nullptr;
+#endif
}
AR_END(FEBinTriangles, 1);
}
+#if USE_SIMD16_FRONTEND
+inline uint32_t GetPrimMaskLo(uint32_t primMask) // Returns the low 8 bits (bits 0-7) of primMask.
+{
+ return primMask & 255;
+}
+// Returns bits 8-15 of primMask, shifted down into bits 0-7; all higher bits are discarded.
+inline uint32_t GetPrimMaskHi(uint32_t primMask)
+{
+ return (primMask >> 8) & 255;
+}
+// SIMD16 wrapper around BinTriangles<CT>: feeds the simd16 inputs to the narrower binner in up to two passes (extract index 0, then 1).
+template <typename CT>
+void BinTriangles_simd16(
+ DRAW_CONTEXT *pDC,
+ PA_STATE& pa,
+ uint32_t workerId,
+ simd16vector tri[3], // three vertices; components 0-3 of each are read below
+ uint32_t triMask, // bits 0-7 are passed to the first pass, bits 8-15 to the second
+ simd16scalari primID,
+ simd16scalari viewportIdx)
+{
+ enum { VERTS_PER_PRIM = 3 };
+
+ simdvector verts[VERTS_PER_PRIM]; // narrow scratch copy, refilled before each pass
+
+ for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1)
+ {
+ for (uint32_t j = 0; j < 4; j += 1)
+ {
+ verts[i][j] = _simd16_extract_ps(tri[i][j], 0); // index 0: presumably the lower half of the simd16 register - TODO confirm
+ }
+ }
+ // First pass always runs, with pa.useAlternateOffset cleared and the low mask byte.
+ pa.useAlternateOffset = false;
+ BinTriangles<CT>(pDC, pa, workerId, verts, GetPrimMaskLo(triMask), _simd16_extract_si(primID, 0), _simd16_extract_si(viewportIdx, 0));
+ // Second pass is skipped entirely when no bit in 8-15 of triMask is set.
+ if (GetPrimMaskHi(triMask))
+ {
+ for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1)
+ {
+ for (uint32_t j = 0; j < 4; j += 1)
+ {
+ verts[i][j] = _simd16_extract_ps(tri[i][j], 1); // index 1: presumably the upper half - TODO confirm
+ }
+ }
+ // pa.useAlternateOffset is left set to true when this function returns via this branch.
+ pa.useAlternateOffset = true;
+ BinTriangles<CT>(pDC, pa, workerId, verts, GetPrimMaskHi(triMask), _simd16_extract_si(primID, 1), _simd16_extract_si(viewportIdx, 1));
+ }
+}
+
+#endif
struct FEBinTrianglesChooser
{
typedef PFN_PROCESS_PRIMS FuncType;
return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
}
+#if USE_SIMD16_FRONTEND
+struct FEBinTrianglesChooser_simd16 // Chooser type handed to TemplateArgUnroller to pick a BinTriangles_simd16 instantiation.
+{
+ typedef PFN_PROCESS_PRIMS_SIMD16 FuncType; // function-pointer type returned by GetFunc
+ // Returns BinTriangles_simd16 instantiated with ConservativeRastFETraits<ArgsB...>.
+ template <typename... ArgsB>
+ static FuncType GetFunc()
+ {
+ return BinTriangles_simd16<ConservativeRastFETraits<ArgsB...>>;
+ }
+};
+
+// Selector for the correct templated BinTriangles_simd16 function; IsConservative is forwarded to TemplateArgUnroller.
+PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative)
+{
+ return TemplateArgUnroller<FEBinTrianglesChooser_simd16>::GetFunc(IsConservative);
+}
+
+#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD points to the backend. Only supports point size of 1
AR_END(FEBinPoints, 1);
}
+#if USE_SIMD16_FRONTEND
+void BinPoints_simd16( // SIMD16 wrapper around BinPoints: up to two passes (extract index 0, then 1) over the narrower binner.
+ DRAW_CONTEXT *pDC,
+ PA_STATE& pa,
+ uint32_t workerId,
+ simd16vector prim[3], // declared with 3 entries, but only prim[0] is read (VERTS_PER_PRIM is 1)
+ uint32_t primMask, // bits 0-7 are passed to the first pass, bits 8-15 to the second
+ simd16scalari primID,
+ simd16scalari viewportIdx)
+{
+ enum { VERTS_PER_PRIM = 1 };
+
+ simdvector verts[VERTS_PER_PRIM]; // narrow scratch copy, refilled before each pass
+
+ for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1)
+ {
+ for (uint32_t j = 0; j < 4; j += 1)
+ {
+ verts[i][j] = _simd16_extract_ps(prim[i][j], 0); // index 0: presumably the lower half of the simd16 register - TODO confirm
+ }
+ }
+ // First pass always runs, with pa.useAlternateOffset cleared and the low mask byte.
+ pa.useAlternateOffset = false;
+ BinPoints(pDC, pa, workerId, verts, GetPrimMaskLo(primMask), _simd16_extract_si(primID, 0), _simd16_extract_si(viewportIdx, 0));
+ // Second pass is skipped entirely when no bit in 8-15 of primMask is set.
+ if (GetPrimMaskHi(primMask))
+ {
+ for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1)
+ {
+ for (uint32_t j = 0; j < 4; j += 1)
+ {
+ verts[i][j] = _simd16_extract_ps(prim[i][j], 1); // index 1: presumably the upper half - TODO confirm
+ }
+ }
+ // pa.useAlternateOffset is left set to true when this function returns via this branch.
+ pa.useAlternateOffset = true;
+ BinPoints(pDC, pa, workerId, verts, GetPrimMaskHi(primMask), _simd16_extract_si(primID, 1), _simd16_extract_si(viewportIdx, 1));
+ }
+}
+
+#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD lines to the backend.
/// @param pDC - pointer to draw context.
primID,
viewportIdx);
}
+
+#if USE_SIMD16_FRONTEND
+void BinLines_simd16( // SIMD16 wrapper around BinLines: up to two passes (extract index 0, then 1) over the narrower binner.
+ DRAW_CONTEXT *pDC,
+ PA_STATE& pa,
+ uint32_t workerId,
+ simd16vector prim[3], // declared with 3 entries, but only prim[0] and prim[1] are read (VERTS_PER_PRIM is 2)
+ uint32_t primMask, // bits 0-7 are passed to the first pass, bits 8-15 to the second
+ simd16scalari primID,
+ simd16scalari viewportIdx)
+{
+ enum { VERTS_PER_PRIM = 2 };
+
+ simdvector verts[VERTS_PER_PRIM]; // narrow scratch copy, refilled before each pass
+
+ for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1)
+ {
+ for (uint32_t j = 0; j < 4; j += 1)
+ {
+ verts[i][j] = _simd16_extract_ps(prim[i][j], 0); // index 0: presumably the lower half of the simd16 register - TODO confirm
+ }
+ }
+ // First pass always runs, with pa.useAlternateOffset cleared and the low mask byte.
+ pa.useAlternateOffset = false;
+ BinLines(pDC, pa, workerId, verts, GetPrimMaskLo(primMask), _simd16_extract_si(primID, 0), _simd16_extract_si(viewportIdx, 0));
+ // Second pass is skipped entirely when no bit in 8-15 of primMask is set.
+ if (GetPrimMaskHi(primMask))
+ {
+ for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1)
+ {
+ for (uint32_t j = 0; j < 4; j += 1)
+ {
+ verts[i][j] = _simd16_extract_ps(prim[i][j], 1); // index 1: presumably the upper half - TODO confirm
+ }
+ }
+ // pa.useAlternateOffset is left set to true when this function returns via this branch.
+ pa.useAlternateOffset = true;
+ BinLines(pDC, pa, workerId, verts, GetPrimMaskHi(primMask), _simd16_extract_si(primID, 1), _simd16_extract_si(viewportIdx, 1));
+ }
+}
+
+#endif
clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx);
AR_END(FEClipLines, 1);
}
+
void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx)
{
SWR_CONTEXT *pContext = pDC->pContext;
AR_END(FEClipPoints, 1);
}
+#if USE_SIMD16_FRONTEND
+inline uint32_t GetPrimMaskLo(uint32_t primMask) // Returns the low 8 bits (bits 0-7) of primMask.
+{
+ return primMask & 255;
+}
+// Returns bits 8-15 of primMask, shifted down into bits 0-7; all higher bits are discarded.
+inline uint32_t GetPrimMaskHi(uint32_t primMask)
+{
+ return (primMask >> 8) & 255;
+}
+// SIMD16 triangle clip entry point: runs one Clipper<3> over the simd16 inputs in up to two passes (extract index 0, then 1).
+void ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx)
+{
+ SWR_CONTEXT *pContext = pDC->pContext; // not referenced directly below; presumably consumed by the AR_BEGIN/AR_END macros - TODO confirm
+ AR_BEGIN(FEClipTriangles, pDC->drawId);
+
+ enum { VERTS_PER_PRIM = 3 };
+ // The same clipper instance is reused for both passes.
+ Clipper<VERTS_PER_PRIM> clipper(workerId, pDC);
+
+ simdvector verts[VERTS_PER_PRIM]; // narrow scratch copy, refilled before each pass
+
+ for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1)
+ {
+ for (uint32_t j = 0; j < 4; j += 1)
+ {
+ verts[i][j] = _simd16_extract_ps(prims[i][j], 0); // index 0: presumably the lower half of the simd16 register - TODO confirm
+ }
+ }
+ // First pass always runs, with pa.useAlternateOffset cleared and the low mask byte.
+ pa.useAlternateOffset = false;
+ clipper.ExecuteStage(pa, verts, GetPrimMaskLo(primMask), _simd16_extract_si(primId, 0), _simd16_extract_si(viewportIdx, 0));
+ // Second pass is skipped entirely when no bit in 8-15 of primMask is set.
+ if (GetPrimMaskHi(primMask))
+ {
+ for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1)
+ {
+ for (uint32_t j = 0; j < 4; j += 1)
+ {
+ verts[i][j] = _simd16_extract_ps(prims[i][j], 1); // index 1: presumably the upper half - TODO confirm
+ }
+ }
+ // pa.useAlternateOffset is left set to true when this function returns via this branch.
+ pa.useAlternateOffset = true;
+ clipper.ExecuteStage(pa, verts, GetPrimMaskHi(primMask), _simd16_extract_si(primId, 1), _simd16_extract_si(viewportIdx, 1));
+ }
+
+ AR_END(FEClipTriangles, 1);
+}
+// SIMD16 line clip entry point: runs one Clipper<2> over the simd16 inputs in up to two passes (extract index 0, then 1).
+void ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx)
+{
+ SWR_CONTEXT *pContext = pDC->pContext; // not referenced directly below; presumably consumed by the AR_BEGIN/AR_END macros - TODO confirm
+ AR_BEGIN(FEClipLines, pDC->drawId);
+
+ enum { VERTS_PER_PRIM = 2 };
+ // The same clipper instance is reused for both passes.
+ Clipper<VERTS_PER_PRIM> clipper(workerId, pDC);
+
+ simdvector verts[VERTS_PER_PRIM]; // narrow scratch copy, refilled before each pass
+
+ for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1)
+ {
+ for (uint32_t j = 0; j < 4; j += 1)
+ {
+ verts[i][j] = _simd16_extract_ps(prims[i][j], 0); // index 0: presumably the lower half of the simd16 register - TODO confirm
+ }
+ }
+ // First pass always runs, with pa.useAlternateOffset cleared and the low mask byte.
+ pa.useAlternateOffset = false;
+ clipper.ExecuteStage(pa, verts, GetPrimMaskLo(primMask), _simd16_extract_si(primId, 0), _simd16_extract_si(viewportIdx, 0));
+ // Second pass is skipped entirely when no bit in 8-15 of primMask is set.
+ if (GetPrimMaskHi(primMask))
+ {
+ for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1)
+ {
+ for (uint32_t j = 0; j < 4; j += 1)
+ {
+ verts[i][j] = _simd16_extract_ps(prims[i][j], 1); // index 1: presumably the upper half - TODO confirm
+ }
+ }
+ // pa.useAlternateOffset is left set to true when this function returns via this branch.
+ pa.useAlternateOffset = true;
+ clipper.ExecuteStage(pa, verts, GetPrimMaskHi(primMask), _simd16_extract_si(primId, 1), _simd16_extract_si(viewportIdx, 1));
+ }
+
+ AR_END(FEClipLines, 1);
+}
+// SIMD16 point clip entry point: runs one Clipper<1> over the simd16 inputs in up to two passes (extract index 0, then 1).
+void ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx)
+{
+ SWR_CONTEXT *pContext = pDC->pContext; // not referenced directly below; presumably consumed by the AR_BEGIN/AR_END macros - TODO confirm
+ AR_BEGIN(FEClipPoints, pDC->drawId);
+
+ enum { VERTS_PER_PRIM = 1 };
+ // The same clipper instance is reused for both passes.
+ Clipper<VERTS_PER_PRIM> clipper(workerId, pDC);
+
+ simdvector verts[VERTS_PER_PRIM]; // narrow scratch copy, refilled before each pass
+
+ for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1)
+ {
+ for (uint32_t j = 0; j < 4; j += 1)
+ {
+ verts[i][j] = _simd16_extract_ps(prims[i][j], 0); // index 0: presumably the lower half of the simd16 register - TODO confirm
+ }
+ }
+ // First pass always runs, with pa.useAlternateOffset cleared and the low mask byte.
+ pa.useAlternateOffset = false;
+ clipper.ExecuteStage(pa, verts, GetPrimMaskLo(primMask), _simd16_extract_si(primId, 0), _simd16_extract_si(viewportIdx, 0));
+ // Second pass is skipped entirely when no bit in 8-15 of primMask is set.
+ if (GetPrimMaskHi(primMask))
+ {
+ for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1)
+ {
+ for (uint32_t j = 0; j < 4; j += 1)
+ {
+ verts[i][j] = _simd16_extract_ps(prims[i][j], 1); // index 1: presumably the upper half - TODO confirm
+ }
+ }
+ // pa.useAlternateOffset is left set to true when this function returns via this branch.
+ pa.useAlternateOffset = true;
+ clipper.ExecuteStage(pa, verts, GetPrimMaskHi(primMask), _simd16_extract_si(primId, 1), _simd16_extract_si(viewportIdx, 1));
+ }
+
+ AR_END(FEClipPoints, 1);
+}
+
+#endif
+
void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
+#if USE_SIMD16_FRONTEND
+void ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx);
+void ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx);
+void ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx);
+#endif
+
// pipeline function pointers, filled in by API thread when setting up the draw
BACKEND_FUNCS backendFuncs;
PFN_PROCESS_PRIMS pfnProcessPrims;
+#if USE_SIMD16_FRONTEND
+ PFN_PROCESS_PRIMS_SIMD16 pfnProcessPrims_simd16;
+#endif
CachingArena* pArena; // This should only be used by API thread.
};
}
// set up new binner and state for the GS output topology
+#if USE_SIMD16_FRONTEND
+ PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
+ if (HasRastT::value)
+ {
+ switch (pState->outputTopology)
+ {
+ case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles_simd16; break;
+ case TOP_LINE_STRIP: pfnClipFunc = ClipLines_simd16; break;
+ case TOP_POINT_LIST: pfnClipFunc = ClipPoints_simd16; break;
+ default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
+ }
+ }
+
+#else
PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
if (HasRastT::value)
{
}
}
+#endif
// foreach input prim:
// - setup a new PA based on the emitted verts for that prim
// - loop over the new verts, calling PA to assemble each prim
vViewPortIdx = _simd16_set1_epi32(0);
}
- const uint32_t primMask = GenMask(gsPa.NumPrims());
- const uint32_t primMask_lo = primMask & 255;
- const uint32_t primMask_hi = (primMask >> 8) & 255;
-
- const simd16scalari primID = vPrimId;
- const simdscalari primID_lo = _simd16_extract_si(primID, 0);
- const simdscalari primID_hi = _simd16_extract_si(primID, 1);
-
- for (uint32_t i = 0; i < 3; i += 1)
- {
- for (uint32_t j = 0; j < 4; j += 1)
- {
- attrib[i][j] = _simd16_extract_ps(attrib_simd16[i][j], 0);
- }
- }
-
gsPa.useAlternateOffset = false;
- pfnClipFunc(pDC, gsPa, workerId, attrib, primMask_lo, primID_lo, _simd16_extract_si(vViewPortIdx, 0));
-
- if (primMask_hi)
- {
- for (uint32_t i = 0; i < 3; i += 1)
- {
- for (uint32_t j = 0; j < 4; j += 1)
- {
- attrib[i][j] = _simd16_extract_ps(attrib_simd16[i][j], 1);
- }
- }
-
- gsPa.useAlternateOffset = true;
- pfnClipFunc(pDC, gsPa, workerId, attrib, primMask_hi, primID_hi, _simd16_extract_si(vViewPortIdx, 1));
- }
-
+ pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
#else
simdscalari vPrimId;
// pull primitiveID from the GS output if available
}
SWR_ASSERT(tsCtx);
+#if USE_SIMD16_FRONTEND
+ PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
+ if (HasRastT::value)
+ {
+ switch (tsState.postDSTopology)
+ {
+ case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles_simd16; break;
+ case TOP_LINE_LIST: pfnClipFunc = ClipLines_simd16; break;
+ case TOP_POINT_LIST: pfnClipFunc = ClipPoints_simd16; break;
+ default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
+ }
+ }
+
+#else
PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
if (HasRastT::value)
{
}
}
+#endif
SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
hsContext.pCPout = gt_pTessellationThreadData->patchData;
hsContext.PrimitiveID = primID;
SWR_ASSERT(pfnClipFunc);
#if USE_SIMD16_FRONTEND
- for (uint32_t i = 0; i < 3; i += 1)
- {
- for (uint32_t j = 0; j < 4; j += 1)
- {
- prim[i][j] = _simd16_extract_ps(prim_simd16[i][j], 0);
- }
- }
-
tessPa.useAlternateOffset = false;
- pfnClipFunc(pDC, tessPa, workerId, prim, primMask_lo, primID_lo, _simd_set1_epi32(0));
-
- if (primMask_hi)
- {
- for (uint32_t i = 0; i < 3; i += 1)
- {
- for (uint32_t j = 0; j < 4; j += 1)
- {
- prim[i][j] = _simd16_extract_ps(prim_simd16[i][j], 1);
- }
- }
-
- tessPa.useAlternateOffset = true;
- pfnClipFunc(pDC, tessPa, workerId, prim, primMask_hi, primID_hi, _simd_set1_epi32(0));
- }
+ pfnClipFunc(pDC, tessPa, workerId, prim_simd16, primMask, primID, _simd16_set1_epi32(0));
#else
pfnClipFunc(pDC, tessPa, workerId, prim,
GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
if (HasRastT::value)
{
- SWR_ASSERT(pDC->pState->pfnProcessPrims);
-
- simdvector prim[MAX_NUM_VERTS_PER_PRIM];
-
- for (uint32_t i = 0; i < 3; i += 1)
- {
- for (uint32_t j = 0; j < 4; j += 1)
- {
- prim[i][j] = _simd16_extract_ps(prim_simd16[i][j], 0);
- }
- }
+ SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);
pa.useAlternateOffset = false;
- pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim, primMask_lo, primID_lo, _simd_setzero_si());
-
- if (primMask_hi)
- {
- for (uint32_t i = 0; i < 3; i += 1)
- {
- for (uint32_t j = 0; j < 4; j += 1)
- {
- prim[i][j] = _simd16_extract_ps(prim_simd16[i][j], 1);
- }
- }
-
- pa.useAlternateOffset = true;
- pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim, primMask_hi, primID_hi, _simd_setzero_si());
- }
+ pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, primMask, primID, _simd16_setzero_si());
}
}
}
void ProcessShutdown(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative);
+#if USE_SIMD16_FRONTEND
+PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative);
+#endif
struct PA_STATE_BASE; // forward decl
void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
+#if USE_SIMD16_FRONTEND
+void BinPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);
+void BinLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);
+#endif
simdvector a;
simdvector b;
+#if 1
+ const simd16vector &leadvert_16 = PaGetSimdVector_simd16(pa, pa.first, slot);
+#else
const simd16vector &leadvert_16 = pa.leadingVertex.attrib[slot];
+#endif
if (!pa.useAlternateOffset)
{
bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{
#if USE_SIMD16_FRONTEND
+#if 1
+ const simd16vector &a = PaGetSimdVector_simd16(pa, pa.first, slot);
+#else
const simd16vector &a = pa.leadingVertex.attrib[slot];
+#endif
#else
simd16vector a;
void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
{
#if USE_SIMD16_FRONTEND
+#if 1
+ const simd16vector &a = PaGetSimdVector_simd16(pa, pa.first, slot);
+#else
const simd16vector &a = pa.leadingVertex.attrib[slot];
+#endif
const simd16vector &b = PaGetSimdVector_simd16(pa, pa.prev, slot);
const simd16vector &c = PaGetSimdVector_simd16(pa, pa.cur, slot);