case TOP_TRI_STRIP_REVERSE:
case TOP_PATCHLIST_BASE:
case TOP_UNKNOWN:
- SWR_ASSERT(false, "Unsupported topology: %d", mode);
+ SWR_INVALID("Unsupported topology: %d", mode);
return 0;
}
case TOP_TRI_STRIP_REVERSE:
case TOP_PATCHLIST_BASE:
case TOP_UNKNOWN:
- SWR_ASSERT(false, "Unsupported topology: %d", mode);
+ SWR_INVALID("Unsupported topology: %d", mode);
return 0;
}
numVerts = topology - TOP_PATCHLIST_BASE;
break;
default:
- SWR_ASSERT(false, "Unsupported topology: %d", topology);
+ SWR_INVALID("Unsupported topology: %d", topology);
break;
}
uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
// The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
- uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);
+ uint32_t primDataDwordVertexStride = (SWR_VTX_NUM_SLOTS * sizeof(float) * 4) / sizeof(uint32_t);
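+ // e.g. with the 32 attribute slots noted above, each vertex spans
+ // 32 slots * 4 floats * 4 bytes = 512 bytes, a stride of 128 dwords.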
SWR_STREAMOUT_CONTEXT soContext = { 0 };
}
uint32_t numPrims = pa.NumPrims();
+
for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
{
DWORD slot = 0;
// Write all entries into primitive data buffer for SOS.
while (_BitScanForward(&slot, soMask))
{
- __m128 attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
- uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
+ simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
+ uint32_t paSlot = slot + soState.vertexAttribOffset[streamIndex];
pa.AssembleSingle(paSlot, primIndex, attrib);
// Attribute offset is relative offset from start of vertex.
_mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
}
+
soMask &= ~(1 << slot);
}
AR_END(FEStreamout, 1);
}
+#if USE_SIMD16_FRONTEND
+//////////////////////////////////////////////////////////////////////////
+/// Is value an even number (a multiple of two)
+///
+template <typename T>
+INLINE static bool IsEven(T value)
+{
+ return (value & 1) == 0;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// Round up value to an even number (a multiple of two)
+///
+template <typename T>
+INLINE static T RoundUpEven(T value)
+{
+ return (value + 1) & ~1;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// Round down value to an even number (a multiple of two)
+///
+template <typename T>
+INLINE static T RoundDownEven(T value)
+{
+ return value & ~1;
+}
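+
+// Illustrative values for the helpers above:
+//   IsEven(6) == true, IsEven(5) == false
+//   RoundUpEven(5) == 6, RoundUpEven(4) == 4
+//   RoundDownEven(5) == 4, RoundDownEven(4) == 4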
+
+//////////////////////////////////////////////////////////////////////////
+/// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping
+///
+/// vertexCount is in terms of the source simdvertexes; an odd trailing vertex leaves the upper half of the last simd16vertex zeroed
+///
+/// attribCount will limit the vector copies to those attribs specified
+///
+/// note: the stride between vertexes is determined by SWR_VTX_NUM_SLOTS
+///
+void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex *vertex_simd16, const simdvertex *vertex, uint32_t vertexCount, uint32_t attribCount)
+{
+ SWR_ASSERT(vertex);
+ SWR_ASSERT(vertex_simd16);
+ SWR_ASSERT(attribCount <= SWR_VTX_NUM_SLOTS);
+
+ simd16vertex temp;
+
+ for (uint32_t i = 0; i < vertexCount; i += 2)
+ {
+ for (uint32_t j = 0; j < attribCount; j += 1)
+ {
+ for (uint32_t k = 0; k < 4; k += 1)
+ {
+ temp.attrib[j][k] = _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);
+
+ if ((i + 1) < vertexCount)
+ {
+ temp.attrib[j][k] = _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
+ }
+ }
+ }
+
+ for (uint32_t j = 0; j < attribCount; j += 1)
+ {
+ vertex_simd16[i >> 1].attrib[j] = temp.attrib[j];
+ }
+ }
+}
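+
+// Lane layout produced above: simdvertex[i] fills lanes 0..7 of
+// simd16vertex[i >> 1] and simdvertex[i + 1] fills lanes 8..15; a trailing
+// odd source vertex leaves lanes 8..15 of the last simd16vertex zeroed.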
+
+#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Computes number of invocations. The current index represents
/// the start of the SIMD. The max index represents how much work
uint32_t maxIndex)
{
uint32_t remainder = (maxIndex - curIndex);
+#if USE_SIMD16_FRONTEND
+ return (remainder >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : remainder;
+#else
return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
+#endif
}
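+
+// e.g. with curIndex == 0 and maxIndex == 20, the simd16 path returns 16 on the
+// first call and 4 on the next (curIndex == 16); the simd8 path returns 8, 8, 4.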
//////////////////////////////////////////////////////////////////////////
}
curInputByte >>= 2;
}
-
+
*pCutBuffer++ = outByte;
}
}
THREAD SWR_GS_CONTEXT tlsGsContext;
+template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
+struct GsBufferInfo
+{
+ GsBufferInfo(const SWR_GS_STATE &gsState)
+ {
+ const uint32_t vertexCount = gsState.maxNumVerts;
+ const uint32_t vertexStride = sizeof(SIMDVERTEX);
+ const uint32_t numSimdBatches = (vertexCount + SIMD_WIDTH - 1) / SIMD_WIDTH;
+
+ vertexPrimitiveStride = vertexStride * numSimdBatches;
+ vertexInstanceStride = vertexPrimitiveStride * SIMD_WIDTH;
+
+ if (gsState.isSingleStream)
+ {
+ cutPrimitiveStride = (vertexCount + 7) / 8;
+ cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
+
+ streamCutPrimitiveStride = 0;
+ streamCutInstanceStride = 0;
+ }
+ else
+ {
+ cutPrimitiveStride = AlignUp(vertexCount * 2 / 8, 4);
+ cutInstanceStride = cutPrimitiveStride * SIMD_WIDTH;
+
+ streamCutPrimitiveStride = (vertexCount + 7) / 8;
+ streamCutInstanceStride = streamCutPrimitiveStride * SIMD_WIDTH;
+ }
+ }
+
+ uint32_t vertexPrimitiveStride;
+ uint32_t vertexInstanceStride;
+
+ uint32_t cutPrimitiveStride;
+ uint32_t cutInstanceStride;
+
+ uint32_t streamCutPrimitiveStride;
+ uint32_t streamCutInstanceStride;
+};
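+
+// A worked example of the strides above, assuming SIMD_WIDTH == 8 and
+// gsState.maxNumVerts == 6 (values chosen purely for illustration):
+//   numSimdBatches        = (6 + 8 - 1) / 8 = 1
+//   vertexPrimitiveStride = sizeof(SIMDVERTEX) * 1
+//   vertexInstanceStride  = vertexPrimitiveStride * 8
+//   single stream: cutPrimitiveStride = (6 + 7) / 8 = 1 byte (one cut bit per vertex)
+//   multi stream:  cutPrimitiveStride = AlignUp(6 * 2 / 8, 4) = 4 bytes (two streamid bits per vertex)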
+
//////////////////////////////////////////////////////////////////////////
/// @brief Implements GS stage.
/// @param pDC - pointer to draw context.
void* pCutBuffer,
void* pStreamCutBuffer,
uint32_t* pSoPrimData,
+#if USE_SIMD16_FRONTEND
+ uint32_t numPrims_simd8,
+#endif
simdscalari primID)
{
SWR_CONTEXT *pContext = pDC->pContext;
tlsGsContext.PrimitiveID = primID;
uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
- simdvector attrib[MAX_ATTRIBUTES];
+ simdvector attrib[MAX_NUM_VERTS_PER_PRIM];
// assemble all attributes for the input primitive
for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
{
- uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
+ uint32_t attribSlot = pState->vertexAttribOffset + slot;
pa.Assemble(attribSlot, attrib);
for (uint32_t i = 0; i < numVertsPerPrim; ++i)
{
- tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
+ tlsGsContext.vert[i].attrib[VERTEX_ATTRIB_START_SLOT + slot] = attrib[i];
}
}
-
+
// assemble position
pa.Assemble(VERTEX_POSITION_SLOT, attrib);
for (uint32_t i = 0; i < numVertsPerPrim; ++i)
tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
}
- const uint32_t vertexStride = sizeof(simdvertex);
- const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
- const uint32_t inputPrimStride = numSimdBatches * vertexStride;
- const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH;
- uint32_t cutPrimStride;
- uint32_t cutInstanceStride;
-
- if (pState->isSingleStream)
- {
- cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
- cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
- }
- else
- {
- cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
- cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
- }
+#if USE_SIMD16_FRONTEND
+ const GsBufferInfo<simd16vertex, KNOB_SIMD16_WIDTH> bufferInfo(state.gsState);
+#else
+ const GsBufferInfo<simdvertex, KNOB_SIMD_WIDTH> bufferInfo(state.gsState);
+#endif
// record valid prims from the frontend to avoid over binning the newly generated
// prims from the GS
+#if USE_SIMD16_FRONTEND
+ uint32_t numInputPrims = numPrims_simd8;
+#else
uint32_t numInputPrims = pa.NumPrims();
+#endif
for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
{
// execute the geometry shader
state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);
- tlsGsContext.pStream += instanceStride;
- tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
+ tlsGsContext.pStream += bufferInfo.vertexInstanceStride;
+ tlsGsContext.pCutOrStreamIdBuffer += bufferInfo.cutInstanceStride;
}
// set up new binner and state for the GS output topology
+#if USE_SIMD16_FRONTEND
+ PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
+ if (HasRastT::value)
+ {
+ switch (pState->outputTopology)
+ {
+ case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles_simd16; break;
+ case TOP_LINE_STRIP: pfnClipFunc = ClipLines_simd16; break;
+ case TOP_POINT_LIST: pfnClipFunc = ClipPoints_simd16; break;
+ default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
+ }
+ }
+
+#else
PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
if (HasRastT::value)
{
case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break;
case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break;
case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
- default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology);
+ default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
}
}
+#endif
// foreach input prim:
// - setup a new PA based on the emitted verts for that prim
// - loop over the new verts, calling PA to assemble each prim
uint32_t totalPrimsGenerated = 0;
for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
{
- uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride;
- uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride;
+ uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * bufferInfo.vertexPrimitiveStride;
+ uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * bufferInfo.cutPrimitiveStride;
+
for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
{
uint32_t numEmittedVerts = pVertexCount[inputPrim];
continue;
}
- uint8_t* pBase = pInstanceBase + instance * instanceStride;
- uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;
-
+ uint8_t* pBase = pInstanceBase + instance * bufferInfo.vertexInstanceStride;
+ uint8_t* pCutBase = pCutBufferBase + instance * bufferInfo.cutInstanceStride;
+
uint32_t numAttribs = state.feNumAttributes;
for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
processCutVerts = false;
}
- PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
+#if USE_SIMD16_FRONTEND
+ PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, SWR_VTX_NUM_SLOTS, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
+
+#else
+ PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, SWR_VTX_NUM_SLOTS, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
+#endif
while (gsPa.GetNextStreamOutput())
{
do
{
+#if USE_SIMD16_FRONTEND
+ simd16vector attrib_simd16[3];
+
+ bool assemble = gsPa.Assemble_simd16(VERTEX_POSITION_SLOT, attrib_simd16);
+
+#else
bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);
+#endif
if (assemble)
{
totalPrimsGenerated += gsPa.NumPrims();
if (HasStreamOutT::value)
{
+#if ENABLE_AVX512_SIMD16
+ gsPa.useAlternateOffset = false;
+#endif
StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
}
if (HasRastT::value && state.soState.streamToRasterizer == stream)
{
- simdscalari vPrimId;
- // pull primitiveID from the GS output if available
- if (state.gsState.emitsPrimitiveID)
- {
- simdvector primIdAttrib[3];
- gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
- vPrimId = _simd_castps_si(primIdAttrib[0].x);
- }
- else
- {
- vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
- }
-
- // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
- simdscalari vViewPortIdx;
- if (state.gsState.emitsViewportArrayIndex)
- {
- simdvector vpiAttrib[3];
- gsPa.Assemble(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);
-
- // OOB indices => forced to zero.
- simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
- simdscalari vClearMask = _simd_cmplt_epi32(_simd_castps_si(vpiAttrib[0].x), vNumViewports);
- vpiAttrib[0].x = _simd_and_ps(_simd_castsi_ps(vClearMask), vpiAttrib[0].x);
-
- vViewPortIdx = _simd_castps_si(vpiAttrib[0].x);
- }
- else
- {
- vViewPortIdx = _simd_set1_epi32(0);
- }
-
- pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
+#if USE_SIMD16_FRONTEND
+ simd16scalari vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);
+
+ gsPa.useAlternateOffset = false;
+ pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId);
+#else
+ simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
+ pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId);
+#endif
}
}
} while (gsPa.NextPrim());
// update GS pipeline stats
UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
- AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim*numInputPrims));
+ AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim*numInputPrims));
AR_END(FEGeometryShader, 1);
}
/// @param state - API state
/// @param ppGsOut - pointer to GS output buffer allocation
/// @param ppCutBuffer - pointer to GS output cut buffer allocation
+template<typename SIMDVERTEX, uint32_t SIMD_WIDTH>
static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
void **ppStreamCutBuffer)
{
auto pArena = pDC->pArena;
SWR_ASSERT(pArena != nullptr);
SWR_ASSERT(state.gsState.gsEnable);
+
// allocate arena space to hold GS output verts
// @todo pack attribs
// @todo support multiple streams
- const uint32_t vertexStride = sizeof(simdvertex);
- const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
- uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH;
- *ppGsOut = pArena->AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float));
- const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
- const uint32_t streamIdPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
- const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
- const uint32_t streamIdSize = streamIdPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
+ const GsBufferInfo<SIMDVERTEX, SIMD_WIDTH> bufferInfo(state.gsState);
+
+ const uint32_t vertexBufferSize = state.gsState.instanceCount * bufferInfo.vertexInstanceStride;
+
+ *ppGsOut = pArena->AllocAligned(vertexBufferSize, SIMD_WIDTH * sizeof(float));
// allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
// maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
// allocate space for temporary per-stream cut buffer if multi-stream is enabled
if (state.gsState.isSingleStream)
{
- *ppCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
+ const uint32_t cutBufferSize = state.gsState.instanceCount * bufferInfo.cutInstanceStride;
+
+ *ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * sizeof(float));
*ppStreamCutBuffer = nullptr;
}
else
{
- *ppCutBuffer = pArena->AllocAligned(streamIdSize, KNOB_SIMD_WIDTH * sizeof(float));
- *ppStreamCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
- }
+ const uint32_t cutBufferSize = state.gsState.instanceCount * bufferInfo.cutInstanceStride;
+ const uint32_t streamCutBufferSize = state.gsState.instanceCount * bufferInfo.streamCutInstanceStride;
+ *ppCutBuffer = pArena->AllocAligned(cutBufferSize, SIMD_WIDTH * sizeof(float));
+ *ppStreamCutBuffer = pArena->AllocAligned(streamCutBufferSize, SIMD_WIDTH * sizeof(float));
+ }
}
//////////////////////////////////////////////////////////////////////////
void* pCutBuffer,
void* pCutStreamBuffer,
uint32_t* pSoPrimData,
+#if USE_SIMD16_FRONTEND
+ uint32_t numPrims_simd8,
+#endif
simdscalari primID)
{
SWR_CONTEXT *pContext = pDC->pContext;
}
SWR_ASSERT(tsCtx);
+#if USE_SIMD16_FRONTEND
+ PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
+ if (HasRastT::value)
+ {
+ switch (tsState.postDSTopology)
+ {
+ case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles_simd16; break;
+ case TOP_LINE_LIST: pfnClipFunc = ClipLines_simd16; break;
+ case TOP_POINT_LIST: pfnClipFunc = ClipPoints_simd16; break;
+ default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
+ }
+ }
+
+#else
PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
if (HasRastT::value)
{
case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
case TOP_LINE_LIST: pfnClipFunc = ClipLines; break;
case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
- default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology);
+ default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
}
}
+#endif
SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
hsContext.pCPout = gt_pTessellationThreadData->patchData;
hsContext.PrimitiveID = primID;
// assemble all attributes for the input primitives
for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
{
- uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
+ uint32_t attribSlot = tsState.vertexAttribOffset + slot;
pa.Assemble(attribSlot, simdattrib);
for (uint32_t i = 0; i < numVertsPerPrim; ++i)
{
- hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
+ hsContext.vert[i].attrib[VERTEX_ATTRIB_START_SLOT + slot] = simdattrib[i];
}
}
memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
#endif
+#if USE_SIMD16_FRONTEND
+ uint32_t numPrims = numPrims_simd8;
+#else
uint32_t numPrims = pa.NumPrims();
+#endif
hsContext.mask = GenerateMask(numPrims);
// Run the HS
SWR_TS_TESSELLATED_DATA tsData = { 0 };
AR_BEGIN(FETessellation, pDC->drawId);
TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
- AR_EVENT(TessPrimCount(1));
+ AR_EVENT(TessPrimCount(1));
AR_END(FETessellation, 0);
if (tsData.NumPrimitives == 0)
// Allocate DS Output memory
uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
+#if USE_SIMD16_FRONTEND
+ size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs; // simd8 -> simd16, padding
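+    // e.g. 5 simd8 invocation vectors round up to 6, so pairs of simd8 rows can
+    // later be reinterpreted as simd16 rows without reading past the allocation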
+#else
size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
+#endif
if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
{
AlignedFree(gt_pTessellationThreadData->pDSOutput);
gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
+#if USE_SIMD16_FRONTEND
+ gt_pTessellationThreadData->numDSOutputVectors = RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs; // simd8 -> simd16, padding
+#else
gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
+#endif
}
SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);
dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
+#if USE_SIMD16_FRONTEND
+ dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations); // simd8 -> simd16
+#else
dsContext.vectorStride = requiredDSVectorInvocations;
+#endif
uint32_t dsInvocations = 0;
}
UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);
+#if USE_SIMD16_FRONTEND
+ SWR_ASSERT(IsEven(dsContext.vectorStride)); // simd8 -> simd16
+
+#endif
PA_TESS tessPa(
pDC,
+#if USE_SIMD16_FRONTEND
+ reinterpret_cast<const simd16scalar *>(dsContext.pOutputData), // simd8 -> simd16
+ dsContext.vectorStride / 2, // simd8 -> simd16
+#else
dsContext.pOutputData,
dsContext.vectorStride,
+#endif
+ SWR_VTX_NUM_SLOTS,
tsState.numDsOutputAttribs,
tsData.ppIndices,
tsData.NumPrimitives,
while (tessPa.HasWork())
{
+#if USE_SIMD16_FRONTEND
+ const uint32_t numPrims = tessPa.NumPrims();
+ const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
+ const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
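+    // e.g. numPrims == 11 splits into numPrims_lo == 8 and numPrims_hi == 3;
+    // numPrims <= 8 leaves numPrims_hi == 0 and the hi pass is skipped below.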
+
+ const simd16scalari primID = _simd16_set1_epi32(dsContext.PrimitiveID);
+ const simdscalari primID_lo = _simd16_extract_si(primID, 0);
+ const simdscalari primID_hi = _simd16_extract_si(primID, 1);
+
+#endif
if (HasGeometryShaderT::value)
{
+#if USE_SIMD16_FRONTEND
+ tessPa.useAlternateOffset = false;
+ GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_lo, primID_lo);
+
+ if (numPrims_hi)
+ {
+ tessPa.useAlternateOffset = true;
+ GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_hi, primID_hi);
+ }
+#else
GeometryShaderStage<HasStreamOutT, HasRastT>(
pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
_simd_set1_epi32(dsContext.PrimitiveID));
+#endif
}
else
{
if (HasStreamOutT::value)
{
+#if ENABLE_AVX512_SIMD16
+ tessPa.useAlternateOffset = false;
+#endif
StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
}
if (HasRastT::value)
{
- simdvector prim[3]; // Only deal with triangles, lines, or points
+#if USE_SIMD16_FRONTEND
+ simd16vector prim_simd16[3]; // Only deal with triangles, lines, or points
+#else
+ simdvector prim[3]; // Only deal with triangles, lines, or points
+#endif
AR_BEGIN(FEPAAssemble, pDC->drawId);
-#if SWR_ENABLE_ASSERTS
bool assemble =
-#endif
+#if USE_SIMD16_FRONTEND
+ tessPa.Assemble_simd16(VERTEX_POSITION_SLOT, prim_simd16);
+#else
tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
+#endif
AR_END(FEPAAssemble, 1);
SWR_ASSERT(assemble);
SWR_ASSERT(pfnClipFunc);
+#if USE_SIMD16_FRONTEND
+ tessPa.useAlternateOffset = false;
+ pfnClipFunc(pDC, tessPa, workerId, prim_simd16, GenMask(numPrims), primID);
+#else
pfnClipFunc(pDC, tessPa, workerId, prim,
- GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
+ GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID));
+#endif
}
}
} // while (tessPa.HasWork())
} // for (uint32_t p = 0; p < numPrims; ++p)
+#if USE_SIMD16_FRONTEND
+ if (gt_pTessellationThreadData->pDSOutput != nullptr)
+ {
+ AlignedFree(gt_pTessellationThreadData->pDSOutput);
+ gt_pTessellationThreadData->pDSOutput = nullptr;
+ }
+ gt_pTessellationThreadData->numDSOutputVectors = 0;
+
+#endif
TSDestroyCtx(tsCtx);
}
+THREAD PA_STATE::SIMDVERTEX *pVertexStore = nullptr;
+THREAD uint32_t gVertexStoreSize = 0;
+
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDraw.
/// @tparam IsIndexedT - Is indexed drawing enabled
DRAW_WORK& work = *(DRAW_WORK*)pUserData;
const API_STATE& state = GetApiState(pDC);
- __m256i vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
- SWR_VS_CONTEXT vsContext;
- simdvertex vin;
- int indexSize = 0;
- uint32_t endVertex = work.numVerts;
+ uint32_t indexSize = 0;
+ uint32_t endVertex = work.numVerts;
const int32_t* pLastRequestedIndex = nullptr;
if (IsIndexedT::value)
pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
break;
default:
- SWR_ASSERT(0);
+ SWR_INVALID("Invalid work.type: %d", work.type);
}
}
else
endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
}
- SWR_FETCH_CONTEXT fetchInfo = { 0 };
- fetchInfo.pStreams = &state.vertexBuffers[0];
- fetchInfo.StartInstance = work.startInstance;
- fetchInfo.StartVertex = 0;
-
- vsContext.pVin = &vin;
-
- if (IsIndexedT::value)
- {
- fetchInfo.BaseVertex = work.baseVertex;
-
- // if the entire index buffer isn't being consumed, set the last index
- // so that fetches < a SIMD wide will be masked off
- fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
- if (pLastRequestedIndex < fetchInfo.pLastIndex)
- {
- fetchInfo.pLastIndex = pLastRequestedIndex;
- }
- }
- else
- {
- fetchInfo.StartVertex = work.startVertex;
- }
-
#if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
#endif
void* pStreamCutBuffer = nullptr;
if (HasGeometryShaderT::value)
{
- AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
+#if USE_SIMD16_FRONTEND
+ AllocateGsBuffers<simd16vertex, KNOB_SIMD16_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
+#else
+ AllocateGsBuffers<simdvertex, KNOB_SIMD_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
+#endif
}
if (HasTessellationT::value)
pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
}
+ const uint32_t vertexCount = NumVertsPerPrim(state.topology, true);
+#if USE_SIMD16_FRONTEND
+ uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simd16vector);
+#else
+ uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simdvector);
+#endif
+
+ SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM);
+
+ // Compute storage requirements for vertex store
+ // TODO: allocation needs to be rethought for better cut support
+ uint32_t numVerts = vertexCount + 2; // Need extra space for PA state machine
+ uint32_t vertexStoreSize = numVerts * simdVertexSizeBytes;
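+    // e.g. a triangle list gives vertexCount == 3, so the store holds
+    // 5 simd vertices of vsVertexSize simdvectors each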
+
+ // grow the vertex store for the PA as necessary
+ if (gVertexStoreSize < vertexStoreSize)
+ {
+ if (pVertexStore != nullptr)
+ {
+ AlignedFree(pVertexStore);
+ }
+
+ pVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX *>(AlignedMalloc(vertexStoreSize, 64));
+ gVertexStoreSize = vertexStoreSize;
+
+ SWR_ASSERT(pVertexStore != nullptr);
+ }
+
// choose primitive assembler
- PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts);
+ PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts, pVertexStore, numVerts, state.frontendState.vsVertexSize);
PA_STATE& pa = paFactory.GetPA();
+#if USE_SIMD16_FRONTEND
+ simdvertex vin_lo;
+ simdvertex vin_hi;
+ SWR_VS_CONTEXT vsContext_lo;
+ SWR_VS_CONTEXT vsContext_hi;
+
+ vsContext_lo.pVin = &vin_lo;
+ vsContext_hi.pVin = &vin_hi;
+ vsContext_lo.AlternateOffset = 0;
+ vsContext_hi.AlternateOffset = 1;
+
+ SWR_FETCH_CONTEXT fetchInfo_lo = { 0 };
+
+ fetchInfo_lo.pStreams = &state.vertexBuffers[0];
+ fetchInfo_lo.StartInstance = work.startInstance;
+ fetchInfo_lo.StartVertex = 0;
+
+ if (IsIndexedT::value)
+ {
+ fetchInfo_lo.BaseVertex = work.baseVertex;
+
+ // if the entire index buffer isn't being consumed, set the last index
+ // so that fetches < a SIMD wide will be masked off
+ fetchInfo_lo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
+ if (pLastRequestedIndex < fetchInfo_lo.pLastIndex)
+ {
+ fetchInfo_lo.pLastIndex = pLastRequestedIndex;
+ }
+ }
+ else
+ {
+ fetchInfo_lo.StartVertex = work.startVertex;
+ }
+
+ SWR_FETCH_CONTEXT fetchInfo_hi = fetchInfo_lo;
+
+ const simd16scalari vScale = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
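+    // vScale enumerates the 16 lanes so the non-indexed path below can
+    // synthesize vertex ids as startVertexID + lane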
+
+ for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
+ {
+ uint32_t i = 0;
+
+ simd16scalari vIndex;
+
+ if (IsIndexedT::value)
+ {
+ fetchInfo_lo.pIndices = work.pIB;
+ fetchInfo_hi.pIndices = (int32_t *)((uint8_t *)fetchInfo_lo.pIndices + KNOB_SIMD_WIDTH * indexSize); // 1/2 of KNOB_SIMD16_WIDTH
+ }
+ else
+ {
+ vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);
+
+ fetchInfo_lo.pIndices = (const int32_t *)&vIndex;
+ fetchInfo_hi.pIndices = (const int32_t *)&vIndex + KNOB_SIMD_WIDTH; // 1/2 of KNOB_SIMD16_WIDTH
+ }
+
+ fetchInfo_lo.CurInstance = instanceNum;
+ fetchInfo_hi.CurInstance = instanceNum;
+
+ vsContext_lo.InstanceID = instanceNum;
+ vsContext_hi.InstanceID = instanceNum;
+
+ while (pa.HasWork())
+ {
+ // GetNextVsOutput currently has the side effect of updating some PA state machine state.
+ // So we need to keep this outside of (i < endVertex) check.
+
+ simdmask *pvCutIndices_lo = nullptr;
+ simdmask *pvCutIndices_hi = nullptr;
+
+ if (IsIndexedT::value)
+ {
+                // simd16mask <=> simdmask[2]: lanes 0..7 of the simd16 cut mask
+                // land in the lo word, lanes 8..15 in the hi word
+
+ pvCutIndices_lo = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[0];
+ pvCutIndices_hi = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[1];
+ }
+
+ simd16vertex &vout = pa.GetNextVsOutput();
+
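+            // both contexts alias the same simd16vertex; AlternateOffset (0 for
+            // lo, 1 for hi, set above) selects which simd8 half each VS writes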
+ vsContext_lo.pVout = reinterpret_cast<simdvertex *>(&vout);
+ vsContext_hi.pVout = reinterpret_cast<simdvertex *>(&vout);
+
+ if (i < endVertex)
+ {
+ // 1. Execute FS/VS for a single SIMD.
+ AR_BEGIN(FEFetchShader, pDC->drawId);
+ state.pfnFetchFunc(fetchInfo_lo, vin_lo);
+
+ if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
+ {
+ state.pfnFetchFunc(fetchInfo_hi, vin_hi);
+ }
+ AR_END(FEFetchShader, 0);
+
+ // forward fetch generated vertex IDs to the vertex shader
+ vsContext_lo.VertexID = fetchInfo_lo.VertexID;
+ vsContext_hi.VertexID = fetchInfo_hi.VertexID;
+
+ // Setup active mask for vertex shader.
+ vsContext_lo.mask = GenerateMask(endVertex - i);
+ vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));
+
+ // forward cut mask to the PA
+ if (IsIndexedT::value)
+ {
+ *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
+ *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask));
+ }
+
+ UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
+
+#if KNOB_ENABLE_TOSS_POINTS
+ if (!KNOB_TOSS_FETCH)
+#endif
+ {
+ AR_BEGIN(FEVertexShader, pDC->drawId);
+ state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo);
+
+ if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
+ {
+ state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_hi);
+ }
+ AR_END(FEVertexShader, 0);
+
+ UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
+ }
+ }
+
+            // 2. Assemble primitives given the last two SIMDs of vertices.
+ do
+ {
+ simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM];
+
+                    AR_BEGIN(FEPAAssemble, pDC->drawId);
+                    bool assemble = pa.Assemble_simd16(VERTEX_POSITION_SLOT, prim_simd16);
+                    AR_END(FEPAAssemble, 1);
+
+#if KNOB_ENABLE_TOSS_POINTS
+ if (!KNOB_TOSS_FETCH)
+#endif
+ {
+#if KNOB_ENABLE_TOSS_POINTS
+ if (!KNOB_TOSS_VS)
+#endif
+ {
+ if (assemble)
+ {
+ UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
+
+ const uint32_t numPrims = pa.NumPrims();
+ const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
+ const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
+
+ const simd16scalari primID = pa.GetPrimID(work.startPrimID);
+ const simdscalari primID_lo = _simd16_extract_si(primID, 0);
+ const simdscalari primID_hi = _simd16_extract_si(primID, 1);
+
+ if (HasTessellationT::value)
+ {
+ pa.useAlternateOffset = false;
+ TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);
+
+ if (numPrims_hi)
+ {
+ pa.useAlternateOffset = true;
+ TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);
+ }
+ }
+ else if (HasGeometryShaderT::value)
+ {
+ pa.useAlternateOffset = false;
+ GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);
+
+ if (numPrims_hi)
+ {
+ pa.useAlternateOffset = true;
+ GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);
+ }
+ }
+ else
+ {
+ // If streamout is enabled then stream vertices out to memory.
+ if (HasStreamOutT::value)
+ {
+ pa.useAlternateOffset = false;
+ StreamOut(pDC, pa, workerId, pSoPrimData, 0);
+ }
+
+ if (HasRastT::value)
+ {
+ SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);
+
+ pa.useAlternateOffset = false;
+ pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID);
+ }
+ }
+ }
+ }
+ }
+ } while (pa.NextPrim());
+
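+            // the fetch position advances one full simd16 of vertices per
+            // iteration; in the indexed case fetchInfo_hi remains
+            // KNOB_SIMD_WIDTH indices ahead of fetchInfo_lo, as set up above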
+ if (IsIndexedT::value)
+ {
+ fetchInfo_lo.pIndices = (int32_t *)((uint8_t*)fetchInfo_lo.pIndices + KNOB_SIMD16_WIDTH * indexSize);
+ fetchInfo_hi.pIndices = (int32_t *)((uint8_t*)fetchInfo_hi.pIndices + KNOB_SIMD16_WIDTH * indexSize);
+ }
+ else
+ {
+ vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH));
+ }
+
+ i += KNOB_SIMD16_WIDTH;
+ }
+
+ pa.Reset();
+ }
+
+#else
+ SWR_VS_CONTEXT vsContext;
+ SWR_FETCH_CONTEXT fetchInfo = { 0 };
+
+ fetchInfo.pStreams = &state.vertexBuffers[0];
+ fetchInfo.StartInstance = work.startInstance;
+ fetchInfo.StartVertex = 0;
+
+ if (IsIndexedT::value)
+ {
+ fetchInfo.BaseVertex = work.baseVertex;
+
+ // if the entire index buffer isn't being consumed, set the last index
+ // so that fetches < a SIMD wide will be masked off
+ fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
+ if (pLastRequestedIndex < fetchInfo.pLastIndex)
+ {
+ fetchInfo.pLastIndex = pLastRequestedIndex;
+ }
+ }
+ else
+ {
+ fetchInfo.StartVertex = work.startVertex;
+ }
+
+ const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+
/// @todo: temporarily move instance loop in the FE to ensure SO ordering
for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
{
while (pa.HasWork())
{
- // PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
+ // GetNextVsOutput currently has the side effect of updating some PA state machine state.
// So we need to keep this outside of (i < endVertex) check.
simdmask* pvCutIndices = nullptr;
if (IsIndexedT::value)
}
simdvertex& vout = pa.GetNextVsOutput();
+ vsContext.pVin = &vout;
vsContext.pVout = &vout;
if (i < endVertex)
// 1. Execute FS/VS for a single SIMD.
AR_BEGIN(FEFetchShader, pDC->drawId);
- state.pfnFetchFunc(fetchInfo, vin);
+ state.pfnFetchFunc(fetchInfo, vout);
AR_END(FEFetchShader, 0);
// forward fetch generated vertex IDs to the vertex shader
if (HasRastT::value)
{
SWR_ASSERT(pDC->pState->pfnProcessPrims);
+
pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
- GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0));
+ GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID));
}
}
}
}
} while (pa.NextPrim());
- i += KNOB_SIMD_WIDTH;
if (IsIndexedT::value)
{
fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
{
vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
}
+
+ i += KNOB_SIMD_WIDTH;
}
pa.Reset();
}
+#endif
AR_END(FEProcessDraw, numPrims * work.numInstances);
}
bool HasRasterization)
{
return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
-}
\ No newline at end of file
+}