// Write all entries into primitive data buffer for SOS.
while (_BitScanForward(&slot, soMask))
{
- __m128 attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
- uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
+ simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
+ uint32_t paSlot = slot + soState.vertexAttribOffset[streamIndex];
pa.AssembleSingle(paSlot, primIndex, attrib);
// Attribute offset is relative offset from start of vertex.
tlsGsContext.PrimitiveID = primID;
uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
- simdvector attrib[MAX_ATTRIBUTES];
+ simdvector attrib[MAX_NUM_VERTS_PER_PRIM];
// assemble all attributes for the input primitive
for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
{
- uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
+ uint32_t attribSlot = pState->vertexAttribOffset + slot;
pa.Assemble(attribSlot, attrib);
for (uint32_t i = 0; i < numVertsPerPrim; ++i)
{
- tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
+ tlsGsContext.vert[i].attrib[VERTEX_ATTRIB_START_SLOT + slot] = attrib[i];
}
}
if (HasStreamOutT::value)
{
+#if ENABLE_AVX512_SIMD16
gsPa.useAlternateOffset = false;
+#endif
StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
}
#if USE_SIMD16_FRONTEND
simd16scalari vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);
- // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
- simd16scalari vViewPortIdx;
- if (state.gsState.emitsViewportArrayIndex)
- {
- simd16vector vpiAttrib[3];
- gsPa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib);
-
- // OOB indices => forced to zero.
- simd16scalari vpai = _simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
- simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
- simd16scalari vClearMask = _simd16_cmplt_epi32(vpai, vNumViewports);
- vViewPortIdx = _simd16_and_si(vClearMask, vpai);
- }
- else
- {
- vViewPortIdx = _simd16_set1_epi32(0);
- }
-
gsPa.useAlternateOffset = false;
- pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
+ pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId);
#else
simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
-
- // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
- simdscalari vViewPortIdx;
- if (state.gsState.emitsViewportArrayIndex)
- {
- simdvector vpiAttrib[3];
- gsPa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
- simdscalari vpai = _simd_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
-
- // OOB indices => forced to zero.
- simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
- simdscalari vClearMask = _simd_cmplt_epi32(vpai, vNumViewports);
- vViewPortIdx = _simd_and_si(vClearMask, vpai);
- }
- else
- {
- vViewPortIdx = _simd_set1_epi32(0);
- }
-
- pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
+ pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId);
#endif
}
}
// assemble all attributes for the input primitives
for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
{
- uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
+ uint32_t attribSlot = tsState.vertexAttribOffset + slot;
pa.Assemble(attribSlot, simdattrib);
for (uint32_t i = 0; i < numVertsPerPrim; ++i)
{
- hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
+ hsContext.vert[i].attrib[VERTEX_ATTRIB_START_SLOT + slot] = simdattrib[i];
}
}
{
if (HasStreamOutT::value)
{
+#if ENABLE_AVX512_SIMD16
tessPa.useAlternateOffset = false;
+#endif
StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
}
SWR_ASSERT(pfnClipFunc);
#if USE_SIMD16_FRONTEND
tessPa.useAlternateOffset = false;
- pfnClipFunc(pDC, tessPa, workerId, prim_simd16, GenMask(numPrims), primID, _simd16_set1_epi32(0));
+ pfnClipFunc(pDC, tessPa, workerId, prim_simd16, GenMask(numPrims), primID);
#else
pfnClipFunc(pDC, tessPa, workerId, prim,
- GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
+ GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID));
#endif
}
}
pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
}
- const uint32_t vertexCount = NumVertsPerPrim(state.topology, state.gsState.gsEnable);
+ const uint32_t vertexCount = NumVertsPerPrim(state.topology, true);
+#if USE_SIMD16_FRONTEND
+ uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simd16vector);
+#else
+ uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simdvector);
+#endif
SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM);
+ // Compute storage requirements for vertex store
+ // TODO: allocation needs to be rethought for better cut support
+ uint32_t numVerts = vertexCount + 2; // Need extra space for PA state machine
+ uint32_t vertexStoreSize = numVerts * simdVertexSizeBytes;
+
// grow the vertex store for the PA as necessary
- if (gVertexStoreSize < vertexCount)
+ if (gVertexStoreSize < vertexStoreSize)
{
if (pVertexStore != nullptr)
{
AlignedFree(pVertexStore);
}
- while (gVertexStoreSize < vertexCount)
- {
-#if USE_SIMD16_FRONTEND
- gVertexStoreSize += 4; // grow in chunks of 4 simd16vertex
-#else
- gVertexStoreSize += 8; // grow in chunks of 8 simdvertex
-#endif
- }
-
- SWR_ASSERT(gVertexStoreSize <= MAX_NUM_VERTS_PER_PRIM);
-
- pVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX *>(AlignedMalloc(gVertexStoreSize * sizeof(pVertexStore[0]), 64));
+ pVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX *>(AlignedMalloc(vertexStoreSize, 64));
+ gVertexStoreSize = vertexStoreSize;
SWR_ASSERT(pVertexStore != nullptr);
}
// choose primitive assembler
- PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts, pVertexStore, gVertexStoreSize, SWR_VTX_NUM_SLOTS);
+ PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts, pVertexStore, numVerts, state.frontendState.vsVertexSize);
PA_STATE& pa = paFactory.GetPA();
#if USE_SIMD16_FRONTEND
SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);
pa.useAlternateOffset = false;
- pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID, _simd16_setzero_si());
+ pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID);
}
}
}
}
#else
- simdvertex vin;
SWR_VS_CONTEXT vsContext;
-
- vsContext.pVin = &vin;
-
SWR_FETCH_CONTEXT fetchInfo = { 0 };
fetchInfo.pStreams = &state.vertexBuffers[0];
}
simdvertex& vout = pa.GetNextVsOutput();
+ vsContext.pVin = &vout;
vsContext.pVout = &vout;
if (i < endVertex)
// 1. Execute FS/VS for a single SIMD.
AR_BEGIN(FEFetchShader, pDC->drawId);
- state.pfnFetchFunc(fetchInfo, vin);
+ state.pfnFetchFunc(fetchInfo, vout);
AR_END(FEFetchShader, 0);
// forward fetch generated vertex IDs to the vertex shader
SWR_ASSERT(pDC->pState->pfnProcessPrims);
pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
- GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0));
+ GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID));
}
}
}