///
/// attribCount will limit the vector copies to those attribs specified
///
+/// note: the stride between vertexes is determined by KNOB_NUM_ATTRIBUTES
+///
void PackPairsOfSimdVertexIntoSimd16VertexInPlace(simdvertex *vertex, uint32_t vertexCount, uint32_t attribCount)
{
SWR_ASSERT(vertex);
uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
#if USE_SIMD16_FRONTEND
- size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSOutputVectors); // simd8 -> simd16, padding
+ size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs; // simd8 -> simd16, padding
#else
size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
#endif
AlignedFree(gt_pTessellationThreadData->pDSOutput);
gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
#if USE_SIMD16_FRONTEND
- gt_pTessellationThreadData->numDSOutputVectors = RoundUpEven(requiredDSOutputVectors); // simd8 -> simd16, padding
+ gt_pTessellationThreadData->numDSOutputVectors = RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs; // simd8 -> simd16, padding
#else
gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
#endif
dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
+#if USE_SIMD16_FRONTEND
+ dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations); // simd8 -> simd16
+#else
dsContext.vectorStride = requiredDSVectorInvocations;
+#endif
uint32_t dsInvocations = 0;
UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);
#if USE_SIMD16_FRONTEND
- // TEMPORARY: DS outputs simdvertex, PA inputs simd16vertex, so convert simdvertex to simd16vertex, in-place
-
- PackPairsOfSimdVertexIntoSimd16VertexInPlace(
- reinterpret_cast<simdvertex *>(dsContext.pOutputData),
- RoundUpEven(dsContext.vectorStride), // simd8 -> simd16
- tsState.numDsOutputAttribs);
+ SWR_ASSERT(IsEven(dsContext.vectorStride)); // simd8 -> simd16
#endif
PA_TESS tessPa(
pDC,
#if USE_SIMD16_FRONTEND
reinterpret_cast<const simd16scalar *>(dsContext.pOutputData), // simd8 -> simd16
- RoundUpEven(dsContext.vectorStride) / 2, // simd8 -> simd16
+ dsContext.vectorStride / 2, // simd8 -> simd16
#else
dsContext.pOutputData,
dsContext.vectorStride,
SetNextPaState_simd16(
pa,
PaPatchList_simd16<TotalControlPoints, CurrentControlPoints + 1>,
+ PaPatchList<TotalControlPoints, CurrentControlPoints + 1>,
PaPatchListSingle<TotalControlPoints>);
return false;
SetNextPaState_simd16(
pa,
PaPatchList_simd16<TotalControlPoints>,
+ PaPatchList<TotalControlPoints>,
PaPatchListSingle<TotalControlPoints>,
0,
KNOB_SIMD16_WIDTH,
#if ENABLE_AVX512_SIMD16
/// First triangle-list step of the simd16 assembler (per the function name).
/// Emits nothing: it only forwards the follow-up functions named in the call
/// to SetNextPaState_simd16 and then reports that no primitives are ready.
/// @param pa    assembler state handed to SetNextPaState_simd16
/// @param slot  not read by this function
/// @param verts not written by this function
/// @return always false
bool PaTriList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{
- SetNextPaState_simd16(pa, PaTriList1_simd16, PaTriListSingle0);
+ SetNextPaState_simd16(pa, PaTriList1_simd16, PaTriList1, PaTriListSingle0);
return false; // Not enough vertices to assemble 16 triangles
}
/// Second triangle-list step of the simd16 assembler (per the function name).
/// Emits nothing: it forwards PaTriList2_simd16 and the companion functions
/// named in the call to SetNextPaState_simd16, then reports that no
/// primitives are ready.
/// @param pa    assembler state handed to SetNextPaState_simd16
/// @param slot  not read by this function
/// @param verts not written by this function
/// @return always false
bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{
- SetNextPaState_simd16(pa, PaTriList2_simd16, PaTriListSingle0);
+ SetNextPaState_simd16(pa, PaTriList2_simd16, PaTriList2, PaTriListSingle0);
return false; // Not enough vertices to assemble 16 triangles
}
v2[i] = _simd16_permute_ps(temp2, perm2);
}
- SetNextPaState_simd16(pa, PaTriList0_simd16, PaTriListSingle0, 0, KNOB_SIMD16_WIDTH, true);
+ SetNextPaState_simd16(pa, PaTriList0_simd16, PaTriList0, PaTriListSingle0, 0, KNOB_SIMD16_WIDTH, true);
return true;
}
#if ENABLE_AVX512_SIMD16
/// First triangle-strip step of the simd16 assembler (per the function name).
/// Emits nothing: it forwards PaTriStrip1_simd16 and the companion functions
/// named in the call to SetNextPaState_simd16, then reports that no
/// primitives are ready.
/// @param pa    assembler state handed to SetNextPaState_simd16
/// @param slot  not read by this function
/// @param verts not written by this function
/// @return always false
bool PaTriStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{
- SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStripSingle0);
+ SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStrip1, PaTriStripSingle0);
return false; // Not enough vertices to assemble 16 triangles.
}
v2[i] = _simd16_shuffle_ps(a[i], shuff, _MM_SHUFFLE(2, 2, 2, 2)); // a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0
}
- SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStripSingle0, 0, KNOB_SIMD16_WIDTH);
+ SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStrip1, PaTriStripSingle0, 0, KNOB_SIMD16_WIDTH);
return true;
}
#if ENABLE_AVX512_SIMD16
/// First triangle-fan step of the simd16 assembler (per the function name).
/// Emits nothing: it forwards PaTriFan1_simd16 and the companion functions
/// named in the call to SetNextPaState_simd16, then reports that no
/// primitives are ready.
/// @param pa    assembler state handed to SetNextPaState_simd16
/// @param slot  not read by this function
/// @param verts not written by this function
/// @return always false
bool PaTriFan0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{
- SetNextPaState_simd16(pa, PaTriFan1_simd16, PaTriFanSingle0);
+ SetNextPaState_simd16(pa, PaTriFan1_simd16, PaTriFan1, PaTriFanSingle0);
return false; // Not enough vertices to assemble 16 triangles.
}
v1[i] = _simd16_shuffle_ps(b[i], v2[i], _MM_SHUFFLE(2, 1, 2, 1)); // b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0
}
- SetNextPaState_simd16(pa, PaTriFan1_simd16, PaTriFanSingle0, 0, KNOB_SIMD16_WIDTH);
+ SetNextPaState_simd16(pa, PaTriFan1_simd16, PaTriFan1, PaTriFanSingle0, 0, KNOB_SIMD16_WIDTH);
return true;
}
#if ENABLE_AVX512_SIMD16
/// First quad-list step of the simd16 assembler (per the function name).
/// Emits nothing: it forwards PaQuadList1_simd16 and the companion functions
/// named in the call to SetNextPaState_simd16, then reports that no
/// primitives are ready.
/// @param pa    assembler state handed to SetNextPaState_simd16
/// @param slot  not read by this function
/// @param verts not written by this function
/// @return always false
bool PaQuadList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{
- SetNextPaState_simd16(pa, PaQuadList1_simd16, PaQuadListSingle0);
+ SetNextPaState_simd16(pa, PaQuadList1_simd16, PaQuadList1, PaQuadListSingle0);
return false; // Not enough vertices to assemble 16 triangles.
}
v2[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 2, 3, 2)); // a2 a3 a6 a7 aA aB aE aF b2 b3 b6 b7 bA bB bE bF
}
- SetNextPaState_simd16(pa, PaQuadList0_simd16, PaQuadListSingle0, 0, KNOB_SIMD16_WIDTH, true);
+ SetNextPaState_simd16(pa, PaQuadList0_simd16, PaQuadList0, PaQuadListSingle0, 0, KNOB_SIMD16_WIDTH, true);
return true;
}
#if ENABLE_AVX512_SIMD16
/// First line-loop step of the simd16 assembler (per the function name).
/// Emits nothing: it forwards PaLineLoop1_simd16 and the companion functions
/// named in the call to SetNextPaState_simd16, then returns false.
/// NOTE(review): presumably false means "no primitives assembled yet", as the
/// sibling *0_simd16 functions state in their comments; confirm against caller.
/// @param pa    assembler state handed to SetNextPaState_simd16
/// @param slot  not read by this function
/// @param verts not written by this function
/// @return always false
bool PaLineLoop0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{
- SetNextPaState_simd16(pa, PaLineLoop1_simd16, PaLineLoopSingle0);
+ SetNextPaState_simd16(pa, PaLineLoop1_simd16, PaLineLoop1, PaLineLoopSingle0);
return false;
}
}
}
- SetNextPaState_simd16(pa, PaLineLoop1_simd16, PaLineLoopSingle0, 0, KNOB_SIMD16_WIDTH);
+ SetNextPaState_simd16(pa, PaLineLoop1_simd16, PaLineLoop1, PaLineLoopSingle0, 0, KNOB_SIMD16_WIDTH);
return true;
}
#if ENABLE_AVX512_SIMD16
/// First line-list step of the simd16 assembler (per the function name).
/// Emits nothing: it forwards PaLineList1_simd16 and the companion functions
/// named in the call to SetNextPaState_simd16, then reports that no
/// primitives are ready.
/// @param pa    assembler state handed to SetNextPaState_simd16
/// @param slot  not read by this function
/// @param verts not written by this function
/// @return always false
bool PaLineList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{
- SetNextPaState_simd16(pa, PaLineList1_simd16, PaLineListSingle0);
+ SetNextPaState_simd16(pa, PaLineList1_simd16, PaLineList1, PaLineListSingle0);
return false; // Not enough vertices to assemble 16 lines
}
v1[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 1, 3, 1)); // a1 a3 a5 a7 a9 aB aD aF b1 b3 b5 b7 b9 bB bD bF
}
- SetNextPaState_simd16(pa, PaLineList0_simd16, PaLineListSingle0, 0, KNOB_SIMD16_WIDTH, true);
+ SetNextPaState_simd16(pa, PaLineList0_simd16, PaLineList0, PaLineListSingle0, 0, KNOB_SIMD16_WIDTH, true);
return true;
}
#if ENABLE_AVX512_SIMD16
/// First line-strip step of the simd16 assembler (per the function name).
/// Emits nothing: it forwards PaLineStrip1_simd16 and the companion functions
/// named in the call to SetNextPaState_simd16, then reports that no
/// primitives are ready.
/// @param pa    assembler state handed to SetNextPaState_simd16
/// @param slot  not read by this function
/// @param verts not written by this function
/// @return always false
bool PaLineStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{
- SetNextPaState_simd16(pa, PaLineStrip1_simd16, PaLineStripSingle0);
+ SetNextPaState_simd16(pa, PaLineStrip1_simd16, PaLineStrip1, PaLineStripSingle0);
return false; // Not enough vertices to assemble 16 lines
}
v1[i] = _simd16_permute_ps(temp, perm); // a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0
}
- SetNextPaState_simd16(pa, PaLineStrip1_simd16, PaLineStripSingle0, 0, KNOB_SIMD16_WIDTH);
+ SetNextPaState_simd16(pa, PaLineStrip1_simd16, PaLineStrip1, PaLineStripSingle0, 0, KNOB_SIMD16_WIDTH);
return true;
}
verts[0] = a; // points only have 1 vertex.
- SetNextPaState_simd16(pa, PaPoints0_simd16, PaPointsSingle0, 0, KNOB_SIMD16_WIDTH, true);
+ SetNextPaState_simd16(pa, PaPoints0_simd16, PaPoints0, PaPointsSingle0, 0, KNOB_SIMD16_WIDTH, true);
return true;
}
/// There is not enough to assemble 8 triangles.
/// NOTE(review): the count of 8 above may be left over from a narrower
/// variant of this function; confirm whether it should read 16 here.
/// First rect-list step of the simd16 assembler (per the function name).
/// Emits nothing: it forwards PaRectList1_simd16 and the companion functions
/// named in the call to SetNextPaState_simd16, then returns false.
/// @param pa    assembler state handed to SetNextPaState_simd16
/// @param slot  not read by this function
/// @param verts not written by this function
/// @return always false
bool PaRectList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
{
- SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectListSingle0);
+ SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectList1, PaRectListSingle0);
return false;
}
v2[i] = _simd16_insert_ps(_simd16_setzero_ps(), v2_lo, 0);
}
- SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectListSingle0, 0, KNOB_SIMD16_WIDTH, true);
+ SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectList1, PaRectListSingle0, 0, KNOB_SIMD16_WIDTH, true);
return true;
}
simd16vector verts[])
{
SWR_INVALID("Is rect list used for anything other then clears?");
- SetNextPaState_simd16(pa, PaRectList0_simd16, PaRectListSingle0, 0, KNOB_SIMD16_WIDTH, true);
+ SetNextPaState_simd16(pa, PaRectList0_simd16, PaRectList0, PaRectListSingle0, 0, KNOB_SIMD16_WIDTH, true);
return true;
}