{
if (state.soBuffer[i].pWriteOffset)
{
- bool nullTileAccessed = false;
- void* pWriteOffset = pDC->pContext->pfnTranslateGfxptrForWrite(GetPrivateState(pDC), soContext.pBuffer[i]->pWriteOffset, &nullTileAccessed);
+ bool nullTileAccessed = false;
+ void* pWriteOffset = pDC->pContext->pfnTranslateGfxptrForWrite(
+ GetPrivateState(pDC), soContext.pBuffer[i]->pWriteOffset, &nullTileAccessed);
*((uint32_t*)pWriteOffset) = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
}
{
auto attribGatherX = SIMD_T::mask_i32gather_ps(
SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask);
- auto attribGatherY = SIMD_T::mask_i32gather_ps(
- SIMD_T::setzero_ps(),
- (const float*)(pSrcBase + sizeof(float)),
- vGatherOffsets,
- vMask);
- auto attribGatherZ = SIMD_T::mask_i32gather_ps(
- SIMD_T::setzero_ps(),
- (const float*)(pSrcBase + sizeof(float) * 2),
- vGatherOffsets,
- vMask);
- auto attribGatherW = SIMD_T::mask_i32gather_ps(
- SIMD_T::setzero_ps(),
- (const float*)(pSrcBase + sizeof(float) * 3),
- vGatherOffsets,
- vMask);
+ auto attribGatherY = SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
+ (const float*)(pSrcBase + sizeof(float)),
+ vGatherOffsets,
+ vMask);
+ auto attribGatherZ =
+ SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
+ (const float*)(pSrcBase + sizeof(float) * 2),
+ vGatherOffsets,
+ vMask);
+ auto attribGatherW =
+ SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
+ (const float*)(pSrcBase + sizeof(float) * 3),
+ vGatherOffsets,
+ vMask);
SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);
SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float<SIMD_T>)), viMask, attribGatherY);
// Scratch state used by the tessellation path; accessed through gt_pTessellationThreadData.
struct TessellationThreadLocalData
{
SWR_HS_CONTEXT hsContext;
- ScalarPatch patchData[KNOB_SIMD_WIDTH];
void* pTxCtx;
size_t tsCtxSize;
+ uint8_t* pHSOutput; // HS output buffer; obtained from AlignedMalloc and handed to hsContext.pCPout
+ size_t hsOutputAllocSize; // byte size of the current pHSOutput allocation; compared against the required size to decide when to regrow
+
simdscalar* pDSOutput; // passed to the domain shader as dsContext.pOutputData
size_t dsOutputAllocSize; // presumably the allocated size of pDSOutput — TODO confirm
};
}
#endif
- SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
- hsContext.pCPout = gt_pTessellationThreadData->patchData;
- hsContext.PrimitiveID = primID;
+ SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
+ hsContext.PrimitiveID = primID;
+ hsContext.outputSize = tsState.hsAllocationSize;
uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
// Max storage for one attribute for an entire simdprimitive
// assemble all attributes for the input primitives
for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
{
- uint32_t attribSlot = tsState.vertexAttribOffset + slot;
+ uint32_t attribSlot = tsState.srcVertexAttribOffset + slot; // slot to assemble from: offset by srcVertexAttribOffset
pa.Assemble(attribSlot, simdattrib);
for (uint32_t i = 0; i < numVertsPerPrim; ++i)
{
- hsContext.vert[i].attrib[VERTEX_ATTRIB_START_SLOT + slot] = simdattrib[i];
+ hsContext.vert[i].attrib[tsState.vertexAttribOffset + slot] = simdattrib[i]; // destination slot now comes from tsState.vertexAttribOffset rather than a fixed start slot
}
}
+ // Allocate HS output storage
+ uint32_t requiredAllocSize = KNOB_SIMD_WIDTH * tsState.hsAllocationSize; // hsAllocationSize bytes for each of KNOB_SIMD_WIDTH lanes
+
+ if (requiredAllocSize > gt_pTessellationThreadData->hsOutputAllocSize) // grow only; an existing buffer that is large enough is reused
+ {
+ AlignedFree(gt_pTessellationThreadData->pHSOutput);
+ gt_pTessellationThreadData->pHSOutput = (uint8_t*)AlignedMalloc(requiredAllocSize, 64); // NOTE(review): the result is not checked before it is used as hsContext.pCPout below — confirm AlignedMalloc cannot return null here
+ gt_pTessellationThreadData->hsOutputAllocSize = requiredAllocSize;
+ }
+
+ hsContext.pCPout = (ScalarPatch*)gt_pTessellationThreadData->pHSOutput;
+
#if defined(_DEBUG)
- memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
+ //memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH); // NOTE(review): debug fill disabled; the buffer is sized from hsAllocationSize, so a sizeof(ScalarPatch)-based length may not match — consider filling requiredAllocSize bytes instead
#endif
#if USE_SIMD16_FRONTEND
for (uint32_t p = 0; p < numPrims; ++p)
{
+ ScalarPatch* pCPout = (ScalarPatch*)(gt_pTessellationThreadData->pHSOutput + tsState.hsAllocationSize * p); // patch p is located with a byte stride of hsAllocationSize
+
+ SWR_TESSELLATION_FACTORS tessFactors;
+ tessFactors = hsContext.pCPout[p].tessFactors; // NOTE(review): this indexes with a sizeof(ScalarPatch) stride, while pCPout above uses an hsAllocationSize stride; if the two sizes differ this reads a different patch — confirm, or read through pCPout instead
+
// Run Tessellator
SWR_TS_TESSELLATED_DATA tsData = {0};
RDTSC_BEGIN(pDC->pContext->pBucketMgr, FETessellation, pDC->drawId);
- TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
+ TSTessellate(tsCtx, tessFactors, tsData);
AR_EVENT(TessPrimCount(1));
RDTSC_END(pDC->pContext->pBucketMgr, FETessellation, 0);
// Run Domain Shader
SWR_DS_CONTEXT dsContext;
dsContext.PrimitiveID = pPrimId[p];
- dsContext.pCpIn = &hsContext.pCPout[p];
+ dsContext.pCpIn = pCPout;
dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
enum SWR_OUTER_TESSFACTOR_ID
{
SWR_QUAD_U_EQ0_TRI_U_LINE_DETAIL,
- SWR_QUAD_V_EQ0_TRI_V_LINE_DENSITY,
- SWR_QUAD_U_EQ1_TRI_W,
+ SWR_QUAD_U_EQ1_TRI_V_LINE_DENSITY,
+ SWR_QUAD_V_EQ0_TRI_W,
SWR_QUAD_V_EQ1,
SWR_NUM_OUTER_TESS_FACTORS,
{
float OuterTessFactors[SWR_NUM_OUTER_TESS_FACTORS];
float InnerTessFactors[SWR_NUM_INNER_TESS_FACTORS];
+ float pad[2]; // explicit padding; presumably what brings the struct to the 32 bytes asserted below — TODO confirm
};
+SWR_STATIC_ASSERT(sizeof(SWR_TESSELLATION_FACTORS) == 32); // locks the struct size at 32 bytes
+
#define MAX_NUM_VERTS_PER_PRIM 32 // support up to 32 control point patches
// NOTE(review): the members below (vert, PrimitiveID, outputSize, pCPout) are the ones accessed
// through hsContext (an SWR_HS_CONTEXT) elsewhere in this diff, and no tessFactors member is
// listed even though pCPout[p].tessFactors is read. This member list presumably belongs to
// SWR_HS_CONTEXT and a hunk boundary was lost after the ScalarPatch line — TODO confirm against
// the full header.
struct ScalarPatch
{
simdvertex vert[MAX_NUM_VERTS_PER_PRIM]; // IN: (SIMD) input primitive data
simdscalari PrimitiveID; // IN: (SIMD) primitive ID generated from the draw call
simdscalari mask; // IN: Active mask for shader
+ uint32_t outputSize; // IN: Size of HS output (per lane)
ScalarPatch* pCPout; // OUT: Output control point patch SIMD-sized-array of SCALAR patches
SWR_SHADER_STATS stats; // OUT: shader statistics used for archrast.
};
uint32_t numHsInputAttribs;
uint32_t numHsOutputAttribs;
+ uint32_t hsAllocationSize; // Size of HS output in bytes, per lane
+
uint32_t numDsOutputAttribs;
uint32_t dsAllocationSize;
uint32_t dsOutVtxAttribOffset;
// Offset to the start of the attributes of the input vertices, in simdvector units
+ uint32_t srcVertexAttribOffset;
+
+ // Offset to the start of the attributes expected by the hull shader
uint32_t vertexAttribOffset;
};