From ad9aff5528a30dbc775c042b9bbf6c5bc9f3eff9 Mon Sep 17 00:00:00 2001 From: Jan Zielinski Date: Wed, 24 Jul 2019 12:10:27 +0200 Subject: [PATCH] swr/rasterizer: cleanups for tessellation This commit introduces small fixes in preparation for tessellation support. Reviewed-by: Bruce Cherniak --- .../drivers/swr/rasterizer/core/frontend.cpp | 71 ++++++++++++------- .../drivers/swr/rasterizer/core/state.h | 13 +++- 2 files changed, 56 insertions(+), 28 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index 5eda4d7d870..816b84e643e 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -583,8 +583,9 @@ static void StreamOut( { if (state.soBuffer[i].pWriteOffset) { - bool nullTileAccessed = false; - void* pWriteOffset = pDC->pContext->pfnTranslateGfxptrForWrite(GetPrivateState(pDC), soContext.pBuffer[i]->pWriteOffset, &nullTileAccessed); + bool nullTileAccessed = false; + void* pWriteOffset = pDC->pContext->pfnTranslateGfxptrForWrite( + GetPrivateState(pDC), soContext.pBuffer[i]->pWriteOffset, &nullTileAccessed); *((uint32_t*)pWriteOffset) = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t); } @@ -786,21 +787,20 @@ void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t { auto attribGatherX = SIMD_T::mask_i32gather_ps( SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask); - auto attribGatherY = SIMD_T::mask_i32gather_ps( - SIMD_T::setzero_ps(), - (const float*)(pSrcBase + sizeof(float)), - vGatherOffsets, - vMask); - auto attribGatherZ = SIMD_T::mask_i32gather_ps( - SIMD_T::setzero_ps(), - (const float*)(pSrcBase + sizeof(float) * 2), - vGatherOffsets, - vMask); - auto attribGatherW = SIMD_T::mask_i32gather_ps( - SIMD_T::setzero_ps(), - (const float*)(pSrcBase + sizeof(float) * 3), - vGatherOffsets, - vMask); + auto attribGatherY = SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(), + (const float*)(pSrcBase + sizeof(float)), + vGatherOffsets, + vMask); + auto attribGatherZ = + SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(), + (const float*)(pSrcBase + sizeof(float) * 2), + vGatherOffsets, + vMask); + auto attribGatherW = + SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(), + (const float*)(pSrcBase + sizeof(float) * 3), + vGatherOffsets, + vMask); SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX); SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float)), viMask, attribGatherY); @@ -1235,10 +1235,12 @@ static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, struct TessellationThreadLocalData { SWR_HS_CONTEXT hsContext; - ScalarPatch patchData[KNOB_SIMD_WIDTH]; void* pTxCtx; size_t tsCtxSize; + uint8_t* pHSOutput; + size_t hsOutputAllocSize; + simdscalar* pDSOutput; size_t dsOutputAllocSize; }; @@ -1340,9 +1342,9 @@ static void TessellationStages(DRAW_CONTEXT* pDC, } #endif - SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext; - hsContext.pCPout = gt_pTessellationThreadData->patchData; - hsContext.PrimitiveID = primID; + SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext; + hsContext.PrimitiveID = primID; + hsContext.outputSize = tsState.hsAllocationSize; uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false); // Max storage for one attribute for an entire simdprimitive @@ -1351,17 +1353,29 @@ static void TessellationStages(DRAW_CONTEXT* pDC, // assemble all attributes for the input primitives for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot) { - uint32_t attribSlot = tsState.vertexAttribOffset + slot; + uint32_t attribSlot = tsState.srcVertexAttribOffset + slot; pa.Assemble(attribSlot, simdattrib); for (uint32_t i = 0; i < numVertsPerPrim; ++i) { - hsContext.vert[i].attrib[VERTEX_ATTRIB_START_SLOT + slot] = simdattrib[i]; + hsContext.vert[i].attrib[tsState.vertexAttribOffset + slot] = simdattrib[i]; } } + // Allocate HS output storage + uint32_t requiredAllocSize = KNOB_SIMD_WIDTH * tsState.hsAllocationSize; + + if (requiredAllocSize > gt_pTessellationThreadData->hsOutputAllocSize) + { + AlignedFree(gt_pTessellationThreadData->pHSOutput); + gt_pTessellationThreadData->pHSOutput = (uint8_t*)AlignedMalloc(requiredAllocSize, 64); + gt_pTessellationThreadData->hsOutputAllocSize = requiredAllocSize; + } + + hsContext.pCPout = (ScalarPatch*)gt_pTessellationThreadData->pHSOutput; + #if defined(_DEBUG) - memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH); + //memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH); #endif #if USE_SIMD16_FRONTEND @@ -1383,10 +1397,15 @@ static void TessellationStages(DRAW_CONTEXT* pDC, for (uint32_t p = 0; p < numPrims; ++p) { + ScalarPatch* pCPout = (ScalarPatch*)(gt_pTessellationThreadData->pHSOutput + tsState.hsAllocationSize * p); + + SWR_TESSELLATION_FACTORS tessFactors; + tessFactors = hsContext.pCPout[p].tessFactors; + // Run Tessellator SWR_TS_TESSELLATED_DATA tsData = {0}; RDTSC_BEGIN(pDC->pContext->pBucketMgr, FETessellation, pDC->drawId); - TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData); + TSTessellate(tsCtx, tessFactors, tsData); AR_EVENT(TessPrimCount(1)); RDTSC_END(pDC->pContext->pBucketMgr, FETessellation, 0); @@ -1423,7 +1442,7 @@ static void TessellationStages(DRAW_CONTEXT* pDC, // Run Domain Shader SWR_DS_CONTEXT dsContext; dsContext.PrimitiveID = pPrimId[p]; - dsContext.pCpIn = &hsContext.pCPout[p]; + dsContext.pCpIn = pCPout; dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU; dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV; dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput; diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h index b6734e2ad0f..8b24c43fe80 100644 --- a/src/gallium/drivers/swr/rasterizer/core/state.h +++ b/src/gallium/drivers/swr/rasterizer/core/state.h @@ -169,8 +169,8 @@ enum SWR_INNER_TESSFACTOR_ID enum SWR_OUTER_TESSFACTOR_ID { SWR_QUAD_U_EQ0_TRI_U_LINE_DETAIL, - SWR_QUAD_V_EQ0_TRI_V_LINE_DENSITY, - SWR_QUAD_U_EQ1_TRI_W, + SWR_QUAD_U_EQ1_TRI_V_LINE_DENSITY, + SWR_QUAD_V_EQ0_TRI_W, SWR_QUAD_V_EQ1, SWR_NUM_OUTER_TESS_FACTORS, @@ -281,8 +281,11 @@ struct SWR_TESSELLATION_FACTORS { float OuterTessFactors[SWR_NUM_OUTER_TESS_FACTORS]; float InnerTessFactors[SWR_NUM_INNER_TESS_FACTORS]; + float pad[2]; }; +SWR_STATIC_ASSERT(sizeof(SWR_TESSELLATION_FACTORS) == 32); + #define MAX_NUM_VERTS_PER_PRIM 32 // support up to 32 control point patches struct ScalarPatch { @@ -300,6 +303,7 @@ struct SWR_HS_CONTEXT simdvertex vert[MAX_NUM_VERTS_PER_PRIM]; // IN: (SIMD) input primitive data simdscalari PrimitiveID; // IN: (SIMD) primitive ID generated from the draw call simdscalari mask; // IN: Active mask for shader + uint32_t outputSize; // IN: Size of HS output (per lane) ScalarPatch* pCPout; // OUT: Output control point patch SIMD-sized-array of SCALAR patches SWR_SHADER_STATS stats; // OUT: shader statistics used for archrast. }; @@ -818,11 +822,16 @@ struct SWR_TS_STATE uint32_t numHsInputAttribs; uint32_t numHsOutputAttribs; + uint32_t hsAllocationSize; // Size of HS output in bytes, per lane + uint32_t numDsOutputAttribs; uint32_t dsAllocationSize; uint32_t dsOutVtxAttribOffset; // Offset to the start of the attributes of the input vertices, in simdvector units + uint32_t srcVertexAttribOffset; + + // Offset to the start of the attributes expected by the hull shader uint32_t vertexAttribOffset; }; -- 2.30.2