swr: [rasterizer core] Alleviate potential stack overflow for 32bit builds
author: Tim Rowley <timothy.o.rowley@intel.com>
Tue, 8 Mar 2016 17:56:06 +0000 (11:56 -0600)
committer: Tim Rowley <timothy.o.rowley@intel.com>
Fri, 25 Mar 2016 19:43:14 +0000 (14:43 -0500)
Move large stack allocations in the geometry shader (GS) stage and the clipper into thread-local storage.

src/gallium/drivers/swr/rasterizer/core/clip.cpp
src/gallium/drivers/swr/rasterizer/core/clip.h
src/gallium/drivers/swr/rasterizer/core/frontend.cpp

index ce27bf71d3c32d0c13627ad7cb7159c1ed572ccf..3a2a8b35be8b5376be26903318cb626eba082740 100644 (file)
@@ -31,6 +31,9 @@
 #include "common/os.h"
 #include "core/clip.h"
 
+// Temp storage used by the clipper
+THREAD simdvertex tlsTempVertices[7];
+
 float ComputeInterpFactor(float boundaryCoord0, float boundaryCoord1)
 {
     return (boundaryCoord0 / (boundaryCoord0 - boundaryCoord1));
index b0b95d64f392db68e8863b998229887eec74267c..4f51388d9cfdb8eccc47226419cddd52a53a5a7c 100644 (file)
@@ -32,6 +32,9 @@
 #include "core/pa.h"
 #include "rdtsc_core.h"
 
+// Temp storage used by the clipper
+extern THREAD simdvertex tlsTempVertices[7];
+
 enum SWR_CLIPCODES
 {
     // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
@@ -818,8 +821,7 @@ private:
     simdscalari ClipPrims(float* pVertices, const simdscalar& vPrimMask, const simdscalar& vClipMask, int numAttribs)
     {
         // temp storage
-        simdvertex tempVertices[7];
-        float* pTempVerts = (float*)&tempVertices[0];
+        float* pTempVerts = (float*)&tlsTempVertices[0];
 
         // zero out num input verts for non-active lanes
         simdscalari vNumInPts = _simd_set1_epi32(NumVertsPerPrim);
index f43a672bd82e604360c4c3c25d3983d6b724fd07..d092a8644c679361a0697bf467f8c6c1b2e330eb 100644 (file)
@@ -630,6 +630,8 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t num
     }
 }
 
+THREAD SWR_GS_CONTEXT tlsGsContext; // GS context used by GeometryShaderStage; replaces its former stack-local gsContext
+
 //////////////////////////////////////////////////////////////////////////
 /// @brief Implements GS stage.
 /// @param pDC - pointer to draw context.
@@ -651,7 +653,6 @@ static void GeometryShaderStage(
 {
     RDTSC_START(FEGeometryShader);
 
-    SWR_GS_CONTEXT gsContext;
     SWR_CONTEXT* pContext = pDC->pContext;
 
     const API_STATE& state = GetApiState(pDC);
@@ -660,9 +661,9 @@ static void GeometryShaderStage(
     SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
     SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");
 
-    gsContext.pStream = (uint8_t*)pGsOut;
-    gsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
-    gsContext.PrimitiveID = primID;
+    tlsGsContext.pStream = (uint8_t*)pGsOut;
+    tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
+    tlsGsContext.PrimitiveID = primID;
 
     uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
     simdvector attrib[MAX_ATTRIBUTES];
@@ -675,7 +676,7 @@ static void GeometryShaderStage(
 
         for (uint32_t i = 0; i < numVertsPerPrim; ++i)
         {
-            gsContext.vert[i].attrib[attribSlot] = attrib[i];
+            tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
         }
     }
     
@@ -683,7 +684,7 @@ static void GeometryShaderStage(
     pa.Assemble(VERTEX_POSITION_SLOT, attrib);
     for (uint32_t i = 0; i < numVertsPerPrim; ++i)
     {
-        gsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
+        tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
     }
 
     const uint32_t vertexStride = sizeof(simdvertex);
@@ -710,14 +711,14 @@ static void GeometryShaderStage(
 
     for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
     {
-        gsContext.InstanceID = instance;
-        gsContext.mask = GenerateMask(numInputPrims);
+        tlsGsContext.InstanceID = instance;
+        tlsGsContext.mask = GenerateMask(numInputPrims);
 
         // execute the geometry shader
-        state.pfnGsFunc(GetPrivateState(pDC), &gsContext);
+        state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);
 
-        gsContext.pStream += instanceStride;
-        gsContext.pCutOrStreamIdBuffer += cutInstanceStride;
+        tlsGsContext.pStream += instanceStride;
+        tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
     }
 
     // set up new binner and state for the GS output topology
@@ -736,7 +737,7 @@ static void GeometryShaderStage(
     // foreach input prim:
     // - setup a new PA based on the emitted verts for that prim
     // - loop over the new verts, calling PA to assemble each prim
-    uint32_t* pVertexCount = (uint32_t*)&gsContext.vertexCount;
+    uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
     uint32_t* pPrimitiveId = (uint32_t*)&primID;
 
     uint32_t totalPrimsGenerated = 0;