swr/rast: Optimize late/bindless JIT of samplers
author George Kyriazis <george.kyriazis@intel.com>
Tue, 10 Apr 2018 06:05:19 +0000 (01:05 -0500)
committer George Kyriazis <george.kyriazis@intel.com>
Wed, 18 Apr 2018 15:51:38 +0000 (10:51 -0500)
Add per-worker thread private data to all shader calls
Add per-worker sampler cache and jit context
Add late LoadTexel JIT support
Add per-worker-thread Sampler / LoadTexel JIT

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
25 files changed:
src/gallium/drivers/swr/rasterizer/core/api.cpp
src/gallium/drivers/swr/rasterizer/core/api.h
src/gallium/drivers/swr/rasterizer/core/backend.cpp
src/gallium/drivers/swr/rasterizer/core/backend.h
src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp
src/gallium/drivers/swr/rasterizer/core/backend_impl.h
src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp
src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp
src/gallium/drivers/swr/rasterizer/core/binner.cpp
src/gallium/drivers/swr/rasterizer/core/context.h
src/gallium/drivers/swr/rasterizer/core/frontend.cpp
src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h
src/gallium/drivers/swr/rasterizer/core/state.h
src/gallium/drivers/swr/rasterizer/core/threads.cpp
src/gallium/drivers/swr/rasterizer/core/threads.h
src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
src/gallium/drivers/swr/rasterizer/core/tilemgr.h
src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp
src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp
src/gallium/drivers/swr/swr_memory.h
src/gallium/drivers/swr/swr_shader.cpp

index 3141db69ef15edae63f9575d8ffd98337037bb54..e37e2e4a5384d9c08cea03a2f6e4211eed9c77fc 100644 (file)
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -122,6 +122,11 @@ HANDLE SwrCreateContext(
         pContext->apiThreadInfo.numAPIThreadsPerCore    = 1;
     }
 
+    if (pCreateInfo->pWorkerPrivateState)
+    {
+        pContext->workerPrivateState = *pCreateInfo->pWorkerPrivateState;
+    }
+
     memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
     memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
     new (&pContext->WaitLock) std::mutex();
index 7247fa4215f10a8de73b39204799e40f72f7127a..b171188c927ee54f8a615570c9a7166711694fdd 100644 (file)
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -115,7 +115,8 @@ struct SWR_RECT
 /// @param x - destination x coordinate
 /// @param y - destination y coordinate
 /// @param pDstHotTile - pointer to the hot tile surface
-typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, SWR_FORMAT dstFormat,
+typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, HANDLE hWorkerPrivateData,
+    SWR_FORMAT dstFormat,
     SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
     uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, uint8_t *pDstHotTile);
 
@@ -127,7 +128,8 @@ typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, SWR_FORMAT dstForma
 /// @param x - destination x coordinate
 /// @param y - destination y coordinate
 /// @param pSrcHotTile - pointer to the hot tile surface
-typedef void(SWR_API *PFN_STORE_TILE)(HANDLE hPrivateContext, SWR_FORMAT srcFormat,
+typedef void(SWR_API *PFN_STORE_TILE)(HANDLE hPrivateContext, HANDLE hWorkerPrivateData,
+    SWR_FORMAT srcFormat,
     SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
     uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, uint8_t *pSrcHotTile);
 
@@ -139,7 +141,7 @@ typedef void(SWR_API *PFN_STORE_TILE)(HANDLE hPrivateContext, SWR_FORMAT srcForm
 /// @param y - destination y coordinate
 /// @param renderTargetArrayIndex - render target array offset from arrayIndex
 /// @param pClearColor - pointer to the hot tile's clear value
-typedef void(SWR_API *PFN_CLEAR_TILE)(HANDLE hPrivateContext,
+typedef void(SWR_API *PFN_CLEAR_TILE)(HANDLE hPrivateContext, HANDLE hWorkerPrivateData,
     SWR_RENDERTARGET_ATTACHMENT rtIndex,
     uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, const float* pClearColor);
 
@@ -208,6 +210,21 @@ struct SWR_API_THREADING_INFO
                                     // Independent of KNOB_MAX_THREADS_PER_CORE.
 };
 
+//////////////////////////////////////////////////////////////////////////
+/// SWR_WORKER_PRIVATE_STATE
+/// Describes the private data allocated for each worker thread.  A pointer
+/// to a worker's private data is passed in to each shader function.
+/////////////////////////////////////////////////////////////////////////
+struct SWR_WORKER_PRIVATE_STATE
+{
+    typedef void (SWR_API *PFN_WORKER_DATA)(HANDLE hWorkerPrivateData, uint32_t iWorkerNum);
+
+    size_t              perWorkerPrivateStateSize;  ///< Amount of data to allocate per worker thread.
+    PFN_WORKER_DATA     pfnInitWorkerData;          ///< Init function for worker data.  If null,
+                                                    ///< worker data will be initialized to 0.
+    PFN_WORKER_DATA     pfnFinishWorkerData;        ///< Finish / destroy function for worker data.
+                                                    ///< Can be null.
+};
 
 //////////////////////////////////////////////////////////////////////////
 /// SWR_CREATECONTEXT_INFO
@@ -216,7 +233,10 @@ struct SWR_CREATECONTEXT_INFO
 {
     // External functions (e.g. sampler) need per draw context state.
     // Use SwrGetPrivateContextState() to access private state.
-    uint32_t privateStateSize;
+    size_t                      privateStateSize;
+
+    // Optional per-worker state, can be NULL for no worker-private data
+    SWR_WORKER_PRIVATE_STATE*   pWorkerPrivateState;
 
     // Callback functions
     PFN_LOAD_TILE               pfnLoadTile;
@@ -229,23 +249,23 @@ struct SWR_CREATECONTEXT_INFO
 
     // Pointer to rdtsc buckets mgr returned to the caller.
     // Only populated when KNOB_ENABLE_RDTSC is set
-    BucketManager* pBucketMgr;
+    BucketManager*              pBucketMgr;
 
     // Output: size required memory passed to for SwrSaveState / SwrRestoreState
-    size_t  contextSaveSize;
+    size_t                      contextSaveSize;
 
     // ArchRast event manager.
-    HANDLE  hArEventManager;
+    HANDLE                      hArEventManager;
 
     // Input (optional): Threading info that overrides any set KNOB values.
-    SWR_THREADING_INFO* pThreadInfo;
+    SWR_THREADING_INFO*         pThreadInfo;
 
-    // Input (optional}: Info for reserving API threads
-    SWR_API_THREADING_INFO* pApiThreadInfo;
+    // Input (optional): Info for reserving API threads
+    SWR_API_THREADING_INFO*     pApiThreadInfo;
 
     // Input: if set to non-zero value, overrides KNOB value for maximum
     // number of draws in flight
-    uint32_t MAX_DRAWS_IN_FLIGHT;
+    uint32_t                    MAX_DRAWS_IN_FLIGHT;
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -714,6 +734,7 @@ SWR_FUNC(void, SwrInit);
 /// @param x, y - Coordinates to raster tile.
 /// @param pDstHotTile - Pointer to Hot Tile
 SWR_FUNC(void, SwrLoadHotTile,
+    HANDLE hWorkerPrivateData,
     const SWR_SURFACE_STATE *pSrcSurface,
     SWR_FORMAT dstFormat,
     SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
@@ -728,6 +749,7 @@ SWR_FUNC(void, SwrLoadHotTile,
 /// @param x, y - Coordinates to raster tile.
 /// @param pSrcHotTile - Pointer to Hot Tile
 SWR_FUNC(void, SwrStoreHotTileToSurface,
+    HANDLE hWorkerPrivateData,
     SWR_SURFACE_STATE *pDstSurface,
     SWR_FORMAT srcFormat,
     SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
@@ -741,6 +763,7 @@ SWR_FUNC(void, SwrStoreHotTileToSurface,
 /// @param x, y - Coordinates to raster tile.
 /// @param pClearColor - Pointer to clear color
 SWR_FUNC(void, SwrStoreHotTileClear,
+         HANDLE hWorkerPrivateData,
          SWR_SURFACE_STATE *pDstSurface,
          SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
          uint32_t x,
index 1e0769ae57422c9cbb73e3721bc4df2df05c068e..5ac9ceb165e2e25cf47c340f28d8cfba6d71c64d 100644 (file)
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -78,7 +78,7 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup
     csContext.pScratchSpace = (uint8_t*)pScratchSpace;
     csContext.scratchSpacePerSimd = pDC->pState->state.scratchSpaceSize;
 
-    state.pfnCsFunc(GetPrivateState(pDC), &csContext);
+    state.pfnCsFunc(GetPrivateState(pDC), pContext->threadPool.pThreadData[workerId].pWorkerPrivateData, &csContext);
 
     UPDATE_STAT_BE(CsInvocations, state.totalThreadsInGroup);
     AR_EVENT(CSStats(csContext.stats.numInstExecuted));
@@ -107,6 +107,7 @@ void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile
     SWR_RENDERTARGET_ATTACHMENT attachment)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
+    HANDLE hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
 
     RDTSC_BEGIN(BEStoreTiles, pDC->drawId);
 
@@ -139,7 +140,7 @@ void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile
             PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[srcFormat];
             SWR_ASSERT(pfnClearTiles != nullptr);
 
-            pfnClearTiles(pDC, attachment, macroTile, pHotTile->renderTargetArrayIndex, pHotTile->clearData, pDesc->rect);
+            pfnClearTiles(pDC, hWorkerPrivateData, attachment, macroTile, pHotTile->renderTargetArrayIndex, pHotTile->clearData, pDesc->rect);
         }
 
         if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY)
@@ -147,7 +148,7 @@ void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile
             int32_t destX = KNOB_MACROTILE_X_DIM * x;
             int32_t destY = KNOB_MACROTILE_Y_DIM * y;
 
-            pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat,
+            pContext->pfnStoreTile(GetPrivateState(pDC), hWorkerPrivateData, srcFormat,
                 attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
         }
         
index c8c37e65257ee57c58d170a8559abb180d4d165d..7a842fe0e20904a151b285d0b6e6b851e1bdcf28 100644 (file)
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -41,7 +41,7 @@ void ProcessStoreTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTil
 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
 void ProcessShutdownBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
 
-typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, uint32_t, DWORD[4], const SWR_RECT& rect);
+typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, HANDLE hWorkerData, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, uint32_t, DWORD[4], const SWR_RECT& rect);
 
 extern PFN_CLEAR_TILES gClearTilesTable[NUM_SWR_FORMATS];
 extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
index baaa7e61f75bcfc0eb67afa26165e6a725f62c9b..af031f9f9d71b9b23894d0542c1b2b514f65d0c4 100644 (file)
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -76,7 +76,7 @@ void ClearRasterTile(uint8_t *pTileBuffer, simd16vector &value)
 
 #endif
 template<SWR_FORMAT format>
-INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, uint32_t renderTargetArrayIndex, DWORD clear[4], const SWR_RECT& rect)
+INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, HANDLE hWorkerPrivateData, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, uint32_t renderTargetArrayIndex, DWORD clear[4], const SWR_RECT& rect)
 {
     // convert clear color to hottile format
     // clear color is in RGBA float/uint32
@@ -146,7 +146,7 @@ INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, ui
     const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep;
     const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
 
-    HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples, renderTargetArrayIndex);
+    HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, hWorkerPrivateData, macroTile, rt, true, numSamples, renderTargetArrayIndex);
     uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, clearTile.xmin, clearTile.ymin)) * numSamples;
     uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
 
@@ -172,6 +172,7 @@ INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, ui
 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
+    HANDLE hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
 
     if (KNOB_FAST_CLEAR)
     {
@@ -191,7 +192,7 @@ void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, vo
             {
                 mask &= ~(1 << rt);
 
-                HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)rt, true, numSamples, pClear->renderTargetArrayIndex);
+                HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, hWorkerPrivateData, macroTile, (SWR_RENDERTARGET_ATTACHMENT)rt, true, numSamples, pClear->renderTargetArrayIndex);
 
                 // All we want to do here is to mark the hot tile as being in a "needs clear" state.
                 pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
@@ -204,14 +205,14 @@ void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, vo
 
         if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
         {
-            HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples, pClear->renderTargetArrayIndex);
+            HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, hWorkerPrivateData, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples, pClear->renderTargetArrayIndex);
             pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth;
             pHotTile->state = HOTTILE_CLEAR;
         }
 
         if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
         {
-            HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples, pClear->renderTargetArrayIndex);
+            HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, hWorkerPrivateData, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples, pClear->renderTargetArrayIndex);
 
             pHotTile->clearData[0] = pClear->clearStencil;
             pHotTile->state = HOTTILE_CLEAR;
@@ -242,7 +243,7 @@ void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, vo
             {
                 mask &= ~(1 << rt);
 
-                pfnClearTiles(pDC, (SWR_RENDERTARGET_ATTACHMENT)rt, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
+                pfnClearTiles(pDC, hWorkerPrivateData, (SWR_RENDERTARGET_ATTACHMENT)rt, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
             }
         }
 
@@ -253,7 +254,7 @@ void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, vo
             PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
             SWR_ASSERT(pfnClearTiles != nullptr);
 
-            pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
+            pfnClearTiles(pDC, hWorkerPrivateData, SWR_ATTACHMENT_DEPTH, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
         }
 
         if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
@@ -262,7 +263,7 @@ void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, vo
             clearData[0] = pClear->clearStencil;
             PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
 
-            pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
+            pfnClearTiles(pDC, hWorkerPrivateData, SWR_ATTACHMENT_STENCIL, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
         }
 
         RDTSC_END(BEClear, 1);
index 20b2ec58287e19b2f4ddcb19a70f578aefd510d5..05234c21822a2cc0d36836fe0468f3ca7a88b65f 100644 (file)
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -884,6 +884,9 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
     BarycentricCoeffs coeffs;
     SetupBarycentricCoeffs(&coeffs, work);
 
+    SWR_CONTEXT *pContext = pDC->pContext;
+    void* pWorkerData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
+
     SWR_PS_CONTEXT psContext;
     const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
     SetupPixelShaderContext<T>(&psContext, samplePos, work);
@@ -964,7 +967,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
 
             // execute pixel shader
             RDTSC_BEGIN(BEPixelShader, pDC->drawId);
-            state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
+            state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext);
             UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
             RDTSC_END(BEPixelShader, 0);
 
index c7c6c533e37f770c80d13ee39daeb363e8bd8ff8..5940aa7ba45495e1818bcf8f1c80823dfe1deb63 100644 (file)
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -43,6 +43,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
     RDTSC_BEGIN(BESampleRateBackend, pDC->drawId);
     RDTSC_BEGIN(BESetup, pDC->drawId);
 
+    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
     const API_STATE &state = GetApiState(pDC);
 
     BarycentricCoeffs coeffs;
@@ -163,7 +164,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
 
                     // execute pixel shader
                     RDTSC_BEGIN(BEPixelShader, pDC->drawId);
-                    state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
+                    state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext);
                     RDTSC_END(BEPixelShader, 0);
 
                     // update stats
index 26d5a75bd12308d07d5426edcfe2132b3b7b3de9..aaaba636ed375cd70c695b83bdd33a492fe5eeea 100644 (file)
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -43,6 +43,8 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
     RDTSC_BEGIN(BESingleSampleBackend, pDC->drawId);
     RDTSC_BEGIN(BESetup, pDC->drawId);
 
+    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
+
     const API_STATE &state = GetApiState(pDC);
 
     BarycentricCoeffs coeffs;
@@ -146,7 +148,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
 
                 // execute pixel shader
                 RDTSC_BEGIN(BEPixelShader, pDC->drawId);
-                state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
+                state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext);
                 RDTSC_END(BEPixelShader, 0);
 
                 // update stats
index d31fd37095d21b8282fef7a2e82e22e3669cd9d3..9f8dc887aa6457859fc9c15748613c8e00a76a1c 100644 (file)
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
index 7bc69f507236e8ba69431c4a921e174041372ac1..af8f4b8db4f352a1bdc0c67599eeec88163f818f 100644 (file)
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -483,6 +483,7 @@ struct SWR_CONTEXT
     THREAD_POOL threadPool; // Thread pool associated with this context
     SWR_THREADING_INFO threadInfo;
     SWR_API_THREADING_INFO apiThreadInfo;
+    SWR_WORKER_PRIVATE_STATE workerPrivateState;
 
     uint32_t MAX_DRAWS_IN_FLIGHT;
 
index 30c2e7bab517ab599868db2eb34059bcc093417a..9630afa036d111c9ffdbb2cb44c81c8a910af5e3 100644 (file)
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -799,6 +799,8 @@ static void GeometryShaderStage(
 {
     RDTSC_BEGIN(FEGeometryShader, pDC->drawId);
 
+    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
+
     const API_STATE& state = GetApiState(pDC);
     const SWR_GS_STATE* pState = &state.gsState;
     SWR_GS_CONTEXT gsContext;
@@ -850,7 +852,7 @@ static void GeometryShaderStage(
         gsContext.mask = GenerateMask(numInputPrims);
 
         // execute the geometry shader
-        state.pfnGsFunc(GetPrivateState(pDC), &gsContext);
+        state.pfnGsFunc(GetPrivateState(pDC), pWorkerData, &gsContext);
         AR_EVENT(GSStats(gsContext.stats.numInstExecuted));
 
         for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
@@ -1169,6 +1171,7 @@ static void TessellationStages(
 {
     const API_STATE& state = GetApiState(pDC);
     const SWR_TS_STATE& tsState = state.tsState;
+    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
 
     SWR_ASSERT(gt_pTessellationThreadData);
 
@@ -1250,7 +1253,7 @@ static void TessellationStages(
 
     // Run the HS
     RDTSC_BEGIN(FEHullShader, pDC->drawId);
-    state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
+    state.pfnHsFunc(GetPrivateState(pDC), pWorkerData, &hsContext);
     RDTSC_END(FEHullShader, 0);
 
     UPDATE_STAT_FE(HsInvocations, numPrims);
@@ -1315,7 +1318,7 @@ static void TessellationStages(
             dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);
 
             RDTSC_BEGIN(FEDomainShader, pDC->drawId);
-            state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
+            state.pfnDsFunc(GetPrivateState(pDC), pWorkerData, &dsContext);
             RDTSC_END(FEDomainShader, 0);
 
             AR_EVENT(DSStats(dsContext.stats.numInstExecuted));
@@ -1521,6 +1524,8 @@ void ProcessDraw(
 
     RDTSC_BEGIN(FEProcessDraw, pDC->drawId);
 
+    void* pWorkerData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
+
     DRAW_WORK&          work = *(DRAW_WORK*)pUserData;
     const API_STATE&    state = GetApiState(pDC);
 
@@ -1738,13 +1743,13 @@ void ProcessDraw(
                 // 1. Execute FS/VS for a single SIMD.
                 RDTSC_BEGIN(FEFetchShader, pDC->drawId);
 #if USE_SIMD16_SHADERS
-                state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo_lo, vin);
+                state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_lo, vin);
 #else
-                state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo_lo, vin_lo);
+                state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_lo, vin_lo);
 
                 if ((i + KNOB_SIMD_WIDTH) < endVertex)  // 1/2 of KNOB_SIMD16_WIDTH
                 {
-                    state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo_hi, vin_hi);
+                    state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_hi, vin_hi);
                 }
 #endif
                 RDTSC_END(FEFetchShader, 0);
@@ -1793,15 +1798,15 @@ void ProcessDraw(
                 {
                     RDTSC_BEGIN(FEVertexShader, pDC->drawId);
 #if USE_SIMD16_VS
-                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo);
+                    state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_lo);
                     AR_EVENT(VSStats(vsContext_lo.stats.numInstExecuted));
 #else
-                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo);
+                    state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_lo);
                     AR_EVENT(VSStats(vsContext_lo.stats.numInstExecuted));
 
                     if ((i + KNOB_SIMD_WIDTH) < endVertex)  // 1/2 of KNOB_SIMD16_WIDTH
                     {
-                        state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_hi);
+                        state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_hi);
                         AR_EVENT(VSStats(vsContext_hi.stats.numInstExecuted));
                     }
 #endif
@@ -1994,7 +1999,7 @@ void ProcessDraw(
 
                 // 1. Execute FS/VS for a single SIMD.
                 RDTSC_BEGIN(FEFetchShader, pDC->drawId);
-                state.pfnFetchFunc(GetPrivateState(pDC), fetchInfo, vout);
+                state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo, vout);
                 RDTSC_END(FEFetchShader, 0);
 
                 // forward fetch generated vertex IDs to the vertex shader
@@ -2016,7 +2021,7 @@ void ProcessDraw(
 #endif
                 {
                     RDTSC_BEGIN(FEVertexShader, pDC->drawId);
-                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
+                    state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext);
                     RDTSC_END(FEVertexShader, 0);
 
                     UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
index 08dd51bf73168932edb872ca788a37e4492d665e..67c28ad97c43202c6a9f45d11f129bc08018f4b8 100644 (file)
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -302,7 +302,7 @@ void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTi
     triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z;
 
     RenderOutputBuffers renderBuffers;
-    GetRenderHotTiles(pDC, macroTile, tileAlignedX >> KNOB_TILE_X_DIM_SHIFT , tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT, 
+    GetRenderHotTiles(pDC, workerId, macroTile, tileAlignedX >> KNOB_TILE_X_DIM_SHIFT , tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT, 
         renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
 
     RDTSC_BEGIN(BEPixelBackend, pDC->drawId);
index 7f9b3788c7f364640cea1bb6bd2a216fe38a7178..ca39d7c38f8ff0ee4e1f1e5176c73447d1a94cbd 100644 (file)
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -40,7 +40,7 @@
 extern PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT][STATE_VALID_TRI_EDGE_COUNT][2];
 
 template <uint32_t numSamples = 1>
-void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex);
+void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex);
 template <typename RT>
 void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers &buffers);
 template <typename RT>
@@ -1145,7 +1145,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
     uint32_t maxX = maxTileX;
 
     RenderOutputBuffers renderBuffers, currentRenderBufferRow;
-    GetRenderHotTiles<RT::MT::numSamples>(pDC, macroTile, minTileX, minTileY, renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
+    GetRenderHotTiles<RT::MT::numSamples>(pDC, workerId, macroTile, minTileX, minTileY, renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
     currentRenderBufferRow = renderBuffers;
 
     // rasterize and generate coverage masks per sample
@@ -1297,10 +1297,11 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
 
 // Get pointers to hot tile memory for color RT, depth, stencil
 template <uint32_t numSamples>
-void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex)
+void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex)
 {
     const API_STATE& state = GetApiState(pDC);
     SWR_CONTEXT *pContext = pDC->pContext;
+    HANDLE hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; // private data of worker workerId; handed to every GetHotTile call in this function
 
     uint32_t mx, my;
     MacroTileMgr::getTileIndices(macroID, mx, my);
@@ -1316,7 +1317,7 @@ void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint
     uint32_t colorHottileEnableMask = state.colorHottileEnable;
     while(_BitScanForward(&rtSlot, colorHottileEnableMask))
     {
-        HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, 
+        HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, hWorkerPrivateData, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, 
             numSamples, renderTargetArrayIndex);
         pColor->state = HOTTILE_DIRTY;
         renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset;
@@ -1328,7 +1329,7 @@ void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint
         const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
         uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
         offset*=numSamples;
-        HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, 
+        HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, hWorkerPrivateData, macroID, SWR_ATTACHMENT_DEPTH, true,
             numSamples, renderTargetArrayIndex);
         pDepth->state = HOTTILE_DIRTY;
         SWR_ASSERT(pDepth->pBuffer != nullptr);
@@ -1339,7 +1340,7 @@ void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint
         const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
         uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
         offset*=numSamples;
-        HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, 
+        HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, hWorkerPrivateData, macroID, SWR_ATTACHMENT_STENCIL, true,
             numSamples, renderTargetArrayIndex);
         pStencil->state = HOTTILE_DIRTY;
         SWR_ASSERT(pStencil->pBuffer != nullptr);
index cdb30f60fdf27f109579066bcd71331994a84848..217cf44c58fee224c62d63e2c648bd96e45bdc76 100644 (file)
@@ -911,18 +911,18 @@ struct SWR_BLEND_CONTEXT
 /// FUNCTION POINTERS FOR SHADERS
 
 #if USE_SIMD16_SHADERS
-typedef void(__cdecl *PFN_FETCH_FUNC)(HANDLE hPrivateData, SWR_FETCH_CONTEXT& fetchInfo, simd16vertex& out);
+typedef void(__cdecl *PFN_FETCH_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_FETCH_CONTEXT& fetchInfo, simd16vertex& out);
 #else
-typedef void(__cdecl *PFN_FETCH_FUNC)(HANDLE hPrivateData, SWR_FETCH_CONTEXT& fetchInfo, simdvertex& out);
+typedef void(__cdecl *PFN_FETCH_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_FETCH_CONTEXT& fetchInfo, simdvertex& out);
 #endif
-typedef void(__cdecl *PFN_VERTEX_FUNC)(HANDLE hPrivateData, SWR_VS_CONTEXT* pVsContext);
-typedef void(__cdecl *PFN_HS_FUNC)(HANDLE hPrivateData, SWR_HS_CONTEXT* pHsContext);
-typedef void(__cdecl *PFN_DS_FUNC)(HANDLE hPrivateData, SWR_DS_CONTEXT* pDsContext);
-typedef void(__cdecl *PFN_GS_FUNC)(HANDLE hPrivateData, SWR_GS_CONTEXT* pGsContext);
-typedef void(__cdecl *PFN_CS_FUNC)(HANDLE hPrivateData, SWR_CS_CONTEXT* pCsContext);
+typedef void(__cdecl *PFN_VERTEX_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_VS_CONTEXT* pVsContext);
+typedef void(__cdecl *PFN_HS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_HS_CONTEXT* pHsContext);
+typedef void(__cdecl *PFN_DS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_DS_CONTEXT* pDsContext);
+typedef void(__cdecl *PFN_GS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_GS_CONTEXT* pGsContext);
+typedef void(__cdecl *PFN_CS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_CS_CONTEXT* pCsContext);
 typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT& soContext);
-typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext);
-typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext);
+typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_PS_CONTEXT *pContext);
+typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_PS_CONTEXT *pContext);
 typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(SWR_BLEND_CONTEXT*);
 typedef simdscalar(*PFN_QUANTIZE_DEPTH)(simdscalar const &);
 
index 3eb20abcbfc6d2af8c3f530f9ffbee55e8c8e5aa..9e16246c3f407cf23998232bc7a03fdf1c48b09f 100644 (file)
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -42,6 +42,7 @@
 #endif
 
 #include "common/os.h"
+#include "core/api.h"
 #include "context.h"
 #include "frontend.h"
 #include "backend.h"
@@ -1128,7 +1129,8 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
 
     if (pContext->threadInfo.SINGLE_THREADED)
     {
-        return;
+        numAPIReservedThreads = 0; // single-threaded: makes the API-reserved-thread setup below a no-op
+        numThreads = 1; // NOTE(review): presumably sizes the pool at one worker slot so THREAD_DATA / private data still get created - confirm where pPool->numThreads is assigned
     }
 
     if (numAPIReservedThreads)
@@ -1139,6 +1141,10 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
         {
             numAPIReservedThreads = 0;
         }
+        else
+        {
+            memset(pPool->pApiThreadData, 0, sizeof(THREAD_DATA) * numAPIReservedThreads); // zero all reserved THREAD_DATA records, including their pWorkerPrivateData pointer
+        }
     }
     pPool->numReservedThreads = numAPIReservedThreads;
 
@@ -1147,8 +1153,37 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
 
     pPool->pThreadData = new (std::nothrow) THREAD_DATA[pPool->numThreads];
     SWR_ASSERT(pPool->pThreadData);
+    memset(pPool->pThreadData, 0, sizeof(THREAD_DATA) * pPool->numThreads); // zero every field so pWorkerPrivateData stays cleared unless assigned below
     pPool->numaMask = 0;
 
+    // Allocate worker private data: one slice per worker, all carved out of a single 64-byte-aligned allocation
+    pPool->pWorkerPrivateDataArray = nullptr;
+    if (pContext->workerPrivateState.perWorkerPrivateStateSize)
+    {
+        size_t perWorkerSize = AlignUpPow2(pContext->workerPrivateState.perWorkerPrivateStateSize, 64); // stride between consecutive workers' slices
+        size_t totalSize = perWorkerSize * pPool->numThreads;
+        if (totalSize)
+        {
+            pPool->pWorkerPrivateDataArray = AlignedMalloc(totalSize, 64);
+            SWR_ASSERT(pPool->pWorkerPrivateDataArray);
+
+            void* pWorkerData = pPool->pWorkerPrivateDataArray; // cursor: start of the current worker's slice
+            for (uint32_t i = 0; i < pPool->numThreads; ++i)
+            {
+                pPool->pThreadData[i].pWorkerPrivateData = pWorkerData;
+                if (pContext->workerPrivateState.pfnInitWorkerData)
+                {
+                    pContext->workerPrivateState.pfnInitWorkerData(pWorkerData, i); // optional client hook: (slice pointer, worker index)
+                }
+                pWorkerData = PtrAdd(pWorkerData, perWorkerSize);
+            }
+        }
+    }
+
+    if (pContext->threadInfo.SINGLE_THREADED)
+    {
+        return; // moved here from the top of the function: thread data and private data are set up first, pThreads is never allocated
+    }
 
     pPool->pThreads = new (std::nothrow) THREAD_PTR[pPool->numThreads];
     SWR_ASSERT(pPool->pThreads);
@@ -1293,13 +1328,13 @@ void StartThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
 /// @param pPool - pointer to thread pool object.
 void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
 {
-    if (!pContext->threadInfo.SINGLE_THREADED)
-    {
-        // Wait for all threads to finish
-        SwrWaitForIdle(pContext);
+    // Wait for all threads to finish (no longer skipped when SINGLE_THREADED)
+    SwrWaitForIdle(pContext);
 
-        // Wait for threads to finish and destroy them
-        for (uint32_t t = 0; t < pPool->numThreads; ++t)
+    // Per worker: destroy its thread object (multi-threaded only), then run the optional worker-data finish hook
+    for (uint32_t t = 0; t < pPool->numThreads; ++t)
+    {
+        if (!pContext->threadInfo.SINGLE_THREADED) // thread objects only exist in multi-threaded mode
         {
             // Detach from thread.  Cannot join() due to possibility (in Windows) of code
             // in some DLLMain(THREAD_DETATCH case) blocking the thread until after this returns.
@@ -1307,10 +1342,17 @@ void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
             delete(pPool->pThreads[t]);
         }
 
-        delete[] pPool->pThreads;
-
-        // Clean up data used by threads
-        delete[] pPool->pThreadData;
-        delete[] pPool->pApiThreadData;
+        if (pContext->workerPrivateState.pfnFinishWorkerData)
+        {
+            pContext->workerPrivateState.pfnFinishWorkerData(pPool->pThreadData[t].pWorkerPrivateData, t);
+        }
     }
+
+    delete[] pPool->pThreads;
+
+    // Clean up data used by threads
+    delete[] pPool->pThreadData;
+    delete[] pPool->pApiThreadData;
+
+    AlignedFree(pPool->pWorkerPrivateDataArray);
 }
index 2e53265f424b8a331bfe85ca876c6783a6d333ef..cb918ddb60d76c707e20b8264ee7ce3f8791169b 100644 (file)
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -35,9 +35,11 @@ typedef std::thread* THREAD_PTR;
 
 struct SWR_CONTEXT;
 struct DRAW_CONTEXT;
+struct SWR_WORKER_PRIVATE_STATE;
 
 struct THREAD_DATA
 {
+    void* pWorkerPrivateData; // This worker's slice of THREAD_POOL::pWorkerPrivateDataArray; left zeroed when no per-worker state size is configured
     uint32_t procGroupId;   // Will always be 0 for non-Windows OS
     uint32_t threadId;      // within the procGroup for Windows
     uint32_t numaId;        // NUMA node id
@@ -55,6 +57,7 @@ struct THREAD_POOL
     uint32_t numThreads;
     uint32_t numaMask;
     THREAD_DATA *pThreadData;
+    void* pWorkerPrivateDataArray; // Single aligned allocation backing every worker's private data; released with AlignedFree in DestroyThreadPool
     uint32_t numReservedThreads; // Number of threads reserved for API use
     THREAD_DATA *pApiThreadData;
 };
index f4686703291fdecfe5bebcaf59003904beebe630..28fa78771140875cf86b81e11cb443424b0a8c66 100644 (file)
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -83,7 +83,7 @@ void MacroTileMgr::markTileComplete(uint32_t id)
     tile.mWorkItemsBE = 0;
 }
 
-HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples,
+HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, HANDLE hWorkerPrivateData, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples,
     uint32_t renderTargetArrayIndex)
 {
     uint32_t x, y;
@@ -163,11 +163,11 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32
 
             if (hotTile.state == HOTTILE_DIRTY)
             {
-                pContext->pfnStoreTile(GetPrivateState(pDC), format, attachment,
+                pContext->pfnStoreTile(GetPrivateState(pDC), hWorkerPrivateData, format, attachment,
                     x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, hotTile.renderTargetArrayIndex, hotTile.pBuffer);
             }
 
-            pContext->pfnLoadTile(GetPrivateState(pDC), format, attachment,
+            pContext->pfnLoadTile(GetPrivateState(pDC), hWorkerPrivateData, format, attachment,
                 x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, renderTargetArrayIndex, hotTile.pBuffer);
 
             hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
@@ -379,6 +379,7 @@ void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile)
 void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroID)
 {
     const API_STATE& state = GetApiState(pDC);
+    HANDLE hWorkerPrivateData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; // private data of worker workerId (looked up via pDC->pContext); forwarded to GetHotTile and pfnLoadTile below
 
     uint32_t x, y;
     MacroTileMgr::getTileIndices(macroID, x, y);
@@ -392,13 +393,13 @@ void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, ui
     uint32_t colorHottileEnableMask = state.colorHottileEnable;
     while (_BitScanForward(&rtSlot, colorHottileEnableMask))
     {
-        HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, numSamples);
+        HOTTILE* pHotTile = GetHotTile(pContext, pDC, hWorkerPrivateData, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, numSamples);
 
         if (pHotTile->state == HOTTILE_INVALID)
         {
             RDTSC_BEGIN(BELoadTiles, pDC->drawId);
             // invalid hottile before draw requires a load from surface before we can draw to it
-            pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
+            pContext->pfnLoadTile(GetPrivateState(pDC), hWorkerPrivateData, KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
             pHotTile->state = HOTTILE_DIRTY;
             RDTSC_END(BELoadTiles, 0);
         }
@@ -416,12 +417,12 @@ void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, ui
     // check depth if enabled
     if (state.depthHottileEnable)
     {
-        HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples);
+        HOTTILE* pHotTile = GetHotTile(pContext, pDC, hWorkerPrivateData, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples);
         if (pHotTile->state == HOTTILE_INVALID)
         {
             RDTSC_BEGIN(BELoadTiles, pDC->drawId);
             // invalid hottile before draw requires a load from surface before we can draw to it
-            pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
+            pContext->pfnLoadTile(GetPrivateState(pDC), hWorkerPrivateData, KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
             pHotTile->state = HOTTILE_DIRTY;
             RDTSC_END(BELoadTiles, 0);
         }
@@ -438,12 +439,12 @@ void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, ui
     // check stencil if enabled
     if (state.stencilHottileEnable)
     {
-        HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples);
+        HOTTILE* pHotTile = GetHotTile(pContext, pDC, hWorkerPrivateData, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples);
         if (pHotTile->state == HOTTILE_INVALID)
         {
             RDTSC_BEGIN(BELoadTiles, pDC->drawId);
             // invalid hottile before draw requires a load from surface before we can draw to it
-            pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
+            pContext->pfnLoadTile(GetPrivateState(pDC), hWorkerPrivateData, KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
             pHotTile->state = HOTTILE_DIRTY;
             RDTSC_END(BELoadTiles, 0);
         }
index 8ef74a7f4b9880a0956a1257b4d9a89e36e92645..2831010b12f2718a87670ae86892b16f72583c21 100644 (file)
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -305,7 +305,7 @@ public:
 
     void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroID);
 
-    HOTTILE *GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1,
+    HOTTILE *GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, HANDLE hWorkerData, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1,
         uint32_t renderTargetArrayIndex = 0);
 
     HOTTILE *GetHotTileNoLoad(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1);
index 7f9c9dd9d7babded5af9aa106633158a9100babf..284eb27a7d3ece7a8384e52565e99588f93b9be0 100644 (file)
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
-* 
+*
 * @file JitManager.cpp
-* 
+*
 * @brief Implementation if the Jit Manager.
-* 
+*
 * Notes:
-* 
+*
 ******************************************************************************/
 #include "jit_pch.hpp"
 
@@ -66,7 +66,7 @@ JitManager::JitManager(uint32_t simdWidth, const char *arch, const char* core)
     InitializeNativeTargetAsmPrinter();
     InitializeNativeTargetDisassembler();
 
-        
+
     TargetOptions    tOpts;
     tOpts.AllowFPOpFusion = FPOpFusion::Fast;
     tOpts.NoInfsFPMath = false;
@@ -125,6 +125,8 @@ JitManager::JitManager(uint32_t simdWidth, const char *arch, const char* core)
     // llvm5 is picky and does not take a void * type
     fsArgs.push_back(PointerType::get(Gen_SWR_FETCH_CONTEXT(this), 0));
 
+    fsArgs.push_back(Type::getInt8PtrTy(mContext)); // new second argument, an opaque i8*; presumably the hWorkerPrivateData handle of PFN_FETCH_FUNC (FetchJit::Create names it pWorkerData) - confirm
+
     fsArgs.push_back(PointerType::get(Gen_SWR_FETCH_CONTEXT(this), 0));
 #if USE_SIMD16_SHADERS
     fsArgs.push_back(PointerType::get(Gen_simd16vertex(this), 0));
@@ -158,7 +160,7 @@ JitManager::JitManager(uint32_t simdWidth, const char *arch, const char* core)
 void JitManager::SetupNewModule()
 {
     SWR_ASSERT(mIsModuleFinalized == true && "Current module is not finalized!");
-    
+
     std::unique_ptr<Module> newModule(new Module("", mContext));
     mpCurrentModule = newModule.get();
 #if defined(_WIN32)
index af97b83cb2d6e3fb3261a8b3b40c346cd83c21c3..09590b7104731c5d879de289f4c9d9c078b6765c 100644 (file)
@@ -1,5 +1,5 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -91,6 +91,7 @@ struct FetchJit : public BuilderGfxMem
     void CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
     void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
 
+    Value* mpWorkerData;
     Value* mpFetchInfo;
 };
 
@@ -113,6 +114,8 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
     privateContext->setName("privateContext");
     SetPrivateContext(privateContext);
 
+    mpWorkerData = &*argitr; ++argitr;
+    mpWorkerData->setName("pWorkerData");
     mpFetchInfo = &*argitr; ++argitr;
     mpFetchInfo->setName("fetchInfo");
     Value*    pVtxOut = &*argitr;
@@ -1097,8 +1100,7 @@ Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
     Value* vIndexMask = ICMP_SGT(vMaxIndex, vIndexOffsets);
 
     // Load the indices; OOB loads 0
-    pIndices = BITCAST(pIndices, PointerType::get(mSimdInt32Ty, 0));
-    return MASKED_LOAD(pIndices, 4, vIndexMask, VIMMED1(0));
+    return MASKED_LOAD(pIndices, 4, vIndexMask, VIMMED1(0), "vIndices", PointerType::get(mSimdInt32Ty, 0), GFX_MEM_CLIENT_FETCH);
 }
 
 //////////////////////////////////////////////////////////////////////////
index 98bf28b21d12745d1ca60883aa201b6615d2a982..6a528b6a0f211f583671ff905a7042bab3b3e0b2 100644 (file)
@@ -153,6 +153,7 @@ struct StoreMacroTileClear
 /// @param x, y - Coordinates to raster tile.
 /// @param pClearColor - Pointer to clear color
 void SwrStoreHotTileClear(
+    HANDLE hWorkerPrivateData,
     SWR_SURFACE_STATE *pDstSurface,
     SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
     UINT x,
index 9dbc16ad9ae526beff34237dfacff66856b5417b..8033304d20c6ea4d7288606ef0f8029842e84f97 100644 (file)
@@ -54,6 +54,7 @@ static std::mutex sBucketMutex;
 /// @param x, y - Coordinates to raster tile.
 /// @param pDstHotTile - Pointer to Hot Tile
 void SwrLoadHotTile(
+    HANDLE hWorkerPrivateData,
     const SWR_SURFACE_STATE *pSrcSurface,
     SWR_FORMAT dstFormat,
     SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
index 9c20669f77ba0e6d8bb8a272dea1820a0ed31df4..53b82c4c12c145990e1f7d0b14dbd409018b1034 100644 (file)
@@ -59,6 +59,7 @@ static std::vector<int32_t> sBuckets(NUM_SWR_FORMATS, -1);
 /// @param x, y - Coordinates to raster tile.
 /// @param pSrcHotTile - Pointer to Hot Tile
 void SwrStoreHotTileToSurface(
+    HANDLE hWorkerPrivateData,
     SWR_SURFACE_STATE *pDstSurface,
     SWR_FORMAT srcFormat,
     SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
index fc5561680cbec3fd8b771a106c5d8bef59548134..bab7800c604e11e5c5d5fd3992a0c9585a1d2c94 100644 (file)
@@ -25,6 +25,7 @@
 
 INLINE void
 swr_LoadHotTile(HANDLE hPrivateContext,
+                HANDLE hWorkerPrivateData,
                 SWR_FORMAT dstFormat,
                 SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
                 UINT x, UINT y,
@@ -34,11 +35,12 @@ swr_LoadHotTile(HANDLE hPrivateContext,
    swr_draw_context *pDC = (swr_draw_context*)hPrivateContext;
    SWR_SURFACE_STATE *pSrcSurface = &pDC->renderTargets[renderTargetIndex];
 
-   pDC->pAPI->pfnSwrLoadHotTile(pSrcSurface, dstFormat, renderTargetIndex, x, y, renderTargetArrayIndex, pDstHotTile);
+   pDC->pAPI->pfnSwrLoadHotTile(hWorkerPrivateData, pSrcSurface, dstFormat, renderTargetIndex, x, y, renderTargetArrayIndex, pDstHotTile);
 }
 
 INLINE void
 swr_StoreHotTile(HANDLE hPrivateContext,
+                 HANDLE hWorkerPrivateData,
                  SWR_FORMAT srcFormat,
                  SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
                  UINT x, UINT y,
@@ -48,11 +50,12 @@ swr_StoreHotTile(HANDLE hPrivateContext,
    swr_draw_context *pDC = (swr_draw_context*)hPrivateContext;
    SWR_SURFACE_STATE *pDstSurface = &pDC->renderTargets[renderTargetIndex];
 
-   pDC->pAPI->pfnSwrStoreHotTileToSurface(pDstSurface, srcFormat, renderTargetIndex, x, y, renderTargetArrayIndex, pSrcHotTile);
+   pDC->pAPI->pfnSwrStoreHotTileToSurface(hWorkerPrivateData, pDstSurface, srcFormat, renderTargetIndex, x, y, renderTargetArrayIndex, pSrcHotTile);
 }
 
 INLINE void
 swr_StoreHotTileClear(HANDLE hPrivateContext,
+                      HANDLE hWorkerPrivateData,
                       SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
                       UINT x,
                       UINT y,
@@ -63,5 +66,5 @@ swr_StoreHotTileClear(HANDLE hPrivateContext,
    swr_draw_context *pDC = (swr_draw_context*)hPrivateContext;
    SWR_SURFACE_STATE *pDstSurface = &pDC->renderTargets[renderTargetIndex];
 
-   pDC->pAPI->pfnSwrStoreHotTileClear(pDstSurface, renderTargetIndex, x, y, renderTargetArrayIndex, pClearColor);
+   pDC->pAPI->pfnSwrStoreHotTileClear(hWorkerPrivateData, pDstSurface, renderTargetIndex, x, y, renderTargetArrayIndex, pClearColor);
 }
index 6ea021a987eaebe4f33e9d576e8c302a50cbb082..13d89863fd7b65122704f33846bc8bf22efeb2ee 100644 (file)
@@ -586,6 +586,7 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key)
    attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float));
 
    std::vector<Type *> gsArgs{PointerType::get(Gen_swr_draw_context(JM()), 0),
+                              PointerType::get(mInt8Ty, 0),
                               PointerType::get(Gen_SWR_GS_CONTEXT(JM()), 0)};
    FunctionType *vsFuncType =
       FunctionType::get(Type::getVoidTy(JM()->mContext), gsArgs, false);
@@ -610,6 +611,8 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key)
    auto argitr = pFunction->arg_begin();
    Value *hPrivateData = &*argitr++;
    hPrivateData->setName("hPrivateData");
+   Value *pWorkerData = &*argitr++;
+   pWorkerData->setName("pWorkerData");
    Value *pGsCtx = &*argitr++;
    pGsCtx->setName("gsCtx");
 
@@ -754,6 +757,7 @@ BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key)
    attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float));
 
    std::vector<Type *> vsArgs{PointerType::get(Gen_swr_draw_context(JM()), 0),
+                              PointerType::get(mInt8Ty, 0),
                               PointerType::get(Gen_SWR_VS_CONTEXT(JM()), 0)};
    FunctionType *vsFuncType =
       FunctionType::get(Type::getVoidTy(JM()->mContext), vsArgs, false);
@@ -778,6 +782,8 @@ BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key)
    auto argitr = pFunction->arg_begin();
    Value *hPrivateData = &*argitr++;
    hPrivateData->setName("hPrivateData");
+   Value *pWorkerData = &*argitr++;
+   pWorkerData->setName("pWorkerData");
    Value *pVsCtx = &*argitr++;
    pVsCtx->setName("vsCtx");
    
@@ -1037,6 +1043,7 @@ BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_fs_key &key)
    attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float));
 
    std::vector<Type *> fsArgs{PointerType::get(Gen_swr_draw_context(JM()), 0),
+                              PointerType::get(mInt8Ty, 0),
                               PointerType::get(Gen_SWR_PS_CONTEXT(JM()), 0)};
    FunctionType *funcType =
       FunctionType::get(Type::getVoidTy(JM()->mContext), fsArgs, false);
@@ -1060,6 +1067,8 @@ BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_fs_key &key)
    auto args = pFunction->arg_begin();
    Value *hPrivateData = &*args++;
    hPrivateData->setName("hPrivateData");
+   Value *pWorkerData = &*args++;
+   pWorkerData->setName("pWorkerData");
    Value *pPS = &*args++;
    pPS->setName("psCtx");