swr/rasterizer: modernize thread TLB
author Jan Zielinski <jan.zielinski@intel.com>
Thu, 1 Aug 2019 13:14:03 +0000 (15:14 +0200)
committer Jan Zielinski <jan.zielinski@intel.com>
Thu, 8 Aug 2019 10:33:21 +0000 (12:33 +0200)
Reviewed-by: Alok Hota <alok.hota@intel.com>
14 files changed:
src/gallium/drivers/swr/rasterizer/core/api.cpp
src/gallium/drivers/swr/rasterizer/core/api.h
src/gallium/drivers/swr/rasterizer/core/backend_impl.h
src/gallium/drivers/swr/rasterizer/core/context.h
src/gallium/drivers/swr/rasterizer/core/frontend.cpp
src/gallium/drivers/swr/rasterizer/core/knobs.h
src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
src/gallium/drivers/swr/rasterizer/core/state.h
src/gallium/drivers/swr/rasterizer/core/threads.cpp
src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
src/gallium/drivers/swr/rasterizer/jitter/builder.h
src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp

index f1b0dc03352c4a11fc25720c170452a63d91de15..a6f86b36f9805087216c79b8cdd7f0b526d186f4 100644 (file)
@@ -71,6 +71,21 @@ HANDLE SwrCreateContext(SWR_CREATECONTEXT_INFO* pCreateInfo)
 
     pContext->privateStateSize = pCreateInfo->privateStateSize;
 
+    // initialize callback functions
+    pContext->pfnLoadTile                = pCreateInfo->pfnLoadTile;
+    pContext->pfnStoreTile               = pCreateInfo->pfnStoreTile;
+    pContext->pfnTranslateGfxptrForRead  = pCreateInfo->pfnTranslateGfxptrForRead;
+    pContext->pfnTranslateGfxptrForWrite = pCreateInfo->pfnTranslateGfxptrForWrite;
+    pContext->pfnMakeGfxPtr              = pCreateInfo->pfnMakeGfxPtr;
+    pContext->pfnCreateMemoryContext     = pCreateInfo->pfnCreateMemoryContext;
+    pContext->pfnDestroyMemoryContext    = pCreateInfo->pfnDestroyMemoryContext;
+    pContext->pfnUpdateSoWriteOffset     = pCreateInfo->pfnUpdateSoWriteOffset;
+    pContext->pfnUpdateStats             = pCreateInfo->pfnUpdateStats;
+    pContext->pfnUpdateStatsFE           = pCreateInfo->pfnUpdateStatsFE;
+
+
+    pContext->hExternalMemory = pCreateInfo->hExternalMemory;
+
     pContext->MAX_DRAWS_IN_FLIGHT = KNOB_MAX_DRAWS_IN_FLIGHT;
     if (pCreateInfo->MAX_DRAWS_IN_FLIGHT != 0)
     {
@@ -169,13 +184,13 @@ HANDLE SwrCreateContext(SWR_CREATECONTEXT_INFO* pCreateInfo)
             pContext->threadPool.pThreadData ? pContext->threadPool.pThreadData[i].numaId : 0;
         pContext->ppScratch[i] = (uint8_t*)VirtualAllocExNuma(GetCurrentProcess(),
                                                               nullptr,
-                                                              32 * sizeof(KILOBYTE),
+                                                              KNOB_WORKER_SCRATCH_SPACE_SIZE,
                                                               MEM_RESERVE | MEM_COMMIT,
                                                               PAGE_READWRITE,
                                                               numaNode);
 #else
         pContext->ppScratch[i] =
-            (uint8_t*)AlignedMalloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4);
+            (uint8_t*)AlignedMalloc(KNOB_WORKER_SCRATCH_SPACE_SIZE, KNOB_SIMD_WIDTH * 4);
 #endif
 
 #if defined(KNOB_ENABLE_AR)
@@ -200,17 +215,6 @@ HANDLE SwrCreateContext(SWR_CREATECONTEXT_INFO* pCreateInfo)
     // initialize hot tile manager
     pContext->pHotTileMgr = new HotTileMgr();
 
-    // initialize callback functions
-    pContext->pfnLoadTile                = pCreateInfo->pfnLoadTile;
-    pContext->pfnStoreTile               = pCreateInfo->pfnStoreTile;
-    pContext->pfnTranslateGfxptrForRead  = pCreateInfo->pfnTranslateGfxptrForRead;
-    pContext->pfnTranslateGfxptrForWrite = pCreateInfo->pfnTranslateGfxptrForWrite;
-    pContext->pfnMakeGfxPtr              = pCreateInfo->pfnMakeGfxPtr;
-    pContext->pfnUpdateSoWriteOffset     = pCreateInfo->pfnUpdateSoWriteOffset;
-    pContext->pfnUpdateStats             = pCreateInfo->pfnUpdateStats;
-    pContext->pfnUpdateStatsFE           = pCreateInfo->pfnUpdateStatsFE;
-
-
     // pass pointer to bucket manager back to caller
 #ifdef KNOB_ENABLE_RDTSC
     pCreateInfo->pBucketMgr = pContext->pBucketMgr;
@@ -1531,7 +1535,9 @@ void SWR_API SwrDiscardRect(HANDLE hContext, uint32_t attachmentMask, const SWR_
 void SwrDispatch(HANDLE   hContext,
                  uint32_t threadGroupCountX,
                  uint32_t threadGroupCountY,
-                 uint32_t threadGroupCountZ)
+                 uint32_t threadGroupCountZ
+
+)
 {
     if (KNOB_TOSS_DRAW)
     {
@@ -1551,6 +1557,7 @@ void SwrDispatch(HANDLE   hContext,
     pTaskData->threadGroupCountX = threadGroupCountX;
     pTaskData->threadGroupCountY = threadGroupCountY;
     pTaskData->threadGroupCountZ = threadGroupCountZ;
+
     pTaskData->enableThreadDispatch = false;
 
     uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
index 8058defb3886c0dd341c3b284cb9e4d68c5d302d..93ea0d42535a51fcae3f209ecfbdd6569554a5ce 100644 (file)
@@ -147,14 +147,20 @@ typedef void(SWR_API* PFN_CLEAR_TILE)(HANDLE                      hPrivateContex
 
 typedef void*(SWR_API* PFN_TRANSLATE_GFXPTR_FOR_READ)(HANDLE   hPrivateContext,
                                                       gfxptr_t xpAddr,
-                                                      bool*    pbNullTileAccessed);
+                                                      bool*    pbNullTileAccessed,
+                                                      HANDLE   hPrivateWorkerData);
 
 typedef void*(SWR_API* PFN_TRANSLATE_GFXPTR_FOR_WRITE)(HANDLE   hPrivateContext,
                                                        gfxptr_t xpAddr,
-                                                       bool*    pbNullTileAccessed);
+                                                       bool*    pbNullTileAccessed,
+                                                       HANDLE   hPrivateWorkerData);
 
 typedef gfxptr_t(SWR_API* PFN_MAKE_GFXPTR)(HANDLE hPrivateContext, void* sysAddr);
 
+typedef HANDLE(SWR_API* PFN_CREATE_MEMORY_CONTEXT)(HANDLE hExternalMemory);
+
+typedef void(SWR_API* PFN_DESTROY_MEMORY_CONTEXT)(HANDLE hExternalMemory, HANDLE hMemoryContext);
+
 //////////////////////////////////////////////////////////////////////////
 /// @brief Callback to allow driver to update their copy of streamout write offset.
 ///        This is call is made for any draw operation that has streamout enabled
@@ -219,10 +225,11 @@ struct SWR_API_THREADING_INFO
                                    // Independent of KNOB_MAX_THREADS_PER_CORE.
 };
 
-struct SWR_WORKER_DATA
-{
-    HANDLE hArContext;  // handle to the archrast context
-};
+//////////////////////////////////////////////////////////////////////////
+/// SWR_CONTEXT
+/// Forward Declaration (see context.h for full definition)
+/////////////////////////////////////////////////////////////////////////
+class SWR_CONTEXT;
 
 //////////////////////////////////////////////////////////////////////////
 /// SWR_WORKER_PRIVATE_STATE
@@ -233,7 +240,7 @@ struct SWR_WORKER_DATA
 /////////////////////////////////////////////////////////////////////////
 struct SWR_WORKER_PRIVATE_STATE
 {
-    typedef void(SWR_API* PFN_WORKER_DATA)(HANDLE hWorkerPrivateData, uint32_t iWorkerNum);
+    typedef void(SWR_API* PFN_WORKER_DATA)(SWR_CONTEXT* pContext, HANDLE hWorkerPrivateData, uint32_t iWorkerNum);
 
     size_t          perWorkerPrivateStateSize; ///< Amount of data to allocate per-worker
     PFN_WORKER_DATA pfnInitWorkerData;         ///< Init function for worker data.  If null
@@ -260,6 +267,8 @@ struct SWR_CREATECONTEXT_INFO
     PFN_TRANSLATE_GFXPTR_FOR_READ  pfnTranslateGfxptrForRead;
     PFN_TRANSLATE_GFXPTR_FOR_WRITE pfnTranslateGfxptrForWrite;
     PFN_MAKE_GFXPTR                pfnMakeGfxPtr;
+    PFN_CREATE_MEMORY_CONTEXT      pfnCreateMemoryContext;
+    PFN_DESTROY_MEMORY_CONTEXT     pfnDestroyMemoryContext;
     PFN_UPDATE_SO_WRITE_OFFSET     pfnUpdateSoWriteOffset;
     PFN_UPDATE_STATS               pfnUpdateStats;
     PFN_UPDATE_STATS_FE            pfnUpdateStatsFE;
@@ -275,6 +284,9 @@ struct SWR_CREATECONTEXT_INFO
     // ArchRast event manager.
     HANDLE hArEventManager;
 
+    // handle to external memory for worker datas to create memory contexts
+    HANDLE hExternalMemory;
+
     // Input (optional): Threading info that overrides any set KNOB values.
     SWR_THREADING_INFO* pThreadInfo;
 
@@ -588,7 +600,6 @@ SWR_FUNC(void,
          uint32_t threadGroupCountY,
          uint32_t threadGroupCountZ);
 
-
 /// @note this enum needs to be kept in sync with HOTTILE_STATE!
 enum SWR_TILE_STATE
 {
index 9e74e2cee8e2e826becd2764d4df31671e63fcc3..1bd2e743781096460ab92e7c06788a97748b323f 100644 (file)
@@ -1141,7 +1141,6 @@ void BackendPixelRate(DRAW_CONTEXT*        pDC,
             // execute pixel shader
             RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelShader, pDC->drawId);
             state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext);
-            UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
             RDTSC_END(pDC->pContext->pBucketMgr, BEPixelShader, 0);
 
             // update stats
index 8891cc881a36d579d5450b0481377d72f756985e..13cb7c8b856b2e20502846a3d525e254637d25c2 100644 (file)
@@ -535,6 +535,8 @@ struct SWR_CONTEXT
     PFN_TRANSLATE_GFXPTR_FOR_READ  pfnTranslateGfxptrForRead;
     PFN_TRANSLATE_GFXPTR_FOR_WRITE pfnTranslateGfxptrForWrite;
     PFN_MAKE_GFXPTR                pfnMakeGfxPtr;
+    PFN_CREATE_MEMORY_CONTEXT      pfnCreateMemoryContext;
+    PFN_DESTROY_MEMORY_CONTEXT     pfnDestroyMemoryContext;
     PFN_UPDATE_SO_WRITE_OFFSET     pfnUpdateSoWriteOffset;
     PFN_UPDATE_STATS               pfnUpdateStats;
     PFN_UPDATE_STATS_FE            pfnUpdateStatsFE;
@@ -558,6 +560,9 @@ struct SWR_CONTEXT
     // ArchRast thread contexts.
     HANDLE* pArContext;
 
+    // handle to external memory for worker datas to create memory contexts
+    HANDLE hExternalMemory;
+
     BucketManager *pBucketMgr;
 };
 
index a27b33d2051184d79676ea0473484ae9d51ca9f8..1aa98f49fd7b4a8670d31918e370772157d2c5ba 100644 (file)
@@ -578,7 +578,7 @@ static void StreamOut(
         {
             bool  nullTileAccessed = false;
             void* pWriteOffset     = pDC->pContext->pfnTranslateGfxptrForWrite(
-                GetPrivateState(pDC), soContext.pBuffer[i]->pWriteOffset, &nullTileAccessed);
+                GetPrivateState(pDC), soContext.pBuffer[i]->pWriteOffset, &nullTileAccessed, pWorkerData);
             *((uint32_t*)pWriteOffset) = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
         }
 
index 92fbf8840e13a3a23dfc45a956546318cb392c1b..8dab50dab01d980c0a687ad8ac539d93262c84b4 100644 (file)
@@ -84,6 +84,9 @@
 #define KNOB_GUARDBAND_WIDTH 32768.0f
 #define KNOB_GUARDBAND_HEIGHT 32768.0f
 
+// Scratch space requirements per worker. Currently only used for TGSM sizing for some stages
+#define KNOB_WORKER_SCRATCH_SPACE_SIZE (32 * 1024)
+
 ///////////////////////////////
 // Macro tile configuration
 ///////////////////////////////
index 44c486c80bf68b46d956bc73b4796fd771895c03..4f1d8ccff229bd891dbc6d5ffd5556b357d81671 100644 (file)
@@ -271,7 +271,7 @@ void RasterizeSimplePoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTi
                                                {48, 49, 52, 53, 56, 57, 60, 61},
                                                {50, 51, 54, 55, 58, 59, 62, 63}};
 
-    OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
+    OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc = {};
 
     // pull point information from triangle buffer
     // @todo use structs for readability
@@ -287,8 +287,12 @@ void RasterizeSimplePoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTi
     // mask indices by the maximum valid index for x/y of coveragemap.
     uint32_t tX = workDesc.triFlags.coverageMask & 0x7;
     uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7;
-    // todo: multisample points?
-    triDesc.coverageMask[0] = 1ULL << coverageMap[tY][tX];
+    for (uint32_t i = 0; i < _countof(triDesc.coverageMask); ++i)
+    {
+        triDesc.coverageMask[i] = 1ULL << coverageMap[tY][tX];
+    }
+    triDesc.anyCoveredSamples = triDesc.coverageMask[0];
+    triDesc.innerCoverageMask = triDesc.coverageMask[0];
 
     // no persp divide needed for points
     triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs;
index 5202e6146a14d5492fd6d7688a3c4dfaef834adf..25d4fed957826e8d6d005fa4062a6de153e225ea 100644 (file)
@@ -213,6 +213,11 @@ struct SIMDVERTEX_T
     typename SIMD_T::Vec4 attrib[SWR_VTX_NUM_SLOTS];
 };
 
+struct SWR_WORKER_DATA
+{
+    HANDLE hArContext;  // handle to the archrast context
+};
+
 //////////////////////////////////////////////////////////////////////////
 /// SWR_SHADER_STATS
 /// @brief Structure passed to shader for stats collection.
index a0ddd96c61f07f6214c8b8f249eed5d138cb792a..987469340d2b7a89b3c0462af6726fb74f038c03 100644 (file)
@@ -1216,7 +1216,7 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
             pPool->pThreadData[i].pWorkerPrivateData = pWorkerData;
             if (pContext->workerPrivateState.pfnInitWorkerData)
             {
-                pContext->workerPrivateState.pfnInitWorkerData(pWorkerData, i);
+                pContext->workerPrivateState.pfnInitWorkerData(pContext, pWorkerData, i);
             }
             pWorkerData = PtrAdd(pWorkerData, perWorkerSize);
         }
@@ -1396,7 +1396,7 @@ void DestroyThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
         if (pContext->workerPrivateState.pfnFinishWorkerData)
         {
             pContext->workerPrivateState.pfnFinishWorkerData(
-                pPool->pThreadData[t].pWorkerPrivateData, t);
+                pContext, pPool->pThreadData[t].pWorkerPrivateData, t);
         }
     }
 
index 0f78bd661a51951703e8823259a1115389369833..74edd4febbc5ac51654f473d6b645e5777c29fec 100644 (file)
@@ -649,7 +649,8 @@ JitCache::JitCache()
 
 int ExecUnhookedProcess(const std::string& CmdLine, std::string* pStdOut, std::string* pStdErr)
 {
-    return ExecCmd(CmdLine, "", pStdOut, pStdErr);
+
+    return ExecCmd(CmdLine, nullptr, pStdOut, pStdErr);
 }
 
 /// Calculate actual directory where module will be cached.
index 30481b432084b82a35f4688e1cd4a242bc1310d8..e7ba0040d9da11bfd1fc4c584f5f95cfeefd34d0 100644 (file)
@@ -110,6 +110,7 @@ namespace SwrJit
         mSimdVectorTy    = ArrayType::get(mSimdFP32Ty, 4);
         mSimdVectorIntTy = ArrayType::get(mSimdInt32Ty, 4);
         mSimdVectorTRTy  = ArrayType::get(mSimdFP32Ty, 5);
+        mSimdVectorTRIntTy  = ArrayType::get(mSimdInt32Ty, 5);
     }
 
     /// @brief Mark this alloca as temporary to avoid hoisting later on
index 6e1d94b9e68594155020739771501c6bec2666ed..9f2c199464da5ae07441aab997e9988c52b41d68 100644 (file)
@@ -108,6 +108,7 @@ namespace SwrJit
         Type* mSimdVectorTy;
         Type* mSimdVectorTRTy;
         Type* mSimdVectorIntTy;
+        Type* mSimdVectorTRIntTy;
 
         // Built in types: simd16
 
index 3987a5f3476d1385b6443a44d24e3e47b9788791..616c73b254aee399efa5e0bff10a04910e1151ac 100644 (file)
@@ -50,6 +50,17 @@ Constant* C(const std::initializer_list<Ty>& constList)
     return ConstantVector::get(vConsts);
 }
 
+template <typename Ty>
+Constant* C(const std::vector<Ty>& constList)
+{
+    std::vector<Constant*> vConsts;
+    for (auto i : constList)
+    {
+        vConsts.push_back(C((Ty)i));
+    }
+    return ConstantVector::get(vConsts);
+}
+
 template <typename Ty>
 Constant* CA(LLVMContext& ctx, ArrayRef<Ty> constList)
 {
index fe5b48e584bd86f1afe2a3d3d24fc9b399eb7f40..72704e94e4ccc47a87e98f5120b89056fac715ae 100644 (file)
@@ -1103,6 +1103,63 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
     }
 }
 
+
+typedef void* (*PFN_TRANSLATEGFXADDRESS_FUNC)(void* pdc, gfxptr_t va, bool* out_pbNullTileAccessed, void* pWorkerData);
+
+template <typename T>
+void GetSimdValidIndicesGfx(gfxptr_t                     indices,
+                            gfxptr_t                     lastIndex,
+                            uint32_t                     vWidth,
+                            PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate,
+                            void*                        pdc,
+                            uint32_t*                    outIndices,
+                            void*                        pWorkerData)
+{
+    SWR_ASSERT(outIndices != nullptr);
+
+    gfxptr_t indexPtr = indices;
+    for (int64_t lane = 0; lane < vWidth; lane++)
+    {
+        uint32_t index = 0;
+
+        if (indexPtr < lastIndex)
+        {
+            // translate indexPtr and load from it
+            T* addr = (T*)pfnTranslate(pdc, indexPtr, nullptr, pWorkerData);
+            SWR_ASSERT(addr != nullptr);
+            index = *addr;
+        }
+
+        // index to 32 bits and insert into the correct simd lane
+        outIndices[lane] = index;
+
+        indexPtr += sizeof(T);
+    }
+}
+
+void GetSimdValid8bitIndicesGfx(gfxptr_t                     indices,
+                                gfxptr_t                     lastIndex,
+                                uint32_t                     vWidth,
+                                PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate,
+                                void*                        pdc,
+                                uint32_t*                    outIndices,
+                                void*                        pWorkerData)
+{
+    GetSimdValidIndicesGfx<uint8_t>(indices, lastIndex, vWidth, pfnTranslate, pdc, outIndices, pWorkerData);
+}
+
+void GetSimdValid16bitIndicesGfx(gfxptr_t                     indices,
+                                 gfxptr_t                     lastIndex,
+                                 uint32_t                     vWidth,
+                                 PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate,
+                                 void*                        pdc,
+                                 uint32_t*                    outIndices,
+                                 void*                        pWorkerData)
+{
+    GetSimdValidIndicesGfx<uint16_t>(indices, lastIndex, vWidth, pfnTranslate, pdc, outIndices, pWorkerData);
+}
+
+
 template <typename T>
 Value* FetchJit::GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex)
 {