SWR_CONTEXT *pContext,
uint32_t workerId,
uint64_t &curDrawBE,
- TileSet& lockedTiles)
+ TileSet& lockedTiles,
+ uint32_t numaNode,
+ uint32_t numaMask)
{
// Find the first incomplete draw that has pending work. If no such draw is found then
// return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
for (uint32_t tileID : macroTiles)
{
+ // Only work on tiles assigned to this NUMA node; (x ^ y) & numaMask spreads macrotiles across the nodes.
+ uint32_t x, y;
+ pDC->pTileMgr->getTileIndices(tileID, x, y);
+ if (((x ^ y) & numaMask) != numaNode)
+ {
+ continue;
+ }
+
MacroTileQueue &tile = pDC->pTileMgr->getMacroTileQueue(tileID);
+ if (!tile.getNumQueued())
+ {
+ continue;
+ }
+
// can only work on this draw if it's not in use by other threads
- if (lockedTiles.find(tileID) == lockedTiles.end())
+ if (lockedTiles.find(tileID) != lockedTiles.end())
{
- if (tile.getNumQueued())
+ continue;
+ }
+
+ if (tile.tryLock())
+ {
+ BE_WORK *pWork;
+
+ RDTSC_START(WorkerFoundWork);
+
+ uint32_t numWorkItems = tile.getNumQueued();
+ SWR_ASSERT(numWorkItems);
+
+ pWork = tile.peek();
+ SWR_ASSERT(pWork);
+ if (pWork->type == DRAW)
{
- if (tile.tryLock())
- {
- BE_WORK *pWork;
-
- RDTSC_START(WorkerFoundWork);
-
- uint32_t numWorkItems = tile.getNumQueued();
-
- if (numWorkItems != 0)
- {
- pWork = tile.peek();
- SWR_ASSERT(pWork);
- if (pWork->type == DRAW)
- {
- pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, tileID);
- }
- }
-
- while ((pWork = tile.peek()) != nullptr)
- {
- pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
- tile.dequeue();
- }
- RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId);
-
- _ReadWriteBarrier();
-
- pDC->pTileMgr->markTileComplete(tileID);
-
- // Optimization: If the draw is complete and we're the last one to have worked on it then
- // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete.
- if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete())
- {
- // We can increment the current BE and safely move to next draw since we know this draw is complete.
- curDrawBE++;
- CompleteDrawContext(pContext, pDC);
-
- lastRetiredDraw++;
-
- lockedTiles.clear();
- break;
- }
- }
- else
- {
- // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
- lockedTiles.insert(tileID);
- }
+ pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, tileID);
+ }
+
+ while ((pWork = tile.peek()) != nullptr)
+ {
+ pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
+ tile.dequeue();
+ }
+ RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId);
+
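+ // Compiler barrier: keep the tile work above from being reordered past the completion mark below.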
+ _ReadWriteBarrier();
+
+ pDC->pTileMgr->markTileComplete(tileID);
+
+ // Optimization: if this draw is complete and we were the last to work on it, we can clear the locked-tile set,
+ // since every draw before the next one is guaranteed to be complete.
+ if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete())
+ {
+ // We can increment the current BE and safely move to next draw since we know this draw is complete.
+ curDrawBE++;
+ CompleteDrawContext(pContext, pDC);
+
+ lastRetiredDraw++;
+
+ lockedTiles.clear();
+ break;
}
}
+ else
+ {
+ // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
+ lockedTiles.insert(tileID);
+ }
}
}
}
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, int numaNode)
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode)
{
// Try to grab the next DC from the ring
uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
RDTSC_INIT(threadId);
- int numaNode = (int)pThreadData->numaId;
+ uint32_t numaNode = pThreadData->numaId;
+ uint32_t numaMask = pContext->threadPool.numaMask;
// flush denormals to 0
_mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
}
RDTSC_START(WorkerWorkOnFifoBE);
- WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles);
+ WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);
WorkOnCompute(pContext, workerId, curDrawBE);
pPool->inThreadShutdown = false;
pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA));
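+ // Default: no NUMA-based partitioning of macrotiles.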
+ pPool->numaMask = 0;
if (KNOB_MAX_WORKER_THREADS)
{
}
else
{
+ pPool->numaMask = numNodes - 1; // Valid only when numNodes is a power of two (1, 2, 4, ...)
+
uint32_t workerId = 0;
for (uint32_t n = 0; n < numNodes; ++n)
{
{
THREAD_PTR threads[KNOB_MAX_NUM_THREADS];
uint32_t numThreads;
+ uint32_t numaMask;
volatile bool inThreadShutdown;
THREAD_DATA *pThreadData;
};
void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
// Expose FE and BE worker functions to the API thread if single threaded
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, int numaNode);
-void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles);
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode);
+void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask);
void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE);
int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC);
\ No newline at end of file
if (create)
{
uint32_t size = numSamples * mHotTileSize[attachment];
- hotTile.pBuffer = (uint8_t*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
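+ // Allocate the hot tile on the same NUMA node that WorkOnFifoBE assigns this (x, y) macrotile to.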
+ uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
+ hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, KNOB_SIMD_WIDTH * 4, numaNode);
hotTile.state = HOTTILE_INVALID;
hotTile.numSamples = numSamples;
hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
SWR_ASSERT((hotTile.state == HOTTILE_INVALID) ||
(hotTile.state == HOTTILE_RESOLVED) ||
(hotTile.state == HOTTILE_CLEAR));
- _aligned_free(hotTile.pBuffer);
+ FreeHotTileMem(hotTile.pBuffer);
uint32_t size = numSamples * mHotTileSize[attachment];
- hotTile.pBuffer = (uint8_t*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
+ uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
+ hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, KNOB_SIMD_WIDTH * 4, numaNode);
hotTile.state = HOTTILE_INVALID;
hotTile.numSamples = numSamples;
}
{
for (int a = 0; a < SWR_NUM_ATTACHMENTS; ++a)
{
- if (mHotTiles[x][y].Attachment[a].pBuffer != NULL)
- {
- _aligned_free(mHotTiles[x][y].Attachment[a].pBuffer);
- mHotTiles[x][y].Attachment[a].pBuffer = NULL;
- }
+ FreeHotTileMem(mHotTiles[x][y].Attachment[a].pBuffer);
}
}
}
private:
HotTileSet mHotTiles[KNOB_NUM_HOT_TILES_X][KNOB_NUM_HOT_TILES_Y];
uint32_t mHotTileSize[SWR_NUM_ATTACHMENTS];
+
+ void* AllocHotTileMem(size_t size, uint32_t align, uint32_t numaNode)
+ {
+ void* p = nullptr;
+#if defined(_WIN32)
+ HANDLE hProcess = GetCurrentProcess();
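+ // numaNode is a preference, not a guarantee; the block is page-aligned, which satisfies 'align'.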
+ p = VirtualAllocExNuma(hProcess, nullptr, size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE, numaNode);
+#else
+ p = _aligned_malloc(size, align);
+#endif
+
+ return p;
+ }
+
+ void FreeHotTileMem(void* pBuffer)
+ {
+ if (pBuffer)
+ {
+#if defined(_WIN32)
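+ // With MEM_RELEASE the size must be 0; this frees the entire allocation made by VirtualAllocExNuma.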
+ VirtualFree(pBuffer, 0, MEM_RELEASE);
+#else
+ _aligned_free(pBuffer);
+#endif
+ }
+ }
};