src/gallium/drivers/swr/rasterizer/core/threads.cpp

   1 /****************************************************************************
   2 * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 ****************************************************************************/
  23
  24 #include <stdio.h>
  25 #include <thread>
  26 #include <algorithm>
  27 #include <float.h>
  28 #include <vector>
  29 #include <utility>
  30 #include <fstream>
  31 #include <string>
  32
  33 #if defined(__linux__) || defined(__gnu_linux__) || defined(__APPLE__)
  34 #include <pthread.h>
  35 #include <sched.h>
  36 #include <unistd.h>
  37 #endif
  38
  39 #ifdef __APPLE__
  40 #include <sys/types.h>
  41 #include <sys/sysctl.h>
  42 #endif
  43
  44 #include "common/os.h"
  45 #include "core/api.h"
  46 #include "context.h"
  47 #include "frontend.h"
  48 #include "backend.h"
  49 #include "rasterizer.h"
  50 #include "rdtsc_core.h"
  51 #include "tilemgr.h"
  52
  53
  54
  55
  56 // ThreadId
  57 struct Core
  58 {
  59     uint32_t                procGroup = 0;
  60     std::vector<uint32_t>   threadIds;
  61 };
  62
  63 struct NumaNode
  64 {
  65     uint32_t          numaId;
  66     std::vector<Core> cores;
  67 };
  68
  69 typedef std::vector<NumaNode> CPUNumaNodes;
  70
  71 void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThreadsPerProcGroup)
  72 {
  73     out_nodes.clear();
  74     out_numThreadsPerProcGroup = 0;
  75
  76 #if defined(_WIN32)
  77
  78     std::vector<KAFFINITY> threadMaskPerProcGroup;
  79
  80     static std::mutex m;
  81     std::lock_guard<std::mutex> l(m);
  82
  83     DWORD bufSize = 0;
  84
  85     BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &bufSize);
  86     SWR_ASSERT(ret == FALSE && GetLastError() == ERROR_INSUFFICIENT_BUFFER);
  87
  88     PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBufferMem = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(bufSize);
  89     SWR_ASSERT(pBufferMem);
  90
  91     ret = GetLogicalProcessorInformationEx(RelationProcessorCore, pBufferMem, &bufSize);
  92     SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information");
  93
  94     uint32_t count = bufSize / pBufferMem->Size;
  95     PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = pBufferMem;
  96
  97     for (uint32_t i = 0; i < count; ++i)
  98     {
  99         SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore);
 100         for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g)
 101         {
 102             auto& gmask = pBuffer->Processor.GroupMask[g];
 103             uint32_t threadId = 0;
 104             uint32_t procGroup = gmask.Group;
 105
 106             Core* pCore = nullptr;
 107
 108             uint32_t numThreads = (uint32_t)_mm_popcount_sizeT(gmask.Mask);
 109
 110             while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask))
 111             {
 112                 // clear mask
 113                 KAFFINITY threadMask = KAFFINITY(1) << threadId;
 114                 gmask.Mask &= ~threadMask;
 115
 116                 if (procGroup >= threadMaskPerProcGroup.size())
 117                 {
 118                     threadMaskPerProcGroup.resize(procGroup + 1);
 119                 }
 120
 121                 if (threadMaskPerProcGroup[procGroup] & threadMask)
 122                 {
 123                     // Already seen this mask.  This means that we are in 32-bit mode and
 124                     // have seen more than 32 HW threads for this procGroup
 125                     // Don't use it
 126 #if defined(_WIN64)
 127                     SWR_INVALID("Shouldn't get here in 64-bit mode");
 128 #endif
 129                     continue;
 130                 }
 131
 132                 threadMaskPerProcGroup[procGroup] |= (KAFFINITY(1) << threadId);
 133
 134                 // Find Numa Node
 135                 uint32_t numaId = 0;
 136                 PROCESSOR_NUMBER procNum = {};
 137                 procNum.Group = WORD(procGroup);
 138                 procNum.Number = UCHAR(threadId);
 139
 140                 ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId);
 141                 SWR_ASSERT(ret);
 142
 143                 // Store data
 144                 if (out_nodes.size() <= numaId)
 145                 {
 146                     out_nodes.resize(numaId + 1);
 147                 }
 148                 auto& numaNode = out_nodes[numaId];
 149                 numaNode.numaId = numaId;
 150
 151                 uint32_t coreId = 0;
 152
 153                 if (nullptr == pCore)
 154                 {
 155                     numaNode.cores.push_back(Core());
 156                     pCore = &numaNode.cores.back();
 157                     pCore->procGroup = procGroup;
 158                 }
 159                 pCore->threadIds.push_back(threadId);
 160                 if (procGroup == 0)
 161                 {
 162                     out_numThreadsPerProcGroup++;
 163                 }
 164             }
 165         }
 166         pBuffer = PtrAdd(pBuffer, pBuffer->Size);
 167     }
 168
 169     free(pBufferMem);
 170
 171
 172 #elif defined(__linux__) || defined (__gnu_linux__)
 173
 174     // Parse /proc/cpuinfo to get full topology
 175     std::ifstream input("/proc/cpuinfo");
 176     std::string line;
 177     char* c;
 178     uint32_t procId = uint32_t(-1);
 179     uint32_t coreId = uint32_t(-1);
 180     uint32_t physId = uint32_t(-1);
 181
 182     while (std::getline(input, line))
 183     {
 184         if (line.find("processor") != std::string::npos)
 185         {
 186             auto data_start = line.find(": ") + 2;
 187             procId = std::strtoul(&line.c_str()[data_start], &c, 10);
 188             continue;
 189         }
 190         if (line.find("core id") != std::string::npos)
 191         {
 192             auto data_start = line.find(": ") + 2;
 193             coreId = std::strtoul(&line.c_str()[data_start], &c, 10);
 194             continue;
 195         }
 196         if (line.find("physical id") != std::string::npos)
 197         {
 198             auto data_start = line.find(": ") + 2;
 199             physId = std::strtoul(&line.c_str()[data_start], &c, 10);
 200             continue;
 201         }
 202         if (line.length() == 0)
 203         {
 204             if (physId + 1 > out_nodes.size())
 205                 out_nodes.resize(physId + 1);
 206             auto& numaNode = out_nodes[physId];
 207             numaNode.numaId = physId;
 208
 209             if (coreId + 1 > numaNode.cores.size())
 210                 numaNode.cores.resize(coreId + 1);
 211             auto& core = numaNode.cores[coreId];
 212             core.procGroup = coreId;
 213             core.threadIds.push_back(procId);
 214         }
 215     }
 216
 217     out_numThreadsPerProcGroup = 0;
 218     for (auto &node : out_nodes)
 219     {
 220         for (auto &core : node.cores)
 221         {
 222             out_numThreadsPerProcGroup += core.threadIds.size();
 223         }
 224     }
 225
 226 #elif defined(__APPLE__)
 227
 228     auto numProcessors = 0;
 229     auto numCores = 0;
 230     auto numPhysicalIds = 0;
 231
 232     int value;
 233     size_t size = sizeof(value);
 234
 235     int result = sysctlbyname("hw.packages", &value, &size, NULL, 0);
 236     SWR_ASSERT(result == 0);
 237     numPhysicalIds = value;
 238
 239     result = sysctlbyname("hw.logicalcpu", &value, &size, NULL, 0);
 240     SWR_ASSERT(result == 0);
 241     numProcessors = value;
 242
 243     result = sysctlbyname("hw.physicalcpu", &value, &size, NULL, 0);
 244     SWR_ASSERT(result == 0);
 245     numCores = value;
 246
 247     out_nodes.resize(numPhysicalIds);
 248
 249     for (auto physId = 0; physId < numPhysicalIds; ++physId)
 250     {
 251         auto &numaNode = out_nodes[physId];
 252         auto procId = 0;
 253
 254         numaNode.cores.resize(numCores);
 255
 256         while (procId < numProcessors)
 257         {
 258             for (auto coreId = 0; coreId < numaNode.cores.size(); ++coreId, ++procId)
 259             {
 260                 auto &core = numaNode.cores[coreId];
 261
 262                 core.procGroup = coreId;
 263                 core.threadIds.push_back(procId);
 264             }
 265         }
 266     }
 267
 268     out_numThreadsPerProcGroup = 0;
 269
 270     for (auto &node : out_nodes)
 271     {
 272         for (auto &core : node.cores)
 273         {
 274             out_numThreadsPerProcGroup += core.threadIds.size();
 275         }
 276     }
 277
 278 #else
 279
 280 #error Unsupported platform
 281
 282 #endif
 283
 284     // Prune empty cores and numa nodes
 285     for (auto node_it = out_nodes.begin(); node_it != out_nodes.end(); )
 286     {
 287         // Erase empty cores (first)
 288         for (auto core_it = node_it->cores.begin(); core_it != node_it->cores.end(); )
 289         {
 290             if (core_it->threadIds.size() == 0)
 291             {
 292                 core_it = node_it->cores.erase(core_it);
 293             }
 294             else
 295             {
 296                 ++core_it;
 297             }
 298         }
 299
 300         // Erase empty numa nodes (second)
 301         if (node_it->cores.size() == 0)
 302         {
 303             node_it = out_nodes.erase(node_it);
 304         }
 305         else
 306         {
 307             ++node_it;
 308         }
 309     }
 310 }
 311
 312 void bindThread(SWR_CONTEXT* pContext, uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup=false)
 313 {
 314     // Only bind threads when MAX_WORKER_THREADS isn't set.
 315     if (pContext->threadInfo.SINGLE_THREADED || (pContext->threadInfo.MAX_WORKER_THREADS && bindProcGroup == false))
 316     {
 317         return;
 318     }
 319
 320 #if defined(_WIN32)
 321
 322     GROUP_AFFINITY affinity = {};
 323     affinity.Group = procGroupId;
 324
 325 #if !defined(_WIN64)
 326     if (threadId >= 32)
 327     {
 328         // Hopefully we don't get here.  Logic in CreateThreadPool should prevent this.
 329         SWR_INVALID("Shouldn't get here");
 330
 331         // In a 32-bit process on Windows it is impossible to bind
 332         // to logical processors 32-63 within a processor group.
 333         // In this case set the mask to 0 and let the system assign
 334         // the processor.  Hopefully it will make smart choices.
 335         affinity.Mask = 0;
 336     }
 337     else
 338 #endif
 339     {
 340         // If MAX_WORKER_THREADS is set, only bind to the proc group,
 341         // Not the individual HW thread.
 342         if (!bindProcGroup  && !pContext->threadInfo.MAX_WORKER_THREADS)
 343         {
 344             affinity.Mask = KAFFINITY(1) << threadId;
 345         }
 346         else
 347         {
 348             affinity.Mask = KAFFINITY(0);
 349         }
 350     }
 351
 352     if (!SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr))
 353     {
 354         SWR_INVALID("Failed to set Thread Affinity");
 355     }
 356
 357 #elif defined(__linux__) || defined(__gnu_linux__)
 358
 359     cpu_set_t cpuset;
 360     pthread_t thread = pthread_self();
 361     CPU_ZERO(&cpuset);
 362     CPU_SET(threadId, &cpuset);
 363
 364     int err = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
 365     if (err != 0)
 366     {
 367         fprintf(stderr, "pthread_setaffinity_np failure for tid %u: %s\n", threadId, strerror(err));
 368     }
 369
 370 #endif
 371 }
 372
 373 INLINE
 374 uint32_t GetEnqueuedDraw(SWR_CONTEXT *pContext)
 375 {
 376     return pContext->dcRing.GetHead();
 377 }
 378
 379 INLINE
 380 DRAW_CONTEXT *GetDC(SWR_CONTEXT *pContext, uint32_t drawId)
 381 {
 382     return &pContext->dcRing[(drawId-1) % pContext->MAX_DRAWS_IN_FLIGHT];
 383 }
 384
 385 INLINE
 386 bool IDComparesLess(uint32_t a, uint32_t b)
 387 {
 388     // Use signed delta to ensure that wrap-around to 0 is correctly handled.
 389     int32_t delta = int32_t(a - b);
 390     return (delta < 0);
 391 }
 392
 393 // returns true if dependency not met
 394 INLINE
 395 bool CheckDependency(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t lastRetiredDraw)
 396 {
 397     return pDC->dependent && IDComparesLess(lastRetiredDraw, pDC->drawId - 1);
 398 }
 399
 400 bool CheckDependencyFE(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t lastRetiredDraw)
 401 {
 402     return pDC->dependentFE && IDComparesLess(lastRetiredDraw, pDC->drawId - 1);
 403 }
 404
 405 //////////////////////////////////////////////////////////////////////////
 406 /// @brief Update client stats.
 407 INLINE void UpdateClientStats(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
 408 {
 409     if ((pContext->pfnUpdateStats == nullptr) || (GetApiState(pDC).enableStatsBE == false))
 410     {
 411         return;
 412     }
 413
 414     DRAW_DYNAMIC_STATE& dynState = pDC->dynState;
 415     OSALIGNLINE(SWR_STATS) stats{ 0 };
 416
 417     // Sum up stats across all workers before sending to client.
 418     for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
 419     {
 420         stats.DepthPassCount += dynState.pStats[i].DepthPassCount;
 421
 422         stats.PsInvocations  += dynState.pStats[i].PsInvocations;
 423         stats.CsInvocations  += dynState.pStats[i].CsInvocations;
 424     }
 425
 426
 427     pContext->pfnUpdateStats(GetPrivateState(pDC), &stats);
 428 }
 429
 430 INLINE void ExecuteCallbacks(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
 431 {
 432     UpdateClientStats(pContext, workerId, pDC);
 433
 434     if (pDC->retireCallback.pfnCallbackFunc)
 435     {
 436         pDC->retireCallback.pfnCallbackFunc(pDC->retireCallback.userData,
 437             pDC->retireCallback.userData2,
 438             pDC->retireCallback.userData3);
 439     }
 440 }
 441
 442 // inlined-only version
 443 INLINE int32_t CompleteDrawContextInl(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
 444 {
 445     int32_t result = static_cast<int32_t>(InterlockedDecrement(&pDC->threadsDone));
 446     SWR_ASSERT(result >= 0);
 447
 448     AR_FLUSH(pDC->drawId);
 449
 450     if (result == 0)
 451     {
 452         ExecuteCallbacks(pContext, workerId, pDC);
 453
 454         // Cleanup memory allocations
 455         pDC->pArena->Reset(true);
 456         if (!pDC->isCompute)
 457         {
 458             pDC->pTileMgr->initialize();
 459         }
 460         if (pDC->cleanupState)
 461         {
 462             pDC->pState->pArena->Reset(true);
 463         }
 464
 465         _ReadWriteBarrier();
 466
 467         pContext->dcRing.Dequeue();  // Remove from tail
 468     }
 469
 470     return result;
 471 }
 472
 473 // available to other translation modules
 474 int32_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
 475 {
 476     return CompleteDrawContextInl(pContext, 0, pDC);
 477 }
 478
 479 INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawBE, uint32_t& drawEnqueued)
 480 {
 481     // increment our current draw id to the first incomplete draw
 482     drawEnqueued = GetEnqueuedDraw(pContext);
 483     while (IDComparesLess(curDrawBE, drawEnqueued))
 484     {
 485         DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT];
 486
 487         // If its not compute and FE is not done then break out of loop.
 488         if (!pDC->doneFE && !pDC->isCompute) break;
 489
 490         bool isWorkComplete = pDC->isCompute ?
 491             pDC->pDispatch->isWorkComplete() :
 492             pDC->pTileMgr->isWorkComplete();
 493
 494         if (isWorkComplete)
 495         {
 496             curDrawBE++;
 497             CompleteDrawContextInl(pContext, workerId, pDC);
 498         }
 499         else
 500         {
 501             break;
 502         }
 503     }
 504
 505     // If there are no more incomplete draws then return false.
 506     return IDComparesLess(curDrawBE, drawEnqueued);
 507 }
 508
 509 //////////////////////////////////////////////////////////////////////////
 510 /// @brief If there is any BE work then go work on it.
 511 /// @param pContext - pointer to SWR context.
 512 /// @param workerId - The unique worker ID that is assigned to this thread.
 513 /// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread
 514 ///                    has its own curDrawBE counter and this ensures that each worker processes all the
 515 ///                    draws in order.
 516 /// @param lockedTiles - This is the set of tiles locked by other threads. Each thread maintains its
 517 ///                      own set and each time it fails to lock a macrotile, because its already locked,
 518 ///                      then it will add that tile to the lockedTiles set. As a worker begins to work
 519 ///                      on future draws the lockedTiles ensure that it doesn't work on tiles that may
 520 ///                      still have work pending in a previous draw. Additionally, the lockedTiles is
 521 ///                      hueristic that can steer a worker back to the same macrotile that it had been
 522 ///                      working on in a previous draw.
 523 /// @returns        true if worker thread should shutdown
 524 bool WorkOnFifoBE(
 525     SWR_CONTEXT *pContext,
 526     uint32_t workerId,
 527     uint32_t &curDrawBE,
 528     TileSet& lockedTiles,
 529     uint32_t numaNode,
 530     uint32_t numaMask)
 531 {
 532     bool bShutdown = false;
 533
 534     // Find the first incomplete draw that has pending work. If no such draw is found then
 535     // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
 536     uint32_t drawEnqueued = 0;
 537     if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
 538     {
 539         return false;
 540     }
 541
 542     uint32_t lastRetiredDraw = pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1;
 543
 544     // Reset our history for locked tiles. We'll have to re-learn which tiles are locked.
 545     lockedTiles.clear();
 546
 547     // Try to work on each draw in order of the available draws in flight.
 548     //   1. If we're on curDrawBE, we can work on any macrotile that is available.
 549     //   2. If we're trying to work on draws after curDrawBE, we are restricted to
 550     //      working on those macrotiles that are known to be complete in the prior draw to
 551     //      maintain order. The locked tiles provides the history to ensures this.
 552     for (uint32_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
 553     {
 554         DRAW_CONTEXT *pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT];
 555
 556         if (pDC->isCompute) return false; // We don't look at compute work.
 557
 558         // First wait for FE to be finished with this draw. This keeps threading model simple
 559         // but if there are lots of bubbles between draws then serializing FE and BE may
 560         // need to be revisited.
 561         if (!pDC->doneFE) return false;
 562
 563         // If this draw is dependent on a previous draw then we need to bail.
 564         if (CheckDependency(pContext, pDC, lastRetiredDraw))
 565         {
 566             return false;
 567         }
 568
 569         // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it.
 570         auto &macroTiles = pDC->pTileMgr->getDirtyTiles();
 571
 572         for (auto tile : macroTiles)
 573         {
 574             uint32_t tileID = tile->mId;
 575
 576             // Only work on tiles for this numa node
 577             uint32_t x, y;
 578             pDC->pTileMgr->getTileIndices(tileID, x, y);
 579             if (((x ^ y) & numaMask) != numaNode)
 580             {
 581                 continue;
 582             }
 583
 584             if (!tile->getNumQueued())
 585             {
 586                 continue;
 587             }
 588
 589             // can only work on this draw if it's not in use by other threads
 590             if (lockedTiles.find(tileID) != lockedTiles.end())
 591             {
 592                 continue;
 593             }
 594
 595             if (tile->tryLock())
 596             {
 597                 BE_WORK *pWork;
 598
 599                 RDTSC_BEGIN(WorkerFoundWork, pDC->drawId);
 600
 601                 uint32_t numWorkItems = tile->getNumQueued();
 602                 SWR_ASSERT(numWorkItems);
 603
 604                 pWork = tile->peek();
 605                 SWR_ASSERT(pWork);
 606                 if (pWork->type == DRAW)
 607                 {
 608                     pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, workerId, tileID);
 609                 }
 610                 else if (pWork->type == SHUTDOWN)
 611                 {
 612                     bShutdown = true;
 613                 }
 614
 615                 while ((pWork = tile->peek()) != nullptr)
 616                 {
 617                     pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
 618                     tile->dequeue();
 619                 }
 620                 RDTSC_END(WorkerFoundWork, numWorkItems);
 621
 622                 _ReadWriteBarrier();
 623
 624                 pDC->pTileMgr->markTileComplete(tileID);
 625
 626                 // Optimization: If the draw is complete and we're the last one to have worked on it then
 627                 // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete.
 628                 if ((curDrawBE == i) && (bShutdown || pDC->pTileMgr->isWorkComplete()))
 629                 {
 630                     // We can increment the current BE and safely move to next draw since we know this draw is complete.
 631                     curDrawBE++;
 632                     CompleteDrawContextInl(pContext, workerId, pDC);
 633
 634                     lastRetiredDraw++;
 635
 636                     lockedTiles.clear();
 637                     break;
 638                 }
 639
 640                 if (bShutdown)
 641                 {
 642                     break;
 643                 }
 644             }
 645             else
 646             {
 647                 // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
 648                 lockedTiles.insert(tileID);
 649             }
 650         }
 651     }
 652
 653     return bShutdown;
 654 }
 655
 656 //////////////////////////////////////////////////////////////////////////
 657 /// @brief Called when FE work is complete for this DC.
 658 INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
 659 {
 660     if (pContext->pfnUpdateStatsFE && GetApiState(pDC).enableStatsFE)
 661     {
 662         SWR_STATS_FE& stats = pDC->dynState.statsFE;
 663
 664         AR_EVENT(FrontendStatsEvent(pDC->drawId,
 665             stats.IaVertices, stats.IaPrimitives, stats.VsInvocations, stats.HsInvocations,
 666             stats.DsInvocations, stats.GsInvocations, stats.GsPrimitives, stats.CInvocations, stats.CPrimitives,
 667             stats.SoPrimStorageNeeded[0], stats.SoPrimStorageNeeded[1], stats.SoPrimStorageNeeded[2], stats.SoPrimStorageNeeded[3],
 668             stats.SoNumPrimsWritten[0], stats.SoNumPrimsWritten[1], stats.SoNumPrimsWritten[2], stats.SoNumPrimsWritten[3]
 669         ));
 670                 AR_EVENT(FrontendDrawEndEvent(pDC->drawId));
 671
 672         pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &stats);
 673     }
 674
 675     if (pContext->pfnUpdateSoWriteOffset)
 676     {
 677         for (uint32_t i = 0; i < MAX_SO_BUFFERS; ++i)
 678         {
 679             if ((pDC->dynState.SoWriteOffsetDirty[i]) &&
 680                 (pDC->pState->state.soBuffer[i].soWriteEnable))
 681             {
 682                 pContext->pfnUpdateSoWriteOffset(GetPrivateState(pDC), i, pDC->dynState.SoWriteOffset[i]);
 683             }
 684         }
 685     }
 686
 687     // Ensure all streaming writes are globally visible before marking this FE done
 688     _mm_mfence();
 689     pDC->doneFE = true;
 690
 691     InterlockedDecrement(&pContext->drawsOutstandingFE);
 692 }
 693
 694 void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawFE)
 695 {
 696     // Try to grab the next DC from the ring
 697     uint32_t drawEnqueued = GetEnqueuedDraw(pContext);
 698     while (IDComparesLess(curDrawFE, drawEnqueued))
 699     {
 700         uint32_t dcSlot = curDrawFE % pContext->MAX_DRAWS_IN_FLIGHT;
 701         DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot];
 702         if (pDC->isCompute || pDC->doneFE)
 703         {
 704             CompleteDrawContextInl(pContext, workerId, pDC);
 705             curDrawFE++;
 706         }
 707         else
 708         {
 709             break;
 710         }
 711     }
 712
 713     uint32_t lastRetiredFE = curDrawFE - 1;
 714     uint32_t curDraw = curDrawFE;
 715     while (IDComparesLess(curDraw, drawEnqueued))
 716     {
 717         uint32_t dcSlot = curDraw % pContext->MAX_DRAWS_IN_FLIGHT;
 718         DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot];
 719
 720         if (!pDC->isCompute && !pDC->FeLock)
 721         {
 722             if (CheckDependencyFE(pContext, pDC, lastRetiredFE))
 723             {
 724                 return;
 725             }
 726
 727             uint32_t initial = InterlockedCompareExchange((volatile uint32_t*)&pDC->FeLock, 1, 0);
 728             if (initial == 0)
 729             {
 730                 // successfully grabbed the DC, now run the FE
 731                 pDC->FeWork.pfnWork(pContext, pDC, workerId, &pDC->FeWork.desc);
 732
 733                 CompleteDrawFE(pContext, workerId, pDC);
 734             }
 735         }
 736         curDraw++;
 737     }
 738 }
 739
 740 //////////////////////////////////////////////////////////////////////////
 741 /// @brief If there is any compute work then go work on it.
 742 /// @param pContext - pointer to SWR context.
 743 /// @param workerId - The unique worker ID that is assigned to this thread.
 744 /// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread
 745 ///                    has its own curDrawBE counter and this ensures that each worker processes all the
 746 ///                    draws in order.
 747 void WorkOnCompute(
 748     SWR_CONTEXT *pContext,
 749     uint32_t workerId,
 750     uint32_t& curDrawBE)
 751 {
 752     uint32_t drawEnqueued = 0;
 753     if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
 754     {
 755         return;
 756     }
 757
 758     uint32_t lastRetiredDraw = pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1;
 759
 760     for (uint64_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
 761     {
 762         DRAW_CONTEXT *pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT];
 763         if (pDC->isCompute == false) return;
 764
 765         // check dependencies
 766         if (CheckDependency(pContext, pDC, lastRetiredDraw))
 767         {
 768             return;
 769         }
 770
 771         SWR_ASSERT(pDC->pDispatch != nullptr);
 772         DispatchQueue& queue = *pDC->pDispatch;
 773
 774         // Is there any work remaining?
 775         if (queue.getNumQueued() > 0)
 776         {
 777             void* pSpillFillBuffer = nullptr;
 778             void* pScratchSpace = nullptr;
 779             uint32_t threadGroupId = 0;
 780             while (queue.getWork(threadGroupId))
 781             {
 782                 queue.dispatch(pDC, workerId, threadGroupId, pSpillFillBuffer, pScratchSpace);
 783                 queue.finishedWork();
 784             }
 785
 786             // Ensure all streaming writes are globally visible before moving onto the next draw
 787             _mm_mfence();
 788         }
 789     }
 790 }
 791
 792 void BindApiThread(SWR_CONTEXT *pContext, uint32_t apiThreadId)
 793 {
 794     if (nullptr == pContext)
 795     {
 796         return;
 797     }
 798
 799     if (apiThreadId >= pContext->threadPool.numReservedThreads)
 800     {
 801         if (pContext->threadPool.numReservedThreads)
 802         {
 803             const THREAD_DATA &threadData = pContext->threadPool.pApiThreadData[0];
 804             // Just bind to the process group used for API thread 0
 805             bindThread(pContext, 0, threadData.procGroupId, true);
 806         }
 807         return;
 808     }
 809
 810     const THREAD_DATA &threadData = pContext->threadPool.pApiThreadData[apiThreadId];
 811
 812     bindThread(pContext, threadData.threadId, threadData.procGroupId, threadData.forceBindProcGroup);
 813 }
 814
 815 template<bool IsFEThread, bool IsBEThread>
 816 DWORD workerThreadMain(LPVOID pData)
 817 {
 818     THREAD_DATA *pThreadData = (THREAD_DATA*)pData;
 819     SWR_CONTEXT *pContext = pThreadData->pContext;
 820     uint32_t threadId = pThreadData->threadId;
 821     uint32_t workerId = pThreadData->workerId;
 822
 823     bindThread(pContext, threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup);
 824
 825     {
 826         char threadName[64];
 827         sprintf_s(threadName,
 828 #if defined(_WIN32)
 829                   "SWRWorker_%02d_NUMA%d_Core%02d_T%d",
 830 #else
 831                   // linux pthread name limited to 16 chars (including \0)
 832                   "w%03d-n%d-c%03d-t%d",
 833 #endif
 834             workerId, pThreadData->numaId, pThreadData->coreId, pThreadData->htId);
 835         SetCurrentThreadName(threadName);
 836     }
 837
 838     RDTSC_INIT(threadId);
 839
 840     // Only need offset numa index from base for correct masking
 841     uint32_t numaNode = pThreadData->numaId - pContext->threadInfo.BASE_NUMA_NODE;
 842     uint32_t numaMask = pContext->threadPool.numaMask;
 843
 844     // flush denormals to 0
 845     _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
 846
 847     // Track tiles locked by other threads. If we try to lock a macrotile and find its already
 848     // locked then we'll add it to this list so that we don't try and lock it again.
 849     TileSet lockedTiles;
 850
 851     // each worker has the ability to work on any of the queued draws as long as certain
 852     // conditions are met. the data associated
 853     // with a draw is guaranteed to be active as long as a worker hasn't signaled that he
 854     // has moved on to the next draw when he determines there is no more work to do. The api
 855     // thread will not increment the head of the dc ring until all workers have moved past the
 856     // current head.
 857     // the logic to determine what to work on is:
 858     // 1- try to work on the FE any draw that is queued. For now there are no dependencies
 859     //    on the FE work, so any worker can grab any FE and process in parallel.  Eventually
 860     //    we'll need dependency tracking to force serialization on FEs.  The worker will try
 861     //    to pick an FE by atomically incrementing a counter in the swr context.  he'll keep
 862     //    trying until he reaches the tail.
 863     // 2- BE work must be done in strict order. we accomplish this today by pulling work off
 864     //    the oldest draw (ie the head) of the dcRing. the worker can determine if there is
 865     //    any work left by comparing the total # of binned work items and the total # of completed
 866     //    work items. If they are equal, then there is no more work to do for this draw, and
 867     //    the worker can safely increment its oldestDraw counter and move on to the next draw.
 868     std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);
 869
 870     auto threadHasWork = [&](uint32_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };
 871
 872     uint32_t curDrawBE = 0;
 873     uint32_t curDrawFE = 0;
 874
 875     bool bShutdown = false;
 876
 877     while (true)
 878     {
 879         if (bShutdown && !threadHasWork(curDrawBE))
 880         {
 881             break;
 882         }
 883
 884         uint32_t loop = 0;
 885         while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && !threadHasWork(curDrawBE))
 886         {
 887             _mm_pause();
 888         }
 889
 890         if (!threadHasWork(curDrawBE))
 891         {
 892             lock.lock();
 893
 894             // check for thread idle condition again under lock
 895             if (threadHasWork(curDrawBE))
 896             {
 897                 lock.unlock();
 898                 continue;
 899             }
 900
 901             pContext->FifosNotEmpty.wait(lock);
 902             lock.unlock();
 903         }
 904
 905         if (IsBEThread)
 906         {
 907             RDTSC_BEGIN(WorkerWorkOnFifoBE, 0);
 908             bShutdown |= WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
 909             RDTSC_END(WorkerWorkOnFifoBE, 0);
 910
 911             WorkOnCompute(pContext, workerId, curDrawBE);
 912         }
 913
 914         if (IsFEThread)
 915         {
 916             WorkOnFifoFE(pContext, workerId, curDrawFE);
 917
 918             if (!IsBEThread)
 919             {
 920                 curDrawBE = curDrawFE;
 921             }
 922         }
 923     }
 924
 925     return 0;
 926 }
 927 template<> DWORD workerThreadMain<false, false>(LPVOID) = delete;
 928
 929 template <bool IsFEThread, bool IsBEThread>
 930 DWORD workerThreadInit(LPVOID pData)
 931 {
 932 #if defined(_WIN32)
 933     __try
 934 #endif // _WIN32
 935     {
 936         return workerThreadMain<IsFEThread, IsBEThread>(pData);
 937     }
 938
 939 #if defined(_WIN32)
 940     __except(EXCEPTION_CONTINUE_SEARCH)
 941     {
 942     }
 943
 944 #endif // _WIN32
 945
 946     return 1;
 947 }
 948 template<> DWORD workerThreadInit<false, false>(LPVOID pData) = delete;
 949
 950 static void InitPerThreadStats(SWR_CONTEXT* pContext, uint32_t numThreads)
 951 {
 952     // Initialize DRAW_CONTEXT's per-thread stats
 953     for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
 954     {
 955         pContext->dcRing[dc].dynState.pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * numThreads, 64);
 956         memset(pContext->dcRing[dc].dynState.pStats, 0, sizeof(SWR_STATS) * numThreads);
 957     }
 958 }
 959
 960 //////////////////////////////////////////////////////////////////////////
 961 /// @brief Creates thread pool info but doesn't launch threads.
 962 /// @param pContext - pointer to context
 963 /// @param pPool - pointer to thread pool object.
 964 void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
 965 {
 966     CPUNumaNodes nodes;
 967     uint32_t numThreadsPerProcGroup = 0;
 968     CalculateProcessorTopology(nodes, numThreadsPerProcGroup);
 969
 970     // Assumption, for asymmetric topologies, multi-threaded cores will appear
 971     // in the list before single-threaded cores.  This appears to be true for
 972     // Windows when the total HW threads is limited to 64.
 973     uint32_t numHWNodes         = (uint32_t)nodes.size();
 974     uint32_t numHWCoresPerNode  = (uint32_t)nodes[0].cores.size();
 975     uint32_t numHWHyperThreads  = (uint32_t)nodes[0].cores[0].threadIds.size();
 976
 977 #if defined(_WIN32) && !defined(_WIN64)
 978     if (!pContext->threadInfo.MAX_WORKER_THREADS)
 979     {
 980         // Limit 32-bit windows to bindable HW threads only
 981         if ((numHWCoresPerNode * numHWHyperThreads) > 32)
 982         {
 983             numHWCoresPerNode = 32 / numHWHyperThreads;
 984         }
 985     }
 986 #endif
 987
 988     // Calculate num HW threads.  Due to asymmetric topologies, this is not
 989     // a trivial multiplication.
 990     uint32_t numHWThreads = 0;
 991     for (auto const& node : nodes)
 992     {
 993         for (auto const& core : node.cores)
 994         {
 995             numHWThreads += (uint32_t)core.threadIds.size();
 996         }
 997     }
 998
 999     uint32_t numNodes           = numHWNodes;
1000     uint32_t numCoresPerNode    = numHWCoresPerNode;
1001     uint32_t numHyperThreads    = numHWHyperThreads;
1002
1003     // Calc used threads per-core
1004     if (numHyperThreads > pContext->threadInfo.BASE_THREAD)
1005     {
1006         numHyperThreads -= pContext->threadInfo.BASE_THREAD;
1007     }
1008     else
1009     {
1010         SWR_ASSERT(
1011             false,
1012             "Cannot use BASE_THREAD value: %d, maxThreads: %d, reverting BASE_THREAD to 0",
1013             pContext->threadInfo.BASE_THREAD,
1014             numHyperThreads);
1015         pContext->threadInfo.BASE_THREAD = 0;
1016     }
1017
1018     if (pContext->threadInfo.MAX_THREADS_PER_CORE)
1019     {
1020         numHyperThreads = std::min(numHyperThreads, pContext->threadInfo.MAX_THREADS_PER_CORE);
1021     }
1022
1023     // Prune any cores that don't support the number of threads
1024     if (numHyperThreads > 1)
1025     {
1026         for (auto& node : nodes)
1027         {
1028             uint32_t numUsableCores = 0;
1029             for (auto& core : node.cores)
1030             {
1031                 numUsableCores += (core.threadIds.size() >= numHyperThreads);
1032             }
1033             numCoresPerNode = std::min(numCoresPerNode, numUsableCores);
1034         }
1035     }
1036
1037     // Calc used cores per NUMA node
1038     if (numCoresPerNode > pContext->threadInfo.BASE_CORE)
1039     {
1040         numCoresPerNode -= pContext->threadInfo.BASE_CORE;
1041     }
1042     else
1043     {
1044         SWR_ASSERT(
1045             false,
1046             "Cannot use BASE_CORE value: %d, maxCores: %d, reverting BASE_CORE to 0",
1047             pContext->threadInfo.BASE_CORE,
1048             numCoresPerNode);
1049         pContext->threadInfo.BASE_CORE = 0;
1050     }
1051
1052     if (pContext->threadInfo.MAX_CORES_PER_NUMA_NODE)
1053     {
1054         numCoresPerNode = std::min(numCoresPerNode, pContext->threadInfo.MAX_CORES_PER_NUMA_NODE);
1055     }
1056
1057     // Calc used NUMA nodes
1058     if (numNodes > pContext->threadInfo.BASE_NUMA_NODE)
1059     {
1060         numNodes -= pContext->threadInfo.BASE_NUMA_NODE;
1061     }
1062     else
1063     {
1064         SWR_ASSERT(
1065             false,
1066             "Cannot use BASE_NUMA_NODE value: %d, maxNodes: %d, reverting BASE_NUMA_NODE to 0",
1067             pContext->threadInfo.BASE_NUMA_NODE,
1068             numNodes);
1069         pContext->threadInfo.BASE_NUMA_NODE = 0;
1070     }
1071
1072     if (pContext->threadInfo.MAX_NUMA_NODES)
1073     {
1074         numNodes = std::min(numNodes, pContext->threadInfo.MAX_NUMA_NODES);
1075     }
1076
1077     // Calculate numThreads - at this point everything should be symmetric
1078     uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
1079     SWR_REL_ASSERT(numThreads <= numHWThreads);
1080
1081     uint32_t& numAPIReservedThreads = pContext->apiThreadInfo.numAPIReservedThreads;
1082     uint32_t& numAPIThreadsPerCore = pContext->apiThreadInfo.numAPIThreadsPerCore;
1083     uint32_t numRemovedThreads = 0;
1084
1085     if (pContext->threadInfo.SINGLE_THREADED)
1086     {
1087         numAPIReservedThreads = 0;
1088         numThreads = 1;
1089         pContext->NumWorkerThreads = 1;
1090         pContext->NumFEThreads = 1;
1091         pContext->NumBEThreads = 1;
1092         pPool->numThreads = 0;
1093     }
1094     else if (pContext->threadInfo.MAX_WORKER_THREADS)
1095     {
1096         numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, numHWThreads);
1097         pContext->threadInfo.BASE_NUMA_NODE = 0;
1098         pContext->threadInfo.BASE_CORE = 0;
1099         pContext->threadInfo.BASE_THREAD = 0;
1100         numAPIReservedThreads = 0;
1101     }
1102     else
1103     {
1104         if (numAPIReservedThreads >= numThreads)
1105         {
1106             numAPIReservedThreads = 0;
1107         }
1108         else if (numAPIReservedThreads)
1109         {
1110             numAPIThreadsPerCore = std::min(numAPIThreadsPerCore, numHWHyperThreads);
1111
1112             if (0 == numAPIThreadsPerCore)
1113             {
1114                 numAPIThreadsPerCore = numHWHyperThreads;
1115             }
1116
1117             numRemovedThreads = numAPIReservedThreads;
1118             if (numAPIThreadsPerCore == 2 && numHyperThreads == 1)
1119             {
1120                 // Adjust removed threads to make logic below work
1121                 numRemovedThreads = std::max(1U, (numRemovedThreads + numAPIThreadsPerCore - 1) / 2);
1122             }
1123
1124             numThreads -= numRemovedThreads;
1125         }
1126     }
1127
1128     InitPerThreadStats(pContext, numThreads);
1129
1130     if (pContext->threadInfo.SINGLE_THREADED)
1131     {
1132         numAPIReservedThreads = 0;
1133         numThreads = 1;
1134     }
1135
1136     if (numAPIReservedThreads)
1137     {
1138         pPool->pApiThreadData = new (std::nothrow) THREAD_DATA[numAPIReservedThreads];
1139         SWR_ASSERT(pPool->pApiThreadData);
1140         if (!pPool->pApiThreadData)
1141         {
1142             numAPIReservedThreads = 0;
1143         }
1144         else
1145         {
1146             memset(pPool->pApiThreadData, 0, sizeof(THREAD_DATA) * numAPIReservedThreads);
1147         }
1148     }
1149     pPool->numReservedThreads = numAPIReservedThreads;
1150
1151     pPool->numThreads = numThreads;
1152     pContext->NumWorkerThreads = pPool->numThreads;
1153
1154     pPool->pThreadData = new (std::nothrow) THREAD_DATA[pPool->numThreads];
1155     SWR_ASSERT(pPool->pThreadData);
1156     memset(pPool->pThreadData, 0, sizeof(THREAD_DATA) * pPool->numThreads);
1157     pPool->numaMask = 0;
1158
1159     // Allocate worker private data
1160     pPool->pWorkerPrivateDataArray = nullptr;
1161     if (pContext->workerPrivateState.perWorkerPrivateStateSize)
1162     {
1163         size_t perWorkerSize = AlignUpPow2(pContext->workerPrivateState.perWorkerPrivateStateSize, 64);
1164         size_t totalSize = perWorkerSize * pPool->numThreads;
1165         if (totalSize)
1166         {
1167             pPool->pWorkerPrivateDataArray = AlignedMalloc(totalSize, 64);
1168             SWR_ASSERT(pPool->pWorkerPrivateDataArray);
1169
1170             void* pWorkerData = pPool->pWorkerPrivateDataArray;
1171             for (uint32_t i = 0; i < pPool->numThreads; ++i)
1172             {
1173                 pPool->pThreadData[i].pWorkerPrivateData = pWorkerData;
1174                 if (pContext->workerPrivateState.pfnInitWorkerData)
1175                 {
1176                     pContext->workerPrivateState.pfnInitWorkerData(pWorkerData, i);
1177                 }
1178                 pWorkerData = PtrAdd(pWorkerData, perWorkerSize);
1179             }
1180         }
1181     }
1182
1183     if (pContext->threadInfo.SINGLE_THREADED)
1184     {
1185         return;
1186     }
1187
1188     pPool->pThreads = new (std::nothrow) THREAD_PTR[pPool->numThreads];
1189     SWR_ASSERT(pPool->pThreads);
1190
1191     if (pContext->threadInfo.MAX_WORKER_THREADS)
1192     {
1193         bool bForceBindProcGroup = (numThreads > numThreadsPerProcGroup);
1194         uint32_t numProcGroups = (numThreads + numThreadsPerProcGroup - 1) / numThreadsPerProcGroup;
1195         // When MAX_WORKER_THREADS is set we don't bother to bind to specific HW threads
1196         // But Windows will still require binding to specific process groups
1197         for (uint32_t workerId = 0; workerId < numThreads; ++workerId)
1198         {
1199             pPool->pThreadData[workerId].workerId = workerId;
1200             pPool->pThreadData[workerId].procGroupId = workerId % numProcGroups;
1201             pPool->pThreadData[workerId].threadId = 0;
1202             pPool->pThreadData[workerId].numaId = 0;
1203             pPool->pThreadData[workerId].coreId = 0;
1204             pPool->pThreadData[workerId].htId = 0;
1205             pPool->pThreadData[workerId].pContext = pContext;
1206             pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup;
1207
1208             pContext->NumBEThreads++;
1209             pContext->NumFEThreads++;
1210         }
1211     }
1212     else
1213     {
1214         // numa distribution assumes workers on all nodes
1215         bool useNuma = true;
1216         if (numCoresPerNode * numHyperThreads == 1)
1217         {
1218             useNuma = false;
1219         }
1220
1221         if (useNuma)
1222         {
1223             pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes (1, 2, 4, etc.)
1224         }
1225         else
1226         {
1227             pPool->numaMask = 0;
1228         }
1229
1230         uint32_t workerId = 0;
1231         uint32_t numReservedThreads = numAPIReservedThreads;
1232         for (uint32_t n = 0; n < numNodes; ++n)
1233         {
1234             if ((n + pContext->threadInfo.BASE_NUMA_NODE) >= nodes.size())
1235             {
1236                 break;
1237             }
1238             auto& node = nodes[n + pContext->threadInfo.BASE_NUMA_NODE];
1239             uint32_t numCores = numCoresPerNode;
1240             for (uint32_t c = 0; c < numCores; ++c)
1241             {
1242                 if ((c + pContext->threadInfo.BASE_CORE) >= node.cores.size())
1243                 {
1244                     break;
1245                 }
1246
1247                 auto& core = node.cores[c + pContext->threadInfo.BASE_CORE];
1248                 for (uint32_t t = 0; t < numHyperThreads; ++t)
1249                 {
1250                     if ((t + pContext->threadInfo.BASE_THREAD) >= core.threadIds.size())
1251                     {
1252                         break;
1253                     }
1254
1255                     if (numRemovedThreads)
1256                     {
1257                         --numRemovedThreads;
1258                         SWR_REL_ASSERT(numReservedThreads);
1259                         --numReservedThreads;
1260                         pPool->pApiThreadData[numReservedThreads].workerId = 0xFFFFFFFFU;
1261                         pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
1262                         pPool->pApiThreadData[numReservedThreads].threadId = core.threadIds[t];
1263                         pPool->pApiThreadData[numReservedThreads].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
1264                         pPool->pApiThreadData[numReservedThreads].coreId = c + pContext->threadInfo.BASE_CORE;
1265                         pPool->pApiThreadData[numReservedThreads].htId = t + pContext->threadInfo.BASE_THREAD;
1266                         pPool->pApiThreadData[numReservedThreads].pContext = pContext;
1267                         pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;
1268
1269
1270                         if (numAPIThreadsPerCore > numHyperThreads && numReservedThreads)
1271                         {
1272                             --numReservedThreads;
1273                             pPool->pApiThreadData[numReservedThreads].workerId = 0xFFFFFFFFU;
1274                             pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
1275                             pPool->pApiThreadData[numReservedThreads].threadId = core.threadIds[t + 1];
1276                             pPool->pApiThreadData[numReservedThreads].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
1277                             pPool->pApiThreadData[numReservedThreads].coreId = c + pContext->threadInfo.BASE_CORE;
1278                             pPool->pApiThreadData[numReservedThreads].htId = t + pContext->threadInfo.BASE_THREAD;
1279                             pPool->pApiThreadData[numReservedThreads].pContext = pContext;
1280                             pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;
1281                         }
1282
1283                         continue;
1284                     }
1285
1286                     SWR_ASSERT(workerId < numThreads);
1287
1288                     pPool->pThreadData[workerId].workerId = workerId;
1289                     pPool->pThreadData[workerId].procGroupId = core.procGroup;
1290                     pPool->pThreadData[workerId].threadId = core.threadIds[t + pContext->threadInfo.BASE_THREAD];
1291                     pPool->pThreadData[workerId].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
1292                     pPool->pThreadData[workerId].coreId = c + pContext->threadInfo.BASE_CORE;
1293                     pPool->pThreadData[workerId].htId = t + pContext->threadInfo.BASE_THREAD;
1294                     pPool->pThreadData[workerId].pContext = pContext;
1295                     pPool->pThreadData[workerId].forceBindProcGroup = false;
1296
1297                     pContext->NumBEThreads++;
1298                     pContext->NumFEThreads++;
1299
1300                     ++workerId;
1301                 }
1302             }
1303         }
1304         SWR_ASSERT(workerId == pContext->NumWorkerThreads);
1305     }
1306 }
1307
1308 //////////////////////////////////////////////////////////////////////////
1309 /// @brief Launches worker threads in thread pool.
1310 /// @param pContext - pointer to context
1311 /// @param pPool - pointer to thread pool object.
1312 void StartThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
1313 {
1314     if (pContext->threadInfo.SINGLE_THREADED)
1315     {
1316         return;
1317     }
1318
1319     for (uint32_t workerId = 0; workerId < pContext->NumWorkerThreads; ++workerId)
1320     {
1321         pPool->pThreads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
1322     }
1323 }
1324
1325 //////////////////////////////////////////////////////////////////////////
1326 /// @brief Destroys thread pool.
1327 /// @param pContext - pointer to context
1328 /// @param pPool - pointer to thread pool object.
1329 void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
1330 {
1331     // Wait for all threads to finish
1332     SwrWaitForIdle(pContext);
1333
1334     // Wait for threads to finish and destroy them
1335     for (uint32_t t = 0; t < pPool->numThreads; ++t)
1336     {
1337         if (!pContext->threadInfo.SINGLE_THREADED)
1338         {
1339             // Detach from thread.  Cannot join() due to possibility (in Windows) of code
1340             // in some DLLMain(THREAD_DETATCH case) blocking the thread until after this returns.
1341             pPool->pThreads[t]->detach();
1342             delete(pPool->pThreads[t]);
1343         }
1344
1345         if (pContext->workerPrivateState.pfnFinishWorkerData)
1346         {
1347             pContext->workerPrivateState.pfnFinishWorkerData(pPool->pThreadData[t].pWorkerPrivateData, t);
1348         }
1349     }
1350
1351     delete[] pPool->pThreads;
1352
1353     // Clean up data used by threads
1354     delete[] pPool->pThreadData;
1355     delete[] pPool->pApiThreadData;
1356
1357     AlignedFree(pPool->pWorkerPrivateDataArray);
1358 }