/****************************************************************************
 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 ****************************************************************************/
#include <stdio.h>
#include <thread>
#include <algorithm>
#include <vector>
#include <fstream>
#include <string>
#include <mutex>
#include <cstring>
#include <cstdlib>

#if defined(__linux__) || defined(__gnu_linux__) || defined(__APPLE__)
#include <pthread.h>
#include <sched.h>
#include <unistd.h>
#endif

#if defined(__APPLE__)
#include <sys/types.h>
#include <sys/sysctl.h>
#endif

#include "common/os.h"
#include "context.h"
#include "frontend.h"
#include "backend.h"
#include "rasterizer.h"
#include "rdtsc_core.h"
#include "tilemgr.h"
struct Core
{
    uint32_t              procGroup = 0;
    std::vector<uint32_t> threadIds;
};

struct NumaNode
{
    uint32_t          numaId;
    std::vector<Core> cores;
};

typedef std::vector<NumaNode> CPUNumaNodes;
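
// Illustrative example (hypothetical values, not queried from any real
// machine): a 2-socket box with two cores per socket and two HW threads per
// core would be represented as:
//
//   CPUNumaNodes nodes = {
//       {/* numaId */ 0, {{/* procGroup */ 0, {0, 1}}, {0, {2, 3}}}},
//       {/* numaId */ 1, {{/* procGroup */ 0, {4, 5}}, {0, {6, 7}}}},
//   };
//
// CalculateProcessorTopology() below fills this structure from the OS.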
void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThreadsPerProcGroup)
{
    out_numThreadsPerProcGroup = 0;

#if defined(_WIN32)

    std::vector<KAFFINITY> threadMaskPerProcGroup;

    static std::mutex           m;
    std::lock_guard<std::mutex> l(m);

    DWORD bufSize = 0;

    BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &bufSize);
    SWR_ASSERT(ret == FALSE && GetLastError() == ERROR_INSUFFICIENT_BUFFER);

    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBufferMem =
        (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(bufSize);
    SWR_ASSERT(pBufferMem);

    ret = GetLogicalProcessorInformationEx(RelationProcessorCore, pBufferMem, &bufSize);
    SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information");

    uint32_t                                 count   = bufSize / pBufferMem->Size;
    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = pBufferMem;

    for (uint32_t i = 0; i < count; ++i)
    {
        SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore);
        for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g)
        {
            auto&    gmask     = pBuffer->Processor.GroupMask[g];
            uint32_t threadId  = 0;
            uint32_t procGroup = gmask.Group;

            Core* pCore = nullptr;

            while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask))
            {
                // Clear this thread's bit so the scan advances
                KAFFINITY threadMask = KAFFINITY(1) << threadId;
                gmask.Mask &= ~threadMask;
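
                // Sketch of this enumeration (illustrative mask value): for
                // gmask.Mask == 0b1011 the scan visits threadId 0, then 1,
                // then 3, clearing each bit as it is consumed, so the loop
                // terminates once the group's mask is empty.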
                if (procGroup >= threadMaskPerProcGroup.size())
                {
                    threadMaskPerProcGroup.resize(procGroup + 1);
                }

                if (threadMaskPerProcGroup[procGroup] & threadMask)
                {
                    // Already seen this mask. This means that we are in 32-bit mode and
                    // have seen more than 32 HW threads for this procGroup; don't use it.
#if defined(_WIN64)
                    SWR_INVALID("Shouldn't get here in 64-bit mode");
#endif
                    continue;
                }

                threadMaskPerProcGroup[procGroup] |= (KAFFINITY(1) << threadId);

                // Find the NUMA node this HW thread belongs to
                uint32_t         numaId  = 0;
                PROCESSOR_NUMBER procNum = {};
                procNum.Group            = WORD(procGroup);
                procNum.Number           = UCHAR(threadId);

                ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId);
                SWR_ASSERT(ret);

                // Store data
                if (out_nodes.size() <= numaId)
                {
                    out_nodes.resize(numaId + 1);
                }
                auto& numaNode  = out_nodes[numaId];
                numaNode.numaId = numaId;

                if (nullptr == pCore)
                {
                    numaNode.cores.push_back(Core());
                    pCore            = &numaNode.cores.back();
                    pCore->procGroup = procGroup;
                }
                pCore->threadIds.push_back(threadId);

                out_numThreadsPerProcGroup++;
            }
        }
        pBuffer = PtrAdd(pBuffer, pBuffer->Size);
    }

    free(pBufferMem);
#elif defined(__linux__) || defined(__gnu_linux__)

    // Parse /proc/cpuinfo to get full topology
    std::ifstream input("/proc/cpuinfo");
    std::string   line;
    char*         c;
    uint32_t      procId = uint32_t(-1);
    uint32_t      coreId = uint32_t(-1);
    uint32_t      physId = uint32_t(-1);

    while (std::getline(input, line))
    {
        if (line.find("processor") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            procId          = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }

        if (line.find("core id") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            coreId          = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }

        if (line.find("physical id") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            physId          = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }

        if (line.length() == 0)
        {
            if (physId + 1 > out_nodes.size())
                out_nodes.resize(physId + 1);
            auto& numaNode  = out_nodes[physId];
            numaNode.numaId = physId;

            if (coreId + 1 > numaNode.cores.size())
                numaNode.cores.resize(coreId + 1);
            auto& core     = numaNode.cores[coreId];
            core.procGroup = coreId;
            core.threadIds.push_back(procId);
        }
    }

    out_numThreadsPerProcGroup = 0;
    for (auto& node : out_nodes)
    {
        for (auto& core : node.cores)
        {
            out_numThreadsPerProcGroup += core.threadIds.size();
        }
    }
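
    // Illustrative /proc/cpuinfo record and the fields the loop above parses:
    //
    //   processor   : 5      -> procId = 5  (logical HW thread)
    //   physical id : 0      -> physId = 0  (package / NUMA node index)
    //   core id     : 2      -> coreId = 2  (core within the package)
    //
    // The blank line that terminates each record commits the
    // (physId, coreId, procId) triple into out_nodes.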
#elif defined(__APPLE__)

    auto numProcessors  = 0;
    auto numCores       = 0;
    auto numPhysicalIds = 0;

    int    value;
    size_t size = sizeof(value);

    int result = sysctlbyname("hw.packages", &value, &size, NULL, 0);
    SWR_ASSERT(result == 0);
    numPhysicalIds = value;

    result = sysctlbyname("hw.logicalcpu", &value, &size, NULL, 0);
    SWR_ASSERT(result == 0);
    numProcessors = value;

    result = sysctlbyname("hw.physicalcpu", &value, &size, NULL, 0);
    SWR_ASSERT(result == 0);
    numCores = value;

    out_nodes.resize(numPhysicalIds);

    for (auto physId = 0; physId < numPhysicalIds; ++physId)
    {
        auto& numaNode = out_nodes[physId];
        auto  procId   = 0;

        numaNode.cores.resize(numCores);

        while (procId < numProcessors)
        {
            for (auto coreId = 0; coreId < numaNode.cores.size(); ++coreId, ++procId)
            {
                auto& core = numaNode.cores[coreId];

                core.procGroup = coreId;
                core.threadIds.push_back(procId);
            }
        }
    }

    out_numThreadsPerProcGroup = 0;

    for (auto& node : out_nodes)
    {
        for (auto& core : node.cores)
        {
            out_numThreadsPerProcGroup += core.threadIds.size();
        }
    }

#else

#error Unsupported platform

#endif
    // Prune empty cores and numa nodes
    for (auto node_it = out_nodes.begin(); node_it != out_nodes.end();)
    {
        // Erase empty cores (first)
        for (auto core_it = node_it->cores.begin(); core_it != node_it->cores.end();)
        {
            if (core_it->threadIds.size() == 0)
            {
                core_it = node_it->cores.erase(core_it);
            }
            else
            {
                ++core_it;
            }
        }

        // Erase empty numa nodes (second)
        if (node_it->cores.size() == 0)
        {
            node_it = out_nodes.erase(node_it);
        }
        else
        {
            ++node_it;
        }
    }
}
void bindThread(SWR_CONTEXT* pContext,
                uint32_t     threadId,
                uint32_t     procGroupId   = 0,
                bool         bindProcGroup = false)
{
    // Only bind threads when MAX_WORKER_THREADS isn't set.
    if (pContext->threadInfo.SINGLE_THREADED ||
        (pContext->threadInfo.MAX_WORKER_THREADS && bindProcGroup == false))
    {
        return;
    }

#if defined(_WIN32)

    GROUP_AFFINITY affinity = {};
    affinity.Group          = procGroupId;

#if !defined(_WIN64)
    if (threadId >= 32)
    {
        // Hopefully we don't get here. Logic in CreateThreadPool should prevent this.
        SWR_INVALID("Shouldn't get here");

        // In a 32-bit process on Windows it is impossible to bind
        // to logical processors 32-63 within a processor group.
        // In this case set the mask to 0 and let the system assign
        // the processor. Hopefully it will make smart choices.
        affinity.Mask = 0;
    }
    else
#endif
    {
        // If MAX_WORKER_THREADS is set, only bind to the proc group,
        // not the individual HW thread.
        if (!bindProcGroup && !pContext->threadInfo.MAX_WORKER_THREADS)
        {
            affinity.Mask = KAFFINITY(1) << threadId;
        }
        else
        {
            affinity.Mask = KAFFINITY(0);
        }
    }

    if (!SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr))
    {
        SWR_INVALID("Failed to set Thread Affinity");
    }

#elif defined(__linux__) || defined(__gnu_linux__)

    cpu_set_t cpuset;
    pthread_t thread = pthread_self();
    CPU_ZERO(&cpuset);
    CPU_SET(threadId, &cpuset);

    int err = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
    if (err != 0)
    {
        fprintf(stderr, "pthread_setaffinity_np failure for tid %u: %s\n", threadId, strerror(err));
    }

#endif
}
uint32_t GetEnqueuedDraw(SWR_CONTEXT* pContext)
{
    return pContext->dcRing.GetHead();
}

DRAW_CONTEXT* GetDC(SWR_CONTEXT* pContext, uint32_t drawId)
{
    return &pContext->dcRing[(drawId - 1) % pContext->MAX_DRAWS_IN_FLIGHT];
}
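
// Example of the ring indexing above (assuming MAX_DRAWS_IN_FLIGHT == 256):
// drawId 1 maps to dcRing[0], drawId 256 to dcRing[255], and drawId 257 wraps
// back to dcRing[0]. Draw IDs start at 1, hence the (drawId - 1) bias.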
bool IDComparesLess(uint32_t a, uint32_t b)
{
    // Use signed delta to ensure that wrap-around to 0 is correctly handled.
    int32_t delta = int32_t(a - b);
    return (delta < 0);
}
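
// Worked example: near the 32-bit wrap, IDComparesLess(0xFFFFFFFFu, 1u)
// computes int32_t(0xFFFFFFFFu - 1u) == -2, so the pre-wrap ID 0xFFFFFFFF
// still compares "less than" the post-wrap ID 1. A plain (a < b) would
// invert the ordering at the wrap point.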
// returns true if dependency not met
bool CheckDependency(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t lastRetiredDraw)
{
    return pDC->dependent && IDComparesLess(lastRetiredDraw, pDC->drawId - 1);
}

bool CheckDependencyFE(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t lastRetiredDraw)
{
    return pDC->dependentFE && IDComparesLess(lastRetiredDraw, pDC->drawId - 1);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Update client stats.
INLINE void UpdateClientStats(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
{
    if ((pContext->pfnUpdateStats == nullptr) || (GetApiState(pDC).enableStatsBE == false))
    {
        return;
    }

    DRAW_DYNAMIC_STATE& dynState = pDC->dynState;
    OSALIGNLINE(SWR_STATS) stats{0};

    // Sum up stats across all workers before sending to client.
    for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
    {
        stats.DepthPassCount += dynState.pStats[i].DepthPassCount;
        stats.PsInvocations  += dynState.pStats[i].PsInvocations;
        stats.CsInvocations  += dynState.pStats[i].CsInvocations;
    }

    pContext->pfnUpdateStats(GetPrivateState(pDC), &stats);
}
INLINE void ExecuteCallbacks(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
{
    UpdateClientStats(pContext, workerId, pDC);

    if (pDC->retireCallback.pfnCallbackFunc)
    {
        pDC->retireCallback.pfnCallbackFunc(pDC->retireCallback.userData,
                                            pDC->retireCallback.userData2,
                                            pDC->retireCallback.userData3);

        // Callbacks to external code *could* change floating point control state
        // Reset our optimal flags
        SetOptimalVectorCSR();
    }
}
// inlined-only version
INLINE int32_t CompleteDrawContextInl(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
{
    int32_t result = static_cast<int32_t>(InterlockedDecrement(&pDC->threadsDone));
    SWR_ASSERT(result >= 0);

    AR_FLUSH(pDC->drawId);

    if (result == 0)
    {
        ExecuteCallbacks(pContext, workerId, pDC);

        // Cleanup memory allocations
        pDC->pArena->Reset(true);
        if (!pDC->isCompute)
        {
            pDC->pTileMgr->initialize();
        }
        if (pDC->cleanupState)
        {
            pDC->pState->pArena->Reset(true);
        }

        _ReadWriteBarrier();

        pContext->dcRing.Dequeue(); // Remove from tail
    }

    return result;
}
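
// Reference-count sketch: threadsDone is pre-loaded with the number of
// participating threads, each caller decrements it, and only the caller that
// drives it to zero retires the DC. With four workers, for example, the
// decrements return 3, 2, 1, 0 and only the last performs the cleanup above.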
// available to other translation modules
int32_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
{
    return CompleteDrawContextInl(pContext, 0, pDC);
}
INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext,
                                    uint32_t     workerId,
                                    uint32_t&    curDrawBE,
                                    uint32_t&    drawEnqueued)
{
    // increment our current draw id to the first incomplete draw
    drawEnqueued = GetEnqueuedDraw(pContext);
    while (IDComparesLess(curDrawBE, drawEnqueued))
    {
        DRAW_CONTEXT* pDC = &pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT];

        // If it's not compute and FE is not done then break out of loop.
        if (!pDC->doneFE && !pDC->isCompute)
        {
            break;
        }

        bool isWorkComplete =
            pDC->isCompute ? pDC->pDispatch->isWorkComplete() : pDC->pTileMgr->isWorkComplete();

        if (isWorkComplete)
        {
            curDrawBE++;
            CompleteDrawContextInl(pContext, workerId, pDC);
        }
        else
        {
            break;
        }
    }

    // If there are no more incomplete draws then return false.
    return IDComparesLess(curDrawBE, drawEnqueued);
}
//////////////////////////////////////////////////////////////////////////
/// @brief If there is any BE work then go work on it.
/// @param pContext - pointer to SWR context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker
///                    has its own curDrawBE counter and this ensures that each worker processes all
///                    the draws in order.
/// @param lockedTiles - This is the set of tiles locked by other threads. Each thread maintains its
///                      own set and each time it fails to lock a macrotile, because it's already
///                      locked, it will add that tile to the lockedTiles set. As a worker begins
///                      to work on future draws, lockedTiles ensures that it doesn't work on tiles
///                      that may still have work pending in a previous draw. Additionally,
///                      lockedTiles is a heuristic that can steer a worker back to the same
///                      macrotile that it had been working on in a previous draw.
/// @returns true if worker thread should shutdown
bool WorkOnFifoBE(SWR_CONTEXT* pContext,
                  uint32_t     workerId,
                  uint32_t&    curDrawBE,
                  TileSet&     lockedTiles,
                  uint32_t     numaNode,
                  uint32_t     numaMask)
{
    bool bShutdown = false;

    // Find the first incomplete draw that has pending work. If no such draw is found then
    // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
    uint32_t drawEnqueued = 0;
    if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
    {
        return false;
    }

    uint32_t lastRetiredDraw =
        pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1;

    // Reset our history for locked tiles. We'll have to re-learn which tiles are locked.
    lockedTiles.clear();

    // Try to work on each draw in order of the available draws in flight.
    //   1. If we're on curDrawBE, we can work on any macrotile that is available.
    //   2. If we're trying to work on draws after curDrawBE, we are restricted to
    //      working on those macrotiles that are known to be complete in the prior draw to
    //      maintain order. The locked tiles provide the history to ensure this.
    for (uint32_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
    {
        DRAW_CONTEXT* pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT];

        if (pDC->isCompute)
            return false; // We don't look at compute work.

        // First wait for FE to be finished with this draw. This keeps threading model simple
        // but if there are lots of bubbles between draws then serializing FE and BE may
        // need to be revisited.
        if (!pDC->doneFE)
            return false;

        // If this draw is dependent on a previous draw then we need to bail.
        if (CheckDependency(pContext, pDC, lastRetiredDraw))
        {
            return false;
        }

        // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it.
        auto& macroTiles = pDC->pTileMgr->getDirtyTiles();

        for (auto tile : macroTiles)
        {
            uint32_t tileID = tile->mId;

            // Only work on tiles for this numa node
            uint32_t x, y;
            pDC->pTileMgr->getTileIndices(tileID, x, y);
            if (((x ^ y) & numaMask) != numaNode)
            {
                continue;
            }
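
            // Example (illustrative): with two NUMA nodes, numaMask == 1 and
            // tiles split on the parity of (x ^ y): tile (0,0) -> node 0,
            // (1,0) -> node 1, (1,1) -> node 0, giving a checkerboard
            // distribution of macrotiles across the nodes.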
            if (!tile->getNumQueued())
            {
                continue;
            }

            // can only work on this draw if it's not in use by other threads
            if (lockedTiles.get(tileID))
            {
                continue;
            }

            if (tile->tryLock())
            {
                BE_WORK* pWork;

                RDTSC_BEGIN(pContext->pBucketMgr, WorkerFoundWork, pDC->drawId);

                uint32_t numWorkItems = tile->getNumQueued();
                SWR_ASSERT(numWorkItems);

                pWork = tile->peek();
                SWR_ASSERT(pWork);
                if (pWork->type == DRAW)
                {
                    pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, workerId, tileID);
                }
                else if (pWork->type == SHUTDOWN)
                {
                    bShutdown = true;
                }

                while ((pWork = tile->peek()) != nullptr)
                {
                    pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
                    tile->dequeue();
                }
                RDTSC_END(pContext->pBucketMgr, WorkerFoundWork, numWorkItems);

                _ReadWriteBarrier();

                pDC->pTileMgr->markTileComplete(tileID);

                // Optimization: If the draw is complete and we're the last one to have worked on it
                // then we can reset the locked list as we know that all previous draws before the
                // next are guaranteed to be complete.
                if ((curDrawBE == i) && (bShutdown || pDC->pTileMgr->isWorkComplete()))
                {
                    // We can increment the current BE and safely move to next draw since we know
                    // this draw is complete.
                    curDrawBE++;
                    CompleteDrawContextInl(pContext, workerId, pDC);

                    lastRetiredDraw++;

                    lockedTiles.clear();
                    break;
                }

                if (bShutdown)
                {
                    break;
                }
            }
            else
            {
                // This tile is already locked. So let's add it to our locked tiles set. This way we
                // don't try locking this one again.
                lockedTiles.set(tileID);
            }
        }
    }

    return bShutdown;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Called when FE work is complete for this DC.
INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
{
    if (pContext->pfnUpdateStatsFE && GetApiState(pDC).enableStatsFE)
    {
        SWR_STATS_FE& stats = pDC->dynState.statsFE;

        AR_EVENT(FrontendStatsEvent(pDC->drawId,
                                    stats.IaVertices,
                                    stats.IaPrimitives,
                                    stats.VsInvocations,
                                    stats.HsInvocations,
                                    stats.DsInvocations,
                                    stats.GsInvocations,
                                    stats.GsPrimitives,
                                    stats.CInvocations,
                                    stats.CPrimitives,
                                    stats.SoPrimStorageNeeded[0],
                                    stats.SoPrimStorageNeeded[1],
                                    stats.SoPrimStorageNeeded[2],
                                    stats.SoPrimStorageNeeded[3],
                                    stats.SoNumPrimsWritten[0],
                                    stats.SoNumPrimsWritten[1],
                                    stats.SoNumPrimsWritten[2],
                                    stats.SoNumPrimsWritten[3]));
        AR_EVENT(FrontendDrawEndEvent(pDC->drawId));

        pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &stats);
    }

    if (pContext->pfnUpdateSoWriteOffset)
    {
        for (uint32_t i = 0; i < MAX_SO_BUFFERS; ++i)
        {
            if ((pDC->dynState.SoWriteOffsetDirty[i]) &&
                (pDC->pState->state.soBuffer[i].soWriteEnable))
            {
                pContext->pfnUpdateSoWriteOffset(
                    GetPrivateState(pDC), i, pDC->dynState.SoWriteOffset[i]);
            }
        }
    }

    if (pContext->pfnUpdateStreamOut)
        pContext->pfnUpdateStreamOut(GetPrivateState(pDC), pDC->dynState.soPrims);

    // Ensure all streaming writes are globally visible before marking this FE done
    _mm_mfence();

    pDC->doneFE = true;

    InterlockedDecrement(&pContext->drawsOutstandingFE);
}
void WorkOnFifoFE(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawFE)
{
    // Try to grab the next DC from the ring
    uint32_t drawEnqueued = GetEnqueuedDraw(pContext);
    while (IDComparesLess(curDrawFE, drawEnqueued))
    {
        uint32_t      dcSlot = curDrawFE % pContext->MAX_DRAWS_IN_FLIGHT;
        DRAW_CONTEXT* pDC    = &pContext->dcRing[dcSlot];
        if (pDC->isCompute || pDC->doneFE)
        {
            CompleteDrawContextInl(pContext, workerId, pDC);
            curDrawFE++;
        }
        else
        {
            break;
        }
    }

    uint32_t lastRetiredFE = curDrawFE - 1;
    uint32_t curDraw       = curDrawFE;
    while (IDComparesLess(curDraw, drawEnqueued))
    {
        uint32_t      dcSlot = curDraw % pContext->MAX_DRAWS_IN_FLIGHT;
        DRAW_CONTEXT* pDC    = &pContext->dcRing[dcSlot];

        if (!pDC->FeLock && !pDC->isCompute)
        {
            if (CheckDependencyFE(pContext, pDC, lastRetiredFE))
            {
                return;
            }

            uint32_t initial = InterlockedCompareExchange((volatile uint32_t*)&pDC->FeLock, 1, 0);
            if (initial == 0)
            {
                // successfully grabbed the DC, now run the FE
                pDC->FeWork.pfnWork(pContext, pDC, workerId, &pDC->FeWork.desc);

                CompleteDrawFE(pContext, workerId, pDC);
            }
        }
        curDraw++;
    }
}
//////////////////////////////////////////////////////////////////////////
/// @brief If there is any compute work then go work on it.
/// @param pContext - pointer to SWR context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker
///                    has its own curDrawBE counter and this ensures that each worker processes all
///                    the draws in order.
void WorkOnCompute(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawBE)
{
    uint32_t drawEnqueued = 0;
    if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
    {
        return;
    }

    uint32_t lastRetiredDraw =
        pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1;

    for (uint64_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
    {
        DRAW_CONTEXT* pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT];
        if (pDC->isCompute == false)
            return;

        // check dependencies
        if (CheckDependency(pContext, pDC, lastRetiredDraw))
        {
            return;
        }

        SWR_ASSERT(pDC->pDispatch != nullptr);
        DispatchQueue& queue = *pDC->pDispatch;

        // Is there any work remaining?
        if (queue.getNumQueued() > 0)
        {
            void*    pSpillFillBuffer = nullptr;
            void*    pScratchSpace    = nullptr;
            uint32_t threadGroupId    = 0;
            while (queue.getWork(threadGroupId))
            {
                queue.dispatch(pDC, workerId, threadGroupId, pSpillFillBuffer, pScratchSpace);
                queue.finishedWork();
            }

            // Ensure all streaming writes are globally visible before moving onto the next draw
            _mm_mfence();
        }
    }
}
void BindApiThread(SWR_CONTEXT* pContext, uint32_t apiThreadId)
{
    if (nullptr == pContext)
    {
        return;
    }

    if (apiThreadId >= pContext->threadPool.numReservedThreads)
    {
        if (pContext->threadPool.numReservedThreads)
        {
            const THREAD_DATA& threadData = pContext->threadPool.pApiThreadData[0];
            // Just bind to the process group used for API thread 0
            bindThread(pContext, 0, threadData.procGroupId, true);
        }
        return;
    }

    const THREAD_DATA& threadData = pContext->threadPool.pApiThreadData[apiThreadId];

    bindThread(
        pContext, threadData.threadId, threadData.procGroupId, threadData.forceBindProcGroup);
}
template <bool IsFEThread, bool IsBEThread>
DWORD workerThreadMain(LPVOID pData)
{
    THREAD_DATA* pThreadData = (THREAD_DATA*)pData;
    SWR_CONTEXT* pContext    = pThreadData->pContext;
    uint32_t     threadId    = pThreadData->threadId;
    uint32_t     workerId    = pThreadData->workerId;

    bindThread(pContext, threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup);

    {
        char threadName[64];
        sprintf_s(threadName,
#if defined(_WIN32)
                  "SWRWorker_%02d_NUMA%d_Core%02d_T%d",
#else
                  // linux pthread name limited to 16 chars (including \0)
                  "w%03d-n%d-c%03d-t%d",
#endif
                  workerId,
                  pThreadData->numaId,
                  pThreadData->coreId,
                  pThreadData->htId);
        SetCurrentThreadName(threadName);
    }

    RDTSC_INIT(pContext->pBucketMgr, threadId);

    // Only need offset numa index from base for correct masking
    uint32_t numaNode = pThreadData->numaId - pContext->threadInfo.BASE_NUMA_NODE;
    uint32_t numaMask = pContext->threadPool.numaMask;

    SetOptimalVectorCSR();

    // Track tiles locked by other threads. If we try to lock a macrotile and find it's already
    // locked then we'll add it to this list so that we don't try and lock it again.
    TileSet lockedTiles;

    // each worker has the ability to work on any of the queued draws as long as certain
    // conditions are met. the data associated
    // with a draw is guaranteed to be active as long as a worker hasn't signaled that he
    // has moved on to the next draw when he determines there is no more work to do. The api
    // thread will not increment the head of the dc ring until all workers have moved past the
    // current head.
    // the logic to determine what to work on is:
    // 1- try to work on the FE any draw that is queued. For now there are no dependencies
    //    on the FE work, so any worker can grab any FE and process in parallel. Eventually
    //    we'll need dependency tracking to force serialization on FEs. The worker will try
    //    to pick an FE by atomically incrementing a counter in the swr context. he'll keep
    //    trying until he reaches the tail.
    // 2- BE work must be done in strict order. we accomplish this today by pulling work off
    //    the oldest draw (ie the head) of the dcRing. the worker can determine if there is
    //    any work left by comparing the total # of binned work items and the total # of completed
    //    work items. If they are equal, then there is no more work to do for this draw, and
    //    the worker can safely increment its oldestDraw counter and move on to the next draw.
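
    // Condensed sketch of the loop below (illustrative only):
    //
    //   while (!bShutdown || threadHasWork(curDrawBE))
    //   {
    //       // spin briefly, then sleep on pContext->FifosNotEmpty if idle
    //       if (IsBEThread) { bShutdown |= WorkOnFifoBE(...); WorkOnCompute(...); }
    //       if (IsFEThread) { WorkOnFifoFE(...); if (!IsBEThread) curDrawBE = curDrawFE; }
    //   }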
    std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);

    auto threadHasWork = [&](uint32_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };

    uint32_t curDrawBE = 0;
    uint32_t curDrawFE = 0;

    bool bShutdown = false;

    while (true)
    {
        if (bShutdown && !threadHasWork(curDrawBE))
        {
            break;
        }

        uint32_t loop = 0;
        while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && !threadHasWork(curDrawBE))
        {
            _mm_pause();
        }

        if (!threadHasWork(curDrawBE))
        {
            lock.lock();

            // check for thread idle condition again under lock
            if (threadHasWork(curDrawBE))
            {
                lock.unlock();
            }
            else
            {
                pContext->FifosNotEmpty.wait(lock);
                lock.unlock();
            }
        }

        if (IsBEThread)
        {
            RDTSC_BEGIN(pContext->pBucketMgr, WorkerWorkOnFifoBE, 0);
            bShutdown |=
                WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
            RDTSC_END(pContext->pBucketMgr, WorkerWorkOnFifoBE, 0);

            WorkOnCompute(pContext, workerId, curDrawBE);
        }

        if (IsFEThread)
        {
            WorkOnFifoFE(pContext, workerId, curDrawFE);

            if (!IsBEThread)
            {
                curDrawBE = curDrawFE;
            }
        }
    }

    return 0;
}

template <>
DWORD workerThreadMain<false, false>(LPVOID) = delete;
template <bool IsFEThread, bool IsBEThread>
DWORD workerThreadInit(LPVOID pData)
{
#if defined(_MSC_VER)
    __try
#endif // _MSC_VER
    {
        return workerThreadMain<IsFEThread, IsBEThread>(pData);
    }
#if defined(_MSC_VER)
    __except (EXCEPTION_CONTINUE_SEARCH)
    {
    }
#endif // _MSC_VER

    return 1;
}

template <>
DWORD workerThreadInit<false, false>(LPVOID pData) = delete;
static void InitPerThreadStats(SWR_CONTEXT* pContext, uint32_t numThreads)
{
    // Initialize DRAW_CONTEXT's per-thread stats
    for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
    {
        pContext->dcRing[dc].dynState.pStats =
            (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * numThreads, 64);
        memset(pContext->dcRing[dc].dynState.pStats, 0, sizeof(SWR_STATS) * numThreads);
    }
}
//////////////////////////////////////////////////////////////////////////
/// @brief Creates thread pool info but doesn't launch threads.
/// @param pContext - pointer to context
/// @param pPool - pointer to thread pool object.
void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
{
    CPUNumaNodes nodes;
    uint32_t     numThreadsPerProcGroup = 0;
    CalculateProcessorTopology(nodes, numThreadsPerProcGroup);
    assert(numThreadsPerProcGroup > 0);

    // Assumption, for asymmetric topologies, multi-threaded cores will appear
    // in the list before single-threaded cores. This appears to be true for
    // Windows when the total HW threads is limited to 64.
    uint32_t numHWNodes        = (uint32_t)nodes.size();
    uint32_t numHWCoresPerNode = (uint32_t)nodes[0].cores.size();
    uint32_t numHWHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size();

#if defined(_WIN32) && !defined(_WIN64)
    if (!pContext->threadInfo.MAX_WORKER_THREADS)
    {
        // Limit 32-bit windows to bindable HW threads only
        if ((numHWCoresPerNode * numHWHyperThreads) > 32)
        {
            numHWCoresPerNode = 32 / numHWHyperThreads;
        }
    }
#endif

    // Calculate num HW threads. Due to asymmetric topologies, this is not
    // a trivial multiplication.
    uint32_t numHWThreads = 0;
    for (auto const& node : nodes)
    {
        for (auto const& core : node.cores)
        {
            numHWThreads += (uint32_t)core.threadIds.size();
        }
    }

    uint32_t numNodes        = numHWNodes;
    uint32_t numCoresPerNode = numHWCoresPerNode;
    uint32_t numHyperThreads = numHWHyperThreads;

    // Calc used threads per-core
    if (numHyperThreads > pContext->threadInfo.BASE_THREAD)
    {
        numHyperThreads -= pContext->threadInfo.BASE_THREAD;
    }
    else
    {
        SWR_ASSERT(false,
                   "Cannot use BASE_THREAD value: %d, maxThreads: %d, reverting BASE_THREAD to 0",
                   pContext->threadInfo.BASE_THREAD,
                   numHyperThreads);
        pContext->threadInfo.BASE_THREAD = 0;
    }

    if (pContext->threadInfo.MAX_THREADS_PER_CORE)
    {
        numHyperThreads = std::min(numHyperThreads, pContext->threadInfo.MAX_THREADS_PER_CORE);
    }

    // Prune any cores that don't support the number of threads
    if (numHyperThreads > 1)
    {
        for (auto& node : nodes)
        {
            uint32_t numUsableCores = 0;
            for (auto& core : node.cores)
            {
                numUsableCores += (core.threadIds.size() >= numHyperThreads);
            }
            numCoresPerNode = std::min(numCoresPerNode, numUsableCores);
        }
    }

    // Calc used cores per NUMA node
    if (numCoresPerNode > pContext->threadInfo.BASE_CORE)
    {
        numCoresPerNode -= pContext->threadInfo.BASE_CORE;
    }
    else
    {
        SWR_ASSERT(false,
                   "Cannot use BASE_CORE value: %d, maxCores: %d, reverting BASE_CORE to 0",
                   pContext->threadInfo.BASE_CORE,
                   numCoresPerNode);
        pContext->threadInfo.BASE_CORE = 0;
    }

    if (pContext->threadInfo.MAX_CORES_PER_NUMA_NODE)
    {
        numCoresPerNode = std::min(numCoresPerNode, pContext->threadInfo.MAX_CORES_PER_NUMA_NODE);
    }

    // Calc used NUMA nodes
    if (numNodes > pContext->threadInfo.BASE_NUMA_NODE)
    {
        numNodes -= pContext->threadInfo.BASE_NUMA_NODE;
    }
    else
    {
        SWR_ASSERT(false,
                   "Cannot use BASE_NUMA_NODE value: %d, maxNodes: %d, reverting BASE_NUMA_NODE to 0",
                   pContext->threadInfo.BASE_NUMA_NODE,
                   numNodes);
        pContext->threadInfo.BASE_NUMA_NODE = 0;
    }

    if (pContext->threadInfo.MAX_NUMA_NODES)
    {
        numNodes = std::min(numNodes, pContext->threadInfo.MAX_NUMA_NODES);
    }
    // Calculate numThreads - at this point everything should be symmetric
    uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
    SWR_REL_ASSERT(numThreads <= numHWThreads);
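
    // Worked example: 2 NUMA nodes * 8 cores/node * 2 threads/core gives
    // numThreads == 32, which must not exceed the numHWThreads counted from
    // the (possibly asymmetric) topology above.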
    uint32_t& numAPIReservedThreads = pContext->apiThreadInfo.numAPIReservedThreads;
    uint32_t& numAPIThreadsPerCore  = pContext->apiThreadInfo.numAPIThreadsPerCore;
    uint32_t  numRemovedThreads     = 0;

    if (pContext->threadInfo.SINGLE_THREADED)
    {
        numAPIReservedThreads      = 0;
        numThreads                 = 1;
        pContext->NumWorkerThreads = 1;
        pContext->NumFEThreads     = 1;
        pContext->NumBEThreads     = 1;
        pPool->numThreads          = 0;
    }
    else if (pContext->threadInfo.MAX_WORKER_THREADS)
    {
        numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, numHWThreads);
        pContext->threadInfo.BASE_NUMA_NODE = 0;
        pContext->threadInfo.BASE_CORE      = 0;
        pContext->threadInfo.BASE_THREAD    = 0;
        numAPIReservedThreads               = 0;
    }
    else
    {
        if (numAPIReservedThreads >= numThreads)
        {
            numAPIReservedThreads = 0;
        }
        else if (numAPIReservedThreads)
        {
            numAPIThreadsPerCore = std::min(numAPIThreadsPerCore, numHWHyperThreads);

            if (0 == numAPIThreadsPerCore)
            {
                numAPIThreadsPerCore = numHWHyperThreads;
            }

            numRemovedThreads = numAPIReservedThreads;
            if (numAPIThreadsPerCore == 2 && numHyperThreads == 1)
            {
                // Adjust removed threads to make logic below work
                numRemovedThreads =
                    std::max(1U, (numRemovedThreads + numAPIThreadsPerCore - 1) / 2);
            }

            numThreads -= numRemovedThreads;
        }
    }
    InitPerThreadStats(pContext, numThreads);

    if (pContext->threadInfo.SINGLE_THREADED)
    {
        numAPIReservedThreads = 0;
        numThreads            = 1;
    }

    if (numAPIReservedThreads)
    {
        pPool->pApiThreadData = new (std::nothrow) THREAD_DATA[numAPIReservedThreads];
        SWR_ASSERT(pPool->pApiThreadData);
        if (!pPool->pApiThreadData)
        {
            numAPIReservedThreads = 0;
        }
        else
        {
            memset(pPool->pApiThreadData, 0, sizeof(THREAD_DATA) * numAPIReservedThreads);
        }
    }
    pPool->numReservedThreads = numAPIReservedThreads;

    pPool->numThreads          = numThreads;
    pContext->NumWorkerThreads = pPool->numThreads;

    pPool->pThreadData = new (std::nothrow) THREAD_DATA[pPool->numThreads];
    assert(pPool->pThreadData);
    memset(pPool->pThreadData, 0, sizeof(THREAD_DATA) * pPool->numThreads);
    pPool->numaMask = 0;
    // Allocate worker private data
    pPool->pWorkerPrivateDataArray = nullptr;
    if (pContext->workerPrivateState.perWorkerPrivateStateSize == 0)
    {
        pContext->workerPrivateState.perWorkerPrivateStateSize = sizeof(SWR_WORKER_DATA);
        pContext->workerPrivateState.pfnInitWorkerData         = nullptr;
        pContext->workerPrivateState.pfnFinishWorkerData       = nullptr;
    }

    // initialize contents of SWR_WORKER_DATA
    size_t perWorkerSize = AlignUpPow2(pContext->workerPrivateState.perWorkerPrivateStateSize, 64);
    size_t totalSize     = perWorkerSize * pPool->numThreads;
    if (totalSize)
    {
        pPool->pWorkerPrivateDataArray = AlignedMalloc(totalSize, 64);
        SWR_ASSERT(pPool->pWorkerPrivateDataArray);

        void* pWorkerData = pPool->pWorkerPrivateDataArray;
        for (uint32_t i = 0; i < pPool->numThreads; ++i)
        {
            pPool->pThreadData[i].pWorkerPrivateData = pWorkerData;
            if (pContext->workerPrivateState.pfnInitWorkerData)
            {
                pContext->workerPrivateState.pfnInitWorkerData(pContext, pWorkerData, i);
            }
            pWorkerData = PtrAdd(pWorkerData, perWorkerSize);
        }
    }
    if (pContext->threadInfo.SINGLE_THREADED)
    {
        return;
    }

    pPool->pThreads = new (std::nothrow) THREAD_PTR[pPool->numThreads];
    assert(pPool->pThreads);
    if (pContext->threadInfo.MAX_WORKER_THREADS)
    {
        bool     bForceBindProcGroup = (numThreads > numThreadsPerProcGroup);
        uint32_t numProcGroups = (numThreads + numThreadsPerProcGroup - 1) / numThreadsPerProcGroup;
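
        // Example: numThreads == 96 with numThreadsPerProcGroup == 64 yields
        // numProcGroups == 2, and the loop below spreads workers across the
        // groups round-robin via (workerId % numProcGroups).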
        // When MAX_WORKER_THREADS is set we don't bother to bind to specific HW threads,
        // but Windows will still require binding to specific process groups
        for (uint32_t workerId = 0; workerId < numThreads; ++workerId)
        {
            pPool->pThreadData[workerId].workerId           = workerId;
            pPool->pThreadData[workerId].procGroupId        = workerId % numProcGroups;
            pPool->pThreadData[workerId].threadId           = 0;
            pPool->pThreadData[workerId].numaId             = 0;
            pPool->pThreadData[workerId].coreId             = 0;
            pPool->pThreadData[workerId].htId               = 0;
            pPool->pThreadData[workerId].pContext           = pContext;
            pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup;

            pContext->NumBEThreads++;
            pContext->NumFEThreads++;
        }
    }
    else
    {
        // numa distribution assumes workers on all nodes
        bool useNuma = true;
        if (numCoresPerNode * numHyperThreads == 1)
        {
            useNuma = false;
        }

        if (useNuma)
        {
            pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes (1, 2, 4, etc.)
        }
        else
        {
            pPool->numaMask = 0;
        }
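
        // Example: numNodes == 4 gives numaMask == 3 (0b11), so the
        // (x ^ y) & numaMask test in WorkOnFifoBE selects among all four
        // nodes; a non-power-of-two node count would alias nodes, hence the
        // 2**n restriction noted above.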
        uint32_t workerId           = 0;
        uint32_t numReservedThreads = numAPIReservedThreads;
        for (uint32_t n = 0; n < numNodes; ++n)
        {
            if ((n + pContext->threadInfo.BASE_NUMA_NODE) >= nodes.size())
            {
                break;
            }
            auto&    node     = nodes[n + pContext->threadInfo.BASE_NUMA_NODE];
            uint32_t numCores = numCoresPerNode;
            for (uint32_t c = 0; c < numCores; ++c)
            {
                if ((c + pContext->threadInfo.BASE_CORE) >= node.cores.size())
                {
                    break;
                }
                auto& core = node.cores[c + pContext->threadInfo.BASE_CORE];
                for (uint32_t t = 0; t < numHyperThreads; ++t)
                {
                    if ((t + pContext->threadInfo.BASE_THREAD) >= core.threadIds.size())
                    {
                        break;
                    }

                    if (numRemovedThreads)
                    {
                        --numRemovedThreads;
                        assert(numReservedThreads);
                        --numReservedThreads;
                        pPool->pApiThreadData[numReservedThreads].workerId    = 0xFFFFFFFFU;
                        pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
                        pPool->pApiThreadData[numReservedThreads].threadId    = core.threadIds[t];
                        pPool->pApiThreadData[numReservedThreads].numaId =
                            useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
                        pPool->pApiThreadData[numReservedThreads].coreId =
                            c + pContext->threadInfo.BASE_CORE;
                        pPool->pApiThreadData[numReservedThreads].htId =
                            t + pContext->threadInfo.BASE_THREAD;
                        pPool->pApiThreadData[numReservedThreads].pContext           = pContext;
                        pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;

                        if (numAPIThreadsPerCore > numHyperThreads && numReservedThreads)
                        {
                            --numReservedThreads;
                            pPool->pApiThreadData[numReservedThreads].workerId    = 0xFFFFFFFFU;
                            pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
                            pPool->pApiThreadData[numReservedThreads].threadId =
                                core.threadIds[t + 1];
                            pPool->pApiThreadData[numReservedThreads].numaId =
                                useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
                            pPool->pApiThreadData[numReservedThreads].coreId =
                                c + pContext->threadInfo.BASE_CORE;
                            pPool->pApiThreadData[numReservedThreads].htId =
                                t + pContext->threadInfo.BASE_THREAD;
                            pPool->pApiThreadData[numReservedThreads].pContext           = pContext;
                            pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;
                        }

                        continue;
                    }

                    SWR_ASSERT(workerId < numThreads);

                    pPool->pThreadData[workerId].workerId    = workerId;
                    pPool->pThreadData[workerId].procGroupId = core.procGroup;
                    pPool->pThreadData[workerId].threadId =
                        core.threadIds[t + pContext->threadInfo.BASE_THREAD];
                    pPool->pThreadData[workerId].numaId =
                        useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
                    pPool->pThreadData[workerId].coreId = c + pContext->threadInfo.BASE_CORE;
                    pPool->pThreadData[workerId].htId   = t + pContext->threadInfo.BASE_THREAD;
                    pPool->pThreadData[workerId].pContext           = pContext;
                    pPool->pThreadData[workerId].forceBindProcGroup = false;

                    ++workerId;

                    pContext->NumBEThreads++;
                    pContext->NumFEThreads++;
                }
            }
        }
        SWR_ASSERT(workerId == pContext->NumWorkerThreads);
    }
}
//////////////////////////////////////////////////////////////////////////
/// @brief Launches worker threads in thread pool.
/// @param pContext - pointer to context
/// @param pPool - pointer to thread pool object.
void StartThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
{
    if (pContext->threadInfo.SINGLE_THREADED)
    {
        return;
    }

    for (uint32_t workerId = 0; workerId < pContext->NumWorkerThreads; ++workerId)
    {
        pPool->pThreads[workerId] =
            new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
    }
}
//////////////////////////////////////////////////////////////////////////
/// @brief Destroys thread pool.
/// @param pContext - pointer to context
/// @param pPool - pointer to thread pool object.
void DestroyThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
{
    // Wait for all threads to finish
    SwrWaitForIdle(pContext);

    // Wait for threads to finish and destroy them
    for (uint32_t t = 0; t < pPool->numThreads; ++t)
    {
        if (!pContext->threadInfo.SINGLE_THREADED)
        {
            // Detach from thread. Cannot join() due to possibility (in Windows) of code
            // in some DLLMain(THREAD_DETACH case) blocking the thread until after this returns.
            pPool->pThreads[t]->detach();
            delete (pPool->pThreads[t]);
        }

        if (pContext->workerPrivateState.pfnFinishWorkerData)
        {
            pContext->workerPrivateState.pfnFinishWorkerData(
                pContext, pPool->pThreadData[t].pWorkerPrivateData, t);
        }
    }

    delete[] pPool->pThreads;

    // Clean up data used by threads
    delete[] pPool->pThreadData;
    delete[] pPool->pApiThreadData;

    AlignedFree(pPool->pWorkerPrivateDataArray);
}