src/gallium/drivers/swr/rasterizer/core/threads.cpp

   1 /****************************************************************************
   2  * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  ****************************************************************************/
  23
  24 #include <stdio.h>
  25 #include <thread>
  26 #include <algorithm>
  27 #include <float.h>
  28 #include <vector>
  29 #include <utility>
  30 #include <fstream>
  31 #include <string>
  32
  33 #if defined(__linux__) || defined(__gnu_linux__) || defined(__APPLE__)
  34 #include <pthread.h>
  35 #include <sched.h>
  36 #include <unistd.h>
  37 #endif
  38
  39 #ifdef __APPLE__
  40 #include <sys/types.h>
  41 #include <sys/sysctl.h>
  42 #endif
  43
  44 #include "common/os.h"
  45 #include "core/api.h"
  46 #include "context.h"
  47 #include "frontend.h"
  48 #include "backend.h"
  49 #include "rasterizer.h"
  50 #include "rdtsc_core.h"
  51 #include "tilemgr.h"
  52 #include "tileset.h"
  53
  54
  55 // ThreadId
  56 struct Core
  57 {
  58     uint32_t              procGroup = 0;
  59     std::vector<uint32_t> threadIds;
  60 };
  61
  62 struct NumaNode
  63 {
  64     uint32_t          numaId;
  65     std::vector<Core> cores;
  66 };
  67
  68 typedef std::vector<NumaNode> CPUNumaNodes;
  69
  70 void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThreadsPerProcGroup)
  71 {
  72     out_nodes.clear();
  73     out_numThreadsPerProcGroup = 0;
  74
  75 #if defined(_WIN32)
  76
  77     std::vector<KAFFINITY> threadMaskPerProcGroup;
  78
  79     static std::mutex           m;
  80     std::lock_guard<std::mutex> l(m);
  81
  82     DWORD bufSize = 0;
  83
  84     BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &bufSize);
  85     SWR_ASSERT(ret == FALSE && GetLastError() == ERROR_INSUFFICIENT_BUFFER);
  86
  87     PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBufferMem =
  88         (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(bufSize);
  89     SWR_ASSERT(pBufferMem);
  90
  91     ret = GetLogicalProcessorInformationEx(RelationProcessorCore, pBufferMem, &bufSize);
  92     SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information");
  93
  94     uint32_t                                 count   = bufSize / pBufferMem->Size;
  95     PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = pBufferMem;
  96
  97     for (uint32_t i = 0; i < count; ++i)
  98     {
  99         SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore);
 100         for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g)
 101         {
 102             auto&    gmask     = pBuffer->Processor.GroupMask[g];
 103             uint32_t threadId  = 0;
 104             uint32_t procGroup = gmask.Group;
 105
 106             Core* pCore = nullptr;
 107
 108             uint32_t numThreads = (uint32_t)_mm_popcount_sizeT(gmask.Mask);
 109
 110             while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask))
 111             {
 112                 // clear mask
 113                 KAFFINITY threadMask = KAFFINITY(1) << threadId;
 114                 gmask.Mask &= ~threadMask;
 115
 116                 if (procGroup >= threadMaskPerProcGroup.size())
 117                 {
 118                     threadMaskPerProcGroup.resize(procGroup + 1);
 119                 }
 120
 121                 if (threadMaskPerProcGroup[procGroup] & threadMask)
 122                 {
 123                     // Already seen this mask.  This means that we are in 32-bit mode and
 124                     // have seen more than 32 HW threads for this procGroup
 125                     // Don't use it
 126 #if defined(_WIN64)
 127                     SWR_INVALID("Shouldn't get here in 64-bit mode");
 128 #endif
 129                     continue;
 130                 }
 131
 132                 threadMaskPerProcGroup[procGroup] |= (KAFFINITY(1) << threadId);
 133
 134                 // Find Numa Node
 135                 uint32_t         numaId  = 0;
 136                 PROCESSOR_NUMBER procNum = {};
 137                 procNum.Group            = WORD(procGroup);
 138                 procNum.Number           = UCHAR(threadId);
 139
 140                 ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId);
 141                 SWR_ASSERT(ret);
 142
 143                 // Store data
 144                 if (out_nodes.size() <= numaId)
 145                 {
 146                     out_nodes.resize(numaId + 1);
 147                 }
 148                 auto& numaNode  = out_nodes[numaId];
 149                 numaNode.numaId = numaId;
 150
 151                 uint32_t coreId = 0;
 152
 153                 if (nullptr == pCore)
 154                 {
 155                     numaNode.cores.push_back(Core());
 156                     pCore            = &numaNode.cores.back();
 157                     pCore->procGroup = procGroup;
 158                 }
 159                 pCore->threadIds.push_back(threadId);
 160                 if (procGroup == 0)
 161                 {
 162                     out_numThreadsPerProcGroup++;
 163                 }
 164             }
 165         }
 166         pBuffer = PtrAdd(pBuffer, pBuffer->Size);
 167     }
 168
 169     free(pBufferMem);
 170
 171 #elif defined(__linux__) || defined(__gnu_linux__)
 172
 173     // Parse /proc/cpuinfo to get full topology
 174     std::ifstream input("/proc/cpuinfo");
 175     std::string   line;
 176     char*         c;
 177     uint32_t      procId = uint32_t(-1);
 178     uint32_t      coreId = uint32_t(-1);
 179     uint32_t      physId = uint32_t(-1);
 180
 181     while (std::getline(input, line))
 182     {
 183         if (line.find("processor") != std::string::npos)
 184         {
 185             auto data_start = line.find(": ") + 2;
 186             procId          = std::strtoul(&line.c_str()[data_start], &c, 10);
 187             continue;
 188         }
 189         if (line.find("core id") != std::string::npos)
 190         {
 191             auto data_start = line.find(": ") + 2;
 192             coreId          = std::strtoul(&line.c_str()[data_start], &c, 10);
 193             continue;
 194         }
 195         if (line.find("physical id") != std::string::npos)
 196         {
 197             auto data_start = line.find(": ") + 2;
 198             physId          = std::strtoul(&line.c_str()[data_start], &c, 10);
 199             continue;
 200         }
 201         if (line.length() == 0)
 202         {
 203             if (physId + 1 > out_nodes.size())
 204                 out_nodes.resize(physId + 1);
 205             auto& numaNode  = out_nodes[physId];
 206             numaNode.numaId = physId;
 207
 208             if (coreId + 1 > numaNode.cores.size())
 209                 numaNode.cores.resize(coreId + 1);
 210             auto& core     = numaNode.cores[coreId];
 211             core.procGroup = coreId;
 212             core.threadIds.push_back(procId);
 213         }
 214     }
 215
 216     out_numThreadsPerProcGroup = 0;
 217     for (auto& node : out_nodes)
 218     {
 219         for (auto& core : node.cores)
 220         {
 221             out_numThreadsPerProcGroup += core.threadIds.size();
 222         }
 223     }
 224
 225 #elif defined(__APPLE__)
 226
 227     auto numProcessors  = 0;
 228     auto numCores       = 0;
 229     auto numPhysicalIds = 0;
 230
 231     int    value;
 232     size_t size = sizeof(value);
 233
 234     int result = sysctlbyname("hw.packages", &value, &size, NULL, 0);
 235     SWR_ASSERT(result == 0);
 236     numPhysicalIds = value;
 237
 238     result = sysctlbyname("hw.logicalcpu", &value, &size, NULL, 0);
 239     SWR_ASSERT(result == 0);
 240     numProcessors = value;
 241
 242     result = sysctlbyname("hw.physicalcpu", &value, &size, NULL, 0);
 243     SWR_ASSERT(result == 0);
 244     numCores = value;
 245
 246     out_nodes.resize(numPhysicalIds);
 247
 248     for (auto physId = 0; physId < numPhysicalIds; ++physId)
 249     {
 250         auto& numaNode = out_nodes[physId];
 251         auto  procId   = 0;
 252
 253         numaNode.cores.resize(numCores);
 254
 255         while (procId < numProcessors)
 256         {
 257             for (auto coreId = 0; coreId < numaNode.cores.size(); ++coreId, ++procId)
 258             {
 259                 auto& core = numaNode.cores[coreId];
 260
 261                 core.procGroup = coreId;
 262                 core.threadIds.push_back(procId);
 263             }
 264         }
 265     }
 266
 267     out_numThreadsPerProcGroup = 0;
 268
 269     for (auto& node : out_nodes)
 270     {
 271         for (auto& core : node.cores)
 272         {
 273             out_numThreadsPerProcGroup += core.threadIds.size();
 274         }
 275     }
 276
 277 #else
 278
 279 #error Unsupported platform
 280
 281 #endif
 282
 283     // Prune empty cores and numa nodes
 284     for (auto node_it = out_nodes.begin(); node_it != out_nodes.end();)
 285     {
 286         // Erase empty cores (first)
 287         for (auto core_it = node_it->cores.begin(); core_it != node_it->cores.end();)
 288         {
 289             if (core_it->threadIds.size() == 0)
 290             {
 291                 core_it = node_it->cores.erase(core_it);
 292             }
 293             else
 294             {
 295                 ++core_it;
 296             }
 297         }
 298
 299         // Erase empty numa nodes (second)
 300         if (node_it->cores.size() == 0)
 301         {
 302             node_it = out_nodes.erase(node_it);
 303         }
 304         else
 305         {
 306             ++node_it;
 307         }
 308     }
 309 }
 310
 311 void bindThread(SWR_CONTEXT* pContext,
 312                 uint32_t     threadId,
 313                 uint32_t     procGroupId   = 0,
 314                 bool         bindProcGroup = false)
 315 {
 316     // Only bind threads when MAX_WORKER_THREADS isn't set.
 317     if (pContext->threadInfo.SINGLE_THREADED ||
 318         (pContext->threadInfo.MAX_WORKER_THREADS && bindProcGroup == false))
 319     {
 320         return;
 321     }
 322
 323 #if defined(_WIN32)
 324
 325     GROUP_AFFINITY affinity = {};
 326     affinity.Group          = procGroupId;
 327
 328 #if !defined(_WIN64)
 329     if (threadId >= 32)
 330     {
 331         // Hopefully we don't get here.  Logic in CreateThreadPool should prevent this.
 332         SWR_INVALID("Shouldn't get here");
 333
 334         // In a 32-bit process on Windows it is impossible to bind
 335         // to logical processors 32-63 within a processor group.
 336         // In this case set the mask to 0 and let the system assign
 337         // the processor.  Hopefully it will make smart choices.
 338         affinity.Mask = 0;
 339     }
 340     else
 341 #endif
 342     {
 343         // If MAX_WORKER_THREADS is set, only bind to the proc group,
 344         // Not the individual HW thread.
 345         if (!bindProcGroup && !pContext->threadInfo.MAX_WORKER_THREADS)
 346         {
 347             affinity.Mask = KAFFINITY(1) << threadId;
 348         }
 349         else
 350         {
 351             affinity.Mask = KAFFINITY(0);
 352         }
 353     }
 354
 355     if (!SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr))
 356     {
 357         SWR_INVALID("Failed to set Thread Affinity");
 358     }
 359
 360 #elif defined(__linux__) || defined(__gnu_linux__)
 361
 362     cpu_set_t cpuset;
 363     pthread_t thread = pthread_self();
 364     CPU_ZERO(&cpuset);
 365     CPU_SET(threadId, &cpuset);
 366
 367     int err = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
 368     if (err != 0)
 369     {
 370         fprintf(stderr, "pthread_setaffinity_np failure for tid %u: %s\n", threadId, strerror(err));
 371     }
 372
 373 #endif
 374 }
 375
 376 INLINE
 377 uint32_t GetEnqueuedDraw(SWR_CONTEXT* pContext)
 378 {
 379     return pContext->dcRing.GetHead();
 380 }
 381
 382 INLINE
 383 DRAW_CONTEXT* GetDC(SWR_CONTEXT* pContext, uint32_t drawId)
 384 {
 385     return &pContext->dcRing[(drawId - 1) % pContext->MAX_DRAWS_IN_FLIGHT];
 386 }
 387
 388 INLINE
 389 bool IDComparesLess(uint32_t a, uint32_t b)
 390 {
 391     // Use signed delta to ensure that wrap-around to 0 is correctly handled.
 392     int32_t delta = int32_t(a - b);
 393     return (delta < 0);
 394 }
 395
 396 // returns true if dependency not met
 397 INLINE
 398 bool CheckDependency(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t lastRetiredDraw)
 399 {
 400     return pDC->dependent && IDComparesLess(lastRetiredDraw, pDC->drawId - 1);
 401 }
 402
 403 bool CheckDependencyFE(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t lastRetiredDraw)
 404 {
 405     return pDC->dependentFE && IDComparesLess(lastRetiredDraw, pDC->drawId - 1);
 406 }
 407
 408 //////////////////////////////////////////////////////////////////////////
 409 /// @brief Update client stats.
 410 INLINE void UpdateClientStats(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
 411 {
 412     if ((pContext->pfnUpdateStats == nullptr) || (GetApiState(pDC).enableStatsBE == false))
 413     {
 414         return;
 415     }
 416
 417     DRAW_DYNAMIC_STATE& dynState = pDC->dynState;
 418     OSALIGNLINE(SWR_STATS) stats{0};
 419
 420     // Sum up stats across all workers before sending to client.
 421     for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
 422     {
 423         stats.DepthPassCount += dynState.pStats[i].DepthPassCount;
 424         stats.PsInvocations += dynState.pStats[i].PsInvocations;
 425         stats.CsInvocations += dynState.pStats[i].CsInvocations;
 426
 427     }
 428
 429
 430     pContext->pfnUpdateStats(GetPrivateState(pDC), &stats);
 431 }
 432
 433 INLINE void ExecuteCallbacks(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
 434 {
 435     UpdateClientStats(pContext, workerId, pDC);
 436
 437     if (pDC->retireCallback.pfnCallbackFunc)
 438     {
 439         pDC->retireCallback.pfnCallbackFunc(pDC->retireCallback.userData,
 440                                             pDC->retireCallback.userData2,
 441                                             pDC->retireCallback.userData3);
 442
 443         // Callbacks to external code *could* change floating point control state
 444         // Reset our optimal flags
 445         SetOptimalVectorCSR();
 446     }
 447 }
 448
 449 // inlined-only version
 450 INLINE int32_t CompleteDrawContextInl(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
 451 {
 452     int32_t result = static_cast<int32_t>(InterlockedDecrement(&pDC->threadsDone));
 453     SWR_ASSERT(result >= 0);
 454
 455     AR_FLUSH(pDC->drawId);
 456
 457     if (result == 0)
 458     {
 459         ExecuteCallbacks(pContext, workerId, pDC);
 460
 461
 462         // Cleanup memory allocations
 463         pDC->pArena->Reset(true);
 464         if (!pDC->isCompute)
 465         {
 466             pDC->pTileMgr->initialize();
 467         }
 468         if (pDC->cleanupState)
 469         {
 470             pDC->pState->pArena->Reset(true);
 471         }
 472
 473         _ReadWriteBarrier();
 474
 475         pContext->dcRing.Dequeue(); // Remove from tail
 476     }
 477
 478     return result;
 479 }
 480
 481 // available to other translation modules
 482 int32_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
 483 {
 484     return CompleteDrawContextInl(pContext, 0, pDC);
 485 }
 486
 487 INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext,
 488                                     uint32_t     workerId,
 489                                     uint32_t&    curDrawBE,
 490                                     uint32_t&    drawEnqueued)
 491 {
 492     // increment our current draw id to the first incomplete draw
 493     drawEnqueued = GetEnqueuedDraw(pContext);
 494     while (IDComparesLess(curDrawBE, drawEnqueued))
 495     {
 496         DRAW_CONTEXT* pDC = &pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT];
 497
 498         // If its not compute and FE is not done then break out of loop.
 499         if (!pDC->doneFE && !pDC->isCompute)
 500             break;
 501
 502         bool isWorkComplete =
 503             pDC->isCompute ? pDC->pDispatch->isWorkComplete() : pDC->pTileMgr->isWorkComplete();
 504
 505         if (isWorkComplete)
 506         {
 507             curDrawBE++;
 508             CompleteDrawContextInl(pContext, workerId, pDC);
 509         }
 510         else
 511         {
 512             break;
 513         }
 514     }
 515
 516     // If there are no more incomplete draws then return false.
 517     return IDComparesLess(curDrawBE, drawEnqueued);
 518 }
 519
 520 //////////////////////////////////////////////////////////////////////////
 521 /// @brief If there is any BE work then go work on it.
 522 /// @param pContext - pointer to SWR context.
 523 /// @param workerId - The unique worker ID that is assigned to this thread.
 524 /// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker
 525 /// thread
 526 ///                    has its own curDrawBE counter and this ensures that each worker processes all
 527 ///                    the draws in order.
 528 /// @param lockedTiles - This is the set of tiles locked by other threads. Each thread maintains its
 529 ///                      own set and each time it fails to lock a macrotile, because its already
 530 ///                      locked, then it will add that tile to the lockedTiles set. As a worker
 531 ///                      begins to work on future draws the lockedTiles ensure that it doesn't work
 532 ///                      on tiles that may still have work pending in a previous draw. Additionally,
 533 ///                      the lockedTiles is hueristic that can steer a worker back to the same
 534 ///                      macrotile that it had been working on in a previous draw.
 535 /// @returns        true if worker thread should shutdown
 536 bool WorkOnFifoBE(SWR_CONTEXT* pContext,
 537                   uint32_t     workerId,
 538                   uint32_t&    curDrawBE,
 539                   TileSet&     lockedTiles,
 540                   uint32_t     numaNode,
 541                   uint32_t     numaMask)
 542 {
 543     bool bShutdown = false;
 544
 545     // Find the first incomplete draw that has pending work. If no such draw is found then
 546     // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
 547     uint32_t drawEnqueued = 0;
 548     if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
 549     {
 550         return false;
 551     }
 552
 553     uint32_t lastRetiredDraw =
 554         pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1;
 555
 556     // Reset our history for locked tiles. We'll have to re-learn which tiles are locked.
 557     lockedTiles.clear();
 558
 559     // Try to work on each draw in order of the available draws in flight.
 560     //   1. If we're on curDrawBE, we can work on any macrotile that is available.
 561     //   2. If we're trying to work on draws after curDrawBE, we are restricted to
 562     //      working on those macrotiles that are known to be complete in the prior draw to
 563     //      maintain order. The locked tiles provides the history to ensures this.
 564     for (uint32_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
 565     {
 566         DRAW_CONTEXT* pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT];
 567
 568         if (pDC->isCompute)
 569             return false; // We don't look at compute work.
 570
 571         // First wait for FE to be finished with this draw. This keeps threading model simple
 572         // but if there are lots of bubbles between draws then serializing FE and BE may
 573         // need to be revisited.
 574         if (!pDC->doneFE)
 575             return false;
 576
 577         // If this draw is dependent on a previous draw then we need to bail.
 578         if (CheckDependency(pContext, pDC, lastRetiredDraw))
 579         {
 580             return false;
 581         }
 582
 583         // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it.
 584         auto& macroTiles = pDC->pTileMgr->getDirtyTiles();
 585
 586         for (auto tile : macroTiles)
 587         {
 588             uint32_t tileID = tile->mId;
 589
 590             // Only work on tiles for this numa node
 591             uint32_t x, y;
 592             pDC->pTileMgr->getTileIndices(tileID, x, y);
 593             if (((x ^ y) & numaMask) != numaNode)
 594             {
 595                 continue;
 596             }
 597
 598             if (!tile->getNumQueued())
 599             {
 600                 continue;
 601             }
 602
 603             // can only work on this draw if it's not in use by other threads
 604             if (lockedTiles.get(tileID))
 605             {
 606                 continue;
 607             }
 608
 609             if (tile->tryLock())
 610             {
 611                 BE_WORK* pWork;
 612
 613                 RDTSC_BEGIN(pContext->pBucketMgr, WorkerFoundWork, pDC->drawId);
 614
 615                 uint32_t numWorkItems = tile->getNumQueued();
 616                 SWR_ASSERT(numWorkItems);
 617
 618                 pWork = tile->peek();
 619                 SWR_ASSERT(pWork);
 620                 if (pWork->type == DRAW)
 621                 {
 622                     pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, workerId, tileID);
 623                 }
 624                 else if (pWork->type == SHUTDOWN)
 625                 {
 626                     bShutdown = true;
 627                 }
 628
 629                 while ((pWork = tile->peek()) != nullptr)
 630                 {
 631                     pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
 632                     tile->dequeue();
 633                 }
 634                 RDTSC_END(pContext->pBucketMgr, WorkerFoundWork, numWorkItems);
 635
 636                 _ReadWriteBarrier();
 637
 638                 pDC->pTileMgr->markTileComplete(tileID);
 639
 640                 // Optimization: If the draw is complete and we're the last one to have worked on it
 641                 // then we can reset the locked list as we know that all previous draws before the
 642                 // next are guaranteed to be complete.
 643                 if ((curDrawBE == i) && (bShutdown || pDC->pTileMgr->isWorkComplete()))
 644                 {
 645                     // We can increment the current BE and safely move to next draw since we know
 646                     // this draw is complete.
 647                     curDrawBE++;
 648                     CompleteDrawContextInl(pContext, workerId, pDC);
 649
 650                     lastRetiredDraw++;
 651
 652                     lockedTiles.clear();
 653                     break;
 654                 }
 655
 656                 if (bShutdown)
 657                 {
 658                     break;
 659                 }
 660             }
 661             else
 662             {
 663                 // This tile is already locked. So let's add it to our locked tiles set. This way we
 664                 // don't try locking this one again.
 665                 lockedTiles.set(tileID);
 666             }
 667         }
 668     }
 669
 670     return bShutdown;
 671 }
 672
 673 //////////////////////////////////////////////////////////////////////////
 674 /// @brief Called when FE work is complete for this DC.
 675 INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
 676 {
 677     if (pContext->pfnUpdateStatsFE && GetApiState(pDC).enableStatsFE)
 678     {
 679         SWR_STATS_FE& stats = pDC->dynState.statsFE;
 680
 681         AR_EVENT(FrontendStatsEvent(pDC->drawId,
 682                                     stats.IaVertices,
 683                                     stats.IaPrimitives,
 684                                     stats.VsInvocations,
 685                                     stats.HsInvocations,
 686                                     stats.DsInvocations,
 687                                     stats.GsInvocations,
 688                                     stats.GsPrimitives,
 689                                     stats.CInvocations,
 690                                     stats.CPrimitives,
 691                                     stats.SoPrimStorageNeeded[0],
 692                                     stats.SoPrimStorageNeeded[1],
 693                                     stats.SoPrimStorageNeeded[2],
 694                                     stats.SoPrimStorageNeeded[3],
 695                                     stats.SoNumPrimsWritten[0],
 696                                     stats.SoNumPrimsWritten[1],
 697                                     stats.SoNumPrimsWritten[2],
 698                                     stats.SoNumPrimsWritten[3]));
 699         AR_EVENT(FrontendDrawEndEvent(pDC->drawId));
 700
 701         pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &stats);
 702     }
 703
 704     if (pContext->pfnUpdateSoWriteOffset)
 705     {
 706         for (uint32_t i = 0; i < MAX_SO_BUFFERS; ++i)
 707         {
 708             if ((pDC->dynState.SoWriteOffsetDirty[i]) &&
 709                 (pDC->pState->state.soBuffer[i].soWriteEnable))
 710             {
 711                 pContext->pfnUpdateSoWriteOffset(
 712                     GetPrivateState(pDC), i, pDC->dynState.SoWriteOffset[i]);
 713             }
 714         }
 715     }
 716
 717     // Ensure all streaming writes are globally visible before marking this FE done
 718     _mm_mfence();
 719     pDC->doneFE = true;
 720
 721     InterlockedDecrement(&pContext->drawsOutstandingFE);
 722 }
 723
 724 void WorkOnFifoFE(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawFE)
 725 {
 726     // Try to grab the next DC from the ring
 727     uint32_t drawEnqueued = GetEnqueuedDraw(pContext);
 728     while (IDComparesLess(curDrawFE, drawEnqueued))
 729     {
 730         uint32_t      dcSlot = curDrawFE % pContext->MAX_DRAWS_IN_FLIGHT;
 731         DRAW_CONTEXT* pDC    = &pContext->dcRing[dcSlot];
 732         if (pDC->isCompute || pDC->doneFE)
 733         {
 734             CompleteDrawContextInl(pContext, workerId, pDC);
 735             curDrawFE++;
 736         }
 737         else
 738         {
 739             break;
 740         }
 741     }
 742
 743     uint32_t lastRetiredFE = curDrawFE - 1;
 744     uint32_t curDraw       = curDrawFE;
 745     while (IDComparesLess(curDraw, drawEnqueued))
 746     {
 747         uint32_t      dcSlot = curDraw % pContext->MAX_DRAWS_IN_FLIGHT;
 748         DRAW_CONTEXT* pDC    = &pContext->dcRing[dcSlot];
 749
 750         if (!pDC->isCompute && !pDC->FeLock)
 751         {
 752             if (CheckDependencyFE(pContext, pDC, lastRetiredFE))
 753             {
 754                 return;
 755             }
 756
 757             uint32_t initial = InterlockedCompareExchange((volatile uint32_t*)&pDC->FeLock, 1, 0);
 758             if (initial == 0)
 759             {
 760                 // successfully grabbed the DC, now run the FE
 761                 pDC->FeWork.pfnWork(pContext, pDC, workerId, &pDC->FeWork.desc);
 762
 763                 CompleteDrawFE(pContext, workerId, pDC);
 764             }
 765         }
 766         curDraw++;
 767     }
 768 }
 769
 770 //////////////////////////////////////////////////////////////////////////
 771 /// @brief If there is any compute work then go work on it.
 772 /// @param pContext - pointer to SWR context.
 773 /// @param workerId - The unique worker ID that is assigned to this thread.
 774 /// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker
 775 /// thread
 776 ///                    has its own curDrawBE counter and this ensures that each worker processes all
 777 ///                    the draws in order.
 778 void WorkOnCompute(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawBE)
 779 {
 780     uint32_t drawEnqueued = 0;
 781     if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
 782     {
 783         return;
 784     }
 785
 786     uint32_t lastRetiredDraw =
 787         pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1;
 788
 789     for (uint64_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
 790     {
 791         DRAW_CONTEXT* pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT];
 792         if (pDC->isCompute == false)
 793             return;
 794
 795         // check dependencies
 796         if (CheckDependency(pContext, pDC, lastRetiredDraw))
 797         {
 798             return;
 799         }
 800
 801         SWR_ASSERT(pDC->pDispatch != nullptr);
 802         DispatchQueue& queue = *pDC->pDispatch;
 803
 804         // Is there any work remaining?
 805         if (queue.getNumQueued() > 0)
 806         {
 807             void*    pSpillFillBuffer = nullptr;
 808             void*    pScratchSpace    = nullptr;
 809             uint32_t threadGroupId    = 0;
 810             while (queue.getWork(threadGroupId))
 811             {
 812                 queue.dispatch(pDC, workerId, threadGroupId, pSpillFillBuffer, pScratchSpace);
 813                 queue.finishedWork();
 814             }
 815
 816             // Ensure all streaming writes are globally visible before moving onto the next draw
 817             _mm_mfence();
 818         }
 819     }
 820 }
 821
 822 void BindApiThread(SWR_CONTEXT* pContext, uint32_t apiThreadId)
 823 {
 824     if (nullptr == pContext)
 825     {
 826         return;
 827     }
 828
 829     if (apiThreadId >= pContext->threadPool.numReservedThreads)
 830     {
 831         if (pContext->threadPool.numReservedThreads)
 832         {
 833             const THREAD_DATA& threadData = pContext->threadPool.pApiThreadData[0];
 834             // Just bind to the process group used for API thread 0
 835             bindThread(pContext, 0, threadData.procGroupId, true);
 836         }
 837         return;
 838     }
 839
 840     const THREAD_DATA& threadData = pContext->threadPool.pApiThreadData[apiThreadId];
 841
 842     bindThread(
 843         pContext, threadData.threadId, threadData.procGroupId, threadData.forceBindProcGroup);
 844 }
 845
 846 template <bool IsFEThread, bool IsBEThread>
 847 DWORD workerThreadMain(LPVOID pData)
 848 {
 849     THREAD_DATA* pThreadData = (THREAD_DATA*)pData;
 850     SWR_CONTEXT* pContext    = pThreadData->pContext;
 851     uint32_t     threadId    = pThreadData->threadId;
 852     uint32_t     workerId    = pThreadData->workerId;
 853
 854     bindThread(pContext, threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup);
 855
 856     {
 857         char threadName[64];
 858         sprintf_s(threadName,
 859 #if defined(_WIN32)
 860                   "SWRWorker_%02d_NUMA%d_Core%02d_T%d",
 861 #else
 862                   // linux pthread name limited to 16 chars (including \0)
 863                   "w%03d-n%d-c%03d-t%d",
 864 #endif
 865                   workerId,
 866                   pThreadData->numaId,
 867                   pThreadData->coreId,
 868                   pThreadData->htId);
 869         SetCurrentThreadName(threadName);
 870     }
 871
 872     RDTSC_INIT(pContext->pBucketMgr, threadId);
 873
 874     // Only need offset numa index from base for correct masking
 875     uint32_t numaNode = pThreadData->numaId - pContext->threadInfo.BASE_NUMA_NODE;
 876     uint32_t numaMask = pContext->threadPool.numaMask;
 877
 878     SetOptimalVectorCSR();
 879
 880     // Track tiles locked by other threads. If we try to lock a macrotile and find its already
 881     // locked then we'll add it to this list so that we don't try and lock it again.
 882     TileSet lockedTiles;
 883
 884     // each worker has the ability to work on any of the queued draws as long as certain
 885     // conditions are met. the data associated
 886     // with a draw is guaranteed to be active as long as a worker hasn't signaled that he
 887     // has moved on to the next draw when he determines there is no more work to do. The api
 888     // thread will not increment the head of the dc ring until all workers have moved past the
 889     // current head.
 890     // the logic to determine what to work on is:
 891     // 1- try to work on the FE any draw that is queued. For now there are no dependencies
 892     //    on the FE work, so any worker can grab any FE and process in parallel.  Eventually
 893     //    we'll need dependency tracking to force serialization on FEs.  The worker will try
 894     //    to pick an FE by atomically incrementing a counter in the swr context.  he'll keep
 895     //    trying until he reaches the tail.
 896     // 2- BE work must be done in strict order. we accomplish this today by pulling work off
 897     //    the oldest draw (ie the head) of the dcRing. the worker can determine if there is
 898     //    any work left by comparing the total # of binned work items and the total # of completed
 899     //    work items. If they are equal, then there is no more work to do for this draw, and
 900     //    the worker can safely increment its oldestDraw counter and move on to the next draw.
 901     std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);
 902
 903     auto threadHasWork = [&](uint32_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };
 904
 905     uint32_t curDrawBE = 0;
 906     uint32_t curDrawFE = 0;
 907
 908     bool bShutdown = false;
 909
 910     while (true)
 911     {
 912         if (bShutdown && !threadHasWork(curDrawBE))
 913         {
 914             break;
 915         }
 916
 917         uint32_t loop = 0;
 918         while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && !threadHasWork(curDrawBE))
 919         {
 920             _mm_pause();
 921         }
 922
 923         if (!threadHasWork(curDrawBE))
 924         {
 925             lock.lock();
 926
 927             // check for thread idle condition again under lock
 928             if (threadHasWork(curDrawBE))
 929             {
 930                 lock.unlock();
 931                 continue;
 932             }
 933
 934             pContext->FifosNotEmpty.wait(lock);
 935             lock.unlock();
 936         }
 937
 938         if (IsBEThread)
 939         {
 940             RDTSC_BEGIN(pContext->pBucketMgr, WorkerWorkOnFifoBE, 0);
 941             bShutdown |=
 942                 WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
 943             RDTSC_END(pContext->pBucketMgr, WorkerWorkOnFifoBE, 0);
 944
 945             WorkOnCompute(pContext, workerId, curDrawBE);
 946         }
 947
 948         if (IsFEThread)
 949         {
 950             WorkOnFifoFE(pContext, workerId, curDrawFE);
 951
 952             if (!IsBEThread)
 953             {
 954                 curDrawBE = curDrawFE;
 955             }
 956         }
 957     }
 958
 959     return 0;
 960 }
 961 template <>
 962 DWORD workerThreadMain<false, false>(LPVOID) = delete;
 963
 964 template <bool IsFEThread, bool IsBEThread>
 965 DWORD workerThreadInit(LPVOID pData)
 966 {
 967 #if defined(_WIN32)
 968     __try
 969 #endif // _WIN32
 970     {
 971         return workerThreadMain<IsFEThread, IsBEThread>(pData);
 972     }
 973
 974 #if defined(_WIN32)
 975     __except (EXCEPTION_CONTINUE_SEARCH)
 976     {
 977     }
 978
 979 #endif // _WIN32
 980
 981     return 1;
 982 }
 983 template <>
 984 DWORD workerThreadInit<false, false>(LPVOID pData) = delete;
 985
 986 static void InitPerThreadStats(SWR_CONTEXT* pContext, uint32_t numThreads)
 987 {
 988     // Initialize DRAW_CONTEXT's per-thread stats
 989     for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
 990     {
 991         pContext->dcRing[dc].dynState.pStats =
 992             (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * numThreads, 64);
 993         memset(pContext->dcRing[dc].dynState.pStats, 0, sizeof(SWR_STATS) * numThreads);
 994     }
 995 }
 996
 997 //////////////////////////////////////////////////////////////////////////
 998 /// @brief Creates thread pool info but doesn't launch threads.
 999 /// @param pContext - pointer to context
1000 /// @param pPool - pointer to thread pool object.
1001 void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
1002 {
1003     CPUNumaNodes nodes;
1004     uint32_t     numThreadsPerProcGroup = 0;
1005     CalculateProcessorTopology(nodes, numThreadsPerProcGroup);
1006
1007     // Assumption, for asymmetric topologies, multi-threaded cores will appear
1008     // in the list before single-threaded cores.  This appears to be true for
1009     // Windows when the total HW threads is limited to 64.
1010     uint32_t numHWNodes        = (uint32_t)nodes.size();
1011     uint32_t numHWCoresPerNode = (uint32_t)nodes[0].cores.size();
1012     uint32_t numHWHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size();
1013
1014 #if defined(_WIN32) && !defined(_WIN64)
1015     if (!pContext->threadInfo.MAX_WORKER_THREADS)
1016     {
1017         // Limit 32-bit windows to bindable HW threads only
1018         if ((numHWCoresPerNode * numHWHyperThreads) > 32)
1019         {
1020             numHWCoresPerNode = 32 / numHWHyperThreads;
1021         }
1022     }
1023 #endif
1024
1025     // Calculate num HW threads.  Due to asymmetric topologies, this is not
1026     // a trivial multiplication.
1027     uint32_t numHWThreads = 0;
1028     for (auto const& node : nodes)
1029     {
1030         for (auto const& core : node.cores)
1031         {
1032             numHWThreads += (uint32_t)core.threadIds.size();
1033         }
1034     }
1035
1036     uint32_t numNodes        = numHWNodes;
1037     uint32_t numCoresPerNode = numHWCoresPerNode;
1038     uint32_t numHyperThreads = numHWHyperThreads;
1039
1040     // Calc used threads per-core
1041     if (numHyperThreads > pContext->threadInfo.BASE_THREAD)
1042     {
1043         numHyperThreads -= pContext->threadInfo.BASE_THREAD;
1044     }
1045     else
1046     {
1047         SWR_ASSERT(false,
1048                    "Cannot use BASE_THREAD value: %d, maxThreads: %d, reverting BASE_THREAD to 0",
1049                    pContext->threadInfo.BASE_THREAD,
1050                    numHyperThreads);
1051         pContext->threadInfo.BASE_THREAD = 0;
1052     }
1053
1054     if (pContext->threadInfo.MAX_THREADS_PER_CORE)
1055     {
1056         numHyperThreads = std::min(numHyperThreads, pContext->threadInfo.MAX_THREADS_PER_CORE);
1057     }
1058
1059     // Prune any cores that don't support the number of threads
1060     if (numHyperThreads > 1)
1061     {
1062         for (auto& node : nodes)
1063         {
1064             uint32_t numUsableCores = 0;
1065             for (auto& core : node.cores)
1066             {
1067                 numUsableCores += (core.threadIds.size() >= numHyperThreads);
1068             }
1069             numCoresPerNode = std::min(numCoresPerNode, numUsableCores);
1070         }
1071     }
1072
1073     // Calc used cores per NUMA node
1074     if (numCoresPerNode > pContext->threadInfo.BASE_CORE)
1075     {
1076         numCoresPerNode -= pContext->threadInfo.BASE_CORE;
1077     }
1078     else
1079     {
1080         SWR_ASSERT(false,
1081                    "Cannot use BASE_CORE value: %d, maxCores: %d, reverting BASE_CORE to 0",
1082                    pContext->threadInfo.BASE_CORE,
1083                    numCoresPerNode);
1084         pContext->threadInfo.BASE_CORE = 0;
1085     }
1086
1087     if (pContext->threadInfo.MAX_CORES_PER_NUMA_NODE)
1088     {
1089         numCoresPerNode = std::min(numCoresPerNode, pContext->threadInfo.MAX_CORES_PER_NUMA_NODE);
1090     }
1091
1092     // Calc used NUMA nodes
1093     if (numNodes > pContext->threadInfo.BASE_NUMA_NODE)
1094     {
1095         numNodes -= pContext->threadInfo.BASE_NUMA_NODE;
1096     }
1097     else
1098     {
1099         SWR_ASSERT(
1100             false,
1101             "Cannot use BASE_NUMA_NODE value: %d, maxNodes: %d, reverting BASE_NUMA_NODE to 0",
1102             pContext->threadInfo.BASE_NUMA_NODE,
1103             numNodes);
1104         pContext->threadInfo.BASE_NUMA_NODE = 0;
1105     }
1106
1107     if (pContext->threadInfo.MAX_NUMA_NODES)
1108     {
1109         numNodes = std::min(numNodes, pContext->threadInfo.MAX_NUMA_NODES);
1110     }
1111
1112     // Calculate numThreads - at this point everything should be symmetric
1113     uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
1114     SWR_REL_ASSERT(numThreads <= numHWThreads);
1115
1116     uint32_t& numAPIReservedThreads = pContext->apiThreadInfo.numAPIReservedThreads;
1117     uint32_t& numAPIThreadsPerCore  = pContext->apiThreadInfo.numAPIThreadsPerCore;
1118     uint32_t  numRemovedThreads     = 0;
1119
1120     if (pContext->threadInfo.SINGLE_THREADED)
1121     {
1122         numAPIReservedThreads      = 0;
1123         numThreads                 = 1;
1124         pContext->NumWorkerThreads = 1;
1125         pContext->NumFEThreads     = 1;
1126         pContext->NumBEThreads     = 1;
1127         pPool->numThreads          = 0;
1128     }
1129     else if (pContext->threadInfo.MAX_WORKER_THREADS)
1130     {
1131         numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, numHWThreads);
1132         pContext->threadInfo.BASE_NUMA_NODE = 0;
1133         pContext->threadInfo.BASE_CORE      = 0;
1134         pContext->threadInfo.BASE_THREAD    = 0;
1135         numAPIReservedThreads               = 0;
1136     }
1137     else
1138     {
1139         if (numAPIReservedThreads >= numThreads)
1140         {
1141             numAPIReservedThreads = 0;
1142         }
1143         else if (numAPIReservedThreads)
1144         {
1145             numAPIThreadsPerCore = std::min(numAPIThreadsPerCore, numHWHyperThreads);
1146
1147             if (0 == numAPIThreadsPerCore)
1148             {
1149                 numAPIThreadsPerCore = numHWHyperThreads;
1150             }
1151
1152             numRemovedThreads = numAPIReservedThreads;
1153             if (numAPIThreadsPerCore == 2 && numHyperThreads == 1)
1154             {
1155                 // Adjust removed threads to make logic below work
1156                 numRemovedThreads =
1157                     std::max(1U, (numRemovedThreads + numAPIThreadsPerCore - 1) / 2);
1158             }
1159
1160             numThreads -= numRemovedThreads;
1161         }
1162     }
1163
1164     InitPerThreadStats(pContext, numThreads);
1165
1166     if (pContext->threadInfo.SINGLE_THREADED)
1167     {
1168         numAPIReservedThreads = 0;
1169         numThreads            = 1;
1170     }
1171
1172     if (numAPIReservedThreads)
1173     {
1174         pPool->pApiThreadData = new (std::nothrow) THREAD_DATA[numAPIReservedThreads];
1175         SWR_ASSERT(pPool->pApiThreadData);
1176         if (!pPool->pApiThreadData)
1177         {
1178             numAPIReservedThreads = 0;
1179         }
1180         else
1181         {
1182             memset(pPool->pApiThreadData, 0, sizeof(THREAD_DATA) * numAPIReservedThreads);
1183         }
1184     }
1185     pPool->numReservedThreads = numAPIReservedThreads;
1186
1187     pPool->numThreads          = numThreads;
1188     pContext->NumWorkerThreads = pPool->numThreads;
1189
1190     pPool->pThreadData = new (std::nothrow) THREAD_DATA[pPool->numThreads];
1191     SWR_ASSERT(pPool->pThreadData);
1192     memset(pPool->pThreadData, 0, sizeof(THREAD_DATA) * pPool->numThreads);
1193     pPool->numaMask = 0;
1194
1195     // Allocate worker private data
1196     pPool->pWorkerPrivateDataArray = nullptr;
1197     if (pContext->workerPrivateState.perWorkerPrivateStateSize == 0)
1198     {
1199         pContext->workerPrivateState.perWorkerPrivateStateSize = sizeof(SWR_WORKER_DATA);
1200         pContext->workerPrivateState.pfnInitWorkerData = nullptr;
1201         pContext->workerPrivateState.pfnFinishWorkerData = nullptr;
1202     }
1203
1204     // initialize contents of SWR_WORKER_DATA
1205     size_t perWorkerSize =
1206         AlignUpPow2(pContext->workerPrivateState.perWorkerPrivateStateSize, 64);
1207     size_t totalSize = perWorkerSize * pPool->numThreads;
1208     if (totalSize)
1209     {
1210         pPool->pWorkerPrivateDataArray = AlignedMalloc(totalSize, 64);
1211         SWR_ASSERT(pPool->pWorkerPrivateDataArray);
1212
1213         void* pWorkerData = pPool->pWorkerPrivateDataArray;
1214         for (uint32_t i = 0; i < pPool->numThreads; ++i)
1215         {
1216             pPool->pThreadData[i].pWorkerPrivateData = pWorkerData;
1217             if (pContext->workerPrivateState.pfnInitWorkerData)
1218             {
1219                 pContext->workerPrivateState.pfnInitWorkerData(pContext, pWorkerData, i);
1220             }
1221             pWorkerData = PtrAdd(pWorkerData, perWorkerSize);
1222         }
1223     }
1224
1225     if (pContext->threadInfo.SINGLE_THREADED)
1226     {
1227         return;
1228     }
1229
1230     pPool->pThreads = new (std::nothrow) THREAD_PTR[pPool->numThreads];
1231     SWR_ASSERT(pPool->pThreads);
1232
1233     if (pContext->threadInfo.MAX_WORKER_THREADS)
1234     {
1235         bool     bForceBindProcGroup = (numThreads > numThreadsPerProcGroup);
1236         uint32_t numProcGroups = (numThreads + numThreadsPerProcGroup - 1) / numThreadsPerProcGroup;
1237         // When MAX_WORKER_THREADS is set we don't bother to bind to specific HW threads
1238         // But Windows will still require binding to specific process groups
1239         for (uint32_t workerId = 0; workerId < numThreads; ++workerId)
1240         {
1241             pPool->pThreadData[workerId].workerId           = workerId;
1242             pPool->pThreadData[workerId].procGroupId        = workerId % numProcGroups;
1243             pPool->pThreadData[workerId].threadId           = 0;
1244             pPool->pThreadData[workerId].numaId             = 0;
1245             pPool->pThreadData[workerId].coreId             = 0;
1246             pPool->pThreadData[workerId].htId               = 0;
1247             pPool->pThreadData[workerId].pContext           = pContext;
1248             pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup;
1249
1250             pContext->NumBEThreads++;
1251             pContext->NumFEThreads++;
1252         }
1253     }
1254     else
1255     {
1256         // numa distribution assumes workers on all nodes
1257         bool useNuma = true;
1258         if (numCoresPerNode * numHyperThreads == 1)
1259         {
1260             useNuma = false;
1261         }
1262
1263         if (useNuma)
1264         {
1265             pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes (1, 2, 4, etc.)
1266         }
1267         else
1268         {
1269             pPool->numaMask = 0;
1270         }
1271
1272         uint32_t workerId           = 0;
1273         uint32_t numReservedThreads = numAPIReservedThreads;
1274         for (uint32_t n = 0; n < numNodes; ++n)
1275         {
1276             if ((n + pContext->threadInfo.BASE_NUMA_NODE) >= nodes.size())
1277             {
1278                 break;
1279             }
1280             auto&    node     = nodes[n + pContext->threadInfo.BASE_NUMA_NODE];
1281             uint32_t numCores = numCoresPerNode;
1282             for (uint32_t c = 0; c < numCores; ++c)
1283             {
1284                 if ((c + pContext->threadInfo.BASE_CORE) >= node.cores.size())
1285                 {
1286                     break;
1287                 }
1288
1289                 auto& core = node.cores[c + pContext->threadInfo.BASE_CORE];
1290                 for (uint32_t t = 0; t < numHyperThreads; ++t)
1291                 {
1292                     if ((t + pContext->threadInfo.BASE_THREAD) >= core.threadIds.size())
1293                     {
1294                         break;
1295                     }
1296
1297                     if (numRemovedThreads)
1298                     {
1299                         --numRemovedThreads;
1300                         SWR_REL_ASSERT(numReservedThreads);
1301                         --numReservedThreads;
1302                         pPool->pApiThreadData[numReservedThreads].workerId    = 0xFFFFFFFFU;
1303                         pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
1304                         pPool->pApiThreadData[numReservedThreads].threadId    = core.threadIds[t];
1305                         pPool->pApiThreadData[numReservedThreads].numaId =
1306                             useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
1307                         pPool->pApiThreadData[numReservedThreads].coreId =
1308                             c + pContext->threadInfo.BASE_CORE;
1309                         pPool->pApiThreadData[numReservedThreads].htId =
1310                             t + pContext->threadInfo.BASE_THREAD;
1311                         pPool->pApiThreadData[numReservedThreads].pContext           = pContext;
1312                         pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;
1313
1314                         if (numAPIThreadsPerCore > numHyperThreads && numReservedThreads)
1315                         {
1316                             --numReservedThreads;
1317                             pPool->pApiThreadData[numReservedThreads].workerId    = 0xFFFFFFFFU;
1318                             pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
1319                             pPool->pApiThreadData[numReservedThreads].threadId =
1320                                 core.threadIds[t + 1];
1321                             pPool->pApiThreadData[numReservedThreads].numaId =
1322                                 useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
1323                             pPool->pApiThreadData[numReservedThreads].coreId =
1324                                 c + pContext->threadInfo.BASE_CORE;
1325                             pPool->pApiThreadData[numReservedThreads].htId =
1326                                 t + pContext->threadInfo.BASE_THREAD;
1327                             pPool->pApiThreadData[numReservedThreads].pContext           = pContext;
1328                             pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;
1329                         }
1330
1331                         continue;
1332                     }
1333
1334                     SWR_ASSERT(workerId < numThreads);
1335
1336                     pPool->pThreadData[workerId].workerId    = workerId;
1337                     pPool->pThreadData[workerId].procGroupId = core.procGroup;
1338                     pPool->pThreadData[workerId].threadId =
1339                         core.threadIds[t + pContext->threadInfo.BASE_THREAD];
1340                     pPool->pThreadData[workerId].numaId =
1341                         useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
1342                     pPool->pThreadData[workerId].coreId   = c + pContext->threadInfo.BASE_CORE;
1343                     pPool->pThreadData[workerId].htId     = t + pContext->threadInfo.BASE_THREAD;
1344                     pPool->pThreadData[workerId].pContext = pContext;
1345                     pPool->pThreadData[workerId].forceBindProcGroup = false;
1346
1347                     pContext->NumBEThreads++;
1348                     pContext->NumFEThreads++;
1349
1350                     ++workerId;
1351                 }
1352             }
1353         }
1354         SWR_ASSERT(workerId == pContext->NumWorkerThreads);
1355     }
1356 }
1357
1358 //////////////////////////////////////////////////////////////////////////
1359 /// @brief Launches worker threads in thread pool.
1360 /// @param pContext - pointer to context
1361 /// @param pPool - pointer to thread pool object.
1362 void StartThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
1363 {
1364     if (pContext->threadInfo.SINGLE_THREADED)
1365     {
1366         return;
1367     }
1368
1369     for (uint32_t workerId = 0; workerId < pContext->NumWorkerThreads; ++workerId)
1370     {
1371         pPool->pThreads[workerId] =
1372             new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
1373     }
1374 }
1375
1376 //////////////////////////////////////////////////////////////////////////
1377 /// @brief Destroys thread pool.
1378 /// @param pContext - pointer to context
1379 /// @param pPool - pointer to thread pool object.
1380 void DestroyThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
1381 {
1382     // Wait for all threads to finish
1383     SwrWaitForIdle(pContext);
1384
1385     // Wait for threads to finish and destroy them
1386     for (uint32_t t = 0; t < pPool->numThreads; ++t)
1387     {
1388         if (!pContext->threadInfo.SINGLE_THREADED)
1389         {
1390             // Detach from thread.  Cannot join() due to possibility (in Windows) of code
1391             // in some DLLMain(THREAD_DETATCH case) blocking the thread until after this returns.
1392             pPool->pThreads[t]->detach();
1393             delete (pPool->pThreads[t]);
1394         }
1395
1396         if (pContext->workerPrivateState.pfnFinishWorkerData)
1397         {
1398             pContext->workerPrivateState.pfnFinishWorkerData(
1399                 pContext, pPool->pThreadData[t].pWorkerPrivateData, t);
1400         }
1401     }
1402
1403     delete[] pPool->pThreads;
1404
1405     // Clean up data used by threads
1406     delete[] pPool->pThreadData;
1407     delete[] pPool->pApiThreadData;
1408
1409     AlignedFree(pPool->pWorkerPrivateDataArray);
1410 }