src/gallium/drivers/swr/rasterizer/core/threads.cpp

   1 /****************************************************************************
   2  * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  ****************************************************************************/
  23
  24 #include <stdio.h>
  25 #include <thread>
  26 #include <algorithm>
  27 #include <float.h>
  28 #include <vector>
  29 #include <utility>
  30 #include <fstream>
  31 #include <string>
  32
  33 #if defined(__linux__) || defined(__gnu_linux__) || defined(__APPLE__)
  34 #include <pthread.h>
  35 #include <sched.h>
  36 #include <unistd.h>
  37 #endif
  38
  39 #ifdef __APPLE__
  40 #include <sys/types.h>
  41 #include <sys/sysctl.h>
  42 #endif
  43
  44 #include "common/os.h"
  45 #include "core/api.h"
  46 #include "context.h"
  47 #include "frontend.h"
  48 #include "backend.h"
  49 #include "rasterizer.h"
  50 #include "rdtsc_core.h"
  51 #include "tilemgr.h"
  52 #include "tileset.h"
  53
  54
  55 // ThreadId
  56 struct Core
  57 {
  58     uint32_t              procGroup = 0;
  59     std::vector<uint32_t> threadIds;
  60 };
  61
  62 struct NumaNode
  63 {
  64     uint32_t          numaId;
  65     std::vector<Core> cores;
  66 };
  67
  68 typedef std::vector<NumaNode> CPUNumaNodes;
  69
  70 void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThreadsPerProcGroup)
  71 {
  72     out_nodes.clear();
  73     out_numThreadsPerProcGroup = 0;
  74
  75 #if defined(_WIN32)
  76
  77     std::vector<KAFFINITY> threadMaskPerProcGroup;
  78
  79     static std::mutex           m;
  80     std::lock_guard<std::mutex> l(m);
  81
  82     DWORD bufSize = 0;
  83
  84     BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &bufSize);
  85     SWR_ASSERT(ret == FALSE && GetLastError() == ERROR_INSUFFICIENT_BUFFER);
  86
  87     PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBufferMem =
  88         (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(bufSize);
  89     SWR_ASSERT(pBufferMem);
  90
  91     ret = GetLogicalProcessorInformationEx(RelationProcessorCore, pBufferMem, &bufSize);
  92     SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information");
  93
  94     uint32_t                                 count   = bufSize / pBufferMem->Size;
  95     PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = pBufferMem;
  96
  97     for (uint32_t i = 0; i < count; ++i)
  98     {
  99         SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore);
 100         for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g)
 101         {
 102             auto&    gmask     = pBuffer->Processor.GroupMask[g];
 103             uint32_t threadId  = 0;
 104             uint32_t procGroup = gmask.Group;
 105
 106             Core* pCore = nullptr;
 107
 108             uint32_t numThreads = (uint32_t)_mm_popcount_sizeT(gmask.Mask);
 109
 110             while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask))
 111             {
 112                 // clear mask
 113                 KAFFINITY threadMask = KAFFINITY(1) << threadId;
 114                 gmask.Mask &= ~threadMask;
 115
 116                 if (procGroup >= threadMaskPerProcGroup.size())
 117                 {
 118                     threadMaskPerProcGroup.resize(procGroup + 1);
 119                 }
 120
 121                 if (threadMaskPerProcGroup[procGroup] & threadMask)
 122                 {
 123                     // Already seen this mask.  This means that we are in 32-bit mode and
 124                     // have seen more than 32 HW threads for this procGroup
 125                     // Don't use it
 126 #if defined(_WIN64)
 127                     SWR_INVALID("Shouldn't get here in 64-bit mode");
 128 #endif
 129                     continue;
 130                 }
 131
 132                 threadMaskPerProcGroup[procGroup] |= (KAFFINITY(1) << threadId);
 133
 134                 // Find Numa Node
 135                 uint32_t         numaId  = 0;
 136                 PROCESSOR_NUMBER procNum = {};
 137                 procNum.Group            = WORD(procGroup);
 138                 procNum.Number           = UCHAR(threadId);
 139
 140                 ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId);
 141                 SWR_ASSERT(ret);
 142
 143                 // Store data
 144                 if (out_nodes.size() <= numaId)
 145                 {
 146                     out_nodes.resize(numaId + 1);
 147                 }
 148                 auto& numaNode  = out_nodes[numaId];
 149                 numaNode.numaId = numaId;
 150
 151                 uint32_t coreId = 0;
 152
 153                 if (nullptr == pCore)
 154                 {
 155                     numaNode.cores.push_back(Core());
 156                     pCore            = &numaNode.cores.back();
 157                     pCore->procGroup = procGroup;
 158                 }
 159                 pCore->threadIds.push_back(threadId);
 160                 if (procGroup == 0)
 161                 {
 162                     out_numThreadsPerProcGroup++;
 163                 }
 164             }
 165         }
 166         pBuffer = PtrAdd(pBuffer, pBuffer->Size);
 167     }
 168
 169     free(pBufferMem);
 170
 171 #elif defined(__linux__) || defined(__gnu_linux__)
 172
 173     // Parse /proc/cpuinfo to get full topology
 174     std::ifstream input("/proc/cpuinfo");
 175     std::string   line;
 176     char*         c;
 177     uint32_t      procId = uint32_t(-1);
 178     uint32_t      coreId = uint32_t(-1);
 179     uint32_t      physId = uint32_t(-1);
 180
 181     while (std::getline(input, line))
 182     {
 183         if (line.find("processor") != std::string::npos)
 184         {
 185             auto data_start = line.find(": ") + 2;
 186             procId          = std::strtoul(&line.c_str()[data_start], &c, 10);
 187             continue;
 188         }
 189         if (line.find("core id") != std::string::npos)
 190         {
 191             auto data_start = line.find(": ") + 2;
 192             coreId          = std::strtoul(&line.c_str()[data_start], &c, 10);
 193             continue;
 194         }
 195         if (line.find("physical id") != std::string::npos)
 196         {
 197             auto data_start = line.find(": ") + 2;
 198             physId          = std::strtoul(&line.c_str()[data_start], &c, 10);
 199             continue;
 200         }
 201         if (line.length() == 0)
 202         {
 203             if (physId + 1 > out_nodes.size())
 204                 out_nodes.resize(physId + 1);
 205             auto& numaNode  = out_nodes[physId];
 206             numaNode.numaId = physId;
 207
 208             if (coreId + 1 > numaNode.cores.size())
 209                 numaNode.cores.resize(coreId + 1);
 210             auto& core     = numaNode.cores[coreId];
 211             core.procGroup = coreId;
 212             core.threadIds.push_back(procId);
 213         }
 214     }
 215
 216     out_numThreadsPerProcGroup = 0;
 217     for (auto& node : out_nodes)
 218     {
 219         for (auto& core : node.cores)
 220         {
 221             out_numThreadsPerProcGroup += core.threadIds.size();
 222         }
 223     }
 224
 225 #elif defined(__APPLE__)
 226
 227     auto numProcessors  = 0;
 228     auto numCores       = 0;
 229     auto numPhysicalIds = 0;
 230
 231     int    value;
 232     size_t size = sizeof(value);
 233
 234     int result = sysctlbyname("hw.packages", &value, &size, NULL, 0);
 235     SWR_ASSERT(result == 0);
 236     numPhysicalIds = value;
 237
 238     result = sysctlbyname("hw.logicalcpu", &value, &size, NULL, 0);
 239     SWR_ASSERT(result == 0);
 240     numProcessors = value;
 241
 242     result = sysctlbyname("hw.physicalcpu", &value, &size, NULL, 0);
 243     SWR_ASSERT(result == 0);
 244     numCores = value;
 245
 246     out_nodes.resize(numPhysicalIds);
 247
 248     for (auto physId = 0; physId < numPhysicalIds; ++physId)
 249     {
 250         auto& numaNode = out_nodes[physId];
 251         auto  procId   = 0;
 252
 253         numaNode.cores.resize(numCores);
 254
 255         while (procId < numProcessors)
 256         {
 257             for (auto coreId = 0; coreId < numaNode.cores.size(); ++coreId, ++procId)
 258             {
 259                 auto& core = numaNode.cores[coreId];
 260
 261                 core.procGroup = coreId;
 262                 core.threadIds.push_back(procId);
 263             }
 264         }
 265     }
 266
 267     out_numThreadsPerProcGroup = 0;
 268
 269     for (auto& node : out_nodes)
 270     {
 271         for (auto& core : node.cores)
 272         {
 273             out_numThreadsPerProcGroup += core.threadIds.size();
 274         }
 275     }
 276
 277 #else
 278
 279 #error Unsupported platform
 280
 281 #endif
 282
 283     // Prune empty cores and numa nodes
 284     for (auto node_it = out_nodes.begin(); node_it != out_nodes.end();)
 285     {
 286         // Erase empty cores (first)
 287         for (auto core_it = node_it->cores.begin(); core_it != node_it->cores.end();)
 288         {
 289             if (core_it->threadIds.size() == 0)
 290             {
 291                 core_it = node_it->cores.erase(core_it);
 292             }
 293             else
 294             {
 295                 ++core_it;
 296             }
 297         }
 298
 299         // Erase empty numa nodes (second)
 300         if (node_it->cores.size() == 0)
 301         {
 302             node_it = out_nodes.erase(node_it);
 303         }
 304         else
 305         {
 306             ++node_it;
 307         }
 308     }
 309 }
 310
 311 void bindThread(SWR_CONTEXT* pContext,
 312                 uint32_t     threadId,
 313                 uint32_t     procGroupId   = 0,
 314                 bool         bindProcGroup = false)
 315 {
 316     // Only bind threads when MAX_WORKER_THREADS isn't set.
 317     if (pContext->threadInfo.SINGLE_THREADED ||
 318         (pContext->threadInfo.MAX_WORKER_THREADS && bindProcGroup == false))
 319     {
 320         return;
 321     }
 322
 323 #if defined(_WIN32)
 324
 325     GROUP_AFFINITY affinity = {};
 326     affinity.Group          = procGroupId;
 327
 328 #if !defined(_WIN64)
 329     if (threadId >= 32)
 330     {
 331         // Hopefully we don't get here.  Logic in CreateThreadPool should prevent this.
 332         SWR_INVALID("Shouldn't get here");
 333
 334         // In a 32-bit process on Windows it is impossible to bind
 335         // to logical processors 32-63 within a processor group.
 336         // In this case set the mask to 0 and let the system assign
 337         // the processor.  Hopefully it will make smart choices.
 338         affinity.Mask = 0;
 339     }
 340     else
 341 #endif
 342     {
 343         // If MAX_WORKER_THREADS is set, only bind to the proc group,
 344         // Not the individual HW thread.
 345         if (!bindProcGroup && !pContext->threadInfo.MAX_WORKER_THREADS)
 346         {
 347             affinity.Mask = KAFFINITY(1) << threadId;
 348         }
 349         else
 350         {
 351             affinity.Mask = KAFFINITY(0);
 352         }
 353     }
 354
 355     if (!SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr))
 356     {
 357         SWR_INVALID("Failed to set Thread Affinity");
 358     }
 359
 360 #elif defined(__linux__) || defined(__gnu_linux__)
 361
 362     cpu_set_t cpuset;
 363     pthread_t thread = pthread_self();
 364     CPU_ZERO(&cpuset);
 365     CPU_SET(threadId, &cpuset);
 366
 367     int err = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
 368     if (err != 0)
 369     {
 370         fprintf(stderr, "pthread_setaffinity_np failure for tid %u: %s\n", threadId, strerror(err));
 371     }
 372
 373 #endif
 374 }
 375
 376 INLINE
 377 uint32_t GetEnqueuedDraw(SWR_CONTEXT* pContext)
 378 {
 379     return pContext->dcRing.GetHead();
 380 }
 381
 382 INLINE
 383 DRAW_CONTEXT* GetDC(SWR_CONTEXT* pContext, uint32_t drawId)
 384 {
 385     return &pContext->dcRing[(drawId - 1) % pContext->MAX_DRAWS_IN_FLIGHT];
 386 }
 387
 388 INLINE
 389 bool IDComparesLess(uint32_t a, uint32_t b)
 390 {
 391     // Use signed delta to ensure that wrap-around to 0 is correctly handled.
 392     int32_t delta = int32_t(a - b);
 393     return (delta < 0);
 394 }
 395
 396 // returns true if dependency not met
 397 INLINE
 398 bool CheckDependency(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t lastRetiredDraw)
 399 {
 400     return pDC->dependent && IDComparesLess(lastRetiredDraw, pDC->drawId - 1);
 401 }
 402
 403 bool CheckDependencyFE(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t lastRetiredDraw)
 404 {
 405     return pDC->dependentFE && IDComparesLess(lastRetiredDraw, pDC->drawId - 1);
 406 }
 407
 408 //////////////////////////////////////////////////////////////////////////
 409 /// @brief Update client stats.
 410 INLINE void UpdateClientStats(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
 411 {
 412     if ((pContext->pfnUpdateStats == nullptr) || (GetApiState(pDC).enableStatsBE == false))
 413     {
 414         return;
 415     }
 416
 417     DRAW_DYNAMIC_STATE& dynState = pDC->dynState;
 418     OSALIGNLINE(SWR_STATS) stats{0};
 419
 420     // Sum up stats across all workers before sending to client.
 421     for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
 422     {
 423         stats.DepthPassCount += dynState.pStats[i].DepthPassCount;
 424         stats.PsInvocations += dynState.pStats[i].PsInvocations;
 425         stats.CsInvocations += dynState.pStats[i].CsInvocations;
 426
 427     }
 428
 429
 430     pContext->pfnUpdateStats(GetPrivateState(pDC), &stats);
 431 }
 432
 433 INLINE void ExecuteCallbacks(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
 434 {
 435     UpdateClientStats(pContext, workerId, pDC);
 436
 437     if (pDC->retireCallback.pfnCallbackFunc)
 438     {
 439         pDC->retireCallback.pfnCallbackFunc(pDC->retireCallback.userData,
 440                                             pDC->retireCallback.userData2,
 441                                             pDC->retireCallback.userData3);
 442
 443         // Callbacks to external code *could* change floating point control state
 444         // Reset our optimal flags
 445         SetOptimalVectorCSR();
 446     }
 447 }
 448
 449 // inlined-only version
 450 INLINE int32_t CompleteDrawContextInl(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
 451 {
 452     int32_t result = static_cast<int32_t>(InterlockedDecrement(&pDC->threadsDone));
 453     SWR_ASSERT(result >= 0);
 454
 455     AR_FLUSH(pDC->drawId);
 456
 457     if (result == 0)
 458     {
 459         ExecuteCallbacks(pContext, workerId, pDC);
 460
 461
 462         // Cleanup memory allocations
 463         pDC->pArena->Reset(true);
 464         if (!pDC->isCompute)
 465         {
 466             pDC->pTileMgr->initialize();
 467         }
 468         if (pDC->cleanupState)
 469         {
 470             pDC->pState->pArena->Reset(true);
 471         }
 472
 473         _ReadWriteBarrier();
 474
 475         pContext->dcRing.Dequeue(); // Remove from tail
 476     }
 477
 478     return result;
 479 }
 480
 481 // available to other translation modules
 482 int32_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
 483 {
 484     return CompleteDrawContextInl(pContext, 0, pDC);
 485 }
 486
 487 INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext,
 488                                     uint32_t     workerId,
 489                                     uint32_t&    curDrawBE,
 490                                     uint32_t&    drawEnqueued)
 491 {
 492     // increment our current draw id to the first incomplete draw
 493     drawEnqueued = GetEnqueuedDraw(pContext);
 494     while (IDComparesLess(curDrawBE, drawEnqueued))
 495     {
 496         DRAW_CONTEXT* pDC = &pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT];
 497
 498         // If its not compute and FE is not done then break out of loop.
 499         if (!pDC->doneFE && !pDC->isCompute)
 500             break;
 501
 502         bool isWorkComplete =
 503             pDC->isCompute ? pDC->pDispatch->isWorkComplete() : pDC->pTileMgr->isWorkComplete();
 504
 505         if (isWorkComplete)
 506         {
 507             curDrawBE++;
 508             CompleteDrawContextInl(pContext, workerId, pDC);
 509         }
 510         else
 511         {
 512             break;
 513         }
 514     }
 515
 516     // If there are no more incomplete draws then return false.
 517     return IDComparesLess(curDrawBE, drawEnqueued);
 518 }
 519
//////////////////////////////////////////////////////////////////////////
/// @brief If there is any BE work then go work on it.
/// @param pContext - pointer to SWR context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker
///                    thread has its own curDrawBE counter and this ensures that each worker
///                    processes all the draws in order.
/// @param lockedTiles - This is the set of tiles locked by other threads. Each thread maintains its
///                      own set and each time it fails to lock a macrotile, because it's already
///                      locked, then it will add that tile to the lockedTiles set. As a worker
///                      begins to work on future draws the lockedTiles ensure that it doesn't work
///                      on tiles that may still have work pending in a previous draw. Additionally,
///                      the lockedTiles is a heuristic that can steer a worker back to the same
///                      macrotile that it had been working on in a previous draw.
/// @param numaNode - Only macrotiles whose ((x ^ y) & numaMask) equals this value are worked on.
/// @param numaMask - Mask applied to the xor of the macrotile indices before the comparison
///                   against numaNode.
/// @returns        true if worker thread should shutdown, i.e. a SHUTDOWN work item was seen.
bool WorkOnFifoBE(SWR_CONTEXT* pContext,
                  uint32_t     workerId,
                  uint32_t&    curDrawBE,
                  TileSet&     lockedTiles,
                  uint32_t     numaNode,
                  uint32_t     numaMask)
{
    bool bShutdown = false;

    // Find the first incomplete draw that has pending work. If no such draw is found then
    // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
    uint32_t drawEnqueued = 0;
    if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
    {
        return false;
    }

    // Id of the draw right before the first incomplete one; used as the baseline for the
    // dependency checks below.
    uint32_t lastRetiredDraw =
        pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1;

    // Reset our history for locked tiles. We'll have to re-learn which tiles are locked.
    lockedTiles.clear();

    // Try to work on each draw in order of the available draws in flight.
    //   1. If we're on curDrawBE, we can work on any macrotile that is available.
    //   2. If we're trying to work on draws after curDrawBE, we are restricted to
    //      working on those macrotiles that are known to be complete in the prior draw to
    //      maintain order. The locked tiles provide the history that ensures this.
    for (uint32_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
    {
        DRAW_CONTEXT* pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT];

        if (pDC->isCompute)
            return false; // We don't look at compute work.

        // First wait for FE to be finished with this draw. This keeps threading model simple
        // but if there are lots of bubbles between draws then serializing FE and BE may
        // need to be revisited.
        if (!pDC->doneFE)
            return false;

        // If this draw is dependent on a previous draw then we need to bail.
        if (CheckDependency(pContext, pDC, lastRetiredDraw))
        {
            return false;
        }

        // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it.
        auto& macroTiles = pDC->pTileMgr->getDirtyTiles();

        for (auto tile : macroTiles)
        {
            uint32_t tileID = tile->mId;

            // Only work on tiles for this numa node
            uint32_t x, y;
            pDC->pTileMgr->getTileIndices(tileID, x, y);
            if (((x ^ y) & numaMask) != numaNode)
            {
                _mm_pause();
                continue;
            }

            // Nothing queued on this tile; skip it.
            if (!tile->getNumQueued())
            {
                _mm_pause();
                continue;
            }

            // can only work on this draw if it's not in use by other threads
            if (lockedTiles.get(tileID))
            {
                _mm_pause();
                continue;
            }

            if (tile->tryLock())
            {
                BE_WORK* pWork;

                RDTSC_BEGIN(pContext->pBucketMgr, WorkerFoundWork, pDC->drawId);

                uint32_t numWorkItems = tile->getNumQueued();
                SWR_ASSERT(numWorkItems);

                // The type of the first queued item decides the setup: hot tiles are
                // initialized for DRAW work, SHUTDOWN work flags this worker for shutdown.
                pWork = tile->peek();
                SWR_ASSERT(pWork);
                if (pWork->type == DRAW)
                {
                    pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, workerId, tileID);
                }
                else if (pWork->type == SHUTDOWN)
                {
                    bShutdown = true;
                }

                // Drain the tile: run and dequeue work items until peek() returns nullptr.
                while ((pWork = tile->peek()) != nullptr)
                {
                    pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
                    tile->dequeue();
                }
                RDTSC_END(pContext->pBucketMgr, WorkerFoundWork, numWorkItems);

                _ReadWriteBarrier();

                pDC->pTileMgr->markTileComplete(tileID);

                // Optimization: If the draw is complete and we're the last one to have worked on it
                // then we can reset the locked list as we know that all previous draws before the
                // next are guaranteed to be complete.
                if ((curDrawBE == i) && (bShutdown || pDC->pTileMgr->isWorkComplete()))
                {
                    // We can increment the current BE and safely move to next draw since we know
                    // this draw is complete.
                    curDrawBE++;
                    CompleteDrawContextInl(pContext, workerId, pDC);

                    lastRetiredDraw++;

                    lockedTiles.clear();
                    break;
                }

                // Stop walking this draw's tiles once a shutdown has been requested.
                if (bShutdown)
                {
                    break;
                }
            }
            else
            {
                // This tile is already locked. So let's add it to our locked tiles set. This way we
                // don't try locking this one again.
                lockedTiles.set(tileID);
                _mm_pause();
            }
        }
    }

    return bShutdown;
}
 676
 677 //////////////////////////////////////////////////////////////////////////
 678 /// @brief Called when FE work is complete for this DC.
 679 INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
 680 {
 681     if (pContext->pfnUpdateStatsFE && GetApiState(pDC).enableStatsFE)
 682     {
 683         SWR_STATS_FE& stats = pDC->dynState.statsFE;
 684
 685         AR_EVENT(FrontendStatsEvent(pDC->drawId,
 686                                     stats.IaVertices,
 687                                     stats.IaPrimitives,
 688                                     stats.VsInvocations,
 689                                     stats.HsInvocations,
 690                                     stats.DsInvocations,
 691                                     stats.GsInvocations,
 692                                     stats.GsPrimitives,
 693                                     stats.CInvocations,
 694                                     stats.CPrimitives,
 695                                     stats.SoPrimStorageNeeded[0],
 696                                     stats.SoPrimStorageNeeded[1],
 697                                     stats.SoPrimStorageNeeded[2],
 698                                     stats.SoPrimStorageNeeded[3],
 699                                     stats.SoNumPrimsWritten[0],
 700                                     stats.SoNumPrimsWritten[1],
 701                                     stats.SoNumPrimsWritten[2],
 702                                     stats.SoNumPrimsWritten[3]));
 703         AR_EVENT(FrontendDrawEndEvent(pDC->drawId));
 704
 705         pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &stats);
 706     }
 707
 708     if (pContext->pfnUpdateSoWriteOffset)
 709     {
 710         for (uint32_t i = 0; i < MAX_SO_BUFFERS; ++i)
 711         {
 712             if ((pDC->dynState.SoWriteOffsetDirty[i]) &&
 713                 (pDC->pState->state.soBuffer[i].soWriteEnable))
 714             {
 715                 pContext->pfnUpdateSoWriteOffset(
 716                     GetPrivateState(pDC), i, pDC->dynState.SoWriteOffset[i]);
 717             }
 718         }
 719     }
 720
 721     if (pContext->pfnUpdateStreamOut)
 722         pContext->pfnUpdateStreamOut(GetPrivateState(pDC),  pDC->dynState.soPrims);
 723
 724     // Ensure all streaming writes are globally visible before marking this FE done
 725     _mm_mfence();
 726     pDC->doneFE = true;
 727
 728     InterlockedDecrement(&pContext->drawsOutstandingFE);
 729 }
 730
 731 void WorkOnFifoFE(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawFE)
 732 {
 733     // Try to grab the next DC from the ring
 734     uint32_t drawEnqueued = GetEnqueuedDraw(pContext);
 735     while (IDComparesLess(curDrawFE, drawEnqueued))
 736     {
 737         uint32_t      dcSlot = curDrawFE % pContext->MAX_DRAWS_IN_FLIGHT;
 738         DRAW_CONTEXT* pDC    = &pContext->dcRing[dcSlot];
 739         if (pDC->isCompute || pDC->doneFE)
 740         {
 741             CompleteDrawContextInl(pContext, workerId, pDC);
 742             curDrawFE++;
 743         }
 744         else
 745         {
 746             break;
 747         }
 748     }
 749
 750     uint32_t lastRetiredFE = curDrawFE - 1;
 751     uint32_t curDraw       = curDrawFE;
 752     while (IDComparesLess(curDraw, drawEnqueued))
 753     {
 754         uint32_t      dcSlot = curDraw % pContext->MAX_DRAWS_IN_FLIGHT;
 755         DRAW_CONTEXT* pDC    = &pContext->dcRing[dcSlot];
 756
 757         if (!pDC->FeLock && !pDC->isCompute)
 758         {
 759             if (CheckDependencyFE(pContext, pDC, lastRetiredFE))
 760             {
 761                 return;
 762             }
 763
 764             uint32_t initial = InterlockedCompareExchange((volatile uint32_t*)&pDC->FeLock, 1, 0);
 765             if (initial == 0)
 766             {
 767                 // successfully grabbed the DC, now run the FE
 768                 pDC->FeWork.pfnWork(pContext, pDC, workerId, &pDC->FeWork.desc);
 769
 770                 CompleteDrawFE(pContext, workerId, pDC);
 771             }
 772             else
 773             {
 774                 _mm_pause();
 775             }
 776         }
 777         else
 778         {
 779             _mm_pause();
 780         }
 781
 782         curDraw++;
 783     }
 784 }
 785
 786 //////////////////////////////////////////////////////////////////////////
 787 /// @brief If there is any compute work then go work on it.
 788 /// @param pContext - pointer to SWR context.
 789 /// @param workerId - The unique worker ID that is assigned to this thread.
 790 /// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker
 791 /// thread
 792 ///                    has its own curDrawBE counter and this ensures that each worker processes all
 793 ///                    the draws in order.
 794 void WorkOnCompute(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawBE)
 795 {
 796     uint32_t drawEnqueued = 0;
 797     if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
 798     {
 799         return;
 800     }
 801
 802     uint32_t lastRetiredDraw =
 803         pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1;
 804
 805     for (uint64_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
 806     {
 807         DRAW_CONTEXT* pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT];
 808         if (pDC->isCompute == false)
 809             return;
 810
 811         // check dependencies
 812         if (CheckDependency(pContext, pDC, lastRetiredDraw))
 813         {
 814             return;
 815         }
 816
 817         SWR_ASSERT(pDC->pDispatch != nullptr);
 818         DispatchQueue& queue = *pDC->pDispatch;
 819
 820         // Is there any work remaining?
 821         if (queue.getNumQueued() > 0)
 822         {
 823             void*    pSpillFillBuffer = nullptr;
 824             void*    pScratchSpace    = nullptr;
 825             uint32_t threadGroupId    = 0;
 826             while (queue.getWork(threadGroupId))
 827             {
 828                 queue.dispatch(pDC, workerId, threadGroupId, pSpillFillBuffer, pScratchSpace);
 829                 queue.finishedWork();
 830             }
 831
 832             // Ensure all streaming writes are globally visible before moving onto the next draw
 833             _mm_mfence();
 834         }
 835     }
 836 }
 837
 838 void BindApiThread(SWR_CONTEXT* pContext, uint32_t apiThreadId)
 839 {
 840     if (nullptr == pContext)
 841     {
 842         return;
 843     }
 844
 845     if (apiThreadId >= pContext->threadPool.numReservedThreads)
 846     {
 847         if (pContext->threadPool.numReservedThreads)
 848         {
 849             const THREAD_DATA& threadData = pContext->threadPool.pApiThreadData[0];
 850             // Just bind to the process group used for API thread 0
 851             bindThread(pContext, 0, threadData.procGroupId, true);
 852         }
 853         return;
 854     }
 855
 856     const THREAD_DATA& threadData = pContext->threadPool.pApiThreadData[apiThreadId];
 857
 858     bindThread(
 859         pContext, threadData.threadId, threadData.procGroupId, threadData.forceBindProcGroup);
 860 }
 861
//////////////////////////////////////////////////////////////////////////
/// @brief Main loop of a worker thread.  Binds and names the thread, then
///        repeatedly waits until its current draw differs from the head of
///        the draw-context ring and runs the back-end, compute and/or
///        front-end work functions.  The loop exits once WorkOnFifoBE has
///        reported shutdown (bShutdown) and curDrawBE equals the ring head.
/// @tparam IsFEThread - when true, each iteration calls WorkOnFifoFE.
/// @tparam IsBEThread - when true, each iteration calls WorkOnFifoBE and
///         WorkOnCompute.
/// @param pData - pointer to this worker's THREAD_DATA.
/// @return Always 0.
/// @note bShutdown is only ever set from the BE path in this function.
template <bool IsFEThread, bool IsBEThread>
DWORD workerThreadMain(LPVOID pData)
{
    THREAD_DATA* pThreadData = (THREAD_DATA*)pData;
    SWR_CONTEXT* pContext    = pThreadData->pContext;
    uint32_t     threadId    = pThreadData->threadId;
    uint32_t     workerId    = pThreadData->workerId;

    bindThread(pContext, threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup);

    // Name the thread after its worker id and NUMA/core/hyper-thread placement.
    {
        char threadName[64];
        sprintf_s(threadName,
#if defined(_WIN32)
                  "SWRWorker_%02d_NUMA%d_Core%02d_T%d",
#else
                  // linux pthread name limited to 16 chars (including \0)
                  "w%03d-n%d-c%03d-t%d",
#endif
                  workerId,
                  pThreadData->numaId,
                  pThreadData->coreId,
                  pThreadData->htId);
        SetCurrentThreadName(threadName);
    }

    RDTSC_INIT(pContext->pBucketMgr, threadId);

    // Only need offset numa index from base for correct masking
    uint32_t numaNode = pThreadData->numaId - pContext->threadInfo.BASE_NUMA_NODE;
    uint32_t numaMask = pContext->threadPool.numaMask;

    SetOptimalVectorCSR();

    // Track tiles locked by other threads. If we try to lock a macrotile and find it's already
    // locked then we'll add it to this list so that we don't try and lock it again.
    TileSet lockedTiles;

    // each worker has the ability to work on any of the queued draws as long as certain
    // conditions are met. the data associated
    // with a draw is guaranteed to be active as long as a worker hasn't signaled that he
    // has moved on to the next draw when he determines there is no more work to do. The api
    // thread will not increment the head of the dc ring until all workers have moved past the
    // current head.
    // the logic to determine what to work on is:
    // 1- try to work on the FE any draw that is queued. For now there are no dependencies
    //    on the FE work, so any worker can grab any FE and process in parallel.  Eventually
    //    we'll need dependency tracking to force serialization on FEs.  The worker will try
    //    to pick an FE by atomically incrementing a counter in the swr context.  he'll keep
    //    trying until he reaches the tail.
    // 2- BE work must be done in strict order. we accomplish this today by pulling work off
    //    the oldest draw (ie the head) of the dcRing. the worker can determine if there is
    //    any work left by comparing the total # of binned work items and the total # of completed
    //    work items. If they are equal, then there is no more work to do for this draw, and
    //    the worker can safely increment its oldestDraw counter and move on to the next draw.

    // Created unlocked (std::defer_lock); WaitLock is only taken around the idle wait below.
    std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);

    // "Has work" here means the given draw position differs from the current ring head.
    auto threadHasWork = [&](uint32_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };

    uint32_t curDrawBE = 0;
    uint32_t curDrawFE = 0;

    bool bShutdown = false;

    while (true)
    {
        // Leave only after shutdown was flagged AND curDrawBE has reached the ring head.
        if (bShutdown && !threadHasWork(curDrawBE))
        {
            break;
        }

        // Spin for up to KNOB_WORKER_SPIN_LOOP_COUNT iterations before blocking.
        uint32_t loop = 0;
        while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && !threadHasWork(curDrawBE))
        {
            _mm_pause();
        }

        if (!threadHasWork(curDrawBE))
        {
            lock.lock();

            // check for thread idle condition again under lock
            if (threadHasWork(curDrawBE))
            {
                lock.unlock();
                continue;
            }

            // No predicate is passed to wait(): any wakeup simply falls through to the
            // work functions below, and idleness is re-evaluated on the next iteration.
            pContext->FifosNotEmpty.wait(lock);
            lock.unlock();
        }

        if (IsBEThread)
        {
            RDTSC_BEGIN(pContext->pBucketMgr, WorkerWorkOnFifoBE, 0);
            // Accumulate the shutdown indication; once set it is never cleared.
            bShutdown |=
                WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
            RDTSC_END(pContext->pBucketMgr, WorkerWorkOnFifoBE, 0);

            WorkOnCompute(pContext, workerId, curDrawBE);
        }

        if (IsFEThread)
        {
            WorkOnFifoFE(pContext, workerId, curDrawFE);

            // The idle and exit checks above are keyed on curDrawBE, so a thread that
            // does no BE work mirrors its FE position into it.
            if (!IsBEThread)
            {
                curDrawBE = curDrawFE;
            }
        }
    }

    return 0;
}
// A worker that handles neither FE nor BE work is not allowed to be instantiated.
template <>
DWORD workerThreadMain<false, false>(LPVOID) = delete;
 979
 980 template <bool IsFEThread, bool IsBEThread>
 981 DWORD workerThreadInit(LPVOID pData)
 982 {
 983 #if defined(_WIN32)
 984     __try
 985 #endif // _WIN32
 986     {
 987         return workerThreadMain<IsFEThread, IsBEThread>(pData);
 988     }
 989
 990 #if defined(_WIN32)
 991     __except (EXCEPTION_CONTINUE_SEARCH)
 992     {
 993     }
 994
 995 #endif // _WIN32
 996
 997     return 1;
 998 }
 999 template <>
1000 DWORD workerThreadInit<false, false>(LPVOID pData) = delete;
1001
1002 static void InitPerThreadStats(SWR_CONTEXT* pContext, uint32_t numThreads)
1003 {
1004     // Initialize DRAW_CONTEXT's per-thread stats
1005     for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
1006     {
1007         pContext->dcRing[dc].dynState.pStats =
1008             (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * numThreads, 64);
1009         memset(pContext->dcRing[dc].dynState.pStats, 0, sizeof(SWR_STATS) * numThreads);
1010     }
1011 }
1012
1013 //////////////////////////////////////////////////////////////////////////
1014 /// @brief Creates thread pool info but doesn't launch threads.
1015 /// @param pContext - pointer to context
1016 /// @param pPool - pointer to thread pool object.
1017 void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
1018 {
1019     CPUNumaNodes nodes;
1020     uint32_t     numThreadsPerProcGroup = 0;
1021     CalculateProcessorTopology(nodes, numThreadsPerProcGroup);
1022     assert(numThreadsPerProcGroup > 0);
1023
1024     // Assumption, for asymmetric topologies, multi-threaded cores will appear
1025     // in the list before single-threaded cores.  This appears to be true for
1026     // Windows when the total HW threads is limited to 64.
1027     uint32_t numHWNodes        = (uint32_t)nodes.size();
1028     uint32_t numHWCoresPerNode = (uint32_t)nodes[0].cores.size();
1029     uint32_t numHWHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size();
1030
1031 #if defined(_WIN32) && !defined(_WIN64)
1032     if (!pContext->threadInfo.MAX_WORKER_THREADS)
1033     {
1034         // Limit 32-bit windows to bindable HW threads only
1035         if ((numHWCoresPerNode * numHWHyperThreads) > 32)
1036         {
1037             numHWCoresPerNode = 32 / numHWHyperThreads;
1038         }
1039     }
1040 #endif
1041
1042     // Calculate num HW threads.  Due to asymmetric topologies, this is not
1043     // a trivial multiplication.
1044     uint32_t numHWThreads = 0;
1045     for (auto const& node : nodes)
1046     {
1047         for (auto const& core : node.cores)
1048         {
1049             numHWThreads += (uint32_t)core.threadIds.size();
1050         }
1051     }
1052
1053     uint32_t numNodes        = numHWNodes;
1054     uint32_t numCoresPerNode = numHWCoresPerNode;
1055     uint32_t numHyperThreads = numHWHyperThreads;
1056
1057     // Calc used threads per-core
1058     if (numHyperThreads > pContext->threadInfo.BASE_THREAD)
1059     {
1060         numHyperThreads -= pContext->threadInfo.BASE_THREAD;
1061     }
1062     else
1063     {
1064         SWR_ASSERT(false,
1065                    "Cannot use BASE_THREAD value: %d, maxThreads: %d, reverting BASE_THREAD to 0",
1066                    pContext->threadInfo.BASE_THREAD,
1067                    numHyperThreads);
1068         pContext->threadInfo.BASE_THREAD = 0;
1069     }
1070
1071     if (pContext->threadInfo.MAX_THREADS_PER_CORE)
1072     {
1073         numHyperThreads = std::min(numHyperThreads, pContext->threadInfo.MAX_THREADS_PER_CORE);
1074     }
1075
1076     // Prune any cores that don't support the number of threads
1077     if (numHyperThreads > 1)
1078     {
1079         for (auto& node : nodes)
1080         {
1081             uint32_t numUsableCores = 0;
1082             for (auto& core : node.cores)
1083             {
1084                 numUsableCores += (core.threadIds.size() >= numHyperThreads);
1085             }
1086             numCoresPerNode = std::min(numCoresPerNode, numUsableCores);
1087         }
1088     }
1089
1090     // Calc used cores per NUMA node
1091     if (numCoresPerNode > pContext->threadInfo.BASE_CORE)
1092     {
1093         numCoresPerNode -= pContext->threadInfo.BASE_CORE;
1094     }
1095     else
1096     {
1097         SWR_ASSERT(false,
1098                    "Cannot use BASE_CORE value: %d, maxCores: %d, reverting BASE_CORE to 0",
1099                    pContext->threadInfo.BASE_CORE,
1100                    numCoresPerNode);
1101         pContext->threadInfo.BASE_CORE = 0;
1102     }
1103
1104     if (pContext->threadInfo.MAX_CORES_PER_NUMA_NODE)
1105     {
1106         numCoresPerNode = std::min(numCoresPerNode, pContext->threadInfo.MAX_CORES_PER_NUMA_NODE);
1107     }
1108
1109     // Calc used NUMA nodes
1110     if (numNodes > pContext->threadInfo.BASE_NUMA_NODE)
1111     {
1112         numNodes -= pContext->threadInfo.BASE_NUMA_NODE;
1113     }
1114     else
1115     {
1116         SWR_ASSERT(
1117             false,
1118             "Cannot use BASE_NUMA_NODE value: %d, maxNodes: %d, reverting BASE_NUMA_NODE to 0",
1119             pContext->threadInfo.BASE_NUMA_NODE,
1120             numNodes);
1121         pContext->threadInfo.BASE_NUMA_NODE = 0;
1122     }
1123
1124     if (pContext->threadInfo.MAX_NUMA_NODES)
1125     {
1126         numNodes = std::min(numNodes, pContext->threadInfo.MAX_NUMA_NODES);
1127     }
1128
1129     // Calculate numThreads - at this point everything should be symmetric
1130     uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
1131     SWR_REL_ASSERT(numThreads <= numHWThreads);
1132
1133     uint32_t& numAPIReservedThreads = pContext->apiThreadInfo.numAPIReservedThreads;
1134     uint32_t& numAPIThreadsPerCore  = pContext->apiThreadInfo.numAPIThreadsPerCore;
1135     uint32_t  numRemovedThreads     = 0;
1136
1137     if (pContext->threadInfo.SINGLE_THREADED)
1138     {
1139         numAPIReservedThreads      = 0;
1140         numThreads                 = 1;
1141         pContext->NumWorkerThreads = 1;
1142         pContext->NumFEThreads     = 1;
1143         pContext->NumBEThreads     = 1;
1144         pPool->numThreads          = 0;
1145     }
1146     else if (pContext->threadInfo.MAX_WORKER_THREADS)
1147     {
1148         numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, numHWThreads);
1149         pContext->threadInfo.BASE_NUMA_NODE = 0;
1150         pContext->threadInfo.BASE_CORE      = 0;
1151         pContext->threadInfo.BASE_THREAD    = 0;
1152         numAPIReservedThreads               = 0;
1153     }
1154     else
1155     {
1156         if (numAPIReservedThreads >= numThreads)
1157         {
1158             numAPIReservedThreads = 0;
1159         }
1160         else if (numAPIReservedThreads)
1161         {
1162             numAPIThreadsPerCore = std::min(numAPIThreadsPerCore, numHWHyperThreads);
1163
1164             if (0 == numAPIThreadsPerCore)
1165             {
1166                 numAPIThreadsPerCore = numHWHyperThreads;
1167             }
1168
1169             numRemovedThreads = numAPIReservedThreads;
1170             if (numAPIThreadsPerCore == 2 && numHyperThreads == 1)
1171             {
1172                 // Adjust removed threads to make logic below work
1173                 numRemovedThreads =
1174                     std::max(1U, (numRemovedThreads + numAPIThreadsPerCore - 1) / 2);
1175             }
1176
1177             numThreads -= numRemovedThreads;
1178         }
1179     }
1180
1181     InitPerThreadStats(pContext, numThreads);
1182
1183     if (pContext->threadInfo.SINGLE_THREADED)
1184     {
1185         numAPIReservedThreads = 0;
1186         numThreads            = 1;
1187     }
1188
1189     if (numAPIReservedThreads)
1190     {
1191         pPool->pApiThreadData = new (std::nothrow) THREAD_DATA[numAPIReservedThreads];
1192         SWR_ASSERT(pPool->pApiThreadData);
1193         if (!pPool->pApiThreadData)
1194         {
1195             numAPIReservedThreads = 0;
1196         }
1197         else
1198         {
1199             memset(pPool->pApiThreadData, 0, sizeof(THREAD_DATA) * numAPIReservedThreads);
1200         }
1201     }
1202     pPool->numReservedThreads = numAPIReservedThreads;
1203
1204     pPool->numThreads          = numThreads;
1205     pContext->NumWorkerThreads = pPool->numThreads;
1206
1207     pPool->pThreadData = new (std::nothrow) THREAD_DATA[pPool->numThreads];
1208     assert(pPool->pThreadData);
1209     memset(pPool->pThreadData, 0, sizeof(THREAD_DATA) * pPool->numThreads);
1210     pPool->numaMask = 0;
1211
1212     // Allocate worker private data
1213     pPool->pWorkerPrivateDataArray = nullptr;
1214     if (pContext->workerPrivateState.perWorkerPrivateStateSize == 0)
1215     {
1216         pContext->workerPrivateState.perWorkerPrivateStateSize = sizeof(SWR_WORKER_DATA);
1217         pContext->workerPrivateState.pfnInitWorkerData = nullptr;
1218         pContext->workerPrivateState.pfnFinishWorkerData = nullptr;
1219     }
1220
1221     // initialize contents of SWR_WORKER_DATA
1222     size_t perWorkerSize =
1223         AlignUpPow2(pContext->workerPrivateState.perWorkerPrivateStateSize, 64);
1224     size_t totalSize = perWorkerSize * pPool->numThreads;
1225     if (totalSize)
1226     {
1227         pPool->pWorkerPrivateDataArray = AlignedMalloc(totalSize, 64);
1228         SWR_ASSERT(pPool->pWorkerPrivateDataArray);
1229
1230         void* pWorkerData = pPool->pWorkerPrivateDataArray;
1231         for (uint32_t i = 0; i < pPool->numThreads; ++i)
1232         {
1233             pPool->pThreadData[i].pWorkerPrivateData = pWorkerData;
1234             if (pContext->workerPrivateState.pfnInitWorkerData)
1235             {
1236                 pContext->workerPrivateState.pfnInitWorkerData(pContext, pWorkerData, i);
1237             }
1238             pWorkerData = PtrAdd(pWorkerData, perWorkerSize);
1239         }
1240     }
1241
1242     if (pContext->threadInfo.SINGLE_THREADED)
1243     {
1244         return;
1245     }
1246
1247     pPool->pThreads = new (std::nothrow) THREAD_PTR[pPool->numThreads];
1248     assert(pPool->pThreads);
1249
1250     if (pContext->threadInfo.MAX_WORKER_THREADS)
1251     {
1252         bool     bForceBindProcGroup = (numThreads > numThreadsPerProcGroup);
1253         uint32_t numProcGroups = (numThreads + numThreadsPerProcGroup - 1) / numThreadsPerProcGroup;
1254         // When MAX_WORKER_THREADS is set we don't bother to bind to specific HW threads
1255         // But Windows will still require binding to specific process groups
1256         for (uint32_t workerId = 0; workerId < numThreads; ++workerId)
1257         {
1258             pPool->pThreadData[workerId].workerId           = workerId;
1259             pPool->pThreadData[workerId].procGroupId        = workerId % numProcGroups;
1260             pPool->pThreadData[workerId].threadId           = 0;
1261             pPool->pThreadData[workerId].numaId             = 0;
1262             pPool->pThreadData[workerId].coreId             = 0;
1263             pPool->pThreadData[workerId].htId               = 0;
1264             pPool->pThreadData[workerId].pContext           = pContext;
1265             pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup;
1266
1267             pContext->NumBEThreads++;
1268             pContext->NumFEThreads++;
1269         }
1270     }
1271     else
1272     {
1273         // numa distribution assumes workers on all nodes
1274         bool useNuma = true;
1275         if (numCoresPerNode * numHyperThreads == 1)
1276         {
1277             useNuma = false;
1278         }
1279
1280         if (useNuma)
1281         {
1282             pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes (1, 2, 4, etc.)
1283         }
1284         else
1285         {
1286             pPool->numaMask = 0;
1287         }
1288
1289         uint32_t workerId           = 0;
1290         uint32_t numReservedThreads = numAPIReservedThreads;
1291         for (uint32_t n = 0; n < numNodes; ++n)
1292         {
1293             if ((n + pContext->threadInfo.BASE_NUMA_NODE) >= nodes.size())
1294             {
1295                 break;
1296             }
1297             auto&    node     = nodes[n + pContext->threadInfo.BASE_NUMA_NODE];
1298             uint32_t numCores = numCoresPerNode;
1299             for (uint32_t c = 0; c < numCores; ++c)
1300             {
1301                 if ((c + pContext->threadInfo.BASE_CORE) >= node.cores.size())
1302                 {
1303                     break;
1304                 }
1305
1306                 auto& core = node.cores[c + pContext->threadInfo.BASE_CORE];
1307                 for (uint32_t t = 0; t < numHyperThreads; ++t)
1308                 {
1309                     if ((t + pContext->threadInfo.BASE_THREAD) >= core.threadIds.size())
1310                     {
1311                         break;
1312                     }
1313
1314                     if (numRemovedThreads)
1315                     {
1316                         --numRemovedThreads;
1317                         assert(numReservedThreads);
1318                         --numReservedThreads;
1319                         pPool->pApiThreadData[numReservedThreads].workerId    = 0xFFFFFFFFU;
1320                         pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
1321                         pPool->pApiThreadData[numReservedThreads].threadId    = core.threadIds[t];
1322                         pPool->pApiThreadData[numReservedThreads].numaId =
1323                             useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
1324                         pPool->pApiThreadData[numReservedThreads].coreId =
1325                             c + pContext->threadInfo.BASE_CORE;
1326                         pPool->pApiThreadData[numReservedThreads].htId =
1327                             t + pContext->threadInfo.BASE_THREAD;
1328                         pPool->pApiThreadData[numReservedThreads].pContext           = pContext;
1329                         pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;
1330
1331                         if (numAPIThreadsPerCore > numHyperThreads && numReservedThreads)
1332                         {
1333                             --numReservedThreads;
1334                             pPool->pApiThreadData[numReservedThreads].workerId    = 0xFFFFFFFFU;
1335                             pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
1336                             pPool->pApiThreadData[numReservedThreads].threadId =
1337                                 core.threadIds[t + 1];
1338                             pPool->pApiThreadData[numReservedThreads].numaId =
1339                                 useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
1340                             pPool->pApiThreadData[numReservedThreads].coreId =
1341                                 c + pContext->threadInfo.BASE_CORE;
1342                             pPool->pApiThreadData[numReservedThreads].htId =
1343                                 t + pContext->threadInfo.BASE_THREAD;
1344                             pPool->pApiThreadData[numReservedThreads].pContext           = pContext;
1345                             pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;
1346                         }
1347
1348                         continue;
1349                     }
1350
1351                     SWR_ASSERT(workerId < numThreads);
1352
1353                     pPool->pThreadData[workerId].workerId    = workerId;
1354                     pPool->pThreadData[workerId].procGroupId = core.procGroup;
1355                     pPool->pThreadData[workerId].threadId =
1356                         core.threadIds[t + pContext->threadInfo.BASE_THREAD];
1357                     pPool->pThreadData[workerId].numaId =
1358                         useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
1359                     pPool->pThreadData[workerId].coreId   = c + pContext->threadInfo.BASE_CORE;
1360                     pPool->pThreadData[workerId].htId     = t + pContext->threadInfo.BASE_THREAD;
1361                     pPool->pThreadData[workerId].pContext = pContext;
1362                     pPool->pThreadData[workerId].forceBindProcGroup = false;
1363
1364                     pContext->NumBEThreads++;
1365                     pContext->NumFEThreads++;
1366
1367                     ++workerId;
1368                 }
1369             }
1370         }
1371         SWR_ASSERT(workerId == pContext->NumWorkerThreads);
1372     }
1373 }
1374
1375 //////////////////////////////////////////////////////////////////////////
1376 /// @brief Launches worker threads in thread pool.
1377 /// @param pContext - pointer to context
1378 /// @param pPool - pointer to thread pool object.
1379 void StartThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
1380 {
1381     if (pContext->threadInfo.SINGLE_THREADED)
1382     {
1383         return;
1384     }
1385
1386     for (uint32_t workerId = 0; workerId < pContext->NumWorkerThreads; ++workerId)
1387     {
1388         pPool->pThreads[workerId] =
1389             new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
1390     }
1391 }
1392
1393 //////////////////////////////////////////////////////////////////////////
1394 /// @brief Destroys thread pool.
1395 /// @param pContext - pointer to context
1396 /// @param pPool - pointer to thread pool object.
1397 void DestroyThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
1398 {
1399     // Wait for all threads to finish
1400     SwrWaitForIdle(pContext);
1401
1402     // Wait for threads to finish and destroy them
1403     for (uint32_t t = 0; t < pPool->numThreads; ++t)
1404     {
1405         if (!pContext->threadInfo.SINGLE_THREADED)
1406         {
1407             // Detach from thread.  Cannot join() due to possibility (in Windows) of code
1408             // in some DLLMain(THREAD_DETATCH case) blocking the thread until after this returns.
1409             pPool->pThreads[t]->detach();
1410             delete (pPool->pThreads[t]);
1411         }
1412
1413         if (pContext->workerPrivateState.pfnFinishWorkerData)
1414         {
1415             pContext->workerPrivateState.pfnFinishWorkerData(
1416                 pContext, pPool->pThreadData[t].pWorkerPrivateData, t);
1417         }
1418     }
1419
1420     delete[] pPool->pThreads;
1421
1422     // Clean up data used by threads
1423     delete[] pPool->pThreadData;
1424     delete[] pPool->pApiThreadData;
1425
1426     AlignedFree(pPool->pWorkerPrivateDataArray);
1427 }