Add processor topology calculation implementation for Darwin/OSX targets.
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / threads.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 ****************************************************************************/
23
24 #include <stdio.h>
25 #include <thread>
26 #include <algorithm>
27 #include <float.h>
28 #include <vector>
29 #include <utility>
30 #include <fstream>
31 #include <string>
32
33 #if defined(__linux__) || defined(__gnu_linux__) || defined(__APPLE__)
34 #include <pthread.h>
35 #include <sched.h>
36 #include <unistd.h>
37 #endif
38
39 #ifdef __APPLE__
40 #include <sys/types.h>
41 #include <sys/sysctl.h>
42 #endif
43
44 #include "common/os.h"
45 #include "context.h"
46 #include "frontend.h"
47 #include "backend.h"
48 #include "rasterizer.h"
49 #include "rdtsc_core.h"
50 #include "tilemgr.h"
51
52
53
54
55 // ThreadId
56 struct Core
57 {
58 uint32_t procGroup = 0;
59 std::vector<uint32_t> threadIds;
60 };
61
62 struct NumaNode
63 {
64 uint32_t numaId;
65 std::vector<Core> cores;
66 };
67
68 typedef std::vector<NumaNode> CPUNumaNodes;
69
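//////////////////////////////////////////////////////////////////////////
/// @brief Queries the host's processor topology (NUMA nodes, cores, HW threads)
///        using GetLogicalProcessorInformationEx on Windows, /proc/cpuinfo on
///        Linux, and the hw.* sysctls on Darwin/OSX.
/// @param out_nodes - receives one NumaNode per package/node, each listing its cores and thread ids.
/// @param out_numThreadsPerProcGroup - receives the number of HW threads available in one processor group.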
70 void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThreadsPerProcGroup)
71 {
72 out_nodes.clear();
73 out_numThreadsPerProcGroup = 0;
74
75 #if defined(_WIN32)
76
77 std::vector<KAFFINITY> threadMaskPerProcGroup;
78
79 static std::mutex m;
80 std::lock_guard<std::mutex> l(m);
81
82 DWORD bufSize = 0;
83
84 BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &bufSize);
85 SWR_ASSERT(ret == FALSE && GetLastError() == ERROR_INSUFFICIENT_BUFFER);
86
87 PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBufferMem = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(bufSize);
88 SWR_ASSERT(pBufferMem);
89
90 ret = GetLogicalProcessorInformationEx(RelationProcessorCore, pBufferMem, &bufSize);
91 SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information");
92
93 uint32_t count = bufSize / pBufferMem->Size;
94 PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = pBufferMem;
95
96 for (uint32_t i = 0; i < count; ++i)
97 {
98 SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore);
99 for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g)
100 {
101 auto& gmask = pBuffer->Processor.GroupMask[g];
102 uint32_t threadId = 0;
103 uint32_t procGroup = gmask.Group;
104
105 Core* pCore = nullptr;
106
107 uint32_t numThreads = (uint32_t)_mm_popcount_sizeT(gmask.Mask);
108
109 while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask))
110 {
111 // clear mask
112 KAFFINITY threadMask = KAFFINITY(1) << threadId;
113 gmask.Mask &= ~threadMask;
114
115 if (procGroup >= threadMaskPerProcGroup.size())
116 {
117 threadMaskPerProcGroup.resize(procGroup + 1);
118 }
119
120 if (threadMaskPerProcGroup[procGroup] & threadMask)
121 {
122 // Already seen this mask. This means that we are in 32-bit mode and
123 // have seen more than 32 HW threads for this procGroup
124 // Don't use it
125 #if defined(_WIN64)
126 SWR_INVALID("Shouldn't get here in 64-bit mode");
127 #endif
128 continue;
129 }
130
131 threadMaskPerProcGroup[procGroup] |= (KAFFINITY(1) << threadId);
132
133 // Find Numa Node
134 uint32_t numaId = 0;
135 PROCESSOR_NUMBER procNum = {};
136 procNum.Group = WORD(procGroup);
137 procNum.Number = UCHAR(threadId);
138
139 ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId);
140 SWR_ASSERT(ret);
141
142 // Store data
143 if (out_nodes.size() <= numaId)
144 {
145 out_nodes.resize(numaId + 1);
146 }
147 auto& numaNode = out_nodes[numaId];
148 numaNode.numaId = numaId;
149
150 uint32_t coreId = 0;
151
152 if (nullptr == pCore)
153 {
154 numaNode.cores.push_back(Core());
155 pCore = &numaNode.cores.back();
156 pCore->procGroup = procGroup;
157 }
158 pCore->threadIds.push_back(threadId);
159 if (procGroup == 0)
160 {
161 out_numThreadsPerProcGroup++;
162 }
163 }
164 }
165 pBuffer = PtrAdd(pBuffer, pBuffer->Size);
166 }
167
168 free(pBufferMem);
169
170
171 #elif defined(__linux__) || defined (__gnu_linux__)
172
173 // Parse /proc/cpuinfo to get full topology
174 std::ifstream input("/proc/cpuinfo");
175 std::string line;
176 char* c;
177 uint32_t procId = uint32_t(-1);
178 uint32_t coreId = uint32_t(-1);
179 uint32_t physId = uint32_t(-1);
180
181 while (std::getline(input, line))
182 {
183 if (line.find("processor") != std::string::npos)
184 {
185 auto data_start = line.find(": ") + 2;
186 procId = std::strtoul(&line.c_str()[data_start], &c, 10);
187 continue;
188 }
189 if (line.find("core id") != std::string::npos)
190 {
191 auto data_start = line.find(": ") + 2;
192 coreId = std::strtoul(&line.c_str()[data_start], &c, 10);
193 continue;
194 }
195 if (line.find("physical id") != std::string::npos)
196 {
197 auto data_start = line.find(": ") + 2;
198 physId = std::strtoul(&line.c_str()[data_start], &c, 10);
199 continue;
200 }
201 if (line.length() == 0)
202 {
203 if (physId + 1 > out_nodes.size())
204 out_nodes.resize(physId + 1);
205 auto& numaNode = out_nodes[physId];
206 numaNode.numaId = physId;
207
208 if (coreId + 1 > numaNode.cores.size())
209 numaNode.cores.resize(coreId + 1);
210 auto& core = numaNode.cores[coreId];
211 core.procGroup = coreId;
212 core.threadIds.push_back(procId);
213 }
214 }
215
216 out_numThreadsPerProcGroup = 0;
217 for (auto &node : out_nodes)
218 {
219 for (auto &core : node.cores)
220 {
221 out_numThreadsPerProcGroup += core.threadIds.size();
222 }
223 }
224
225 #elif defined(__APPLE__)
226
227 auto numProcessors = 0;
228 auto numCores = 0;
229 auto numPhysicalIds = 0;
230
231 int value;
232 size_t size = sizeof(value);
233
234 int result = sysctlbyname("hw.packages", &value, &size, NULL, 0);
235 SWR_ASSERT(result == 0);
236 numPhysicalIds = value;
237
238 result = sysctlbyname("hw.logicalcpu", &value, &size, NULL, 0);
239 SWR_ASSERT(result == 0);
240 numProcessors = value;
241
242 result = sysctlbyname("hw.physicalcpu", &value, &size, NULL, 0);
243 SWR_ASSERT(result == 0);
244 numCores = value;
245
246 out_nodes.resize(numPhysicalIds);
247
248 for (auto physId = 0; physId < numPhysicalIds; ++physId)
249 {
250 auto &numaNode = out_nodes[physId];
251 auto procId = 0;
252
253 numaNode.cores.resize(numCores);
254
255 while (procId < numProcessors)
256 {
257 for (auto coreId = 0; coreId < numaNode.cores.size(); ++coreId, ++procId)
258 {
259 auto &core = numaNode.cores[coreId];
260
261 core.procGroup = coreId;
262 core.threadIds.push_back(procId);
263 }
264 }
265 }
266
267 out_numThreadsPerProcGroup = 0;
268
269 for (auto &node : out_nodes)
270 {
271 for (auto &core : node.cores)
272 {
273 out_numThreadsPerProcGroup += core.threadIds.size();
274 }
275 }
276
277 #else
278
279 #error Unsupported platform
280
281 #endif
282
283 // Prune empty cores and numa nodes
284 for (auto node_it = out_nodes.begin(); node_it != out_nodes.end(); )
285 {
286 // Erase empty cores (first)
287 for (auto core_it = node_it->cores.begin(); core_it != node_it->cores.end(); )
288 {
289 if (core_it->threadIds.size() == 0)
290 {
291 core_it = node_it->cores.erase(core_it);
292 }
293 else
294 {
295 ++core_it;
296 }
297 }
298
299 // Erase empty numa nodes (second)
300 if (node_it->cores.size() == 0)
301 {
302 node_it = out_nodes.erase(node_it);
303 }
304 else
305 {
306 ++node_it;
307 }
308 }
309 }
310
311
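//////////////////////////////////////////////////////////////////////////
/// @brief Pins the calling thread to a HW thread (or, when bindProcGroup is
///        set, to an entire processor group). No-op when SINGLE_THREADED, or
///        when MAX_WORKER_THREADS is set and a proc-group bind wasn't requested.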
312 void bindThread(SWR_CONTEXT* pContext, uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup=false)
313 {
314 // Only bind threads when MAX_WORKER_THREADS isn't set.
315 if (pContext->threadInfo.SINGLE_THREADED || (pContext->threadInfo.MAX_WORKER_THREADS && bindProcGroup == false))
316 {
317 return;
318 }
319
320 #if defined(_WIN32)
321
322 GROUP_AFFINITY affinity = {};
323 affinity.Group = procGroupId;
324
325 #if !defined(_WIN64)
326 if (threadId >= 32)
327 {
328 // Hopefully we don't get here. Logic in CreateThreadPool should prevent this.
329 SWR_INVALID("Shouldn't get here");
330
331 // In a 32-bit process on Windows it is impossible to bind
332 // to logical processors 32-63 within a processor group.
333 // In this case set the mask to 0 and let the system assign
334 // the processor. Hopefully it will make smart choices.
335 affinity.Mask = 0;
336 }
337 else
338 #endif
339 {
340 // If MAX_WORKER_THREADS is set, only bind to the proc group,
341 // not the individual HW thread.
342 if (!bindProcGroup && !pContext->threadInfo.MAX_WORKER_THREADS)
343 {
344 affinity.Mask = KAFFINITY(1) << threadId;
345 }
346 else
347 {
348 affinity.Mask = KAFFINITY(0);
349 }
350 }
351
352 if (!SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr))
353 {
354 SWR_INVALID("Failed to set Thread Affinity");
355 }
356
357 #elif defined(__linux__) || defined(__gnu_linux__)
358
359 cpu_set_t cpuset;
360 pthread_t thread = pthread_self();
361 CPU_ZERO(&cpuset);
362 CPU_SET(threadId, &cpuset);
363
364 int err = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
365 if (err != 0)
366 {
367 fprintf(stderr, "pthread_setaffinity_np failure for tid %u: %s\n", threadId, strerror(err));
368 }
369
370 #endif
371 }
372
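// Returns the id at the head of the draw-context ring, i.e. the most recently enqueued draw.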
373 INLINE
374 uint32_t GetEnqueuedDraw(SWR_CONTEXT *pContext)
375 {
376 return pContext->dcRing.GetHead();
377 }
378
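// Maps a 1-based drawId to its slot in the draw-context ring.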
379 INLINE
380 DRAW_CONTEXT *GetDC(SWR_CONTEXT *pContext, uint32_t drawId)
381 {
382 return &pContext->dcRing[(drawId-1) % pContext->MAX_DRAWS_IN_FLIGHT];
383 }
384
385 INLINE
386 bool IDComparesLess(uint32_t a, uint32_t b)
387 {
388 // Use signed delta to ensure that wrap-around to 0 is correctly handled.
389 int32_t delta = int32_t(a - b);
390 return (delta < 0);
391 }
392
393 // returns true if dependency not met
394 INLINE
395 bool CheckDependency(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t lastRetiredDraw)
396 {
397 return pDC->dependent && IDComparesLess(lastRetiredDraw, pDC->drawId - 1);
398 }
399
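// returns true if FE dependency not met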
400 bool CheckDependencyFE(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t lastRetiredDraw)
401 {
402 return pDC->dependentFE && IDComparesLess(lastRetiredDraw, pDC->drawId - 1);
403 }
404
405 //////////////////////////////////////////////////////////////////////////
406 /// @brief Update client stats.
407 INLINE void UpdateClientStats(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
408 {
409 if ((pContext->pfnUpdateStats == nullptr) || (GetApiState(pDC).enableStatsBE == false))
410 {
411 return;
412 }
413
414 DRAW_DYNAMIC_STATE& dynState = pDC->dynState;
415 OSALIGNLINE(SWR_STATS) stats{ 0 };
416
417 // Sum up stats across all workers before sending to client.
418 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
419 {
420 stats.DepthPassCount += dynState.pStats[i].DepthPassCount;
421
422 stats.PsInvocations += dynState.pStats[i].PsInvocations;
423 stats.CsInvocations += dynState.pStats[i].CsInvocations;
424 }
425
426
427 pContext->pfnUpdateStats(GetPrivateState(pDC), &stats);
428 }
429
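// Forwards per-draw stats to the client and invokes the draw's retire callback, if one was registered.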
430 INLINE void ExecuteCallbacks(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
431 {
432 UpdateClientStats(pContext, workerId, pDC);
433
434 if (pDC->retireCallback.pfnCallbackFunc)
435 {
436 pDC->retireCallback.pfnCallbackFunc(pDC->retireCallback.userData,
437 pDC->retireCallback.userData2,
438 pDC->retireCallback.userData3);
439 }
440 }
441
442 // inlined-only version
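// Decrements the draw context's outstanding-thread count; the last thread to finish
// runs the retire callbacks, resets the arenas, and dequeues the DC from the ring.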
443 INLINE int32_t CompleteDrawContextInl(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
444 {
445 int32_t result = static_cast<int32_t>(InterlockedDecrement(&pDC->threadsDone));
446 SWR_ASSERT(result >= 0);
447
448 AR_FLUSH(pDC->drawId);
449
450 if (result == 0)
451 {
452 ExecuteCallbacks(pContext, workerId, pDC);
453
454 // Cleanup memory allocations
455 pDC->pArena->Reset(true);
456 if (!pDC->isCompute)
457 {
458 pDC->pTileMgr->initialize();
459 }
460 if (pDC->cleanupState)
461 {
462 pDC->pState->pArena->Reset(true);
463 }
464
465 _ReadWriteBarrier();
466
467 pContext->dcRing.Dequeue(); // Remove from tail
468 }
469
470 return result;
471 }
472
473 // available to other translation modules
474 int32_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
475 {
476 return CompleteDrawContextInl(pContext, 0, pDC);
477 }
478
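//////////////////////////////////////////////////////////////////////////
/// @brief Advances curDrawBE past draws whose work is already complete,
///        retiring them along the way.
/// @returns true if an incomplete draw remains between curDrawBE and the
///          enqueued head, false if the thread is caught up.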
479 INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawBE, uint32_t& drawEnqueued)
480 {
481 // increment our current draw id to the first incomplete draw
482 drawEnqueued = GetEnqueuedDraw(pContext);
483 while (IDComparesLess(curDrawBE, drawEnqueued))
484 {
485 DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT];
486
487 // If it's not compute and FE is not done, then break out of the loop.
488 if (!pDC->doneFE && !pDC->isCompute) break;
489
490 bool isWorkComplete = pDC->isCompute ?
491 pDC->pDispatch->isWorkComplete() :
492 pDC->pTileMgr->isWorkComplete();
493
494 if (isWorkComplete)
495 {
496 curDrawBE++;
497 CompleteDrawContextInl(pContext, workerId, pDC);
498 }
499 else
500 {
501 break;
502 }
503 }
504
505 // If there are no more incomplete draws then return false.
506 return IDComparesLess(curDrawBE, drawEnqueued);
507 }
508
509 //////////////////////////////////////////////////////////////////////////
510 /// @brief If there is any BE work then go work on it.
511 /// @param pContext - pointer to SWR context.
512 /// @param workerId - The unique worker ID that is assigned to this thread.
513 /// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread
514 /// has its own curDrawBE counter and this ensures that each worker processes all the
515 /// draws in order.
516 /// @param lockedTiles - This is the set of tiles locked by other threads. Each thread maintains its
517 /// own set, and each time it fails to lock a macrotile because it's already locked,
518 /// it will add that tile to the lockedTiles set. As a worker begins to work
519 /// on future draws, the lockedTiles set ensures that it doesn't work on tiles that may
520 /// still have work pending in a previous draw. Additionally, the lockedTiles set is a
521 /// heuristic that can steer a worker back to the same macrotile that it had been
522 /// working on in a previous draw.
523 /// @returns true if worker thread should shutdown
524 bool WorkOnFifoBE(
525 SWR_CONTEXT *pContext,
526 uint32_t workerId,
527 uint32_t &curDrawBE,
528 TileSet& lockedTiles,
529 uint32_t numaNode,
530 uint32_t numaMask)
531 {
532 bool bShutdown = false;
533
534 // Find the first incomplete draw that has pending work. If no such draw is found then
535 // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
536 uint32_t drawEnqueued = 0;
537 if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
538 {
539 return false;
540 }
541
542 uint32_t lastRetiredDraw = pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1;
543
544 // Reset our history for locked tiles. We'll have to re-learn which tiles are locked.
545 lockedTiles.clear();
546
547 // Try to work on each draw in order of the available draws in flight.
548 // 1. If we're on curDrawBE, we can work on any macrotile that is available.
549 // 2. If we're trying to work on draws after curDrawBE, we are restricted to
550 // working on those macrotiles that are known to be complete in the prior draw to
551 // maintain order. The locked tiles set provides the history to ensure this.
552 for (uint32_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
553 {
554 DRAW_CONTEXT *pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT];
555
556 if (pDC->isCompute) return false; // We don't look at compute work.
557
558 // First wait for FE to be finished with this draw. This keeps the threading model simple,
559 // but if there are lots of bubbles between draws then serializing FE and BE may
560 // need to be revisited.
561 if (!pDC->doneFE) return false;
562
563 // If this draw is dependent on a previous draw then we need to bail.
564 if (CheckDependency(pContext, pDC, lastRetiredDraw))
565 {
566 return false;
567 }
568
569 // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it.
570 auto &macroTiles = pDC->pTileMgr->getDirtyTiles();
571
572 for (auto tile : macroTiles)
573 {
574 uint32_t tileID = tile->mId;
575
576 // Only work on tiles for this numa node
577 uint32_t x, y;
578 pDC->pTileMgr->getTileIndices(tileID, x, y);
579 if (((x ^ y) & numaMask) != numaNode)
580 {
581 continue;
582 }
583
584 if (!tile->getNumQueued())
585 {
586 continue;
587 }
588
589 // can only work on this draw if it's not in use by other threads
590 if (lockedTiles.find(tileID) != lockedTiles.end())
591 {
592 continue;
593 }
594
595 if (tile->tryLock())
596 {
597 BE_WORK *pWork;
598
599 RDTSC_BEGIN(WorkerFoundWork, pDC->drawId);
600
601 uint32_t numWorkItems = tile->getNumQueued();
602 SWR_ASSERT(numWorkItems);
603
604 pWork = tile->peek();
605 SWR_ASSERT(pWork);
606 if (pWork->type == DRAW)
607 {
608 pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, workerId, tileID);
609 }
610 else if (pWork->type == SHUTDOWN)
611 {
612 bShutdown = true;
613 }
614
615 while ((pWork = tile->peek()) != nullptr)
616 {
617 pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
618 tile->dequeue();
619 }
620 RDTSC_END(WorkerFoundWork, numWorkItems);
621
622 _ReadWriteBarrier();
623
624 pDC->pTileMgr->markTileComplete(tileID);
625
626 // Optimization: If the draw is complete and we're the last one to have worked on it then
627 // we can reset the locked list, since we know that all draws prior to the next one are guaranteed to be complete.
628 if ((curDrawBE == i) && (bShutdown || pDC->pTileMgr->isWorkComplete()))
629 {
630 // We can increment the current BE and safely move to next draw since we know this draw is complete.
631 curDrawBE++;
632 CompleteDrawContextInl(pContext, workerId, pDC);
633
634 lastRetiredDraw++;
635
636 lockedTiles.clear();
637 break;
638 }
639
640 if (bShutdown)
641 {
642 break;
643 }
644 }
645 else
646 {
647 // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
648 lockedTiles.insert(tileID);
649 }
650 }
651 }
652
653 return bShutdown;
654 }
655
656 //////////////////////////////////////////////////////////////////////////
657 /// @brief Called when FE work is complete for this DC.
658 INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
659 {
660 if (pContext->pfnUpdateStatsFE && GetApiState(pDC).enableStatsFE)
661 {
662 SWR_STATS_FE& stats = pDC->dynState.statsFE;
663
664 AR_EVENT(FrontendStatsEvent(pDC->drawId,
665 stats.IaVertices, stats.IaPrimitives, stats.VsInvocations, stats.HsInvocations,
666 stats.DsInvocations, stats.GsInvocations, stats.GsPrimitives, stats.CInvocations, stats.CPrimitives,
667 stats.SoPrimStorageNeeded[0], stats.SoPrimStorageNeeded[1], stats.SoPrimStorageNeeded[2], stats.SoPrimStorageNeeded[3],
668 stats.SoNumPrimsWritten[0], stats.SoNumPrimsWritten[1], stats.SoNumPrimsWritten[2], stats.SoNumPrimsWritten[3]
669 ));
670 AR_EVENT(FrontendDrawEndEvent(pDC->drawId));
671
672 pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &stats);
673 }
674
675 if (pContext->pfnUpdateSoWriteOffset)
676 {
677 for (uint32_t i = 0; i < MAX_SO_BUFFERS; ++i)
678 {
679 if ((pDC->dynState.SoWriteOffsetDirty[i]) &&
680 (pDC->pState->state.soBuffer[i].soWriteEnable))
681 {
682 pContext->pfnUpdateSoWriteOffset(GetPrivateState(pDC), i, pDC->dynState.SoWriteOffset[i]);
683 }
684 }
685 }
686
687 // Ensure all streaming writes are globally visible before marking this FE done
688 _mm_mfence();
689 pDC->doneFE = true;
690
691 InterlockedDecrement(&pContext->drawsOutstandingFE);
692 }
693
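//////////////////////////////////////////////////////////////////////////
/// @brief If there is any FE work then go work on it.
/// @param pContext - pointer to SWR context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param curDrawFE - This tracks the draw contexts whose FE work this thread has retired.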
694 void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawFE)
695 {
696 // Try to grab the next DC from the ring
697 uint32_t drawEnqueued = GetEnqueuedDraw(pContext);
698 while (IDComparesLess(curDrawFE, drawEnqueued))
699 {
700 uint32_t dcSlot = curDrawFE % pContext->MAX_DRAWS_IN_FLIGHT;
701 DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot];
702 if (pDC->isCompute || pDC->doneFE)
703 {
704 CompleteDrawContextInl(pContext, workerId, pDC);
705 curDrawFE++;
706 }
707 else
708 {
709 break;
710 }
711 }
712
713 uint32_t lastRetiredFE = curDrawFE - 1;
714 uint32_t curDraw = curDrawFE;
715 while (IDComparesLess(curDraw, drawEnqueued))
716 {
717 uint32_t dcSlot = curDraw % pContext->MAX_DRAWS_IN_FLIGHT;
718 DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot];
719
720 if (!pDC->isCompute && !pDC->FeLock)
721 {
722 if (CheckDependencyFE(pContext, pDC, lastRetiredFE))
723 {
724 return;
725 }
726
727 uint32_t initial = InterlockedCompareExchange((volatile uint32_t*)&pDC->FeLock, 1, 0);
728 if (initial == 0)
729 {
730 // successfully grabbed the DC, now run the FE
731 pDC->FeWork.pfnWork(pContext, pDC, workerId, &pDC->FeWork.desc);
732
733 CompleteDrawFE(pContext, workerId, pDC);
734 }
735 }
736 curDraw++;
737 }
738 }
739
740 //////////////////////////////////////////////////////////////////////////
741 /// @brief If there is any compute work then go work on it.
742 /// @param pContext - pointer to SWR context.
743 /// @param workerId - The unique worker ID that is assigned to this thread.
744 /// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread
745 /// has its own curDrawBE counter and this ensures that each worker processes all the
746 /// draws in order.
747 void WorkOnCompute(
748 SWR_CONTEXT *pContext,
749 uint32_t workerId,
750 uint32_t& curDrawBE)
751 {
752 uint32_t drawEnqueued = 0;
753 if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
754 {
755 return;
756 }
757
758 uint32_t lastRetiredDraw = pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1;
759
760 for (uint64_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
761 {
762 DRAW_CONTEXT *pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT];
763 if (pDC->isCompute == false) return;
764
765 // check dependencies
766 if (CheckDependency(pContext, pDC, lastRetiredDraw))
767 {
768 return;
769 }
770
771 SWR_ASSERT(pDC->pDispatch != nullptr);
772 DispatchQueue& queue = *pDC->pDispatch;
773
774 // Is there any work remaining?
775 if (queue.getNumQueued() > 0)
776 {
777 void* pSpillFillBuffer = nullptr;
778 void* pScratchSpace = nullptr;
779 uint32_t threadGroupId = 0;
780 while (queue.getWork(threadGroupId))
781 {
782 queue.dispatch(pDC, workerId, threadGroupId, pSpillFillBuffer, pScratchSpace);
783 queue.finishedWork();
784 }
785
786 // Ensure all streaming writes are globally visible before moving onto the next draw
787 _mm_mfence();
788 }
789 }
790 }
791
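//////////////////////////////////////////////////////////////////////////
/// @brief Binds an API thread to one of the HW threads reserved for API use.
///        API threads beyond the reserved count are only bound to the process
///        group used by API thread 0.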
792 void BindApiThread(SWR_CONTEXT *pContext, uint32_t apiThreadId)
793 {
794 if (nullptr == pContext)
795 {
796 return;
797 }
798
799 if (apiThreadId >= pContext->threadPool.numReservedThreads)
800 {
801 if (pContext->threadPool.numReservedThreads)
802 {
803 const THREAD_DATA &threadData = pContext->threadPool.pApiThreadData[0];
804 // Just bind to the process group used for API thread 0
805 bindThread(pContext, 0, threadData.procGroupId, true);
806 }
807 return;
808 }
809
810 const THREAD_DATA &threadData = pContext->threadPool.pApiThreadData[apiThreadId];
811
812 bindThread(pContext, threadData.threadId, threadData.procGroupId, threadData.forceBindProcGroup);
813 }
814
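//////////////////////////////////////////////////////////////////////////
/// @brief Worker thread main loop: binds the thread, names it, then alternates
///        between BE/compute work and FE work until a SHUTDOWN work item has
///        been seen and no BE work remains. Sleeps on FifosNotEmpty when idle.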
815 template<bool IsFEThread, bool IsBEThread>
816 DWORD workerThreadMain(LPVOID pData)
817 {
818 THREAD_DATA *pThreadData = (THREAD_DATA*)pData;
819 SWR_CONTEXT *pContext = pThreadData->pContext;
820 uint32_t threadId = pThreadData->threadId;
821 uint32_t workerId = pThreadData->workerId;
822
823 bindThread(pContext, threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup);
824
825 {
826 char threadName[64];
827 sprintf_s(threadName,
828 #if defined(_WIN32)
829 "SWRWorker_%02d_NUMA%d_Core%02d_T%d",
830 #else
831 // Linux pthread names are limited to 16 chars (including \0)
832 "w%03d-n%d-c%03d-t%d",
833 #endif
834 workerId, pThreadData->numaId, pThreadData->coreId, pThreadData->htId);
835 SetCurrentThreadName(threadName);
836 }
837
838 RDTSC_INIT(threadId);
839
840 // Only need the numa index offset from the base node for correct masking
841 uint32_t numaNode = pThreadData->numaId - pContext->threadInfo.BASE_NUMA_NODE;
842 uint32_t numaMask = pContext->threadPool.numaMask;
843
844 // flush denormals to 0
845 _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
846
847 // Track tiles locked by other threads. If we try to lock a macrotile and find it's already
848 // locked, then we'll add it to this list so that we don't try to lock it again.
849 TileSet lockedTiles;
850
851 // Each worker has the ability to work on any of the queued draws as long as certain
852 // conditions are met. The data associated
853 // with a draw is guaranteed to be active as long as a worker hasn't signaled that he
854 // has moved on to the next draw when he determines there is no more work to do. The API
855 // thread will not increment the head of the dc ring until all workers have moved past the
856 // current head.
857 // The logic to determine what to work on is:
858 // 1- Try to work on the FE of any draw that is queued. For now there are no dependencies
859 //    on the FE work, so any worker can grab any FE and process in parallel. Eventually
860 //    we'll need dependency tracking to force serialization on FEs. The worker will try
861 //    to pick an FE by atomically incrementing a counter in the swr context. He'll keep
862 //    trying until he reaches the tail.
863 // 2- BE work must be done in strict order. We accomplish this today by pulling work off
864 //    the oldest draw (i.e. the head) of the dcRing. The worker can determine if there is
865 //    any work left by comparing the total # of binned work items and the total # of completed
866 //    work items. If they are equal, then there is no more work to do for this draw, and
867 //    the worker can safely increment its oldestDraw counter and move on to the next draw.
868 std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);
869
870 auto threadHasWork = [&](uint32_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };
871
872 uint32_t curDrawBE = 0;
873 uint32_t curDrawFE = 0;
874
875 bool bShutdown = false;
876
877 while (true)
878 {
879 if (bShutdown && !threadHasWork(curDrawBE))
880 {
881 break;
882 }
883
884 uint32_t loop = 0;
885 while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && !threadHasWork(curDrawBE))
886 {
887 _mm_pause();
888 }
889
890 if (!threadHasWork(curDrawBE))
891 {
892 lock.lock();
893
894 // check for thread idle condition again under lock
895 if (threadHasWork(curDrawBE))
896 {
897 lock.unlock();
898 continue;
899 }
900
901 pContext->FifosNotEmpty.wait(lock);
902 lock.unlock();
903 }
904
905 if (IsBEThread)
906 {
907 RDTSC_BEGIN(WorkerWorkOnFifoBE, 0);
908 bShutdown |= WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
909 RDTSC_END(WorkerWorkOnFifoBE, 0);
910
911 WorkOnCompute(pContext, workerId, curDrawBE);
912 }
913
914 if (IsFEThread)
915 {
916 WorkOnFifoFE(pContext, workerId, curDrawFE);
917
918 if (!IsBEThread)
919 {
920 curDrawBE = curDrawFE;
921 }
922 }
923 }
924
925 return 0;
926 }
927 template<> DWORD workerThreadMain<false, false>(LPVOID) = delete;
928
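//////////////////////////////////////////////////////////////////////////
/// @brief Thread entry point: wraps workerThreadMain in structured exception
///        handling on Windows so unhandled exceptions propagate outward
///        (EXCEPTION_CONTINUE_SEARCH).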
929 template <bool IsFEThread, bool IsBEThread>
930 DWORD workerThreadInit(LPVOID pData)
931 {
932 #if defined(_WIN32)
933 __try
934 #endif // _WIN32
935 {
936 return workerThreadMain<IsFEThread, IsBEThread>(pData);
937 }
938
939 #if defined(_WIN32)
940 __except(EXCEPTION_CONTINUE_SEARCH)
941 {
942 }
943
944 #endif // _WIN32
945
946 return 1;
947 }
948 template<> DWORD workerThreadInit<false, false>(LPVOID pData) = delete;
949
950 static void InitPerThreadStats(SWR_CONTEXT* pContext, uint32_t numThreads)
951 {
952 // Initialize DRAW_CONTEXT's per-thread stats
953 for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
954 {
955 pContext->dcRing[dc].dynState.pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * numThreads, 64);
956 memset(pContext->dcRing[dc].dynState.pStats, 0, sizeof(SWR_STATS) * numThreads);
957 }
958 }
959
960 //////////////////////////////////////////////////////////////////////////
961 /// @brief Creates thread pool info but doesn't launch threads.
962 /// @param pContext - pointer to context
963 /// @param pPool - pointer to thread pool object.
964 void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
965 {
966 CPUNumaNodes nodes;
967 uint32_t numThreadsPerProcGroup = 0;
968 CalculateProcessorTopology(nodes, numThreadsPerProcGroup);
969
970 // Assumption: for asymmetric topologies, multi-threaded cores will appear
971 // in the list before single-threaded cores. This appears to be true for
972 // Windows when the total HW threads is limited to 64.
973 uint32_t numHWNodes = (uint32_t)nodes.size();
974 uint32_t numHWCoresPerNode = (uint32_t)nodes[0].cores.size();
975 uint32_t numHWHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size();
976
977 #if defined(_WIN32) && !defined(_WIN64)
978 if (!pContext->threadInfo.MAX_WORKER_THREADS)
979 {
980 // Limit 32-bit windows to bindable HW threads only
981 if ((numHWCoresPerNode * numHWHyperThreads) > 32)
982 {
983 numHWCoresPerNode = 32 / numHWHyperThreads;
984 }
985 }
986 #endif
987
988 // Calculate num HW threads. Due to asymmetric topologies, this is not
989 // a trivial multiplication.
990 uint32_t numHWThreads = 0;
991 for (auto const& node : nodes)
992 {
993 for (auto const& core : node.cores)
994 {
995 numHWThreads += (uint32_t)core.threadIds.size();
996 }
997 }
998
999 uint32_t numNodes = numHWNodes;
1000 uint32_t numCoresPerNode = numHWCoresPerNode;
1001 uint32_t numHyperThreads = numHWHyperThreads;
1002
1003 // Calc used threads per-core
1004 if (numHyperThreads > pContext->threadInfo.BASE_THREAD)
1005 {
1006 numHyperThreads -= pContext->threadInfo.BASE_THREAD;
1007 }
1008 else
1009 {
1010 SWR_ASSERT(
1011 false,
1012 "Cannot use BASE_THREAD value: %d, maxThreads: %d, reverting BASE_THREAD to 0",
1013 pContext->threadInfo.BASE_THREAD,
1014 numHyperThreads);
1015 pContext->threadInfo.BASE_THREAD = 0;
1016 }
1017
1018 if (pContext->threadInfo.MAX_THREADS_PER_CORE)
1019 {
1020 numHyperThreads = std::min(numHyperThreads, pContext->threadInfo.MAX_THREADS_PER_CORE);
1021 }
1022
1023 // Prune any cores that don't support the number of threads
1024 if (numHyperThreads > 1)
1025 {
1026 for (auto& node : nodes)
1027 {
1028 uint32_t numUsableCores = 0;
1029 for (auto& core : node.cores)
1030 {
1031 numUsableCores += (core.threadIds.size() >= numHyperThreads);
1032 }
1033 numCoresPerNode = std::min(numCoresPerNode, numUsableCores);
1034 }
1035 }
1036
1037 // Calc used cores per NUMA node
1038 if (numCoresPerNode > pContext->threadInfo.BASE_CORE)
1039 {
1040 numCoresPerNode -= pContext->threadInfo.BASE_CORE;
1041 }
1042 else
1043 {
1044 SWR_ASSERT(
1045 false,
1046 "Cannot use BASE_CORE value: %d, maxCores: %d, reverting BASE_CORE to 0",
1047 pContext->threadInfo.BASE_CORE,
1048 numCoresPerNode);
1049 pContext->threadInfo.BASE_CORE = 0;
1050 }
1051
1052 if (pContext->threadInfo.MAX_CORES_PER_NUMA_NODE)
1053 {
1054 numCoresPerNode = std::min(numCoresPerNode, pContext->threadInfo.MAX_CORES_PER_NUMA_NODE);
1055 }
1056
1057 // Calc used NUMA nodes
1058 if (numNodes > pContext->threadInfo.BASE_NUMA_NODE)
1059 {
1060 numNodes -= pContext->threadInfo.BASE_NUMA_NODE;
1061 }
1062 else
1063 {
1064 SWR_ASSERT(
1065 false,
1066 "Cannot use BASE_NUMA_NODE value: %d, maxNodes: %d, reverting BASE_NUMA_NODE to 0",
1067 pContext->threadInfo.BASE_NUMA_NODE,
1068 numNodes);
1069 pContext->threadInfo.BASE_NUMA_NODE = 0;
1070 }
1071
1072 if (pContext->threadInfo.MAX_NUMA_NODES)
1073 {
1074 numNodes = std::min(numNodes, pContext->threadInfo.MAX_NUMA_NODES);
1075 }
1076
1077 // Calculate numThreads - at this point everything should be symmetric
1078 uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
1079 SWR_REL_ASSERT(numThreads <= numHWThreads);
1080
1081 uint32_t& numAPIReservedThreads = pContext->apiThreadInfo.numAPIReservedThreads;
1082 uint32_t& numAPIThreadsPerCore = pContext->apiThreadInfo.numAPIThreadsPerCore;
1083 uint32_t numRemovedThreads = 0;
1084
1085 if (pContext->threadInfo.SINGLE_THREADED)
1086 {
1087 numAPIReservedThreads = 0;
1088 numThreads = 1;
1089 pContext->NumWorkerThreads = 1;
1090 pContext->NumFEThreads = 1;
1091 pContext->NumBEThreads = 1;
1092 pPool->numThreads = 0;
1093 }
1094 else if (pContext->threadInfo.MAX_WORKER_THREADS)
1095 {
1096 numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, numHWThreads);
1097 pContext->threadInfo.BASE_NUMA_NODE = 0;
1098 pContext->threadInfo.BASE_CORE = 0;
1099 pContext->threadInfo.BASE_THREAD = 0;
1100 numAPIReservedThreads = 0;
1101 }
1102 else
1103 {
1104 if (numAPIReservedThreads >= numThreads)
1105 {
1106 numAPIReservedThreads = 0;
1107 }
1108 else if (numAPIReservedThreads)
1109 {
1110 numAPIThreadsPerCore = std::min(numAPIThreadsPerCore, numHWHyperThreads);
1111
1112 if (0 == numAPIThreadsPerCore)
1113 {
1114 numAPIThreadsPerCore = numHWHyperThreads;
1115 }
1116
1117 numRemovedThreads = numAPIReservedThreads;
1118 if (numAPIThreadsPerCore == 2 && numHyperThreads == 1)
1119 {
1120 // Adjust removed threads to make logic below work
1121 numRemovedThreads = std::max(1U, (numRemovedThreads + numAPIThreadsPerCore - 1) / 2);
1122 }
1123
1124 numThreads -= numRemovedThreads;
1125 }
1126 }
1127
1128 InitPerThreadStats(pContext, numThreads);
1129
1130 if (pContext->threadInfo.SINGLE_THREADED)
1131 {
1132 return;
1133 }
1134
1135 if (numAPIReservedThreads)
1136 {
1137 pPool->pApiThreadData = new (std::nothrow) THREAD_DATA[numAPIReservedThreads];
1138 SWR_ASSERT(pPool->pApiThreadData);
1139 if (!pPool->pApiThreadData)
1140 {
1141 numAPIReservedThreads = 0;
1142 }
1143 }
1144 pPool->numReservedThreads = numAPIReservedThreads;
1145
1146 pPool->numThreads = numThreads;
1147 pContext->NumWorkerThreads = pPool->numThreads;
1148
1149 pPool->pThreadData = new (std::nothrow) THREAD_DATA[pPool->numThreads];
1150 SWR_ASSERT(pPool->pThreadData);
1151 pPool->numaMask = 0;
1152
1153
1154 pPool->pThreads = new (std::nothrow) THREAD_PTR[pPool->numThreads];
1155 SWR_ASSERT(pPool->pThreads);
1156
1157 if (pContext->threadInfo.MAX_WORKER_THREADS)
1158 {
1159 bool bForceBindProcGroup = (numThreads > numThreadsPerProcGroup);
1160 uint32_t numProcGroups = (numThreads + numThreadsPerProcGroup - 1) / numThreadsPerProcGroup;
1161 // When MAX_WORKER_THREADS is set, we don't bother to bind to specific HW threads,
1162 // but Windows will still require binding to specific process groups.
1163 for (uint32_t workerId = 0; workerId < numThreads; ++workerId)
1164 {
1165 pPool->pThreadData[workerId].workerId = workerId;
1166 pPool->pThreadData[workerId].procGroupId = workerId % numProcGroups;
1167 pPool->pThreadData[workerId].threadId = 0;
1168 pPool->pThreadData[workerId].numaId = 0;
1169 pPool->pThreadData[workerId].coreId = 0;
1170 pPool->pThreadData[workerId].htId = 0;
1171 pPool->pThreadData[workerId].pContext = pContext;
1172 pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup;
1173
1174 pContext->NumBEThreads++;
1175 pContext->NumFEThreads++;
1176 }
1177 }
1178 else
1179 {
1180 // numa distribution assumes workers on all nodes
1181 bool useNuma = true;
1182 if (numCoresPerNode * numHyperThreads == 1)
1183 {
1184 useNuma = false;
1185 }
1186
1187 if (useNuma)
1188 {
1189 pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes (1, 2, 4, etc.)
1190 }
1191 else
1192 {
1193 pPool->numaMask = 0;
1194 }
1195
1196 uint32_t workerId = 0;
1197 uint32_t numReservedThreads = numAPIReservedThreads;
1198 for (uint32_t n = 0; n < numNodes; ++n)
1199 {
1200 if ((n + pContext->threadInfo.BASE_NUMA_NODE) >= nodes.size())
1201 {
1202 break;
1203 }
1204 auto& node = nodes[n + pContext->threadInfo.BASE_NUMA_NODE];
1205 uint32_t numCores = numCoresPerNode;
1206 for (uint32_t c = 0; c < numCores; ++c)
1207 {
1208 if ((c + pContext->threadInfo.BASE_CORE) >= node.cores.size())
1209 {
1210 break;
1211 }
1212
1213 auto& core = node.cores[c + pContext->threadInfo.BASE_CORE];
1214 for (uint32_t t = 0; t < numHyperThreads; ++t)
1215 {
1216 if ((t + pContext->threadInfo.BASE_THREAD) >= core.threadIds.size())
1217 {
1218 break;
1219 }
1220
1221 if (numRemovedThreads)
1222 {
1223 --numRemovedThreads;
1224 SWR_REL_ASSERT(numReservedThreads);
1225 --numReservedThreads;
1226 pPool->pApiThreadData[numReservedThreads].workerId = 0xFFFFFFFFU;
1227 pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
1228 pPool->pApiThreadData[numReservedThreads].threadId = core.threadIds[t];
1229 pPool->pApiThreadData[numReservedThreads].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
1230 pPool->pApiThreadData[numReservedThreads].coreId = c + pContext->threadInfo.BASE_CORE;
1231 pPool->pApiThreadData[numReservedThreads].htId = t + pContext->threadInfo.BASE_THREAD;
1232 pPool->pApiThreadData[numReservedThreads].pContext = pContext;
1233 pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;
1234
1235
1236 if (numAPIThreadsPerCore > numHyperThreads && numReservedThreads)
1237 {
1238 --numReservedThreads;
1239 pPool->pApiThreadData[numReservedThreads].workerId = 0xFFFFFFFFU;
1240 pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
1241 pPool->pApiThreadData[numReservedThreads].threadId = core.threadIds[t + 1];
1242 pPool->pApiThreadData[numReservedThreads].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
1243 pPool->pApiThreadData[numReservedThreads].coreId = c + pContext->threadInfo.BASE_CORE;
1244 pPool->pApiThreadData[numReservedThreads].htId = t + pContext->threadInfo.BASE_THREAD;
1245 pPool->pApiThreadData[numReservedThreads].pContext = pContext;
1246 pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;
1247 }
1248
1249 continue;
1250 }
1251
1252 SWR_ASSERT(workerId < numThreads);
1253
1254 pPool->pThreadData[workerId].workerId = workerId;
1255 pPool->pThreadData[workerId].procGroupId = core.procGroup;
1256 pPool->pThreadData[workerId].threadId = core.threadIds[t + pContext->threadInfo.BASE_THREAD];
1257 pPool->pThreadData[workerId].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
1258 pPool->pThreadData[workerId].coreId = c + pContext->threadInfo.BASE_CORE;
1259 pPool->pThreadData[workerId].htId = t + pContext->threadInfo.BASE_THREAD;
1260 pPool->pThreadData[workerId].pContext = pContext;
1261 pPool->pThreadData[workerId].forceBindProcGroup = false;
1262
1263 pContext->NumBEThreads++;
1264 pContext->NumFEThreads++;
1265
1266 ++workerId;
1267 }
1268 }
1269 }
1270 SWR_ASSERT(workerId == pContext->NumWorkerThreads);
1271 }
1272 }
1273
1274 //////////////////////////////////////////////////////////////////////////
1275 /// @brief Launches worker threads in thread pool.
1276 /// @param pContext - pointer to context
1277 /// @param pPool - pointer to thread pool object.
1278 void StartThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
1279 {
1280 if (pContext->threadInfo.SINGLE_THREADED)
1281 {
1282 return;
1283 }
1284
1285 for (uint32_t workerId = 0; workerId < pContext->NumWorkerThreads; ++workerId)
1286 {
1287 pPool->pThreads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
1288 }
1289 }
1290
1291 //////////////////////////////////////////////////////////////////////////
1292 /// @brief Destroys thread pool.
1293 /// @param pContext - pointer to context
1294 /// @param pPool - pointer to thread pool object.
1295 void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
1296 {
1297 if (!pContext->threadInfo.SINGLE_THREADED)
1298 {
1299 // Wait for all threads to finish
1300 SwrWaitForIdle(pContext);
1301
1302 // Wait for threads to finish and destroy them
1303 for (uint32_t t = 0; t < pPool->numThreads; ++t)
1304 {
1305 // Detach from thread. Cannot join() due to possibility (in Windows) of code
1306 // in some DLLMain (THREAD_DETACH case) blocking the thread until after this returns.
1307 pPool->pThreads[t]->detach();
1308 delete(pPool->pThreads[t]);
1309 }
1310
1311 delete[] pPool->pThreads;
1312
1313 // Clean up data used by threads
1314 delete[] pPool->pThreadData;
1315 delete[] pPool->pApiThreadData;
1316 }
1317 }