pContext->pMacroTileManagerArray = (MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
pContext->pDispatchQueueArray = (DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
+ for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc) // one pass per ring slot; the two arrays above were sized with the same constant
+ {
+ pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator); // created first: the MacroTileMgr on the next line is constructed from this arena
+ new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena); // placement-new: the storage came from AlignedMalloc, so no constructor has run yet
+ new (&pContext->pDispatchQueueArray[dc]) DispatchQueue(); // placement-new into the AlignedMalloc'd DispatchQueue array
+
+ pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator); // separate arena for the dsRing entry at the same index, built on the same allocator
+ }
+
pContext->threadInfo.MAX_WORKER_THREADS = KNOB_MAX_WORKER_THREADS;
pContext->threadInfo.MAX_NUMA_NODES = KNOB_MAX_NUMA_NODES;
pContext->threadInfo.MAX_CORES_PER_NUMA_NODE = KNOB_MAX_CORES_PER_NUMA_NODE;
pContext->threadInfo = *pCreateInfo->pThreadInfo;
}
- for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
- {
- pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
- new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena);
- new (&pContext->pDispatchQueueArray[dc]) DispatchQueue();
-
- pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
- }
-
- if (!pContext->threadInfo.SINGLE_THREADED)
- {
- memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
- memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
- new (&pContext->WaitLock) std::mutex();
- new (&pContext->FifosNotEmpty) std::condition_variable();
+ memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
+ memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
+ new (&pContext->WaitLock) std::mutex();
+ new (&pContext->FifosNotEmpty) std::condition_variable();
- CreateThreadPool(pContext, &pContext->threadPool);
- }
+ CreateThreadPool(pContext, &pContext->threadPool);
// Calling createThreadPool() above can set SINGLE_THREADED
if (pContext->threadInfo.SINGLE_THREADED)
pContext->NumBEThreads = 1;
}
+ pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads]; // one scratch pointer per worker; every slot is assigned by the allocation loop that follows
+ pContext->pStats = new SWR_STATS[pContext->NumWorkerThreads](); // trailing () value-initializes the counters; plain new[] would leave them indeterminate for a trivially-constructible SWR_STATS
+
// Allocate scratch space for workers.
///@note We could lazily allocate this but its rather small amount of memory.
for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
#if defined(_WIN32)
uint32_t numaNode = pContext->threadPool.pThreadData ?
pContext->threadPool.pThreadData[i].numaId : 0;
- pContext->pScratch[i] = (uint8_t*)VirtualAllocExNuma(
+ pContext->ppScratch[i] = (uint8_t*)VirtualAllocExNuma(
GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE),
MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE,
numaNode);
#else
- pContext->pScratch[i] = (uint8_t*)AlignedMalloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4);
+ pContext->ppScratch[i] = (uint8_t*)AlignedMalloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4);
#endif
}
// free the fifos
for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i)
{
+ delete [] pContext->dcRing[i].dynState.pStats;
delete pContext->dcRing[i].pArena;
delete pContext->dsRing[i].pArena;
pContext->pMacroTileManagerArray[i].~MacroTileMgr();
for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
{
#if defined(_WIN32)
- VirtualFree(pContext->pScratch[i], 0, MEM_RELEASE);
+ VirtualFree(pContext->ppScratch[i], 0, MEM_RELEASE);
#else
- AlignedFree(pContext->pScratch[i]);
+ AlignedFree(pContext->ppScratch[i]);
#endif
}
+ delete [] pContext->ppScratch;
+ delete [] pContext->pStats;
+
delete(pContext->pHotTileMgr);
pContext->~SWR_CONTEXT();
pCurDrawContext->threadsDone = 0;
pCurDrawContext->retireCallback.pfnCallbackFunc = nullptr;
- memset(&pCurDrawContext->dynState, 0, sizeof(pCurDrawContext->dynState));
+ pCurDrawContext->dynState.Reset(pContext->threadPool.numThreads);
// Assign unique drawId for this DC
pCurDrawContext->drawId = pContext->dcRing.GetHead();
csContext.dispatchDims[0] = pTaskData->threadGroupCountX;
csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
- csContext.pTGSM = pContext->pScratch[workerId];
+ csContext.pTGSM = pContext->ppScratch[workerId];
csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
state.pfnCsFunc(GetPrivateState(pDC), &csContext);
struct DRAW_DYNAMIC_STATE
{
+ void Reset(uint32_t numThreads) // zeroes all members and the pStats array but keeps the pStats pointer; numThreads == 0 is treated as 1
+ {
+ SWR_STATS* pSavePtr = pStats; // stash the array pointer: the memset on the next line would wipe it
+ memset(this, 0, sizeof(*this)); // clears every member, pStats included
+ pStats = pSavePtr; // put the saved pointer back
+ memset(pStats, 0, sizeof(SWR_STATS) * (numThreads ? numThreads : 1)); // NOTE(review): assumes pStats is non-null and holds at least max(numThreads, 1) entries - confirm against the allocation site
+ }
///@todo Currently assumes only a single FE can do stream output for a draw.
uint32_t SoWriteOffset[4];
bool SoWriteOffsetDirty[4];
SWR_STATS_FE statsFE; // Only one FE thread per DC.
- SWR_STATS stats[KNOB_MAX_NUM_THREADS];
+ SWR_STATS* pStats; // per-worker array (UPDATE_STAT indexes it with workerId); this struct declares no destructor, so it is released elsewhere
};
// Draw Context
PFN_UPDATE_STATS_FE pfnUpdateStatsFE;
// Global Stats
- SWR_STATS stats[KNOB_MAX_NUM_THREADS];
+ SWR_STATS* pStats;
// Scratch space for workers.
- uint8_t* pScratch[KNOB_MAX_NUM_THREADS];
+ uint8_t** ppScratch;
volatile int32_t drawsOutstandingFE;
TileSet singleThreadLockedTiles;
};
-#define UPDATE_STAT(name, count) if (GetApiState(pDC).enableStats) { pDC->dynState.stats[workerId].name += count; }
+#define UPDATE_STAT(name, count) if (GetApiState(pDC).enableStats) { pDC->dynState.pStats[workerId].name += count; }
#define UPDATE_STAT_FE(name, count) if (GetApiState(pDC).enableStats) { pDC->dynState.statsFE.name += count; }
///////////////////////////////////////////////////////////////////////////////
// Configuration knobs
///////////////////////////////////////////////////////////////////////////////
-#define KNOB_MAX_NUM_THREADS 256 // Supports up to dual-HSW-Xeon.
-
// Maximum supported number of active vertex buffer streams
#define KNOB_NUM_STREAMS 32
static std::mutex m;
std::lock_guard<std::mutex> l(m);
- static SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer[KNOB_MAX_NUM_THREADS];
- DWORD bufSize = sizeof(buffer);
+ DWORD bufSize = 0; // starts at zero so the first query below acts purely as a sizing call
- BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, buffer, &bufSize);
+ BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &bufSize); // sizing call: no buffer is supplied, bufSize is passed by address
+ SWR_ASSERT(ret == FALSE && GetLastError() == ERROR_INSUFFICIENT_BUFFER); // the sizing call is expected to fail in exactly this way
+
+ PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBufferMem = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(bufSize); // heap buffer of bufSize bytes replaces the fixed-size static array
+ SWR_ASSERT(pBufferMem); // NOTE(review): allocation failure is only caught by this assert - confirm SWR_ASSERT is active in release builds
+
+ ret = GetLogicalProcessorInformationEx(RelationProcessorCore, pBufferMem, &bufSize); // real query into the allocated buffer
SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information");
- uint32_t count = bufSize / buffer->Size;
- PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = buffer;
+ uint32_t count = bufSize / pBufferMem->Size;
+ PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = pBufferMem;
for (uint32_t i = 0; i < count; ++i)
{
pBuffer = PtrAdd(pBuffer, pBuffer->Size);
}
+ free(pBufferMem);
+
#elif defined(__linux__) || defined (__gnu_linux__)
// Sum up stats across all workers before sending to client.
for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
{
- stats.DepthPassCount += dynState.stats[i].DepthPassCount;
+ stats.DepthPassCount += dynState.pStats[i].DepthPassCount;
- stats.PsInvocations += dynState.stats[i].PsInvocations;
- stats.CsInvocations += dynState.stats[i].CsInvocations;
+ stats.PsInvocations += dynState.pStats[i].PsInvocations;
+ stats.CsInvocations += dynState.pStats[i].CsInvocations;
}
pContext->pfnUpdateStats(GetPrivateState(pDC), &stats);
numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, maxHWThreads);
}
- if (numThreads > KNOB_MAX_NUM_THREADS)
- {
- printf("WARNING: system thread count %u exceeds max %u, "
- "performance will be degraded\n",
- numThreads, KNOB_MAX_NUM_THREADS);
- }
-
uint32_t numAPIReservedThreads = 1;
else
{
pPool->numThreads = 0;
- SET_KNOB(SINGLE_THREADED, true);
- return;
+ numThreads = 1;
+ pContext->threadInfo.SINGLE_THREADED = true;
}
}
else
}
}
+ // Initialize DRAW_CONTEXT's per-thread stats
+ for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
+ {
+ pContext->dcRing[dc].dynState.pStats = new SWR_STATS[numThreads]; // numThreads entries; the fallback branch above that sets SINGLE_THREADED also sets numThreads = 1
+ memset(pContext->dcRing[dc].dynState.pStats, 0, sizeof(SWR_STATS) * numThreads); // explicit zero-fill of the freshly allocated array
+ }
+
+ if (pContext->threadInfo.SINGLE_THREADED) // tested only after the stats arrays exist, so the single-threaded path gets them too
+ {
+ return; // leaves before the thread-pool fields and worker threads below are set up
+ }
+
+
pPool->numThreads = numThreads;
pContext->NumWorkerThreads = pPool->numThreads;
pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA));
pPool->numaMask = 0;
+ pPool->pThreads = new THREAD_PTR[pPool->numThreads];
+
if (pContext->threadInfo.MAX_WORKER_THREADS)
{
bool bForceBindProcGroup = (numThreads > numThreadsPerProcGroup);
pPool->pThreadData[workerId].htId = 0;
pPool->pThreadData[workerId].pContext = pContext;
pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup;
- pPool->threads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
+ pPool->pThreads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
pContext->NumBEThreads++;
pContext->NumFEThreads++;
pPool->pThreadData[workerId].htId = t;
pPool->pThreadData[workerId].pContext = pContext;
- pPool->threads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
+ pPool->pThreads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
pContext->NumBEThreads++;
pContext->NumFEThreads++;
// Wait for threads to finish and destroy them
for (uint32_t t = 0; t < pPool->numThreads; ++t)
{
- pPool->threads[t]->join();
- delete(pPool->threads[t]);
+ pPool->pThreads[t]->join();
+ delete(pPool->pThreads[t]);
}
+ delete [] pPool->pThreads;
+
// Clean up data used by threads
free(pPool->pThreadData);
}
struct THREAD_POOL
{
- THREAD_PTR threads[KNOB_MAX_NUM_THREADS];
+ THREAD_PTR* pThreads;
uint32_t numThreads;
uint32_t numaMask;
volatile bool inThreadShutdown;