gallium/swr: add OpenSWR rasterizer
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / threads.cpp
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/

#include <stdio.h>
#include <thread>
#include <algorithm>
#include <unordered_set>
#include <float.h>
#include <vector>
#include <utility>
#include <fstream>
#include <string>

#if defined(__linux__) || defined(__gnu_linux__)
#include <pthread.h>
#include <sched.h>
#include <unistd.h>
#endif

#include "common/os.h"
#include "context.h"
#include "frontend.h"
#include "backend.h"
#include "rasterizer.h"
#include "rdtsc_core.h"
#include "tilemgr.h"
#include "core/multisample.h"

// Processor topology: each Core records its processor group and the IDs of
// the logical threads (HW threads) that run on it; a NumaNode is the set of
// cores on one NUMA node.
struct Core
{
    uint32_t procGroup = 0;
    std::vector<uint32_t> threadIds;
};

struct NumaNode
{
    std::vector<Core> cores;
};

typedef std::vector<NumaNode> CPUNumaNodes;

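//////////////////////////////////////////////////////////////////////////
/// @brief Query the host's NUMA node / core / HW thread topology and report
///        how many threads belong to processor group 0. Uses
///        GetLogicalProcessorInformationEx on Windows; on Linux it parses
///        the "processor", "core id", and "physical id" fields of
///        /proc/cpuinfo.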
void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThreadsPerProcGroup)
{
    out_nodes.clear();
    out_numThreadsPerProcGroup = 0;

#if defined(_WIN32)

    SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer[KNOB_MAX_NUM_THREADS];
    DWORD bufSize = sizeof(buffer);

    BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, buffer, &bufSize);
    SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information");

    uint32_t count = bufSize / buffer->Size;
    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = buffer;

    for (uint32_t i = 0; i < count; ++i)
    {
        SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore);
        for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g)
        {
            auto& gmask = pBuffer->Processor.GroupMask[g];
            uint32_t threadId = 0;
            uint32_t procGroup = gmask.Group;

            Core* pCore = nullptr;

            uint32_t numThreads = (uint32_t)_mm_popcount_sizeT(gmask.Mask);

            while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask))
            {
                // clear mask
                gmask.Mask &= ~(KAFFINITY(1) << threadId);

                // Find Numa Node
                PROCESSOR_NUMBER procNum = {};
                procNum.Group = WORD(procGroup);
                procNum.Number = UCHAR(threadId);

                uint32_t numaId = 0;
                ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId);
                SWR_ASSERT(ret);

                // Store data
                if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1);
                auto& numaNode = out_nodes[numaId];

                uint32_t coreId = 0;

                if (nullptr == pCore)
                {
                    numaNode.cores.push_back(Core());
                    pCore = &numaNode.cores.back();
                    pCore->procGroup = procGroup;
#if !defined(_WIN64)
                    coreId = (uint32_t)numaNode.cores.size();
                    if ((coreId * numThreads) >= 32)
                    {
                        // Windows doesn't return threadIds >= 32 for a processor group correctly
                        // when running a 32-bit application.
                        // Just save -1 as the threadId
                        threadId = uint32_t(-1);
                    }
#endif
                }
                pCore->threadIds.push_back(threadId);
                if (procGroup == 0)
                {
                    out_numThreadsPerProcGroup++;
                }
            }
        }
        pBuffer = PtrAdd(pBuffer, pBuffer->Size);
    }

#elif defined(__linux__) || defined (__gnu_linux__)

    // Parse /proc/cpuinfo to get full topology
    std::ifstream input("/proc/cpuinfo");
    std::string line;
    char* c;
    uint32_t threadId = uint32_t(-1);
    uint32_t coreId = uint32_t(-1);
    uint32_t numaId = uint32_t(-1);

    while (std::getline(input, line))
    {
        if (line.find("processor") != std::string::npos)
        {
            if (threadId != uint32_t(-1))
            {
                // Save information.
                if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1);
                auto& numaNode = out_nodes[numaId];
                if (numaNode.cores.size() <= coreId) numaNode.cores.resize(coreId + 1);
                auto& core = numaNode.cores[coreId];

                core.procGroup = coreId;
                core.threadIds.push_back(threadId);

                out_numThreadsPerProcGroup++;
            }

            auto data_start = line.find(": ") + 2;
            threadId = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
        if (line.find("core id") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            coreId = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
        if (line.find("physical id") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            numaId = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
    }

    if (threadId != uint32_t(-1))
    {
        // Save information.
        if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1);
        auto& numaNode = out_nodes[numaId];
        if (numaNode.cores.size() <= coreId) numaNode.cores.resize(coreId + 1);
        auto& core = numaNode.cores[coreId];

        core.procGroup = coreId;
        core.threadIds.push_back(threadId);
        out_numThreadsPerProcGroup++;
    }

    // Prune any cores that ended up with no threads assigned.
    for (uint32_t node = 0; node < out_nodes.size(); node++) {
        auto& numaNode = out_nodes[node];
        auto it = numaNode.cores.begin();
        for ( ; it != numaNode.cores.end(); ) {
            if (it->threadIds.size() == 0)
                it = numaNode.cores.erase(it);  // use the returned iterator; erase() invalidates 'it'
            else
                ++it;
        }
    }

#else

#error Unsupported platform

#endif
}

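//////////////////////////////////////////////////////////////////////////
/// @brief Bind the calling thread to a specific HW thread (or, when forced,
///        to an entire processor group) via SetThreadGroupAffinity on
///        Windows or pthread_setaffinity_np on Linux.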
void bindThread(uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup = false)
{
    // Only bind threads when MAX_WORKER_THREADS isn't set.
    if (KNOB_MAX_WORKER_THREADS && bindProcGroup == false)
    {
        return;
    }

#if defined(_WIN32)
    {
        GROUP_AFFINITY affinity = {};
        affinity.Group = procGroupId;

#if !defined(_WIN64)
        if (threadId >= 32)
        {
            // In a 32-bit process on Windows it is impossible to bind
            // to logical processors 32-63 within a processor group.
            // In this case set the mask to 0 and let the system assign
            // the processor. Hopefully it will make smart choices.
            affinity.Mask = 0;
        }
        else
#endif
        {
            // If KNOB_MAX_WORKER_THREADS is set, only bind to the proc group,
            // not the individual HW thread.
            if (!KNOB_MAX_WORKER_THREADS)
            {
                affinity.Mask = KAFFINITY(1) << threadId;
            }
        }

        SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr);
    }
#else
    cpu_set_t cpuset;
    pthread_t thread = pthread_self();
    CPU_ZERO(&cpuset);
    CPU_SET(threadId, &cpuset);

    pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
#endif
}

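//////////////////////////////////////////////////////////////////////////
/// @brief Snapshot of the ID of the most recently enqueued draw.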
INLINE
uint64_t GetEnqueuedDraw(SWR_CONTEXT *pContext)
{
    //uint64_t result = _InterlockedCompareExchange64((volatile __int64*)&pContext->DrawEnqueued, 0, 0);
    //return result;
    return pContext->DrawEnqueued;
}

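//////////////////////////////////////////////////////////////////////////
/// @brief Map a 1-based draw ID to its slot in the draw-context ring,
///        e.g. drawId 1 maps to dcRing[0] and the mapping wraps every
///        KNOB_MAX_DRAWS_IN_FLIGHT draws.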
INLINE
DRAW_CONTEXT *GetDC(SWR_CONTEXT *pContext, uint64_t drawId)
{
    return &pContext->dcRing[(drawId-1) % KNOB_MAX_DRAWS_IN_FLIGHT];
}

// returns true if dependency not met
INLINE
bool CheckDependency(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint64_t lastRetiredDraw)
{
    return (pDC->dependency > lastRetiredDraw);
}

void ClearColorHotTile(const HOTTILE* pHotTile)  // clear a macro tile from float4 clear data.
{
    // Load clear color into SIMD registers...
    float *pClearData = (float*)(pHotTile->clearData);
    simdscalar valR = _simd_broadcast_ss(&pClearData[0]);
    simdscalar valG = _simd_broadcast_ss(&pClearData[1]);
    simdscalar valB = _simd_broadcast_ss(&pClearData[2]);
    simdscalar valA = _simd_broadcast_ss(&pClearData[3]);

    float *pfBuf = (float*)pHotTile->pBuffer;
    uint32_t numSamples = pHotTile->numSamples;

    for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
    {
        for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
        {
            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM)
            {
                _simd_store_ps(pfBuf, valR);
                pfBuf += KNOB_SIMD_WIDTH;
                _simd_store_ps(pfBuf, valG);
                pfBuf += KNOB_SIMD_WIDTH;
                _simd_store_ps(pfBuf, valB);
                pfBuf += KNOB_SIMD_WIDTH;
                _simd_store_ps(pfBuf, valA);
                pfBuf += KNOB_SIMD_WIDTH;
            }
        }
    }
}

void ClearDepthHotTile(const HOTTILE* pHotTile)  // clear a macro tile from float4 clear data.
{
    // Load clear depth value into a SIMD register...
    float *pClearData = (float*)(pHotTile->clearData);
    simdscalar valZ = _simd_broadcast_ss(&pClearData[0]);

    float *pfBuf = (float*)pHotTile->pBuffer;
    uint32_t numSamples = pHotTile->numSamples;

    for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
    {
        for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
        {
            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM)
            {
                _simd_store_ps(pfBuf, valZ);
                pfBuf += KNOB_SIMD_WIDTH;
            }
        }
    }
}

void ClearStencilHotTile(const HOTTILE* pHotTile)
{
    // convert from F32 to U8.
    uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]);
    // broadcast 32x into __m256i...
    simdscalari valS = _simd_set1_epi8(clearVal);

    simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer;
    uint32_t numSamples = pHotTile->numSamples;

    for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
    {
        for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
        {
            // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly.
            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4)
            {
                _simd_store_si(pBuf, valS);
                pBuf += 1;
            }
        }
    }
}

// For draw calls, we initialize the active hot tiles and perform a deferred
// load on them if the tile is in an invalid state. We do this in the outer
// thread loop instead of inside the draw routine itself, mainly for
// performance: it avoids unnecessary setup for every triangle.
// @todo support deferred clear
INLINE
void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, const TRIANGLE_WORK_DESC* pWork)
{
    const API_STATE& state = GetApiState(pDC);
    HotTileMgr *pHotTileMgr = pContext->pHotTileMgr;

    uint32_t x, y;
    MacroTileMgr::getTileIndices(macroID, x, y);
    x *= KNOB_MACROTILE_X_DIM;
    y *= KNOB_MACROTILE_Y_DIM;

    uint32_t numSamples = GetNumSamples(state.rastState.sampleCount);

    // check RT if enabled
    unsigned long rtSlot = 0;
    uint32_t colorHottileEnableMask = state.colorHottileEnable;
    while (_BitScanForward(&rtSlot, colorHottileEnableMask))
    {
        HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, numSamples);

        if (pHotTile->state == HOTTILE_INVALID)
        {
            RDTSC_START(BELoadTiles);
            // invalid hottile before draw requires a load from surface before we can draw to it
            pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
            pHotTile->state = HOTTILE_DIRTY;
            RDTSC_STOP(BELoadTiles, 0, 0);
        }
        else if (pHotTile->state == HOTTILE_CLEAR)
        {
            RDTSC_START(BELoadTiles);
            // Clear the tile.
            ClearColorHotTile(pHotTile);
            pHotTile->state = HOTTILE_DIRTY;
            RDTSC_STOP(BELoadTiles, 0, 0);
        }
        colorHottileEnableMask &= ~(1 << rtSlot);
    }

    // check depth if enabled
    if (state.depthHottileEnable)
    {
        HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples);
        if (pHotTile->state == HOTTILE_INVALID)
        {
            RDTSC_START(BELoadTiles);
            // invalid hottile before draw requires a load from surface before we can draw to it
            pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
            pHotTile->state = HOTTILE_DIRTY;
            RDTSC_STOP(BELoadTiles, 0, 0);
        }
        else if (pHotTile->state == HOTTILE_CLEAR)
        {
            RDTSC_START(BELoadTiles);
            // Clear the tile.
            ClearDepthHotTile(pHotTile);
            pHotTile->state = HOTTILE_DIRTY;
            RDTSC_STOP(BELoadTiles, 0, 0);
        }
    }

    // check stencil if enabled
    if (state.stencilHottileEnable)
    {
        HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples);
        if (pHotTile->state == HOTTILE_INVALID)
        {
            RDTSC_START(BELoadTiles);
            // invalid hottile before draw requires a load from surface before we can draw to it
            pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
            pHotTile->state = HOTTILE_DIRTY;
            RDTSC_STOP(BELoadTiles, 0, 0);
        }
        else if (pHotTile->state == HOTTILE_CLEAR)
        {
            RDTSC_START(BELoadTiles);
            // Clear the tile.
            ClearStencilHotTile(pHotTile);
            pHotTile->state = HOTTILE_DIRTY;
            RDTSC_STOP(BELoadTiles, 0, 0);
        }
    }
}

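//////////////////////////////////////////////////////////////////////////
/// @brief Advance curDrawBE past every draw whose BE work is already
///        complete and return true if an incomplete draw remains.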
INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE)
{
    // increment our current draw id to the first incomplete draw
    uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
    while (curDrawBE < drawEnqueued)
    {
        DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT];

        // If it's not compute and the FE is not done, then break out of the loop.
        if (!pDC->doneFE && !pDC->isCompute) break;

        bool isWorkComplete = (pDC->isCompute) ?
            pDC->pDispatch->isWorkComplete() : pDC->pTileMgr->isWorkComplete();

        if (isWorkComplete)
        {
            curDrawBE++;
            InterlockedIncrement(&pDC->threadsDoneBE);
        }
        else
        {
            break;
        }
    }

    // If there are no more incomplete draws then return false.
    return (curDrawBE >= drawEnqueued) ? false : true;
}

//////////////////////////////////////////////////////////////////////////
/// @brief If there is any BE work then go work on it.
/// @param pContext - pointer to SWR context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread
///                    has its own curDrawBE counter and this ensures that each worker processes all the
///                    draws in order.
/// @param lockedTiles - This is the set of tiles locked by other threads. Each thread maintains its
///                      own set; each time it fails to lock a macrotile because it's already locked,
///                      it adds that tile to its lockedTiles set. As a worker begins to work on future
///                      draws, lockedTiles ensures that it doesn't work on tiles that may still have
///                      work pending in a previous draw. Additionally, lockedTiles is a heuristic
///                      that can steer a worker back to the same macrotile that it had been working
///                      on in a previous draw.
void WorkOnFifoBE(
    SWR_CONTEXT *pContext,
    uint32_t workerId,
    uint64_t &curDrawBE,
    std::unordered_set<uint32_t>& lockedTiles)
{
    // Find the first incomplete draw that has pending work. If no such draw is found then
    // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
    if (FindFirstIncompleteDraw(pContext, curDrawBE) == false)
    {
        return;
    }

    uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;

    // Reset our history for locked tiles. We'll have to re-learn which tiles are locked.
    lockedTiles.clear();

    // Try to work on each draw in order of the available draws in flight.
    //   1. If we're on curDrawBE, we can work on any macrotile that is available.
    //   2. If we're trying to work on draws after curDrawBE, we are restricted to
    //      working on those macrotiles that are known to be complete in the prior draw to
    //      maintain order. The locked tiles set provides the history that ensures this.
    for (uint64_t i = curDrawBE; i < GetEnqueuedDraw(pContext); ++i)
    {
        DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT];

        if (pDC->isCompute) return; // We don't look at compute work.

        // First wait for FE to be finished with this draw. This keeps the threading model simple,
        // but if there are lots of bubbles between draws then serializing FE and BE may
        // need to be revisited.
        if (!pDC->doneFE) return;

        // If this draw is dependent on a previous draw then we need to bail.
        if (CheckDependency(pContext, pDC, lastRetiredDraw))
        {
            return;
        }

        // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it.
        std::vector<uint32_t> &macroTiles = pDC->pTileMgr->getDirtyTiles();

        for (uint32_t tileID : macroTiles)
        {
            MacroTileQueue &tile = pDC->pTileMgr->getMacroTileQueue(tileID);

            // Can only work on this tile if it's not in use by other threads.
            if (lockedTiles.find(tileID) == lockedTiles.end())
            {
                if (tile.getNumQueued())
                {
                    if (tile.tryLock())
                    {
                        BE_WORK *pWork;

                        RDTSC_START(WorkerFoundWork);

                        uint32_t numWorkItems = tile.getNumQueued();

                        if (numWorkItems != 0)
                        {
                            pWork = tile.peek();
                            SWR_ASSERT(pWork);
                            if (pWork->type == DRAW)
                            {
                                InitializeHotTiles(pContext, pDC, tileID, (const TRIANGLE_WORK_DESC*)&pWork->desc);
                            }
                        }

                        while ((pWork = tile.peek()) != nullptr)
                        {
                            pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
                            tile.dequeue();
                        }
                        RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId);

                        _ReadWriteBarrier();

                        pDC->pTileMgr->markTileComplete(tileID);

                        // Optimization: if this draw is now complete and we're working on curDrawBE,
                        // we can retire it and reset the locked-tile history, since all draws before
                        // the next one are guaranteed to be complete.
                        if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete())
                        {
                            // We can increment the current BE and safely move to the next draw since we know this draw is complete.
                            curDrawBE++;
                            InterlockedIncrement(&pDC->threadsDoneBE);

                            lastRetiredDraw++;

                            lockedTiles.clear();
                            break;
                        }
                    }
                    else
                    {
                        // This tile is already locked, so add it to our locked tiles set; that way we don't try locking it again.
                        lockedTiles.insert(tileID);
                    }
                }
            }
        }
    }
}

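//////////////////////////////////////////////////////////////////////////
/// @brief If there is any FE work then go work on it.
/// @param pContext - pointer to SWR context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param curDrawFE - This tracks the draw contexts whose FE work this thread has already seen retire.
/// @param numaNode - NUMA node this worker is bound to.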
void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, UCHAR numaNode)
{
    // Try to grab the next DC from the ring
    uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
    while (curDrawFE < drawEnqueued)
    {
        uint32_t dcSlot = curDrawFE % KNOB_MAX_DRAWS_IN_FLIGHT;
        DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot];
        if (pDC->isCompute || pDC->doneFE || pDC->FeLock)
        {
            curDrawFE++;
            InterlockedIncrement(&pDC->threadsDoneFE);
        }
        else
        {
            break;
        }
    }

    uint64_t curDraw = curDrawFE;
    while (curDraw < drawEnqueued)
    {
        uint32_t dcSlot = curDraw % KNOB_MAX_DRAWS_IN_FLIGHT;
        DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot];

        if (!pDC->isCompute && !pDC->FeLock)
        {
            uint32_t initial = InterlockedCompareExchange((volatile uint32_t*)&pDC->FeLock, 1, 0);
            if (initial == 0)
            {
                // successfully grabbed the DC, now run the FE
                pDC->FeWork.pfnWork(pContext, pDC, workerId, &pDC->FeWork.desc);

                _ReadWriteBarrier();
                pDC->doneFE = true;
            }
        }
        curDraw++;
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief If there is any compute work then go work on it.
/// @param pContext - pointer to SWR context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread
///                    has its own curDrawBE counter and this ensures that each worker processes all the
///                    draws in order.
void WorkOnCompute(
    SWR_CONTEXT *pContext,
    uint32_t workerId,
    uint64_t& curDrawBE)
{
    if (FindFirstIncompleteDraw(pContext, curDrawBE) == false)
    {
        return;
    }

    uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;

    DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT];
    if (pDC->isCompute == false) return;

    // check dependencies
    if (CheckDependency(pContext, pDC, lastRetiredDraw))
    {
        return;
    }

    SWR_ASSERT(pDC->pDispatch != nullptr);
    DispatchQueue& queue = *pDC->pDispatch;

    // Is there any work remaining?
    if (queue.getNumQueued() > 0)
    {
        bool lastToComplete = false;

        uint32_t threadGroupId = 0;
        while (queue.getWork(threadGroupId))
        {
            ProcessComputeBE(pDC, workerId, threadGroupId);

            lastToComplete = queue.finishedWork();
        }

        _ReadWriteBarrier();

        if (lastToComplete)
        {
            SWR_ASSERT(queue.isWorkComplete() == true);
            pDC->doneCompute = true;
        }
    }
}

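//////////////////////////////////////////////////////////////////////////
/// @brief Worker thread entry: bind to the assigned HW thread, then loop
///        pulling BE, compute, and FE work until the pool signals shutdown.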
DWORD workerThreadMain(LPVOID pData)
{
    THREAD_DATA *pThreadData = (THREAD_DATA*)pData;
    SWR_CONTEXT *pContext = pThreadData->pContext;
    uint32_t threadId = pThreadData->threadId;
    uint32_t workerId = pThreadData->workerId;

    bindThread(threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup);

    RDTSC_INIT(threadId);

    int numaNode = (int)pThreadData->numaId;

    // flush denormals to 0
    _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);

    // Track tiles locked by other threads. If we try to lock a macrotile and find it's already
    // locked, we'll add it to this list so that we don't try to lock it again.
    std::unordered_set<uint32_t> lockedTiles;

    // Each worker has the ability to work on any of the queued draws as long as certain
    // conditions are met. The data associated with a draw is guaranteed to be active as long
    // as a worker hasn't signaled that it has moved on to the next draw when it determines
    // there is no more work to do. The API thread will not increment the head of the DC ring
    // until all workers have moved past the current head.
    //
    // The logic to determine what to work on is:
    //   1- Try to work on the FE of any draw that is queued. For now there are no dependencies
    //      on the FE work, so any worker can grab any FE and process it in parallel. Eventually
    //      we'll need dependency tracking to force serialization on FEs. The worker tries to
    //      pick an FE by atomically incrementing a counter in the SWR context, and keeps
    //      trying until it reaches the tail.
    //   2- BE work must be done in strict order. We accomplish this today by pulling work off
    //      the oldest draw (i.e. the head) of the dcRing. The worker can determine if there is
    //      any work left by comparing the total # of binned work items and the total # of completed
    //      work items. If they are equal, then there is no more work to do for this draw, and
    //      the worker can safely increment its oldestDraw counter and move on to the next draw.
    std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);

    auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->DrawEnqueued; };

    uint64_t curDrawBE = 1;
    uint64_t curDrawFE = 1;

    while (pContext->threadPool.inThreadShutdown == false)
    {
        uint32_t loop = 0;
        while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && !threadHasWork(curDrawBE))
        {
            _mm_pause();
        }

        if (!threadHasWork(curDrawBE))
        {
            lock.lock();

            // check for thread idle condition again under lock
            if (threadHasWork(curDrawBE))
            {
                lock.unlock();
                continue;
            }

            if (pContext->threadPool.inThreadShutdown)
            {
                lock.unlock();
                break;
            }

            RDTSC_START(WorkerWaitForThreadEvent);

            pContext->FifosNotEmpty.wait(lock);
            lock.unlock();

            RDTSC_STOP(WorkerWaitForThreadEvent, 0, 0);

            if (pContext->threadPool.inThreadShutdown)
            {
                break;
            }
        }

        RDTSC_START(WorkerWorkOnFifoBE);
        WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles);
        RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);

        WorkOnCompute(pContext, workerId, curDrawBE);

        WorkOnFifoFE(pContext, workerId, curDrawFE, numaNode);
    }

    return 0;
}

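//////////////////////////////////////////////////////////////////////////
/// @brief Thread entry wrapper around workerThreadMain; on Windows the
///        worker runs inside a structured exception handling __try block.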
DWORD workerThreadInit(LPVOID pData)
{
#if defined(_WIN32)
    __try
#endif // _WIN32
    {
        return workerThreadMain(pData);
    }

#if defined(_WIN32)
    __except(EXCEPTION_CONTINUE_SEARCH)
    {
    }

#endif // _WIN32

    return 1;
}

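//////////////////////////////////////////////////////////////////////////
/// @brief Size the worker pool from the detected topology (clamped by the
///        KNOB_MAX_* knobs), reserve one HW thread for the API thread, and
///        spawn the worker threads.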
void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
{
    bindThread(0);

    CPUNumaNodes nodes;
    uint32_t numThreadsPerProcGroup = 0;
    CalculateProcessorTopology(nodes, numThreadsPerProcGroup);

    uint32_t numHWNodes = (uint32_t)nodes.size();
    uint32_t numHWCoresPerNode = (uint32_t)nodes[0].cores.size();
    uint32_t numHWHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size();

    uint32_t numNodes = numHWNodes;
    uint32_t numCoresPerNode = numHWCoresPerNode;
    uint32_t numHyperThreads = numHWHyperThreads;

    if (KNOB_MAX_NUMA_NODES)
    {
        numNodes = std::min(numNodes, KNOB_MAX_NUMA_NODES);
    }

    if (KNOB_MAX_CORES_PER_NUMA_NODE)
    {
        numCoresPerNode = std::min(numCoresPerNode, KNOB_MAX_CORES_PER_NUMA_NODE);
    }

    if (KNOB_MAX_THREADS_PER_CORE)
    {
        numHyperThreads = std::min(numHyperThreads, KNOB_MAX_THREADS_PER_CORE);
    }

    // Calculate numThreads
    uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;

    if (KNOB_MAX_WORKER_THREADS)
    {
        uint32_t maxHWThreads = numHWNodes * numHWCoresPerNode * numHWHyperThreads;
        numThreads = std::min(KNOB_MAX_WORKER_THREADS, maxHWThreads);
    }

    if (numThreads > KNOB_MAX_NUM_THREADS)
    {
        printf("WARNING: system thread count %u exceeds max %u, "
               "performance will be degraded\n",
               numThreads, KNOB_MAX_NUM_THREADS);
    }

    if (numThreads == 1)
    {
        // If only 1 worker thread, try to move it to an available
        // HW thread. If that fails, use the API thread.
        if (numCoresPerNode < numHWCoresPerNode)
        {
            numCoresPerNode++;
        }
        else if (numHyperThreads < numHWHyperThreads)
        {
            numHyperThreads++;
        }
        else if (numNodes < numHWNodes)
        {
            numNodes++;
        }
        else
        {
            pPool->numThreads = 0;
            SET_KNOB(SINGLE_THREADED, true);
            return;
        }
    }
    else
    {
        // Save a HW thread for the API thread.
        numThreads--;
    }

    pPool->numThreads = numThreads;
    pContext->NumWorkerThreads = pPool->numThreads;

    pPool->inThreadShutdown = false;
    pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA));

    if (KNOB_MAX_WORKER_THREADS)
    {
        bool bForceBindProcGroup = (numThreads > numThreadsPerProcGroup);
        uint32_t numProcGroups = (numThreads + numThreadsPerProcGroup - 1) / numThreadsPerProcGroup;
        // When MAX_WORKER_THREADS is set we don't bother to bind to specific HW threads,
        // but Windows will still require binding to specific processor groups.
        for (uint32_t workerId = 0; workerId < numThreads; ++workerId)
        {
            pPool->pThreadData[workerId].workerId = workerId;
            pPool->pThreadData[workerId].procGroupId = workerId % numProcGroups;
            pPool->pThreadData[workerId].threadId = 0;
            pPool->pThreadData[workerId].numaId = 0;
            pPool->pThreadData[workerId].pContext = pContext;
            pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup;
            pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]);
        }
    }
    else
    {
        uint32_t workerId = 0;
        for (uint32_t n = 0; n < numNodes; ++n)
        {
            auto& node = nodes[n];

            uint32_t numCores = numCoresPerNode;
            for (uint32_t c = 0; c < numCores; ++c)
            {
                auto& core = node.cores[c];
                for (uint32_t t = 0; t < numHyperThreads; ++t)
                {
                    if (c == 0 && n == 0 && t == 0)
                    {
                        // Skip core 0, thread 0 on node 0 to reserve it for the API thread.
                        continue;
                    }

                    pPool->pThreadData[workerId].workerId = workerId;
                    pPool->pThreadData[workerId].procGroupId = core.procGroup;
                    pPool->pThreadData[workerId].threadId = core.threadIds[t];
                    pPool->pThreadData[workerId].numaId = n;
                    pPool->pThreadData[workerId].pContext = pContext;
                    pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]);

                    ++workerId;
                }
            }
        }
    }
}

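//////////////////////////////////////////////////////////////////////////
/// @brief Signal shutdown, wake any sleeping workers, join and delete the
///        worker threads, and free the per-thread data.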
void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
{
    if (!KNOB_SINGLE_THREADED)
    {
        // Inform threads to finish up
        std::unique_lock<std::mutex> lock(pContext->WaitLock);
        pPool->inThreadShutdown = true;
        _mm_mfence();
        pContext->FifosNotEmpty.notify_all();
        lock.unlock();

        // Wait for threads to finish and destroy them
        for (uint32_t t = 0; t < pPool->numThreads; ++t)
        {
            pPool->threads[t]->join();
            delete(pPool->threads[t]);
        }

        // Clean up data used by threads
        free(pPool->pThreadData);
    }
}