swr/rasterizer: modernize thread TLB
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / api.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file api.cpp
24 *
25 * @brief API implementation
26 *
27 ******************************************************************************/
28
29 #include <cfloat>
30 #include <cmath>
31 #include <cstdio>
32 #include <new>
33
34 #include "core/api.h"
35 #include "core/backend.h"
36 #include "core/context.h"
37 #include "core/depthstencil.h"
38 #include "core/frontend.h"
39 #include "core/rasterizer.h"
40 #include "core/rdtsc_core.h"
41 #include "core/threads.h"
42 #include "core/tilemgr.h"
43 #include "core/clip.h"
44 #include "core/utils.h"
45 #include "core/tileset.h"
46
47 #include "common/os.h"
48
49 static const SWR_RECT g_MaxScissorRect = {0, 0, KNOB_MAX_SCISSOR_X, KNOB_MAX_SCISSOR_Y};
50
51 void SetupDefaultState(SWR_CONTEXT* pContext);
52
53 static INLINE SWR_CONTEXT* GetContext(HANDLE hContext)
54 {
55 return (SWR_CONTEXT*)hContext;
56 }
57
58 void WakeAllThreads(SWR_CONTEXT* pContext)
59 {
60 pContext->FifosNotEmpty.notify_all();
61 }
62
63 //////////////////////////////////////////////////////////////////////////
64 /// @brief Create SWR Context.
65 /// @param pCreateInfo - pointer to creation info.
66 HANDLE SwrCreateContext(SWR_CREATECONTEXT_INFO* pCreateInfo)
67 {
68 void* pContextMem = AlignedMalloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4);
69 memset(pContextMem, 0, sizeof(SWR_CONTEXT));
70 SWR_CONTEXT* pContext = new (pContextMem) SWR_CONTEXT();
71
72 pContext->privateStateSize = pCreateInfo->privateStateSize;
73
74 // initialize callback functions
75 pContext->pfnLoadTile = pCreateInfo->pfnLoadTile;
76 pContext->pfnStoreTile = pCreateInfo->pfnStoreTile;
77 pContext->pfnTranslateGfxptrForRead = pCreateInfo->pfnTranslateGfxptrForRead;
78 pContext->pfnTranslateGfxptrForWrite = pCreateInfo->pfnTranslateGfxptrForWrite;
79 pContext->pfnMakeGfxPtr = pCreateInfo->pfnMakeGfxPtr;
80 pContext->pfnCreateMemoryContext = pCreateInfo->pfnCreateMemoryContext;
81 pContext->pfnDestroyMemoryContext = pCreateInfo->pfnDestroyMemoryContext;
82 pContext->pfnUpdateSoWriteOffset = pCreateInfo->pfnUpdateSoWriteOffset;
83 pContext->pfnUpdateStats = pCreateInfo->pfnUpdateStats;
84 pContext->pfnUpdateStatsFE = pCreateInfo->pfnUpdateStatsFE;
85
86
87 pContext->hExternalMemory = pCreateInfo->hExternalMemory;
88
89 pContext->MAX_DRAWS_IN_FLIGHT = KNOB_MAX_DRAWS_IN_FLIGHT;
90 if (pCreateInfo->MAX_DRAWS_IN_FLIGHT != 0)
91 {
92 pContext->MAX_DRAWS_IN_FLIGHT = pCreateInfo->MAX_DRAWS_IN_FLIGHT;
93 }
94
95 pContext->dcRing.Init(pContext->MAX_DRAWS_IN_FLIGHT);
96 pContext->dsRing.Init(pContext->MAX_DRAWS_IN_FLIGHT);
97
98 pContext->pMacroTileManagerArray =
99 (MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * pContext->MAX_DRAWS_IN_FLIGHT, 64);
100 pContext->pDispatchQueueArray =
101 (DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * pContext->MAX_DRAWS_IN_FLIGHT, 64);
102
103 for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
104 {
105 pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
106 new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena);
107 new (&pContext->pDispatchQueueArray[dc]) DispatchQueue();
108
109 pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
110 }
111
112 if (pCreateInfo->pThreadInfo)
113 {
114 pContext->threadInfo = *pCreateInfo->pThreadInfo;
115 }
116 else
117 {
118 pContext->threadInfo.MAX_WORKER_THREADS = KNOB_MAX_WORKER_THREADS;
119 pContext->threadInfo.BASE_NUMA_NODE = KNOB_BASE_NUMA_NODE;
120 pContext->threadInfo.BASE_CORE = KNOB_BASE_CORE;
121 pContext->threadInfo.BASE_THREAD = KNOB_BASE_THREAD;
122 pContext->threadInfo.MAX_NUMA_NODES = KNOB_MAX_NUMA_NODES;
123 pContext->threadInfo.MAX_CORES_PER_NUMA_NODE = KNOB_MAX_CORES_PER_NUMA_NODE;
124 pContext->threadInfo.MAX_THREADS_PER_CORE = KNOB_MAX_THREADS_PER_CORE;
125 pContext->threadInfo.SINGLE_THREADED = KNOB_SINGLE_THREADED;
126 }
127
128 if (pCreateInfo->pApiThreadInfo)
129 {
130 pContext->apiThreadInfo = *pCreateInfo->pApiThreadInfo;
131 }
132 else
133 {
134 pContext->apiThreadInfo.bindAPIThread0 = true;
135 pContext->apiThreadInfo.numAPIReservedThreads = 1;
136 pContext->apiThreadInfo.numAPIThreadsPerCore = 1;
137 }
138
139 if (pCreateInfo->pWorkerPrivateState)
140 {
141 pContext->workerPrivateState = *pCreateInfo->pWorkerPrivateState;
142 }
143
144 memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
145 memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
146 new (&pContext->WaitLock) std::mutex();
147 new (&pContext->FifosNotEmpty) std::condition_variable();
148
149 CreateThreadPool(pContext, &pContext->threadPool);
150
151 if (pContext->apiThreadInfo.bindAPIThread0)
152 {
153 BindApiThread(pContext, 0);
154 }
155
156 if (pContext->threadInfo.SINGLE_THREADED)
157 {
158 pContext->pSingleThreadLockedTiles = new TileSet();
159 }
160
161 pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads];
162 pContext->pStats =
163 (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * pContext->NumWorkerThreads, 64);
164
165 #if defined(KNOB_ENABLE_AR)
166 // Setup ArchRast thread contexts which includes +1 for API thread.
167 pContext->pArContext = new HANDLE[pContext->NumWorkerThreads + 1];
168 pContext->pArContext[pContext->NumWorkerThreads] =
169 ArchRast::CreateThreadContext(ArchRast::AR_THREAD::API);
170 #endif
171
172 #if defined(KNOB_ENABLE_RDTSC)
173 pContext->pBucketMgr = new BucketManager(pCreateInfo->contextName);
174 RDTSC_RESET(pContext->pBucketMgr);
175 RDTSC_INIT(pContext->pBucketMgr, 0);
176 #endif
177
178 // Allocate scratch space for workers.
179 ///@note We could lazily allocate this but its rather small amount of memory.
180 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
181 {
182 #if defined(_WIN32)
183 uint32_t numaNode =
184 pContext->threadPool.pThreadData ? pContext->threadPool.pThreadData[i].numaId : 0;
185 pContext->ppScratch[i] = (uint8_t*)VirtualAllocExNuma(GetCurrentProcess(),
186 nullptr,
187 KNOB_WORKER_SCRATCH_SPACE_SIZE,
188 MEM_RESERVE | MEM_COMMIT,
189 PAGE_READWRITE,
190 numaNode);
191 #else
192 pContext->ppScratch[i] =
193 (uint8_t*)AlignedMalloc(KNOB_WORKER_SCRATCH_SPACE_SIZE, KNOB_SIMD_WIDTH * 4);
194 #endif
195
196 #if defined(KNOB_ENABLE_AR)
197 // Initialize worker thread context for ArchRast.
198 pContext->pArContext[i] = ArchRast::CreateThreadContext(ArchRast::AR_THREAD::WORKER);
199
200 SWR_WORKER_DATA* pWorkerData = (SWR_WORKER_DATA*)pContext->threadPool.pThreadData[i].pWorkerPrivateData;
201 pWorkerData->hArContext = pContext->pArContext[i];
202 #endif
203
204
205 }
206
207 #if defined(KNOB_ENABLE_AR)
208 // cache the API thread event manager, for use with sim layer
209 pCreateInfo->hArEventManager = pContext->pArContext[pContext->NumWorkerThreads];
210 #endif
211
212 // State setup AFTER context is fully initialized
213 SetupDefaultState(pContext);
214
215 // initialize hot tile manager
216 pContext->pHotTileMgr = new HotTileMgr();
217
218 // pass pointer to bucket manager back to caller
219 #ifdef KNOB_ENABLE_RDTSC
220 pCreateInfo->pBucketMgr = pContext->pBucketMgr;
221 #endif
222
223 pCreateInfo->contextSaveSize = sizeof(API_STATE);
224
225 StartThreadPool(pContext, &pContext->threadPool);
226
227 return (HANDLE)pContext;
228 }
229
230 void CopyState(DRAW_STATE& dst, const DRAW_STATE& src)
231 {
232 memcpy(&dst.state, &src.state, sizeof(API_STATE));
233 }
234
235 template <bool IsDraw>
236 void QueueWork(SWR_CONTEXT* pContext)
237 {
238 DRAW_CONTEXT* pDC = pContext->pCurDrawContext;
239 uint32_t dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT;
240
241 if (IsDraw)
242 {
243 pDC->pTileMgr = &pContext->pMacroTileManagerArray[dcIndex];
244 pDC->pTileMgr->initialize();
245 }
246
247 // Each worker thread looks at a DC for both FE and BE work at different times and so we
248 // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers
249 // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
250 // then moved on if all work is done.)
251 pContext->pCurDrawContext->threadsDone = pContext->NumFEThreads + pContext->NumBEThreads;
252
253 if (IsDraw)
254 {
255 InterlockedIncrement(&pContext->drawsOutstandingFE);
256 }
257
258 _ReadWriteBarrier();
259 {
260 std::unique_lock<std::mutex> lock(pContext->WaitLock);
261 pContext->dcRing.Enqueue();
262 }
263
264 if (pContext->threadInfo.SINGLE_THREADED)
265 {
266 uint32_t mxcsr = SetOptimalVectorCSR();
267
268 if (IsDraw)
269 {
270 uint32_t curDraw[2] = {pContext->pCurDrawContext->drawId,
271 pContext->pCurDrawContext->drawId};
272 WorkOnFifoFE(pContext, 0, curDraw[0]);
273 WorkOnFifoBE(pContext, 0, curDraw[1], *pContext->pSingleThreadLockedTiles, 0, 0);
274 }
275 else
276 {
277 uint32_t curDispatch = pContext->pCurDrawContext->drawId;
278 WorkOnCompute(pContext, 0, curDispatch);
279 }
280
281 // Dequeue the work here, if not already done, since we're single threaded (i.e. no
282 // workers).
283 while (CompleteDrawContext(pContext, pContext->pCurDrawContext) > 0)
284 {
285 }
286
287 // restore csr
288 RestoreVectorCSR(mxcsr);
289 }
290 else
291 {
292 RDTSC_BEGIN(pContext->pBucketMgr, APIDrawWakeAllThreads, pDC->drawId);
293 WakeAllThreads(pContext);
294 RDTSC_END(pContext->pBucketMgr, APIDrawWakeAllThreads, 1);
295 }
296
297 // Set current draw context to NULL so that next state call forces a new draw context to be
298 // created and populated.
299 pContext->pPrevDrawContext = pContext->pCurDrawContext;
300 pContext->pCurDrawContext = nullptr;
301 }
302
303 INLINE void QueueDraw(SWR_CONTEXT* pContext)
304 {
305 QueueWork<true>(pContext);
306 }
307
308 INLINE void QueueDispatch(SWR_CONTEXT* pContext)
309 {
310 QueueWork<false>(pContext);
311 }
312
313 DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT* pContext, bool isSplitDraw = false)
314 {
315 RDTSC_BEGIN(pContext->pBucketMgr, APIGetDrawContext, 0);
316 // If current draw context is null then need to obtain a new draw context to use from ring.
317 if (pContext->pCurDrawContext == nullptr)
318 {
319 // Need to wait for a free entry.
320 while (pContext->dcRing.IsFull())
321 {
322 _mm_pause();
323 }
324
325 uint64_t curDraw = pContext->dcRing.GetHead();
326 uint32_t dcIndex = curDraw % pContext->MAX_DRAWS_IN_FLIGHT;
327
328 if ((pContext->frameCount - pContext->lastFrameChecked) > 2 ||
329 (curDraw - pContext->lastDrawChecked) > 0x10000)
330 {
331 // Take this opportunity to clean-up old arena allocations
332 pContext->cachingArenaAllocator.FreeOldBlocks();
333
334 pContext->lastFrameChecked = pContext->frameCount;
335 pContext->lastDrawChecked = curDraw;
336 }
337
338 DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
339 pContext->pCurDrawContext = pCurDrawContext;
340
341 // Assign next available entry in DS ring to this DC.
342 uint32_t dsIndex = pContext->curStateId % pContext->MAX_DRAWS_IN_FLIGHT;
343 pCurDrawContext->pState = &pContext->dsRing[dsIndex];
344
345 // Copy previous state to current state.
346 if (pContext->pPrevDrawContext)
347 {
348 DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext;
349
350 // If we're splitting our draw then we can just use the same state from the previous
351 // draw. In this case, we won't increment the DS ring index so the next non-split
352 // draw can receive the state.
353 if (isSplitDraw == false)
354 {
355 CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState);
356
357 // Should have been cleaned up previously
358 SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
359
360 pCurDrawContext->pState->pPrivateState = nullptr;
361
362 pContext->curStateId++; // Progress state ring index forward.
363 }
364 else
365 {
366 // If its a split draw then just copy the state pointer over
367 // since its the same draw.
368 pCurDrawContext->pState = pPrevDrawContext->pState;
369 SWR_ASSERT(pPrevDrawContext->cleanupState == false);
370 }
371 }
372 else
373 {
374 SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
375 pContext->curStateId++; // Progress state ring index forward.
376 }
377
378 SWR_ASSERT(pCurDrawContext->pArena->IsEmpty() == true);
379
380 // Reset dependency
381 pCurDrawContext->dependent = false;
382 pCurDrawContext->dependentFE = false;
383
384 pCurDrawContext->pContext = pContext;
385 pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
386
387 pCurDrawContext->doneFE = false;
388 pCurDrawContext->FeLock = 0;
389 pCurDrawContext->threadsDone = 0;
390 pCurDrawContext->retireCallback.pfnCallbackFunc = nullptr;
391
392 pCurDrawContext->dynState.Reset(pContext->NumWorkerThreads);
393
394 // Assign unique drawId for this DC
395 pCurDrawContext->drawId = pContext->dcRing.GetHead();
396
397 pCurDrawContext->cleanupState = true;
398 }
399 else
400 {
401 SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC");
402 }
403
404 RDTSC_END(pContext->pBucketMgr, APIGetDrawContext, 0);
405 return pContext->pCurDrawContext;
406 }
407
408 API_STATE* GetDrawState(SWR_CONTEXT* pContext)
409 {
410 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
411 SWR_ASSERT(pDC->pState != nullptr);
412
413 return &pDC->pState->state;
414 }
415
416 void SwrDestroyContext(HANDLE hContext)
417 {
418 SWR_CONTEXT* pContext = GetContext(hContext);
419 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
420
421 pDC->FeWork.type = SHUTDOWN;
422 pDC->FeWork.pfnWork = ProcessShutdown;
423
424 // enqueue
425 QueueDraw(pContext);
426
427 DestroyThreadPool(pContext, &pContext->threadPool);
428
429 // free the fifos
430 for (uint32_t i = 0; i < pContext->MAX_DRAWS_IN_FLIGHT; ++i)
431 {
432 AlignedFree(pContext->dcRing[i].dynState.pStats);
433 delete pContext->dcRing[i].pArena;
434 delete pContext->dsRing[i].pArena;
435 pContext->pMacroTileManagerArray[i].~MacroTileMgr();
436 pContext->pDispatchQueueArray[i].~DispatchQueue();
437 }
438
439 AlignedFree(pContext->pDispatchQueueArray);
440 AlignedFree(pContext->pMacroTileManagerArray);
441
442 // Free scratch space.
443 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
444 {
445 #if defined(_WIN32)
446 VirtualFree(pContext->ppScratch[i], 0, MEM_RELEASE);
447 #else
448 AlignedFree(pContext->ppScratch[i]);
449 #endif
450
451 #if defined(KNOB_ENABLE_AR)
452 ArchRast::DestroyThreadContext(pContext->pArContext[i]);
453 #endif
454 }
455
456 #if defined(KNOB_ENABLE_RDTSC)
457 delete pContext->pBucketMgr;
458 #endif
459
460 delete[] pContext->ppScratch;
461 AlignedFree(pContext->pStats);
462
463 delete pContext->pHotTileMgr;
464 delete pContext->pSingleThreadLockedTiles;
465
466 pContext->~SWR_CONTEXT();
467 AlignedFree(GetContext(hContext));
468 }
469
470 void SwrBindApiThread(HANDLE hContext, uint32_t apiThreadId)
471 {
472 SWR_CONTEXT* pContext = GetContext(hContext);
473 BindApiThread(pContext, apiThreadId);
474 }
475
476 void SWR_API SwrSaveState(HANDLE hContext, void* pOutputStateBlock, size_t memSize)
477 {
478 SWR_CONTEXT* pContext = GetContext(hContext);
479 auto pSrc = GetDrawState(pContext);
480 SWR_ASSERT(pOutputStateBlock && memSize >= sizeof(*pSrc));
481
482 memcpy(pOutputStateBlock, pSrc, sizeof(*pSrc));
483 }
484
485 void SWR_API SwrRestoreState(HANDLE hContext, const void* pStateBlock, size_t memSize)
486 {
487 SWR_CONTEXT* pContext = GetContext(hContext);
488 auto pDst = GetDrawState(pContext);
489 SWR_ASSERT(pStateBlock && memSize >= sizeof(*pDst));
490
491 memcpy(pDst, pStateBlock, sizeof(*pDst));
492 }
493
494 void SetupDefaultState(SWR_CONTEXT* pContext)
495 {
496 API_STATE* pState = GetDrawState(pContext);
497
498 pState->rastState.cullMode = SWR_CULLMODE_NONE;
499 pState->rastState.frontWinding = SWR_FRONTWINDING_CCW;
500
501 pState->depthBoundsState.depthBoundsTestEnable = false;
502 pState->depthBoundsState.depthBoundsTestMinValue = 0.0f;
503 pState->depthBoundsState.depthBoundsTestMaxValue = 1.0f;
504 }
505
506 void SWR_API SwrSync(HANDLE hContext,
507 PFN_CALLBACK_FUNC pfnFunc,
508 uint64_t userData,
509 uint64_t userData2,
510 uint64_t userData3)
511 {
512 SWR_ASSERT(pfnFunc != nullptr);
513
514 SWR_CONTEXT* pContext = GetContext(hContext);
515 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
516
517 RDTSC_BEGIN(pContext->pBucketMgr, APISync, 0);
518
519 pDC->FeWork.type = SYNC;
520 pDC->FeWork.pfnWork = ProcessSync;
521
522 // Setup callback function
523 pDC->retireCallback.pfnCallbackFunc = pfnFunc;
524 pDC->retireCallback.userData = userData;
525 pDC->retireCallback.userData2 = userData2;
526 pDC->retireCallback.userData3 = userData3;
527
528 AR_API_EVENT(SwrSyncEvent(pDC->drawId));
529
530 // enqueue
531 QueueDraw(pContext);
532
533 RDTSC_END(pContext->pBucketMgr, APISync, 1);
534 }
535
536 void SwrStallBE(HANDLE hContext)
537 {
538 SWR_CONTEXT* pContext = GetContext(hContext);
539 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
540
541 pDC->dependent = true;
542 }
543
544 void SwrWaitForIdle(HANDLE hContext)
545 {
546 SWR_CONTEXT* pContext = GetContext(hContext);
547
548 RDTSC_BEGIN(pContext->pBucketMgr, APIWaitForIdle, 0);
549
550 while (!pContext->dcRing.IsEmpty())
551 {
552 _mm_pause();
553 }
554
555 RDTSC_END(pContext->pBucketMgr, APIWaitForIdle, 1);
556 }
557
558 void SwrWaitForIdleFE(HANDLE hContext)
559 {
560 SWR_CONTEXT* pContext = GetContext(hContext);
561
562 RDTSC_BEGIN(pContext->pBucketMgr, APIWaitForIdle, 0);
563
564 while (pContext->drawsOutstandingFE > 0)
565 {
566 _mm_pause();
567 }
568
569 RDTSC_END(pContext->pBucketMgr, APIWaitForIdle, 1);
570 }
571
572 void SwrSetVertexBuffers(HANDLE hContext,
573 uint32_t numBuffers,
574 const SWR_VERTEX_BUFFER_STATE* pVertexBuffers)
575 {
576 API_STATE* pState = GetDrawState(GetContext(hContext));
577
578 for (uint32_t i = 0; i < numBuffers; ++i)
579 {
580 const SWR_VERTEX_BUFFER_STATE* pVB = &pVertexBuffers[i];
581 pState->vertexBuffers[pVB->index] = *pVB;
582 }
583 }
584
585 void SwrSetIndexBuffer(HANDLE hContext, const SWR_INDEX_BUFFER_STATE* pIndexBuffer)
586 {
587 API_STATE* pState = GetDrawState(GetContext(hContext));
588
589 pState->indexBuffer = *pIndexBuffer;
590 }
591
592 void SwrSetFetchFunc(HANDLE hContext, PFN_FETCH_FUNC pfnFetchFunc)
593 {
594 API_STATE* pState = GetDrawState(GetContext(hContext));
595
596 pState->pfnFetchFunc = pfnFetchFunc;
597 }
598
599 void SwrSetSoFunc(HANDLE hContext, PFN_SO_FUNC pfnSoFunc, uint32_t streamIndex)
600 {
601 API_STATE* pState = GetDrawState(GetContext(hContext));
602
603 SWR_ASSERT(streamIndex < MAX_SO_STREAMS);
604
605 pState->pfnSoFunc[streamIndex] = pfnSoFunc;
606 }
607
608 void SwrSetSoState(HANDLE hContext, SWR_STREAMOUT_STATE* pSoState)
609 {
610 API_STATE* pState = GetDrawState(GetContext(hContext));
611
612 pState->soState = *pSoState;
613 }
614
615 void SwrSetSoBuffers(HANDLE hContext, SWR_STREAMOUT_BUFFER* pSoBuffer, uint32_t slot)
616 {
617 API_STATE* pState = GetDrawState(GetContext(hContext));
618
619 SWR_ASSERT((slot < 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot);
620
621 pState->soBuffer[slot] = *pSoBuffer;
622 }
623
624 void SwrSetVertexFunc(HANDLE hContext, PFN_VERTEX_FUNC pfnVertexFunc)
625 {
626 API_STATE* pState = GetDrawState(GetContext(hContext));
627
628 pState->pfnVertexFunc = pfnVertexFunc;
629 }
630
631 void SwrSetFrontendState(HANDLE hContext, SWR_FRONTEND_STATE* pFEState)
632 {
633 API_STATE* pState = GetDrawState(GetContext(hContext));
634 pState->frontendState = *pFEState;
635 }
636
637 void SwrSetGsState(HANDLE hContext, SWR_GS_STATE* pGSState)
638 {
639 API_STATE* pState = GetDrawState(GetContext(hContext));
640 pState->gsState = *pGSState;
641 }
642
643 void SwrSetGsFunc(HANDLE hContext, PFN_GS_FUNC pfnGsFunc)
644 {
645 API_STATE* pState = GetDrawState(GetContext(hContext));
646 pState->pfnGsFunc = pfnGsFunc;
647 }
648
649 void SwrSetCsFunc(HANDLE hContext,
650 PFN_CS_FUNC pfnCsFunc,
651 uint32_t totalThreadsInGroup,
652 uint32_t totalSpillFillSize,
653 uint32_t scratchSpaceSizePerWarp,
654 uint32_t numWarps)
655 {
656 API_STATE* pState = GetDrawState(GetContext(hContext));
657 pState->pfnCsFunc = pfnCsFunc;
658 pState->totalThreadsInGroup = totalThreadsInGroup;
659 pState->totalSpillFillSize = totalSpillFillSize;
660 pState->scratchSpaceSizePerWarp = scratchSpaceSizePerWarp;
661 pState->scratchSpaceNumWarps = numWarps;
662 }
663
664 void SwrSetTsState(HANDLE hContext, SWR_TS_STATE* pState)
665 {
666 API_STATE* pApiState = GetDrawState(GetContext(hContext));
667 pApiState->tsState = *pState;
668 }
669
670 void SwrSetHsFunc(HANDLE hContext, PFN_HS_FUNC pfnFunc)
671 {
672 API_STATE* pApiState = GetDrawState(GetContext(hContext));
673 pApiState->pfnHsFunc = pfnFunc;
674 }
675
676 void SwrSetDsFunc(HANDLE hContext, PFN_DS_FUNC pfnFunc)
677 {
678 API_STATE* pApiState = GetDrawState(GetContext(hContext));
679 pApiState->pfnDsFunc = pfnFunc;
680 }
681
682 void SwrSetDepthStencilState(HANDLE hContext, SWR_DEPTH_STENCIL_STATE* pDSState)
683 {
684 API_STATE* pState = GetDrawState(GetContext(hContext));
685
686 pState->depthStencilState = *pDSState;
687 }
688
689 void SwrSetBackendState(HANDLE hContext, SWR_BACKEND_STATE* pBEState)
690 {
691 API_STATE* pState = GetDrawState(GetContext(hContext));
692
693 pState->backendState = *pBEState;
694 }
695
696 void SwrSetDepthBoundsState(HANDLE hContext, SWR_DEPTH_BOUNDS_STATE* pDBState)
697 {
698 API_STATE* pState = GetDrawState(GetContext(hContext));
699
700 pState->depthBoundsState = *pDBState;
701 }
702
703 void SwrSetPixelShaderState(HANDLE hContext, SWR_PS_STATE* pPSState)
704 {
705 API_STATE* pState = GetDrawState(GetContext(hContext));
706 pState->psState = *pPSState;
707 }
708
709 void SwrSetBlendState(HANDLE hContext, SWR_BLEND_STATE* pBlendState)
710 {
711 API_STATE* pState = GetDrawState(GetContext(hContext));
712 memcpy(&pState->blendState, pBlendState, sizeof(SWR_BLEND_STATE));
713 }
714
715 void SwrSetBlendFunc(HANDLE hContext, uint32_t renderTarget, PFN_BLEND_JIT_FUNC pfnBlendFunc)
716 {
717 SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS);
718 API_STATE* pState = GetDrawState(GetContext(hContext));
719 pState->pfnBlendFunc[renderTarget] = pfnBlendFunc;
720 }
721
722 // update guardband multipliers for the viewport
723 void updateGuardbands(API_STATE* pState)
724 {
725 uint32_t numGbs = pState->backendState.readViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
726
727 for (uint32_t i = 0; i < numGbs; ++i)
728 {
729 // guardband center is viewport center
730 pState->gbState.left[i] = KNOB_GUARDBAND_WIDTH / pState->vp[i].width;
731 pState->gbState.right[i] = KNOB_GUARDBAND_WIDTH / pState->vp[i].width;
732 pState->gbState.top[i] = KNOB_GUARDBAND_HEIGHT / pState->vp[i].height;
733 pState->gbState.bottom[i] = KNOB_GUARDBAND_HEIGHT / pState->vp[i].height;
734 }
735 }
736
737 void SwrSetRastState(HANDLE hContext, const SWR_RASTSTATE* pRastState)
738 {
739 SWR_CONTEXT* pContext = GetContext(hContext);
740 API_STATE* pState = GetDrawState(pContext);
741
742 memcpy(&pState->rastState, pRastState, sizeof(SWR_RASTSTATE));
743 }
744
745 void SwrSetViewports(HANDLE hContext,
746 uint32_t numViewports,
747 const SWR_VIEWPORT* pViewports,
748 const SWR_VIEWPORT_MATRICES* pMatrices)
749 {
750 SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS, "Invalid number of viewports.");
751
752 SWR_CONTEXT* pContext = GetContext(hContext);
753 API_STATE* pState = GetDrawState(pContext);
754
755 memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports);
756 // @todo Faster to copy portions of the SOA or just copy all of it?
757 memcpy(&pState->vpMatrices, pMatrices, sizeof(SWR_VIEWPORT_MATRICES));
758 }
759
760 void SwrSetScissorRects(HANDLE hContext, uint32_t numScissors, const SWR_RECT* pScissors)
761 {
762 SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS, "Invalid number of scissor rects.");
763
764 API_STATE* pState = GetDrawState(GetContext(hContext));
765 memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(pScissors[0]));
766 };
767
768 void SetupMacroTileScissors(DRAW_CONTEXT* pDC)
769 {
770 API_STATE* pState = &pDC->pState->state;
771 uint32_t numScissors =
772 pState->backendState.readViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
773 pState->scissorsTileAligned = true;
774
775 for (uint32_t index = 0; index < numScissors; ++index)
776 {
777 SWR_RECT& scissorInFixedPoint = pState->scissorsInFixedPoint[index];
778
779 // Set up scissor dimensions based on scissor or viewport
780 if (pState->rastState.scissorEnable)
781 {
782 scissorInFixedPoint = pState->scissorRects[index];
783 }
784 else
785 {
786 // the vp width and height must be added to origin un-rounded then the result round to
787 // -inf. The cast to int works for rounding assuming all [left, right, top, bottom] are
788 // positive.
789 scissorInFixedPoint.xmin = (int32_t)pState->vp[index].x;
790 scissorInFixedPoint.xmax = (int32_t)(pState->vp[index].x + pState->vp[index].width);
791 scissorInFixedPoint.ymin = (int32_t)pState->vp[index].y;
792 scissorInFixedPoint.ymax = (int32_t)(pState->vp[index].y + pState->vp[index].height);
793 }
794
795 // Clamp to max rect
796 scissorInFixedPoint &= g_MaxScissorRect;
797
798 // Test for tile alignment
799 bool tileAligned;
800 tileAligned = (scissorInFixedPoint.xmin % KNOB_TILE_X_DIM) == 0;
801 tileAligned &= (scissorInFixedPoint.ymin % KNOB_TILE_Y_DIM) == 0;
802 tileAligned &= (scissorInFixedPoint.xmax % KNOB_TILE_X_DIM) == 0;
803 tileAligned &= (scissorInFixedPoint.ymax % KNOB_TILE_Y_DIM) == 0;
804
805 pState->scissorsTileAligned &= tileAligned;
806
807 // Scale to fixed point
808 scissorInFixedPoint.xmin *= FIXED_POINT_SCALE;
809 scissorInFixedPoint.xmax *= FIXED_POINT_SCALE;
810 scissorInFixedPoint.ymin *= FIXED_POINT_SCALE;
811 scissorInFixedPoint.ymax *= FIXED_POINT_SCALE;
812
813 // Make scissor inclusive
814 scissorInFixedPoint.xmax -= 1;
815 scissorInFixedPoint.ymax -= 1;
816 }
817 }
818
819
820 // templated backend function tables
821
822 void SetupPipeline(DRAW_CONTEXT* pDC)
823 {
824 DRAW_STATE* pState = pDC->pState;
825 const SWR_RASTSTATE& rastState = pState->state.rastState;
826 const SWR_PS_STATE& psState = pState->state.psState;
827 BACKEND_FUNCS& backendFuncs = pState->backendFuncs;
828
829 // setup backend
830 if (psState.pfnPixelShader == nullptr)
831 {
832 backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount];
833 }
834 else
835 {
836 const uint32_t forcedSampleCount = (rastState.forcedSampleCount) ? 1 : 0;
837 const bool bMultisampleEnable =
838 ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || forcedSampleCount) ? 1 : 0;
839 const uint32_t centroid =
840 ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
841 const uint32_t canEarlyZ =
842 (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesUAV)) ? 1 : 0;
843 SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask;
844
845 // select backend function
846 switch (psState.shadingRate)
847 {
848 case SWR_SHADING_RATE_PIXEL:
849 if (bMultisampleEnable)
850 {
851 // always need to generate I & J per sample for Z interpolation
852 barycentricsMask =
853 (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
854 backendFuncs.pfnBackend =
855 gBackendPixelRateTable[rastState.sampleCount][rastState.bIsCenterPattern]
856 [psState.inputCoverage][centroid][forcedSampleCount]
857 [canEarlyZ]
858 ;
859 }
860 else
861 {
862 // always need to generate I & J per pixel for Z interpolation
863 barycentricsMask =
864 (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK);
865 backendFuncs.pfnBackend =
866 gBackendSingleSample[psState.inputCoverage][centroid][canEarlyZ];
867 }
868 break;
869 case SWR_SHADING_RATE_SAMPLE:
870 SWR_ASSERT(rastState.bIsCenterPattern != true);
871 // always need to generate I & J per sample for Z interpolation
872 barycentricsMask =
873 (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
874 backendFuncs.pfnBackend =
875 gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid]
876 [canEarlyZ];
877 break;
878 default:
879 SWR_ASSERT(0 && "Invalid shading rate");
880 break;
881 }
882 }
883
884 SWR_ASSERT(backendFuncs.pfnBackend);
885
886 PFN_PROCESS_PRIMS pfnBinner;
887 #if USE_SIMD16_FRONTEND
888 PFN_PROCESS_PRIMS_SIMD16 pfnBinner_simd16;
889 #endif
890 switch (pState->state.topology)
891 {
892 case TOP_POINT_LIST:
893 pState->pfnProcessPrims = ClipPoints;
894 pfnBinner = BinPoints;
895 #if USE_SIMD16_FRONTEND
896 pState->pfnProcessPrims_simd16 = ClipPoints_simd16;
897 pfnBinner_simd16 = BinPoints_simd16;
898 #endif
899 break;
900 case TOP_LINE_LIST:
901 case TOP_LINE_STRIP:
902 case TOP_LINE_LOOP:
903 case TOP_LINE_LIST_ADJ:
904 case TOP_LISTSTRIP_ADJ:
905 pState->pfnProcessPrims = ClipLines;
906 pfnBinner = BinLines;
907 #if USE_SIMD16_FRONTEND
908 pState->pfnProcessPrims_simd16 = ClipLines_simd16;
909 pfnBinner_simd16 = BinLines_simd16;
910 #endif
911 break;
912 default:
913 pState->pfnProcessPrims = ClipTriangles;
914 pfnBinner = GetBinTrianglesFunc((rastState.conservativeRast > 0));
915 #if USE_SIMD16_FRONTEND
916 pState->pfnProcessPrims_simd16 = ClipTriangles_simd16;
917 pfnBinner_simd16 = GetBinTrianglesFunc_simd16((rastState.conservativeRast > 0));
918 #endif
919 break;
920 };
921
922
923 // Disable clipper if viewport transform is disabled or if clipper is disabled
924 if (pState->state.frontendState.vpTransformDisable || !pState->state.rastState.clipEnable)
925 {
926 pState->pfnProcessPrims = pfnBinner;
927 #if USE_SIMD16_FRONTEND
928 pState->pfnProcessPrims_simd16 = pfnBinner_simd16;
929 #endif
930 }
931
932 // Disable rasterizer and backend if no pixel, no depth/stencil, and no attributes
933 if ((pState->state.psState.pfnPixelShader == nullptr) &&
934 (pState->state.depthStencilState.depthTestEnable == FALSE) &&
935 (pState->state.depthStencilState.depthWriteEnable == FALSE) &&
936 (pState->state.depthStencilState.stencilTestEnable == FALSE) &&
937 (pState->state.depthStencilState.stencilWriteEnable == FALSE) &&
938 (pState->state.backendState.numAttributes == 0))
939 {
940 pState->pfnProcessPrims = nullptr;
941 #if USE_SIMD16_FRONTEND
942 pState->pfnProcessPrims_simd16 = nullptr;
943 #endif
944 }
945
946 if (pState->state.soState.rasterizerDisable == true)
947 {
948 pState->pfnProcessPrims = nullptr;
949 #if USE_SIMD16_FRONTEND
950 pState->pfnProcessPrims_simd16 = nullptr;
951 #endif
952 }
953
954
955 // set up the frontend attribute count
956 pState->state.feNumAttributes = 0;
957 const SWR_BACKEND_STATE& backendState = pState->state.backendState;
958 if (backendState.swizzleEnable)
959 {
960 // attribute swizzling is enabled, iterate over the map and record the max attribute used
961 for (uint32_t i = 0; i < backendState.numAttributes; ++i)
962 {
963 pState->state.feNumAttributes =
964 std::max(pState->state.feNumAttributes,
965 (uint32_t)backendState.swizzleMap[i].sourceAttrib + 1);
966 }
967 }
968 else
969 {
970 pState->state.feNumAttributes = pState->state.backendState.numAttributes;
971 }
972
973 if (pState->state.soState.soEnable)
974 {
975 uint64_t streamMasks = 0;
976 for (uint32_t i = 0; i < 4; ++i)
977 {
978 streamMasks |= pState->state.soState.streamMasks[i];
979 }
980
981 DWORD maxAttrib;
982 if (_BitScanReverse64(&maxAttrib, streamMasks))
983 {
984 pState->state.feNumAttributes =
985 std::max(pState->state.feNumAttributes, (uint32_t)(maxAttrib + 1));
986 }
987 }
988
989 // complicated logic to test for cases where we don't need backing hottile memory for a draw
990 // have to check for the special case where depth/stencil test is enabled but depthwrite is
991 // disabled.
992 pState->state.depthHottileEnable =
993 ((!(pState->state.depthStencilState.depthTestEnable &&
994 !pState->state.depthStencilState.depthWriteEnable &&
995 !pState->state.depthBoundsState.depthBoundsTestEnable &&
996 pState->state.depthStencilState.depthTestFunc == ZFUNC_ALWAYS)) &&
997 (pState->state.depthStencilState.depthTestEnable ||
998 pState->state.depthStencilState.depthWriteEnable ||
999 pState->state.depthBoundsState.depthBoundsTestEnable))
1000 ? true
1001 : false;
1002
1003 pState->state.stencilHottileEnable =
1004 (((!(pState->state.depthStencilState.stencilTestEnable &&
1005 !pState->state.depthStencilState.stencilWriteEnable &&
1006 pState->state.depthStencilState.stencilTestFunc == ZFUNC_ALWAYS)) ||
1007 // for stencil we have to check the double sided state as well
1008 (!(pState->state.depthStencilState.doubleSidedStencilTestEnable &&
1009 !pState->state.depthStencilState.stencilWriteEnable &&
1010 pState->state.depthStencilState.backfaceStencilTestFunc == ZFUNC_ALWAYS))) &&
1011 (pState->state.depthStencilState.stencilTestEnable ||
1012 pState->state.depthStencilState.stencilWriteEnable))
1013 ? true
1014 : false;
1015
1016 uint32_t hotTileEnable = pState->state.psState.renderTargetMask;
1017
1018 // Disable hottile for surfaces with no writes
1019 if (psState.pfnPixelShader != nullptr)
1020 {
1021 DWORD rt;
1022 uint32_t rtMask = pState->state.psState.renderTargetMask;
1023 while (_BitScanForward(&rt, rtMask))
1024 {
1025 rtMask &= ~(1 << rt);
1026
1027 if (pState->state.blendState.renderTarget[rt].writeDisableAlpha &&
1028 pState->state.blendState.renderTarget[rt].writeDisableRed &&
1029 pState->state.blendState.renderTarget[rt].writeDisableGreen &&
1030 pState->state.blendState.renderTarget[rt].writeDisableBlue)
1031 {
1032 hotTileEnable &= ~(1 << rt);
1033 }
1034 }
1035 }
1036
1037 pState->state.colorHottileEnable = hotTileEnable;
1038
1039 // Setup depth quantization function
1040 if (pState->state.depthHottileEnable)
1041 {
1042 switch (pState->state.rastState.depthFormat)
1043 {
1044 case R32_FLOAT_X8X24_TYPELESS:
1045 pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT_X8X24_TYPELESS>;
1046 break;
1047 case R32_FLOAT:
1048 pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT>;
1049 break;
1050 case R24_UNORM_X8_TYPELESS:
1051 pState->state.pfnQuantizeDepth = QuantizeDepth<R24_UNORM_X8_TYPELESS>;
1052 break;
1053 case R16_UNORM:
1054 pState->state.pfnQuantizeDepth = QuantizeDepth<R16_UNORM>;
1055 break;
1056 default:
1057 SWR_INVALID("Unsupported depth format for depth quantiztion.");
1058 pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT>;
1059 }
1060 }
1061 else
1062 {
1063 // set up pass-through quantize if depth isn't enabled
1064 pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT>;
1065 }
1066
1067 // Generate guardbands
1068 updateGuardbands(&pState->state);
1069 }
1070
1071 //////////////////////////////////////////////////////////////////////////
1072 /// @brief InitDraw
1073 /// @param pDC - Draw context to initialize for this draw.
1074 void InitDraw(DRAW_CONTEXT* pDC, bool isSplitDraw)
1075 {
1076 // We don't need to re-setup the scissors/pipeline state again for split draw.
1077 if (isSplitDraw == false)
1078 {
1079 SetupMacroTileScissors(pDC);
1080 SetupPipeline(pDC);
1081 }
1082
1083 }
1084
1085 //////////////////////////////////////////////////////////////////////////
1086 /// @brief We can split the draw for certain topologies for better performance.
1087 /// @param totalVerts - Total vertices for draw
1088 /// @param topology - Topology used for draw
1089 uint32_t MaxVertsPerDraw(DRAW_CONTEXT* pDC, uint32_t totalVerts, PRIMITIVE_TOPOLOGY topology)
1090 {
1091 API_STATE& state = pDC->pState->state;
1092
1093 // We can not split draws that have streamout enabled because there is no practical way
1094 // to support multiple threads generating SO data for a single set of buffers.
1095 if (state.soState.soEnable)
1096 {
1097 return totalVerts;
1098 }
1099
1100 // The Primitive Assembly code can only handle 1 RECT at a time. Specified with only 3 verts.
1101 if (topology == TOP_RECT_LIST)
1102 {
1103 return 3;
1104 }
1105
1106 // Is split drawing disabled?
1107 if (KNOB_DISABLE_SPLIT_DRAW)
1108 {
1109 return totalVerts;
1110 }
1111
1112 uint32_t vertsPerDraw = totalVerts;
1113
1114 switch (topology)
1115 {
1116 case TOP_POINT_LIST:
1117 case TOP_TRIANGLE_LIST:
1118 vertsPerDraw = KNOB_MAX_PRIMS_PER_DRAW;
1119 break;
1120
1121 case TOP_PATCHLIST_1:
1122 case TOP_PATCHLIST_2:
1123 case TOP_PATCHLIST_3:
1124 case TOP_PATCHLIST_4:
1125 case TOP_PATCHLIST_5:
1126 case TOP_PATCHLIST_6:
1127 case TOP_PATCHLIST_7:
1128 case TOP_PATCHLIST_8:
1129 case TOP_PATCHLIST_9:
1130 case TOP_PATCHLIST_10:
1131 case TOP_PATCHLIST_11:
1132 case TOP_PATCHLIST_12:
1133 case TOP_PATCHLIST_13:
1134 case TOP_PATCHLIST_14:
1135 case TOP_PATCHLIST_15:
1136 case TOP_PATCHLIST_16:
1137 case TOP_PATCHLIST_17:
1138 case TOP_PATCHLIST_18:
1139 case TOP_PATCHLIST_19:
1140 case TOP_PATCHLIST_20:
1141 case TOP_PATCHLIST_21:
1142 case TOP_PATCHLIST_22:
1143 case TOP_PATCHLIST_23:
1144 case TOP_PATCHLIST_24:
1145 case TOP_PATCHLIST_25:
1146 case TOP_PATCHLIST_26:
1147 case TOP_PATCHLIST_27:
1148 case TOP_PATCHLIST_28:
1149 case TOP_PATCHLIST_29:
1150 case TOP_PATCHLIST_30:
1151 case TOP_PATCHLIST_31:
1152 case TOP_PATCHLIST_32:
1153 if (pDC->pState->state.tsState.tsEnable)
1154 {
1155 uint32_t vertsPerPrim = topology - TOP_PATCHLIST_BASE;
1156 vertsPerDraw = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW;
1157 }
1158 break;
1159 default:
1160 // We are not splitting up draws for other topologies.
1161 break;
1162 }
1163
1164 return vertsPerDraw;
1165 }
1166
1167 //////////////////////////////////////////////////////////////////////////
1168 /// @brief DrawInstanced
1169 /// @param hContext - Handle passed back from SwrCreateContext
1170 /// @param topology - Specifies topology for draw.
1171 /// @param numVerts - How many vertices to read sequentially from vertex data (per instance).
1172 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1173 /// @param numInstances - How many instances to render.
1174 /// @param startInstance - Which instance to start sequentially fetching from in each buffer
1175 /// (instanced data)
1176 void DrawInstanced(HANDLE hContext,
1177 PRIMITIVE_TOPOLOGY topology,
1178 uint32_t numVertices,
1179 uint32_t startVertex,
1180 uint32_t numInstances = 1,
1181 uint32_t startInstance = 0)
1182 {
1183 if (KNOB_TOSS_DRAW)
1184 {
1185 return;
1186 }
1187
1188 SWR_CONTEXT* pContext = GetContext(hContext);
1189 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1190
1191 RDTSC_BEGIN(pContext->pBucketMgr, APIDraw, pDC->drawId);
1192
1193 uint32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
1194 uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw);
1195 uint32_t remainingVerts = numVertices;
1196
1197 API_STATE* pState = &pDC->pState->state;
1198 pState->topology = topology;
1199 pState->forceFront = false;
1200
1201 // disable culling for points/lines
1202 uint32_t oldCullMode = pState->rastState.cullMode;
1203 if (topology == TOP_POINT_LIST)
1204 {
1205 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1206 pState->forceFront = true;
1207 }
1208 else if (topology == TOP_RECT_LIST)
1209 {
1210 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1211 }
1212
1213 int draw = 0;
1214 while (remainingVerts)
1215 {
1216 uint32_t numVertsForDraw =
1217 (remainingVerts < maxVertsPerDraw) ? remainingVerts : maxVertsPerDraw;
1218
1219 bool isSplitDraw = (draw > 0) ? !KNOB_DISABLE_SPLIT_DRAW : false;
1220 DRAW_CONTEXT* pDC = GetDrawContext(pContext, isSplitDraw);
1221 InitDraw(pDC, isSplitDraw);
1222
1223 pDC->FeWork.type = DRAW;
1224 pDC->FeWork.pfnWork = GetProcessDrawFunc(false, // IsIndexed
1225 false, // bEnableCutIndex
1226 pState->tsState.tsEnable,
1227 pState->gsState.gsEnable,
1228 pState->soState.soEnable,
1229 pDC->pState->pfnProcessPrims != nullptr);
1230 pDC->FeWork.desc.draw.numVerts = numVertsForDraw;
1231 pDC->FeWork.desc.draw.startVertex = startVertex;
1232 pDC->FeWork.desc.draw.numInstances = numInstances;
1233 pDC->FeWork.desc.draw.startInstance = startInstance;
1234 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
1235 pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw;
1236
1237 pDC->cleanupState = (remainingVerts == numVertsForDraw);
1238
1239 // enqueue DC
1240 QueueDraw(pContext);
1241
1242 AR_API_EVENT(DrawInstancedEvent(pDC->drawId,
1243 topology,
1244 numVertsForDraw,
1245 startVertex,
1246 numInstances,
1247 startInstance,
1248 pState->tsState.tsEnable,
1249 pState->gsState.gsEnable,
1250 pState->soState.soEnable,
1251 pState->gsState.outputTopology,
1252 draw));
1253
1254 remainingVerts -= numVertsForDraw;
1255 draw++;
1256 }
1257
1258 // restore culling state
1259 pDC = GetDrawContext(pContext);
1260 pDC->pState->state.rastState.cullMode = oldCullMode;
1261
1262 RDTSC_END(pContext->pBucketMgr, APIDraw, numVertices * numInstances);
1263 }
1264
1265 //////////////////////////////////////////////////////////////////////////
1266 /// @brief SwrDraw
1267 /// @param hContext - Handle passed back from SwrCreateContext
1268 /// @param topology - Specifies topology for draw.
1269 /// @param startVertex - Specifies start vertex in vertex buffer for draw.
1270 /// @param primCount - Number of vertices.
1271 void SwrDraw(HANDLE hContext,
1272 PRIMITIVE_TOPOLOGY topology,
1273 uint32_t startVertex,
1274 uint32_t numVertices)
1275 {
1276 DrawInstanced(hContext, topology, numVertices, startVertex);
1277 }
1278
1279 //////////////////////////////////////////////////////////////////////////
1280 /// @brief SwrDrawInstanced
1281 /// @param hContext - Handle passed back from SwrCreateContext
1282 /// @param topology - Specifies topology for draw.
1283 /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
1284 /// @param numInstances - How many instances to render.
1285 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1286 /// @param startInstance - Which instance to start sequentially fetching from in each buffer
1287 /// (instanced data)
1288 void SwrDrawInstanced(HANDLE hContext,
1289 PRIMITIVE_TOPOLOGY topology,
1290 uint32_t numVertsPerInstance,
1291 uint32_t numInstances,
1292 uint32_t startVertex,
1293 uint32_t startInstance)
1294 {
1295 DrawInstanced(
1296 hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance);
1297 }
1298
1299 //////////////////////////////////////////////////////////////////////////
1300 /// @brief DrawIndexedInstanced
1301 /// @param hContext - Handle passed back from SwrCreateContext
1302 /// @param topology - Specifies topology for draw.
1303 /// @param numIndices - Number of indices to read sequentially from index buffer.
1304 /// @param indexOffset - Starting index into index buffer.
1305 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1306 /// @param numInstances - Number of instances to render.
1307 /// @param startInstance - Which instance to start sequentially fetching from in each buffer
1308 /// (instanced data)
1309 void DrawIndexedInstance(HANDLE hContext,
1310 PRIMITIVE_TOPOLOGY topology,
1311 uint32_t numIndices,
1312 uint32_t indexOffset,
1313 int32_t baseVertex,
1314 uint32_t numInstances = 1,
1315 uint32_t startInstance = 0)
1316 {
1317 if (KNOB_TOSS_DRAW)
1318 {
1319 return;
1320 }
1321
1322 SWR_CONTEXT* pContext = GetContext(hContext);
1323 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1324 API_STATE* pState = &pDC->pState->state;
1325
1326 RDTSC_BEGIN(pContext->pBucketMgr, APIDrawIndexed, pDC->drawId);
1327
1328 uint32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
1329 uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw);
1330 uint32_t remainingIndices = numIndices;
1331
1332 uint32_t indexSize = 0;
1333 switch (pState->indexBuffer.format)
1334 {
1335 case R32_UINT:
1336 indexSize = sizeof(uint32_t);
1337 break;
1338 case R16_UINT:
1339 indexSize = sizeof(uint16_t);
1340 break;
1341 case R8_UINT:
1342 indexSize = sizeof(uint8_t);
1343 break;
1344 default:
1345 SWR_INVALID("Invalid index buffer format: %d", pState->indexBuffer.format);
1346 }
1347
1348 int draw = 0;
1349 gfxptr_t xpIB = pState->indexBuffer.xpIndices;
1350 xpIB += (uint64_t)indexOffset * (uint64_t)indexSize;
1351
1352 pState->topology = topology;
1353 pState->forceFront = false;
1354
1355 // disable culling for points/lines
1356 uint32_t oldCullMode = pState->rastState.cullMode;
1357 if (topology == TOP_POINT_LIST)
1358 {
1359 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1360 pState->forceFront = true;
1361 }
1362 else if (topology == TOP_RECT_LIST)
1363 {
1364 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1365 }
1366
1367 while (remainingIndices)
1368 {
1369 uint32_t numIndicesForDraw =
1370 (remainingIndices < maxIndicesPerDraw) ? remainingIndices : maxIndicesPerDraw;
1371
1372 // When breaking up draw, we need to obtain new draw context for each iteration.
1373 bool isSplitDraw = (draw > 0) ? !KNOB_DISABLE_SPLIT_DRAW : false;
1374
1375 pDC = GetDrawContext(pContext, isSplitDraw);
1376 InitDraw(pDC, isSplitDraw);
1377
1378 pDC->FeWork.type = DRAW;
1379 pDC->FeWork.pfnWork = GetProcessDrawFunc(true, // IsIndexed
1380 pState->frontendState.bEnableCutIndex,
1381 pState->tsState.tsEnable,
1382 pState->gsState.gsEnable,
1383 pState->soState.soEnable,
1384 pDC->pState->pfnProcessPrims != nullptr);
1385 pDC->FeWork.desc.draw.pDC = pDC;
1386 pDC->FeWork.desc.draw.numIndices = numIndicesForDraw;
1387 pDC->FeWork.desc.draw.xpIB = xpIB;
1388 pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format;
1389
1390 pDC->FeWork.desc.draw.numInstances = numInstances;
1391 pDC->FeWork.desc.draw.startInstance = startInstance;
1392 pDC->FeWork.desc.draw.baseVertex = baseVertex;
1393 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
1394
1395 pDC->cleanupState = (remainingIndices == numIndicesForDraw);
1396
1397 // enqueue DC
1398 QueueDraw(pContext);
1399
1400 AR_API_EVENT(DrawIndexedInstancedEvent(pDC->drawId,
1401 topology,
1402 numIndicesForDraw,
1403 indexOffset,
1404 baseVertex,
1405 numInstances,
1406 startInstance,
1407 pState->tsState.tsEnable,
1408 pState->gsState.gsEnable,
1409 pState->soState.soEnable,
1410 pState->gsState.outputTopology,
1411 draw));
1412
1413 xpIB += maxIndicesPerDraw * indexSize;
1414 remainingIndices -= numIndicesForDraw;
1415 draw++;
1416 }
1417
1418 // Restore culling state
1419 pDC = GetDrawContext(pContext);
1420 pDC->pState->state.rastState.cullMode = oldCullMode;
1421
1422 RDTSC_END(pContext->pBucketMgr, APIDrawIndexed, numIndices * numInstances);
1423 }
1424
1425 //////////////////////////////////////////////////////////////////////////
1426 /// @brief DrawIndexed
1427 /// @param hContext - Handle passed back from SwrCreateContext
1428 /// @param topology - Specifies topology for draw.
1429 /// @param numIndices - Number of indices to read sequentially from index buffer.
1430 /// @param indexOffset - Starting index into index buffer.
1431 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1432 void SwrDrawIndexed(HANDLE hContext,
1433 PRIMITIVE_TOPOLOGY topology,
1434 uint32_t numIndices,
1435 uint32_t indexOffset,
1436 int32_t baseVertex)
1437 {
1438 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex);
1439 }
1440
1441 //////////////////////////////////////////////////////////////////////////
1442 /// @brief SwrDrawIndexedInstanced
1443 /// @param hContext - Handle passed back from SwrCreateContext
1444 /// @param topology - Specifies topology for draw.
1445 /// @param numIndices - Number of indices to read sequentially from index buffer.
1446 /// @param numInstances - Number of instances to render.
1447 /// @param indexOffset - Starting index into index buffer.
1448 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1449 /// @param startInstance - Which instance to start sequentially fetching from in each buffer
1450 /// (instanced data)
1451 void SwrDrawIndexedInstanced(HANDLE hContext,
1452 PRIMITIVE_TOPOLOGY topology,
1453 uint32_t numIndices,
1454 uint32_t numInstances,
1455 uint32_t indexOffset,
1456 int32_t baseVertex,
1457 uint32_t startInstance)
1458 {
1459 DrawIndexedInstance(
1460 hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance);
1461 }
1462
1463 //////////////////////////////////////////////////////////////////////////
1464 /// @brief SwrInvalidateTiles
1465 /// @param hContext - Handle passed back from SwrCreateContext
1466 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to
1467 /// invalidate.
1468 /// @param invalidateRect - The pixel-coordinate rectangle to invalidate. This will be expanded to
1469 /// be hottile size-aligned.
1470 void SWR_API SwrInvalidateTiles(HANDLE hContext,
1471 uint32_t attachmentMask,
1472 const SWR_RECT& invalidateRect)
1473 {
1474 if (KNOB_TOSS_DRAW)
1475 {
1476 return;
1477 }
1478
1479 SWR_CONTEXT* pContext = GetContext(hContext);
1480 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1481
1482 pDC->FeWork.type = DISCARDINVALIDATETILES;
1483 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
1484 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
1485 pDC->FeWork.desc.discardInvalidateTiles.rect = invalidateRect;
1486 pDC->FeWork.desc.discardInvalidateTiles.rect &= g_MaxScissorRect;
1487 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_INVALID;
1488 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false;
1489 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = false;
1490
1491 // enqueue
1492 QueueDraw(pContext);
1493
1494 AR_API_EVENT(SwrInvalidateTilesEvent(pDC->drawId));
1495 }
1496
1497 //////////////////////////////////////////////////////////////////////////
1498 /// @brief SwrDiscardRect
1499 /// @param hContext - Handle passed back from SwrCreateContext
1500 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
1501 /// @param rect - The pixel-coordinate rectangle to discard. Only fully-covered hottiles will be
1502 /// discarded.
1503 void SWR_API SwrDiscardRect(HANDLE hContext, uint32_t attachmentMask, const SWR_RECT& rect)
1504 {
1505 if (KNOB_TOSS_DRAW)
1506 {
1507 return;
1508 }
1509
1510 SWR_CONTEXT* pContext = GetContext(hContext);
1511 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1512
1513 // Queue a load to the hottile
1514 pDC->FeWork.type = DISCARDINVALIDATETILES;
1515 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
1516 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
1517 pDC->FeWork.desc.discardInvalidateTiles.rect = rect;
1518 pDC->FeWork.desc.discardInvalidateTiles.rect &= g_MaxScissorRect;
1519 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_RESOLVED;
1520 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true;
1521 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = true;
1522
1523 // enqueue
1524 QueueDraw(pContext);
1525
1526 AR_API_EVENT(SwrDiscardRectEvent(pDC->drawId));
1527 }
1528
1529 //////////////////////////////////////////////////////////////////////////
1530 /// @brief SwrDispatch
1531 /// @param hContext - Handle passed back from SwrCreateContext
1532 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
1533 /// @param threadGroupCountY - Number of thread groups dispatched in Y direction
1534 /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
1535 void SwrDispatch(HANDLE hContext,
1536 uint32_t threadGroupCountX,
1537 uint32_t threadGroupCountY,
1538 uint32_t threadGroupCountZ
1539
1540 )
1541 {
1542 if (KNOB_TOSS_DRAW)
1543 {
1544 return;
1545 }
1546
1547 SWR_CONTEXT* pContext = GetContext(hContext);
1548 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1549
1550 RDTSC_BEGIN(pContext->pBucketMgr, APIDispatch, pDC->drawId);
1551 AR_API_EVENT(
1552 DispatchEvent(pDC->drawId, threadGroupCountX, threadGroupCountY, threadGroupCountZ));
1553 pDC->isCompute = true; // This is a compute context.
1554
1555 COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64);
1556
1557 pTaskData->threadGroupCountX = threadGroupCountX;
1558 pTaskData->threadGroupCountY = threadGroupCountY;
1559 pTaskData->threadGroupCountZ = threadGroupCountZ;
1560
1561 pTaskData->enableThreadDispatch = false;
1562
1563 uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
1564 uint32_t dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT;
1565 pDC->pDispatch = &pContext->pDispatchQueueArray[dcIndex];
1566 pDC->pDispatch->initialize(totalThreadGroups, pTaskData, &ProcessComputeBE);
1567
1568 QueueDispatch(pContext);
1569 RDTSC_END(pContext->pBucketMgr,
1570 APIDispatch,
1571 threadGroupCountX * threadGroupCountY * threadGroupCountZ);
1572 }
1573
1574 // Deswizzles, converts and stores current contents of the hot tiles to surface
1575 // described by pState
1576 void SWR_API SwrStoreTiles(HANDLE hContext,
1577 uint32_t attachmentMask,
1578 SWR_TILE_STATE postStoreTileState,
1579 const SWR_RECT& storeRect)
1580 {
1581 if (KNOB_TOSS_DRAW)
1582 {
1583 return;
1584 }
1585
1586 SWR_CONTEXT* pContext = GetContext(hContext);
1587 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1588
1589 RDTSC_BEGIN(pContext->pBucketMgr, APIStoreTiles, pDC->drawId);
1590
1591 pDC->FeWork.type = STORETILES;
1592 pDC->FeWork.pfnWork = ProcessStoreTiles;
1593 pDC->FeWork.desc.storeTiles.attachmentMask = attachmentMask;
1594 pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState;
1595 pDC->FeWork.desc.storeTiles.rect = storeRect;
1596 pDC->FeWork.desc.storeTiles.rect &= g_MaxScissorRect;
1597
1598 // enqueue
1599 QueueDraw(pContext);
1600
1601 AR_API_EVENT(SwrStoreTilesEvent(pDC->drawId));
1602
1603 RDTSC_END(pContext->pBucketMgr, APIStoreTiles, 1);
1604 }
1605
1606 //////////////////////////////////////////////////////////////////////////
1607 /// @brief SwrClearRenderTarget - Clear attached render targets / depth / stencil
1608 /// @param hContext - Handle passed back from SwrCreateContext
1609 /// @param attachmentMask - combination of SWR_ATTACHMENT_*_BIT attachments to clear
1610 /// @param renderTargetArrayIndex - the RT array index to clear
1611 /// @param clearColor - color use for clearing render targets
1612 /// @param z - depth value use for clearing depth buffer
1613 /// @param stencil - stencil value used for clearing stencil buffer
1614 /// @param clearRect - The pixel-coordinate rectangle to clear in all cleared buffers
1615 void SWR_API SwrClearRenderTarget(HANDLE hContext,
1616 uint32_t attachmentMask,
1617 uint32_t renderTargetArrayIndex,
1618 const float clearColor[4],
1619 float z,
1620 uint8_t stencil,
1621 const SWR_RECT& clearRect)
1622 {
1623 if (KNOB_TOSS_DRAW)
1624 {
1625 return;
1626 }
1627
1628 SWR_CONTEXT* pContext = GetContext(hContext);
1629 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1630
1631 RDTSC_BEGIN(pContext->pBucketMgr, APIClearRenderTarget, pDC->drawId);
1632
1633 pDC->FeWork.type = CLEAR;
1634 pDC->FeWork.pfnWork = ProcessClear;
1635 pDC->FeWork.desc.clear.rect = clearRect;
1636 pDC->FeWork.desc.clear.rect &= g_MaxScissorRect;
1637 pDC->FeWork.desc.clear.attachmentMask = attachmentMask;
1638 pDC->FeWork.desc.clear.renderTargetArrayIndex = renderTargetArrayIndex;
1639 pDC->FeWork.desc.clear.clearDepth = z;
1640 pDC->FeWork.desc.clear.clearRTColor[0] = clearColor[0];
1641 pDC->FeWork.desc.clear.clearRTColor[1] = clearColor[1];
1642 pDC->FeWork.desc.clear.clearRTColor[2] = clearColor[2];
1643 pDC->FeWork.desc.clear.clearRTColor[3] = clearColor[3];
1644 pDC->FeWork.desc.clear.clearStencil = stencil;
1645
1646 // enqueue draw
1647 QueueDraw(pContext);
1648
1649 RDTSC_END(pContext->pBucketMgr, APIClearRenderTarget, 1);
1650 }
1651
1652 //////////////////////////////////////////////////////////////////////////
1653 /// @brief Returns a pointer to the private context state for the current
1654 /// draw operation. This is used for external componets such as the
1655 /// sampler.
1656 /// SWR is responsible for the allocation of the private context state.
1657 /// @param hContext - Handle passed back from SwrCreateContext
1658 VOID* SwrGetPrivateContextState(HANDLE hContext)
1659 {
1660 SWR_CONTEXT* pContext = GetContext(hContext);
1661 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1662 DRAW_STATE* pState = pDC->pState;
1663
1664 if (pState->pPrivateState == nullptr)
1665 {
1666 pState->pPrivateState = pState->pArena->AllocAligned(pContext->privateStateSize,
1667 KNOB_SIMD_WIDTH * sizeof(float));
1668 }
1669
1670 return pState->pPrivateState;
1671 }
1672
1673 //////////////////////////////////////////////////////////////////////////
1674 /// @brief Clients can use this to allocate memory for draw/dispatch
1675 /// operations. The memory will automatically be freed once operation
1676 /// has completed. Client can use this to allocate binding tables,
1677 /// etc. needed for shader execution.
1678 /// @param hContext - Handle passed back from SwrCreateContext
1679 /// @param size - Size of allocation
1680 /// @param align - Alignment needed for allocation.
1681 VOID* SwrAllocDrawContextMemory(HANDLE hContext, uint32_t size, uint32_t align)
1682 {
1683 SWR_CONTEXT* pContext = GetContext(hContext);
1684 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1685
1686 return pDC->pState->pArena->AllocAligned(size, align);
1687 }
1688
1689 //////////////////////////////////////////////////////////////////////////
1690 /// @brief Enables stats counting
1691 /// @param hContext - Handle passed back from SwrCreateContext
1692 /// @param enable - If true then counts are incremented.
1693 void SwrEnableStatsFE(HANDLE hContext, bool enable)
1694 {
1695 SWR_CONTEXT* pContext = GetContext(hContext);
1696 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1697
1698 pDC->pState->state.enableStatsFE = enable;
1699 }
1700
1701 //////////////////////////////////////////////////////////////////////////
1702 /// @brief Enables stats counting
1703 /// @param hContext - Handle passed back from SwrCreateContext
1704 /// @param enable - If true then counts are incremented.
1705 void SwrEnableStatsBE(HANDLE hContext, bool enable)
1706 {
1707 SWR_CONTEXT* pContext = GetContext(hContext);
1708 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1709
1710 pDC->pState->state.enableStatsBE = enable;
1711 }
1712
1713 //////////////////////////////////////////////////////////////////////////
1714 /// @brief Mark end of frame - used for performance profiling
1715 /// @param hContext - Handle passed back from SwrCreateContext
1716 void SWR_API SwrEndFrame(HANDLE hContext)
1717 {
1718 SWR_CONTEXT* pContext = GetContext(hContext);
1719 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1720 (void)pDC; // var used
1721
1722 RDTSC_ENDFRAME(pContext->pBucketMgr);
1723 AR_API_EVENT(FrameEndEvent(pContext->frameCount, pDC->drawId));
1724
1725 pContext->frameCount++;
1726 }
1727
1728 void InitSimLoadTilesTable();
1729 void InitSimStoreTilesTable();
1730 void InitSimClearTilesTable();
1731
1732 void InitClearTilesTable();
1733 void InitBackendFuncTables();
1734
1735 //////////////////////////////////////////////////////////////////////////
1736 /// @brief Initialize swr backend and memory internal tables
1737 void SwrInit()
1738 {
1739 InitClearTilesTable();
1740 InitBackendFuncTables();
1741 InitRasterizerFunctions();
1742 }
1743
1744 void SwrGetInterface(SWR_INTERFACE& out_funcs)
1745 {
1746 out_funcs.pfnSwrCreateContext = SwrCreateContext;
1747 out_funcs.pfnSwrDestroyContext = SwrDestroyContext;
1748 out_funcs.pfnSwrBindApiThread = SwrBindApiThread;
1749 out_funcs.pfnSwrSaveState = SwrSaveState;
1750 out_funcs.pfnSwrRestoreState = SwrRestoreState;
1751 out_funcs.pfnSwrSync = SwrSync;
1752 out_funcs.pfnSwrStallBE = SwrStallBE;
1753 out_funcs.pfnSwrWaitForIdle = SwrWaitForIdle;
1754 out_funcs.pfnSwrWaitForIdleFE = SwrWaitForIdleFE;
1755 out_funcs.pfnSwrSetVertexBuffers = SwrSetVertexBuffers;
1756 out_funcs.pfnSwrSetIndexBuffer = SwrSetIndexBuffer;
1757 out_funcs.pfnSwrSetFetchFunc = SwrSetFetchFunc;
1758 out_funcs.pfnSwrSetSoFunc = SwrSetSoFunc;
1759 out_funcs.pfnSwrSetSoState = SwrSetSoState;
1760 out_funcs.pfnSwrSetSoBuffers = SwrSetSoBuffers;
1761 out_funcs.pfnSwrSetVertexFunc = SwrSetVertexFunc;
1762 out_funcs.pfnSwrSetFrontendState = SwrSetFrontendState;
1763 out_funcs.pfnSwrSetGsState = SwrSetGsState;
1764 out_funcs.pfnSwrSetGsFunc = SwrSetGsFunc;
1765 out_funcs.pfnSwrSetCsFunc = SwrSetCsFunc;
1766 out_funcs.pfnSwrSetTsState = SwrSetTsState;
1767 out_funcs.pfnSwrSetHsFunc = SwrSetHsFunc;
1768 out_funcs.pfnSwrSetDsFunc = SwrSetDsFunc;
1769 out_funcs.pfnSwrSetDepthStencilState = SwrSetDepthStencilState;
1770 out_funcs.pfnSwrSetBackendState = SwrSetBackendState;
1771 out_funcs.pfnSwrSetDepthBoundsState = SwrSetDepthBoundsState;
1772 out_funcs.pfnSwrSetPixelShaderState = SwrSetPixelShaderState;
1773 out_funcs.pfnSwrSetBlendState = SwrSetBlendState;
1774 out_funcs.pfnSwrSetBlendFunc = SwrSetBlendFunc;
1775 out_funcs.pfnSwrDraw = SwrDraw;
1776 out_funcs.pfnSwrDrawInstanced = SwrDrawInstanced;
1777 out_funcs.pfnSwrDrawIndexed = SwrDrawIndexed;
1778 out_funcs.pfnSwrDrawIndexedInstanced = SwrDrawIndexedInstanced;
1779 out_funcs.pfnSwrInvalidateTiles = SwrInvalidateTiles;
1780 out_funcs.pfnSwrDiscardRect = SwrDiscardRect;
1781 out_funcs.pfnSwrDispatch = SwrDispatch;
1782 out_funcs.pfnSwrStoreTiles = SwrStoreTiles;
1783 out_funcs.pfnSwrClearRenderTarget = SwrClearRenderTarget;
1784 out_funcs.pfnSwrSetRastState = SwrSetRastState;
1785 out_funcs.pfnSwrSetViewports = SwrSetViewports;
1786 out_funcs.pfnSwrSetScissorRects = SwrSetScissorRects;
1787 out_funcs.pfnSwrGetPrivateContextState = SwrGetPrivateContextState;
1788 out_funcs.pfnSwrAllocDrawContextMemory = SwrAllocDrawContextMemory;
1789 out_funcs.pfnSwrEnableStatsFE = SwrEnableStatsFE;
1790 out_funcs.pfnSwrEnableStatsBE = SwrEnableStatsBE;
1791 out_funcs.pfnSwrEndFrame = SwrEndFrame;
1792 out_funcs.pfnSwrInit = SwrInit;
1793 }