swr: avoid using exceptions for expected condition handling
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / api.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file api.cpp
24 *
25 * @brief API implementation
26 *
27 ******************************************************************************/
28
29 #include <cfloat>
30 #include <cmath>
31 #include <cstdio>
32 #include <new>
33
34 #include "core/api.h"
35 #include "core/backend.h"
36 #include "core/context.h"
37 #include "core/depthstencil.h"
38 #include "core/frontend.h"
39 #include "core/rasterizer.h"
40 #include "core/rdtsc_core.h"
41 #include "core/threads.h"
42 #include "core/tilemgr.h"
43 #include "core/clip.h"
44 #include "core/utils.h"
45
46 #include "common/simdintrin.h"
47 #include "common/os.h"
48
49 static const SWR_RECT g_MaxScissorRect = { 0, 0, KNOB_MAX_SCISSOR_X, KNOB_MAX_SCISSOR_Y };
50
51 void SetupDefaultState(SWR_CONTEXT *pContext);
52
53 static INLINE SWR_CONTEXT* GetContext(HANDLE hContext)
54 {
55 return (SWR_CONTEXT*)hContext;
56 }
57
58 void WakeAllThreads(SWR_CONTEXT *pContext)
59 {
60 pContext->FifosNotEmpty.notify_all();
61 }
62
63 //////////////////////////////////////////////////////////////////////////
64 /// @brief Create SWR Context.
65 /// @param pCreateInfo - pointer to creation info.
66 HANDLE SwrCreateContext(
67 SWR_CREATECONTEXT_INFO* pCreateInfo)
68 {
69 RDTSC_RESET();
70 RDTSC_INIT(0);
71
72 void* pContextMem = AlignedMalloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4);
73 memset(pContextMem, 0, sizeof(SWR_CONTEXT));
74 SWR_CONTEXT *pContext = new (pContextMem) SWR_CONTEXT();
75
76 pContext->privateStateSize = pCreateInfo->privateStateSize;
77
78 pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
79 pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
80
81 pContext->pMacroTileManagerArray = (MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
82 pContext->pDispatchQueueArray = (DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
83
84 for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
85 {
86 pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
87 new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena);
88 new (&pContext->pDispatchQueueArray[dc]) DispatchQueue();
89
90 pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
91 }
92
93 pContext->threadInfo.MAX_WORKER_THREADS = KNOB_MAX_WORKER_THREADS;
94 pContext->threadInfo.MAX_NUMA_NODES = KNOB_MAX_NUMA_NODES;
95 pContext->threadInfo.MAX_CORES_PER_NUMA_NODE = KNOB_MAX_CORES_PER_NUMA_NODE;
96 pContext->threadInfo.MAX_THREADS_PER_CORE = KNOB_MAX_THREADS_PER_CORE;
97 pContext->threadInfo.SINGLE_THREADED = KNOB_SINGLE_THREADED;
98
99 if (pCreateInfo->pThreadInfo)
100 {
101 pContext->threadInfo = *pCreateInfo->pThreadInfo;
102 }
103
104 memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
105 memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
106 new (&pContext->WaitLock) std::mutex();
107 new (&pContext->FifosNotEmpty) std::condition_variable();
108
109 CreateThreadPool(pContext, &pContext->threadPool);
110
111 pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads];
112 pContext->pStats = new SWR_STATS[pContext->NumWorkerThreads];
113
114 #if defined(KNOB_ENABLE_AR)
115 // Setup ArchRast thread contexts which includes +1 for API thread.
116 pContext->pArContext = new HANDLE[pContext->NumWorkerThreads+1];
117 pContext->pArContext[pContext->NumWorkerThreads] = ArchRast::CreateThreadContext(ArchRast::AR_THREAD::API);
118 #endif
119
120 // Allocate scratch space for workers.
121 ///@note We could lazily allocate this but its rather small amount of memory.
122 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
123 {
124 #if defined(_WIN32)
125 uint32_t numaNode = pContext->threadPool.pThreadData ?
126 pContext->threadPool.pThreadData[i].numaId : 0;
127 pContext->ppScratch[i] = (uint8_t*)VirtualAllocExNuma(
128 GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE),
129 MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE,
130 numaNode);
131 #else
132 pContext->ppScratch[i] = (uint8_t*)AlignedMalloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4);
133 #endif
134
135 #if defined(KNOB_ENABLE_AR)
136 // Initialize worker thread context for ArchRast.
137 pContext->pArContext[i] = ArchRast::CreateThreadContext(ArchRast::AR_THREAD::WORKER);
138 #endif
139 }
140
141 // State setup AFTER context is fully initialized
142 SetupDefaultState(pContext);
143
144 // initialize hot tile manager
145 pContext->pHotTileMgr = new HotTileMgr();
146
147 // initialize function pointer tables
148 InitClearTilesTable();
149
150 // initialize callback functions
151 pContext->pfnLoadTile = pCreateInfo->pfnLoadTile;
152 pContext->pfnStoreTile = pCreateInfo->pfnStoreTile;
153 pContext->pfnClearTile = pCreateInfo->pfnClearTile;
154 pContext->pfnUpdateSoWriteOffset = pCreateInfo->pfnUpdateSoWriteOffset;
155 pContext->pfnUpdateStats = pCreateInfo->pfnUpdateStats;
156 pContext->pfnUpdateStatsFE = pCreateInfo->pfnUpdateStatsFE;
157
158
159 // pass pointer to bucket manager back to caller
160 #ifdef KNOB_ENABLE_RDTSC
161 pCreateInfo->pBucketMgr = &gBucketMgr;
162 #endif
163
164 pCreateInfo->contextSaveSize = sizeof(API_STATE);
165
166 StartThreadPool(pContext, &pContext->threadPool);
167
168 return (HANDLE)pContext;
169 }
170
171 void CopyState(DRAW_STATE& dst, const DRAW_STATE& src)
172 {
173 memcpy(&dst.state, &src.state, sizeof(API_STATE));
174 }
175
176 template<bool IsDraw>
177 void QueueWork(SWR_CONTEXT *pContext)
178 {
179 DRAW_CONTEXT* pDC = pContext->pCurDrawContext;
180 uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
181
182 if (IsDraw)
183 {
184 pDC->pTileMgr = &pContext->pMacroTileManagerArray[dcIndex];
185 pDC->pTileMgr->initialize();
186 }
187
188 // Each worker thread looks at a DC for both FE and BE work at different times and so we
189 // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers
190 // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
191 // then moved on if all work is done.)
192 pContext->pCurDrawContext->threadsDone = pContext->NumFEThreads + pContext->NumBEThreads;
193
194 if (IsDraw)
195 {
196 InterlockedIncrement((volatile LONG*)&pContext->drawsOutstandingFE);
197 }
198
199 _ReadWriteBarrier();
200 {
201 std::unique_lock<std::mutex> lock(pContext->WaitLock);
202 pContext->dcRing.Enqueue();
203 }
204
205 if (pContext->threadInfo.SINGLE_THREADED)
206 {
207 // flush denormals to 0
208 uint32_t mxcsr = _mm_getcsr();
209 _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
210
211 if (IsDraw)
212 {
213 uint32_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
214 WorkOnFifoFE(pContext, 0, curDraw[0]);
215 WorkOnFifoBE(pContext, 0, curDraw[1], pContext->singleThreadLockedTiles, 0, 0);
216 }
217 else
218 {
219 uint32_t curDispatch = pContext->pCurDrawContext->drawId;
220 WorkOnCompute(pContext, 0, curDispatch);
221 }
222
223 // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
224 while (CompleteDrawContext(pContext, pContext->pCurDrawContext) > 0) {}
225
226 // restore csr
227 _mm_setcsr(mxcsr);
228 }
229 else
230 {
231 AR_API_BEGIN(APIDrawWakeAllThreads, pDC->drawId);
232 WakeAllThreads(pContext);
233 AR_API_END(APIDrawWakeAllThreads, 1);
234 }
235
236 // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
237 pContext->pPrevDrawContext = pContext->pCurDrawContext;
238 pContext->pCurDrawContext = nullptr;
239 }
240
241 INLINE void QueueDraw(SWR_CONTEXT* pContext)
242 {
243 QueueWork<true>(pContext);
244 }
245
246 INLINE void QueueDispatch(SWR_CONTEXT* pContext)
247 {
248 QueueWork<false>(pContext);
249 }
250
251 DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
252 {
253 AR_API_BEGIN(APIGetDrawContext, 0);
254 // If current draw context is null then need to obtain a new draw context to use from ring.
255 if (pContext->pCurDrawContext == nullptr)
256 {
257 // Need to wait for a free entry.
258 while (pContext->dcRing.IsFull())
259 {
260 _mm_pause();
261 }
262
263 uint64_t curDraw = pContext->dcRing.GetHead();
264 uint32_t dcIndex = curDraw % KNOB_MAX_DRAWS_IN_FLIGHT;
265
266 if ((pContext->frameCount - pContext->lastFrameChecked) > 2 ||
267 (curDraw - pContext->lastDrawChecked) > 0x10000)
268 {
269 // Take this opportunity to clean-up old arena allocations
270 pContext->cachingArenaAllocator.FreeOldBlocks();
271
272 pContext->lastFrameChecked = pContext->frameCount;
273 pContext->lastDrawChecked = curDraw;
274 }
275
276 DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
277 pContext->pCurDrawContext = pCurDrawContext;
278
279 // Assign next available entry in DS ring to this DC.
280 uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
281 pCurDrawContext->pState = &pContext->dsRing[dsIndex];
282
283 // Copy previous state to current state.
284 if (pContext->pPrevDrawContext)
285 {
286 DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext;
287
288 // If we're splitting our draw then we can just use the same state from the previous
289 // draw. In this case, we won't increment the DS ring index so the next non-split
290 // draw can receive the state.
291 if (isSplitDraw == false)
292 {
293 CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState);
294
295 // Should have been cleaned up previously
296 SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
297
298 pCurDrawContext->pState->pPrivateState = nullptr;
299
300 pContext->curStateId++; // Progress state ring index forward.
301 }
302 else
303 {
304 // If its a split draw then just copy the state pointer over
305 // since its the same draw.
306 pCurDrawContext->pState = pPrevDrawContext->pState;
307 SWR_ASSERT(pPrevDrawContext->cleanupState == false);
308 }
309 }
310 else
311 {
312 SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
313 pContext->curStateId++; // Progress state ring index forward.
314 }
315
316 SWR_ASSERT(pCurDrawContext->pArena->IsEmpty() == true);
317
318 // Reset dependency
319 pCurDrawContext->dependent = false;
320 pCurDrawContext->dependentFE = false;
321
322 pCurDrawContext->pContext = pContext;
323 pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
324
325 pCurDrawContext->doneFE = false;
326 pCurDrawContext->FeLock = 0;
327 pCurDrawContext->threadsDone = 0;
328 pCurDrawContext->retireCallback.pfnCallbackFunc = nullptr;
329
330 pCurDrawContext->dynState.Reset(pContext->NumWorkerThreads);
331
332 // Assign unique drawId for this DC
333 pCurDrawContext->drawId = pContext->dcRing.GetHead();
334
335 pCurDrawContext->cleanupState = true;
336
337 }
338 else
339 {
340 SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC");
341 }
342
343 AR_API_END(APIGetDrawContext, 0);
344 return pContext->pCurDrawContext;
345 }
346
347 API_STATE* GetDrawState(SWR_CONTEXT *pContext)
348 {
349 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
350 SWR_ASSERT(pDC->pState != nullptr);
351
352 return &pDC->pState->state;
353 }
354
355 void SwrDestroyContext(HANDLE hContext)
356 {
357 SWR_CONTEXT *pContext = GetContext(hContext);
358 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
359
360 pDC->FeWork.type = SHUTDOWN;
361 pDC->FeWork.pfnWork = ProcessShutdown;
362
363 //enqueue
364 QueueDraw(pContext);
365
366 DestroyThreadPool(pContext, &pContext->threadPool);
367
368 // free the fifos
369 for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i)
370 {
371 delete[] pContext->dcRing[i].dynState.pStats;
372 delete pContext->dcRing[i].pArena;
373 delete pContext->dsRing[i].pArena;
374 pContext->pMacroTileManagerArray[i].~MacroTileMgr();
375 pContext->pDispatchQueueArray[i].~DispatchQueue();
376 }
377
378 AlignedFree(pContext->pDispatchQueueArray);
379 AlignedFree(pContext->pMacroTileManagerArray);
380
381 // Free scratch space.
382 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
383 {
384 #if defined(_WIN32)
385 VirtualFree(pContext->ppScratch[i], 0, MEM_RELEASE);
386 #else
387 AlignedFree(pContext->ppScratch[i]);
388 #endif
389
390 #if defined(KNOB_ENABLE_AR)
391 ArchRast::DestroyThreadContext(pContext->pArContext[i]);
392 #endif
393 }
394
395 delete[] pContext->ppScratch;
396 delete[] pContext->pStats;
397
398 delete(pContext->pHotTileMgr);
399
400 pContext->~SWR_CONTEXT();
401 AlignedFree(GetContext(hContext));
402 }
403
404 void SWR_API SwrSaveState(
405 HANDLE hContext,
406 void* pOutputStateBlock,
407 size_t memSize)
408 {
409 SWR_CONTEXT *pContext = GetContext(hContext);
410 auto pSrc = GetDrawState(pContext);
411 SWR_ASSERT(pOutputStateBlock && memSize >= sizeof(*pSrc));
412
413 memcpy(pOutputStateBlock, pSrc, sizeof(*pSrc));
414 }
415
416 void SWR_API SwrRestoreState(
417 HANDLE hContext,
418 const void* pStateBlock,
419 size_t memSize)
420 {
421 SWR_CONTEXT *pContext = GetContext(hContext);
422 auto pDst = GetDrawState(pContext);
423 SWR_ASSERT(pStateBlock && memSize >= sizeof(*pDst));
424
425 memcpy(pDst, pStateBlock, sizeof(*pDst));
426 }
427
428 void SetupDefaultState(SWR_CONTEXT *pContext)
429 {
430 API_STATE* pState = GetDrawState(pContext);
431
432 pState->rastState.cullMode = SWR_CULLMODE_NONE;
433 pState->rastState.frontWinding = SWR_FRONTWINDING_CCW;
434
435 pState->depthBoundsState.depthBoundsTestEnable = false;
436 pState->depthBoundsState.depthBoundsTestMinValue = 0.0f;
437 pState->depthBoundsState.depthBoundsTestMaxValue = 1.0f;
438 }
439
440 void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint64_t userData2, uint64_t userData3)
441 {
442 SWR_ASSERT(pfnFunc != nullptr);
443
444 SWR_CONTEXT *pContext = GetContext(hContext);
445 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
446
447 AR_API_BEGIN(APISync, 0);
448
449 pDC->FeWork.type = SYNC;
450 pDC->FeWork.pfnWork = ProcessSync;
451
452 // Setup callback function
453 pDC->retireCallback.pfnCallbackFunc = pfnFunc;
454 pDC->retireCallback.userData = userData;
455 pDC->retireCallback.userData2 = userData2;
456 pDC->retireCallback.userData3 = userData3;
457
458 //enqueue
459 QueueDraw(pContext);
460
461 AR_API_END(APISync, 1);
462 }
463
464 void SwrWaitForIdle(HANDLE hContext)
465 {
466 SWR_CONTEXT *pContext = GetContext(hContext);
467
468 AR_API_BEGIN(APIWaitForIdle, 0);
469
470 while (!pContext->dcRing.IsEmpty())
471 {
472 _mm_pause();
473 }
474
475 AR_API_END(APIWaitForIdle, 1);
476 }
477
478 void SwrWaitForIdleFE(HANDLE hContext)
479 {
480 SWR_CONTEXT *pContext = GetContext(hContext);
481
482 AR_API_BEGIN(APIWaitForIdle, 0);
483
484 while (pContext->drawsOutstandingFE > 0)
485 {
486 _mm_pause();
487 }
488
489 AR_API_END(APIWaitForIdle, 1);
490 }
491
492 void SwrSetVertexBuffers(
493 HANDLE hContext,
494 uint32_t numBuffers,
495 const SWR_VERTEX_BUFFER_STATE* pVertexBuffers)
496 {
497 API_STATE* pState = GetDrawState(GetContext(hContext));
498
499 for (uint32_t i = 0; i < numBuffers; ++i)
500 {
501 const SWR_VERTEX_BUFFER_STATE *pVB = &pVertexBuffers[i];
502 pState->vertexBuffers[pVB->index] = *pVB;
503 }
504 }
505
506 void SwrSetIndexBuffer(
507 HANDLE hContext,
508 const SWR_INDEX_BUFFER_STATE* pIndexBuffer)
509 {
510 API_STATE* pState = GetDrawState(GetContext(hContext));
511
512 pState->indexBuffer = *pIndexBuffer;
513 }
514
515 void SwrSetFetchFunc(
516 HANDLE hContext,
517 PFN_FETCH_FUNC pfnFetchFunc)
518 {
519 API_STATE* pState = GetDrawState(GetContext(hContext));
520
521 pState->pfnFetchFunc = pfnFetchFunc;
522 }
523
524 void SwrSetSoFunc(
525 HANDLE hContext,
526 PFN_SO_FUNC pfnSoFunc,
527 uint32_t streamIndex)
528 {
529 API_STATE* pState = GetDrawState(GetContext(hContext));
530
531 SWR_ASSERT(streamIndex < MAX_SO_STREAMS);
532
533 pState->pfnSoFunc[streamIndex] = pfnSoFunc;
534 }
535
536 void SwrSetSoState(
537 HANDLE hContext,
538 SWR_STREAMOUT_STATE* pSoState)
539 {
540 API_STATE* pState = GetDrawState(GetContext(hContext));
541
542 pState->soState = *pSoState;
543 }
544
545 void SwrSetSoBuffers(
546 HANDLE hContext,
547 SWR_STREAMOUT_BUFFER* pSoBuffer,
548 uint32_t slot)
549 {
550 API_STATE* pState = GetDrawState(GetContext(hContext));
551
552 SWR_ASSERT((slot < 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot);
553
554 pState->soBuffer[slot] = *pSoBuffer;
555 }
556
557 void SwrSetVertexFunc(
558 HANDLE hContext,
559 PFN_VERTEX_FUNC pfnVertexFunc)
560 {
561 API_STATE* pState = GetDrawState(GetContext(hContext));
562
563 pState->pfnVertexFunc = pfnVertexFunc;
564 }
565
566 void SwrSetFrontendState(
567 HANDLE hContext,
568 SWR_FRONTEND_STATE *pFEState)
569 {
570 API_STATE* pState = GetDrawState(GetContext(hContext));
571 pState->frontendState = *pFEState;
572 }
573
574 void SwrSetGsState(
575 HANDLE hContext,
576 SWR_GS_STATE *pGSState)
577 {
578 API_STATE* pState = GetDrawState(GetContext(hContext));
579 pState->gsState = *pGSState;
580 }
581
582 void SwrSetGsFunc(
583 HANDLE hContext,
584 PFN_GS_FUNC pfnGsFunc)
585 {
586 API_STATE* pState = GetDrawState(GetContext(hContext));
587 pState->pfnGsFunc = pfnGsFunc;
588 }
589
590 void SwrSetCsFunc(
591 HANDLE hContext,
592 PFN_CS_FUNC pfnCsFunc,
593 uint32_t totalThreadsInGroup,
594 uint32_t totalSpillFillSize)
595 {
596 API_STATE* pState = GetDrawState(GetContext(hContext));
597 pState->pfnCsFunc = pfnCsFunc;
598 pState->totalThreadsInGroup = totalThreadsInGroup;
599 pState->totalSpillFillSize = totalSpillFillSize;
600 }
601
602 void SwrSetTsState(
603 HANDLE hContext,
604 SWR_TS_STATE *pState)
605 {
606 API_STATE* pApiState = GetDrawState(GetContext(hContext));
607 pApiState->tsState = *pState;
608 }
609
610 void SwrSetHsFunc(
611 HANDLE hContext,
612 PFN_HS_FUNC pfnFunc)
613 {
614 API_STATE* pApiState = GetDrawState(GetContext(hContext));
615 pApiState->pfnHsFunc = pfnFunc;
616 }
617
618 void SwrSetDsFunc(
619 HANDLE hContext,
620 PFN_DS_FUNC pfnFunc)
621 {
622 API_STATE* pApiState = GetDrawState(GetContext(hContext));
623 pApiState->pfnDsFunc = pfnFunc;
624 }
625
626 void SwrSetDepthStencilState(
627 HANDLE hContext,
628 SWR_DEPTH_STENCIL_STATE *pDSState)
629 {
630 API_STATE* pState = GetDrawState(GetContext(hContext));
631
632 pState->depthStencilState = *pDSState;
633 }
634
635 void SwrSetBackendState(
636 HANDLE hContext,
637 SWR_BACKEND_STATE *pBEState)
638 {
639 API_STATE* pState = GetDrawState(GetContext(hContext));
640
641 pState->backendState = *pBEState;
642 }
643
644 void SwrSetDepthBoundsState(
645 HANDLE hContext,
646 SWR_DEPTH_BOUNDS_STATE *pDBState)
647 {
648 API_STATE* pState = GetDrawState(GetContext(hContext));
649
650 pState->depthBoundsState = *pDBState;
651 }
652
653 void SwrSetPixelShaderState(
654 HANDLE hContext,
655 SWR_PS_STATE *pPSState)
656 {
657 API_STATE *pState = GetDrawState(GetContext(hContext));
658 pState->psState = *pPSState;
659 }
660
661 void SwrSetBlendState(
662 HANDLE hContext,
663 SWR_BLEND_STATE *pBlendState)
664 {
665 API_STATE *pState = GetDrawState(GetContext(hContext));
666 memcpy(&pState->blendState, pBlendState, sizeof(SWR_BLEND_STATE));
667 }
668
669 void SwrSetBlendFunc(
670 HANDLE hContext,
671 uint32_t renderTarget,
672 PFN_BLEND_JIT_FUNC pfnBlendFunc)
673 {
674 SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS);
675 API_STATE *pState = GetDrawState(GetContext(hContext));
676 pState->pfnBlendFunc[renderTarget] = pfnBlendFunc;
677 }
678
679 // update guardband multipliers for the viewport
680 void updateGuardbands(API_STATE *pState)
681 {
682 uint32_t numGbs = pState->gsState.emitsRenderTargetArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
683
684 for(uint32_t i = 0; i < numGbs; ++i)
685 {
686 // guardband center is viewport center
687 pState->gbState.left[i] = KNOB_GUARDBAND_WIDTH / pState->vp[i].width;
688 pState->gbState.right[i] = KNOB_GUARDBAND_WIDTH / pState->vp[i].width;
689 pState->gbState.top[i] = KNOB_GUARDBAND_HEIGHT / pState->vp[i].height;
690 pState->gbState.bottom[i] = KNOB_GUARDBAND_HEIGHT / pState->vp[i].height;
691 }
692 }
693
694 void SwrSetRastState(
695 HANDLE hContext,
696 const SWR_RASTSTATE *pRastState)
697 {
698 SWR_CONTEXT *pContext = GetContext(hContext);
699 API_STATE* pState = GetDrawState(pContext);
700
701 memcpy(&pState->rastState, pRastState, sizeof(SWR_RASTSTATE));
702 }
703
704 void SwrSetViewports(
705 HANDLE hContext,
706 uint32_t numViewports,
707 const SWR_VIEWPORT* pViewports,
708 const SWR_VIEWPORT_MATRICES* pMatrices)
709 {
710 SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS,
711 "Invalid number of viewports.");
712
713 SWR_CONTEXT *pContext = GetContext(hContext);
714 API_STATE* pState = GetDrawState(pContext);
715
716 memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports);
717 // @todo Faster to copy portions of the SOA or just copy all of it?
718 memcpy(&pState->vpMatrices, pMatrices, sizeof(SWR_VIEWPORT_MATRICES));
719
720 updateGuardbands(pState);
721 }
722
723 void SwrSetScissorRects(
724 HANDLE hContext,
725 uint32_t numScissors,
726 const SWR_RECT* pScissors)
727 {
728 SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS,
729 "Invalid number of scissor rects.");
730
731 API_STATE* pState = GetDrawState(GetContext(hContext));
732 memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(pScissors[0]));
733 };
734
735 void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
736 {
737 API_STATE *pState = &pDC->pState->state;
738 uint32_t numScissors = pState->gsState.emitsViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
739 pState->scissorsTileAligned = true;
740
741 for (uint32_t index = 0; index < numScissors; ++index)
742 {
743 SWR_RECT &scissorInFixedPoint = pState->scissorsInFixedPoint[index];
744
745 // Set up scissor dimensions based on scissor or viewport
746 if (pState->rastState.scissorEnable)
747 {
748 scissorInFixedPoint = pState->scissorRects[index];
749 }
750 else
751 {
752 // the vp width and height must be added to origin un-rounded then the result round to -inf.
753 // The cast to int works for rounding assuming all [left, right, top, bottom] are positive.
754 scissorInFixedPoint.xmin = (int32_t)pState->vp[index].x;
755 scissorInFixedPoint.xmax = (int32_t)(pState->vp[index].x + pState->vp[index].width);
756 scissorInFixedPoint.ymin = (int32_t)pState->vp[index].y;
757 scissorInFixedPoint.ymax = (int32_t)(pState->vp[index].y + pState->vp[index].height);
758 }
759
760 // Clamp to max rect
761 scissorInFixedPoint &= g_MaxScissorRect;
762
763 // Test for tile alignment
764 bool tileAligned;
765 tileAligned = (scissorInFixedPoint.xmin % KNOB_TILE_X_DIM) == 0;
766 tileAligned &= (scissorInFixedPoint.ymin % KNOB_TILE_Y_DIM) == 0;
767 tileAligned &= (scissorInFixedPoint.xmax % KNOB_TILE_X_DIM) == 0;
768 tileAligned &= (scissorInFixedPoint.xmax % KNOB_TILE_Y_DIM) == 0;
769
770 pState->scissorsTileAligned &= tileAligned;
771
772 // Scale to fixed point
773 scissorInFixedPoint.xmin *= FIXED_POINT_SCALE;
774 scissorInFixedPoint.xmax *= FIXED_POINT_SCALE;
775 scissorInFixedPoint.ymin *= FIXED_POINT_SCALE;
776 scissorInFixedPoint.ymax *= FIXED_POINT_SCALE;
777
778 // Make scissor inclusive
779 scissorInFixedPoint.xmax -= 1;
780 scissorInFixedPoint.ymax -= 1;
781 }
782 }
783
784 // templated backend function tables
785 extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
786 extern PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT][2][2];
787 extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_MSAA_SAMPLE_PATTERN_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2][2];
788 extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2];
789 void SetupPipeline(DRAW_CONTEXT *pDC)
790 {
791 SWR_CONTEXT* pContext = pDC->pContext;
792 DRAW_STATE* pState = pDC->pState;
793 const SWR_RASTSTATE &rastState = pState->state.rastState;
794 const SWR_PS_STATE &psState = pState->state.psState;
795 BACKEND_FUNCS& backendFuncs = pState->backendFuncs;
796 const uint32_t forcedSampleCount = (rastState.forcedSampleCount) ? 1 : 0;
797
798 // setup backend
799 if (psState.pfnPixelShader == nullptr)
800 {
801 backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount];
802 }
803 else
804 {
805 const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.forcedSampleCount) ? 1 : 0;
806 const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
807 const uint32_t canEarlyZ = (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesSourceDepth && !psState.usesUAV)) ? 1 : 0;
808
809 SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask;
810
811 // select backend function
812 switch(psState.shadingRate)
813 {
814 case SWR_SHADING_RATE_PIXEL:
815 if(bMultisampleEnable)
816 {
817 // always need to generate I & J per sample for Z interpolation
818 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
819 backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount][canEarlyZ];
820 }
821 else
822 {
823 // always need to generate I & J per pixel for Z interpolation
824 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK);
825 backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid][canEarlyZ];
826 }
827 break;
828 case SWR_SHADING_RATE_SAMPLE:
829 SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN);
830 // always need to generate I & J per sample for Z interpolation
831 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
832 backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid][canEarlyZ];
833 break;
834 default:
835 SWR_ASSERT(0 && "Invalid shading rate");
836 break;
837 }
838 }
839
840 PFN_PROCESS_PRIMS pfnBinner;
841 switch (pState->state.topology)
842 {
843 case TOP_POINT_LIST:
844 pState->pfnProcessPrims = ClipPoints;
845 pfnBinner = BinPoints;
846 break;
847 case TOP_LINE_LIST:
848 case TOP_LINE_STRIP:
849 case TOP_LINE_LOOP:
850 case TOP_LINE_LIST_ADJ:
851 case TOP_LISTSTRIP_ADJ:
852 pState->pfnProcessPrims = ClipLines;
853 pfnBinner = BinLines;
854 break;
855 default:
856 pState->pfnProcessPrims = ClipTriangles;
857 pfnBinner = GetBinTrianglesFunc((rastState.conservativeRast > 0));
858 break;
859 };
860
861
862 // disable clipper if viewport transform is disabled
863 if (pState->state.frontendState.vpTransformDisable)
864 {
865 pState->pfnProcessPrims = pfnBinner;
866 }
867
868 if ((pState->state.psState.pfnPixelShader == nullptr) &&
869 (pState->state.depthStencilState.depthTestEnable == FALSE) &&
870 (pState->state.depthStencilState.depthWriteEnable == FALSE) &&
871 (pState->state.depthStencilState.stencilTestEnable == FALSE) &&
872 (pState->state.depthStencilState.stencilWriteEnable == FALSE) &&
873 (pState->state.backendState.numAttributes == 0))
874 {
875 pState->pfnProcessPrims = nullptr;
876 }
877
878 if (pState->state.soState.rasterizerDisable == true)
879 {
880 pState->pfnProcessPrims = nullptr;
881 }
882
883
884 // set up the frontend attribute count
885 pState->state.feNumAttributes = 0;
886 const SWR_BACKEND_STATE& backendState = pState->state.backendState;
887 if (backendState.swizzleEnable)
888 {
889 // attribute swizzling is enabled, iterate over the map and record the max attribute used
890 for (uint32_t i = 0; i < backendState.numAttributes; ++i)
891 {
892 pState->state.feNumAttributes = std::max(pState->state.feNumAttributes, (uint32_t)backendState.swizzleMap[i].sourceAttrib + 1);
893 }
894 }
895 else
896 {
897 pState->state.feNumAttributes = pState->state.backendState.numAttributes;
898 }
899
900 if (pState->state.soState.soEnable)
901 {
902 uint32_t streamMasks = 0;
903 for (uint32_t i = 0; i < 4; ++i)
904 {
905 streamMasks |= pState->state.soState.streamMasks[i];
906 }
907
908 DWORD maxAttrib;
909 if (_BitScanReverse(&maxAttrib, streamMasks))
910 {
911 pState->state.feNumAttributes = std::max(pState->state.feNumAttributes, (uint32_t)(maxAttrib + 1));
912 }
913 }
914
915 // complicated logic to test for cases where we don't need backing hottile memory for a draw
916 // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled.
917 pState->state.depthHottileEnable = ((!(pState->state.depthStencilState.depthTestEnable &&
918 !pState->state.depthStencilState.depthWriteEnable &&
919 !pState->state.depthBoundsState.depthBoundsTestEnable &&
920 pState->state.depthStencilState.depthTestFunc == ZFUNC_ALWAYS)) &&
921 (pState->state.depthStencilState.depthTestEnable ||
922 pState->state.depthStencilState.depthWriteEnable ||
923 pState->state.depthBoundsState.depthBoundsTestEnable)) ? true : false;
924
925 pState->state.stencilHottileEnable = (((!(pState->state.depthStencilState.stencilTestEnable &&
926 !pState->state.depthStencilState.stencilWriteEnable &&
927 pState->state.depthStencilState.stencilTestFunc == ZFUNC_ALWAYS)) ||
928 // for stencil we have to check the double sided state as well
929 (!(pState->state.depthStencilState.doubleSidedStencilTestEnable &&
930 !pState->state.depthStencilState.stencilWriteEnable &&
931 pState->state.depthStencilState.backfaceStencilTestFunc == ZFUNC_ALWAYS))) &&
932 (pState->state.depthStencilState.stencilTestEnable ||
933 pState->state.depthStencilState.stencilWriteEnable)) ? true : false;
934
935 uint32_t numRTs = pState->state.psState.numRenderTargets;
936 pState->state.colorHottileEnable = 0;
937 if (psState.pfnPixelShader != nullptr)
938 {
939 for (uint32_t rt = 0; rt < numRTs; ++rt)
940 {
941 pState->state.colorHottileEnable |=
942 (!pState->state.blendState.renderTarget[rt].writeDisableAlpha ||
943 !pState->state.blendState.renderTarget[rt].writeDisableRed ||
944 !pState->state.blendState.renderTarget[rt].writeDisableGreen ||
945 !pState->state.blendState.renderTarget[rt].writeDisableBlue) ? (1 << rt) : 0;
946 }
947 }
948
949 // Setup depth quantization function
950 if (pState->state.depthHottileEnable)
951 {
952 switch (pState->state.rastState.depthFormat)
953 {
954 case R32_FLOAT_X8X24_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT_X8X24_TYPELESS > ; break;
955 case R32_FLOAT: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ; break;
956 case R24_UNORM_X8_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R24_UNORM_X8_TYPELESS > ; break;
957 case R16_UNORM: pState->state.pfnQuantizeDepth = QuantizeDepth < R16_UNORM > ; break;
958 default: SWR_ASSERT(false, "Unsupported depth format for depth quantiztion.");
959 pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ;
960 }
961 }
962 else
963 {
964 // set up pass-through quantize if depth isn't enabled
965 pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ;
966 }
967 }
968
969 //////////////////////////////////////////////////////////////////////////
970 /// @brief InitDraw
971 /// @param pDC - Draw context to initialize for this draw.
972 void InitDraw(
973 DRAW_CONTEXT *pDC,
974 bool isSplitDraw)
975 {
976 // We don't need to re-setup the scissors/pipeline state again for split draw.
977 if (isSplitDraw == false)
978 {
979 SetupMacroTileScissors(pDC);
980 SetupPipeline(pDC);
981 }
982
983
984 }
985
986 //////////////////////////////////////////////////////////////////////////
987 /// @brief We can split the draw for certain topologies for better performance.
988 /// @param totalVerts - Total vertices for draw
989 /// @param topology - Topology used for draw
990 uint32_t MaxVertsPerDraw(
991 DRAW_CONTEXT* pDC,
992 uint32_t totalVerts,
993 PRIMITIVE_TOPOLOGY topology)
994 {
995 API_STATE& state = pDC->pState->state;
996
997 uint32_t vertsPerDraw = totalVerts;
998
999 if (state.soState.soEnable)
1000 {
1001 return totalVerts;
1002 }
1003
1004 switch (topology)
1005 {
1006 case TOP_POINT_LIST:
1007 case TOP_TRIANGLE_LIST:
1008 vertsPerDraw = KNOB_MAX_PRIMS_PER_DRAW;
1009 break;
1010
1011 case TOP_PATCHLIST_1:
1012 case TOP_PATCHLIST_2:
1013 case TOP_PATCHLIST_3:
1014 case TOP_PATCHLIST_4:
1015 case TOP_PATCHLIST_5:
1016 case TOP_PATCHLIST_6:
1017 case TOP_PATCHLIST_7:
1018 case TOP_PATCHLIST_8:
1019 case TOP_PATCHLIST_9:
1020 case TOP_PATCHLIST_10:
1021 case TOP_PATCHLIST_11:
1022 case TOP_PATCHLIST_12:
1023 case TOP_PATCHLIST_13:
1024 case TOP_PATCHLIST_14:
1025 case TOP_PATCHLIST_15:
1026 case TOP_PATCHLIST_16:
1027 case TOP_PATCHLIST_17:
1028 case TOP_PATCHLIST_18:
1029 case TOP_PATCHLIST_19:
1030 case TOP_PATCHLIST_20:
1031 case TOP_PATCHLIST_21:
1032 case TOP_PATCHLIST_22:
1033 case TOP_PATCHLIST_23:
1034 case TOP_PATCHLIST_24:
1035 case TOP_PATCHLIST_25:
1036 case TOP_PATCHLIST_26:
1037 case TOP_PATCHLIST_27:
1038 case TOP_PATCHLIST_28:
1039 case TOP_PATCHLIST_29:
1040 case TOP_PATCHLIST_30:
1041 case TOP_PATCHLIST_31:
1042 case TOP_PATCHLIST_32:
1043 if (pDC->pState->state.tsState.tsEnable)
1044 {
1045 uint32_t vertsPerPrim = topology - TOP_PATCHLIST_BASE;
1046 vertsPerDraw = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW;
1047 }
1048 break;
1049
1050 // The Primitive Assembly code can only handle 1 RECT at a time.
1051 case TOP_RECT_LIST:
1052 vertsPerDraw = 3;
1053 break;
1054
1055 default:
1056 // We are not splitting up draws for other topologies.
1057 break;
1058 }
1059
1060 return vertsPerDraw;
1061 }
1062
1063
1064 //////////////////////////////////////////////////////////////////////////
1065 /// @brief DrawInstanced
1066 /// @param hContext - Handle passed back from SwrCreateContext
1067 /// @param topology - Specifies topology for draw.
1068 /// @param numVerts - How many vertices to read sequentially from vertex data (per instance).
1069 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1070 /// @param numInstances - How many instances to render.
1071 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1072 void DrawInstanced(
1073 HANDLE hContext,
1074 PRIMITIVE_TOPOLOGY topology,
1075 uint32_t numVertices,
1076 uint32_t startVertex,
1077 uint32_t numInstances = 1,
1078 uint32_t startInstance = 0)
1079 {
1080 if (KNOB_TOSS_DRAW)
1081 {
1082 return;
1083 }
1084
1085 SWR_CONTEXT *pContext = GetContext(hContext);
1086 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1087
1088 AR_API_BEGIN(APIDraw, pDC->drawId);
1089 AR_API_EVENT(DrawInstancedEvent(pDC->drawId, topology, numVertices, startVertex, numInstances, startInstance));
1090
1091 uint32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
1092 uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw);
1093 uint32_t remainingVerts = numVertices;
1094
1095 API_STATE *pState = &pDC->pState->state;
1096 pState->topology = topology;
1097 pState->forceFront = false;
1098
1099 // disable culling for points/lines
1100 uint32_t oldCullMode = pState->rastState.cullMode;
1101 if (topology == TOP_POINT_LIST)
1102 {
1103 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1104 pState->forceFront = true;
1105 }
1106 else if (topology == TOP_RECT_LIST)
1107 {
1108 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1109 }
1110
1111
1112 int draw = 0;
1113 while (remainingVerts)
1114 {
1115 uint32_t numVertsForDraw = (remainingVerts < maxVertsPerDraw) ?
1116 remainingVerts : maxVertsPerDraw;
1117
1118 bool isSplitDraw = (draw > 0) ? true : false;
1119 DRAW_CONTEXT* pDC = GetDrawContext(pContext, isSplitDraw);
1120 InitDraw(pDC, isSplitDraw);
1121
1122 pDC->FeWork.type = DRAW;
1123 pDC->FeWork.pfnWork = GetProcessDrawFunc(
1124 false, // IsIndexed
1125 false, // bEnableCutIndex
1126 pState->tsState.tsEnable,
1127 pState->gsState.gsEnable,
1128 pState->soState.soEnable,
1129 pDC->pState->pfnProcessPrims != nullptr);
1130 pDC->FeWork.desc.draw.numVerts = numVertsForDraw;
1131 pDC->FeWork.desc.draw.startVertex = startVertex;
1132 pDC->FeWork.desc.draw.numInstances = numInstances;
1133 pDC->FeWork.desc.draw.startInstance = startInstance;
1134 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
1135 pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw;
1136
1137 pDC->cleanupState = (remainingVerts == numVertsForDraw);
1138
1139 //enqueue DC
1140 QueueDraw(pContext);
1141
1142 remainingVerts -= numVertsForDraw;
1143 draw++;
1144 }
1145
1146 // restore culling state
1147 pDC = GetDrawContext(pContext);
1148 pDC->pState->state.rastState.cullMode = oldCullMode;
1149
1150
1151 AR_API_END(APIDraw, numVertices * numInstances);
1152 }
1153
1154 //////////////////////////////////////////////////////////////////////////
1155 /// @brief SwrDraw
1156 /// @param hContext - Handle passed back from SwrCreateContext
1157 /// @param topology - Specifies topology for draw.
1158 /// @param startVertex - Specifies start vertex in vertex buffer for draw.
1159 /// @param primCount - Number of vertices.
1160 void SwrDraw(
1161 HANDLE hContext,
1162 PRIMITIVE_TOPOLOGY topology,
1163 uint32_t startVertex,
1164 uint32_t numVertices)
1165 {
1166 DrawInstanced(hContext, topology, numVertices, startVertex);
1167 }
1168
1169 //////////////////////////////////////////////////////////////////////////
1170 /// @brief SwrDrawInstanced
1171 /// @param hContext - Handle passed back from SwrCreateContext
1172 /// @param topology - Specifies topology for draw.
1173 /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
1174 /// @param numInstances - How many instances to render.
1175 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1176 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1177 void SwrDrawInstanced(
1178 HANDLE hContext,
1179 PRIMITIVE_TOPOLOGY topology,
1180 uint32_t numVertsPerInstance,
1181 uint32_t numInstances,
1182 uint32_t startVertex,
1183 uint32_t startInstance
1184 )
1185 {
1186 DrawInstanced(hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance);
1187 }
1188
1189 //////////////////////////////////////////////////////////////////////////
1190 /// @brief DrawIndexedInstanced
1191 /// @param hContext - Handle passed back from SwrCreateContext
1192 /// @param topology - Specifies topology for draw.
1193 /// @param numIndices - Number of indices to read sequentially from index buffer.
1194 /// @param indexOffset - Starting index into index buffer.
1195 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1196 /// @param numInstances - Number of instances to render.
1197 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1198 void DrawIndexedInstance(
1199 HANDLE hContext,
1200 PRIMITIVE_TOPOLOGY topology,
1201 uint32_t numIndices,
1202 uint32_t indexOffset,
1203 int32_t baseVertex,
1204 uint32_t numInstances = 1,
1205 uint32_t startInstance = 0)
1206 {
1207 if (KNOB_TOSS_DRAW)
1208 {
1209 return;
1210 }
1211
1212 SWR_CONTEXT *pContext = GetContext(hContext);
1213 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1214 API_STATE* pState = &pDC->pState->state;
1215
1216 AR_API_BEGIN(APIDrawIndexed, pDC->drawId);
1217 AR_API_EVENT(DrawIndexedInstancedEvent(pDC->drawId, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance));
1218
1219 uint32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
1220 uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw);
1221 uint32_t remainingIndices = numIndices;
1222
1223 uint32_t indexSize = 0;
1224 switch (pState->indexBuffer.format)
1225 {
1226 case R32_UINT: indexSize = sizeof(uint32_t); break;
1227 case R16_UINT: indexSize = sizeof(uint16_t); break;
1228 case R8_UINT: indexSize = sizeof(uint8_t); break;
1229 default:
1230 SWR_ASSERT(0);
1231 }
1232
1233 int draw = 0;
1234 uint8_t *pIB = (uint8_t*)pState->indexBuffer.pIndices;
1235 pIB += (uint64_t)indexOffset * (uint64_t)indexSize;
1236
1237 pState->topology = topology;
1238 pState->forceFront = false;
1239
1240 // disable culling for points/lines
1241 uint32_t oldCullMode = pState->rastState.cullMode;
1242 if (topology == TOP_POINT_LIST)
1243 {
1244 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1245 pState->forceFront = true;
1246 }
1247 else if (topology == TOP_RECT_LIST)
1248 {
1249 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1250 }
1251
1252
1253 while (remainingIndices)
1254 {
1255 uint32_t numIndicesForDraw = (remainingIndices < maxIndicesPerDraw) ?
1256 remainingIndices : maxIndicesPerDraw;
1257
1258 // When breaking up draw, we need to obtain new draw context for each iteration.
1259 bool isSplitDraw = (draw > 0) ? true : false;
1260
1261 pDC = GetDrawContext(pContext, isSplitDraw);
1262 InitDraw(pDC, isSplitDraw);
1263
1264 pDC->FeWork.type = DRAW;
1265 pDC->FeWork.pfnWork = GetProcessDrawFunc(
1266 true, // IsIndexed
1267 pState->frontendState.bEnableCutIndex,
1268 pState->tsState.tsEnable,
1269 pState->gsState.gsEnable,
1270 pState->soState.soEnable,
1271 pDC->pState->pfnProcessPrims != nullptr);
1272 pDC->FeWork.desc.draw.pDC = pDC;
1273 pDC->FeWork.desc.draw.numIndices = numIndicesForDraw;
1274 pDC->FeWork.desc.draw.pIB = (int*)pIB;
1275 pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format;
1276
1277 pDC->FeWork.desc.draw.numInstances = numInstances;
1278 pDC->FeWork.desc.draw.startInstance = startInstance;
1279 pDC->FeWork.desc.draw.baseVertex = baseVertex;
1280 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
1281
1282 pDC->cleanupState = (remainingIndices == numIndicesForDraw);
1283
1284 //enqueue DC
1285 QueueDraw(pContext);
1286
1287 pIB += maxIndicesPerDraw * indexSize;
1288 remainingIndices -= numIndicesForDraw;
1289 draw++;
1290 }
1291
1292 // Restore culling state
1293 pDC = GetDrawContext(pContext);
1294 pDC->pState->state.rastState.cullMode = oldCullMode;
1295
1296
1297 AR_API_END(APIDrawIndexed, numIndices * numInstances);
1298 }
1299
1300
1301 //////////////////////////////////////////////////////////////////////////
1302 /// @brief DrawIndexed
1303 /// @param hContext - Handle passed back from SwrCreateContext
1304 /// @param topology - Specifies topology for draw.
1305 /// @param numIndices - Number of indices to read sequentially from index buffer.
1306 /// @param indexOffset - Starting index into index buffer.
1307 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1308 void SwrDrawIndexed(
1309 HANDLE hContext,
1310 PRIMITIVE_TOPOLOGY topology,
1311 uint32_t numIndices,
1312 uint32_t indexOffset,
1313 int32_t baseVertex
1314 )
1315 {
1316 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex);
1317 }
1318
1319 //////////////////////////////////////////////////////////////////////////
1320 /// @brief SwrDrawIndexedInstanced
1321 /// @param hContext - Handle passed back from SwrCreateContext
1322 /// @param topology - Specifies topology for draw.
1323 /// @param numIndices - Number of indices to read sequentially from index buffer.
1324 /// @param numInstances - Number of instances to render.
1325 /// @param indexOffset - Starting index into index buffer.
1326 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1327 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1328 void SwrDrawIndexedInstanced(
1329 HANDLE hContext,
1330 PRIMITIVE_TOPOLOGY topology,
1331 uint32_t numIndices,
1332 uint32_t numInstances,
1333 uint32_t indexOffset,
1334 int32_t baseVertex,
1335 uint32_t startInstance)
1336 {
1337 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance);
1338 }
1339
1340 //////////////////////////////////////////////////////////////////////////
1341 /// @brief SwrInvalidateTiles
1342 /// @param hContext - Handle passed back from SwrCreateContext
1343 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate.
1344 /// @param invalidateRect - The pixel-coordinate rectangle to invalidate. This will be expanded to
1345 /// be hottile size-aligned.
1346 void SWR_API SwrInvalidateTiles(
1347 HANDLE hContext,
1348 uint32_t attachmentMask,
1349 const SWR_RECT& invalidateRect)
1350 {
1351 if (KNOB_TOSS_DRAW)
1352 {
1353 return;
1354 }
1355
1356 SWR_CONTEXT *pContext = GetContext(hContext);
1357 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1358
1359 pDC->FeWork.type = DISCARDINVALIDATETILES;
1360 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
1361 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
1362 pDC->FeWork.desc.discardInvalidateTiles.rect = invalidateRect;
1363 pDC->FeWork.desc.discardInvalidateTiles.rect &= g_MaxScissorRect;
1364 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_INVALID;
1365 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false;
1366 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = false;
1367
1368 //enqueue
1369 QueueDraw(pContext);
1370 }
1371
1372 //////////////////////////////////////////////////////////////////////////
1373 /// @brief SwrDiscardRect
1374 /// @param hContext - Handle passed back from SwrCreateContext
1375 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
1376 /// @param rect - The pixel-coordinate rectangle to discard. Only fully-covered hottiles will be
1377 /// discarded.
1378 void SWR_API SwrDiscardRect(
1379 HANDLE hContext,
1380 uint32_t attachmentMask,
1381 const SWR_RECT& rect)
1382 {
1383 if (KNOB_TOSS_DRAW)
1384 {
1385 return;
1386 }
1387
1388 SWR_CONTEXT *pContext = GetContext(hContext);
1389 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1390
1391 // Queue a load to the hottile
1392 pDC->FeWork.type = DISCARDINVALIDATETILES;
1393 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
1394 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
1395 pDC->FeWork.desc.discardInvalidateTiles.rect = rect;
1396 pDC->FeWork.desc.discardInvalidateTiles.rect &= g_MaxScissorRect;
1397 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_RESOLVED;
1398 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true;
1399 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = true;
1400
1401 //enqueue
1402 QueueDraw(pContext);
1403 }
1404
1405 //////////////////////////////////////////////////////////////////////////
1406 /// @brief SwrDispatch
1407 /// @param hContext - Handle passed back from SwrCreateContext
1408 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
1409 /// @param threadGroupCountY - Number of thread groups dispatched in Y direction
1410 /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
1411 void SwrDispatch(
1412 HANDLE hContext,
1413 uint32_t threadGroupCountX,
1414 uint32_t threadGroupCountY,
1415 uint32_t threadGroupCountZ)
1416 {
1417 if (KNOB_TOSS_DRAW)
1418 {
1419 return;
1420 }
1421
1422 SWR_CONTEXT *pContext = GetContext(hContext);
1423 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1424
1425 AR_API_BEGIN(APIDispatch, pDC->drawId);
1426 AR_API_EVENT(DispatchEvent(pDC->drawId, threadGroupCountX, threadGroupCountY, threadGroupCountZ));
1427 pDC->isCompute = true; // This is a compute context.
1428
1429 COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64);
1430
1431 pTaskData->threadGroupCountX = threadGroupCountX;
1432 pTaskData->threadGroupCountY = threadGroupCountY;
1433 pTaskData->threadGroupCountZ = threadGroupCountZ;
1434
1435 uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
1436 uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
1437 pDC->pDispatch = &pContext->pDispatchQueueArray[dcIndex];
1438 pDC->pDispatch->initialize(totalThreadGroups, pTaskData, &ProcessComputeBE);
1439
1440 QueueDispatch(pContext);
1441 AR_API_END(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ);
1442 }
1443
1444 // Deswizzles, converts and stores current contents of the hot tiles to surface
1445 // described by pState
1446 void SWR_API SwrStoreTiles(
1447 HANDLE hContext,
1448 uint32_t attachmentMask,
1449 SWR_TILE_STATE postStoreTileState,
1450 const SWR_RECT& storeRect)
1451 {
1452 if (KNOB_TOSS_DRAW)
1453 {
1454 return;
1455 }
1456
1457 SWR_CONTEXT *pContext = GetContext(hContext);
1458 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1459
1460 AR_API_BEGIN(APIStoreTiles, pDC->drawId);
1461
1462 pDC->FeWork.type = STORETILES;
1463 pDC->FeWork.pfnWork = ProcessStoreTiles;
1464 pDC->FeWork.desc.storeTiles.attachmentMask = attachmentMask;
1465 pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState;
1466 pDC->FeWork.desc.storeTiles.rect = storeRect;
1467 pDC->FeWork.desc.storeTiles.rect &= g_MaxScissorRect;
1468
1469 //enqueue
1470 QueueDraw(pContext);
1471
1472 AR_API_END(APIStoreTiles, 1);
1473 }
1474
1475 //////////////////////////////////////////////////////////////////////////
1476 /// @brief SwrClearRenderTarget - Clear attached render targets / depth / stencil
1477 /// @param hContext - Handle passed back from SwrCreateContext
1478 /// @param attachmentMask - combination of SWR_ATTACHMENT_*_BIT attachments to clear
1479 /// @param clearColor - color use for clearing render targets
1480 /// @param z - depth value use for clearing depth buffer
1481 /// @param stencil - stencil value used for clearing stencil buffer
1482 /// @param clearRect - The pixel-coordinate rectangle to clear in all cleared buffers
1483 void SWR_API SwrClearRenderTarget(
1484 HANDLE hContext,
1485 uint32_t attachmentMask,
1486 const float clearColor[4],
1487 float z,
1488 uint8_t stencil,
1489 const SWR_RECT& clearRect)
1490 {
1491 if (KNOB_TOSS_DRAW)
1492 {
1493 return;
1494 }
1495
1496 SWR_CONTEXT *pContext = GetContext(hContext);
1497 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1498
1499 AR_API_BEGIN(APIClearRenderTarget, pDC->drawId);
1500
1501 pDC->FeWork.type = CLEAR;
1502 pDC->FeWork.pfnWork = ProcessClear;
1503 pDC->FeWork.desc.clear.rect = clearRect;
1504 pDC->FeWork.desc.clear.rect &= g_MaxScissorRect;
1505 pDC->FeWork.desc.clear.attachmentMask = attachmentMask;
1506 pDC->FeWork.desc.clear.clearDepth = z;
1507 pDC->FeWork.desc.clear.clearRTColor[0] = clearColor[0];
1508 pDC->FeWork.desc.clear.clearRTColor[1] = clearColor[1];
1509 pDC->FeWork.desc.clear.clearRTColor[2] = clearColor[2];
1510 pDC->FeWork.desc.clear.clearRTColor[3] = clearColor[3];
1511 pDC->FeWork.desc.clear.clearStencil = stencil;
1512
1513 // enqueue draw
1514 QueueDraw(pContext);
1515
1516 AR_API_END(APIClearRenderTarget, 1);
1517 }
1518
1519 //////////////////////////////////////////////////////////////////////////
1520 /// @brief Returns a pointer to the private context state for the current
1521 /// draw operation. This is used for external componets such as the
1522 /// sampler.
1523 /// SWR is responsible for the allocation of the private context state.
1524 /// @param hContext - Handle passed back from SwrCreateContext
1525 VOID* SwrGetPrivateContextState(
1526 HANDLE hContext)
1527 {
1528 SWR_CONTEXT* pContext = GetContext(hContext);
1529 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1530 DRAW_STATE* pState = pDC->pState;
1531
1532 if (pState->pPrivateState == nullptr)
1533 {
1534 pState->pPrivateState = pState->pArena->AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float));
1535 }
1536
1537 return pState->pPrivateState;
1538 }
1539
1540 //////////////////////////////////////////////////////////////////////////
1541 /// @brief Clients can use this to allocate memory for draw/dispatch
1542 /// operations. The memory will automatically be freed once operation
1543 /// has completed. Client can use this to allocate binding tables,
1544 /// etc. needed for shader execution.
1545 /// @param hContext - Handle passed back from SwrCreateContext
1546 /// @param size - Size of allocation
1547 /// @param align - Alignment needed for allocation.
1548 VOID* SwrAllocDrawContextMemory(
1549 HANDLE hContext,
1550 uint32_t size,
1551 uint32_t align)
1552 {
1553 SWR_CONTEXT* pContext = GetContext(hContext);
1554 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1555
1556 return pDC->pState->pArena->AllocAligned(size, align);
1557 }
1558
1559 //////////////////////////////////////////////////////////////////////////
1560 /// @brief Enables stats counting
1561 /// @param hContext - Handle passed back from SwrCreateContext
1562 /// @param enable - If true then counts are incremented.
1563 void SwrEnableStatsFE(
1564 HANDLE hContext,
1565 bool enable)
1566 {
1567 SWR_CONTEXT *pContext = GetContext(hContext);
1568 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1569
1570 pDC->pState->state.enableStatsFE = enable;
1571 }
1572
1573 //////////////////////////////////////////////////////////////////////////
1574 /// @brief Enables stats counting
1575 /// @param hContext - Handle passed back from SwrCreateContext
1576 /// @param enable - If true then counts are incremented.
1577 void SwrEnableStatsBE(
1578 HANDLE hContext,
1579 bool enable)
1580 {
1581 SWR_CONTEXT *pContext = GetContext(hContext);
1582 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1583
1584 pDC->pState->state.enableStatsBE = enable;
1585 }
1586
1587 //////////////////////////////////////////////////////////////////////////
1588 /// @brief Mark end of frame - used for performance profiling
1589 /// @param hContext - Handle passed back from SwrCreateContext
1590 void SWR_API SwrEndFrame(
1591 HANDLE hContext)
1592 {
1593 SWR_CONTEXT *pContext = GetContext(hContext);
1594 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1595
1596 RDTSC_ENDFRAME();
1597 AR_API_EVENT(FrameEndEvent(pContext->frameCount, pDC->drawId));
1598
1599 pContext->frameCount++;
1600 }
1601