1 /****************************************************************************
2 * Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief API implementation
27 ******************************************************************************/
35 #include "core/backend.h"
36 #include "core/context.h"
37 #include "core/depthstencil.h"
38 #include "core/frontend.h"
39 #include "core/rasterizer.h"
40 #include "core/rdtsc_core.h"
41 #include "core/threads.h"
42 #include "core/tilemgr.h"
43 #include "core/clip.h"
44 #include "core/utils.h"
46 #include "common/simdintrin.h"
47 #include "common/os.h"
49 static const SWR_RECT g_MaxScissorRect
= { 0, 0, KNOB_MAX_SCISSOR_X
, KNOB_MAX_SCISSOR_Y
};
51 void SetupDefaultState(SWR_CONTEXT
*pContext
);
53 static INLINE SWR_CONTEXT
* GetContext(HANDLE hContext
)
55 return (SWR_CONTEXT
*)hContext
;
58 //////////////////////////////////////////////////////////////////////////
59 /// @brief Create SWR Context.
60 /// @param pCreateInfo - pointer to creation info.
61 HANDLE
SwrCreateContext(
62 SWR_CREATECONTEXT_INFO
* pCreateInfo
)
67 void* pContextMem
= AlignedMalloc(sizeof(SWR_CONTEXT
), KNOB_SIMD_WIDTH
* 4);
68 memset(pContextMem
, 0, sizeof(SWR_CONTEXT
));
69 SWR_CONTEXT
*pContext
= new (pContextMem
) SWR_CONTEXT();
71 pContext
->driverType
= pCreateInfo
->driver
;
72 pContext
->privateStateSize
= pCreateInfo
->privateStateSize
;
74 pContext
->dcRing
.Init(KNOB_MAX_DRAWS_IN_FLIGHT
);
75 pContext
->dsRing
.Init(KNOB_MAX_DRAWS_IN_FLIGHT
);
77 pContext
->pMacroTileManagerArray
= (MacroTileMgr
*)AlignedMalloc(sizeof(MacroTileMgr
) * KNOB_MAX_DRAWS_IN_FLIGHT
, 64);
78 pContext
->pDispatchQueueArray
= (DispatchQueue
*)AlignedMalloc(sizeof(DispatchQueue
) * KNOB_MAX_DRAWS_IN_FLIGHT
, 64);
80 for (uint32_t dc
= 0; dc
< KNOB_MAX_DRAWS_IN_FLIGHT
; ++dc
)
82 pContext
->dcRing
[dc
].pArena
= new CachingArena(pContext
->cachingArenaAllocator
);
83 new (&pContext
->pMacroTileManagerArray
[dc
]) MacroTileMgr(*pContext
->dcRing
[dc
].pArena
);
84 new (&pContext
->pDispatchQueueArray
[dc
]) DispatchQueue();
86 pContext
->dsRing
[dc
].pArena
= new CachingArena(pContext
->cachingArenaAllocator
);
89 pContext
->threadInfo
.MAX_WORKER_THREADS
= KNOB_MAX_WORKER_THREADS
;
90 pContext
->threadInfo
.MAX_NUMA_NODES
= KNOB_MAX_NUMA_NODES
;
91 pContext
->threadInfo
.MAX_CORES_PER_NUMA_NODE
= KNOB_MAX_CORES_PER_NUMA_NODE
;
92 pContext
->threadInfo
.MAX_THREADS_PER_CORE
= KNOB_MAX_THREADS_PER_CORE
;
93 pContext
->threadInfo
.SINGLE_THREADED
= KNOB_SINGLE_THREADED
;
95 if (pCreateInfo
->pThreadInfo
)
97 pContext
->threadInfo
= *pCreateInfo
->pThreadInfo
;
100 memset(&pContext
->WaitLock
, 0, sizeof(pContext
->WaitLock
));
101 memset(&pContext
->FifosNotEmpty
, 0, sizeof(pContext
->FifosNotEmpty
));
102 new (&pContext
->WaitLock
) std::mutex();
103 new (&pContext
->FifosNotEmpty
) std::condition_variable();
105 CreateThreadPool(pContext
, &pContext
->threadPool
);
107 pContext
->ppScratch
= new uint8_t*[pContext
->NumWorkerThreads
];
108 pContext
->pStats
= new SWR_STATS
[pContext
->NumWorkerThreads
];
110 // Setup ArchRast thread contexts which includes +1 for API thread.
111 pContext
->pArContext
= new HANDLE
[pContext
->NumWorkerThreads
+1];
112 pContext
->pArContext
[pContext
->NumWorkerThreads
] = ArchRast::CreateThreadContext();
114 // Allocate scratch space for workers.
115 ///@note We could lazily allocate this, but it's a rather small amount of memory.
116 for (uint32_t i
= 0; i
< pContext
->NumWorkerThreads
; ++i
)
119 uint32_t numaNode
= pContext
->threadPool
.pThreadData
?
120 pContext
->threadPool
.pThreadData
[i
].numaId
: 0;
121 pContext
->ppScratch
[i
] = (uint8_t*)VirtualAllocExNuma(
122 GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE
),
123 MEM_RESERVE
| MEM_COMMIT
, PAGE_READWRITE
,
126 pContext
->ppScratch
[i
] = (uint8_t*)AlignedMalloc(32 * sizeof(KILOBYTE
), KNOB_SIMD_WIDTH
* 4);
129 // Initialize worker thread context for ArchRast.
130 pContext
->pArContext
[i
] = ArchRast::CreateThreadContext();
133 // State setup AFTER context is fully initialized
134 SetupDefaultState(pContext
);
136 // initialize hot tile manager
137 pContext
->pHotTileMgr
= new HotTileMgr();
139 // initialize function pointer tables
140 InitClearTilesTable();
142 // initialize callback functions
143 pContext
->pfnLoadTile
= pCreateInfo
->pfnLoadTile
;
144 pContext
->pfnStoreTile
= pCreateInfo
->pfnStoreTile
;
145 pContext
->pfnClearTile
= pCreateInfo
->pfnClearTile
;
146 pContext
->pfnUpdateSoWriteOffset
= pCreateInfo
->pfnUpdateSoWriteOffset
;
147 pContext
->pfnUpdateStats
= pCreateInfo
->pfnUpdateStats
;
148 pContext
->pfnUpdateStatsFE
= pCreateInfo
->pfnUpdateStatsFE
;
150 // pass pointer to bucket manager back to caller
151 #ifdef KNOB_ENABLE_RDTSC
152 pCreateInfo
->pBucketMgr
= &gBucketMgr
;
155 pCreateInfo
->contextSaveSize
= sizeof(API_STATE
);
157 return (HANDLE
)pContext
;
160 void CopyState(DRAW_STATE
& dst
, const DRAW_STATE
& src
)
162 memcpy(&dst
.state
, &src
.state
, sizeof(API_STATE
));
165 void WakeAllThreads(SWR_CONTEXT
*pContext
)
167 pContext
->FifosNotEmpty
.notify_all();
170 template<bool IsDraw
>
171 void QueueWork(SWR_CONTEXT
*pContext
)
173 DRAW_CONTEXT
* pDC
= pContext
->pCurDrawContext
;
174 uint32_t dcIndex
= pDC
->drawId
% KNOB_MAX_DRAWS_IN_FLIGHT
;
178 pDC
->pTileMgr
= &pContext
->pMacroTileManagerArray
[dcIndex
];
179 pDC
->pTileMgr
->initialize();
182 // Each worker thread looks at a DC for both FE and BE work at different times, so
183 // threadsDone is initialized to the number of FE threads plus the number of BE threads.
184 // When the threadsDone counter has reached 0 then all workers have moved past this DC.
185 // (i.e. each worker has checked this DC for both FE and BE work and then moved on if all work is done.)
186 pContext
->pCurDrawContext
->threadsDone
= pContext
->NumFEThreads
+ pContext
->NumBEThreads
;
190 InterlockedIncrement((volatile LONG
*)&pContext
->drawsOutstandingFE
);
195 std::unique_lock
<std::mutex
> lock(pContext
->WaitLock
);
196 pContext
->dcRing
.Enqueue();
199 if (pContext
->threadInfo
.SINGLE_THREADED
)
201 // flush denormals to 0
202 uint32_t mxcsr
= _mm_getcsr();
203 _mm_setcsr(mxcsr
| _MM_FLUSH_ZERO_ON
| _MM_DENORMALS_ZERO_ON
);
207 uint32_t curDraw
[2] = { pContext
->pCurDrawContext
->drawId
, pContext
->pCurDrawContext
->drawId
};
208 WorkOnFifoFE(pContext
, 0, curDraw
[0]);
209 WorkOnFifoBE(pContext
, 0, curDraw
[1], pContext
->singleThreadLockedTiles
, 0, 0);
213 uint32_t curDispatch
= pContext
->pCurDrawContext
->drawId
;
214 WorkOnCompute(pContext
, 0, curDispatch
);
217 // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
218 while (CompleteDrawContext(pContext
, pContext
->pCurDrawContext
) > 0) {}
225 AR_API_BEGIN(APIDrawWakeAllThreads
, pDC
->drawId
);
226 WakeAllThreads(pContext
);
227 AR_API_END(APIDrawWakeAllThreads
, 1);
230 // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
231 pContext
->pPrevDrawContext
= pContext
->pCurDrawContext
;
232 pContext
->pCurDrawContext
= nullptr;
235 INLINE
void QueueDraw(SWR_CONTEXT
* pContext
)
237 QueueWork
<true>(pContext
);
240 INLINE
void QueueDispatch(SWR_CONTEXT
* pContext
)
242 QueueWork
<false>(pContext
);
245 DRAW_CONTEXT
* GetDrawContext(SWR_CONTEXT
*pContext
, bool isSplitDraw
= false)
247 AR_API_BEGIN(APIGetDrawContext
, 0);
248 // If current draw context is null then need to obtain a new draw context to use from ring.
249 if (pContext
->pCurDrawContext
== nullptr)
251 // Need to wait for a free entry.
252 while (pContext
->dcRing
.IsFull())
257 uint64_t curDraw
= pContext
->dcRing
.GetHead();
258 uint32_t dcIndex
= curDraw
% KNOB_MAX_DRAWS_IN_FLIGHT
;
260 if ((pContext
->frameCount
- pContext
->lastFrameChecked
) > 2 ||
261 (curDraw
- pContext
->lastDrawChecked
) > 0x10000)
263 // Take this opportunity to clean-up old arena allocations
264 pContext
->cachingArenaAllocator
.FreeOldBlocks();
266 pContext
->lastFrameChecked
= pContext
->frameCount
;
267 pContext
->lastDrawChecked
= curDraw
;
270 DRAW_CONTEXT
* pCurDrawContext
= &pContext
->dcRing
[dcIndex
];
271 pContext
->pCurDrawContext
= pCurDrawContext
;
273 // Assign next available entry in DS ring to this DC.
274 uint32_t dsIndex
= pContext
->curStateId
% KNOB_MAX_DRAWS_IN_FLIGHT
;
275 pCurDrawContext
->pState
= &pContext
->dsRing
[dsIndex
];
277 // Copy previous state to current state.
278 if (pContext
->pPrevDrawContext
)
280 DRAW_CONTEXT
* pPrevDrawContext
= pContext
->pPrevDrawContext
;
282 // If we're splitting our draw then we can just use the same state from the previous
283 // draw. In this case, we won't increment the DS ring index so the next non-split
284 // draw can receive the state.
285 if (isSplitDraw
== false)
287 CopyState(*pCurDrawContext
->pState
, *pPrevDrawContext
->pState
);
289 // Should have been cleaned up previously
290 SWR_ASSERT(pCurDrawContext
->pState
->pArena
->IsEmpty() == true);
292 pCurDrawContext
->pState
->pPrivateState
= nullptr;
294 pContext
->curStateId
++; // Progress state ring index forward.
298 // If it's a split draw then just copy the state pointer over
299 // since it's the same draw.
300 pCurDrawContext
->pState
= pPrevDrawContext
->pState
;
301 SWR_ASSERT(pPrevDrawContext
->cleanupState
== false);
306 SWR_ASSERT(pCurDrawContext
->pState
->pArena
->IsEmpty() == true);
307 pContext
->curStateId
++; // Progress state ring index forward.
310 SWR_ASSERT(pCurDrawContext
->pArena
->IsEmpty() == true);
312 pCurDrawContext
->dependent
= false;
313 pCurDrawContext
->pContext
= pContext
;
314 pCurDrawContext
->isCompute
= false; // Dispatch has to set this to true.
316 pCurDrawContext
->doneFE
= false;
317 pCurDrawContext
->FeLock
= 0;
318 pCurDrawContext
->threadsDone
= 0;
319 pCurDrawContext
->retireCallback
.pfnCallbackFunc
= nullptr;
321 pCurDrawContext
->dynState
.Reset(pContext
->NumWorkerThreads
);
323 // Assign unique drawId for this DC
324 pCurDrawContext
->drawId
= pContext
->dcRing
.GetHead();
326 pCurDrawContext
->cleanupState
= true;
330 SWR_ASSERT(isSplitDraw
== false, "Split draw should only be used when obtaining a new DC");
333 AR_API_END(APIGetDrawContext
, 0);
334 return pContext
->pCurDrawContext
;
337 API_STATE
* GetDrawState(SWR_CONTEXT
*pContext
)
339 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
340 SWR_ASSERT(pDC
->pState
!= nullptr);
342 return &pDC
->pState
->state
;
345 void SwrDestroyContext(HANDLE hContext
)
347 SWR_CONTEXT
*pContext
= GetContext(hContext
);
348 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
350 pDC
->FeWork
.type
= SHUTDOWN
;
351 pDC
->FeWork
.pfnWork
= ProcessShutdown
;
356 DestroyThreadPool(pContext
, &pContext
->threadPool
);
359 for (uint32_t i
= 0; i
< KNOB_MAX_DRAWS_IN_FLIGHT
; ++i
)
361 delete[] pContext
->dcRing
[i
].dynState
.pStats
;
362 delete pContext
->dcRing
[i
].pArena
;
363 delete pContext
->dsRing
[i
].pArena
;
364 pContext
->pMacroTileManagerArray
[i
].~MacroTileMgr();
365 pContext
->pDispatchQueueArray
[i
].~DispatchQueue();
368 AlignedFree(pContext
->pDispatchQueueArray
);
369 AlignedFree(pContext
->pMacroTileManagerArray
);
371 // Free scratch space.
372 for (uint32_t i
= 0; i
< pContext
->NumWorkerThreads
; ++i
)
375 VirtualFree(pContext
->ppScratch
[i
], 0, MEM_RELEASE
);
377 AlignedFree(pContext
->ppScratch
[i
]);
380 ArchRast::DestroyThreadContext(pContext
->pArContext
[i
]);
383 delete[] pContext
->ppScratch
;
384 delete[] pContext
->pArContext
;
385 delete[] pContext
->pStats
;
387 delete(pContext
->pHotTileMgr
);
389 pContext
->~SWR_CONTEXT();
390 AlignedFree(GetContext(hContext
));
393 void SWR_API
SwrSaveState(
395 void* pOutputStateBlock
,
398 SWR_CONTEXT
*pContext
= GetContext(hContext
);
399 auto pSrc
= GetDrawState(pContext
);
400 SWR_ASSERT(pOutputStateBlock
&& memSize
>= sizeof(*pSrc
));
402 memcpy(pOutputStateBlock
, pSrc
, sizeof(*pSrc
));
405 void SWR_API
SwrRestoreState(
407 const void* pStateBlock
,
410 SWR_CONTEXT
*pContext
= GetContext(hContext
);
411 auto pDst
= GetDrawState(pContext
);
412 SWR_ASSERT(pStateBlock
&& memSize
>= sizeof(*pDst
));
414 memcpy(pDst
, pStateBlock
, sizeof(*pDst
));
417 void SetupDefaultState(SWR_CONTEXT
*pContext
)
419 API_STATE
* pState
= GetDrawState(pContext
);
421 pState
->rastState
.cullMode
= SWR_CULLMODE_NONE
;
422 pState
->rastState
.frontWinding
= SWR_FRONTWINDING_CCW
;
425 void SwrSync(HANDLE hContext
, PFN_CALLBACK_FUNC pfnFunc
, uint64_t userData
, uint64_t userData2
, uint64_t userData3
)
427 SWR_ASSERT(pfnFunc
!= nullptr);
429 SWR_CONTEXT
*pContext
= GetContext(hContext
);
430 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
432 AR_API_BEGIN(APISync
, 0);
434 pDC
->FeWork
.type
= SYNC
;
435 pDC
->FeWork
.pfnWork
= ProcessSync
;
437 // Setup callback function
438 pDC
->retireCallback
.pfnCallbackFunc
= pfnFunc
;
439 pDC
->retireCallback
.userData
= userData
;
440 pDC
->retireCallback
.userData2
= userData2
;
441 pDC
->retireCallback
.userData3
= userData3
;
446 AR_API_END(APISync
, 1);
449 void SwrWaitForIdle(HANDLE hContext
)
451 SWR_CONTEXT
*pContext
= GetContext(hContext
);
453 AR_API_BEGIN(APIWaitForIdle
, 0);
455 while (!pContext
->dcRing
.IsEmpty())
460 AR_API_END(APIWaitForIdle
, 1);
463 void SwrWaitForIdleFE(HANDLE hContext
)
465 SWR_CONTEXT
*pContext
= GetContext(hContext
);
467 AR_API_BEGIN(APIWaitForIdle
, 0);
469 while (pContext
->drawsOutstandingFE
> 0)
474 AR_API_END(APIWaitForIdle
, 1);
477 void SwrSetVertexBuffers(
480 const SWR_VERTEX_BUFFER_STATE
* pVertexBuffers
)
482 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
484 for (uint32_t i
= 0; i
< numBuffers
; ++i
)
486 const SWR_VERTEX_BUFFER_STATE
*pVB
= &pVertexBuffers
[i
];
487 pState
->vertexBuffers
[pVB
->index
] = *pVB
;
491 void SwrSetIndexBuffer(
493 const SWR_INDEX_BUFFER_STATE
* pIndexBuffer
)
495 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
497 pState
->indexBuffer
= *pIndexBuffer
;
500 void SwrSetFetchFunc(
502 PFN_FETCH_FUNC pfnFetchFunc
)
504 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
506 pState
->pfnFetchFunc
= pfnFetchFunc
;
511 PFN_SO_FUNC pfnSoFunc
,
512 uint32_t streamIndex
)
514 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
516 SWR_ASSERT(streamIndex
< MAX_SO_STREAMS
);
518 pState
->pfnSoFunc
[streamIndex
] = pfnSoFunc
;
523 SWR_STREAMOUT_STATE
* pSoState
)
525 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
527 pState
->soState
= *pSoState
;
530 void SwrSetSoBuffers(
532 SWR_STREAMOUT_BUFFER
* pSoBuffer
,
535 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
537 SWR_ASSERT((slot
< 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot
);
539 pState
->soBuffer
[slot
] = *pSoBuffer
;
542 void SwrSetVertexFunc(
544 PFN_VERTEX_FUNC pfnVertexFunc
)
546 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
548 pState
->pfnVertexFunc
= pfnVertexFunc
;
551 void SwrSetFrontendState(
553 SWR_FRONTEND_STATE
*pFEState
)
555 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
556 pState
->frontendState
= *pFEState
;
561 SWR_GS_STATE
*pGSState
)
563 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
564 pState
->gsState
= *pGSState
;
569 PFN_GS_FUNC pfnGsFunc
)
571 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
572 pState
->pfnGsFunc
= pfnGsFunc
;
577 PFN_CS_FUNC pfnCsFunc
,
578 uint32_t totalThreadsInGroup
,
579 uint32_t totalSpillFillSize
)
581 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
582 pState
->pfnCsFunc
= pfnCsFunc
;
583 pState
->totalThreadsInGroup
= totalThreadsInGroup
;
584 pState
->totalSpillFillSize
= totalSpillFillSize
;
589 SWR_TS_STATE
*pState
)
591 API_STATE
* pApiState
= GetDrawState(GetContext(hContext
));
592 pApiState
->tsState
= *pState
;
599 API_STATE
* pApiState
= GetDrawState(GetContext(hContext
));
600 pApiState
->pfnHsFunc
= pfnFunc
;
607 API_STATE
* pApiState
= GetDrawState(GetContext(hContext
));
608 pApiState
->pfnDsFunc
= pfnFunc
;
611 void SwrSetDepthStencilState(
613 SWR_DEPTH_STENCIL_STATE
*pDSState
)
615 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
617 pState
->depthStencilState
= *pDSState
;
620 void SwrSetBackendState(
622 SWR_BACKEND_STATE
*pBEState
)
624 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
626 pState
->backendState
= *pBEState
;
629 void SwrSetPixelShaderState(
631 SWR_PS_STATE
*pPSState
)
633 API_STATE
*pState
= GetDrawState(GetContext(hContext
));
634 pState
->psState
= *pPSState
;
637 void SwrSetBlendState(
639 SWR_BLEND_STATE
*pBlendState
)
641 API_STATE
*pState
= GetDrawState(GetContext(hContext
));
642 memcpy(&pState
->blendState
, pBlendState
, sizeof(SWR_BLEND_STATE
));
645 void SwrSetBlendFunc(
647 uint32_t renderTarget
,
648 PFN_BLEND_JIT_FUNC pfnBlendFunc
)
650 SWR_ASSERT(renderTarget
< SWR_NUM_RENDERTARGETS
);
651 API_STATE
*pState
= GetDrawState(GetContext(hContext
));
652 pState
->pfnBlendFunc
[renderTarget
] = pfnBlendFunc
;
655 // update guardband multipliers for the viewport
656 void updateGuardbands(API_STATE
*pState
)
658 uint32_t numGbs
= pState
->gsState
.emitsRenderTargetArrayIndex
? KNOB_NUM_VIEWPORTS_SCISSORS
: 1;
660 for(uint32_t i
= 0; i
< numGbs
; ++i
)
662 // guardband center is viewport center
663 pState
->gbState
.left
[i
] = KNOB_GUARDBAND_WIDTH
/ pState
->vp
[i
].width
;
664 pState
->gbState
.right
[i
] = KNOB_GUARDBAND_WIDTH
/ pState
->vp
[i
].width
;
665 pState
->gbState
.top
[i
] = KNOB_GUARDBAND_HEIGHT
/ pState
->vp
[i
].height
;
666 pState
->gbState
.bottom
[i
] = KNOB_GUARDBAND_HEIGHT
/ pState
->vp
[i
].height
;
670 void SwrSetRastState(
672 const SWR_RASTSTATE
*pRastState
)
674 SWR_CONTEXT
*pContext
= GetContext(hContext
);
675 API_STATE
* pState
= GetDrawState(pContext
);
677 memcpy(&pState
->rastState
, pRastState
, sizeof(SWR_RASTSTATE
));
680 void SwrSetViewports(
682 uint32_t numViewports
,
683 const SWR_VIEWPORT
* pViewports
,
684 const SWR_VIEWPORT_MATRICES
* pMatrices
)
686 SWR_ASSERT(numViewports
<= KNOB_NUM_VIEWPORTS_SCISSORS
,
687 "Invalid number of viewports.");
689 SWR_CONTEXT
*pContext
= GetContext(hContext
);
690 API_STATE
* pState
= GetDrawState(pContext
);
692 memcpy(&pState
->vp
[0], pViewports
, sizeof(SWR_VIEWPORT
) * numViewports
);
694 if (pMatrices
!= nullptr)
696 // @todo Faster to copy portions of the SOA or just copy all of it?
697 memcpy(&pState
->vpMatrices
, pMatrices
, sizeof(SWR_VIEWPORT_MATRICES
));
701 // Compute default viewport transform.
702 for (uint32_t i
= 0; i
< numViewports
; ++i
)
704 if (pContext
->driverType
== DX
)
706 pState
->vpMatrices
.m00
[i
] = pState
->vp
[i
].width
/ 2.0f
;
707 pState
->vpMatrices
.m11
[i
] = -pState
->vp
[i
].height
/ 2.0f
;
708 pState
->vpMatrices
.m22
[i
] = pState
->vp
[i
].maxZ
- pState
->vp
[i
].minZ
;
709 pState
->vpMatrices
.m30
[i
] = pState
->vp
[i
].x
+ pState
->vpMatrices
.m00
[i
];
710 pState
->vpMatrices
.m31
[i
] = pState
->vp
[i
].y
- pState
->vpMatrices
.m11
[i
];
711 pState
->vpMatrices
.m32
[i
] = pState
->vp
[i
].minZ
;
715 // Standard, with the exception that Y is inverted.
716 pState
->vpMatrices
.m00
[i
] = (pState
->vp
[i
].width
- pState
->vp
[i
].x
) / 2.0f
;
717 pState
->vpMatrices
.m11
[i
] = (pState
->vp
[i
].y
- pState
->vp
[i
].height
) / 2.0f
;
718 pState
->vpMatrices
.m22
[i
] = (pState
->vp
[i
].maxZ
- pState
->vp
[i
].minZ
) / 2.0f
;
719 pState
->vpMatrices
.m30
[i
] = pState
->vp
[i
].x
+ pState
->vpMatrices
.m00
[i
];
720 pState
->vpMatrices
.m31
[i
] = pState
->vp
[i
].height
+ pState
->vpMatrices
.m11
[i
];
721 pState
->vpMatrices
.m32
[i
] = pState
->vp
[i
].minZ
+ pState
->vpMatrices
.m22
[i
];
723 // Now that the matrix is calculated, clip the view coords to screen size.
724 // OpenGL allows for -ve x,y in the viewport.
725 pState
->vp
[i
].x
= std::max(pState
->vp
[i
].x
, 0.0f
);
726 pState
->vp
[i
].y
= std::max(pState
->vp
[i
].y
, 0.0f
);
731 updateGuardbands(pState
);
734 void SwrSetScissorRects(
736 uint32_t numScissors
,
737 const SWR_RECT
* pScissors
)
739 SWR_ASSERT(numScissors
<= KNOB_NUM_VIEWPORTS_SCISSORS
,
740 "Invalid number of scissor rects.");
742 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
743 memcpy(&pState
->scissorRects
[0], pScissors
, numScissors
* sizeof(pScissors
[0]));
746 void SetupMacroTileScissors(DRAW_CONTEXT
*pDC
)
748 API_STATE
*pState
= &pDC
->pState
->state
;
749 uint32_t numScissors
= pState
->gsState
.emitsViewportArrayIndex
? KNOB_NUM_VIEWPORTS_SCISSORS
: 1;
750 pState
->scissorsTileAligned
= true;
752 for (uint32_t index
= 0; index
< numScissors
; ++index
)
754 SWR_RECT
&scissorInFixedPoint
= pState
->scissorsInFixedPoint
[index
];
756 // Set up scissor dimensions based on scissor or viewport
757 if (pState
->rastState
.scissorEnable
)
759 scissorInFixedPoint
= pState
->scissorRects
[index
];
763 // the vp width and height must be added to origin un-rounded then the result round to -inf.
764 // The cast to int works for rounding assuming all [left, right, top, bottom] are positive.
765 scissorInFixedPoint
.xmin
= (int32_t)pState
->vp
[index
].x
;
766 scissorInFixedPoint
.xmax
= (int32_t)(pState
->vp
[index
].x
+ pState
->vp
[index
].width
);
767 scissorInFixedPoint
.ymin
= (int32_t)pState
->vp
[index
].y
;
768 scissorInFixedPoint
.ymax
= (int32_t)(pState
->vp
[index
].y
+ pState
->vp
[index
].height
);
772 scissorInFixedPoint
&= g_MaxScissorRect
;
774 // Test for tile alignment
776 tileAligned
= (scissorInFixedPoint
.xmin
% KNOB_TILE_X_DIM
) == 0;
777 tileAligned
&= (scissorInFixedPoint
.ymin
% KNOB_TILE_Y_DIM
) == 0;
778 tileAligned
&= (scissorInFixedPoint
.xmax
% KNOB_TILE_X_DIM
) == 0;
779 tileAligned
&= (scissorInFixedPoint
.xmax
% KNOB_TILE_Y_DIM
) == 0;
781 pState
->scissorsTileAligned
&= tileAligned
;
783 // Scale to fixed point
784 scissorInFixedPoint
.xmin
*= FIXED_POINT_SCALE
;
785 scissorInFixedPoint
.xmax
*= FIXED_POINT_SCALE
;
786 scissorInFixedPoint
.ymin
*= FIXED_POINT_SCALE
;
787 scissorInFixedPoint
.ymax
*= FIXED_POINT_SCALE
;
789 // Make scissor inclusive
790 scissorInFixedPoint
.xmax
-= 1;
791 scissorInFixedPoint
.ymax
-= 1;
795 // templated backend function tables
796 extern PFN_BACKEND_FUNC gBackendNullPs
[SWR_MULTISAMPLE_TYPE_COUNT
];
797 extern PFN_BACKEND_FUNC gBackendSingleSample
[SWR_INPUT_COVERAGE_COUNT
][2][2];
798 extern PFN_BACKEND_FUNC gBackendPixelRateTable
[SWR_MULTISAMPLE_TYPE_COUNT
][SWR_MSAA_SAMPLE_PATTERN_COUNT
][SWR_INPUT_COVERAGE_COUNT
][2][2][2];
799 extern PFN_BACKEND_FUNC gBackendSampleRateTable
[SWR_MULTISAMPLE_TYPE_COUNT
][SWR_INPUT_COVERAGE_COUNT
][2][2];
800 void SetupPipeline(DRAW_CONTEXT
*pDC
)
802 DRAW_STATE
* pState
= pDC
->pState
;
803 const SWR_RASTSTATE
&rastState
= pState
->state
.rastState
;
804 const SWR_PS_STATE
&psState
= pState
->state
.psState
;
805 BACKEND_FUNCS
& backendFuncs
= pState
->backendFuncs
;
806 const uint32_t forcedSampleCount
= (rastState
.forcedSampleCount
) ? 1 : 0;
809 if (psState
.pfnPixelShader
== nullptr)
811 backendFuncs
.pfnBackend
= gBackendNullPs
[pState
->state
.rastState
.sampleCount
];
815 const bool bMultisampleEnable
= ((rastState
.sampleCount
> SWR_MULTISAMPLE_1X
) || rastState
.forcedSampleCount
) ? 1 : 0;
816 const uint32_t centroid
= ((psState
.barycentricsMask
& SWR_BARYCENTRIC_CENTROID_MASK
) > 0) ? 1 : 0;
817 const uint32_t canEarlyZ
= (psState
.forceEarlyZ
|| (!psState
.writesODepth
&& !psState
.usesSourceDepth
&& !psState
.usesUAV
)) ? 1 : 0;
819 SWR_BARYCENTRICS_MASK barycentricsMask
= (SWR_BARYCENTRICS_MASK
)psState
.barycentricsMask
;
821 // select backend function
822 switch(psState
.shadingRate
)
824 case SWR_SHADING_RATE_PIXEL
:
825 if(bMultisampleEnable
)
827 // always need to generate I & J per sample for Z interpolation
828 barycentricsMask
= (SWR_BARYCENTRICS_MASK
)(barycentricsMask
| SWR_BARYCENTRIC_PER_SAMPLE_MASK
);
829 backendFuncs
.pfnBackend
= gBackendPixelRateTable
[rastState
.sampleCount
][rastState
.samplePattern
][psState
.inputCoverage
][centroid
][forcedSampleCount
][canEarlyZ
];
833 // always need to generate I & J per pixel for Z interpolation
834 barycentricsMask
= (SWR_BARYCENTRICS_MASK
)(barycentricsMask
| SWR_BARYCENTRIC_PER_PIXEL_MASK
);
835 backendFuncs
.pfnBackend
= gBackendSingleSample
[psState
.inputCoverage
][centroid
][canEarlyZ
];
838 case SWR_SHADING_RATE_SAMPLE
:
839 SWR_ASSERT(rastState
.samplePattern
== SWR_MSAA_STANDARD_PATTERN
);
840 // always need to generate I & J per sample for Z interpolation
841 barycentricsMask
= (SWR_BARYCENTRICS_MASK
)(barycentricsMask
| SWR_BARYCENTRIC_PER_SAMPLE_MASK
);
842 backendFuncs
.pfnBackend
= gBackendSampleRateTable
[rastState
.sampleCount
][psState
.inputCoverage
][centroid
][canEarlyZ
];
845 SWR_ASSERT(0 && "Invalid shading rate");
850 PFN_PROCESS_PRIMS pfnBinner
;
851 switch (pState
->state
.topology
)
854 pState
->pfnProcessPrims
= ClipPoints
;
855 pfnBinner
= BinPoints
;
860 case TOP_LINE_LIST_ADJ
:
861 case TOP_LISTSTRIP_ADJ
:
862 pState
->pfnProcessPrims
= ClipLines
;
863 pfnBinner
= BinLines
;
866 pState
->pfnProcessPrims
= ClipTriangles
;
867 pfnBinner
= GetBinTrianglesFunc((rastState
.conservativeRast
> 0));
871 // disable clipper if viewport transform is disabled
872 if (pState
->state
.frontendState
.vpTransformDisable
)
874 pState
->pfnProcessPrims
= pfnBinner
;
877 if ((pState
->state
.psState
.pfnPixelShader
== nullptr) &&
878 (pState
->state
.depthStencilState
.depthTestEnable
== FALSE
) &&
879 (pState
->state
.depthStencilState
.depthWriteEnable
== FALSE
) &&
880 (pState
->state
.depthStencilState
.stencilTestEnable
== FALSE
) &&
881 (pState
->state
.depthStencilState
.stencilWriteEnable
== FALSE
) &&
882 (pState
->state
.backendState
.numAttributes
== 0))
884 pState
->pfnProcessPrims
= nullptr;
887 if (pState
->state
.soState
.rasterizerDisable
== true)
889 pState
->pfnProcessPrims
= nullptr;
892 // set up the frontend attribute count
893 pState
->state
.feNumAttributes
= 0;
894 const SWR_BACKEND_STATE
& backendState
= pState
->state
.backendState
;
895 if (backendState
.swizzleEnable
)
897 // attribute swizzling is enabled, iterate over the map and record the max attribute used
898 for (uint32_t i
= 0; i
< backendState
.numAttributes
; ++i
)
900 pState
->state
.feNumAttributes
= std::max(pState
->state
.feNumAttributes
, (uint32_t)backendState
.swizzleMap
[i
].sourceAttrib
+ 1);
905 pState
->state
.feNumAttributes
= pState
->state
.backendState
.numAttributes
;
908 if (pState
->state
.soState
.soEnable
)
910 uint32_t streamMasks
= 0;
911 for (uint32_t i
= 0; i
< 4; ++i
)
913 streamMasks
|= pState
->state
.soState
.streamMasks
[i
];
917 if (_BitScanReverse(&maxAttrib
, streamMasks
))
919 pState
->state
.feNumAttributes
= std::max(pState
->state
.feNumAttributes
, (uint32_t)(maxAttrib
+ 1));
923 // complicated logic to test for cases where we don't need backing hottile memory for a draw
924 // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled.
925 pState
->state
.depthHottileEnable
= ((!(pState
->state
.depthStencilState
.depthTestEnable
&&
926 !pState
->state
.depthStencilState
.depthWriteEnable
&&
927 pState
->state
.depthStencilState
.depthTestFunc
== ZFUNC_ALWAYS
)) &&
928 (pState
->state
.depthStencilState
.depthTestEnable
||
929 pState
->state
.depthStencilState
.depthWriteEnable
)) ? true : false;
931 pState
->state
.stencilHottileEnable
= (((!(pState
->state
.depthStencilState
.stencilTestEnable
&&
932 !pState
->state
.depthStencilState
.stencilWriteEnable
&&
933 pState
->state
.depthStencilState
.stencilTestFunc
== ZFUNC_ALWAYS
)) ||
934 // for stencil we have to check the double sided state as well
935 (!(pState
->state
.depthStencilState
.doubleSidedStencilTestEnable
&&
936 !pState
->state
.depthStencilState
.stencilWriteEnable
&&
937 pState
->state
.depthStencilState
.backfaceStencilTestFunc
== ZFUNC_ALWAYS
))) &&
938 (pState
->state
.depthStencilState
.stencilTestEnable
||
939 pState
->state
.depthStencilState
.stencilWriteEnable
)) ? true : false;
941 uint32_t numRTs
= pState
->state
.psState
.numRenderTargets
;
942 pState
->state
.colorHottileEnable
= 0;
943 if (psState
.pfnPixelShader
!= nullptr)
945 for (uint32_t rt
= 0; rt
< numRTs
; ++rt
)
947 pState
->state
.colorHottileEnable
|=
948 (!pState
->state
.blendState
.renderTarget
[rt
].writeDisableAlpha
||
949 !pState
->state
.blendState
.renderTarget
[rt
].writeDisableRed
||
950 !pState
->state
.blendState
.renderTarget
[rt
].writeDisableGreen
||
951 !pState
->state
.blendState
.renderTarget
[rt
].writeDisableBlue
) ? (1 << rt
) : 0;
955 // Setup depth quantization function
956 if (pState
->state
.depthHottileEnable
)
958 switch (pState
->state
.rastState
.depthFormat
)
960 case R32_FLOAT_X8X24_TYPELESS
: pState
->state
.pfnQuantizeDepth
= QuantizeDepth
< R32_FLOAT_X8X24_TYPELESS
> ; break;
961 case R32_FLOAT
: pState
->state
.pfnQuantizeDepth
= QuantizeDepth
< R32_FLOAT
> ; break;
962 case R24_UNORM_X8_TYPELESS
: pState
->state
.pfnQuantizeDepth
= QuantizeDepth
< R24_UNORM_X8_TYPELESS
> ; break;
963 case R16_UNORM
: pState
->state
.pfnQuantizeDepth
= QuantizeDepth
< R16_UNORM
> ; break;
964 default: SWR_ASSERT(false, "Unsupported depth format for depth quantiztion.");
965 pState
->state
.pfnQuantizeDepth
= QuantizeDepth
< R32_FLOAT
> ;
970 // set up pass-through quantize if depth isn't enabled
971 pState
->state
.pfnQuantizeDepth
= QuantizeDepth
< R32_FLOAT
> ;
975 //////////////////////////////////////////////////////////////////////////
977 /// @param pDC - Draw context to initialize for this draw.
982 // We don't need to re-setup the scissors/pipeline state again for split draw.
983 if (isSplitDraw
== false)
985 SetupMacroTileScissors(pDC
);
990 //////////////////////////////////////////////////////////////////////////
991 /// @brief We can split the draw for certain topologies for better performance.
///        This computes the per-draw vertex budget used when splitting.
/// @param pDC - Draw context whose API state (stream-out and tessellation enables) is read.
992 /// @param totalVerts - Total vertices for draw
993 /// @param topology - Topology used for draw
/// @return The vertex budget per draw. It starts out as totalVerts and is
///         overridden for the topologies handled below.
994 uint32_t MaxVertsPerDraw(
997 PRIMITIVE_TOPOLOGY topology
)
999 API_STATE
& state
= pDC
->pState
->state
;
// Default: the whole draw goes out in one piece.
1001 uint32_t vertsPerDraw
= totalVerts
;
// Stream-out is tested before any topology-specific limit is applied.
1003 if (state
.soState
.soEnable
)
// Point lists and triangle lists are capped at KNOB_MAX_PRIMS_PER_DRAW.
1010 case TOP_POINT_LIST
:
1011 case TOP_TRIANGLE_LIST
:
1012 vertsPerDraw
= KNOB_MAX_PRIMS_PER_DRAW
;
// All patch-list topologies share the handling that follows the last case label.
1015 case TOP_PATCHLIST_1
:
1016 case TOP_PATCHLIST_2
:
1017 case TOP_PATCHLIST_3
:
1018 case TOP_PATCHLIST_4
:
1019 case TOP_PATCHLIST_5
:
1020 case TOP_PATCHLIST_6
:
1021 case TOP_PATCHLIST_7
:
1022 case TOP_PATCHLIST_8
:
1023 case TOP_PATCHLIST_9
:
1024 case TOP_PATCHLIST_10
:
1025 case TOP_PATCHLIST_11
:
1026 case TOP_PATCHLIST_12
:
1027 case TOP_PATCHLIST_13
:
1028 case TOP_PATCHLIST_14
:
1029 case TOP_PATCHLIST_15
:
1030 case TOP_PATCHLIST_16
:
1031 case TOP_PATCHLIST_17
:
1032 case TOP_PATCHLIST_18
:
1033 case TOP_PATCHLIST_19
:
1034 case TOP_PATCHLIST_20
:
1035 case TOP_PATCHLIST_21
:
1036 case TOP_PATCHLIST_22
:
1037 case TOP_PATCHLIST_23
:
1038 case TOP_PATCHLIST_24
:
1039 case TOP_PATCHLIST_25
:
1040 case TOP_PATCHLIST_26
:
1041 case TOP_PATCHLIST_27
:
1042 case TOP_PATCHLIST_28
:
1043 case TOP_PATCHLIST_29
:
1044 case TOP_PATCHLIST_30
:
1045 case TOP_PATCHLIST_31
:
1046 case TOP_PATCHLIST_32
:
// Patch lists are only capped when tessellation is enabled.
1047 if (pDC
->pState
->state
.tsState
.tsEnable
)
// Vertices per patch is the topology's offset from TOP_PATCHLIST_BASE; the
// budget is that many vertices times KNOB_MAX_TESS_PRIMS_PER_DRAW.
1049 uint32_t vertsPerPrim
= topology
- TOP_PATCHLIST_BASE
;
1050 vertsPerDraw
= vertsPerPrim
* KNOB_MAX_TESS_PRIMS_PER_DRAW
;
1054 // The Primitive Assembly code can only handle 1 RECT at a time.
1060 // We are not splitting up draws for other topologies.
1064 return vertsPerDraw
;
1068 //////////////////////////////////////////////////////////////////////////
1069 /// @brief DrawInstanced - queues a non-indexed draw, split into pieces of at
///        most MaxVertsPerDraw() vertices.
1070 /// @param hContext - Handle passed back from SwrCreateContext
1071 /// @param topology - Specifies topology for draw.
1072 /// @param numVertices - How many vertices to read sequentially from vertex data (per instance).
1073 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1074 /// @param numInstances - How many instances to render. Defaults to 1.
1075 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data). Defaults to 0.
1078 PRIMITIVE_TOPOLOGY topology
,
1079 uint32_t numVertices
,
1080 uint32_t startVertex
,
1081 uint32_t numInstances
= 1,
1082 uint32_t startInstance
= 0)
1089 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1090 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1092 AR_API_BEGIN(APIDraw
, pDC
->drawId
);
// Size of each piece (in vertices and in primitives) if the draw gets split.
1094 uint32_t maxVertsPerDraw
= MaxVertsPerDraw(pDC
, numVertices
, topology
);
1095 uint32_t primsPerDraw
= GetNumPrims(topology
, maxVertsPerDraw
);
1096 uint32_t remainingVerts
= numVertices
;
1098 API_STATE
*pState
= &pDC
->pState
->state
;
1099 pState
->topology
= topology
;
1100 pState
->forceFront
= false;
1102 // disable culling and force front-facing for point lists (only TOP_POINT_LIST is
// tested here); the previous cull mode is saved so it can be restored after the loop
1103 uint32_t oldCullMode
= pState
->rastState
.cullMode
;
1104 if (topology
== TOP_POINT_LIST
)
1106 pState
->rastState
.cullMode
= SWR_CULLMODE_NONE
;
1107 pState
->forceFront
= true;
// Submit the draw in pieces of at most maxVertsPerDraw vertices.
1111 while (remainingVerts
)
1113 uint32_t numVertsForDraw
= (remainingVerts
< maxVertsPerDraw
) ?
1114 remainingVerts
: maxVertsPerDraw
;
// Every piece after the first (draw > 0) is treated as a split draw.
1116 bool isSplitDraw
= (draw
> 0) ? true : false;
// NOTE(review): this declares a loop-local pDC that shadows the function-level
// pDC; DrawIndexedInstance assigns to the existing variable instead.
1117 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
, isSplitDraw
);
1118 InitDraw(pDC
, isSplitDraw
);
1120 pDC
->FeWork
.type
= DRAW
;
1121 pDC
->FeWork
.pfnWork
= GetProcessDrawFunc(
1123 false, // bEnableCutIndex
1124 pState
->tsState
.tsEnable
,
1125 pState
->gsState
.gsEnable
,
1126 pState
->soState
.soEnable
,
1127 pDC
->pState
->pfnProcessPrims
!= nullptr);
1128 pDC
->FeWork
.desc
.draw
.numVerts
= numVertsForDraw
;
1129 pDC
->FeWork
.desc
.draw
.startVertex
= startVertex
;
1130 pDC
->FeWork
.desc
.draw
.numInstances
= numInstances
;
1131 pDC
->FeWork
.desc
.draw
.startInstance
= startInstance
;
// Primitive and vertex IDs of this piece start where the previous pieces left off.
1132 pDC
->FeWork
.desc
.draw
.startPrimID
= draw
* primsPerDraw
;
1133 pDC
->FeWork
.desc
.draw
.startVertexID
= draw
* maxVertsPerDraw
;
// Only the piece that consumes all remaining vertices (the last one) sets cleanupState.
1135 pDC
->cleanupState
= (remainingVerts
== numVertsForDraw
);
1138 QueueDraw(pContext
);
1140 remainingVerts
-= numVertsForDraw
;
1144 // restore culling state
1145 pDC
= GetDrawContext(pContext
);
1146 pDC
->pState
->state
.rastState
.cullMode
= oldCullMode
;
1148 AR_API_END(APIDraw
, numVertices
* numInstances
);
1151 //////////////////////////////////////////////////////////////////////////
/// @brief Non-instanced draw. Forwards to DrawInstanced without instance
///        arguments, so DrawInstanced's defaults (1 instance, start instance 0) apply.
1153 /// @param hContext - Handle passed back from SwrCreateContext
1154 /// @param topology - Specifies topology for draw.
1155 /// @param startVertex - Specifies start vertex in vertex buffer for draw.
1156 /// @param numVertices - Number of vertices.
1159 PRIMITIVE_TOPOLOGY topology
,
1160 uint32_t startVertex
,
1161 uint32_t numVertices
)
// Argument order differs from this function's own: DrawInstanced takes
// numVertices before startVertex.
1163 DrawInstanced(hContext
, topology
, numVertices
, startVertex
);
1166 //////////////////////////////////////////////////////////////////////////
1167 /// @brief SwrDrawInstanced - forwards an instanced, non-indexed draw to DrawInstanced.
1168 /// @param hContext - Handle passed back from SwrCreateContext
1169 /// @param topology - Specifies topology for draw.
1170 /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
1171 /// @param numInstances - How many instances to render.
1172 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1173 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1174 void SwrDrawInstanced(
1176 PRIMITIVE_TOPOLOGY topology
,
1177 uint32_t numVertsPerInstance
,
1178 uint32_t numInstances
,
1179 uint32_t startVertex
,
1180 uint32_t startInstance
// DrawInstanced expects startVertex ahead of numInstances, so the two are
// swapped relative to this function's parameter order.
1183 DrawInstanced(hContext
, topology
, numVertsPerInstance
, startVertex
, numInstances
, startInstance
);
1186 //////////////////////////////////////////////////////////////////////////
1187 /// @brief DrawIndexedInstance - queues an indexed draw, split into pieces of
///        at most MaxVertsPerDraw() indices.
1188 /// @param hContext - Handle passed back from SwrCreateContext
1189 /// @param topology - Specifies topology for draw.
1190 /// @param numIndices - Number of indices to read sequentially from index buffer.
1191 /// @param indexOffset - Starting index into index buffer.
1192 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1193 /// @param numInstances - Number of instances to render. Defaults to 1.
1194 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data). Defaults to 0.
1195 void DrawIndexedInstance(
1197 PRIMITIVE_TOPOLOGY topology
,
1198 uint32_t numIndices
,
1199 uint32_t indexOffset
,
1201 uint32_t numInstances
= 1,
1202 uint32_t startInstance
= 0)
1209 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1210 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1211 API_STATE
* pState
= &pDC
->pState
->state
;
1213 AR_API_BEGIN(APIDrawIndexed
, pDC
->drawId
);
1214 AR_API_EVENT(DrawIndexedInstance(topology
, numIndices
, indexOffset
, baseVertex
, numInstances
, startInstance
));
// Size of each piece (in indices and in primitives) if the draw gets split.
1216 uint32_t maxIndicesPerDraw
= MaxVertsPerDraw(pDC
, numIndices
, topology
);
1217 uint32_t primsPerDraw
= GetNumPrims(topology
, maxIndicesPerDraw
);
1218 uint32_t remainingIndices
= numIndices
;
// Bytes per index, chosen from the bound index buffer's format.
1220 uint32_t indexSize
= 0;
1221 switch (pState
->indexBuffer
.format
)
1223 case R32_UINT
: indexSize
= sizeof(uint32_t); break;
1224 case R16_UINT
: indexSize
= sizeof(uint16_t); break;
1225 case R8_UINT
: indexSize
= sizeof(uint8_t); break;
// Byte pointer to the first index of the draw; the offset is computed in 64 bits.
1231 uint8_t *pIB
= (uint8_t*)pState
->indexBuffer
.pIndices
;
1232 pIB
+= (uint64_t)indexOffset
* (uint64_t)indexSize
;
1234 pState
->topology
= topology
;
1235 pState
->forceFront
= false;
1237 // disable culling and force front-facing for point lists (only TOP_POINT_LIST is
// tested here); the previous cull mode is saved so it can be restored after the loop
1238 uint32_t oldCullMode
= pState
->rastState
.cullMode
;
1239 if (topology
== TOP_POINT_LIST
)
1241 pState
->rastState
.cullMode
= SWR_CULLMODE_NONE
;
1242 pState
->forceFront
= true;
// Submit the draw in pieces of at most maxIndicesPerDraw indices.
1245 while (remainingIndices
)
1247 uint32_t numIndicesForDraw
= (remainingIndices
< maxIndicesPerDraw
) ?
1248 remainingIndices
: maxIndicesPerDraw
;
1250 // When breaking up draw, we need to obtain new draw context for each iteration.
// Every piece after the first (draw > 0) is treated as a split draw.
1251 bool isSplitDraw
= (draw
> 0) ? true : false;
1252 pDC
= GetDrawContext(pContext
, isSplitDraw
);
1253 InitDraw(pDC
, isSplitDraw
);
1255 pDC
->FeWork
.type
= DRAW
;
1256 pDC
->FeWork
.pfnWork
= GetProcessDrawFunc(
1258 pState
->frontendState
.bEnableCutIndex
,
1259 pState
->tsState
.tsEnable
,
1260 pState
->gsState
.gsEnable
,
1261 pState
->soState
.soEnable
,
1262 pDC
->pState
->pfnProcessPrims
!= nullptr);
1263 pDC
->FeWork
.desc
.draw
.pDC
= pDC
;
1264 pDC
->FeWork
.desc
.draw
.numIndices
= numIndicesForDraw
;
1265 pDC
->FeWork
.desc
.draw
.pIB
= (int*)pIB
;
1266 pDC
->FeWork
.desc
.draw
.type
= pDC
->pState
->state
.indexBuffer
.format
;
1268 pDC
->FeWork
.desc
.draw
.numInstances
= numInstances
;
1269 pDC
->FeWork
.desc
.draw
.startInstance
= startInstance
;
1270 pDC
->FeWork
.desc
.draw
.baseVertex
= baseVertex
;
// Primitive IDs of this piece start where the previous pieces left off.
1271 pDC
->FeWork
.desc
.draw
.startPrimID
= draw
* primsPerDraw
;
// Only the piece that consumes all remaining indices (the last one) sets cleanupState.
1273 pDC
->cleanupState
= (remainingIndices
== numIndicesForDraw
);
1276 QueueDraw(pContext
);
// Step the index pointer by one full piece (maxIndicesPerDraw indices) for the next iteration.
1278 pIB
+= maxIndicesPerDraw
* indexSize
;
1279 remainingIndices
-= numIndicesForDraw
;
1283 // restore culling state
1284 pDC
= GetDrawContext(pContext
);
1285 pDC
->pState
->state
.rastState
.cullMode
= oldCullMode
;
1287 AR_API_END(APIDrawIndexed
, numIndices
* numInstances
);
1291 //////////////////////////////////////////////////////////////////////////
1292 /// @brief SwrDrawIndexed - forwards a non-instanced indexed draw to
///        DrawIndexedInstance, whose defaults (1 instance, start instance 0) apply.
1293 /// @param hContext - Handle passed back from SwrCreateContext
1294 /// @param topology - Specifies topology for draw.
1295 /// @param numIndices - Number of indices to read sequentially from index buffer.
1296 /// @param indexOffset - Starting index into index buffer.
1297 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1298 void SwrDrawIndexed(
1300 PRIMITIVE_TOPOLOGY topology
,
1301 uint32_t numIndices
,
1302 uint32_t indexOffset
,
1306 DrawIndexedInstance(hContext
, topology
, numIndices
, indexOffset
, baseVertex
);
1309 //////////////////////////////////////////////////////////////////////////
1310 /// @brief SwrDrawIndexedInstanced - forwards an instanced indexed draw to DrawIndexedInstance.
1311 /// @param hContext - Handle passed back from SwrCreateContext
1312 /// @param topology - Specifies topology for draw.
1313 /// @param numIndices - Number of indices to read sequentially from index buffer.
1314 /// @param numInstances - Number of instances to render.
1315 /// @param indexOffset - Starting index into index buffer.
1316 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1317 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1318 void SwrDrawIndexedInstanced(
1320 PRIMITIVE_TOPOLOGY topology
,
1321 uint32_t numIndices
,
1322 uint32_t numInstances
,
1323 uint32_t indexOffset
,
1325 uint32_t startInstance
)
// DrawIndexedInstance takes numInstances after baseVertex, so it is passed
// later than its position in this function's parameter list.
1327 DrawIndexedInstance(hContext
, topology
, numIndices
, indexOffset
, baseVertex
, numInstances
, startInstance
);
1330 //////////////////////////////////////////////////////////////////////////
1331 /// @brief SwrInvalidateTiles - queues front-end work that marks hottiles as SWR_TILE_INVALID.
1332 /// @param hContext - Handle passed back from SwrCreateContext
1333 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate.
1334 /// @param invalidateRect - The pixel-coordinate rectangle to invalidate. This will be expanded to
1335 /// be hottile size-aligned.
1336 void SWR_API
SwrInvalidateTiles(
1338 uint32_t attachmentMask
,
1339 const SWR_RECT
& invalidateRect
)
1346 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1347 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// Invalidate and discard share one work type and handler; they differ only in
// the descriptor fields filled in below.
1349 pDC
->FeWork
.type
= DISCARDINVALIDATETILES
;
1350 pDC
->FeWork
.pfnWork
= ProcessDiscardInvalidateTiles
;
1351 pDC
->FeWork
.desc
.discardInvalidateTiles
.attachmentMask
= attachmentMask
;
1352 pDC
->FeWork
.desc
.discardInvalidateTiles
.rect
= invalidateRect
;
// Combine the rect with the global maximum scissor rect via SWR_RECT's &=
// (presumably an intersection that clamps the rect -- confirm).
1353 pDC
->FeWork
.desc
.discardInvalidateTiles
.rect
&= g_MaxScissorRect
;
1354 pDC
->FeWork
.desc
.discardInvalidateTiles
.newTileState
= SWR_TILE_INVALID
;
// For an invalidate, createNewTiles and fullTilesOnly are both false.
1355 pDC
->FeWork
.desc
.discardInvalidateTiles
.createNewTiles
= false;
1356 pDC
->FeWork
.desc
.discardInvalidateTiles
.fullTilesOnly
= false;
1359 QueueDraw(pContext
);
1362 //////////////////////////////////////////////////////////////////////////
1363 /// @brief SwrDiscardRect - queues front-end work that discards hottiles, moving them to SWR_TILE_RESOLVED.
1364 /// @param hContext - Handle passed back from SwrCreateContext
1365 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
1366 /// @param rect - The pixel-coordinate rectangle to discard. Only fully-covered hottiles will be discarded.
1368 void SWR_API
SwrDiscardRect(
1370 uint32_t attachmentMask
,
1371 const SWR_RECT
& rect
)
1378 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1379 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1381 // Queue a discard of the hottiles covered by the rect
1382 pDC
->FeWork
.type
= DISCARDINVALIDATETILES
;
1383 pDC
->FeWork
.pfnWork
= ProcessDiscardInvalidateTiles
;
1384 pDC
->FeWork
.desc
.discardInvalidateTiles
.attachmentMask
= attachmentMask
;
1385 pDC
->FeWork
.desc
.discardInvalidateTiles
.rect
= rect
;
// Combine the rect with the global maximum scissor rect via SWR_RECT's &=
// (presumably an intersection that clamps the rect -- confirm).
1386 pDC
->FeWork
.desc
.discardInvalidateTiles
.rect
&= g_MaxScissorRect
;
1387 pDC
->FeWork
.desc
.discardInvalidateTiles
.newTileState
= SWR_TILE_RESOLVED
;
// Unlike SwrInvalidateTiles, a discard sets createNewTiles and fullTilesOnly to true.
1388 pDC
->FeWork
.desc
.discardInvalidateTiles
.createNewTiles
= true;
1389 pDC
->FeWork
.desc
.discardInvalidateTiles
.fullTilesOnly
= true;
1392 QueueDraw(pContext
);
1395 //////////////////////////////////////////////////////////////////////////
1396 /// @brief SwrDispatch - marks the current draw context as compute, records the
///        thread group counts in a COMPUTE_DESC and queues the dispatch.
1397 /// @param hContext - Handle passed back from SwrCreateContext
1398 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
1399 /// @param threadGroupCountY - Number of thread groups dispatched in Y direction
1400 /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
1403 uint32_t threadGroupCountX
,
1404 uint32_t threadGroupCountY
,
1405 uint32_t threadGroupCountZ
)
1412 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1413 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1415 AR_API_BEGIN(APIDispatch
, pDC
->drawId
);
1417 pDC
->isCompute
= true; // This is a compute context.
// The descriptor is allocated from the draw context's arena with 64-byte alignment.
1419 COMPUTE_DESC
* pTaskData
= (COMPUTE_DESC
*)pDC
->pArena
->AllocAligned(sizeof(COMPUTE_DESC
), 64);
1421 pTaskData
->threadGroupCountX
= threadGroupCountX
;
1422 pTaskData
->threadGroupCountY
= threadGroupCountY
;
1423 pTaskData
->threadGroupCountZ
= threadGroupCountZ
;
// NOTE(review): the product is computed in 32 bits, so it can wrap for very large group counts.
1425 uint32_t totalThreadGroups
= threadGroupCountX
* threadGroupCountY
* threadGroupCountZ
;
// The dispatch queue slot is selected by draw id modulo KNOB_MAX_DRAWS_IN_FLIGHT.
1426 uint32_t dcIndex
= pDC
->drawId
% KNOB_MAX_DRAWS_IN_FLIGHT
;
1427 pDC
->pDispatch
= &pContext
->pDispatchQueueArray
[dcIndex
];
1428 pDC
->pDispatch
->initialize(totalThreadGroups
, pTaskData
);
1430 QueueDispatch(pContext
);
1431 AR_API_END(APIDispatch
, threadGroupCountX
* threadGroupCountY
* threadGroupCountZ
);
1434 // Deswizzles, converts and stores current contents of the hot tiles to surface
1435 // described by pState
// attachment         - render target attachment recorded in the store-tiles work descriptor
// postStoreTileState - tile state recorded in the descriptor as postStoreTileState
// storeRect          - rectangle recorded in the descriptor after being combined
//                      with g_MaxScissorRect
1436 void SWR_API
SwrStoreTiles(
1438 SWR_RENDERTARGET_ATTACHMENT attachment
,
1439 SWR_TILE_STATE postStoreTileState
,
1440 const SWR_RECT
& storeRect
)
1447 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1448 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1450 AR_API_BEGIN(APIStoreTiles
, pDC
->drawId
);
// Describe the store as front-end work handled by ProcessStoreTiles.
1452 pDC
->FeWork
.type
= STORETILES
;
1453 pDC
->FeWork
.pfnWork
= ProcessStoreTiles
;
1454 pDC
->FeWork
.desc
.storeTiles
.attachment
= attachment
;
1455 pDC
->FeWork
.desc
.storeTiles
.postStoreTileState
= postStoreTileState
;
1456 pDC
->FeWork
.desc
.storeTiles
.rect
= storeRect
;
// Combine the rect with the global maximum scissor rect via SWR_RECT's &=
// (presumably an intersection that clamps the rect -- confirm).
1457 pDC
->FeWork
.desc
.storeTiles
.rect
&= g_MaxScissorRect
;
1460 QueueDraw(pContext
);
1462 AR_API_END(APIStoreTiles
, 1);
1465 //////////////////////////////////////////////////////////////////////////
1466 /// @brief SwrClearRenderTarget - Clear attached render targets / depth / stencil
1467 /// @param hContext - Handle passed back from SwrCreateContext
1468 /// @param clearMask - combination of SWR_CLEAR_COLOR / SWR_CLEAR_DEPTH / SWR_CLEAR_STENCIL flags (or SWR_CLEAR_NONE)
1469 /// @param clearColor - color used for clearing render targets (4 floats)
1470 /// @param z - depth value used for clearing depth buffer
1471 /// @param stencil - stencil value used for clearing stencil buffer
1472 /// @param clearRect - The pixel-coordinate rectangle to clear in all cleared buffers
1473 void SWR_API
SwrClearRenderTarget(
1476 const float clearColor
[4],
1479 const SWR_RECT
& clearRect
)
1486 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1487 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1489 AR_API_BEGIN(APIClearRenderTarget
, pDC
->drawId
);
1493 flags
.mask
= clearMask
;
// Describe the clear as front-end work handled by ProcessClear.
1495 pDC
->FeWork
.type
= CLEAR
;
1496 pDC
->FeWork
.pfnWork
= ProcessClear
;
1497 pDC
->FeWork
.desc
.clear
.rect
= clearRect
;
// Combine the rect with the global maximum scissor rect via SWR_RECT's &=
// (presumably an intersection that clamps the rect -- confirm).
1498 pDC
->FeWork
.desc
.clear
.rect
&= g_MaxScissorRect
;
1499 pDC
->FeWork
.desc
.clear
.flags
= flags
;
1500 pDC
->FeWork
.desc
.clear
.clearDepth
= z
;
// The four clear color components are copied into the descriptor one by one.
1501 pDC
->FeWork
.desc
.clear
.clearRTColor
[0] = clearColor
[0];
1502 pDC
->FeWork
.desc
.clear
.clearRTColor
[1] = clearColor
[1];
1503 pDC
->FeWork
.desc
.clear
.clearRTColor
[2] = clearColor
[2];
1504 pDC
->FeWork
.desc
.clear
.clearRTColor
[3] = clearColor
[3];
1505 pDC
->FeWork
.desc
.clear
.clearStencil
= stencil
;
1508 QueueDraw(pContext
);
1510 AR_API_END(APIClearRenderTarget
, 1);
1513 //////////////////////////////////////////////////////////////////////////
1514 /// @brief Returns a pointer to the private context state for the current
1515 /// draw operation. This is used for external components.
1517 /// SWR is responsible for the allocation of the private context state.
1518 /// @param hContext - Handle passed back from SwrCreateContext
/// @return Pointer to the private state of the current draw state; it is
///         allocated on the first call for that draw state.
1519 VOID
* SwrGetPrivateContextState(
1522 SWR_CONTEXT
* pContext
= GetContext(hContext
);
1523 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1524 DRAW_STATE
* pState
= pDC
->pState
;
// Allocate on first use from the draw state's arena: privateStateSize bytes,
// aligned to KNOB_SIMD_WIDTH * sizeof(float).
1526 if (pState
->pPrivateState
== nullptr)
1528 pState
->pPrivateState
= pState
->pArena
->AllocAligned(pContext
->privateStateSize
, KNOB_SIMD_WIDTH
*sizeof(float));
1531 return pState
->pPrivateState
;
1534 //////////////////////////////////////////////////////////////////////////
1535 /// @brief Clients can use this to allocate memory for draw/dispatch
1536 /// operations. The memory will automatically be freed once operation
1537 /// has completed. Client can use this to allocate binding tables,
1538 /// etc. needed for shader execution.
1539 /// @param hContext - Handle passed back from SwrCreateContext
1540 /// @param size - Size of allocation
1541 /// @param align - Alignment needed for allocation.
/// @return Pointer returned by the current draw state's arena allocator.
1542 VOID
* SwrAllocDrawContextMemory(
1547 SWR_CONTEXT
* pContext
= GetContext(hContext
);
1548 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// The allocation comes from the arena owned by the current draw state.
1550 return pDC
->pState
->pArena
->AllocAligned(size
, align
);
1553 //////////////////////////////////////////////////////////////////////////
1554 /// @brief Enables stats counting by writing the flag into the current draw
///        context's API state.
1555 /// @param hContext - Handle passed back from SwrCreateContext
1556 /// @param enable - If true then counts are incremented.
1557 void SwrEnableStats(
1561 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1562 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1564 pDC
->pState
->state
.enableStats
= enable
;
1567 //////////////////////////////////////////////////////////////////////////
1568 /// @brief Mark end of frame - used for performance profiling
1569 /// @param hContext - Handle passed back from SwrCreateContext
1570 void SWR_API
SwrEndFrame(
1574 SWR_CONTEXT
*pContext
= GetContext(hContext
);
// Advance the context's frame counter.
1575 pContext
->frameCount
++;