1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief API implementation
27 ******************************************************************************/
35 #include "core/backend.h"
36 #include "core/context.h"
37 #include "core/depthstencil.h"
38 #include "core/frontend.h"
39 #include "core/rasterizer.h"
40 #include "core/rdtsc_core.h"
41 #include "core/threads.h"
42 #include "core/tilemgr.h"
43 #include "core/clip.h"
45 #include "common/simdintrin.h"
46 #include "common/os.h"
48 void SetupDefaultState(SWR_CONTEXT
*pContext
);
/// Builds a fully initialized SWR_CONTEXT and returns it as an opaque HANDLE.
/// Visible steps: aligned allocation + placement-new of the context, initialization of the
/// draw-context ring (dcRing) and draw-state ring (dsRing), one CachingArena per ring entry,
/// optional thread pool creation, per-worker scratch allocation, default state setup, and copying
/// the tile callbacks out of pCreateInfo. pCreateInfo is also written to (contextSaveSize, and
/// pBucketMgr under KNOB_ENABLE_RDTSC).
/// NOTE(review): this listing appears to be missing lines (braces, some preprocessor guards), so
/// the exact nesting of the statements below should be confirmed against the real file.
50 //////////////////////////////////////////////////////////////////////////
51 /// @brief Create SWR Context.
52 /// @param pCreateInfo - pointer to creation info.
53 HANDLE
SwrCreateContext(
54 SWR_CREATECONTEXT_INFO
* pCreateInfo
)
// Zero the raw storage first, then construct the context in place.
// NOTE(review): the allocation result is not checked for null in the visible code.
59 void* pContextMem
= _aligned_malloc(sizeof(SWR_CONTEXT
), KNOB_SIMD_WIDTH
* 4);
60 memset(pContextMem
, 0, sizeof(SWR_CONTEXT
));
61 SWR_CONTEXT
*pContext
= new (pContextMem
) SWR_CONTEXT();
63 pContext
->driverType
= pCreateInfo
->driver
;
64 pContext
->privateStateSize
= pCreateInfo
->privateStateSize
;
// Both rings are sized to the maximum number of draws in flight.
66 pContext
->dcRing
.Init(KNOB_MAX_DRAWS_IN_FLIGHT
);
67 pContext
->dsRing
.Init(KNOB_MAX_DRAWS_IN_FLIGHT
);
// Raw 64-byte-aligned storage for one MacroTileMgr and one DispatchQueue per ring slot;
// the elements are placement-constructed individually in the loop below.
69 pContext
->pMacroTileManagerArray
= (MacroTileMgr
*)_aligned_malloc(sizeof(MacroTileMgr
) * KNOB_MAX_DRAWS_IN_FLIGHT
, 64);
70 pContext
->pDispatchQueueArray
= (DispatchQueue
*)_aligned_malloc(sizeof(DispatchQueue
) * KNOB_MAX_DRAWS_IN_FLIGHT
, 64);
72 for (uint32_t dc
= 0; dc
< KNOB_MAX_DRAWS_IN_FLIGHT
; ++dc
)
// Each draw context gets its own arena; the matching macro tile manager is built on top of it.
74 pContext
->dcRing
[dc
].pArena
= new CachingArena(pContext
->cachingArenaAllocator
);
75 new (&pContext
->pMacroTileManagerArray
[dc
]) MacroTileMgr(*pContext
->dcRing
[dc
].pArena
);
76 new (&pContext
->pDispatchQueueArray
[dc
]) DispatchQueue();
// Draw-state ring entries get a separate arena from the same caching allocator.
78 pContext
->dsRing
[dc
].pArena
= new CachingArena(pContext
->cachingArenaAllocator
);
// Threaded mode: zero, then construct the wait lock and condition variable in place, and
// create the thread pool.
// NOTE(review): presumably everything through CreateThreadPool is inside this if — the braces
// are not visible in this listing.
81 if (!KNOB_SINGLE_THREADED
)
83 memset(&pContext
->WaitLock
, 0, sizeof(pContext
->WaitLock
));
84 memset(&pContext
->FifosNotEmpty
, 0, sizeof(pContext
->FifosNotEmpty
));
85 new (&pContext
->WaitLock
) std::mutex();
86 new (&pContext
->FifosNotEmpty
) std::condition_variable();
88 CreateThreadPool(pContext
, &pContext
->threadPool
);
91 // Calling createThreadPool() above can set SINGLE_THREADED
92 if (KNOB_SINGLE_THREADED
)
94 SET_KNOB(HYPERTHREADED_FE
, false);
95 pContext
->NumWorkerThreads
= 1;
96 pContext
->NumFEThreads
= 1;
97 pContext
->NumBEThreads
= 1;
100 // Allocate scratch space for workers.
101 ///@note We could lazily allocate this but its rather small amount of memory.
102 for (uint32_t i
= 0; i
< pContext
->NumWorkerThreads
; ++i
)
// Use the worker's NUMA node when per-thread data exists, otherwise node 0.
105 uint32_t numaNode
= pContext
->threadPool
.pThreadData
?
106 pContext
->threadPool
.pThreadData
[i
].numaId
: 0;
// NOTE(review): two scratch allocations are visible below (VirtualAllocExNuma and
// _aligned_malloc). Presumably they are alternative platform paths selected by preprocessor
// lines that are missing from this listing; the tail of the first call is also not shown.
// NOTE(review): 32 * sizeof(KILOBYTE) yields 32 KiB only if KILOBYTE is a 1024-byte type. If
// KILOBYTE is a numeric constant, sizeof gives the size of its type instead — confirm the
// intended scratch size against the definition of KILOBYTE.
107 pContext
->pScratch
[i
] = (uint8_t*)VirtualAllocExNuma(
108 GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE
),
109 MEM_RESERVE
| MEM_COMMIT
, PAGE_READWRITE
,
112 pContext
->pScratch
[i
] = (uint8_t*)_aligned_malloc(32 * sizeof(KILOBYTE
), KNOB_SIMD_WIDTH
* 4);
116 // State setup AFTER context is fully initialized
117 SetupDefaultState(pContext
);
119 // initialize hot tile manager
120 pContext
->pHotTileMgr
= new HotTileMgr();
122 // initialize function pointer tables
123 InitClearTilesTable();
125 // initialize store tiles function
126 pContext
->pfnLoadTile
= pCreateInfo
->pfnLoadTile
;
127 pContext
->pfnStoreTile
= pCreateInfo
->pfnStoreTile
;
128 pContext
->pfnClearTile
= pCreateInfo
->pfnClearTile
;
130 // pass pointer to bucket manager back to caller
// NOTE(review): the #endif matching the #ifdef below is not visible in this listing.
131 #ifdef KNOB_ENABLE_RDTSC
132 pCreateInfo
->pBucketMgr
= &gBucketMgr
;
// Report sizeof(API_STATE) back to the caller through contextSaveSize.
135 pCreateInfo
->contextSaveSize
= sizeof(API_STATE
);
137 return (HANDLE
)pContext
;
/// @brief Tears down a context and releases the memory it owns.
/// @param hContext - handle that is cast back to SWR_CONTEXT*; the context storage is freed at
///                   the end, so the handle must not be used afterwards.
/// Visible order: destroy the thread pool, delete the per-slot arenas and run the destructors of
/// the per-slot tile managers / dispatch queues, free those arrays, free the per-worker scratch
/// buffers, delete the hot tile manager, then destroy and free the context itself.
/// NOTE(review): braces and any preprocessor guards are missing from this listing.
140 void SwrDestroyContext(HANDLE hContext
)
142 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
143 DestroyThreadPool(pContext
, &pContext
->threadPool
);
146 for (uint32_t i
= 0; i
< KNOB_MAX_DRAWS_IN_FLIGHT
; ++i
)
148 delete pContext
->dcRing
[i
].pArena
;
149 delete pContext
->dsRing
[i
].pArena
;
// The array elements live in _aligned_malloc'ed storage, so their destructors are invoked
// explicitly here and the storage is released separately below.
150 pContext
->pMacroTileManagerArray
[i
].~MacroTileMgr();
151 pContext
->pDispatchQueueArray
[i
].~DispatchQueue();
154 _aligned_free(pContext
->pDispatchQueueArray
);
155 _aligned_free(pContext
->pMacroTileManagerArray
);
157 // Free scratch space.
158 for (uint32_t i
= 0; i
< pContext
->NumWorkerThreads
; ++i
)
// NOTE(review): both VirtualFree and _aligned_free are visible for the same pointer; presumably
// they are alternative platform paths whose preprocessor guards are missing from this listing.
161 VirtualFree(pContext
->pScratch
[i
], 0, MEM_RELEASE
);
163 _aligned_free(pContext
->pScratch
[i
]);
167 delete(pContext
->pHotTileMgr
);
// Explicit destructor call followed by _aligned_free of the raw context storage.
169 pContext
->~SWR_CONTEXT();
170 _aligned_free((SWR_CONTEXT
*)hContext
);
173 void CopyState(DRAW_STATE
& dst
, const DRAW_STATE
& src
)
175 memcpy(&dst
.state
, &src
.state
, sizeof(API_STATE
));
178 void WakeAllThreads(SWR_CONTEXT
*pContext
)
180 pContext
->FifosNotEmpty
.notify_all();
/// @brief Submits the context's current draw context (pCurDrawContext) to the DC ring.
/// @tparam IsDraw - QueueDraw instantiates this with true, QueueDispatch with false.
/// @param pContext - context whose current draw context is queued; on return pPrevDrawContext
///                   holds the queued DC and pCurDrawContext is nullptr.
/// In the KNOB_SINGLE_THREADED path the visible code runs the front-end/back-end (or compute)
/// work on the calling thread and completes the draw context; the other visible path wakes the
/// worker threads.
/// NOTE(review): the lines that branch on IsDraw, the else branches and all braces are missing
/// from this listing, so which statements are draw-only vs. dispatch-only cannot be confirmed here.
183 template<bool IsDraw
>
184 void QueueWork(SWR_CONTEXT
*pContext
)
// The ring slot index is the draw id modulo the number of draws in flight.
186 DRAW_CONTEXT
* pDC
= pContext
->pCurDrawContext
;
187 uint32_t dcIndex
= pDC
->drawId
% KNOB_MAX_DRAWS_IN_FLIGHT
;
// Attach the slot's macro tile manager to this DC and initialize it.
191 pDC
->pTileMgr
= &pContext
->pMacroTileManagerArray
[dcIndex
];
192 pDC
->pTileMgr
->initialize();
// In the code below the counter is seeded with NumFEThreads + NumBEThreads (the "multiply by 2"
// wording in the comment that follows refers to each worker being counted for FE and for BE).
195 // Each worker thread looks at a DC for both FE and BE work at different times and so we
196 // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers
197 // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
198 // then moved on if all work is done.)
199 pContext
->pCurDrawContext
->threadsDone
= pContext
->NumFEThreads
+ pContext
->NumBEThreads
;
// The enqueue is performed while holding WaitLock.
// NOTE(review): the scope of this lock (its enclosing braces) is not visible here.
203 std::unique_lock
<std::mutex
> lock(pContext
->WaitLock
);
204 pContext
->dcRing
.Enqueue();
207 if (KNOB_SINGLE_THREADED
)
209 // flush denormals to 0
210 uint32_t mxcsr
= _mm_getcsr();
211 _mm_setcsr(mxcsr
| _MM_FLUSH_ZERO_ON
| _MM_DENORMALS_ZERO_ON
);
// Function-local static: one TileSet instance is shared by every call of this instantiation.
215 static TileSet lockedTiles
;
216 uint64_t curDraw
[2] = { pContext
->pCurDrawContext
->drawId
, pContext
->pCurDrawContext
->drawId
};
217 WorkOnFifoFE(pContext
, 0, curDraw
[0]);
218 WorkOnFifoBE(pContext
, 0, curDraw
[1], lockedTiles
, 0, 0);
222 uint64_t curDispatch
= pContext
->pCurDrawContext
->drawId
;
223 WorkOnCompute(pContext
, 0, curDispatch
);
226 // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
227 while (CompleteDrawContext(pContext
, pContext
->pCurDrawContext
) > 0) {}
// NOTE(review): the saved mxcsr value is not restored in the visible lines — confirm in the
// full file.
234 RDTSC_START(APIDrawWakeAllThreads
);
235 WakeAllThreads(pContext
);
236 RDTSC_STOP(APIDrawWakeAllThreads
, 1, 0);
239 // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
240 pContext
->pPrevDrawContext
= pContext
->pCurDrawContext
;
241 pContext
->pCurDrawContext
= nullptr;
244 INLINE
void QueueDraw(SWR_CONTEXT
* pContext
)
246 QueueWork
<true>(pContext
);
249 INLINE
void QueueDispatch(SWR_CONTEXT
* pContext
)
251 QueueWork
<false>(pContext
);
/// @brief Returns the context's current draw context, first claiming and initializing a new one
///        from the DC ring when pCurDrawContext is nullptr.
/// @param pContext    - context to query / update.
/// @param isSplitDraw - when true and a previous draw context exists, the new DC reuses the
///                      previous DC's state pointer instead of copying state into a new DS ring
///                      entry. Asserted to be false when a current DC already exists.
/// @return pContext->pCurDrawContext.
/// NOTE(review): the body of the wait loop, the else branches and all braces are missing from
/// this listing; the nesting described in the comments below is inferred from the existing
/// comments and should be confirmed against the real file.
254 DRAW_CONTEXT
* GetDrawContext(SWR_CONTEXT
*pContext
, bool isSplitDraw
= false)
256 RDTSC_START(APIGetDrawContext
);
257 // If current draw context is null then need to obtain a new draw context to use from ring.
258 if (pContext
->pCurDrawContext
== nullptr)
260 // Need to wait for a free entry.
// NOTE(review): the loop body is not visible in this listing.
261 while (pContext
->dcRing
.IsFull())
// The ring head doubles as the new draw id; the slot index is that id modulo the ring size.
266 uint64_t curDraw
= pContext
->dcRing
.GetHead();
267 uint32_t dcIndex
= curDraw
% KNOB_MAX_DRAWS_IN_FLIGHT
;
// Function-local statics: these trackers are shared by every context that calls this function.
// Old arena blocks are released when more than 2 frames or more than 0x10000 draws have passed
// since the last check.
269 static uint64_t lastDrawChecked
;
270 static uint32_t lastFrameChecked
;
271 if ((pContext
->frameCount
- lastFrameChecked
) > 2 ||
272 (curDraw
- lastDrawChecked
) > 0x10000)
274 // Take this opportunity to clean-up old arena allocations
275 pContext
->cachingArenaAllocator
.FreeOldBlocks();
277 lastFrameChecked
= pContext
->frameCount
;
278 lastDrawChecked
= curDraw
;
281 DRAW_CONTEXT
* pCurDrawContext
= &pContext
->dcRing
[dcIndex
];
282 pContext
->pCurDrawContext
= pCurDrawContext
;
284 // Assign next available entry in DS ring to this DC.
285 uint32_t dsIndex
= pContext
->curStateId
% KNOB_MAX_DRAWS_IN_FLIGHT
;
286 pCurDrawContext
->pState
= &pContext
->dsRing
[dsIndex
];
288 // Copy previous state to current state.
289 if (pContext
->pPrevDrawContext
)
291 DRAW_CONTEXT
* pPrevDrawContext
= pContext
->pPrevDrawContext
;
293 // If we're splitting our draw then we can just use the same state from the previous
294 // draw. In this case, we won't increment the DS ring index so the next non-split
295 // draw can receive the state.
296 if (isSplitDraw
== false)
298 CopyState(*pCurDrawContext
->pState
, *pPrevDrawContext
->pState
);
300 // Should have been cleaned up previously
301 SWR_ASSERT(pCurDrawContext
->pState
->pArena
->IsEmpty() == true);
303 pCurDrawContext
->pState
->pPrivateState
= nullptr;
305 pContext
->curStateId
++; // Progress state ring index forward.
309 // If its a split draw then just copy the state pointer over
310 // since its the same draw.
311 pCurDrawContext
->pState
= pPrevDrawContext
->pState
;
312 SWR_ASSERT(pPrevDrawContext
->cleanupState
== false);
// NOTE(review): presumably the next two statements are the "no previous draw context" branch —
// the else line is not visible here.
317 SWR_ASSERT(pCurDrawContext
->pState
->pArena
->IsEmpty() == true);
318 pContext
->curStateId
++; // Progress state ring index forward.
321 SWR_ASSERT(pCurDrawContext
->pArena
->IsEmpty() == true);
// Reset the per-draw bookkeeping fields of the freshly claimed DC.
323 pCurDrawContext
->dependency
= 0;
324 pCurDrawContext
->pContext
= pContext
;
325 pCurDrawContext
->isCompute
= false; // Dispatch has to set this to true.
327 pCurDrawContext
->doneFE
= false;
328 pCurDrawContext
->FeLock
= 0;
329 pCurDrawContext
->threadsDone
= 0;
331 // Assign unique drawId for this DC
332 pCurDrawContext
->drawId
= pContext
->dcRing
.GetHead();
334 pCurDrawContext
->cleanupState
= true;
// NOTE(review): presumably this assert belongs to the "current DC already exists" branch.
338 SWR_ASSERT(isSplitDraw
== false, "Split draw should only be used when obtaining a new DC");
341 RDTSC_STOP(APIGetDrawContext
, 0, 0);
342 return pContext
->pCurDrawContext
;
345 API_STATE
* GetDrawState(SWR_CONTEXT
*pContext
)
347 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
348 SWR_ASSERT(pDC
->pState
!= nullptr);
350 return &pDC
->pState
->state
;
/// @brief Copies the current API_STATE of the context into a caller-supplied buffer.
/// @param pOutputStateBlock - destination buffer; asserted non-null.
/// The assert also requires memSize to be at least sizeof(API_STATE); exactly that many bytes
/// are copied.
/// NOTE(review): the parameter lines declaring hContext and memSize are missing from this
/// listing, as are the braces.
353 void SWR_API
SwrSaveState(
355 void* pOutputStateBlock
,
358 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
// pSrc is the API_STATE* returned by GetDrawState for this context.
359 auto pSrc
= GetDrawState(pContext
);
360 SWR_ASSERT(pOutputStateBlock
&& memSize
>= sizeof(*pSrc
));
362 memcpy(pOutputStateBlock
, pSrc
, sizeof(*pSrc
));
/// @brief Overwrites the current API_STATE of the context with the bytes of a caller-supplied
///        state block.
/// @param pStateBlock - source buffer; asserted non-null.
/// The assert also requires memSize to be at least sizeof(API_STATE); exactly that many bytes
/// are copied.
/// NOTE(review): the parameter lines declaring hContext and memSize are missing from this
/// listing, as are the braces.
365 void SWR_API
SwrRestoreState(
367 const void* pStateBlock
,
370 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
// pDst is the API_STATE* returned by GetDrawState for this context.
371 auto pDst
= GetDrawState(pContext
);
372 SWR_ASSERT(pStateBlock
&& memSize
>= sizeof(*pDst
));
374 memcpy(pDst
, pStateBlock
, sizeof(*pDst
));
377 void SetupDefaultState(SWR_CONTEXT
*pContext
)
379 API_STATE
* pState
= GetDrawState(pContext
);
381 pState
->rastState
.cullMode
= SWR_CULLMODE_NONE
;
382 pState
->rastState
.frontWinding
= SWR_FRONTWINDING_CCW
;
385 static INLINE SWR_CONTEXT
* GetContext(HANDLE hContext
)
387 return (SWR_CONTEXT
*)hContext
;
/// @brief Sets up a SYNC work item on the current draw context that carries a user callback.
/// @param hContext  - context handle.
/// @param pfnFunc   - callback stored in the sync descriptor; asserted non-null.
/// @param userData  - first value stored alongside the callback.
/// @param userData2 - second value stored alongside the callback.
/// @param userData3 - third value stored alongside the callback.
/// The work item uses ProcessSync as its front-end function and depends on draw id - 1.
/// NOTE(review): the lines between the dependency assignment and RDTSC_STOP (where the DC would
/// be queued) are missing from this listing, as are the braces.
390 void SwrSync(HANDLE hContext
, PFN_CALLBACK_FUNC pfnFunc
, uint64_t userData
, uint64_t userData2
, uint64_t userData3
)
392 RDTSC_START(APISync
);
394 SWR_ASSERT(pfnFunc
!= nullptr);
396 SWR_CONTEXT
*pContext
= GetContext(hContext
);
397 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// Describe the front-end work: type, worker function and the callback payload.
399 pDC
->FeWork
.type
= SYNC
;
400 pDC
->FeWork
.pfnWork
= ProcessSync
;
401 pDC
->FeWork
.desc
.sync
.pfnCallbackFunc
= pfnFunc
;
402 pDC
->FeWork
.desc
.sync
.userData
= userData
;
403 pDC
->FeWork
.desc
.sync
.userData2
= userData2
;
404 pDC
->FeWork
.desc
.sync
.userData3
= userData3
;
406 // cannot execute until all previous draws have completed
407 pDC
->dependency
= pDC
->drawId
- 1;
412 RDTSC_STOP(APISync
, 1, 0);
/// @brief Blocks the caller until the context's draw-context ring reports empty.
/// @param hContext - context handle.
/// NOTE(review): the body of the wait loop and the braces are missing from this listing, so how
/// the loop yields or pauses between polls cannot be confirmed here.
415 void SwrWaitForIdle(HANDLE hContext
)
417 SWR_CONTEXT
*pContext
= GetContext(hContext
);
419 RDTSC_START(APIWaitForIdle
);
// Poll until dcRing.IsEmpty() becomes true.
421 while (!pContext
->dcRing
.IsEmpty())
426 RDTSC_STOP(APIWaitForIdle
, 1, 0);
/// @brief Copies an array of vertex buffer descriptions into the current draw state.
/// @param pVertexBuffers - array of descriptions; each one is stored in the slot named by its
///                         own `index` member, not by its position in the array.
/// NOTE(review): the parameter lines declaring hContext and numBuffers are missing from this
/// listing, as are the braces.
/// NOTE(review): pVB->index is used as an array subscript without a visible range check.
429 void SwrSetVertexBuffers(
432 const SWR_VERTEX_BUFFER_STATE
* pVertexBuffers
)
434 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
436 for (uint32_t i
= 0; i
< numBuffers
; ++i
)
438 const SWR_VERTEX_BUFFER_STATE
*pVB
= &pVertexBuffers
[i
];
439 pState
->vertexBuffers
[pVB
->index
] = *pVB
;
/// @brief Copies the given index buffer description into the current draw state.
/// @param pIndexBuffer - description to copy; dereferenced without a null check.
/// NOTE(review): the parameter line declaring hContext is missing from this listing, as are the
/// braces.
443 void SwrSetIndexBuffer(
445 const SWR_INDEX_BUFFER_STATE
* pIndexBuffer
)
447 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
449 pState
->indexBuffer
= *pIndexBuffer
;
/// @brief Stores the fetch function pointer in the current draw state.
/// @param pfnFetchFunc - function pointer to store (no validation is performed here).
/// NOTE(review): the parameter line declaring hContext is missing from this listing, as are the
/// braces.
452 void SwrSetFetchFunc(
454 PFN_FETCH_FUNC pfnFetchFunc
)
456 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
458 pState
->pfnFetchFunc
= pfnFetchFunc
;
463 PFN_SO_FUNC pfnSoFunc
,
464 uint32_t streamIndex
)
466 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
468 SWR_ASSERT(streamIndex
< MAX_SO_STREAMS
);
470 pState
->pfnSoFunc
[streamIndex
] = pfnSoFunc
;
475 SWR_STREAMOUT_STATE
* pSoState
)
477 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
479 pState
->soState
= *pSoState
;
/// @brief Copies a stream-out buffer description into one slot of the current draw state.
/// @param pSoBuffer - description to copy; dereferenced without a null check.
/// The slot index is asserted to be less than 4.
/// NOTE(review): the parameter lines declaring hContext and slot are missing from this listing,
/// as are the braces.
482 void SwrSetSoBuffers(
484 SWR_STREAMOUT_BUFFER
* pSoBuffer
,
487 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
489 SWR_ASSERT((slot
< 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot
);
491 pState
->soBuffer
[slot
] = *pSoBuffer
;
/// @brief Stores the vertex function pointer in the current draw state.
/// @param pfnVertexFunc - function pointer to store (no validation is performed here).
/// NOTE(review): the parameter line declaring hContext is missing from this listing, as are the
/// braces.
494 void SwrSetVertexFunc(
496 PFN_VERTEX_FUNC pfnVertexFunc
)
498 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
500 pState
->pfnVertexFunc
= pfnVertexFunc
;
/// @brief Copies the given front-end state into the current draw state.
/// @param pFEState - state to copy; dereferenced without a null check.
/// NOTE(review): the parameter line declaring hContext is missing from this listing, as are the
/// braces.
503 void SwrSetFrontendState(
505 SWR_FRONTEND_STATE
*pFEState
)
507 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
508 pState
->frontendState
= *pFEState
;
513 SWR_GS_STATE
*pGSState
)
515 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
516 pState
->gsState
= *pGSState
;
521 PFN_GS_FUNC pfnGsFunc
)
523 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
524 pState
->pfnGsFunc
= pfnGsFunc
;
529 PFN_CS_FUNC pfnCsFunc
,
530 uint32_t totalThreadsInGroup
,
531 uint32_t totalSpillFillSize
)
533 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
534 pState
->pfnCsFunc
= pfnCsFunc
;
535 pState
->totalThreadsInGroup
= totalThreadsInGroup
;
536 pState
->totalSpillFillSize
= totalSpillFillSize
;
541 SWR_TS_STATE
*pState
)
543 API_STATE
* pApiState
= GetDrawState(GetContext(hContext
));
544 pApiState
->tsState
= *pState
;
551 API_STATE
* pApiState
= GetDrawState(GetContext(hContext
));
552 pApiState
->pfnHsFunc
= pfnFunc
;
559 API_STATE
* pApiState
= GetDrawState(GetContext(hContext
));
560 pApiState
->pfnDsFunc
= pfnFunc
;
/// @brief Copies the given depth/stencil state into the current draw state.
/// @param pDSState - state to copy; dereferenced without a null check.
/// NOTE(review): the parameter line declaring hContext is missing from this listing, as are the
/// braces.
563 void SwrSetDepthStencilState(
565 SWR_DEPTH_STENCIL_STATE
*pDSState
)
567 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
569 pState
->depthStencilState
= *pDSState
;
/// @brief Copies the given back-end state into the current draw state.
/// @param pBEState - state to copy; dereferenced without a null check.
/// NOTE(review): the parameter line declaring hContext is missing from this listing, as are the
/// braces.
572 void SwrSetBackendState(
574 SWR_BACKEND_STATE
*pBEState
)
576 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
578 pState
->backendState
= *pBEState
;
/// @brief Copies the given pixel shader state into the current draw state.
/// @param pPSState - state to copy; dereferenced without a null check.
/// NOTE(review): the parameter line declaring hContext is missing from this listing, as are the
/// braces.
581 void SwrSetPixelShaderState(
583 SWR_PS_STATE
*pPSState
)
585 API_STATE
*pState
= GetDrawState(GetContext(hContext
));
586 pState
->psState
= *pPSState
;
/// @brief Byte-copies the given blend state into the current draw state.
/// @param pBlendState - source of sizeof(SWR_BLEND_STATE) bytes; not null-checked.
/// NOTE(review): the parameter line declaring hContext is missing from this listing, as are the
/// braces.
589 void SwrSetBlendState(
591 SWR_BLEND_STATE
*pBlendState
)
593 API_STATE
*pState
= GetDrawState(GetContext(hContext
));
594 memcpy(&pState
->blendState
, pBlendState
, sizeof(SWR_BLEND_STATE
));
/// @brief Stores the blend function pointer for one render target in the current draw state.
/// @param renderTarget - render target index; asserted to be below SWR_NUM_RENDERTARGETS.
/// @param pfnBlendFunc - function pointer to store for that render target.
/// NOTE(review): the parameter line declaring hContext is missing from this listing, as are the
/// braces.
597 void SwrSetBlendFunc(
599 uint32_t renderTarget
,
600 PFN_BLEND_JIT_FUNC pfnBlendFunc
)
602 SWR_ASSERT(renderTarget
< SWR_NUM_RENDERTARGETS
);
603 API_STATE
*pState
= GetDrawState(GetContext(hContext
));
604 pState
->pfnBlendFunc
[renderTarget
] = pfnBlendFunc
;
612 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
614 static const uint8_t IDENTITY_MAP
[] =
616 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
617 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
619 static_assert(sizeof(IDENTITY_MAP
) == sizeof(pState
->linkageMap
),
620 "Update for new value of MAX_ATTRIBUTES");
622 pState
->linkageMask
= mask
;
623 pState
->linkageCount
= _mm_popcnt_u32(mask
);
629 memcpy(pState
->linkageMap
, pMap
, pState
->linkageCount
);
632 // update guardband multipliers for the viewport
633 void updateGuardband(API_STATE
*pState
)
635 // guardband center is viewport center
636 pState
->gbState
.left
= KNOB_GUARDBAND_WIDTH
/ pState
->vp
[0].width
;
637 pState
->gbState
.right
= KNOB_GUARDBAND_WIDTH
/ pState
->vp
[0].width
;
638 pState
->gbState
.top
= KNOB_GUARDBAND_HEIGHT
/ pState
->vp
[0].height
;
639 pState
->gbState
.bottom
= KNOB_GUARDBAND_HEIGHT
/ pState
->vp
[0].height
;
/// @brief Byte-copies the given rasterizer state into the current draw state.
/// @param pRastState - source of sizeof(SWR_RASTSTATE) bytes; not null-checked.
/// NOTE(review): the parameter line declaring hContext is missing from this listing, as are the
/// braces.
642 void SwrSetRastState(
644 const SWR_RASTSTATE
*pRastState
)
646 SWR_CONTEXT
*pContext
= GetContext(hContext
);
647 API_STATE
* pState
= GetDrawState(pContext
);
649 memcpy(&pState
->rastState
, pRastState
, sizeof(SWR_RASTSTATE
));
/// @brief Stores viewports (and their transform matrices) in the current draw state and
///        refreshes the guardband.
/// @param numViewports - number of entries in the arrays; asserted to be at most
///                       KNOB_NUM_VIEWPORTS_SCISSORS.
/// @param pViewports   - viewports copied into pState->vp starting at index 0.
/// @param pMatrices    - when non-null, copied verbatim into pState->vpMatrix. Matrices computed
///                       from the viewports are also visible below, using one formula when
///                       driverType == DX and a second formula after it.
/// NOTE(review): the parameter line declaring hContext, the else lines and all braces are
/// missing from this listing; presumably the computed matrices are the pMatrices == nullptr path
/// and the second formula is the non-DX path — confirm against the real file.
652 void SwrSetViewports(
654 uint32_t numViewports
,
655 const SWR_VIEWPORT
* pViewports
,
656 const SWR_VIEWPORT_MATRIX
* pMatrices
)
658 SWR_ASSERT(numViewports
<= KNOB_NUM_VIEWPORTS_SCISSORS
,
659 "Invalid number of viewports.");
661 SWR_CONTEXT
*pContext
= GetContext(hContext
);
662 API_STATE
* pState
= GetDrawState(pContext
);
664 memcpy(&pState
->vp
[0], pViewports
, sizeof(SWR_VIEWPORT
) * numViewports
);
666 if (pMatrices
!= nullptr)
668 memcpy(&pState
->vpMatrix
[0], pMatrices
, sizeof(SWR_VIEWPORT_MATRIX
) * numViewports
);
672 // Compute default viewport transform.
673 for (uint32_t i
= 0; i
< numViewports
; ++i
)
675 if (pContext
->driverType
== DX
)
// DX formula: half-extent scales with negated Y scale; the offsets are built from the viewport
// origin and those scales; Z maps to [minZ, maxZ].
677 pState
->vpMatrix
[i
].m00
= pState
->vp
[i
].width
/ 2.0f
;
678 pState
->vpMatrix
[i
].m11
= -pState
->vp
[i
].height
/ 2.0f
;
679 pState
->vpMatrix
[i
].m22
= pState
->vp
[i
].maxZ
- pState
->vp
[i
].minZ
;
680 pState
->vpMatrix
[i
].m30
= pState
->vp
[i
].x
+ pState
->vpMatrix
[i
].m00
;
681 pState
->vpMatrix
[i
].m31
= pState
->vp
[i
].y
- pState
->vpMatrix
[i
].m11
;
682 pState
->vpMatrix
[i
].m32
= pState
->vp
[i
].minZ
;
// NOTE(review): the formulas below use (width - x) and (y - height), and m31 is built from
// height rather than y, which differs in shape from the DX formulas above — confirm how the
// caller fills SWR_VIEWPORT for this driver type before changing anything here.
686 // Standard, with the exception that Y is inverted.
687 pState
->vpMatrix
[i
].m00
= (pState
->vp
[i
].width
- pState
->vp
[i
].x
) / 2.0f
;
688 pState
->vpMatrix
[i
].m11
= (pState
->vp
[i
].y
- pState
->vp
[i
].height
) / 2.0f
;
689 pState
->vpMatrix
[i
].m22
= (pState
->vp
[i
].maxZ
- pState
->vp
[i
].minZ
) / 2.0f
;
690 pState
->vpMatrix
[i
].m30
= pState
->vp
[i
].x
+ pState
->vpMatrix
[i
].m00
;
691 pState
->vpMatrix
[i
].m31
= pState
->vp
[i
].height
+ pState
->vpMatrix
[i
].m11
;
692 pState
->vpMatrix
[i
].m32
= pState
->vp
[i
].minZ
+ pState
->vpMatrix
[i
].m22
;
694 // Now that the matrix is calculated, clip the view coords to screen size.
695 // OpenGL allows for -ve x,y in the viewport.
696 pState
->vp
[i
].x
= std::max(pState
->vp
[i
].x
, 0.0f
);
697 pState
->vp
[i
].y
= std::max(pState
->vp
[i
].y
, 0.0f
);
// The guardband is derived from viewport 0 only.
702 updateGuardband(pState
);
/// @brief Byte-copies an array of scissor rectangles into the current draw state.
/// @param numScissors - number of rectangles; asserted to be at most
///                      KNOB_NUM_VIEWPORTS_SCISSORS.
/// @param pScissors   - source array of BBOX; copied into scissorRects starting at index 0.
/// NOTE(review): the parameter line declaring hContext is missing from this listing, as are the
/// braces.
705 void SwrSetScissorRects(
707 uint32_t numScissors
,
708 const BBOX
* pScissors
)
710 SWR_ASSERT(numScissors
<= KNOB_NUM_VIEWPORTS_SCISSORS
,
711 "Invalid number of scissor rects.");
713 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
714 memcpy(&pState
->scissorRects
[0], pScissors
, numScissors
* sizeof(BBOX
));
/// @brief Computes pState->scissorInFixedPoint for a draw from either scissor rect 0 or
///        viewport 0.
/// @param pDC - draw context whose API state is read and updated.
/// Visible behavior: right/bottom are clamped to KNOB_MAX_SCISSOR_X / KNOB_MAX_SCISSOR_Y; when
/// left or top exceed those limits the fixed-point scissor is set to all zeros; otherwise the
/// edges are scaled by FIXED_POINT_SCALE, with 1 subtracted from the scaled right and bottom.
/// NOTE(review): the else lines and all braces are missing from this listing; presumably the
/// viewport-derived values are the scissorEnable == false path.
717 void SetupMacroTileScissors(DRAW_CONTEXT
*pDC
)
719 API_STATE
*pState
= &pDC
->pState
->state
;
720 uint32_t left
, right
, top
, bottom
;
722 // Set up scissor dimensions based on scissor or viewport
723 if (pState
->rastState
.scissorEnable
)
// Note: the subtraction mentioned in the comment below is applied later, after scaling by
// FIXED_POINT_SCALE, so it removes one fixed-point unit rather than a whole pixel.
725 // scissor rect right/bottom edge are exclusive, core expects scissor dimensions to be inclusive, so subtract one pixel from right/bottom edges
726 left
= pState
->scissorRects
[0].left
;
727 right
= pState
->scissorRects
[0].right
;
728 top
= pState
->scissorRects
[0].top
;
729 bottom
= pState
->scissorRects
[0].bottom
;
// Viewport-derived bounds: float values are truncated to int32_t and then stored in unsigned
// locals, so a negative result wraps to a large unsigned value.
733 left
= (int32_t)pState
->vp
[0].x
;
734 right
= (int32_t)pState
->vp
[0].x
+ (int32_t)pState
->vp
[0].width
;
735 top
= (int32_t)pState
->vp
[0].y
;
736 bottom
= (int32_t)pState
->vp
[0].y
+ (int32_t)pState
->vp
[0].height
;
739 right
= std::min
<uint32_t>(right
, KNOB_MAX_SCISSOR_X
);
740 bottom
= std::min
<uint32_t>(bottom
, KNOB_MAX_SCISSOR_Y
);
// An origin beyond the maximum scissor extents yields an all-zero fixed-point scissor.
742 if (left
> KNOB_MAX_SCISSOR_X
|| top
> KNOB_MAX_SCISSOR_Y
)
744 pState
->scissorInFixedPoint
.left
= 0;
745 pState
->scissorInFixedPoint
.right
= 0;
746 pState
->scissorInFixedPoint
.top
= 0;
747 pState
->scissorInFixedPoint
.bottom
= 0;
751 pState
->scissorInFixedPoint
.left
= left
* FIXED_POINT_SCALE
;
752 pState
->scissorInFixedPoint
.right
= right
* FIXED_POINT_SCALE
- 1;
753 pState
->scissorInFixedPoint
.top
= top
* FIXED_POINT_SCALE
;
754 pState
->scissorInFixedPoint
.bottom
= bottom
* FIXED_POINT_SCALE
- 1;
757 // templated backend function tables
758 extern PFN_BACKEND_FUNC gBackendNullPs
[SWR_MULTISAMPLE_TYPE_MAX
];
759 extern PFN_BACKEND_FUNC gBackendSingleSample
[2][2];
760 extern PFN_BACKEND_FUNC gBackendPixelRateTable
[SWR_MULTISAMPLE_TYPE_MAX
][SWR_MSAA_SAMPLE_PATTERN_MAX
][SWR_INPUT_COVERAGE_MAX
][2][2];
761 extern PFN_BACKEND_FUNC gBackendSampleRateTable
[SWR_MULTISAMPLE_TYPE_MAX
][SWR_INPUT_COVERAGE_MAX
][2];
762 extern PFN_OUTPUT_MERGER gBackendOutputMergerTable
[SWR_NUM_RENDERTARGETS
+ 1][SWR_MULTISAMPLE_TYPE_MAX
];
763 extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable
[2];
764 extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable
[2];
765 extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable
[SWR_MULTISAMPLE_TYPE_MAX
][2][2][2];
/// @brief Derives the per-draw function pointers and enable flags from the draw's API state.
/// @param pDC - draw context whose DRAW_STATE is read and updated.
/// Visible outputs: backendFuncs (pfnBackend, pfnOutputMerger and the three barycentric
/// functions), pfnProcessPrims, linkageMask / feAttribMask, depthHottileEnable,
/// stencilHottileEnable, colorHottileEnable and pfnQuantizeDepth.
/// NOTE(review): many lines are missing from this listing (several case labels, else lines,
/// break statements and all braces), so the comments below describe the visible statements only
/// and the exact branch structure should be confirmed against the real file.
766 void SetupPipeline(DRAW_CONTEXT
*pDC
)
768 DRAW_STATE
* pState
= pDC
->pState
;
769 const SWR_RASTSTATE
&rastState
= pState
->state
.rastState
;
770 const SWR_PS_STATE
&psState
= pState
->state
.psState
;
771 BACKEND_FUNCS
& backendFuncs
= pState
->backendFuncs
;
// Table index: 1 when a forced sample count is requested, 0 otherwise.
772 const uint32_t forcedSampleCount
= (rastState
.bForcedSampleCount
) ? 1 : 0;
// No pixel shader bound: pick the null-PS backend for the current sample count.
775 if (psState
.pfnPixelShader
== nullptr)
777 backendFuncs
.pfnBackend
= gBackendNullPs
[pState
->state
.rastState
.sampleCount
];
778 // always need to generate I & J per sample for Z interpolation
779 backendFuncs
.pfnCalcSampleBarycentrics
= gSampleBarycentricTable
[1];
// Pixel shader bound (presumably the else branch): derive the table indices used below.
783 const bool bMultisampleEnable
= ((rastState
.sampleCount
> SWR_MULTISAMPLE_1X
) || rastState
.bForcedSampleCount
) ? 1 : 0;
784 const uint32_t centroid
= ((psState
.barycentricsMask
& SWR_BARYCENTRIC_CENTROID_MASK
) > 0) ? 1 : 0;
786 // currently only support 'normal' input coverage
787 SWR_ASSERT(psState
.inputCoverage
== SWR_INPUT_COVERAGE_NORMAL
||
788 psState
.inputCoverage
== SWR_INPUT_COVERAGE_NONE
);
790 SWR_BARYCENTRICS_MASK barycentricsMask
= (SWR_BARYCENTRICS_MASK
)psState
.barycentricsMask
;
792 // select backend function
793 switch(psState
.shadingRate
)
795 case SWR_SHADING_RATE_PIXEL
:
796 if(bMultisampleEnable
)
798 // always need to generate I & J per sample for Z interpolation
799 barycentricsMask
= (SWR_BARYCENTRICS_MASK
)(barycentricsMask
| SWR_BARYCENTRIC_PER_SAMPLE_MASK
);
800 backendFuncs
.pfnBackend
= gBackendPixelRateTable
[rastState
.sampleCount
][rastState
.samplePattern
][psState
.inputCoverage
][centroid
][forcedSampleCount
];
801 backendFuncs
.pfnOutputMerger
= gBackendOutputMergerTable
[psState
.numRenderTargets
][pState
->state
.blendState
.sampleCount
];
805 // always need to generate I & J per pixel for Z interpolation
806 barycentricsMask
= (SWR_BARYCENTRICS_MASK
)(barycentricsMask
| SWR_BARYCENTRIC_PER_PIXEL_MASK
);
807 backendFuncs
.pfnBackend
= gBackendSingleSample
[psState
.inputCoverage
][centroid
];
808 backendFuncs
.pfnOutputMerger
= gBackendOutputMergerTable
[psState
.numRenderTargets
][SWR_MULTISAMPLE_1X
];
811 case SWR_SHADING_RATE_SAMPLE
:
812 SWR_ASSERT(rastState
.samplePattern
== SWR_MSAA_STANDARD_PATTERN
);
813 // always need to generate I & J per sample for Z interpolation
814 barycentricsMask
= (SWR_BARYCENTRICS_MASK
)(barycentricsMask
| SWR_BARYCENTRIC_PER_SAMPLE_MASK
);
815 backendFuncs
.pfnBackend
= gBackendSampleRateTable
[rastState
.sampleCount
][psState
.inputCoverage
][centroid
];
816 backendFuncs
.pfnOutputMerger
= gBackendOutputMergerTable
[psState
.numRenderTargets
][pState
->state
.blendState
.sampleCount
];
819 SWR_ASSERT(0 && "Invalid shading rate");
823 // setup pointer to function that generates necessary barycentrics required by the PS
824 bool bBarycentrics
= (barycentricsMask
& SWR_BARYCENTRIC_PER_PIXEL_MASK
) > 0 ? 1 : 0;
825 backendFuncs
.pfnCalcPixelBarycentrics
= gPixelBarycentricTable
[bBarycentrics
];
827 bBarycentrics
= (barycentricsMask
& SWR_BARYCENTRIC_PER_SAMPLE_MASK
) > 0 ? 1 : 0;
828 backendFuncs
.pfnCalcSampleBarycentrics
= gSampleBarycentricTable
[bBarycentrics
];
830 bBarycentrics
= (barycentricsMask
& SWR_BARYCENTRIC_CENTROID_MASK
) > 0 ? 1 : 0;
831 backendFuncs
.pfnCalcCentroidBarycentrics
= gCentroidBarycentricTable
[rastState
.sampleCount
][bBarycentrics
][rastState
.samplePattern
][forcedSampleCount
];
// Choose the clip function and the matching binner by topology: points, lines or triangles.
// NOTE(review): most case labels of this switch are not visible in this listing.
834 PFN_PROCESS_PRIMS pfnBinner
;
835 switch (pState
->state
.topology
)
838 pState
->pfnProcessPrims
= ClipPoints
;
839 pfnBinner
= BinPoints
;
844 case TOP_LINE_LIST_ADJ
:
845 case TOP_LISTSTRIP_ADJ
:
846 pState
->pfnProcessPrims
= ClipLines
;
847 pfnBinner
= BinLines
;
850 pState
->pfnProcessPrims
= ClipTriangles
;
851 pfnBinner
= BinTriangles
;
855 // disable clipper if viewport transform is disabled
856 if (pState
->state
.frontendState
.vpTransformDisable
)
858 pState
->pfnProcessPrims
= pfnBinner
;
// With no pixel shader, no depth/stencil test or write, and no linked attributes, primitive
// processing is disabled entirely and the linkage mask is cleared.
861 if ((pState
->state
.psState
.pfnPixelShader
== nullptr) &&
862 (pState
->state
.depthStencilState
.depthTestEnable
== FALSE
) &&
863 (pState
->state
.depthStencilState
.depthWriteEnable
== FALSE
) &&
864 (pState
->state
.depthStencilState
.stencilTestEnable
== FALSE
) &&
865 (pState
->state
.depthStencilState
.stencilWriteEnable
== FALSE
) &&
866 (pState
->state
.linkageCount
== 0))
868 pState
->pfnProcessPrims
= nullptr;
869 pState
->state
.linkageMask
= 0;
// The same disabling applies when stream-out requests the rasterizer to be disabled.
872 if (pState
->state
.soState
.rasterizerDisable
== true)
874 pState
->pfnProcessPrims
= nullptr;
875 pState
->state
.linkageMask
= 0;
878 // set up the frontend attrib mask
879 pState
->state
.feAttribMask
= pState
->state
.linkageMask
;
880 if (pState
->state
.soState
.soEnable
)
// OR in the masks of the first 4 stream-out streams.
882 for (uint32_t i
= 0; i
< 4; ++i
)
884 pState
->state
.feAttribMask
|= pState
->state
.soState
.streamMasks
[i
];
888 // complicated logic to test for cases where we don't need backing hottile memory for a draw
889 // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled.
890 pState
->state
.depthHottileEnable
= ((!(pState
->state
.depthStencilState
.depthTestEnable
&&
891 !pState
->state
.depthStencilState
.depthWriteEnable
&&
892 pState
->state
.depthStencilState
.depthTestFunc
== ZFUNC_ALWAYS
)) &&
893 (pState
->state
.depthStencilState
.depthTestEnable
||
894 pState
->state
.depthStencilState
.depthWriteEnable
)) ? true : false;
896 pState
->state
.stencilHottileEnable
= (((!(pState
->state
.depthStencilState
.stencilTestEnable
&&
897 !pState
->state
.depthStencilState
.stencilWriteEnable
&&
898 pState
->state
.depthStencilState
.stencilTestFunc
== ZFUNC_ALWAYS
)) ||
899 // for stencil we have to check the double sided state as well
900 (!(pState
->state
.depthStencilState
.doubleSidedStencilTestEnable
&&
901 !pState
->state
.depthStencilState
.stencilWriteEnable
&&
902 pState
->state
.depthStencilState
.backfaceStencilTestFunc
== ZFUNC_ALWAYS
))) &&
903 (pState
->state
.depthStencilState
.stencilTestEnable
||
904 pState
->state
.depthStencilState
.stencilWriteEnable
)) ? true : false;
// colorHottileEnable is a bit mask: bit rt is set when render target rt has at least one
// color channel whose write is not disabled. It stays 0 when no pixel shader is bound.
906 uint32_t numRTs
= pState
->state
.psState
.numRenderTargets
;
907 pState
->state
.colorHottileEnable
= 0;
908 if (psState
.pfnPixelShader
!= nullptr)
910 for (uint32_t rt
= 0; rt
< numRTs
; ++rt
)
912 pState
->state
.colorHottileEnable
|=
913 (!pState
->state
.blendState
.renderTarget
[rt
].writeDisableAlpha
||
914 !pState
->state
.blendState
.renderTarget
[rt
].writeDisableRed
||
915 !pState
->state
.blendState
.renderTarget
[rt
].writeDisableGreen
||
916 !pState
->state
.blendState
.renderTarget
[rt
].writeDisableBlue
) ? (1 << rt
) : 0;
920 // Setup depth quantization function
921 if (pState
->state
.depthHottileEnable
)
923 switch (pState
->state
.rastState
.depthFormat
)
925 case R32_FLOAT_X8X24_TYPELESS
: pState
->state
.pfnQuantizeDepth
= QuantizeDepth
< R32_FLOAT_X8X24_TYPELESS
> ; break;
926 case R32_FLOAT
: pState
->state
.pfnQuantizeDepth
= QuantizeDepth
< R32_FLOAT
> ; break;
927 case R24_UNORM_X8_TYPELESS
: pState
->state
.pfnQuantizeDepth
= QuantizeDepth
< R24_UNORM_X8_TYPELESS
> ; break;
928 case R16_UNORM
: pState
->state
.pfnQuantizeDepth
= QuantizeDepth
< R16_UNORM
> ; break;
// Unsupported formats assert and fall back to the R32_FLOAT quantizer.
929 default: SWR_ASSERT(false, "Unsupported depth format for depth quantiztion.");
930 pState
->state
.pfnQuantizeDepth
= QuantizeDepth
< R32_FLOAT
> ;
935 // set up pass-through quantize if depth isn't enabled
936 pState
->state
.pfnQuantizeDepth
= QuantizeDepth
< R32_FLOAT
> ;
940 //////////////////////////////////////////////////////////////////////////
942 /// @param pDC - Draw context to initialize for this draw.
947 // We don't need to re-setup the scissors/pipeline state again for split draw.
948 if (isSplitDraw
== false)
950 SetupMacroTileScissors(pDC
);
/// Returns the vertex count to use per (possibly split) draw. The result starts as totalVerts;
/// the visible cases replace it with KNOB_MAX_PRIMS_PER_DRAW for TOP_TRIANGLE_LIST and, for
/// patch lists with tessellation enabled, with (topology - TOP_PATCHLIST_BASE) *
/// KNOB_MAX_TESS_PRIMS_PER_DRAW.
/// NOTE(review): the parameter lines declaring pDC and totalVerts, the switch header, several
/// case labels, the body of the soEnable check and all braces are missing from this listing.
955 //////////////////////////////////////////////////////////////////////////
956 /// @brief We can split the draw for certain topologies for better performance.
957 /// @param totalVerts - Total vertices for draw
958 /// @param topology - Topology used for draw
959 uint32_t MaxVertsPerDraw(
962 PRIMITIVE_TOPOLOGY topology
)
964 API_STATE
& state
= pDC
->pState
->state
;
966 uint32_t vertsPerDraw
= totalVerts
;
// NOTE(review): what happens when stream-out is enabled is not visible in this listing.
968 if (state
.soState
.soEnable
)
976 case TOP_TRIANGLE_LIST
:
977 vertsPerDraw
= KNOB_MAX_PRIMS_PER_DRAW
;
980 case TOP_PATCHLIST_1
:
981 case TOP_PATCHLIST_2
:
982 case TOP_PATCHLIST_3
:
983 case TOP_PATCHLIST_4
:
984 case TOP_PATCHLIST_5
:
985 case TOP_PATCHLIST_6
:
986 case TOP_PATCHLIST_7
:
987 case TOP_PATCHLIST_8
:
988 case TOP_PATCHLIST_9
:
989 case TOP_PATCHLIST_10
:
990 case TOP_PATCHLIST_11
:
991 case TOP_PATCHLIST_12
:
992 case TOP_PATCHLIST_13
:
993 case TOP_PATCHLIST_14
:
994 case TOP_PATCHLIST_15
:
995 case TOP_PATCHLIST_16
:
996 case TOP_PATCHLIST_17
:
997 case TOP_PATCHLIST_18
:
998 case TOP_PATCHLIST_19
:
999 case TOP_PATCHLIST_20
:
1000 case TOP_PATCHLIST_21
:
1001 case TOP_PATCHLIST_22
:
1002 case TOP_PATCHLIST_23
:
1003 case TOP_PATCHLIST_24
:
1004 case TOP_PATCHLIST_25
:
1005 case TOP_PATCHLIST_26
:
1006 case TOP_PATCHLIST_27
:
1007 case TOP_PATCHLIST_28
:
1008 case TOP_PATCHLIST_29
:
1009 case TOP_PATCHLIST_30
:
1010 case TOP_PATCHLIST_31
:
1011 case TOP_PATCHLIST_32
:
1012 if (pDC
->pState
->state
.tsState
.tsEnable
)
// The patch size is recovered from the enum value as its offset from TOP_PATCHLIST_BASE.
1014 uint32_t vertsPerPrim
= topology
- TOP_PATCHLIST_BASE
;
1015 vertsPerDraw
= vertsPerPrim
* KNOB_MAX_TESS_PRIMS_PER_DRAW
;
1019 // The Primitive Assembly code can only handle 1 RECT at a time.
1025 // We are not splitting up draws for other topologies.
1029 return vertsPerDraw
;
// FEDrawChooser<ArgsB...> accumulates already-resolved boolean template arguments in ArgsB.
// Each GetFunc call consumes one runtime bool and appends it (as true or false) to the pack;
// the single-argument overload ends the recursion by returning the ProcessDraw instantiation
// for the completed pack.
// NOTE(review): the if/else lines that test bArg and all braces are missing from this listing.
1032 // Recursive template used to auto-nest conditionals. Converts dynamic boolean function
1033 // arguments to static template arguments.
1034 template <bool... ArgsB
>
1035 struct FEDrawChooser
1037 // Last Arg Terminator
1038 static PFN_FE_WORK_FUNC
GetFunc(bool bArg
)
1042 return ProcessDraw
<ArgsB
..., true>;
1045 return ProcessDraw
<ArgsB
..., false>;
1048 // Recursively parse args
1049 template <typename
... TArgsT
>
1050 static PFN_FE_WORK_FUNC
GetFunc(bool bArg
, TArgsT
... remainingArgs
)
1054 return FEDrawChooser
<ArgsB
..., true>::GetFunc(remainingArgs
...);
1057 return FEDrawChooser
<ArgsB
..., false>::GetFunc(remainingArgs
...);
1061 // Selector for correct templated Draw front-end function
1063 static PFN_FE_WORK_FUNC
GetFEDrawFunc(bool IsIndexed
, bool HasTessellation
, bool HasGeometryShader
, bool HasStreamOut
, bool RasterizerEnabled
)
1065 return FEDrawChooser
<>::GetFunc(IsIndexed
, HasTessellation
, HasGeometryShader
, HasStreamOut
, RasterizerEnabled
);
1069 //////////////////////////////////////////////////////////////////////////
1070 /// @brief DrawInstanced
///        Consumes numVertices in chunks of at most MaxVertsPerDraw(...) vertices
///        and queues one DRAW front-end work item (QueueDraw) per chunk.
1071 /// @param hContext - Handle passed back from SwrCreateContext
1072 /// @param topology - Specifies topology for draw.
1073 /// @param numVertices - How many vertices to read sequentially from vertex data (per instance).
1074 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1075 /// @param numInstances - How many instances to render. Defaults to 1.
1076 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data). Defaults to 0.
1079 PRIMITIVE_TOPOLOGY topology
,
1080 uint32_t numVertices
,
1081 uint32_t startVertex
,
1082 uint32_t numInstances
= 1,
1083 uint32_t startInstance
= 0)
1090 RDTSC_START(APIDraw
);
1092 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1093 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// Per-chunk vertex limit, and the GetNumPrims() result for that limit; the
// latter is used below to derive startPrimID for each chunk.
1095 uint32_t maxVertsPerDraw
= MaxVertsPerDraw(pDC
, numVertices
, topology
);
1096 uint32_t primsPerDraw
= GetNumPrims(topology
, maxVertsPerDraw
);
1097 uint32_t remainingVerts
= numVertices
;
// pState is taken from the draw context fetched above and is reused inside
// the loop even though the loop obtains its own draw context per chunk.
1099 API_STATE
*pState
= &pDC
->pState
->state
;
1100 pState
->topology
= topology
;
1101 pState
->forceFront
= false;
1103 // disable culling for point lists (TOP_POINT_LIST is the only topology tested here)
// and set forceFront; the previous cull mode is saved so it can be restored after the loop.
1104 uint32_t oldCullMode
= pState
->rastState
.cullMode
;
1105 if (topology
== TOP_POINT_LIST
)
1107 pState
->rastState
.cullMode
= SWR_CULLMODE_NONE
;
1108 pState
->forceFront
= true;
// One iteration per chunk until every vertex has been handed to a draw.
1112 while (remainingVerts
)
// Chunk size: min(remainingVerts, maxVertsPerDraw).
1114 uint32_t numVertsForDraw
= (remainingVerts
< maxVertsPerDraw
) ?
1115 remainingVerts
: maxVertsPerDraw
;
// Any chunk with draw > 0 is treated as a split draw.
// NOTE(review): the declaration and update of `draw` are not visible in this excerpt.
1117 bool isSplitDraw
= (draw
> 0) ? true : false;
// This declares a loop-local pDC that shadows the one declared before the loop.
1118 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
, isSplitDraw
);
1119 InitDraw(pDC
, isSplitDraw
);
1121 pDC
->FeWork
.type
= DRAW
;
// Front-end function is chosen from the tessellation / geometry shader /
// stream-out enables and from whether pfnProcessPrims is set.
1122 pDC
->FeWork
.pfnWork
= GetFEDrawFunc(
1124 pState
->tsState
.tsEnable
,
1125 pState
->gsState
.gsEnable
,
1126 pState
->soState
.soEnable
,
1127 pDC
->pState
->pfnProcessPrims
!= nullptr);
// startVertex, numInstances and startInstance are identical for every chunk;
// the chunk's position is carried by startPrimID and startVertexID.
1128 pDC
->FeWork
.desc
.draw
.numVerts
= numVertsForDraw
;
1129 pDC
->FeWork
.desc
.draw
.startVertex
= startVertex
;
1130 pDC
->FeWork
.desc
.draw
.numInstances
= numInstances
;
1131 pDC
->FeWork
.desc
.draw
.startInstance
= startInstance
;
1132 pDC
->FeWork
.desc
.draw
.startPrimID
= draw
* primsPerDraw
;
1133 pDC
->FeWork
.desc
.draw
.startVertexID
= draw
* maxVertsPerDraw
;
// True only for the final chunk (this chunk consumes everything that is left).
1135 pDC
->cleanupState
= (remainingVerts
== numVertsForDraw
);
1138 QueueDraw(pContext
);
1140 remainingVerts
-= numVertsForDraw
;
1144 // restore culling state
// The draw context is fetched again and the saved cull mode is written into its state.
1145 pDC
= GetDrawContext(pContext
);
1146 pDC
->pState
->state
.rastState
.cullMode
= oldCullMode
;
1148 RDTSC_STOP(APIDraw
, numVertices
* numInstances
, 0);
1151 //////////////////////////////////////////////////////////////////////////
/// Non-instanced draw entry point: forwards to DrawInstanced() and passes no
/// instance arguments.
1153 /// @param hContext - Handle passed back from SwrCreateContext
1154 /// @param topology - Specifies topology for draw.
1155 /// @param startVertex - Specifies start vertex in vertex buffer for draw.
1156 /// @param numVertices - Number of vertices.
1159 PRIMITIVE_TOPOLOGY topology
,
1160 uint32_t startVertex
,
1161 uint32_t numVertices
)
// Argument order differs from this function's own parameter order:
// numVertices is passed before startVertex.
1163 DrawInstanced(hContext
, topology
, numVertices
, startVertex
);
1166 //////////////////////////////////////////////////////////////////////////
1167 /// @brief SwrDrawInstanced
///        Forwards all arguments to DrawInstanced().
1168 /// @param hContext - Handle passed back from SwrCreateContext
1169 /// @param topology - Specifies topology for draw.
1170 /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
1171 /// @param numInstances - How many instances to render.
1172 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1173 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1174 void SwrDrawInstanced(
1176 PRIMITIVE_TOPOLOGY topology
,
1177 uint32_t numVertsPerInstance
,
1178 uint32_t numInstances
,
1179 uint32_t startVertex
,
1180 uint32_t startInstance
// The call reorders the arguments: startVertex is passed before numInstances.
1183 DrawInstanced(hContext
, topology
, numVertsPerInstance
, startVertex
, numInstances
, startInstance
);
1186 //////////////////////////////////////////////////////////////////////////
1187 /// @brief DrawIndexedInstance
///        Consumes numIndices in chunks of at most MaxVertsPerDraw(...) indices
///        and queues one DRAW front-end work item (QueueDraw) per chunk, advancing
///        the index-buffer pointer between chunks.
1188 /// @param hContext - Handle passed back from SwrCreateContext
1189 /// @param topology - Specifies topology for draw.
1190 /// @param numIndices - Number of indices to read sequentially from index buffer.
1191 /// @param indexOffset - Starting index into index buffer.
1192 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1193 /// @param numInstances - Number of instances to render. Defaults to 1.
1194 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data). Defaults to 0.
1195 void DrawIndexedInstance(
1197 PRIMITIVE_TOPOLOGY topology
,
1198 uint32_t numIndices
,
1199 uint32_t indexOffset
,
1201 uint32_t numInstances
= 1,
1202 uint32_t startInstance
= 0)
1209 RDTSC_START(APIDrawIndexed
);
1211 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1212 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1213 API_STATE
* pState
= &pDC
->pState
->state
;
// Per-chunk index limit, and the GetNumPrims() result for that limit; the
// latter is used below to derive startPrimID for each chunk.
1215 uint32_t maxIndicesPerDraw
= MaxVertsPerDraw(pDC
, numIndices
, topology
);
1216 uint32_t primsPerDraw
= GetNumPrims(topology
, maxIndicesPerDraw
);
1217 uint32_t remainingIndices
= numIndices
;
// Size in bytes of one index, derived from the bound index buffer format.
// NOTE(review): indexSize stays 0 for any format not matched by the cases
// visible here — confirm how other formats are handled.
1219 uint32_t indexSize
= 0;
1220 switch (pState
->indexBuffer
.format
)
1222 case R32_UINT
: indexSize
= sizeof(uint32_t); break;
1223 case R16_UINT
: indexSize
= sizeof(uint16_t); break;
1224 case R8_UINT
: indexSize
= sizeof(uint8_t); break;
// Byte address of the first index to read; the offset is computed in 64 bits.
1230 uint8_t *pIB
= (uint8_t*)pState
->indexBuffer
.pIndices
;
1231 pIB
+= (uint64_t)indexOffset
* (uint64_t)indexSize
;
1233 pState
->topology
= topology
;
1234 pState
->forceFront
= false;
1236 // disable culling for point lists (TOP_POINT_LIST is the only topology tested here)
// and set forceFront; the previous cull mode is saved so it can be restored after the loop.
1237 uint32_t oldCullMode
= pState
->rastState
.cullMode
;
1238 if (topology
== TOP_POINT_LIST
)
1240 pState
->rastState
.cullMode
= SWR_CULLMODE_NONE
;
1241 pState
->forceFront
= true;
// One iteration per chunk until every index has been handed to a draw.
1244 while (remainingIndices
)
// Chunk size: min(remainingIndices, maxIndicesPerDraw).
1246 uint32_t numIndicesForDraw
= (remainingIndices
< maxIndicesPerDraw
) ?
1247 remainingIndices
: maxIndicesPerDraw
;
1249 // When breaking up draw, we need to obtain new draw context for each iteration.
// NOTE(review): the declaration and update of `draw` are not visible in this excerpt.
1250 bool isSplitDraw
= (draw
> 0) ? true : false;
1251 pDC
= GetDrawContext(pContext
, isSplitDraw
);
1252 InitDraw(pDC
, isSplitDraw
);
1254 pDC
->FeWork
.type
= DRAW
;
// Front-end function is chosen from the tessellation / geometry shader /
// stream-out enables and from whether pfnProcessPrims is set.
1255 pDC
->FeWork
.pfnWork
= GetFEDrawFunc(
1257 pState
->tsState
.tsEnable
,
1258 pState
->gsState
.gsEnable
,
1259 pState
->soState
.soEnable
,
1260 pDC
->pState
->pfnProcessPrims
!= nullptr);
// Describe this chunk: its own index pointer and count, plus the index
// format and the instance / base-vertex values shared by all chunks.
1261 pDC
->FeWork
.desc
.draw
.pDC
= pDC
;
1262 pDC
->FeWork
.desc
.draw
.numIndices
= numIndicesForDraw
;
1263 pDC
->FeWork
.desc
.draw
.pIB
= (int*)pIB
;
1264 pDC
->FeWork
.desc
.draw
.type
= pDC
->pState
->state
.indexBuffer
.format
;
1266 pDC
->FeWork
.desc
.draw
.numInstances
= numInstances
;
1267 pDC
->FeWork
.desc
.draw
.startInstance
= startInstance
;
1268 pDC
->FeWork
.desc
.draw
.baseVertex
= baseVertex
;
1269 pDC
->FeWork
.desc
.draw
.startPrimID
= draw
* primsPerDraw
;
// True only for the final chunk (this chunk consumes everything that is left).
1271 pDC
->cleanupState
= (remainingIndices
== numIndicesForDraw
);
1274 QueueDraw(pContext
);
// Step the index pointer by a full chunk.
// NOTE(review): unlike the initial offset above, this product is not widened to 64 bits.
1276 pIB
+= maxIndicesPerDraw
* indexSize
;
1277 remainingIndices
-= numIndicesForDraw
;
1281 // restore culling state
// The draw context is fetched again and the saved cull mode is written into its state.
1282 pDC
= GetDrawContext(pContext
);
1283 pDC
->pState
->state
.rastState
.cullMode
= oldCullMode
;
1285 RDTSC_STOP(APIDrawIndexed
, numIndices
* numInstances
, 0);
1289 //////////////////////////////////////////////////////////////////////////
1290 /// @brief SwrDrawIndexed
///        Non-instanced indexed draw: forwards to DrawIndexedInstance() and
///        passes no instance arguments.
1291 /// @param hContext - Handle passed back from SwrCreateContext
1292 /// @param topology - Specifies topology for draw.
1293 /// @param numIndices - Number of indices to read sequentially from index buffer.
1294 /// @param indexOffset - Starting index into index buffer.
1295 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1296 void SwrDrawIndexed(
1298 PRIMITIVE_TOPOLOGY topology
,
1299 uint32_t numIndices
,
1300 uint32_t indexOffset
,
// numInstances / startInstance are not supplied, so DrawIndexedInstance's
// defaults (1 and 0) apply.
1304 DrawIndexedInstance(hContext
, topology
, numIndices
, indexOffset
, baseVertex
);
1307 //////////////////////////////////////////////////////////////////////////
1308 /// @brief SwrDrawIndexedInstanced
///        Forwards all arguments to DrawIndexedInstance().
1309 /// @param hContext - Handle passed back from SwrCreateContext
1310 /// @param topology - Specifies topology for draw.
1311 /// @param numIndices - Number of indices to read sequentially from index buffer.
1312 /// @param numInstances - Number of instances to render.
1313 /// @param indexOffset - Starting index into index buffer.
1314 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1315 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1316 void SwrDrawIndexedInstanced(
1318 PRIMITIVE_TOPOLOGY topology
,
1319 uint32_t numIndices
,
1320 uint32_t numInstances
,
1321 uint32_t indexOffset
,
1323 uint32_t startInstance
)
// The call reorders the arguments: indexOffset and baseVertex are passed
// before numInstances.
1325 DrawIndexedInstance(hContext
, topology
, numIndices
, indexOffset
, baseVertex
, numInstances
, startInstance
);
1328 //////////////////////////////////////////////////////////////////////////
1329 /// @brief SwrInvalidateTiles
///        Queues a DISCARDINVALIDATETILES work item with an all-zero rect that
///        sets the new tile state to SWR_TILE_INVALID.
1330 /// @param hContext - Handle passed back from SwrCreateContext
1331 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate.
1332 void SwrInvalidateTiles(
1334 uint32_t attachmentMask
)
// The handle is cast directly to SWR_CONTEXT* here.
1336 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
1337 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// Fill in the work descriptor handled by ProcessDiscardInvalidateTiles.
1339 pDC
->FeWork
.type
= DISCARDINVALIDATETILES
;
1340 pDC
->FeWork
.pfnWork
= ProcessDiscardInvalidateTiles
;
1341 pDC
->FeWork
.desc
.discardInvalidateTiles
.attachmentMask
= attachmentMask
;
// The rect is zeroed rather than taken from the caller.
1342 memset(&pDC
->FeWork
.desc
.discardInvalidateTiles
.rect
, 0, sizeof(SWR_RECT
));
1343 pDC
->FeWork
.desc
.discardInvalidateTiles
.newTileState
= SWR_TILE_INVALID
;
1344 pDC
->FeWork
.desc
.discardInvalidateTiles
.createNewTiles
= false;
1345 pDC
->FeWork
.desc
.discardInvalidateTiles
.fullTilesOnly
= false;
// Submit the work item.
1348 QueueDraw(pContext
);
1351 //////////////////////////////////////////////////////////////////////////
1352 /// @brief SwrDiscardRect
///        Queues a DISCARDINVALIDATETILES work item for the given rect that
///        sets the new tile state to SWR_TILE_RESOLVED.
1353 /// @param hContext - Handle passed back from SwrCreateContext
1354 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
1355 /// @param rect - if rect is all zeros, the entire attachment surface will be discarded
1356 void SwrDiscardRect(
1358 uint32_t attachmentMask
,
// The handle is cast directly to SWR_CONTEXT* here.
1361 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
1362 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1364 // Queue a discard/invalidate work item for the hot tiles
1365 pDC
->FeWork
.type
= DISCARDINVALIDATETILES
;
1366 pDC
->FeWork
.pfnWork
= ProcessDiscardInvalidateTiles
;
1367 pDC
->FeWork
.desc
.discardInvalidateTiles
.attachmentMask
= attachmentMask
;
1368 pDC
->FeWork
.desc
.discardInvalidateTiles
.rect
= rect
;
// Unlike SwrInvalidateTiles, this requests SWR_TILE_RESOLVED with
// createNewTiles and fullTilesOnly both enabled.
1369 pDC
->FeWork
.desc
.discardInvalidateTiles
.newTileState
= SWR_TILE_RESOLVED
;
1370 pDC
->FeWork
.desc
.discardInvalidateTiles
.createNewTiles
= true;
1371 pDC
->FeWork
.desc
.discardInvalidateTiles
.fullTilesOnly
= true;
// Submit the work item.
1374 QueueDraw(pContext
);
1377 //////////////////////////////////////////////////////////////////////////
1378 /// @brief SwrDispatch
///        Marks the current draw context as compute, records the thread group
///        counts in a COMPUTE_DESC, initializes the context's dispatch queue
///        slot with the total group count and queues the dispatch.
1379 /// @param hContext - Handle passed back from SwrCreateContext
1380 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
1381 /// @param threadGroupCountY - Number of thread groups dispatched in Y direction
1382 /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
1385 uint32_t threadGroupCountX
,
1386 uint32_t threadGroupCountY
,
1387 uint32_t threadGroupCountZ
)
1394 RDTSC_START(APIDispatch
);
1395 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
1396 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1398 pDC
->isCompute
= true; // This is a compute context.
// The task descriptor comes from the draw context's arena with 64-byte alignment.
1400 COMPUTE_DESC
* pTaskData
= (COMPUTE_DESC
*)pDC
->pArena
->AllocAligned(sizeof(COMPUTE_DESC
), 64);
1402 pTaskData
->threadGroupCountX
= threadGroupCountX
;
1403 pTaskData
->threadGroupCountY
= threadGroupCountY
;
1404 pTaskData
->threadGroupCountZ
= threadGroupCountZ
;
// Total groups = X * Y * Z.
// NOTE(review): the product is computed in uint32_t; confirm callers keep it in range.
1406 uint32_t totalThreadGroups
= threadGroupCountX
* threadGroupCountY
* threadGroupCountZ
;
// The dispatch queue slot is selected by drawId modulo KNOB_MAX_DRAWS_IN_FLIGHT.
1407 uint32_t dcIndex
= pDC
->drawId
% KNOB_MAX_DRAWS_IN_FLIGHT
;
1408 pDC
->pDispatch
= &pContext
->pDispatchQueueArray
[dcIndex
];
1409 pDC
->pDispatch
->initialize(totalThreadGroups
, pTaskData
);
1411 QueueDispatch(pContext
);
1412 RDTSC_STOP(APIDispatch
, threadGroupCountX
* threadGroupCountY
* threadGroupCountZ
, 0);
1415 // Deswizzles, converts and stores current contents of the hot tiles to surface
1416 // described by pState
// Implemented by queueing a STORETILES work item (ProcessStoreTiles) that
// carries the attachment and the tile state to apply after the store.
1419 SWR_RENDERTARGET_ATTACHMENT attachment
,
1420 SWR_TILE_STATE postStoreTileState
)
1422 RDTSC_START(APIStoreTiles
);
1424 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
1425 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// Macro tile scissors are set up on the draw context before the work is described.
1427 SetupMacroTileScissors(pDC
);
1429 pDC
->FeWork
.type
= STORETILES
;
1430 pDC
->FeWork
.pfnWork
= ProcessStoreTiles
;
1431 pDC
->FeWork
.desc
.storeTiles
.attachment
= attachment
;
1432 pDC
->FeWork
.desc
.storeTiles
.postStoreTileState
= postStoreTileState
;
// Submit the work item.
1435 QueueDraw(pContext
);
1437 RDTSC_STOP(APIStoreTiles
, 0, 0);
// Queues a CLEAR work item (ProcessClear) carrying the clear mask, the four
// clear color components, the depth clear value and the stencil clear value.
1440 void SwrClearRenderTarget(
1443 const float clearColor
[4],
1447 RDTSC_START(APIClearRenderTarget
);
1449 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
1451 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// Macro tile scissors are set up on the draw context before the work is described.
1453 SetupMacroTileScissors(pDC
);
1456 flags
.mask
= clearMask
;
1458 pDC
->FeWork
.type
= CLEAR
;
1459 pDC
->FeWork
.pfnWork
= ProcessClear
;
1460 pDC
->FeWork
.desc
.clear
.flags
= flags
;
1461 pDC
->FeWork
.desc
.clear
.clearDepth
= z
;
// The color is copied element by element, indices 0 through 3.
1462 pDC
->FeWork
.desc
.clear
.clearRTColor
[0] = clearColor
[0];
1463 pDC
->FeWork
.desc
.clear
.clearRTColor
[1] = clearColor
[1];
1464 pDC
->FeWork
.desc
.clear
.clearRTColor
[2] = clearColor
[2];
1465 pDC
->FeWork
.desc
.clear
.clearRTColor
[3] = clearColor
[3];
1466 pDC
->FeWork
.desc
.clear
.clearStencil
= stencil
;
// Submit the work item.
1469 QueueDraw(pContext
);
// The draw id is read from pDC after the work has been queued.
1471 RDTSC_STOP(APIClearRenderTarget
, 0, pDC
->drawId
);
1474 //////////////////////////////////////////////////////////////////////////
1475 /// @brief Returns a pointer to the private context state for the current
1476 /// draw operation. This is used for external components such as the
1478 /// SWR is responsible for the allocation of the private context state.
///        The state is allocated on first request and the same pointer is
///        returned while pPrivateState remains non-null.
1479 /// @param hContext - Handle passed back from SwrCreateContext
1480 VOID
* SwrGetPrivateContextState(
1483 SWR_CONTEXT
* pContext
= GetContext(hContext
);
1484 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1485 DRAW_STATE
* pState
= pDC
->pState
;
// Allocate only if nothing has been allocated for this draw state yet.
1487 if (pState
->pPrivateState
== nullptr)
// Size comes from pContext->privateStateSize; alignment is
// KNOB_SIMD_WIDTH * sizeof(float) bytes, taken from the draw state's arena.
1489 pState
->pPrivateState
= pState
->pArena
->AllocAligned(pContext
->privateStateSize
, KNOB_SIMD_WIDTH
*sizeof(float));
1492 return pState
->pPrivateState
;
1495 //////////////////////////////////////////////////////////////////////////
1496 /// @brief Clients can use this to allocate memory for draw/dispatch
1497 /// operations. The memory will automatically be freed once operation
1498 /// has completed. Client can use this to allocate binding tables,
1499 /// etc. needed for shader execution.
1500 /// @param hContext - Handle passed back from SwrCreateContext
1501 /// @param size - Size of allocation
1502 /// @param align - Alignment needed for allocation.
/// @return Pointer returned by the draw state's arena AllocAligned(size, align).
1503 VOID
* SwrAllocDrawContextMemory(
1508 SWR_CONTEXT
* pContext
= GetContext(hContext
);
1509 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// The allocation is served from the arena owned by the current draw state.
1511 return pDC
->pState
->pArena
->AllocAligned(size
, align
);
1514 //////////////////////////////////////////////////////////////////////////
1515 /// @brief Returns pointer to SWR stats.
///        Implemented by queueing a QUERYSTATS work item (ProcessQueryStats)
///        that carries pStats.
1516 /// @note The counters are atomically incremented by multiple threads.
1517 /// When calling this, you need to ensure all previous operations
1519 /// @todo If necessary, add a callback to avoid stalling the pipe to
1520 /// sample the counters.
1521 /// @param hContext - Handle passed back from SwrCreateContext
1522 /// @param pStats - SWR will fill this out for caller.
1527 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1528 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1530 pDC
->FeWork
.type
= QUERYSTATS
;
1531 pDC
->FeWork
.pfnWork
= ProcessQueryStats
;
1532 pDC
->FeWork
.desc
.queryStats
.pStats
= pStats
;
1534 // cannot execute until all previous draws have completed
// The dependency is set to the draw id immediately preceding this one.
1535 pDC
->dependency
= pDC
->drawId
- 1;
// Submit the work item.
1538 QueueDraw(pContext
);
1541 //////////////////////////////////////////////////////////////////////////
1542 /// @brief Enables stats counting
///        Writes the flag into the current draw context's state; nothing is
///        queued by the statements visible here.
1543 /// @param hContext - Handle passed back from SwrCreateContext
1544 /// @param enable - If true then counts are incremented.
1545 void SwrEnableStats(
1549 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1550 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// Stored as state.enableStats on the draw state.
1552 pDC
->pState
->state
.enableStats
= enable
;
1555 //////////////////////////////////////////////////////////////////////////
1556 /// @brief Mark end of frame - used for performance profiling
1557 /// @param hContext - Handle passed back from SwrCreateContext
1558 void SWR_API
SwrEndFrame(
1562 SWR_CONTEXT
*pContext
= GetContext(hContext
);
// Advance the context's frame counter by one.
1563 pContext
->frameCount
++;