1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief API implementation
27 ******************************************************************************/
35 #include "core/backend.h"
36 #include "core/context.h"
37 #include "core/depthstencil.h"
38 #include "core/frontend.h"
39 #include "core/rasterizer.h"
40 #include "core/rdtsc_core.h"
41 #include "core/threads.h"
42 #include "core/tilemgr.h"
43 #include "core/clip.h"
44 #include "core/utils.h"
46 #include "common/simdintrin.h"
47 #include "common/os.h"
49 void SetupDefaultState(SWR_CONTEXT
*pContext
);
51 static INLINE SWR_CONTEXT
* GetContext(HANDLE hContext
)
53 return (SWR_CONTEXT
*)hContext
;
56 //////////////////////////////////////////////////////////////////////////
57 /// @brief Create SWR Context.
58 /// @param pCreateInfo - pointer to creation info.
59 HANDLE
SwrCreateContext(
60 SWR_CREATECONTEXT_INFO
* pCreateInfo
)
65 void* pContextMem
= AlignedMalloc(sizeof(SWR_CONTEXT
), KNOB_SIMD_WIDTH
* 4);
66 memset(pContextMem
, 0, sizeof(SWR_CONTEXT
));
67 SWR_CONTEXT
*pContext
= new (pContextMem
) SWR_CONTEXT();
69 pContext
->driverType
= pCreateInfo
->driver
;
70 pContext
->privateStateSize
= pCreateInfo
->privateStateSize
;
72 pContext
->dcRing
.Init(KNOB_MAX_DRAWS_IN_FLIGHT
);
73 pContext
->dsRing
.Init(KNOB_MAX_DRAWS_IN_FLIGHT
);
75 pContext
->pMacroTileManagerArray
= (MacroTileMgr
*)AlignedMalloc(sizeof(MacroTileMgr
) * KNOB_MAX_DRAWS_IN_FLIGHT
, 64);
76 pContext
->pDispatchQueueArray
= (DispatchQueue
*)AlignedMalloc(sizeof(DispatchQueue
) * KNOB_MAX_DRAWS_IN_FLIGHT
, 64);
78 for (uint32_t dc
= 0; dc
< KNOB_MAX_DRAWS_IN_FLIGHT
; ++dc
)
80 pContext
->dcRing
[dc
].pArena
= new CachingArena(pContext
->cachingArenaAllocator
);
81 new (&pContext
->pMacroTileManagerArray
[dc
]) MacroTileMgr(*pContext
->dcRing
[dc
].pArena
);
82 new (&pContext
->pDispatchQueueArray
[dc
]) DispatchQueue();
84 pContext
->dsRing
[dc
].pArena
= new CachingArena(pContext
->cachingArenaAllocator
);
87 if (!KNOB_SINGLE_THREADED
)
89 memset(&pContext
->WaitLock
, 0, sizeof(pContext
->WaitLock
));
90 memset(&pContext
->FifosNotEmpty
, 0, sizeof(pContext
->FifosNotEmpty
));
91 new (&pContext
->WaitLock
) std::mutex();
92 new (&pContext
->FifosNotEmpty
) std::condition_variable();
94 CreateThreadPool(pContext
, &pContext
->threadPool
);
97 // Calling createThreadPool() above can set SINGLE_THREADED
98 if (KNOB_SINGLE_THREADED
)
100 SET_KNOB(HYPERTHREADED_FE
, false);
101 pContext
->NumWorkerThreads
= 1;
102 pContext
->NumFEThreads
= 1;
103 pContext
->NumBEThreads
= 1;
106 // Allocate scratch space for workers.
107 ///@note We could lazily allocate this but it's a rather small amount of memory.
108 for (uint32_t i
= 0; i
< pContext
->NumWorkerThreads
; ++i
)
111 uint32_t numaNode
= pContext
->threadPool
.pThreadData
?
112 pContext
->threadPool
.pThreadData
[i
].numaId
: 0;
113 pContext
->pScratch
[i
] = (uint8_t*)VirtualAllocExNuma(
114 GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE
),
115 MEM_RESERVE
| MEM_COMMIT
, PAGE_READWRITE
,
118 pContext
->pScratch
[i
] = (uint8_t*)AlignedMalloc(32 * sizeof(KILOBYTE
), KNOB_SIMD_WIDTH
* 4);
122 // State setup AFTER context is fully initialized
123 SetupDefaultState(pContext
);
125 // initialize hot tile manager
126 pContext
->pHotTileMgr
= new HotTileMgr();
128 // initialize function pointer tables
129 InitClearTilesTable();
131 // initialize load/store/clear tile callbacks
132 pContext
->pfnLoadTile
= pCreateInfo
->pfnLoadTile
;
133 pContext
->pfnStoreTile
= pCreateInfo
->pfnStoreTile
;
134 pContext
->pfnClearTile
= pCreateInfo
->pfnClearTile
;
136 // pass pointer to bucket manager back to caller
137 #ifdef KNOB_ENABLE_RDTSC
138 pCreateInfo
->pBucketMgr
= &gBucketMgr
;
141 pCreateInfo
->contextSaveSize
= sizeof(API_STATE
);
143 return (HANDLE
)pContext
;
146 void SwrDestroyContext(HANDLE hContext
)
148 SWR_CONTEXT
*pContext
= GetContext(hContext
);
149 DestroyThreadPool(pContext
, &pContext
->threadPool
);
152 for (uint32_t i
= 0; i
< KNOB_MAX_DRAWS_IN_FLIGHT
; ++i
)
154 delete pContext
->dcRing
[i
].pArena
;
155 delete pContext
->dsRing
[i
].pArena
;
156 pContext
->pMacroTileManagerArray
[i
].~MacroTileMgr();
157 pContext
->pDispatchQueueArray
[i
].~DispatchQueue();
160 AlignedFree(pContext
->pDispatchQueueArray
);
161 AlignedFree(pContext
->pMacroTileManagerArray
);
163 // Free scratch space.
164 for (uint32_t i
= 0; i
< pContext
->NumWorkerThreads
; ++i
)
167 VirtualFree(pContext
->pScratch
[i
], 0, MEM_RELEASE
);
169 AlignedFree(pContext
->pScratch
[i
]);
173 delete(pContext
->pHotTileMgr
);
175 pContext
->~SWR_CONTEXT();
176 AlignedFree(GetContext(hContext
));
179 void CopyState(DRAW_STATE
& dst
, const DRAW_STATE
& src
)
181 memcpy(&dst
.state
, &src
.state
, sizeof(API_STATE
));
184 void WakeAllThreads(SWR_CONTEXT
*pContext
)
186 pContext
->FifosNotEmpty
.notify_all();
189 static TileSet gSingleThreadLockedTiles
;
191 template<bool IsDraw
>
192 void QueueWork(SWR_CONTEXT
*pContext
)
194 DRAW_CONTEXT
* pDC
= pContext
->pCurDrawContext
;
195 uint32_t dcIndex
= pDC
->drawId
% KNOB_MAX_DRAWS_IN_FLIGHT
;
199 pDC
->pTileMgr
= &pContext
->pMacroTileManagerArray
[dcIndex
];
200 pDC
->pTileMgr
->initialize();
203 // Each worker thread looks at a DC for both FE and BE work at different times, so
204 // threadsDone is seeded with NumFEThreads + NumBEThreads. When the threadsDone counter
205 // has reached 0 then all workers have moved past this DC. (i.e. Each worker has checked
206 // this DC for both FE and BE work and then moved on if all work is done.)
207 pContext
->pCurDrawContext
->threadsDone
= pContext
->NumFEThreads
+ pContext
->NumBEThreads
;
211 std::unique_lock
<std::mutex
> lock(pContext
->WaitLock
);
212 pContext
->dcRing
.Enqueue();
215 if (KNOB_SINGLE_THREADED
)
217 // flush denormals to 0
218 uint32_t mxcsr
= _mm_getcsr();
219 _mm_setcsr(mxcsr
| _MM_FLUSH_ZERO_ON
| _MM_DENORMALS_ZERO_ON
);
223 uint32_t curDraw
[2] = { pContext
->pCurDrawContext
->drawId
, pContext
->pCurDrawContext
->drawId
};
224 WorkOnFifoFE(pContext
, 0, curDraw
[0]);
225 WorkOnFifoBE(pContext
, 0, curDraw
[1], gSingleThreadLockedTiles
, 0, 0);
229 uint32_t curDispatch
= pContext
->pCurDrawContext
->drawId
;
230 WorkOnCompute(pContext
, 0, curDispatch
);
233 // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
234 while (CompleteDrawContext(pContext
, pContext
->pCurDrawContext
) > 0) {}
241 RDTSC_START(APIDrawWakeAllThreads
);
242 WakeAllThreads(pContext
);
243 RDTSC_STOP(APIDrawWakeAllThreads
, 1, 0);
246 // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
247 pContext
->pPrevDrawContext
= pContext
->pCurDrawContext
;
248 pContext
->pCurDrawContext
= nullptr;
251 INLINE
void QueueDraw(SWR_CONTEXT
* pContext
)
253 QueueWork
<true>(pContext
);
256 INLINE
void QueueDispatch(SWR_CONTEXT
* pContext
)
258 QueueWork
<false>(pContext
);
261 DRAW_CONTEXT
* GetDrawContext(SWR_CONTEXT
*pContext
, bool isSplitDraw
= false)
263 RDTSC_START(APIGetDrawContext
);
264 // If current draw context is null then need to obtain a new draw context to use from ring.
265 if (pContext
->pCurDrawContext
== nullptr)
267 // Need to wait for a free entry.
268 while (pContext
->dcRing
.IsFull())
273 uint64_t curDraw
= pContext
->dcRing
.GetHead();
274 uint32_t dcIndex
= curDraw
% KNOB_MAX_DRAWS_IN_FLIGHT
;
276 static uint64_t lastDrawChecked
;
277 static uint32_t lastFrameChecked
;
278 if ((pContext
->frameCount
- lastFrameChecked
) > 2 ||
279 (curDraw
- lastDrawChecked
) > 0x10000)
281 // Take this opportunity to clean-up old arena allocations
282 pContext
->cachingArenaAllocator
.FreeOldBlocks();
284 lastFrameChecked
= pContext
->frameCount
;
285 lastDrawChecked
= curDraw
;
288 DRAW_CONTEXT
* pCurDrawContext
= &pContext
->dcRing
[dcIndex
];
289 pContext
->pCurDrawContext
= pCurDrawContext
;
291 // Assign next available entry in DS ring to this DC.
292 uint32_t dsIndex
= pContext
->curStateId
% KNOB_MAX_DRAWS_IN_FLIGHT
;
293 pCurDrawContext
->pState
= &pContext
->dsRing
[dsIndex
];
295 // Copy previous state to current state.
296 if (pContext
->pPrevDrawContext
)
298 DRAW_CONTEXT
* pPrevDrawContext
= pContext
->pPrevDrawContext
;
300 // If we're splitting our draw then we can just use the same state from the previous
301 // draw. In this case, we won't increment the DS ring index so the next non-split
302 // draw can receive the state.
303 if (isSplitDraw
== false)
305 CopyState(*pCurDrawContext
->pState
, *pPrevDrawContext
->pState
);
307 // Should have been cleaned up previously
308 SWR_ASSERT(pCurDrawContext
->pState
->pArena
->IsEmpty() == true);
310 pCurDrawContext
->pState
->pPrivateState
= nullptr;
312 pContext
->curStateId
++; // Progress state ring index forward.
316 // If it's a split draw then just copy the state pointer over
317 // since it's the same draw.
318 pCurDrawContext
->pState
= pPrevDrawContext
->pState
;
319 SWR_ASSERT(pPrevDrawContext
->cleanupState
== false);
324 SWR_ASSERT(pCurDrawContext
->pState
->pArena
->IsEmpty() == true);
325 pContext
->curStateId
++; // Progress state ring index forward.
328 SWR_ASSERT(pCurDrawContext
->pArena
->IsEmpty() == true);
330 pCurDrawContext
->dependent
= false;
331 pCurDrawContext
->pContext
= pContext
;
332 pCurDrawContext
->isCompute
= false; // Dispatch has to set this to true.
334 pCurDrawContext
->doneFE
= false;
335 pCurDrawContext
->FeLock
= 0;
336 pCurDrawContext
->threadsDone
= 0;
338 // Assign unique drawId for this DC
339 pCurDrawContext
->drawId
= pContext
->dcRing
.GetHead();
341 pCurDrawContext
->cleanupState
= true;
345 SWR_ASSERT(isSplitDraw
== false, "Split draw should only be used when obtaining a new DC");
348 RDTSC_STOP(APIGetDrawContext
, 0, 0);
349 return pContext
->pCurDrawContext
;
352 API_STATE
* GetDrawState(SWR_CONTEXT
*pContext
)
354 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
355 SWR_ASSERT(pDC
->pState
!= nullptr);
357 return &pDC
->pState
->state
;
360 void SWR_API
SwrSaveState(
362 void* pOutputStateBlock
,
365 SWR_CONTEXT
*pContext
= GetContext(hContext
);
366 auto pSrc
= GetDrawState(pContext
);
367 SWR_ASSERT(pOutputStateBlock
&& memSize
>= sizeof(*pSrc
));
369 memcpy(pOutputStateBlock
, pSrc
, sizeof(*pSrc
));
372 void SWR_API
SwrRestoreState(
374 const void* pStateBlock
,
377 SWR_CONTEXT
*pContext
= GetContext(hContext
);
378 auto pDst
= GetDrawState(pContext
);
379 SWR_ASSERT(pStateBlock
&& memSize
>= sizeof(*pDst
));
381 memcpy(pDst
, pStateBlock
, sizeof(*pDst
));
384 void SetupDefaultState(SWR_CONTEXT
*pContext
)
386 API_STATE
* pState
= GetDrawState(pContext
);
388 pState
->rastState
.cullMode
= SWR_CULLMODE_NONE
;
389 pState
->rastState
.frontWinding
= SWR_FRONTWINDING_CCW
;
392 void SwrSync(HANDLE hContext
, PFN_CALLBACK_FUNC pfnFunc
, uint64_t userData
, uint64_t userData2
, uint64_t userData3
)
394 RDTSC_START(APISync
);
396 SWR_ASSERT(pfnFunc
!= nullptr);
398 SWR_CONTEXT
*pContext
= GetContext(hContext
);
399 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
401 pDC
->FeWork
.type
= SYNC
;
402 pDC
->FeWork
.pfnWork
= ProcessSync
;
403 pDC
->FeWork
.desc
.sync
.pfnCallbackFunc
= pfnFunc
;
404 pDC
->FeWork
.desc
.sync
.userData
= userData
;
405 pDC
->FeWork
.desc
.sync
.userData2
= userData2
;
406 pDC
->FeWork
.desc
.sync
.userData3
= userData3
;
408 // cannot execute until all previous draws have completed
409 pDC
->dependent
= true;
414 RDTSC_STOP(APISync
, 1, 0);
417 void SwrWaitForIdle(HANDLE hContext
)
419 SWR_CONTEXT
*pContext
= GetContext(hContext
);
421 RDTSC_START(APIWaitForIdle
);
423 while (!pContext
->dcRing
.IsEmpty())
428 RDTSC_STOP(APIWaitForIdle
, 1, 0);
431 void SwrSetVertexBuffers(
434 const SWR_VERTEX_BUFFER_STATE
* pVertexBuffers
)
436 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
438 for (uint32_t i
= 0; i
< numBuffers
; ++i
)
440 const SWR_VERTEX_BUFFER_STATE
*pVB
= &pVertexBuffers
[i
];
441 pState
->vertexBuffers
[pVB
->index
] = *pVB
;
445 void SwrSetIndexBuffer(
447 const SWR_INDEX_BUFFER_STATE
* pIndexBuffer
)
449 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
451 pState
->indexBuffer
= *pIndexBuffer
;
454 void SwrSetFetchFunc(
456 PFN_FETCH_FUNC pfnFetchFunc
)
458 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
460 pState
->pfnFetchFunc
= pfnFetchFunc
;
465 PFN_SO_FUNC pfnSoFunc
,
466 uint32_t streamIndex
)
468 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
470 SWR_ASSERT(streamIndex
< MAX_SO_STREAMS
);
472 pState
->pfnSoFunc
[streamIndex
] = pfnSoFunc
;
477 SWR_STREAMOUT_STATE
* pSoState
)
479 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
481 pState
->soState
= *pSoState
;
484 void SwrSetSoBuffers(
486 SWR_STREAMOUT_BUFFER
* pSoBuffer
,
489 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
491 SWR_ASSERT((slot
< 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot
);
493 pState
->soBuffer
[slot
] = *pSoBuffer
;
496 void SwrSetVertexFunc(
498 PFN_VERTEX_FUNC pfnVertexFunc
)
500 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
502 pState
->pfnVertexFunc
= pfnVertexFunc
;
505 void SwrSetFrontendState(
507 SWR_FRONTEND_STATE
*pFEState
)
509 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
510 pState
->frontendState
= *pFEState
;
515 SWR_GS_STATE
*pGSState
)
517 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
518 pState
->gsState
= *pGSState
;
523 PFN_GS_FUNC pfnGsFunc
)
525 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
526 pState
->pfnGsFunc
= pfnGsFunc
;
531 PFN_CS_FUNC pfnCsFunc
,
532 uint32_t totalThreadsInGroup
,
533 uint32_t totalSpillFillSize
)
535 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
536 pState
->pfnCsFunc
= pfnCsFunc
;
537 pState
->totalThreadsInGroup
= totalThreadsInGroup
;
538 pState
->totalSpillFillSize
= totalSpillFillSize
;
543 SWR_TS_STATE
*pState
)
545 API_STATE
* pApiState
= GetDrawState(GetContext(hContext
));
546 pApiState
->tsState
= *pState
;
553 API_STATE
* pApiState
= GetDrawState(GetContext(hContext
));
554 pApiState
->pfnHsFunc
= pfnFunc
;
561 API_STATE
* pApiState
= GetDrawState(GetContext(hContext
));
562 pApiState
->pfnDsFunc
= pfnFunc
;
565 void SwrSetDepthStencilState(
567 SWR_DEPTH_STENCIL_STATE
*pDSState
)
569 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
571 pState
->depthStencilState
= *pDSState
;
574 void SwrSetBackendState(
576 SWR_BACKEND_STATE
*pBEState
)
578 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
580 pState
->backendState
= *pBEState
;
583 void SwrSetPixelShaderState(
585 SWR_PS_STATE
*pPSState
)
587 API_STATE
*pState
= GetDrawState(GetContext(hContext
));
588 pState
->psState
= *pPSState
;
591 void SwrSetBlendState(
593 SWR_BLEND_STATE
*pBlendState
)
595 API_STATE
*pState
= GetDrawState(GetContext(hContext
));
596 memcpy(&pState
->blendState
, pBlendState
, sizeof(SWR_BLEND_STATE
));
599 void SwrSetBlendFunc(
601 uint32_t renderTarget
,
602 PFN_BLEND_JIT_FUNC pfnBlendFunc
)
604 SWR_ASSERT(renderTarget
< SWR_NUM_RENDERTARGETS
);
605 API_STATE
*pState
= GetDrawState(GetContext(hContext
));
606 pState
->pfnBlendFunc
[renderTarget
] = pfnBlendFunc
;
614 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
616 static const uint8_t IDENTITY_MAP
[] =
618 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
619 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
621 static_assert(sizeof(IDENTITY_MAP
) == sizeof(pState
->linkageMap
),
622 "Update for new value of MAX_ATTRIBUTES");
624 pState
->linkageMask
= mask
;
625 pState
->linkageCount
= _mm_popcnt_u32(mask
);
631 memcpy(pState
->linkageMap
, pMap
, pState
->linkageCount
);
634 // update guardband multipliers for the viewport
635 void updateGuardband(API_STATE
*pState
)
637 // guardband center is viewport center
638 pState
->gbState
.left
= KNOB_GUARDBAND_WIDTH
/ pState
->vp
[0].width
;
639 pState
->gbState
.right
= KNOB_GUARDBAND_WIDTH
/ pState
->vp
[0].width
;
640 pState
->gbState
.top
= KNOB_GUARDBAND_HEIGHT
/ pState
->vp
[0].height
;
641 pState
->gbState
.bottom
= KNOB_GUARDBAND_HEIGHT
/ pState
->vp
[0].height
;
644 void SwrSetRastState(
646 const SWR_RASTSTATE
*pRastState
)
648 SWR_CONTEXT
*pContext
= GetContext(hContext
);
649 API_STATE
* pState
= GetDrawState(pContext
);
651 memcpy(&pState
->rastState
, pRastState
, sizeof(SWR_RASTSTATE
));
654 void SwrSetViewports(
656 uint32_t numViewports
,
657 const SWR_VIEWPORT
* pViewports
,
658 const SWR_VIEWPORT_MATRIX
* pMatrices
)
660 SWR_ASSERT(numViewports
<= KNOB_NUM_VIEWPORTS_SCISSORS
,
661 "Invalid number of viewports.");
663 SWR_CONTEXT
*pContext
= GetContext(hContext
);
664 API_STATE
* pState
= GetDrawState(pContext
);
666 memcpy(&pState
->vp
[0], pViewports
, sizeof(SWR_VIEWPORT
) * numViewports
);
668 if (pMatrices
!= nullptr)
670 memcpy(&pState
->vpMatrix
[0], pMatrices
, sizeof(SWR_VIEWPORT_MATRIX
) * numViewports
);
674 // Compute default viewport transform.
675 for (uint32_t i
= 0; i
< numViewports
; ++i
)
677 if (pContext
->driverType
== DX
)
679 pState
->vpMatrix
[i
].m00
= pState
->vp
[i
].width
/ 2.0f
;
680 pState
->vpMatrix
[i
].m11
= -pState
->vp
[i
].height
/ 2.0f
;
681 pState
->vpMatrix
[i
].m22
= pState
->vp
[i
].maxZ
- pState
->vp
[i
].minZ
;
682 pState
->vpMatrix
[i
].m30
= pState
->vp
[i
].x
+ pState
->vpMatrix
[i
].m00
;
683 pState
->vpMatrix
[i
].m31
= pState
->vp
[i
].y
- pState
->vpMatrix
[i
].m11
;
684 pState
->vpMatrix
[i
].m32
= pState
->vp
[i
].minZ
;
688 // Standard, with the exception that Y is inverted.
689 pState
->vpMatrix
[i
].m00
= (pState
->vp
[i
].width
- pState
->vp
[i
].x
) / 2.0f
;
690 pState
->vpMatrix
[i
].m11
= (pState
->vp
[i
].y
- pState
->vp
[i
].height
) / 2.0f
;
691 pState
->vpMatrix
[i
].m22
= (pState
->vp
[i
].maxZ
- pState
->vp
[i
].minZ
) / 2.0f
;
692 pState
->vpMatrix
[i
].m30
= pState
->vp
[i
].x
+ pState
->vpMatrix
[i
].m00
;
693 pState
->vpMatrix
[i
].m31
= pState
->vp
[i
].height
+ pState
->vpMatrix
[i
].m11
;
694 pState
->vpMatrix
[i
].m32
= pState
->vp
[i
].minZ
+ pState
->vpMatrix
[i
].m22
;
696 // Now that the matrix is calculated, clip the view coords to screen size.
697 // OpenGL allows for -ve x,y in the viewport.
698 pState
->vp
[i
].x
= std::max(pState
->vp
[i
].x
, 0.0f
);
699 pState
->vp
[i
].y
= std::max(pState
->vp
[i
].y
, 0.0f
);
704 updateGuardband(pState
);
707 void SwrSetScissorRects(
709 uint32_t numScissors
,
710 const BBOX
* pScissors
)
712 SWR_ASSERT(numScissors
<= KNOB_NUM_VIEWPORTS_SCISSORS
,
713 "Invalid number of scissor rects.");
715 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
716 memcpy(&pState
->scissorRects
[0], pScissors
, numScissors
* sizeof(BBOX
));
719 void SetupMacroTileScissors(DRAW_CONTEXT
*pDC
)
721 API_STATE
*pState
= &pDC
->pState
->state
;
722 uint32_t left
, right
, top
, bottom
;
724 // Set up scissor dimensions based on scissor or viewport
725 if (pState
->rastState
.scissorEnable
)
727 // scissor rect right/bottom edge are exclusive, core expects scissor dimensions to be inclusive, so subtract one pixel from right/bottom edges
728 left
= pState
->scissorRects
[0].left
;
729 right
= pState
->scissorRects
[0].right
;
730 top
= pState
->scissorRects
[0].top
;
731 bottom
= pState
->scissorRects
[0].bottom
;
735 // the vp width and height must be added to origin un-rounded then the result round to -inf.
736 // The cast to int works for rounding assuming all [left, right, top, bottom] are positive.
737 left
= (int32_t)pState
->vp
[0].x
;
738 right
= (int32_t)(pState
->vp
[0].x
+ pState
->vp
[0].width
);
739 top
= (int32_t)pState
->vp
[0].y
;
740 bottom
= (int32_t)(pState
->vp
[0].y
+ pState
->vp
[0].height
);
743 right
= std::min
<uint32_t>(right
, KNOB_MAX_SCISSOR_X
);
744 bottom
= std::min
<uint32_t>(bottom
, KNOB_MAX_SCISSOR_Y
);
746 if (left
> KNOB_MAX_SCISSOR_X
|| top
> KNOB_MAX_SCISSOR_Y
)
748 pState
->scissorInFixedPoint
.left
= 0;
749 pState
->scissorInFixedPoint
.right
= 0;
750 pState
->scissorInFixedPoint
.top
= 0;
751 pState
->scissorInFixedPoint
.bottom
= 0;
755 pState
->scissorInFixedPoint
.left
= left
* FIXED_POINT_SCALE
;
756 pState
->scissorInFixedPoint
.right
= right
* FIXED_POINT_SCALE
- 1;
757 pState
->scissorInFixedPoint
.top
= top
* FIXED_POINT_SCALE
;
758 pState
->scissorInFixedPoint
.bottom
= bottom
* FIXED_POINT_SCALE
- 1;
762 // templated backend function tables
763 extern PFN_BACKEND_FUNC gBackendNullPs
[SWR_MULTISAMPLE_TYPE_MAX
];
764 extern PFN_BACKEND_FUNC gBackendSingleSample
[2][2][2];
765 extern PFN_BACKEND_FUNC gBackendPixelRateTable
[SWR_MULTISAMPLE_TYPE_MAX
][SWR_MSAA_SAMPLE_PATTERN_MAX
][2][2][2][2];
766 extern PFN_BACKEND_FUNC gBackendSampleRateTable
[SWR_MULTISAMPLE_TYPE_MAX
][2][2][2];
767 void SetupPipeline(DRAW_CONTEXT
*pDC
)
769 DRAW_STATE
* pState
= pDC
->pState
;
770 const SWR_RASTSTATE
&rastState
= pState
->state
.rastState
;
771 const SWR_PS_STATE
&psState
= pState
->state
.psState
;
772 BACKEND_FUNCS
& backendFuncs
= pState
->backendFuncs
;
773 const uint32_t forcedSampleCount
= (rastState
.forcedSampleCount
) ? 1 : 0;
776 if (psState
.pfnPixelShader
== nullptr)
778 backendFuncs
.pfnBackend
= gBackendNullPs
[pState
->state
.rastState
.sampleCount
];
782 const bool bMultisampleEnable
= ((rastState
.sampleCount
> SWR_MULTISAMPLE_1X
) || rastState
.forcedSampleCount
) ? 1 : 0;
783 const uint32_t centroid
= ((psState
.barycentricsMask
& SWR_BARYCENTRIC_CENTROID_MASK
) > 0) ? 1 : 0;
784 const uint32_t canEarlyZ
= (psState
.forceEarlyZ
|| (!psState
.writesODepth
&& !psState
.usesSourceDepth
&& !psState
.usesUAV
)) ? 1 : 0;
785 const uint32_t inputCoverage
= (psState
.inputCoverage
!= SWR_INPUT_COVERAGE_NONE
) ? 1 : 0;
787 SWR_BARYCENTRICS_MASK barycentricsMask
= (SWR_BARYCENTRICS_MASK
)psState
.barycentricsMask
;
789 // select backend function
790 switch(psState
.shadingRate
)
792 case SWR_SHADING_RATE_PIXEL
:
793 if(bMultisampleEnable
)
795 // always need to generate I & J per sample for Z interpolation
796 barycentricsMask
= (SWR_BARYCENTRICS_MASK
)(barycentricsMask
| SWR_BARYCENTRIC_PER_SAMPLE_MASK
);
797 backendFuncs
.pfnBackend
= gBackendPixelRateTable
[rastState
.sampleCount
][rastState
.samplePattern
][inputCoverage
][centroid
][forcedSampleCount
][canEarlyZ
];
801 // always need to generate I & J per pixel for Z interpolation
802 barycentricsMask
= (SWR_BARYCENTRICS_MASK
)(barycentricsMask
| SWR_BARYCENTRIC_PER_PIXEL_MASK
);
803 backendFuncs
.pfnBackend
= gBackendSingleSample
[inputCoverage
][centroid
][canEarlyZ
];
806 case SWR_SHADING_RATE_SAMPLE
:
807 SWR_ASSERT(rastState
.samplePattern
== SWR_MSAA_STANDARD_PATTERN
);
808 // always need to generate I & J per sample for Z interpolation
809 barycentricsMask
= (SWR_BARYCENTRICS_MASK
)(barycentricsMask
| SWR_BARYCENTRIC_PER_SAMPLE_MASK
);
810 backendFuncs
.pfnBackend
= gBackendSampleRateTable
[rastState
.sampleCount
][inputCoverage
][centroid
][canEarlyZ
];
813 SWR_ASSERT(0 && "Invalid shading rate");
818 PFN_PROCESS_PRIMS pfnBinner
;
819 switch (pState
->state
.topology
)
822 pState
->pfnProcessPrims
= ClipPoints
;
823 pfnBinner
= BinPoints
;
828 case TOP_LINE_LIST_ADJ
:
829 case TOP_LISTSTRIP_ADJ
:
830 pState
->pfnProcessPrims
= ClipLines
;
831 pfnBinner
= BinLines
;
834 pState
->pfnProcessPrims
= ClipTriangles
;
835 pfnBinner
= GetBinTrianglesFunc((rastState
.conservativeRast
> 0));
839 // disable clipper if viewport transform is disabled
840 if (pState
->state
.frontendState
.vpTransformDisable
)
842 pState
->pfnProcessPrims
= pfnBinner
;
845 if ((pState
->state
.psState
.pfnPixelShader
== nullptr) &&
846 (pState
->state
.depthStencilState
.depthTestEnable
== FALSE
) &&
847 (pState
->state
.depthStencilState
.depthWriteEnable
== FALSE
) &&
848 (pState
->state
.depthStencilState
.stencilTestEnable
== FALSE
) &&
849 (pState
->state
.depthStencilState
.stencilWriteEnable
== FALSE
) &&
850 (pState
->state
.linkageCount
== 0))
852 pState
->pfnProcessPrims
= nullptr;
853 pState
->state
.linkageMask
= 0;
856 if (pState
->state
.soState
.rasterizerDisable
== true)
858 pState
->pfnProcessPrims
= nullptr;
859 pState
->state
.linkageMask
= 0;
862 // set up the frontend attrib mask
863 pState
->state
.feAttribMask
= pState
->state
.linkageMask
;
864 if (pState
->state
.soState
.soEnable
)
866 for (uint32_t i
= 0; i
< 4; ++i
)
868 pState
->state
.feAttribMask
|= pState
->state
.soState
.streamMasks
[i
];
872 // complicated logic to test for cases where we don't need backing hottile memory for a draw
873 // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled.
874 pState
->state
.depthHottileEnable
= ((!(pState
->state
.depthStencilState
.depthTestEnable
&&
875 !pState
->state
.depthStencilState
.depthWriteEnable
&&
876 pState
->state
.depthStencilState
.depthTestFunc
== ZFUNC_ALWAYS
)) &&
877 (pState
->state
.depthStencilState
.depthTestEnable
||
878 pState
->state
.depthStencilState
.depthWriteEnable
)) ? true : false;
880 pState
->state
.stencilHottileEnable
= (((!(pState
->state
.depthStencilState
.stencilTestEnable
&&
881 !pState
->state
.depthStencilState
.stencilWriteEnable
&&
882 pState
->state
.depthStencilState
.stencilTestFunc
== ZFUNC_ALWAYS
)) ||
883 // for stencil we have to check the double sided state as well
884 (!(pState
->state
.depthStencilState
.doubleSidedStencilTestEnable
&&
885 !pState
->state
.depthStencilState
.stencilWriteEnable
&&
886 pState
->state
.depthStencilState
.backfaceStencilTestFunc
== ZFUNC_ALWAYS
))) &&
887 (pState
->state
.depthStencilState
.stencilTestEnable
||
888 pState
->state
.depthStencilState
.stencilWriteEnable
)) ? true : false;
890 uint32_t numRTs
= pState
->state
.psState
.numRenderTargets
;
891 pState
->state
.colorHottileEnable
= 0;
892 if (psState
.pfnPixelShader
!= nullptr)
894 for (uint32_t rt
= 0; rt
< numRTs
; ++rt
)
896 pState
->state
.colorHottileEnable
|=
897 (!pState
->state
.blendState
.renderTarget
[rt
].writeDisableAlpha
||
898 !pState
->state
.blendState
.renderTarget
[rt
].writeDisableRed
||
899 !pState
->state
.blendState
.renderTarget
[rt
].writeDisableGreen
||
900 !pState
->state
.blendState
.renderTarget
[rt
].writeDisableBlue
) ? (1 << rt
) : 0;
904 // Setup depth quantization function
905 if (pState
->state
.depthHottileEnable
)
907 switch (pState
->state
.rastState
.depthFormat
)
909 case R32_FLOAT_X8X24_TYPELESS
: pState
->state
.pfnQuantizeDepth
= QuantizeDepth
< R32_FLOAT_X8X24_TYPELESS
> ; break;
910 case R32_FLOAT
: pState
->state
.pfnQuantizeDepth
= QuantizeDepth
< R32_FLOAT
> ; break;
911 case R24_UNORM_X8_TYPELESS
: pState
->state
.pfnQuantizeDepth
= QuantizeDepth
< R24_UNORM_X8_TYPELESS
> ; break;
912 case R16_UNORM
: pState
->state
.pfnQuantizeDepth
= QuantizeDepth
< R16_UNORM
> ; break;
913 default: SWR_ASSERT(false, "Unsupported depth format for depth quantiztion.");
914 pState
->state
.pfnQuantizeDepth
= QuantizeDepth
< R32_FLOAT
> ;
919 // set up pass-through quantize if depth isn't enabled
920 pState
->state
.pfnQuantizeDepth
= QuantizeDepth
< R32_FLOAT
> ;
924 //////////////////////////////////////////////////////////////////////////
926 /// @param pDC - Draw context to initialize for this draw.
931 // We don't need to re-setup the scissors/pipeline state again for split draw.
932 if (isSplitDraw
== false)
934 SetupMacroTileScissors(pDC
);
939 //////////////////////////////////////////////////////////////////////////
940 /// @brief We can split the draw for certain topologies for better performance.
941 /// @param totalVerts - Total vertices for draw
942 /// @param topology - Topology used for draw
943 uint32_t MaxVertsPerDraw(
946 PRIMITIVE_TOPOLOGY topology
)
948 API_STATE
& state
= pDC
->pState
->state
;
950 uint32_t vertsPerDraw
= totalVerts
;
952 if (state
.soState
.soEnable
)
960 case TOP_TRIANGLE_LIST
:
961 vertsPerDraw
= KNOB_MAX_PRIMS_PER_DRAW
;
964 case TOP_PATCHLIST_1
:
965 case TOP_PATCHLIST_2
:
966 case TOP_PATCHLIST_3
:
967 case TOP_PATCHLIST_4
:
968 case TOP_PATCHLIST_5
:
969 case TOP_PATCHLIST_6
:
970 case TOP_PATCHLIST_7
:
971 case TOP_PATCHLIST_8
:
972 case TOP_PATCHLIST_9
:
973 case TOP_PATCHLIST_10
:
974 case TOP_PATCHLIST_11
:
975 case TOP_PATCHLIST_12
:
976 case TOP_PATCHLIST_13
:
977 case TOP_PATCHLIST_14
:
978 case TOP_PATCHLIST_15
:
979 case TOP_PATCHLIST_16
:
980 case TOP_PATCHLIST_17
:
981 case TOP_PATCHLIST_18
:
982 case TOP_PATCHLIST_19
:
983 case TOP_PATCHLIST_20
:
984 case TOP_PATCHLIST_21
:
985 case TOP_PATCHLIST_22
:
986 case TOP_PATCHLIST_23
:
987 case TOP_PATCHLIST_24
:
988 case TOP_PATCHLIST_25
:
989 case TOP_PATCHLIST_26
:
990 case TOP_PATCHLIST_27
:
991 case TOP_PATCHLIST_28
:
992 case TOP_PATCHLIST_29
:
993 case TOP_PATCHLIST_30
:
994 case TOP_PATCHLIST_31
:
995 case TOP_PATCHLIST_32
:
996 if (pDC
->pState
->state
.tsState
.tsEnable
)
998 uint32_t vertsPerPrim
= topology
- TOP_PATCHLIST_BASE
;
999 vertsPerDraw
= vertsPerPrim
* KNOB_MAX_TESS_PRIMS_PER_DRAW
;
1003 // The Primitive Assembly code can only handle 1 RECT at a time.
1009 // We are not splitting up draws for other topologies.
1013 return vertsPerDraw
;
1017 //////////////////////////////////////////////////////////////////////////
1018 /// @brief DrawInstanced
1019 /// @param hContext - Handle passed back from SwrCreateContext
1020 /// @param topology - Specifies topology for draw.
1021 /// @param numVertices - How many vertices to read sequentially from vertex data (per instance).
1022 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1023 /// @param numInstances - How many instances to render.
1024 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1027 PRIMITIVE_TOPOLOGY topology
,
1028 uint32_t numVertices
,
1029 uint32_t startVertex
,
1030 uint32_t numInstances
= 1,
1031 uint32_t startInstance
= 0)
1038 RDTSC_START(APIDraw
);
1040 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1041 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1043 uint32_t maxVertsPerDraw
= MaxVertsPerDraw(pDC
, numVertices
, topology
);
1044 uint32_t primsPerDraw
= GetNumPrims(topology
, maxVertsPerDraw
);
1045 uint32_t remainingVerts
= numVertices
;
1047 API_STATE
*pState
= &pDC
->pState
->state
;
1048 pState
->topology
= topology
;
1049 pState
->forceFront
= false;
1051 // disable culling for points/lines
1052 uint32_t oldCullMode
= pState
->rastState
.cullMode
;
1053 if (topology
== TOP_POINT_LIST
)
1055 pState
->rastState
.cullMode
= SWR_CULLMODE_NONE
;
1056 pState
->forceFront
= true;
1060 while (remainingVerts
)
1062 uint32_t numVertsForDraw
= (remainingVerts
< maxVertsPerDraw
) ?
1063 remainingVerts
: maxVertsPerDraw
;
1065 bool isSplitDraw
= (draw
> 0) ? true : false;
1066 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
, isSplitDraw
);
1067 InitDraw(pDC
, isSplitDraw
);
1069 pDC
->FeWork
.type
= DRAW
;
1070 pDC
->FeWork
.pfnWork
= GetProcessDrawFunc(
1072 false, // bEnableCutIndex
1073 pState
->tsState
.tsEnable
,
1074 pState
->gsState
.gsEnable
,
1075 pState
->soState
.soEnable
,
1076 pDC
->pState
->pfnProcessPrims
!= nullptr);
1077 pDC
->FeWork
.desc
.draw
.numVerts
= numVertsForDraw
;
1078 pDC
->FeWork
.desc
.draw
.startVertex
= startVertex
;
1079 pDC
->FeWork
.desc
.draw
.numInstances
= numInstances
;
1080 pDC
->FeWork
.desc
.draw
.startInstance
= startInstance
;
1081 pDC
->FeWork
.desc
.draw
.startPrimID
= draw
* primsPerDraw
;
1082 pDC
->FeWork
.desc
.draw
.startVertexID
= draw
* maxVertsPerDraw
;
1084 pDC
->cleanupState
= (remainingVerts
== numVertsForDraw
);
1087 QueueDraw(pContext
);
1089 remainingVerts
-= numVertsForDraw
;
1093 // restore culling state
1094 pDC
= GetDrawContext(pContext
);
1095 pDC
->pState
->state
.rastState
.cullMode
= oldCullMode
;
1097 RDTSC_STOP(APIDraw
, numVertices
* numInstances
, 0);
1100 //////////////////////////////////////////////////////////////////////////
1102 /// @param hContext - Handle passed back from SwrCreateContext
1103 /// @param topology - Specifies topology for draw.
1104 /// @param startVertex - Specifies start vertex in vertex buffer for draw.
1105 /// @param numVertices - Number of vertices.
1108 PRIMITIVE_TOPOLOGY topology
,
1109 uint32_t startVertex
,
1110 uint32_t numVertices
)
1112 DrawInstanced(hContext
, topology
, numVertices
, startVertex
);
1115 //////////////////////////////////////////////////////////////////////////
1116 /// @brief SwrDrawInstanced
1117 /// @param hContext - Handle passed back from SwrCreateContext
1118 /// @param topology - Specifies topology for draw.
1119 /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
1120 /// @param numInstances - How many instances to render.
1121 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1122 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1123 void SwrDrawInstanced(
1125 PRIMITIVE_TOPOLOGY topology
,
1126 uint32_t numVertsPerInstance
,
1127 uint32_t numInstances
,
1128 uint32_t startVertex
,
1129 uint32_t startInstance
1132 DrawInstanced(hContext
, topology
, numVertsPerInstance
, startVertex
, numInstances
, startInstance
);
1135 //////////////////////////////////////////////////////////////////////////
1136 /// @brief DrawIndexedInstance
1137 /// @param hContext - Handle passed back from SwrCreateContext
1138 /// @param topology - Specifies topology for draw.
1139 /// @param numIndices - Number of indices to read sequentially from index buffer.
1140 /// @param indexOffset - Starting index into index buffer.
1141 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1142 /// @param numInstances - Number of instances to render.
1143 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1144 void DrawIndexedInstance(
1146 PRIMITIVE_TOPOLOGY topology
,
1147 uint32_t numIndices
,
1148 uint32_t indexOffset
,
1150 uint32_t numInstances
= 1,
1151 uint32_t startInstance
= 0)
1158 RDTSC_START(APIDrawIndexed
);
1160 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1161 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1162 API_STATE
* pState
= &pDC
->pState
->state
;
1164 uint32_t maxIndicesPerDraw
= MaxVertsPerDraw(pDC
, numIndices
, topology
);
1165 uint32_t primsPerDraw
= GetNumPrims(topology
, maxIndicesPerDraw
);
1166 uint32_t remainingIndices
= numIndices
;
1168 uint32_t indexSize
= 0;
1169 switch (pState
->indexBuffer
.format
)
1171 case R32_UINT
: indexSize
= sizeof(uint32_t); break;
1172 case R16_UINT
: indexSize
= sizeof(uint16_t); break;
1173 case R8_UINT
: indexSize
= sizeof(uint8_t); break;
1179 uint8_t *pIB
= (uint8_t*)pState
->indexBuffer
.pIndices
;
1180 pIB
+= (uint64_t)indexOffset
* (uint64_t)indexSize
;
1182 pState
->topology
= topology
;
1183 pState
->forceFront
= false;
1185 // disable culling and force front-facing for point lists
1186 uint32_t oldCullMode
= pState
->rastState
.cullMode
;
1187 if (topology
== TOP_POINT_LIST
)
1189 pState
->rastState
.cullMode
= SWR_CULLMODE_NONE
;
1190 pState
->forceFront
= true;
1193 while (remainingIndices
)
1195 uint32_t numIndicesForDraw
= (remainingIndices
< maxIndicesPerDraw
) ?
1196 remainingIndices
: maxIndicesPerDraw
;
1198 // When breaking up draw, we need to obtain new draw context for each iteration.
1199 bool isSplitDraw
= (draw
> 0) ? true : false;
1200 pDC
= GetDrawContext(pContext
, isSplitDraw
);
1201 InitDraw(pDC
, isSplitDraw
);
1203 pDC
->FeWork
.type
= DRAW
;
1204 pDC
->FeWork
.pfnWork
= GetProcessDrawFunc(
1206 pState
->frontendState
.bEnableCutIndex
,
1207 pState
->tsState
.tsEnable
,
1208 pState
->gsState
.gsEnable
,
1209 pState
->soState
.soEnable
,
1210 pDC
->pState
->pfnProcessPrims
!= nullptr);
1211 pDC
->FeWork
.desc
.draw
.pDC
= pDC
;
1212 pDC
->FeWork
.desc
.draw
.numIndices
= numIndicesForDraw
;
1213 pDC
->FeWork
.desc
.draw
.pIB
= (int*)pIB
;
1214 pDC
->FeWork
.desc
.draw
.type
= pDC
->pState
->state
.indexBuffer
.format
;
1216 pDC
->FeWork
.desc
.draw
.numInstances
= numInstances
;
1217 pDC
->FeWork
.desc
.draw
.startInstance
= startInstance
;
1218 pDC
->FeWork
.desc
.draw
.baseVertex
= baseVertex
;
1219 pDC
->FeWork
.desc
.draw
.startPrimID
= draw
* primsPerDraw
;
1221 pDC
->cleanupState
= (remainingIndices
== numIndicesForDraw
);
1224 QueueDraw(pContext
);
1226 pIB
+= maxIndicesPerDraw
* indexSize
;
1227 remainingIndices
-= numIndicesForDraw
;
1231 // restore culling state
1232 pDC
= GetDrawContext(pContext
);
1233 pDC
->pState
->state
.rastState
.cullMode
= oldCullMode
;
1235 RDTSC_STOP(APIDrawIndexed
, numIndices
* numInstances
, 0);
1239 //////////////////////////////////////////////////////////////////////////
1240 /// @brief SwrDrawIndexed
1241 /// @param hContext - Handle passed back from SwrCreateContext
1242 /// @param topology - Specifies topology for draw.
1243 /// @param numIndices - Number of indices to read sequentially from index buffer.
1244 /// @param indexOffset - Starting index into index buffer.
1245 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1246 void SwrDrawIndexed(
1248 PRIMITIVE_TOPOLOGY topology
,
1249 uint32_t numIndices
,
1250 uint32_t indexOffset
,
1254 DrawIndexedInstance(hContext
, topology
, numIndices
, indexOffset
, baseVertex
);
1257 //////////////////////////////////////////////////////////////////////////
1258 /// @brief SwrDrawIndexedInstanced
1259 /// @param hContext - Handle passed back from SwrCreateContext
1260 /// @param topology - Specifies topology for draw.
1261 /// @param numIndices - Number of indices to read sequentially from index buffer.
1262 /// @param numInstances - Number of instances to render.
1263 /// @param indexOffset - Starting index into index buffer.
1264 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1265 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1266 void SwrDrawIndexedInstanced(
1268 PRIMITIVE_TOPOLOGY topology
,
1269 uint32_t numIndices
,
1270 uint32_t numInstances
,
1271 uint32_t indexOffset
,
1273 uint32_t startInstance
)
1275 DrawIndexedInstance(hContext
, topology
, numIndices
, indexOffset
, baseVertex
, numInstances
, startInstance
);
1278 //////////////////////////////////////////////////////////////////////////
1279 /// @brief SwrInvalidateTiles
1280 /// @param hContext - Handle passed back from SwrCreateContext
1281 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate.
1282 void SwrInvalidateTiles(
1284 uint32_t attachmentMask
)
1291 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1292 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1294 pDC
->FeWork
.type
= DISCARDINVALIDATETILES
;
1295 pDC
->FeWork
.pfnWork
= ProcessDiscardInvalidateTiles
;
1296 pDC
->FeWork
.desc
.discardInvalidateTiles
.attachmentMask
= attachmentMask
;
1297 memset(&pDC
->FeWork
.desc
.discardInvalidateTiles
.rect
, 0, sizeof(SWR_RECT
));
1298 pDC
->FeWork
.desc
.discardInvalidateTiles
.newTileState
= SWR_TILE_INVALID
;
1299 pDC
->FeWork
.desc
.discardInvalidateTiles
.createNewTiles
= false;
1300 pDC
->FeWork
.desc
.discardInvalidateTiles
.fullTilesOnly
= false;
1303 QueueDraw(pContext
);
1306 //////////////////////////////////////////////////////////////////////////
1307 /// @brief SwrDiscardRect
1308 /// @param hContext - Handle passed back from SwrCreateContext
1309 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
1310 /// @param rect - if rect is all zeros, the entire attachment surface will be discarded
1311 void SwrDiscardRect(
1313 uint32_t attachmentMask
,
1321 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1322 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1324 // Queue a discard of the hottiles
1325 pDC
->FeWork
.type
= DISCARDINVALIDATETILES
;
1326 pDC
->FeWork
.pfnWork
= ProcessDiscardInvalidateTiles
;
1327 pDC
->FeWork
.desc
.discardInvalidateTiles
.attachmentMask
= attachmentMask
;
1328 pDC
->FeWork
.desc
.discardInvalidateTiles
.rect
= rect
;
1329 pDC
->FeWork
.desc
.discardInvalidateTiles
.newTileState
= SWR_TILE_RESOLVED
;
1330 pDC
->FeWork
.desc
.discardInvalidateTiles
.createNewTiles
= true;
1331 pDC
->FeWork
.desc
.discardInvalidateTiles
.fullTilesOnly
= true;
1334 QueueDraw(pContext
);
1337 //////////////////////////////////////////////////////////////////////////
1338 /// @brief SwrDispatch
1339 /// @param hContext - Handle passed back from SwrCreateContext
1340 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
1341 /// @param threadGroupCountY - Number of thread groups dispatched in Y direction
1342 /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
1345 uint32_t threadGroupCountX
,
1346 uint32_t threadGroupCountY
,
1347 uint32_t threadGroupCountZ
)
1354 RDTSC_START(APIDispatch
);
1355 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1356 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1358 pDC
->isCompute
= true; // This is a compute context.
1360 COMPUTE_DESC
* pTaskData
= (COMPUTE_DESC
*)pDC
->pArena
->AllocAligned(sizeof(COMPUTE_DESC
), 64);
1362 pTaskData
->threadGroupCountX
= threadGroupCountX
;
1363 pTaskData
->threadGroupCountY
= threadGroupCountY
;
1364 pTaskData
->threadGroupCountZ
= threadGroupCountZ
;
1366 uint32_t totalThreadGroups
= threadGroupCountX
* threadGroupCountY
* threadGroupCountZ
;
1367 uint32_t dcIndex
= pDC
->drawId
% KNOB_MAX_DRAWS_IN_FLIGHT
;
1368 pDC
->pDispatch
= &pContext
->pDispatchQueueArray
[dcIndex
];
1369 pDC
->pDispatch
->initialize(totalThreadGroups
, pTaskData
);
1371 QueueDispatch(pContext
);
1372 RDTSC_STOP(APIDispatch
, threadGroupCountX
* threadGroupCountY
* threadGroupCountZ
, 0);
1375 // Deswizzles, converts and stores current contents of the hot tiles to surface
1376 // described by pState
1379 SWR_RENDERTARGET_ATTACHMENT attachment
,
1380 SWR_TILE_STATE postStoreTileState
)
1387 RDTSC_START(APIStoreTiles
);
1389 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1390 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1392 SetupMacroTileScissors(pDC
);
1394 pDC
->FeWork
.type
= STORETILES
;
1395 pDC
->FeWork
.pfnWork
= ProcessStoreTiles
;
1396 pDC
->FeWork
.desc
.storeTiles
.attachment
= attachment
;
1397 pDC
->FeWork
.desc
.storeTiles
.postStoreTileState
= postStoreTileState
;
1400 QueueDraw(pContext
);
1402 RDTSC_STOP(APIStoreTiles
, 0, 0);
1405 void SwrClearRenderTarget(
1408 const float clearColor
[4],
1417 RDTSC_START(APIClearRenderTarget
);
1419 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1421 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1423 SetupMacroTileScissors(pDC
);
1426 flags
.mask
= clearMask
;
1428 pDC
->FeWork
.type
= CLEAR
;
1429 pDC
->FeWork
.pfnWork
= ProcessClear
;
1430 pDC
->FeWork
.desc
.clear
.flags
= flags
;
1431 pDC
->FeWork
.desc
.clear
.clearDepth
= z
;
1432 pDC
->FeWork
.desc
.clear
.clearRTColor
[0] = clearColor
[0];
1433 pDC
->FeWork
.desc
.clear
.clearRTColor
[1] = clearColor
[1];
1434 pDC
->FeWork
.desc
.clear
.clearRTColor
[2] = clearColor
[2];
1435 pDC
->FeWork
.desc
.clear
.clearRTColor
[3] = clearColor
[3];
1436 pDC
->FeWork
.desc
.clear
.clearStencil
= stencil
;
1439 QueueDraw(pContext
);
1441 RDTSC_STOP(APIClearRenderTarget
, 0, pDC
->drawId
);
1444 //////////////////////////////////////////////////////////////////////////
1445 /// @brief Returns a pointer to the private context state for the current
1446 /// draw operation. This is used for external components such as the
1448 /// SWR is responsible for the allocation of the private context state.
1449 /// @param hContext - Handle passed back from SwrCreateContext
1450 VOID
* SwrGetPrivateContextState(
1453 SWR_CONTEXT
* pContext
= GetContext(hContext
);
1454 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1455 DRAW_STATE
* pState
= pDC
->pState
;
1457 if (pState
->pPrivateState
== nullptr)
1459 pState
->pPrivateState
= pState
->pArena
->AllocAligned(pContext
->privateStateSize
, KNOB_SIMD_WIDTH
*sizeof(float));
1462 return pState
->pPrivateState
;
1465 //////////////////////////////////////////////////////////////////////////
1466 /// @brief Clients can use this to allocate memory for draw/dispatch
1467 /// operations. The memory will automatically be freed once operation
1468 /// has completed. Client can use this to allocate binding tables,
1469 /// etc. needed for shader execution.
1470 /// @param hContext - Handle passed back from SwrCreateContext
1471 /// @param size - Size of allocation
1472 /// @param align - Alignment needed for allocation.
1473 VOID
* SwrAllocDrawContextMemory(
1478 SWR_CONTEXT
* pContext
= GetContext(hContext
);
1479 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1481 return pDC
->pState
->pArena
->AllocAligned(size
, align
);
1484 //////////////////////////////////////////////////////////////////////////
1485 /// @brief Returns pointer to SWR stats.
1486 /// @note The counters are atomically incremented by multiple threads.
1487 /// When calling this, you need to ensure all previous operations
1489 /// @todo If necessary, add a callback to avoid stalling the pipe to
1490 /// sample the counters.
1491 /// @param hContext - Handle passed back from SwrCreateContext
1492 /// @param pStats - SWR will fill this out for caller.
1497 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1498 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1500 pDC
->FeWork
.type
= QUERYSTATS
;
1501 pDC
->FeWork
.pfnWork
= ProcessQueryStats
;
1502 pDC
->FeWork
.desc
.queryStats
.pStats
= pStats
;
1504 // cannot execute until all previous draws have completed
1505 pDC
->dependent
= true;
1508 QueueDraw(pContext
);
1511 //////////////////////////////////////////////////////////////////////////
1512 /// @brief Enables stats counting
1513 /// @param hContext - Handle passed back from SwrCreateContext
1514 /// @param enable - If true then counts are incremented.
1515 void SwrEnableStats(
1519 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1520 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1522 pDC
->pState
->state
.enableStats
= enable
;
1525 //////////////////////////////////////////////////////////////////////////
1526 /// @brief Mark end of frame - used for performance profiling
1527 /// @param hContext - Handle passed back from SwrCreateContext
1528 void SWR_API
SwrEndFrame(
1532 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1533 pContext
->frameCount
++;