1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief API implementation
27 ******************************************************************************/
35 #include "core/backend.h"
36 #include "core/context.h"
37 #include "core/depthstencil.h"
38 #include "core/frontend.h"
39 #include "core/rasterizer.h"
40 #include "core/rdtsc_core.h"
41 #include "core/threads.h"
42 #include "core/tilemgr.h"
43 #include "core/clip.h"
44 #include "core/utils.h"
46 #include "common/simdintrin.h"
47 #include "common/os.h"
49 void SetupDefaultState(SWR_CONTEXT
*pContext
);
51 static INLINE SWR_CONTEXT
* GetContext(HANDLE hContext
)
53 return (SWR_CONTEXT
*)hContext
;
56 //////////////////////////////////////////////////////////////////////////
57 /// @brief Create SWR Context.
58 /// @param pCreateInfo - pointer to creation info.
59 HANDLE
SwrCreateContext(
60 SWR_CREATECONTEXT_INFO
* pCreateInfo
)
65 void* pContextMem
= AlignedMalloc(sizeof(SWR_CONTEXT
), KNOB_SIMD_WIDTH
* 4);
66 memset(pContextMem
, 0, sizeof(SWR_CONTEXT
));
67 SWR_CONTEXT
*pContext
= new (pContextMem
) SWR_CONTEXT();
69 pContext
->driverType
= pCreateInfo
->driver
;
70 pContext
->privateStateSize
= pCreateInfo
->privateStateSize
;
72 pContext
->dcRing
.Init(KNOB_MAX_DRAWS_IN_FLIGHT
);
73 pContext
->dsRing
.Init(KNOB_MAX_DRAWS_IN_FLIGHT
);
75 pContext
->pMacroTileManagerArray
= (MacroTileMgr
*)AlignedMalloc(sizeof(MacroTileMgr
) * KNOB_MAX_DRAWS_IN_FLIGHT
, 64);
76 pContext
->pDispatchQueueArray
= (DispatchQueue
*)AlignedMalloc(sizeof(DispatchQueue
) * KNOB_MAX_DRAWS_IN_FLIGHT
, 64);
78 for (uint32_t dc
= 0; dc
< KNOB_MAX_DRAWS_IN_FLIGHT
; ++dc
)
80 pContext
->dcRing
[dc
].pArena
= new CachingArena(pContext
->cachingArenaAllocator
);
81 new (&pContext
->pMacroTileManagerArray
[dc
]) MacroTileMgr(*pContext
->dcRing
[dc
].pArena
);
82 new (&pContext
->pDispatchQueueArray
[dc
]) DispatchQueue();
84 pContext
->dsRing
[dc
].pArena
= new CachingArena(pContext
->cachingArenaAllocator
);
87 if (!KNOB_SINGLE_THREADED
)
89 memset(&pContext
->WaitLock
, 0, sizeof(pContext
->WaitLock
));
90 memset(&pContext
->FifosNotEmpty
, 0, sizeof(pContext
->FifosNotEmpty
));
91 new (&pContext
->WaitLock
) std::mutex();
92 new (&pContext
->FifosNotEmpty
) std::condition_variable();
94 CreateThreadPool(pContext
, &pContext
->threadPool
);
97 // Calling createThreadPool() above can set SINGLE_THREADED
98 if (KNOB_SINGLE_THREADED
)
100 SET_KNOB(HYPERTHREADED_FE
, false);
101 pContext
->NumWorkerThreads
= 1;
102 pContext
->NumFEThreads
= 1;
103 pContext
->NumBEThreads
= 1;
106 // Allocate scratch space for workers.
///@note We could lazily allocate this but it's a rather small amount of memory.
108 for (uint32_t i
= 0; i
< pContext
->NumWorkerThreads
; ++i
)
111 uint32_t numaNode
= pContext
->threadPool
.pThreadData
?
112 pContext
->threadPool
.pThreadData
[i
].numaId
: 0;
113 pContext
->pScratch
[i
] = (uint8_t*)VirtualAllocExNuma(
114 GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE
),
115 MEM_RESERVE
| MEM_COMMIT
, PAGE_READWRITE
,
118 pContext
->pScratch
[i
] = (uint8_t*)AlignedMalloc(32 * sizeof(KILOBYTE
), KNOB_SIMD_WIDTH
* 4);
122 // State setup AFTER context is fully initialized
123 SetupDefaultState(pContext
);
125 // initialize hot tile manager
126 pContext
->pHotTileMgr
= new HotTileMgr();
128 // initialize function pointer tables
129 InitClearTilesTable();
131 // initialize store tiles function
132 pContext
->pfnLoadTile
= pCreateInfo
->pfnLoadTile
;
133 pContext
->pfnStoreTile
= pCreateInfo
->pfnStoreTile
;
134 pContext
->pfnClearTile
= pCreateInfo
->pfnClearTile
;
136 // pass pointer to bucket manager back to caller
137 #ifdef KNOB_ENABLE_RDTSC
138 pCreateInfo
->pBucketMgr
= &gBucketMgr
;
141 pCreateInfo
->contextSaveSize
= sizeof(API_STATE
);
143 return (HANDLE
)pContext
;
146 void SwrDestroyContext(HANDLE hContext
)
148 SWR_CONTEXT
*pContext
= GetContext(hContext
);
149 DestroyThreadPool(pContext
, &pContext
->threadPool
);
152 for (uint32_t i
= 0; i
< KNOB_MAX_DRAWS_IN_FLIGHT
; ++i
)
154 delete pContext
->dcRing
[i
].pArena
;
155 delete pContext
->dsRing
[i
].pArena
;
156 pContext
->pMacroTileManagerArray
[i
].~MacroTileMgr();
157 pContext
->pDispatchQueueArray
[i
].~DispatchQueue();
160 AlignedFree(pContext
->pDispatchQueueArray
);
161 AlignedFree(pContext
->pMacroTileManagerArray
);
163 // Free scratch space.
164 for (uint32_t i
= 0; i
< pContext
->NumWorkerThreads
; ++i
)
167 VirtualFree(pContext
->pScratch
[i
], 0, MEM_RELEASE
);
169 AlignedFree(pContext
->pScratch
[i
]);
173 delete(pContext
->pHotTileMgr
);
175 pContext
->~SWR_CONTEXT();
176 AlignedFree(GetContext(hContext
));
179 void CopyState(DRAW_STATE
& dst
, const DRAW_STATE
& src
)
181 memcpy(&dst
.state
, &src
.state
, sizeof(API_STATE
));
184 void WakeAllThreads(SWR_CONTEXT
*pContext
)
186 pContext
->FifosNotEmpty
.notify_all();
189 static TileSet gSingleThreadLockedTiles
;
191 template<bool IsDraw
>
192 void QueueWork(SWR_CONTEXT
*pContext
)
194 DRAW_CONTEXT
* pDC
= pContext
->pCurDrawContext
;
195 uint32_t dcIndex
= pDC
->drawId
% KNOB_MAX_DRAWS_IN_FLIGHT
;
199 pDC
->pTileMgr
= &pContext
->pMacroTileManagerArray
[dcIndex
];
200 pDC
->pTileMgr
->initialize();
// Each worker thread looks at a DC for both FE and BE work at different times, so
// threadsDone starts at the number of FE threads plus the number of BE threads. When the
// threadsDone counter has reached 0 then all workers have moved past this DC. (i.e. Each
// worker has checked this DC for both FE and BE work and then moved on if all work is done.)
207 pContext
->pCurDrawContext
->threadsDone
= pContext
->NumFEThreads
+ pContext
->NumBEThreads
;
211 std::unique_lock
<std::mutex
> lock(pContext
->WaitLock
);
212 pContext
->dcRing
.Enqueue();
215 if (KNOB_SINGLE_THREADED
)
217 // flush denormals to 0
218 uint32_t mxcsr
= _mm_getcsr();
219 _mm_setcsr(mxcsr
| _MM_FLUSH_ZERO_ON
| _MM_DENORMALS_ZERO_ON
);
223 uint32_t curDraw
[2] = { pContext
->pCurDrawContext
->drawId
, pContext
->pCurDrawContext
->drawId
};
224 WorkOnFifoFE(pContext
, 0, curDraw
[0]);
225 WorkOnFifoBE(pContext
, 0, curDraw
[1], gSingleThreadLockedTiles
, 0, 0);
229 uint32_t curDispatch
= pContext
->pCurDrawContext
->drawId
;
230 WorkOnCompute(pContext
, 0, curDispatch
);
233 // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
234 while (CompleteDrawContext(pContext
, pContext
->pCurDrawContext
) > 0) {}
241 RDTSC_START(APIDrawWakeAllThreads
);
242 WakeAllThreads(pContext
);
243 RDTSC_STOP(APIDrawWakeAllThreads
, 1, 0);
246 // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
247 pContext
->pPrevDrawContext
= pContext
->pCurDrawContext
;
248 pContext
->pCurDrawContext
= nullptr;
251 INLINE
void QueueDraw(SWR_CONTEXT
* pContext
)
253 QueueWork
<true>(pContext
);
256 INLINE
void QueueDispatch(SWR_CONTEXT
* pContext
)
258 QueueWork
<false>(pContext
);
261 DRAW_CONTEXT
* GetDrawContext(SWR_CONTEXT
*pContext
, bool isSplitDraw
= false)
263 RDTSC_START(APIGetDrawContext
);
264 // If current draw context is null then need to obtain a new draw context to use from ring.
265 if (pContext
->pCurDrawContext
== nullptr)
267 // Need to wait for a free entry.
268 while (pContext
->dcRing
.IsFull())
273 uint64_t curDraw
= pContext
->dcRing
.GetHead();
274 uint32_t dcIndex
= curDraw
% KNOB_MAX_DRAWS_IN_FLIGHT
;
276 static uint64_t lastDrawChecked
;
277 static uint32_t lastFrameChecked
;
278 if ((pContext
->frameCount
- lastFrameChecked
) > 2 ||
279 (curDraw
- lastDrawChecked
) > 0x10000)
281 // Take this opportunity to clean-up old arena allocations
282 pContext
->cachingArenaAllocator
.FreeOldBlocks();
284 lastFrameChecked
= pContext
->frameCount
;
285 lastDrawChecked
= curDraw
;
288 DRAW_CONTEXT
* pCurDrawContext
= &pContext
->dcRing
[dcIndex
];
289 pContext
->pCurDrawContext
= pCurDrawContext
;
291 // Assign next available entry in DS ring to this DC.
292 uint32_t dsIndex
= pContext
->curStateId
% KNOB_MAX_DRAWS_IN_FLIGHT
;
293 pCurDrawContext
->pState
= &pContext
->dsRing
[dsIndex
];
295 // Copy previous state to current state.
296 if (pContext
->pPrevDrawContext
)
298 DRAW_CONTEXT
* pPrevDrawContext
= pContext
->pPrevDrawContext
;
300 // If we're splitting our draw then we can just use the same state from the previous
301 // draw. In this case, we won't increment the DS ring index so the next non-split
302 // draw can receive the state.
303 if (isSplitDraw
== false)
305 CopyState(*pCurDrawContext
->pState
, *pPrevDrawContext
->pState
);
307 // Should have been cleaned up previously
308 SWR_ASSERT(pCurDrawContext
->pState
->pArena
->IsEmpty() == true);
310 pCurDrawContext
->pState
->pPrivateState
= nullptr;
312 pContext
->curStateId
++; // Progress state ring index forward.
316 // If its a split draw then just copy the state pointer over
317 // since its the same draw.
318 pCurDrawContext
->pState
= pPrevDrawContext
->pState
;
319 SWR_ASSERT(pPrevDrawContext
->cleanupState
== false);
324 SWR_ASSERT(pCurDrawContext
->pState
->pArena
->IsEmpty() == true);
325 pContext
->curStateId
++; // Progress state ring index forward.
328 SWR_ASSERT(pCurDrawContext
->pArena
->IsEmpty() == true);
330 pCurDrawContext
->dependent
= false;
331 pCurDrawContext
->pContext
= pContext
;
332 pCurDrawContext
->isCompute
= false; // Dispatch has to set this to true.
334 pCurDrawContext
->doneFE
= false;
335 pCurDrawContext
->FeLock
= 0;
336 pCurDrawContext
->threadsDone
= 0;
337 pCurDrawContext
->retireCallback
.pfnCallbackFunc
= nullptr;
339 // Assign unique drawId for this DC
340 pCurDrawContext
->drawId
= pContext
->dcRing
.GetHead();
342 pCurDrawContext
->cleanupState
= true;
346 SWR_ASSERT(isSplitDraw
== false, "Split draw should only be used when obtaining a new DC");
349 RDTSC_STOP(APIGetDrawContext
, 0, 0);
350 return pContext
->pCurDrawContext
;
353 API_STATE
* GetDrawState(SWR_CONTEXT
*pContext
)
355 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
356 SWR_ASSERT(pDC
->pState
!= nullptr);
358 return &pDC
->pState
->state
;
361 void SWR_API
SwrSaveState(
363 void* pOutputStateBlock
,
366 SWR_CONTEXT
*pContext
= GetContext(hContext
);
367 auto pSrc
= GetDrawState(pContext
);
368 SWR_ASSERT(pOutputStateBlock
&& memSize
>= sizeof(*pSrc
));
370 memcpy(pOutputStateBlock
, pSrc
, sizeof(*pSrc
));
373 void SWR_API
SwrRestoreState(
375 const void* pStateBlock
,
378 SWR_CONTEXT
*pContext
= GetContext(hContext
);
379 auto pDst
= GetDrawState(pContext
);
380 SWR_ASSERT(pStateBlock
&& memSize
>= sizeof(*pDst
));
382 memcpy(pDst
, pStateBlock
, sizeof(*pDst
));
385 void SetupDefaultState(SWR_CONTEXT
*pContext
)
387 API_STATE
* pState
= GetDrawState(pContext
);
389 pState
->rastState
.cullMode
= SWR_CULLMODE_NONE
;
390 pState
->rastState
.frontWinding
= SWR_FRONTWINDING_CCW
;
393 void SwrSync(HANDLE hContext
, PFN_CALLBACK_FUNC pfnFunc
, uint64_t userData
, uint64_t userData2
, uint64_t userData3
)
395 RDTSC_START(APISync
);
397 SWR_ASSERT(pfnFunc
!= nullptr);
399 SWR_CONTEXT
*pContext
= GetContext(hContext
);
400 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
402 pDC
->FeWork
.type
= SYNC
;
403 pDC
->FeWork
.pfnWork
= ProcessSync
;
405 // Setup callback function
406 pDC
->retireCallback
.pfnCallbackFunc
= pfnFunc
;
407 pDC
->retireCallback
.userData
= userData
;
408 pDC
->retireCallback
.userData2
= userData2
;
409 pDC
->retireCallback
.userData3
= userData3
;
414 RDTSC_STOP(APISync
, 1, 0);
417 void SwrWaitForIdle(HANDLE hContext
)
419 SWR_CONTEXT
*pContext
= GetContext(hContext
);
421 RDTSC_START(APIWaitForIdle
);
423 while (!pContext
->dcRing
.IsEmpty())
428 RDTSC_STOP(APIWaitForIdle
, 1, 0);
431 void SwrSetVertexBuffers(
434 const SWR_VERTEX_BUFFER_STATE
* pVertexBuffers
)
436 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
438 for (uint32_t i
= 0; i
< numBuffers
; ++i
)
440 const SWR_VERTEX_BUFFER_STATE
*pVB
= &pVertexBuffers
[i
];
441 pState
->vertexBuffers
[pVB
->index
] = *pVB
;
445 void SwrSetIndexBuffer(
447 const SWR_INDEX_BUFFER_STATE
* pIndexBuffer
)
449 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
451 pState
->indexBuffer
= *pIndexBuffer
;
454 void SwrSetFetchFunc(
456 PFN_FETCH_FUNC pfnFetchFunc
)
458 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
460 pState
->pfnFetchFunc
= pfnFetchFunc
;
465 PFN_SO_FUNC pfnSoFunc
,
466 uint32_t streamIndex
)
468 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
470 SWR_ASSERT(streamIndex
< MAX_SO_STREAMS
);
472 pState
->pfnSoFunc
[streamIndex
] = pfnSoFunc
;
477 SWR_STREAMOUT_STATE
* pSoState
)
479 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
481 pState
->soState
= *pSoState
;
484 void SwrSetSoBuffers(
486 SWR_STREAMOUT_BUFFER
* pSoBuffer
,
489 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
491 SWR_ASSERT((slot
< 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot
);
493 pState
->soBuffer
[slot
] = *pSoBuffer
;
496 void SwrSetVertexFunc(
498 PFN_VERTEX_FUNC pfnVertexFunc
)
500 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
502 pState
->pfnVertexFunc
= pfnVertexFunc
;
505 void SwrSetFrontendState(
507 SWR_FRONTEND_STATE
*pFEState
)
509 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
510 pState
->frontendState
= *pFEState
;
515 SWR_GS_STATE
*pGSState
)
517 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
518 pState
->gsState
= *pGSState
;
523 PFN_GS_FUNC pfnGsFunc
)
525 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
526 pState
->pfnGsFunc
= pfnGsFunc
;
531 PFN_CS_FUNC pfnCsFunc
,
532 uint32_t totalThreadsInGroup
,
533 uint32_t totalSpillFillSize
)
535 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
536 pState
->pfnCsFunc
= pfnCsFunc
;
537 pState
->totalThreadsInGroup
= totalThreadsInGroup
;
538 pState
->totalSpillFillSize
= totalSpillFillSize
;
543 SWR_TS_STATE
*pState
)
545 API_STATE
* pApiState
= GetDrawState(GetContext(hContext
));
546 pApiState
->tsState
= *pState
;
553 API_STATE
* pApiState
= GetDrawState(GetContext(hContext
));
554 pApiState
->pfnHsFunc
= pfnFunc
;
561 API_STATE
* pApiState
= GetDrawState(GetContext(hContext
));
562 pApiState
->pfnDsFunc
= pfnFunc
;
565 void SwrSetDepthStencilState(
567 SWR_DEPTH_STENCIL_STATE
*pDSState
)
569 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
571 pState
->depthStencilState
= *pDSState
;
574 void SwrSetBackendState(
576 SWR_BACKEND_STATE
*pBEState
)
578 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
580 pState
->backendState
= *pBEState
;
583 void SwrSetPixelShaderState(
585 SWR_PS_STATE
*pPSState
)
587 API_STATE
*pState
= GetDrawState(GetContext(hContext
));
588 pState
->psState
= *pPSState
;
591 void SwrSetBlendState(
593 SWR_BLEND_STATE
*pBlendState
)
595 API_STATE
*pState
= GetDrawState(GetContext(hContext
));
596 memcpy(&pState
->blendState
, pBlendState
, sizeof(SWR_BLEND_STATE
));
599 void SwrSetBlendFunc(
601 uint32_t renderTarget
,
602 PFN_BLEND_JIT_FUNC pfnBlendFunc
)
604 SWR_ASSERT(renderTarget
< SWR_NUM_RENDERTARGETS
);
605 API_STATE
*pState
= GetDrawState(GetContext(hContext
));
606 pState
->pfnBlendFunc
[renderTarget
] = pfnBlendFunc
;
609 // update guardband multipliers for the viewport
610 void updateGuardband(API_STATE
*pState
)
612 // guardband center is viewport center
613 pState
->gbState
.left
= KNOB_GUARDBAND_WIDTH
/ pState
->vp
[0].width
;
614 pState
->gbState
.right
= KNOB_GUARDBAND_WIDTH
/ pState
->vp
[0].width
;
615 pState
->gbState
.top
= KNOB_GUARDBAND_HEIGHT
/ pState
->vp
[0].height
;
616 pState
->gbState
.bottom
= KNOB_GUARDBAND_HEIGHT
/ pState
->vp
[0].height
;
619 void SwrSetRastState(
621 const SWR_RASTSTATE
*pRastState
)
623 SWR_CONTEXT
*pContext
= GetContext(hContext
);
624 API_STATE
* pState
= GetDrawState(pContext
);
626 memcpy(&pState
->rastState
, pRastState
, sizeof(SWR_RASTSTATE
));
629 void SwrSetViewports(
631 uint32_t numViewports
,
632 const SWR_VIEWPORT
* pViewports
,
633 const SWR_VIEWPORT_MATRIX
* pMatrices
)
635 SWR_ASSERT(numViewports
<= KNOB_NUM_VIEWPORTS_SCISSORS
,
636 "Invalid number of viewports.");
638 SWR_CONTEXT
*pContext
= GetContext(hContext
);
639 API_STATE
* pState
= GetDrawState(pContext
);
641 memcpy(&pState
->vp
[0], pViewports
, sizeof(SWR_VIEWPORT
) * numViewports
);
643 if (pMatrices
!= nullptr)
645 memcpy(&pState
->vpMatrix
[0], pMatrices
, sizeof(SWR_VIEWPORT_MATRIX
) * numViewports
);
649 // Compute default viewport transform.
650 for (uint32_t i
= 0; i
< numViewports
; ++i
)
652 if (pContext
->driverType
== DX
)
654 pState
->vpMatrix
[i
].m00
= pState
->vp
[i
].width
/ 2.0f
;
655 pState
->vpMatrix
[i
].m11
= -pState
->vp
[i
].height
/ 2.0f
;
656 pState
->vpMatrix
[i
].m22
= pState
->vp
[i
].maxZ
- pState
->vp
[i
].minZ
;
657 pState
->vpMatrix
[i
].m30
= pState
->vp
[i
].x
+ pState
->vpMatrix
[i
].m00
;
658 pState
->vpMatrix
[i
].m31
= pState
->vp
[i
].y
- pState
->vpMatrix
[i
].m11
;
659 pState
->vpMatrix
[i
].m32
= pState
->vp
[i
].minZ
;
663 // Standard, with the exception that Y is inverted.
664 pState
->vpMatrix
[i
].m00
= (pState
->vp
[i
].width
- pState
->vp
[i
].x
) / 2.0f
;
665 pState
->vpMatrix
[i
].m11
= (pState
->vp
[i
].y
- pState
->vp
[i
].height
) / 2.0f
;
666 pState
->vpMatrix
[i
].m22
= (pState
->vp
[i
].maxZ
- pState
->vp
[i
].minZ
) / 2.0f
;
667 pState
->vpMatrix
[i
].m30
= pState
->vp
[i
].x
+ pState
->vpMatrix
[i
].m00
;
668 pState
->vpMatrix
[i
].m31
= pState
->vp
[i
].height
+ pState
->vpMatrix
[i
].m11
;
669 pState
->vpMatrix
[i
].m32
= pState
->vp
[i
].minZ
+ pState
->vpMatrix
[i
].m22
;
671 // Now that the matrix is calculated, clip the view coords to screen size.
672 // OpenGL allows for -ve x,y in the viewport.
673 pState
->vp
[i
].x
= std::max(pState
->vp
[i
].x
, 0.0f
);
674 pState
->vp
[i
].y
= std::max(pState
->vp
[i
].y
, 0.0f
);
679 updateGuardband(pState
);
682 void SwrSetScissorRects(
684 uint32_t numScissors
,
685 const BBOX
* pScissors
)
687 SWR_ASSERT(numScissors
<= KNOB_NUM_VIEWPORTS_SCISSORS
,
688 "Invalid number of scissor rects.");
690 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
691 memcpy(&pState
->scissorRects
[0], pScissors
, numScissors
* sizeof(BBOX
));
694 void SetupMacroTileScissors(DRAW_CONTEXT
*pDC
)
696 API_STATE
*pState
= &pDC
->pState
->state
;
697 uint32_t left
, right
, top
, bottom
;
699 // Set up scissor dimensions based on scissor or viewport
700 if (pState
->rastState
.scissorEnable
)
702 // scissor rect right/bottom edge are exclusive, core expects scissor dimensions to be inclusive, so subtract one pixel from right/bottom edges
703 left
= pState
->scissorRects
[0].left
;
704 right
= pState
->scissorRects
[0].right
;
705 top
= pState
->scissorRects
[0].top
;
706 bottom
= pState
->scissorRects
[0].bottom
;
710 // the vp width and height must be added to origin un-rounded then the result round to -inf.
711 // The cast to int works for rounding assuming all [left, right, top, bottom] are positive.
712 left
= (int32_t)pState
->vp
[0].x
;
713 right
= (int32_t)(pState
->vp
[0].x
+ pState
->vp
[0].width
);
714 top
= (int32_t)pState
->vp
[0].y
;
715 bottom
= (int32_t)(pState
->vp
[0].y
+ pState
->vp
[0].height
);
718 right
= std::min
<uint32_t>(right
, KNOB_MAX_SCISSOR_X
);
719 bottom
= std::min
<uint32_t>(bottom
, KNOB_MAX_SCISSOR_Y
);
721 if (left
> KNOB_MAX_SCISSOR_X
|| top
> KNOB_MAX_SCISSOR_Y
)
723 pState
->scissorInFixedPoint
.left
= 0;
724 pState
->scissorInFixedPoint
.right
= 0;
725 pState
->scissorInFixedPoint
.top
= 0;
726 pState
->scissorInFixedPoint
.bottom
= 0;
730 pState
->scissorInFixedPoint
.left
= left
* FIXED_POINT_SCALE
;
731 pState
->scissorInFixedPoint
.right
= right
* FIXED_POINT_SCALE
- 1;
732 pState
->scissorInFixedPoint
.top
= top
* FIXED_POINT_SCALE
;
733 pState
->scissorInFixedPoint
.bottom
= bottom
* FIXED_POINT_SCALE
- 1;
737 // templated backend function tables
738 extern PFN_BACKEND_FUNC gBackendNullPs
[SWR_MULTISAMPLE_TYPE_COUNT
];
739 extern PFN_BACKEND_FUNC gBackendSingleSample
[2][2][2];
740 extern PFN_BACKEND_FUNC gBackendPixelRateTable
[SWR_MULTISAMPLE_TYPE_COUNT
][SWR_MSAA_SAMPLE_PATTERN_COUNT
][2][2][2][2];
741 extern PFN_BACKEND_FUNC gBackendSampleRateTable
[SWR_MULTISAMPLE_TYPE_COUNT
][2][2][2];
742 void SetupPipeline(DRAW_CONTEXT
*pDC
)
744 DRAW_STATE
* pState
= pDC
->pState
;
745 const SWR_RASTSTATE
&rastState
= pState
->state
.rastState
;
746 const SWR_PS_STATE
&psState
= pState
->state
.psState
;
747 BACKEND_FUNCS
& backendFuncs
= pState
->backendFuncs
;
748 const uint32_t forcedSampleCount
= (rastState
.forcedSampleCount
) ? 1 : 0;
751 if (psState
.pfnPixelShader
== nullptr)
753 backendFuncs
.pfnBackend
= gBackendNullPs
[pState
->state
.rastState
.sampleCount
];
757 const bool bMultisampleEnable
= ((rastState
.sampleCount
> SWR_MULTISAMPLE_1X
) || rastState
.forcedSampleCount
) ? 1 : 0;
758 const uint32_t centroid
= ((psState
.barycentricsMask
& SWR_BARYCENTRIC_CENTROID_MASK
) > 0) ? 1 : 0;
759 const uint32_t canEarlyZ
= (psState
.forceEarlyZ
|| (!psState
.writesODepth
&& !psState
.usesSourceDepth
&& !psState
.usesUAV
)) ? 1 : 0;
760 const uint32_t inputCoverage
= (psState
.inputCoverage
!= SWR_INPUT_COVERAGE_NONE
) ? 1 : 0;
762 SWR_BARYCENTRICS_MASK barycentricsMask
= (SWR_BARYCENTRICS_MASK
)psState
.barycentricsMask
;
764 // select backend function
765 switch(psState
.shadingRate
)
767 case SWR_SHADING_RATE_PIXEL
:
768 if(bMultisampleEnable
)
770 // always need to generate I & J per sample for Z interpolation
771 barycentricsMask
= (SWR_BARYCENTRICS_MASK
)(barycentricsMask
| SWR_BARYCENTRIC_PER_SAMPLE_MASK
);
772 backendFuncs
.pfnBackend
= gBackendPixelRateTable
[rastState
.sampleCount
][rastState
.samplePattern
][inputCoverage
][centroid
][forcedSampleCount
][canEarlyZ
];
776 // always need to generate I & J per pixel for Z interpolation
777 barycentricsMask
= (SWR_BARYCENTRICS_MASK
)(barycentricsMask
| SWR_BARYCENTRIC_PER_PIXEL_MASK
);
778 backendFuncs
.pfnBackend
= gBackendSingleSample
[inputCoverage
][centroid
][canEarlyZ
];
781 case SWR_SHADING_RATE_SAMPLE
:
782 SWR_ASSERT(rastState
.samplePattern
== SWR_MSAA_STANDARD_PATTERN
);
783 // always need to generate I & J per sample for Z interpolation
784 barycentricsMask
= (SWR_BARYCENTRICS_MASK
)(barycentricsMask
| SWR_BARYCENTRIC_PER_SAMPLE_MASK
);
785 backendFuncs
.pfnBackend
= gBackendSampleRateTable
[rastState
.sampleCount
][inputCoverage
][centroid
][canEarlyZ
];
788 SWR_ASSERT(0 && "Invalid shading rate");
793 PFN_PROCESS_PRIMS pfnBinner
;
794 switch (pState
->state
.topology
)
797 pState
->pfnProcessPrims
= ClipPoints
;
798 pfnBinner
= BinPoints
;
803 case TOP_LINE_LIST_ADJ
:
804 case TOP_LISTSTRIP_ADJ
:
805 pState
->pfnProcessPrims
= ClipLines
;
806 pfnBinner
= BinLines
;
809 pState
->pfnProcessPrims
= ClipTriangles
;
810 pfnBinner
= GetBinTrianglesFunc((rastState
.conservativeRast
> 0));
814 // disable clipper if viewport transform is disabled
815 if (pState
->state
.frontendState
.vpTransformDisable
)
817 pState
->pfnProcessPrims
= pfnBinner
;
820 if ((pState
->state
.psState
.pfnPixelShader
== nullptr) &&
821 (pState
->state
.depthStencilState
.depthTestEnable
== FALSE
) &&
822 (pState
->state
.depthStencilState
.depthWriteEnable
== FALSE
) &&
823 (pState
->state
.depthStencilState
.stencilTestEnable
== FALSE
) &&
824 (pState
->state
.depthStencilState
.stencilWriteEnable
== FALSE
) &&
825 (pState
->state
.backendState
.numAttributes
== 0))
827 pState
->pfnProcessPrims
= nullptr;
830 if (pState
->state
.soState
.rasterizerDisable
== true)
832 pState
->pfnProcessPrims
= nullptr;
835 // set up the frontend attribute count
836 pState
->state
.feNumAttributes
= 0;
837 const SWR_BACKEND_STATE
& backendState
= pState
->state
.backendState
;
838 if (backendState
.swizzleEnable
)
840 // attribute swizzling is enabled, iterate over the map and record the max attribute used
841 for (uint32_t i
= 0; i
< backendState
.numAttributes
; ++i
)
843 pState
->state
.feNumAttributes
= std::max(pState
->state
.feNumAttributes
, (uint32_t)backendState
.swizzleMap
[i
].sourceAttrib
+ 1);
848 pState
->state
.feNumAttributes
= pState
->state
.backendState
.numAttributes
;
851 if (pState
->state
.soState
.soEnable
)
853 uint32_t streamMasks
= 0;
854 for (uint32_t i
= 0; i
< 4; ++i
)
856 streamMasks
|= pState
->state
.soState
.streamMasks
[i
];
860 if (_BitScanReverse(&maxAttrib
, streamMasks
))
862 pState
->state
.feNumAttributes
= std::max(pState
->state
.feNumAttributes
, (uint32_t)(maxAttrib
+ 1));
866 // complicated logic to test for cases where we don't need backing hottile memory for a draw
867 // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled.
868 pState
->state
.depthHottileEnable
= ((!(pState
->state
.depthStencilState
.depthTestEnable
&&
869 !pState
->state
.depthStencilState
.depthWriteEnable
&&
870 pState
->state
.depthStencilState
.depthTestFunc
== ZFUNC_ALWAYS
)) &&
871 (pState
->state
.depthStencilState
.depthTestEnable
||
872 pState
->state
.depthStencilState
.depthWriteEnable
)) ? true : false;
874 pState
->state
.stencilHottileEnable
= (((!(pState
->state
.depthStencilState
.stencilTestEnable
&&
875 !pState
->state
.depthStencilState
.stencilWriteEnable
&&
876 pState
->state
.depthStencilState
.stencilTestFunc
== ZFUNC_ALWAYS
)) ||
877 // for stencil we have to check the double sided state as well
878 (!(pState
->state
.depthStencilState
.doubleSidedStencilTestEnable
&&
879 !pState
->state
.depthStencilState
.stencilWriteEnable
&&
880 pState
->state
.depthStencilState
.backfaceStencilTestFunc
== ZFUNC_ALWAYS
))) &&
881 (pState
->state
.depthStencilState
.stencilTestEnable
||
882 pState
->state
.depthStencilState
.stencilWriteEnable
)) ? true : false;
884 uint32_t numRTs
= pState
->state
.psState
.numRenderTargets
;
885 pState
->state
.colorHottileEnable
= 0;
886 if (psState
.pfnPixelShader
!= nullptr)
888 for (uint32_t rt
= 0; rt
< numRTs
; ++rt
)
890 pState
->state
.colorHottileEnable
|=
891 (!pState
->state
.blendState
.renderTarget
[rt
].writeDisableAlpha
||
892 !pState
->state
.blendState
.renderTarget
[rt
].writeDisableRed
||
893 !pState
->state
.blendState
.renderTarget
[rt
].writeDisableGreen
||
894 !pState
->state
.blendState
.renderTarget
[rt
].writeDisableBlue
) ? (1 << rt
) : 0;
898 // Setup depth quantization function
899 if (pState
->state
.depthHottileEnable
)
901 switch (pState
->state
.rastState
.depthFormat
)
903 case R32_FLOAT_X8X24_TYPELESS
: pState
->state
.pfnQuantizeDepth
= QuantizeDepth
< R32_FLOAT_X8X24_TYPELESS
> ; break;
904 case R32_FLOAT
: pState
->state
.pfnQuantizeDepth
= QuantizeDepth
< R32_FLOAT
> ; break;
905 case R24_UNORM_X8_TYPELESS
: pState
->state
.pfnQuantizeDepth
= QuantizeDepth
< R24_UNORM_X8_TYPELESS
> ; break;
906 case R16_UNORM
: pState
->state
.pfnQuantizeDepth
= QuantizeDepth
< R16_UNORM
> ; break;
907 default: SWR_ASSERT(false, "Unsupported depth format for depth quantiztion.");
908 pState
->state
.pfnQuantizeDepth
= QuantizeDepth
< R32_FLOAT
> ;
913 // set up pass-through quantize if depth isn't enabled
914 pState
->state
.pfnQuantizeDepth
= QuantizeDepth
< R32_FLOAT
> ;
918 //////////////////////////////////////////////////////////////////////////
920 /// @param pDC - Draw context to initialize for this draw.
925 // We don't need to re-setup the scissors/pipeline state again for split draw.
926 if (isSplitDraw
== false)
928 SetupMacroTileScissors(pDC
);
933 //////////////////////////////////////////////////////////////////////////
934 /// @brief We can split the draw for certain topologies for better performance.
935 /// @param totalVerts - Total vertices for draw
936 /// @param topology - Topology used for draw
937 uint32_t MaxVertsPerDraw(
940 PRIMITIVE_TOPOLOGY topology
)
942 API_STATE
& state
= pDC
->pState
->state
;
944 uint32_t vertsPerDraw
= totalVerts
;
946 if (state
.soState
.soEnable
)
954 case TOP_TRIANGLE_LIST
:
955 vertsPerDraw
= KNOB_MAX_PRIMS_PER_DRAW
;
958 case TOP_PATCHLIST_1
:
959 case TOP_PATCHLIST_2
:
960 case TOP_PATCHLIST_3
:
961 case TOP_PATCHLIST_4
:
962 case TOP_PATCHLIST_5
:
963 case TOP_PATCHLIST_6
:
964 case TOP_PATCHLIST_7
:
965 case TOP_PATCHLIST_8
:
966 case TOP_PATCHLIST_9
:
967 case TOP_PATCHLIST_10
:
968 case TOP_PATCHLIST_11
:
969 case TOP_PATCHLIST_12
:
970 case TOP_PATCHLIST_13
:
971 case TOP_PATCHLIST_14
:
972 case TOP_PATCHLIST_15
:
973 case TOP_PATCHLIST_16
:
974 case TOP_PATCHLIST_17
:
975 case TOP_PATCHLIST_18
:
976 case TOP_PATCHLIST_19
:
977 case TOP_PATCHLIST_20
:
978 case TOP_PATCHLIST_21
:
979 case TOP_PATCHLIST_22
:
980 case TOP_PATCHLIST_23
:
981 case TOP_PATCHLIST_24
:
982 case TOP_PATCHLIST_25
:
983 case TOP_PATCHLIST_26
:
984 case TOP_PATCHLIST_27
:
985 case TOP_PATCHLIST_28
:
986 case TOP_PATCHLIST_29
:
987 case TOP_PATCHLIST_30
:
988 case TOP_PATCHLIST_31
:
989 case TOP_PATCHLIST_32
:
990 if (pDC
->pState
->state
.tsState
.tsEnable
)
992 uint32_t vertsPerPrim
= topology
- TOP_PATCHLIST_BASE
;
993 vertsPerDraw
= vertsPerPrim
* KNOB_MAX_TESS_PRIMS_PER_DRAW
;
997 // The Primitive Assembly code can only handle 1 RECT at a time.
1003 // We are not splitting up draws for other topologies.
1007 return vertsPerDraw
;
1011 //////////////////////////////////////////////////////////////////////////
1012 /// @brief DrawInstanced
1013 /// @param hContext - Handle passed back from SwrCreateContext
1014 /// @param topology - Specifies topology for draw.
1015 /// @param numVerts - How many vertices to read sequentially from vertex data (per instance).
1016 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1017 /// @param numInstances - How many instances to render.
1018 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1021 PRIMITIVE_TOPOLOGY topology
,
1022 uint32_t numVertices
,
1023 uint32_t startVertex
,
1024 uint32_t numInstances
= 1,
1025 uint32_t startInstance
= 0)
1032 RDTSC_START(APIDraw
);
1034 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1035 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1037 uint32_t maxVertsPerDraw
= MaxVertsPerDraw(pDC
, numVertices
, topology
);
1038 uint32_t primsPerDraw
= GetNumPrims(topology
, maxVertsPerDraw
);
1039 uint32_t remainingVerts
= numVertices
;
1041 API_STATE
*pState
= &pDC
->pState
->state
;
1042 pState
->topology
= topology
;
1043 pState
->forceFront
= false;
1045 // disable culling for points/lines
1046 uint32_t oldCullMode
= pState
->rastState
.cullMode
;
1047 if (topology
== TOP_POINT_LIST
)
1049 pState
->rastState
.cullMode
= SWR_CULLMODE_NONE
;
1050 pState
->forceFront
= true;
1054 while (remainingVerts
)
1056 uint32_t numVertsForDraw
= (remainingVerts
< maxVertsPerDraw
) ?
1057 remainingVerts
: maxVertsPerDraw
;
1059 bool isSplitDraw
= (draw
> 0) ? true : false;
1060 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
, isSplitDraw
);
1061 InitDraw(pDC
, isSplitDraw
);
1063 pDC
->FeWork
.type
= DRAW
;
1064 pDC
->FeWork
.pfnWork
= GetProcessDrawFunc(
1066 false, // bEnableCutIndex
1067 pState
->tsState
.tsEnable
,
1068 pState
->gsState
.gsEnable
,
1069 pState
->soState
.soEnable
,
1070 pDC
->pState
->pfnProcessPrims
!= nullptr);
1071 pDC
->FeWork
.desc
.draw
.numVerts
= numVertsForDraw
;
1072 pDC
->FeWork
.desc
.draw
.startVertex
= startVertex
;
1073 pDC
->FeWork
.desc
.draw
.numInstances
= numInstances
;
1074 pDC
->FeWork
.desc
.draw
.startInstance
= startInstance
;
1075 pDC
->FeWork
.desc
.draw
.startPrimID
= draw
* primsPerDraw
;
1076 pDC
->FeWork
.desc
.draw
.startVertexID
= draw
* maxVertsPerDraw
;
1078 pDC
->cleanupState
= (remainingVerts
== numVertsForDraw
);
1081 QueueDraw(pContext
);
1083 remainingVerts
-= numVertsForDraw
;
1087 // restore culling state
1088 pDC
= GetDrawContext(pContext
);
1089 pDC
->pState
->state
.rastState
.cullMode
= oldCullMode
;
1091 RDTSC_STOP(APIDraw
, numVertices
* numInstances
, 0);
1094 //////////////////////////////////////////////////////////////////////////
1096 /// @param hContext - Handle passed back from SwrCreateContext
1097 /// @param topology - Specifies topology for draw.
1098 /// @param startVertex - Specifies start vertex in vertex buffer for draw.
1099 /// @param numVertices - Number of vertices.
1102 PRIMITIVE_TOPOLOGY topology
,
1103 uint32_t startVertex
,
1104 uint32_t numVertices
)
1106 DrawInstanced(hContext
, topology
, numVertices
, startVertex
);
1109 //////////////////////////////////////////////////////////////////////////
1110 /// @brief SwrDrawInstanced
1111 /// @param hContext - Handle passed back from SwrCreateContext
1112 /// @param topology - Specifies topology for draw.
1113 /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
1114 /// @param numInstances - How many instances to render.
1115 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1116 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1117 void SwrDrawInstanced(
1119 PRIMITIVE_TOPOLOGY topology
,
1120 uint32_t numVertsPerInstance
,
1121 uint32_t numInstances
,
1122 uint32_t startVertex
,
1123 uint32_t startInstance
1126 DrawInstanced(hContext
, topology
, numVertsPerInstance
, startVertex
, numInstances
, startInstance
);
1129 //////////////////////////////////////////////////////////////////////////
1130 /// @brief DrawIndexedInstance
1131 /// @param hContext - Handle passed back from SwrCreateContext
1132 /// @param topology - Specifies topology for draw.
1133 /// @param numIndices - Number of indices to read sequentially from index buffer.
1134 /// @param indexOffset - Starting index into index buffer.
1135 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1136 /// @param numInstances - Number of instances to render.
1137 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1138 void DrawIndexedInstance(
1140 PRIMITIVE_TOPOLOGY topology
,
1141 uint32_t numIndices
,
1142 uint32_t indexOffset
,
1144 uint32_t numInstances
= 1,
1145 uint32_t startInstance
= 0)
1152 RDTSC_START(APIDrawIndexed
);
1154 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1155 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1156 API_STATE
* pState
= &pDC
->pState
->state
;
1158 uint32_t maxIndicesPerDraw
= MaxVertsPerDraw(pDC
, numIndices
, topology
);
1159 uint32_t primsPerDraw
= GetNumPrims(topology
, maxIndicesPerDraw
);
1160 uint32_t remainingIndices
= numIndices
;
1162 uint32_t indexSize
= 0;
1163 switch (pState
->indexBuffer
.format
)
1165 case R32_UINT
: indexSize
= sizeof(uint32_t); break;
1166 case R16_UINT
: indexSize
= sizeof(uint16_t); break;
1167 case R8_UINT
: indexSize
= sizeof(uint8_t); break;
1173 uint8_t *pIB
= (uint8_t*)pState
->indexBuffer
.pIndices
;
1174 pIB
+= (uint64_t)indexOffset
* (uint64_t)indexSize
;
1176 pState
->topology
= topology
;
1177 pState
->forceFront
= false;
1179 // disable culling and set forceFront for point lists (the only topology checked here)
1180 uint32_t oldCullMode
= pState
->rastState
.cullMode
;
1181 if (topology
== TOP_POINT_LIST
)
1183 pState
->rastState
.cullMode
= SWR_CULLMODE_NONE
;
1184 pState
->forceFront
= true;
1187 while (remainingIndices
)
1189 uint32_t numIndicesForDraw
= (remainingIndices
< maxIndicesPerDraw
) ?
1190 remainingIndices
: maxIndicesPerDraw
;
1192 // When breaking up draw, we need to obtain new draw context for each iteration.
1193 bool isSplitDraw
= (draw
> 0) ? true : false;
1194 pDC
= GetDrawContext(pContext
, isSplitDraw
);
1195 InitDraw(pDC
, isSplitDraw
);
1197 pDC
->FeWork
.type
= DRAW
;
1198 pDC
->FeWork
.pfnWork
= GetProcessDrawFunc(
1200 pState
->frontendState
.bEnableCutIndex
,
1201 pState
->tsState
.tsEnable
,
1202 pState
->gsState
.gsEnable
,
1203 pState
->soState
.soEnable
,
1204 pDC
->pState
->pfnProcessPrims
!= nullptr);
1205 pDC
->FeWork
.desc
.draw
.pDC
= pDC
;
1206 pDC
->FeWork
.desc
.draw
.numIndices
= numIndicesForDraw
;
1207 pDC
->FeWork
.desc
.draw
.pIB
= (int*)pIB
;
1208 pDC
->FeWork
.desc
.draw
.type
= pDC
->pState
->state
.indexBuffer
.format
;
1210 pDC
->FeWork
.desc
.draw
.numInstances
= numInstances
;
1211 pDC
->FeWork
.desc
.draw
.startInstance
= startInstance
;
1212 pDC
->FeWork
.desc
.draw
.baseVertex
= baseVertex
;
1213 pDC
->FeWork
.desc
.draw
.startPrimID
= draw
* primsPerDraw
;
1215 pDC
->cleanupState
= (remainingIndices
== numIndicesForDraw
);
1218 QueueDraw(pContext
);
1220 pIB
+= maxIndicesPerDraw
* indexSize
;
1221 remainingIndices
-= numIndicesForDraw
;
1225 // restore culling state
1226 pDC
= GetDrawContext(pContext
);
1227 pDC
->pState
->state
.rastState
.cullMode
= oldCullMode
;
1229 RDTSC_STOP(APIDrawIndexed
, numIndices
* numInstances
, 0);
1233 //////////////////////////////////////////////////////////////////////////
1234 /// @brief SwrDrawIndexed
1235 /// @param hContext - Handle passed back from SwrCreateContext
1236 /// @param topology - Specifies topology for draw.
1237 /// @param numIndices - Number of indices to read sequentially from index buffer.
1238 /// @param indexOffset - Starting index into index buffer.
1239 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1240 void SwrDrawIndexed(
1242 PRIMITIVE_TOPOLOGY topology
,
1243 uint32_t numIndices
,
1244 uint32_t indexOffset
,
1248 DrawIndexedInstance(hContext
, topology
, numIndices
, indexOffset
, baseVertex
);
1251 //////////////////////////////////////////////////////////////////////////
1252 /// @brief SwrDrawIndexedInstanced
1253 /// @param hContext - Handle passed back from SwrCreateContext
1254 /// @param topology - Specifies topology for draw.
1255 /// @param numIndices - Number of indices to read sequentially from index buffer.
1256 /// @param numInstances - Number of instances to render.
1257 /// @param indexOffset - Starting index into index buffer.
1258 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1259 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1260 void SwrDrawIndexedInstanced(
1262 PRIMITIVE_TOPOLOGY topology
,
1263 uint32_t numIndices
,
1264 uint32_t numInstances
,
1265 uint32_t indexOffset
,
1267 uint32_t startInstance
)
1269 DrawIndexedInstance(hContext
, topology
, numIndices
, indexOffset
, baseVertex
, numInstances
, startInstance
);
1272 //////////////////////////////////////////////////////////////////////////
1273 /// @brief SwrInvalidateTiles
1274 /// @param hContext - Handle passed back from SwrCreateContext
1275 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate.
1276 void SwrInvalidateTiles(
1278 uint32_t attachmentMask
)
1285 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1286 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1288 pDC
->FeWork
.type
= DISCARDINVALIDATETILES
;
1289 pDC
->FeWork
.pfnWork
= ProcessDiscardInvalidateTiles
;
1290 pDC
->FeWork
.desc
.discardInvalidateTiles
.attachmentMask
= attachmentMask
;
1291 memset(&pDC
->FeWork
.desc
.discardInvalidateTiles
.rect
, 0, sizeof(SWR_RECT
));
1292 pDC
->FeWork
.desc
.discardInvalidateTiles
.newTileState
= SWR_TILE_INVALID
;
1293 pDC
->FeWork
.desc
.discardInvalidateTiles
.createNewTiles
= false;
1294 pDC
->FeWork
.desc
.discardInvalidateTiles
.fullTilesOnly
= false;
1297 QueueDraw(pContext
);
1300 //////////////////////////////////////////////////////////////////////////
1301 /// @brief SwrDiscardRect
1302 /// @param hContext - Handle passed back from SwrCreateContext
1303 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
1304 /// @param rect - if rect is all zeros, the entire attachment surface will be discarded
1305 void SwrDiscardRect(
1307 uint32_t attachmentMask
,
1315 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1316 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1318 // Queue discard/invalidate-tiles work for the given rect
1319 pDC
->FeWork
.type
= DISCARDINVALIDATETILES
;
1320 pDC
->FeWork
.pfnWork
= ProcessDiscardInvalidateTiles
;
1321 pDC
->FeWork
.desc
.discardInvalidateTiles
.attachmentMask
= attachmentMask
;
1322 pDC
->FeWork
.desc
.discardInvalidateTiles
.rect
= rect
;
1323 pDC
->FeWork
.desc
.discardInvalidateTiles
.newTileState
= SWR_TILE_RESOLVED
;
1324 pDC
->FeWork
.desc
.discardInvalidateTiles
.createNewTiles
= true;
1325 pDC
->FeWork
.desc
.discardInvalidateTiles
.fullTilesOnly
= true;
1328 QueueDraw(pContext
);
1331 //////////////////////////////////////////////////////////////////////////
1332 /// @brief SwrDispatch
1333 /// @param hContext - Handle passed back from SwrCreateContext
1334 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
1335 /// @param threadGroupCountY - Number of thread groups dispatched in Y direction
1336 /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
1339 uint32_t threadGroupCountX
,
1340 uint32_t threadGroupCountY
,
1341 uint32_t threadGroupCountZ
)
1348 RDTSC_START(APIDispatch
);
1349 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1350 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1352 pDC
->isCompute
= true; // This is a compute context.
1354 COMPUTE_DESC
* pTaskData
= (COMPUTE_DESC
*)pDC
->pArena
->AllocAligned(sizeof(COMPUTE_DESC
), 64);
1356 pTaskData
->threadGroupCountX
= threadGroupCountX
;
1357 pTaskData
->threadGroupCountY
= threadGroupCountY
;
1358 pTaskData
->threadGroupCountZ
= threadGroupCountZ
;
1360 uint32_t totalThreadGroups
= threadGroupCountX
* threadGroupCountY
* threadGroupCountZ
;
1361 uint32_t dcIndex
= pDC
->drawId
% KNOB_MAX_DRAWS_IN_FLIGHT
;
1362 pDC
->pDispatch
= &pContext
->pDispatchQueueArray
[dcIndex
];
1363 pDC
->pDispatch
->initialize(totalThreadGroups
, pTaskData
);
1365 QueueDispatch(pContext
);
1366 RDTSC_STOP(APIDispatch
, threadGroupCountX
* threadGroupCountY
* threadGroupCountZ
, 0);
1369 // Deswizzles, converts and stores current contents of the hot tiles to surface
1370 // described by pState
1373 SWR_RENDERTARGET_ATTACHMENT attachment
,
1374 SWR_TILE_STATE postStoreTileState
)
1381 RDTSC_START(APIStoreTiles
);
1383 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1384 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1386 SetupMacroTileScissors(pDC
);
1388 pDC
->FeWork
.type
= STORETILES
;
1389 pDC
->FeWork
.pfnWork
= ProcessStoreTiles
;
1390 pDC
->FeWork
.desc
.storeTiles
.attachment
= attachment
;
1391 pDC
->FeWork
.desc
.storeTiles
.postStoreTileState
= postStoreTileState
;
1394 QueueDraw(pContext
);
1396 RDTSC_STOP(APIStoreTiles
, 0, 0);
1399 void SwrClearRenderTarget(
1402 const float clearColor
[4],
1411 RDTSC_START(APIClearRenderTarget
);
1413 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1415 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1417 SetupMacroTileScissors(pDC
);
1420 flags
.mask
= clearMask
;
1422 pDC
->FeWork
.type
= CLEAR
;
1423 pDC
->FeWork
.pfnWork
= ProcessClear
;
1424 pDC
->FeWork
.desc
.clear
.flags
= flags
;
1425 pDC
->FeWork
.desc
.clear
.clearDepth
= z
;
1426 pDC
->FeWork
.desc
.clear
.clearRTColor
[0] = clearColor
[0];
1427 pDC
->FeWork
.desc
.clear
.clearRTColor
[1] = clearColor
[1];
1428 pDC
->FeWork
.desc
.clear
.clearRTColor
[2] = clearColor
[2];
1429 pDC
->FeWork
.desc
.clear
.clearRTColor
[3] = clearColor
[3];
1430 pDC
->FeWork
.desc
.clear
.clearStencil
= stencil
;
1433 QueueDraw(pContext
);
1435 RDTSC_STOP(APIClearRenderTarget
, 0, pDC
->drawId
);
1438 //////////////////////////////////////////////////////////////////////////
1439 /// @brief Returns a pointer to the private context state for the current
1440 /// draw operation. This is used for external components such as the
1442 /// SWR is responsible for the allocation of the private context state.
1443 /// @param hContext - Handle passed back from SwrCreateContext
1444 VOID
* SwrGetPrivateContextState(
1447 SWR_CONTEXT
* pContext
= GetContext(hContext
);
1448 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1449 DRAW_STATE
* pState
= pDC
->pState
;
1451 if (pState
->pPrivateState
== nullptr)
1453 pState
->pPrivateState
= pState
->pArena
->AllocAligned(pContext
->privateStateSize
, KNOB_SIMD_WIDTH
*sizeof(float));
1456 return pState
->pPrivateState
;
1459 //////////////////////////////////////////////////////////////////////////
1460 /// @brief Clients can use this to allocate memory for draw/dispatch
1461 /// operations. The memory will automatically be freed once operation
1462 /// has completed. Client can use this to allocate binding tables,
1463 /// etc. needed for shader execution.
1464 /// @param hContext - Handle passed back from SwrCreateContext
1465 /// @param size - Size of allocation
1466 /// @param align - Alignment needed for allocation.
1467 VOID
* SwrAllocDrawContextMemory(
1472 SWR_CONTEXT
* pContext
= GetContext(hContext
);
1473 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1475 return pDC
->pState
->pArena
->AllocAligned(size
, align
);
1478 //////////////////////////////////////////////////////////////////////////
1479 /// @brief Returns pointer to SWR stats.
1480 /// @note The counters are atomically incremented by multiple threads.
1481 /// When calling this, you need to ensure all previous operations
1483 /// @todo If necessary, add a callback to avoid stalling the pipe to
1484 /// sample the counters.
1485 /// @param hContext - Handle passed back from SwrCreateContext
1486 /// @param pStats - SWR will fill this out for caller.
1491 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1492 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1494 pDC
->FeWork
.type
= QUERYSTATS
;
1495 pDC
->FeWork
.pfnWork
= ProcessQueryStats
;
1496 pDC
->FeWork
.desc
.queryStats
.pStats
= pStats
;
1498 // cannot execute until all previous draws have completed
1499 pDC
->dependent
= true;
1502 QueueDraw(pContext
);
1505 //////////////////////////////////////////////////////////////////////////
1506 /// @brief Enables stats counting
1507 /// @param hContext - Handle passed back from SwrCreateContext
1508 /// @param enable - If true then counts are incremented.
1509 void SwrEnableStats(
1513 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1514 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1516 pDC
->pState
->state
.enableStats
= enable
;
1519 //////////////////////////////////////////////////////////////////////////
1520 /// @brief Mark end of frame - used for performance profiling
1521 /// @param hContext - Handle passed back from SwrCreateContext
1522 void SWR_API
SwrEndFrame(
1526 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1527 pContext
->frameCount
++;