1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief API implementation
27 ******************************************************************************/
34 #include "core/backend.h"
35 #include "core/context.h"
36 #include "core/frontend.h"
37 #include "core/rasterizer.h"
38 #include "core/rdtsc_core.h"
39 #include "core/threads.h"
40 #include "core/tilemgr.h"
41 #include "core/clip.h"
43 #include "common/simdintrin.h"
44 #include "common/os.h"
46 void SetupDefaultState(SWR_CONTEXT
*pContext
);
48 //////////////////////////////////////////////////////////////////////////
49 /// @brief Create SWR Context.
50 /// @param pCreateInfo - pointer to creation info.
/// @return The new context as an opaque HANDLE (the SWR_CONTEXT pointer).
/// @note Also writes back into pCreateInfo: contextSaveSize always, and
///       pBucketMgr when KNOB_ENABLE_RDTSC is defined.
51 HANDLE
SwrCreateContext(
52 SWR_CREATECONTEXT_INFO
* pCreateInfo
)
// The context lives in aligned storage: allocate it, zero it, then
// construct the SWR_CONTEXT in place.
57 void* pContextMem
= _aligned_malloc(sizeof(SWR_CONTEXT
), KNOB_SIMD_WIDTH
* 4);
58 memset(pContextMem
, 0, sizeof(SWR_CONTEXT
));
59 SWR_CONTEXT
*pContext
= new (pContextMem
) SWR_CONTEXT();
61 pContext
->driverType
= pCreateInfo
->driver
;
62 pContext
->privateStateSize
= pCreateInfo
->privateStateSize
;
// Both rings (draw contexts and draw states) hold KNOB_MAX_DRAWS_IN_FLIGHT entries.
64 pContext
->dcRing
.Init(KNOB_MAX_DRAWS_IN_FLIGHT
);
65 pContext
->dsRing
.Init(KNOB_MAX_DRAWS_IN_FLIGHT
);
// Give every ring entry its own arena; each draw context additionally gets
// a tile manager built on that arena and a dispatch queue.
67 for (uint32_t dc
= 0; dc
< KNOB_MAX_DRAWS_IN_FLIGHT
; ++dc
)
69 pContext
->dcRing
[dc
].pArena
= new Arena();
70 pContext
->dcRing
[dc
].pTileMgr
= new MacroTileMgr(*(pContext
->dcRing
[dc
].pArena
));
71 pContext
->dcRing
[dc
].pDispatch
= new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen.
73 pContext
->dsRing
[dc
].pArena
= new Arena();
// Multi-threaded setup: zero and construct the wait lock and the condition
// variable in place, then create the thread pool.
76 if (!KNOB_SINGLE_THREADED
)
78 memset(&pContext
->WaitLock
, 0, sizeof(pContext
->WaitLock
));
79 memset(&pContext
->FifosNotEmpty
, 0, sizeof(pContext
->FifosNotEmpty
));
80 new (&pContext
->WaitLock
) std::mutex();
81 new (&pContext
->FifosNotEmpty
) std::condition_variable();
83 CreateThreadPool(pContext
, &pContext
->threadPool
);
86 // Calling createThreadPool() above can set SINGLE_THREADED
87 if (KNOB_SINGLE_THREADED
)
89 pContext
->NumWorkerThreads
= 1;
92 // Allocate scratch space for workers.
93 ///@note We could lazily allocate this but it's a rather small amount of memory.
94 for (uint32_t i
= 0; i
< pContext
->NumWorkerThreads
; ++i
)
96 ///@todo Use numa API for allocations using numa information from thread data (if exists).
// 32 KB per worker, aligned like the context itself.
97 pContext
->pScratch
[i
] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH
* 4);
100 // State setup AFTER context is fully initialized
101 SetupDefaultState(pContext
);
103 // initialize hot tile manager
104 pContext
->pHotTileMgr
= new HotTileMgr();
106 // initialize function pointer tables
107 InitClearTilesTable();
109 // install the caller-supplied load/store/clear tile callbacks
110 pContext
->pfnLoadTile
= pCreateInfo
->pfnLoadTile
;
111 pContext
->pfnStoreTile
= pCreateInfo
->pfnStoreTile
;
112 pContext
->pfnClearTile
= pCreateInfo
->pfnClearTile
;
114 // pass pointer to bucket manager back to caller
115 #ifdef KNOB_ENABLE_RDTSC
116 pCreateInfo
->pBucketMgr
= &gBucketMgr
;
// Tell the caller how large a buffer SwrSaveState / SwrRestoreState copy.
119 pCreateInfo
->contextSaveSize
= sizeof(API_STATE
);
121 return (HANDLE
)pContext
;
124 void SwrDestroyContext(HANDLE hContext
)
126 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
127 DestroyThreadPool(pContext
, &pContext
->threadPool
);
130 for (uint32_t i
= 0; i
< KNOB_MAX_DRAWS_IN_FLIGHT
; ++i
)
132 delete pContext
->dcRing
[i
].pArena
;
133 delete pContext
->dsRing
[i
].pArena
;
134 delete(pContext
->dcRing
[i
].pTileMgr
);
135 delete(pContext
->dcRing
[i
].pDispatch
);
138 // Free scratch space.
139 for (uint32_t i
= 0; i
< pContext
->NumWorkerThreads
; ++i
)
141 _aligned_free(pContext
->pScratch
[i
]);
144 delete(pContext
->pHotTileMgr
);
146 pContext
->~SWR_CONTEXT();
147 _aligned_free((SWR_CONTEXT
*)hContext
);
150 void CopyState(DRAW_STATE
& dst
, const DRAW_STATE
& src
)
152 memcpy(&dst
.state
, &src
.state
, sizeof(API_STATE
));
155 void WakeAllThreads(SWR_CONTEXT
*pContext
)
157 pContext
->FifosNotEmpty
.notify_all();
// Submits the context's current draw context (pCurDrawContext).
//   - Sets its threadsDone counter.
//   - Enqueues it on dcRing while holding WaitLock.
//   - If KNOB_SINGLE_THREADED, performs the work on the calling thread and
//     dequeues it; the wake-all-threads sequence further down is the other path.
//   - Finally moves pCurDrawContext to pPrevDrawContext and clears it.
// IsDraw distinguishes draws from dispatches.
// NOTE(review): the conditionals separating the draw/dispatch and
// single/multi-threaded paths are not visible in this listing — confirm
// against the original file.
160 template<bool IsDraw
>
161 void QueueWork(SWR_CONTEXT
*pContext
)
165 // Each worker thread looks at a DC for both FE and BE work at different times and so we
166 // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers
167 // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
168 // then moved on if all work is done.)
169 pContext
->pCurDrawContext
->threadsDone
=
170 pContext
->NumWorkerThreads
? pContext
->NumWorkerThreads
* 2 : 2;
// Second form of the counter: one count per worker (1 when there are no
// workers). NOTE(review): presumably the dispatch (non-draw) case — confirm.
174 pContext
->pCurDrawContext
->threadsDone
=
175 pContext
->NumWorkerThreads
? pContext
->NumWorkerThreads
: 1;
// Enqueue under the wait lock.
180 std::unique_lock
<std::mutex
> lock(pContext
->WaitLock
);
181 pContext
->dcRing
.Enqueue();
184 if (KNOB_SINGLE_THREADED
)
186 // flush denormals to 0
187 uint32_t mxcsr
= _mm_getcsr();
188 _mm_setcsr(mxcsr
| _MM_FLUSH_ZERO_ON
| _MM_DENORMALS_ZERO_ON
);
// Draw work: run the front end, then the back end, for this draw id as worker 0.
192 static TileSet lockedTiles
;
193 uint64_t curDraw
[2] = { pContext
->pCurDrawContext
->drawId
, pContext
->pCurDrawContext
->drawId
};
194 WorkOnFifoFE(pContext
, 0, curDraw
[0], 0);
195 WorkOnFifoBE(pContext
, 0, curDraw
[1], lockedTiles
);
// Compute work: run the dispatch for this draw id as worker 0.
199 uint64_t curDispatch
= pContext
->pCurDrawContext
->drawId
;
200 WorkOnCompute(pContext
, 0, curDispatch
);
203 // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
204 if (!pContext
->dcRing
.IsEmpty())
206 pContext
->dcRing
.Dequeue();
// Wake the worker threads; the wake-up is bracketed by RDTSC_START/RDTSC_STOP.
214 RDTSC_START(APIDrawWakeAllThreads
);
215 WakeAllThreads(pContext
);
216 RDTSC_STOP(APIDrawWakeAllThreads
, 1, 0);
219 // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
220 pContext
->pPrevDrawContext
= pContext
->pCurDrawContext
;
221 pContext
->pCurDrawContext
= nullptr;
224 INLINE
void QueueDraw(SWR_CONTEXT
* pContext
)
226 QueueWork
<true>(pContext
);
229 INLINE
void QueueDispatch(SWR_CONTEXT
* pContext
)
231 QueueWork
<false>(pContext
);
// Returns the draw context currently being recorded for pContext.
// If pCurDrawContext is null, a new entry is taken at the head of dcRing
// (after looping while the ring is full), bound to an entry of the
// draw-state ring, and reset.
//   pContext    - context to fetch the draw context for
//   isSplitDraw - true when this draw continues the previous one; the new
//                 draw context then shares the previous draw's state pointer
// NOTE(review): the else branches and the body of the wait loop are not
// visible in this listing — confirm against the original file.
234 DRAW_CONTEXT
* GetDrawContext(SWR_CONTEXT
*pContext
, bool isSplitDraw
= false)
236 RDTSC_START(APIGetDrawContext
);
237 // If current draw context is null then need to obtain a new draw context to use from ring.
238 if (pContext
->pCurDrawContext
== nullptr)
240 // Need to wait for a free entry.
241 while (pContext
->dcRing
.IsFull())
// The ring head, modulo the ring size, selects the slot.
246 uint32_t dcIndex
= pContext
->dcRing
.GetHead() % KNOB_MAX_DRAWS_IN_FLIGHT
;
248 DRAW_CONTEXT
* pCurDrawContext
= &pContext
->dcRing
[dcIndex
];
249 pContext
->pCurDrawContext
= pCurDrawContext
;
251 // Assign next available entry in DS ring to this DC.
252 uint32_t dsIndex
= pContext
->curStateId
% KNOB_MAX_DRAWS_IN_FLIGHT
;
253 pCurDrawContext
->pState
= &pContext
->dsRing
[dsIndex
];
255 Arena
& stateArena
= *(pCurDrawContext
->pState
->pArena
);
257 // Copy previous state to current state.
258 if (pContext
->pPrevDrawContext
)
260 DRAW_CONTEXT
* pPrevDrawContext
= pContext
->pPrevDrawContext
;
262 // If we're splitting our draw then we can just use the same state from the previous
263 // draw. In this case, we won't increment the DS ring index so the next non-split
264 // draw can receive the state.
265 if (isSplitDraw
== false)
267 CopyState(*pCurDrawContext
->pState
, *pPrevDrawContext
->pState
);
269 stateArena
.Reset(true); // Reset memory.
270 pCurDrawContext
->pState
->pPrivateState
= nullptr;
272 pContext
->curStateId
++; // Progress state ring index forward.
276 // If it's a split draw then just copy the state pointer over
277 // since it's the same draw.
278 pCurDrawContext
->pState
= pPrevDrawContext
->pState
;
// No previous draw context: reset the state arena and advance the state ring index.
283 stateArena
.Reset(); // Reset memory.
284 pContext
->curStateId
++; // Progress state ring index forward.
// Reset the per-draw bookkeeping of the new draw context.
287 pCurDrawContext
->dependency
= 0;
288 pCurDrawContext
->pArena
->Reset();
289 pCurDrawContext
->pContext
= pContext
;
290 pCurDrawContext
->isCompute
= false; // Dispatch has to set this to true.
292 pCurDrawContext
->doneFE
= false;
293 pCurDrawContext
->FeLock
= 0;
294 pCurDrawContext
->threadsDone
= 0;
296 pCurDrawContext
->pTileMgr
->initialize();
298 // Assign unique drawId for this DC
299 pCurDrawContext
->drawId
= pContext
->dcRing
.GetHead();
// A draw context already exists here, so a split draw is not expected.
303 SWR_ASSERT(isSplitDraw
== false, "Split draw should only be used when obtaining a new DC");
306 RDTSC_STOP(APIGetDrawContext
, 0, 0);
307 return pContext
->pCurDrawContext
;
310 API_STATE
* GetDrawState(SWR_CONTEXT
*pContext
)
312 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
313 SWR_ASSERT(pDC
->pState
!= nullptr);
315 return &pDC
->pState
->state
;
// Copies the current API state (as returned by GetDrawState) into the
// caller's buffer pOutputStateBlock.
// NOTE(review): the hContext and memSize parameter lines are not visible in
// this listing; both are used in the body below.
318 void SWR_API
SwrSaveState(
320 void* pOutputStateBlock
,
323 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
324 auto pSrc
= GetDrawState(pContext
);
// The destination must be non-null and at least as large as the state being copied.
325 SWR_ASSERT(pOutputStateBlock
&& memSize
>= sizeof(*pSrc
));
327 memcpy(pOutputStateBlock
, pSrc
, sizeof(*pSrc
));
// Overwrites the current API state (as returned by GetDrawState) with the
// contents of the caller's buffer pStateBlock.
// NOTE(review): the hContext and memSize parameter lines are not visible in
// this listing; both are used in the body below.
330 void SWR_API
SwrRestoreState(
332 const void* pStateBlock
,
335 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
336 auto pDst
= GetDrawState(pContext
);
// The source must be non-null and at least as large as the state being overwritten.
337 SWR_ASSERT(pStateBlock
&& memSize
>= sizeof(*pDst
));
339 memcpy(pDst
, pStateBlock
, sizeof(*pDst
));
342 void SetupDefaultState(SWR_CONTEXT
*pContext
)
344 API_STATE
* pState
= GetDrawState(pContext
);
346 pState
->rastState
.cullMode
= SWR_CULLMODE_NONE
;
347 pState
->rastState
.frontWinding
= SWR_FRONTWINDING_CCW
;
350 static INLINE SWR_CONTEXT
* GetContext(HANDLE hContext
)
352 return (SWR_CONTEXT
*)hContext
;
// Fills the current draw context's front-end work item with a SYNC request:
// work function ProcessSync, the caller's callback and its three user-data
// values. The draw context is made dependent on the preceding draw id.
//   hContext  - handle passed back from SwrCreateContext
//   pfnFunc   - callback stored in the sync descriptor; must not be null
//   userData, userData2, userData3 - values stored alongside the callback
// NOTE(review): the fused numbering jumps from 372 to 377, so statements
// between the dependency assignment and RDTSC_STOP may be missing here.
355 void SwrSync(HANDLE hContext
, PFN_CALLBACK_FUNC pfnFunc
, uint64_t userData
, uint64_t userData2
, uint64_t userData3
)
357 RDTSC_START(APISync
);
359 SWR_ASSERT(pfnFunc
!= nullptr);
361 SWR_CONTEXT
*pContext
= GetContext(hContext
);
362 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
364 pDC
->FeWork
.type
= SYNC
;
365 pDC
->FeWork
.pfnWork
= ProcessSync
;
366 pDC
->FeWork
.desc
.sync
.pfnCallbackFunc
= pfnFunc
;
367 pDC
->FeWork
.desc
.sync
.userData
= userData
;
368 pDC
->FeWork
.desc
.sync
.userData2
= userData2
;
369 pDC
->FeWork
.desc
.sync
.userData3
= userData3
;
371 // cannot execute until all previous draws have completed
372 pDC
->dependency
= pDC
->drawId
- 1;
377 RDTSC_STOP(APISync
, 1, 0);
// Loops until the context's draw-context ring (dcRing) reports empty.
// The wait is bracketed by RDTSC_START/RDTSC_STOP(APIWaitForIdle).
//   hContext - handle passed back from SwrCreateContext
// NOTE(review): the loop body is not visible in this listing.
380 void SwrWaitForIdle(HANDLE hContext
)
382 SWR_CONTEXT
*pContext
= GetContext(hContext
);
384 RDTSC_START(APIWaitForIdle
);
386 while (!pContext
->dcRing
.IsEmpty())
391 RDTSC_STOP(APIWaitForIdle
, 1, 0);
// Copies numBuffers vertex buffer descriptions into the current draw state.
// Each entry is stored at the slot named by its own `index` field, not at
// its position in pVertexBuffers.
// NOTE(review): the hContext and numBuffers parameter lines are not visible
// in this listing; pVB->index is not range-checked in the visible code.
394 void SwrSetVertexBuffers(
397 const SWR_VERTEX_BUFFER_STATE
* pVertexBuffers
)
399 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
401 for (uint32_t i
= 0; i
< numBuffers
; ++i
)
403 const SWR_VERTEX_BUFFER_STATE
*pVB
= &pVertexBuffers
[i
];
404 pState
->vertexBuffers
[pVB
->index
] = *pVB
;
408 void SwrSetIndexBuffer(
410 const SWR_INDEX_BUFFER_STATE
* pIndexBuffer
)
412 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
414 pState
->indexBuffer
= *pIndexBuffer
;
417 void SwrSetFetchFunc(
419 PFN_FETCH_FUNC pfnFetchFunc
)
421 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
423 pState
->pfnFetchFunc
= pfnFetchFunc
;
428 PFN_SO_FUNC pfnSoFunc
,
429 uint32_t streamIndex
)
431 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
433 SWR_ASSERT(streamIndex
< MAX_SO_STREAMS
);
435 pState
->pfnSoFunc
[streamIndex
] = pfnSoFunc
;
440 SWR_STREAMOUT_STATE
* pSoState
)
442 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
444 pState
->soState
= *pSoState
;
// Copies one stream-output buffer description into slot `slot` of the
// current draw state. Only slots 0..3 are accepted (asserted below).
// NOTE(review): the hContext and slot parameter lines are not visible in
// this listing.
447 void SwrSetSoBuffers(
449 SWR_STREAMOUT_BUFFER
* pSoBuffer
,
452 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
454 SWR_ASSERT((slot
< 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot
);
456 pState
->soBuffer
[slot
] = *pSoBuffer
;
459 void SwrSetVertexFunc(
461 PFN_VERTEX_FUNC pfnVertexFunc
)
463 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
465 pState
->pfnVertexFunc
= pfnVertexFunc
;
468 void SwrSetFrontendState(
470 SWR_FRONTEND_STATE
*pFEState
)
472 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
473 pState
->frontendState
= *pFEState
;
478 SWR_GS_STATE
*pGSState
)
480 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
481 pState
->gsState
= *pGSState
;
486 PFN_GS_FUNC pfnGsFunc
)
488 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
489 pState
->pfnGsFunc
= pfnGsFunc
;
494 PFN_CS_FUNC pfnCsFunc
,
495 uint32_t totalThreadsInGroup
)
497 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
498 pState
->pfnCsFunc
= pfnCsFunc
;
499 pState
->totalThreadsInGroup
= totalThreadsInGroup
;
504 SWR_TS_STATE
*pState
)
506 API_STATE
* pApiState
= GetDrawState(GetContext(hContext
));
507 pApiState
->tsState
= *pState
;
514 API_STATE
* pApiState
= GetDrawState(GetContext(hContext
));
515 pApiState
->pfnHsFunc
= pfnFunc
;
522 API_STATE
* pApiState
= GetDrawState(GetContext(hContext
));
523 pApiState
->pfnDsFunc
= pfnFunc
;
526 void SwrSetDepthStencilState(
528 SWR_DEPTH_STENCIL_STATE
*pDSState
)
530 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
532 pState
->depthStencilState
= *pDSState
;
535 void SwrSetBackendState(
537 SWR_BACKEND_STATE
*pBEState
)
539 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
541 pState
->backendState
= *pBEState
;
544 void SwrSetPixelShaderState(
546 SWR_PS_STATE
*pPSState
)
548 API_STATE
*pState
= GetDrawState(GetContext(hContext
));
549 pState
->psState
= *pPSState
;
552 void SwrSetBlendState(
554 SWR_BLEND_STATE
*pBlendState
)
556 API_STATE
*pState
= GetDrawState(GetContext(hContext
));
557 memcpy(&pState
->blendState
, pBlendState
, sizeof(SWR_BLEND_STATE
));
560 void SwrSetBlendFunc(
562 uint32_t renderTarget
,
563 PFN_BLEND_JIT_FUNC pfnBlendFunc
)
565 SWR_ASSERT(renderTarget
< SWR_NUM_RENDERTARGETS
);
566 API_STATE
*pState
= GetDrawState(GetContext(hContext
));
567 pState
->pfnBlendFunc
[renderTarget
] = pfnBlendFunc
;
575 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
577 static const uint8_t IDENTITY_MAP
[] =
579 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
580 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
582 static_assert(sizeof(IDENTITY_MAP
) == sizeof(pState
->linkageMap
),
583 "Update for new value of MAX_ATTRIBUTES");
585 pState
->linkageMask
= mask
;
586 pState
->linkageCount
= _mm_popcnt_u32(mask
);
592 memcpy(pState
->linkageMap
, pMap
, pState
->linkageCount
);
595 // update guardband multipliers for the viewport
596 void updateGuardband(API_STATE
*pState
)
598 // guardband center is viewport center
599 pState
->gbState
.left
= KNOB_GUARDBAND_WIDTH
/ pState
->vp
[0].width
;
600 pState
->gbState
.right
= KNOB_GUARDBAND_WIDTH
/ pState
->vp
[0].width
;
601 pState
->gbState
.top
= KNOB_GUARDBAND_HEIGHT
/ pState
->vp
[0].height
;
602 pState
->gbState
.bottom
= KNOB_GUARDBAND_HEIGHT
/ pState
->vp
[0].height
;
605 void SwrSetRastState(
607 const SWR_RASTSTATE
*pRastState
)
609 SWR_CONTEXT
*pContext
= GetContext(hContext
);
610 API_STATE
* pState
= GetDrawState(pContext
);
612 memcpy(&pState
->rastState
, pRastState
, sizeof(SWR_RASTSTATE
));
615 void SwrSetViewports(
617 uint32_t numViewports
,
618 const SWR_VIEWPORT
* pViewports
,
619 const SWR_VIEWPORT_MATRIX
* pMatrices
)
621 SWR_ASSERT(numViewports
<= KNOB_NUM_VIEWPORTS_SCISSORS
,
622 "Invalid number of viewports.");
624 SWR_CONTEXT
*pContext
= GetContext(hContext
);
625 API_STATE
* pState
= GetDrawState(pContext
);
627 memcpy(&pState
->vp
[0], pViewports
, sizeof(SWR_VIEWPORT
) * numViewports
);
629 if (pMatrices
!= nullptr)
631 memcpy(&pState
->vpMatrix
[0], pMatrices
, sizeof(SWR_VIEWPORT_MATRIX
) * numViewports
);
635 // Compute default viewport transform.
636 for (uint32_t i
= 0; i
< numViewports
; ++i
)
638 if (pContext
->driverType
== DX
)
640 pState
->vpMatrix
[i
].m00
= pState
->vp
[i
].width
/ 2.0f
;
641 pState
->vpMatrix
[i
].m11
= -pState
->vp
[i
].height
/ 2.0f
;
642 pState
->vpMatrix
[i
].m22
= pState
->vp
[i
].maxZ
- pState
->vp
[i
].minZ
;
643 pState
->vpMatrix
[i
].m30
= pState
->vp
[i
].x
+ pState
->vpMatrix
[i
].m00
;
644 pState
->vpMatrix
[i
].m31
= pState
->vp
[i
].y
- pState
->vpMatrix
[i
].m11
;
645 pState
->vpMatrix
[i
].m32
= pState
->vp
[i
].minZ
;
649 // Standard, with the exception that Y is inverted.
650 pState
->vpMatrix
[i
].m00
= (pState
->vp
[i
].width
- pState
->vp
[i
].x
) / 2.0f
;
651 pState
->vpMatrix
[i
].m11
= (pState
->vp
[i
].y
- pState
->vp
[i
].height
) / 2.0f
;
652 pState
->vpMatrix
[i
].m22
= (pState
->vp
[i
].maxZ
- pState
->vp
[i
].minZ
) / 2.0f
;
653 pState
->vpMatrix
[i
].m30
= pState
->vp
[i
].x
+ pState
->vpMatrix
[i
].m00
;
654 pState
->vpMatrix
[i
].m31
= pState
->vp
[i
].height
+ pState
->vpMatrix
[i
].m11
;
655 pState
->vpMatrix
[i
].m32
= pState
->vp
[i
].minZ
+ pState
->vpMatrix
[i
].m22
;
657 // Now that the matrix is calculated, clip the view coords to screen size.
658 // OpenGL allows for -ve x,y in the viewport.
659 pState
->vp
[i
].x
= std::max(pState
->vp
[i
].x
, 0.0f
);
660 pState
->vp
[i
].y
= std::max(pState
->vp
[i
].y
, 0.0f
);
665 updateGuardband(pState
);
668 void SwrSetScissorRects(
670 uint32_t numScissors
,
671 const BBOX
* pScissors
)
673 SWR_ASSERT(numScissors
<= KNOB_NUM_VIEWPORTS_SCISSORS
,
674 "Invalid number of scissor rects.");
676 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
677 memcpy(&pState
->scissorRects
[0], pScissors
, numScissors
* sizeof(BBOX
));
680 void SetupMacroTileScissors(DRAW_CONTEXT
*pDC
)
682 API_STATE
*pState
= &pDC
->pState
->state
;
683 uint32_t left
, right
, top
, bottom
;
685 // Set up scissor dimensions based on scissor or viewport
686 if (pState
->rastState
.scissorEnable
)
688 // scissor rect right/bottom edge are exclusive, core expects scissor dimensions to be inclusive, so subtract one pixel from right/bottom edges
689 left
= pState
->scissorRects
[0].left
;
690 right
= pState
->scissorRects
[0].right
;
691 top
= pState
->scissorRects
[0].top
;
692 bottom
= pState
->scissorRects
[0].bottom
;
696 left
= (int32_t)pState
->vp
[0].x
;
697 right
= (int32_t)pState
->vp
[0].x
+ (int32_t)pState
->vp
[0].width
;
698 top
= (int32_t)pState
->vp
[0].y
;
699 bottom
= (int32_t)pState
->vp
[0].y
+ (int32_t)pState
->vp
[0].height
;
702 right
= std::min
<uint32_t>(right
, KNOB_MAX_SCISSOR_X
);
703 bottom
= std::min
<uint32_t>(bottom
, KNOB_MAX_SCISSOR_Y
);
705 if (left
> KNOB_MAX_SCISSOR_X
|| top
> KNOB_MAX_SCISSOR_Y
)
707 pState
->scissorInFixedPoint
.left
= 0;
708 pState
->scissorInFixedPoint
.right
= 0;
709 pState
->scissorInFixedPoint
.top
= 0;
710 pState
->scissorInFixedPoint
.bottom
= 0;
714 pState
->scissorInFixedPoint
.left
= left
* FIXED_POINT_SCALE
;
715 pState
->scissorInFixedPoint
.right
= right
* FIXED_POINT_SCALE
- 1;
716 pState
->scissorInFixedPoint
.top
= top
* FIXED_POINT_SCALE
;
717 pState
->scissorInFixedPoint
.bottom
= bottom
* FIXED_POINT_SCALE
- 1;
720 // templated backend function tables
721 extern PFN_BACKEND_FUNC gBackendNullPs
[SWR_MULTISAMPLE_TYPE_MAX
];
722 extern PFN_BACKEND_FUNC gBackendSingleSample
[2][2];
723 extern PFN_BACKEND_FUNC gBackendPixelRateTable
[SWR_MULTISAMPLE_TYPE_MAX
][SWR_MSAA_SAMPLE_PATTERN_MAX
][SWR_INPUT_COVERAGE_MAX
][2][2];
724 extern PFN_BACKEND_FUNC gBackendSampleRateTable
[SWR_MULTISAMPLE_TYPE_MAX
][SWR_INPUT_COVERAGE_MAX
][2];
725 extern PFN_OUTPUT_MERGER gBackendOutputMergerTable
[SWR_NUM_RENDERTARGETS
+ 1][SWR_MULTISAMPLE_TYPE_MAX
];
726 extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable
[2];
727 extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable
[2];
728 extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable
[SWR_MULTISAMPLE_TYPE_MAX
][2][2][2];
// Derives the per-draw function pointers and enable flags from the draw
// state attached to pDC:
//   - backend, output-merger and barycentric functions, chosen from the
//     extern tables above using rasterizer and pixel-shader state
//   - the front-end primitive processing function (clipper or binner) for
//     the topology
//   - feAttribMask and the depth / stencil / color hot-tile enable flags
// NOTE(review): several case labels, break statements and else keywords are
// not visible in this listing — confirm control flow against the original.
729 void SetupPipeline(DRAW_CONTEXT
*pDC
)
731 DRAW_STATE
* pState
= pDC
->pState
;
732 const SWR_RASTSTATE
&rastState
= pState
->state
.rastState
;
733 const SWR_PS_STATE
&psState
= pState
->state
.psState
;
734 BACKEND_FUNCS
& backendFuncs
= pState
->backendFuncs
;
735 const uint32_t forcedSampleCount
= (rastState
.bForcedSampleCount
) ? 1 : 0;
// No pixel shader bound: use the null-PS backend for the current sample count.
738 if (psState
.pfnPixelShader
== nullptr)
740 backendFuncs
.pfnBackend
= gBackendNullPs
[pState
->state
.rastState
.sampleCount
];
741 // always need to generate I & J per sample for Z interpolation
742 backendFuncs
.pfnCalcSampleBarycentrics
= gSampleBarycentricTable
[1];
// Multisampling counts as enabled when more than one sample is configured
// or a forced sample count is set.
746 const bool bMultisampleEnable
= ((rastState
.sampleCount
> SWR_MULTISAMPLE_1X
) || rastState
.bForcedSampleCount
) ? 1 : 0;
747 const uint32_t centroid
= ((psState
.barycentricsMask
& SWR_BARYCENTRIC_CENTROID_MASK
) > 0) ? 1 : 0;
749 // currently only support 'normal' input coverage
750 SWR_ASSERT(psState
.inputCoverage
== SWR_INPUT_COVERAGE_NORMAL
||
751 psState
.inputCoverage
== SWR_INPUT_COVERAGE_NONE
);
753 SWR_BARYCENTRICS_MASK barycentricsMask
= (SWR_BARYCENTRICS_MASK
)psState
.barycentricsMask
;
755 // select backend function
756 switch(psState
.shadingRate
)
758 case SWR_SHADING_RATE_PIXEL
:
759 if(bMultisampleEnable
)
761 // always need to generate I & J per sample for Z interpolation
762 barycentricsMask
= (SWR_BARYCENTRICS_MASK
)(barycentricsMask
| SWR_BARYCENTRIC_PER_SAMPLE_MASK
);
763 backendFuncs
.pfnBackend
= gBackendPixelRateTable
[rastState
.sampleCount
][rastState
.samplePattern
][psState
.inputCoverage
][centroid
][forcedSampleCount
];
764 backendFuncs
.pfnOutputMerger
= gBackendOutputMergerTable
[psState
.numRenderTargets
][pState
->state
.blendState
.sampleCount
];
768 // always need to generate I & J per pixel for Z interpolation
769 barycentricsMask
= (SWR_BARYCENTRICS_MASK
)(barycentricsMask
| SWR_BARYCENTRIC_PER_PIXEL_MASK
);
770 backendFuncs
.pfnBackend
= gBackendSingleSample
[psState
.inputCoverage
][centroid
];
771 backendFuncs
.pfnOutputMerger
= gBackendOutputMergerTable
[psState
.numRenderTargets
][SWR_MULTISAMPLE_1X
];
774 case SWR_SHADING_RATE_SAMPLE
:
775 SWR_ASSERT(rastState
.samplePattern
== SWR_MSAA_STANDARD_PATTERN
);
776 // always need to generate I & J per sample for Z interpolation
777 barycentricsMask
= (SWR_BARYCENTRICS_MASK
)(barycentricsMask
| SWR_BARYCENTRIC_PER_SAMPLE_MASK
);
778 backendFuncs
.pfnBackend
= gBackendSampleRateTable
[rastState
.sampleCount
][psState
.inputCoverage
][centroid
];
779 backendFuncs
.pfnOutputMerger
= gBackendOutputMergerTable
[psState
.numRenderTargets
][pState
->state
.blendState
.sampleCount
];
782 SWR_ASSERT(0 && "Invalid shading rate");
786 // setup pointer to function that generates necessary barycentrics required by the PS
787 bool bBarycentrics
= (barycentricsMask
& SWR_BARYCENTRIC_PER_PIXEL_MASK
) > 0 ? 1 : 0;
788 backendFuncs
.pfnCalcPixelBarycentrics
= gPixelBarycentricTable
[bBarycentrics
];
790 bBarycentrics
= (barycentricsMask
& SWR_BARYCENTRIC_PER_SAMPLE_MASK
) > 0 ? 1 : 0;
791 backendFuncs
.pfnCalcSampleBarycentrics
= gSampleBarycentricTable
[bBarycentrics
];
793 bBarycentrics
= (barycentricsMask
& SWR_BARYCENTRIC_CENTROID_MASK
) > 0 ? 1 : 0;
794 backendFuncs
.pfnCalcCentroidBarycentrics
= gCentroidBarycentricTable
[rastState
.sampleCount
][bBarycentrics
][rastState
.samplePattern
][forcedSampleCount
];
// Pick the clipper (pfnProcessPrims) and the matching binner per topology:
// points, lines, or triangles.
797 PFN_PROCESS_PRIMS pfnBinner
;
798 switch (pState
->state
.topology
)
801 pState
->pfnProcessPrims
= ClipPoints
;
802 pfnBinner
= BinPoints
;
807 case TOP_LINE_LIST_ADJ
:
808 case TOP_LISTSTRIP_ADJ
:
809 pState
->pfnProcessPrims
= ClipLines
;
810 pfnBinner
= BinLines
;
813 pState
->pfnProcessPrims
= ClipTriangles
;
814 pfnBinner
= BinTriangles
;
818 // disable clipper if viewport transform is disabled
819 if (pState
->state
.frontendState
.vpTransformDisable
)
821 pState
->pfnProcessPrims
= pfnBinner
;
// With no pixel shader, no depth/stencil test or write, and no linked
// attributes, primitive processing is dropped and the linkage mask cleared.
824 if ((pState
->state
.psState
.pfnPixelShader
== nullptr) &&
825 (pState
->state
.depthStencilState
.depthTestEnable
== FALSE
) &&
826 (pState
->state
.depthStencilState
.depthWriteEnable
== FALSE
) &&
827 (pState
->state
.depthStencilState
.stencilTestEnable
== FALSE
) &&
828 (pState
->state
.depthStencilState
.stencilWriteEnable
== FALSE
) &&
829 (pState
->state
.linkageCount
== 0))
831 pState
->pfnProcessPrims
= nullptr;
832 pState
->state
.linkageMask
= 0;
// The same applies when stream-out state disables the rasterizer.
835 if (pState
->state
.soState
.rasterizerDisable
== true)
837 pState
->pfnProcessPrims
= nullptr;
838 pState
->state
.linkageMask
= 0;
841 // set up the frontend attrib mask
842 pState
->state
.feAttribMask
= pState
->state
.linkageMask
;
843 if (pState
->state
.soState
.soEnable
)
845 for (uint32_t i
= 0; i
< 4; ++i
)
847 pState
->state
.feAttribMask
|= pState
->state
.soState
.streamMasks
[i
];
851 // complicated logic to test for cases where we don't need backing hottile memory for a draw
852 // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled.
853 pState
->state
.depthHottileEnable
= ((!(pState
->state
.depthStencilState
.depthTestEnable
&&
854 !pState
->state
.depthStencilState
.depthWriteEnable
&&
855 pState
->state
.depthStencilState
.depthTestFunc
== ZFUNC_ALWAYS
)) &&
856 (pState
->state
.depthStencilState
.depthTestEnable
||
857 pState
->state
.depthStencilState
.depthWriteEnable
)) ? true : false;
859 pState
->state
.stencilHottileEnable
= (((!(pState
->state
.depthStencilState
.stencilTestEnable
&&
860 !pState
->state
.depthStencilState
.stencilWriteEnable
&&
861 pState
->state
.depthStencilState
.stencilTestFunc
== ZFUNC_ALWAYS
)) ||
862 // for stencil we have to check the double sided state as well
863 (!(pState
->state
.depthStencilState
.doubleSidedStencilTestEnable
&&
864 !pState
->state
.depthStencilState
.stencilWriteEnable
&&
865 pState
->state
.depthStencilState
.backfaceStencilTestFunc
== ZFUNC_ALWAYS
))) &&
866 (pState
->state
.depthStencilState
.stencilTestEnable
||
867 pState
->state
.depthStencilState
.stencilWriteEnable
)) ? true : false;
// Color hot tiles: one bit per render target, set when a pixel shader is
// bound and at least one of that target's channels is not write-disabled.
869 uint32_t numRTs
= pState
->state
.psState
.numRenderTargets
;
870 pState
->state
.colorHottileEnable
= 0;
871 if (psState
.pfnPixelShader
!= nullptr)
873 for (uint32_t rt
= 0; rt
< numRTs
; ++rt
)
875 pState
->state
.colorHottileEnable
|=
876 (!pState
->state
.blendState
.renderTarget
[rt
].writeDisableAlpha
||
877 !pState
->state
.blendState
.renderTarget
[rt
].writeDisableRed
||
878 !pState
->state
.blendState
.renderTarget
[rt
].writeDisableGreen
||
879 !pState
->state
.blendState
.renderTarget
[rt
].writeDisableBlue
) ? (1 << rt
) : 0;
884 //////////////////////////////////////////////////////////////////////////
886 /// @param pDC - Draw context to initialize for this draw.
891 // We don't need to re-setup the scissors/pipeline state again for split draw.
892 if (isSplitDraw
== false)
894 SetupMacroTileScissors(pDC
);
899 //////////////////////////////////////////////////////////////////////////
900 /// @brief We can split the draw for certain topologies for better performance.
901 /// @param totalVerts - Total vertices for draw
902 /// @param topology - Topology used for draw
// Returns the vertex limit used to split a draw into smaller draws.
// NOTE(review): the pDC and totalVerts parameter lines, several case labels
// and the return statement are not visible in this listing; both names are
// used in the body below.
903 uint32_t MaxVertsPerDraw(
906 PRIMITIVE_TOPOLOGY topology
)
908 API_STATE
& state
= pDC
->pState
->state
;
// Default: the whole draw; the cases below substitute a per-draw limit.
910 uint32_t vertsPerDraw
= totalVerts
;
// Stream-out is tested before any topology case.
// NOTE(review): the statement guarded by this test is not visible here.
912 if (state
.soState
.soEnable
)
920 case TOP_TRIANGLE_LIST
:
921 vertsPerDraw
= KNOB_MAX_PRIMS_PER_DRAW
;
924 case TOP_PATCHLIST_1
:
925 case TOP_PATCHLIST_2
:
926 case TOP_PATCHLIST_3
:
927 case TOP_PATCHLIST_4
:
928 case TOP_PATCHLIST_5
:
929 case TOP_PATCHLIST_6
:
930 case TOP_PATCHLIST_7
:
931 case TOP_PATCHLIST_8
:
932 case TOP_PATCHLIST_9
:
933 case TOP_PATCHLIST_10
:
934 case TOP_PATCHLIST_11
:
935 case TOP_PATCHLIST_12
:
936 case TOP_PATCHLIST_13
:
937 case TOP_PATCHLIST_14
:
938 case TOP_PATCHLIST_15
:
939 case TOP_PATCHLIST_16
:
940 case TOP_PATCHLIST_17
:
941 case TOP_PATCHLIST_18
:
942 case TOP_PATCHLIST_19
:
943 case TOP_PATCHLIST_20
:
944 case TOP_PATCHLIST_21
:
945 case TOP_PATCHLIST_22
:
946 case TOP_PATCHLIST_23
:
947 case TOP_PATCHLIST_24
:
948 case TOP_PATCHLIST_25
:
949 case TOP_PATCHLIST_26
:
950 case TOP_PATCHLIST_27
:
951 case TOP_PATCHLIST_28
:
952 case TOP_PATCHLIST_29
:
953 case TOP_PATCHLIST_30
:
954 case TOP_PATCHLIST_31
:
955 case TOP_PATCHLIST_32
:
// Patch lists are split only when tessellation is enabled; the control
// point count is the topology's offset from TOP_PATCHLIST_BASE.
956 if (pDC
->pState
->state
.tsState
.tsEnable
)
958 uint32_t vertsPerPrim
= topology
- TOP_PATCHLIST_BASE
;
959 vertsPerDraw
= vertsPerPrim
* KNOB_MAX_TESS_PRIMS_PER_DRAW
;
963 // The Primitive Assembly code can only handle 1 RECT at a time.
969 // We are not splitting up draws for other topologies.
976 // Recursive template used to auto-nest conditionals. Converts dynamic boolean function
977 // arguments to static template arguments.
978 template <bool... ArgsB
>
981 // Last Arg Terminator
982 static PFN_FE_WORK_FUNC
GetFunc(bool bArg
)
986 return ProcessDraw
<ArgsB
..., true>;
989 return ProcessDraw
<ArgsB
..., false>;
992 // Recursively parse args
993 template <typename
... TArgsT
>
994 static PFN_FE_WORK_FUNC
GetFunc(bool bArg
, TArgsT
... remainingArgs
)
998 return FEDrawChooser
<ArgsB
..., true>::GetFunc(remainingArgs
...);
1001 return FEDrawChooser
<ArgsB
..., false>::GetFunc(remainingArgs
...);
1005 // Selector for correct templated Draw front-end function
1007 static PFN_FE_WORK_FUNC
GetFEDrawFunc(bool IsIndexed
, bool HasTessellation
, bool HasGeometryShader
, bool HasStreamOut
, bool RasterizerEnabled
)
1009 return FEDrawChooser
<>::GetFunc(IsIndexed
, HasTessellation
, HasGeometryShader
, HasStreamOut
, RasterizerEnabled
);
1013 //////////////////////////////////////////////////////////////////////////
1014 /// @brief DrawInstanced
1015 /// @param hContext - Handle passed back from SwrCreateContext
1016 /// @param topology - Specifies topology for draw.
/// @param numVertices - How many vertices to read sequentially from vertex data (per instance).
1018 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1019 /// @param numInstances - How many instances to render.
1020 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1023 PRIMITIVE_TOPOLOGY topology
,
1024 uint32_t numVertices
,
1025 uint32_t startVertex
,
1026 uint32_t numInstances
= 1,
1027 uint32_t startInstance
= 0)
// NOTE(review): the function-name line, the first parameter and several
// statements (for example the declaration and increment of `draw`) are not
// visible in this listing — confirm against the original file.
1034 RDTSC_START(APIDraw
);
1036 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1037 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// Work out how the draw is split: vertex limit per sub-draw and the matching
// primitive count.
// NOTE(review): numVertices is uint32_t but maxVertsPerDraw and
// remainingVerts are int32_t, so counts above INT32_MAX become negative here.
1039 int32_t maxVertsPerDraw
= MaxVertsPerDraw(pDC
, numVertices
, topology
);
1040 uint32_t primsPerDraw
= GetNumPrims(topology
, maxVertsPerDraw
);
1041 int32_t remainingVerts
= numVertices
;
1043 API_STATE
*pState
= &pDC
->pState
->state
;
1044 pState
->topology
= topology
;
1045 pState
->forceFront
= false;
1047 // disable culling for points (only TOP_POINT_LIST is checked below)
1048 uint32_t oldCullMode
= pState
->rastState
.cullMode
;
1049 if (topology
== TOP_POINT_LIST
)
1051 pState
->rastState
.cullMode
= SWR_CULLMODE_NONE
;
1052 pState
->forceFront
= true;
// Emit one sub-draw per iteration until every vertex has been consumed.
1056 while (remainingVerts
)
1058 uint32_t numVertsForDraw
= (remainingVerts
< maxVertsPerDraw
) ?
1059 remainingVerts
: maxVertsPerDraw
;
// Every sub-draw after the first is a split draw. Note this pDC shadows the
// outer pDC declared above.
1061 bool isSplitDraw
= (draw
> 0) ? true : false;
1062 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
, isSplitDraw
);
1063 InitDraw(pDC
, isSplitDraw
);
1065 pDC
->FeWork
.type
= DRAW
;
1066 pDC
->FeWork
.pfnWork
= GetFEDrawFunc(
1068 pState
->tsState
.tsEnable
,
1069 pState
->gsState
.gsEnable
,
1070 pState
->soState
.soEnable
,
1071 pDC
->pState
->pfnProcessPrims
!= nullptr);
1072 pDC
->FeWork
.desc
.draw
.numVerts
= numVertsForDraw
;
1073 pDC
->FeWork
.desc
.draw
.startVertex
= startVertex
;
1074 pDC
->FeWork
.desc
.draw
.numInstances
= numInstances
;
1075 pDC
->FeWork
.desc
.draw
.startInstance
= startInstance
;
// Primitive and vertex ids continue from where the previous sub-draw ended.
1076 pDC
->FeWork
.desc
.draw
.startPrimID
= draw
* primsPerDraw
;
1077 pDC
->FeWork
.desc
.draw
.startVertexID
= draw
* maxVertsPerDraw
;
1080 QueueDraw(pContext
);
1082 remainingVerts
-= numVertsForDraw
;
1086 // restore culling state
1087 pDC
= GetDrawContext(pContext
);
1088 pDC
->pState
->state
.rastState
.cullMode
= oldCullMode
;
1090 RDTSC_STOP(APIDraw
, numVertices
* numInstances
, 0);
1093 //////////////////////////////////////////////////////////////////////////
1095 /// @param hContext - Handle passed back from SwrCreateContext
1096 /// @param topology - Specifies topology for draw.
1097 /// @param startVertex - Specifies start vertex in vertex buffer for draw.
1098 /// @param primCount - Number of vertices.
1101 PRIMITIVE_TOPOLOGY topology
,
1102 uint32_t startVertex
,
1103 uint32_t numVertices
)
1105 DrawInstanced(hContext
, topology
, numVertices
, startVertex
);
1108 //////////////////////////////////////////////////////////////////////////
1109 /// @brief SwrDrawInstanced
1110 /// @param hContext - Handle passed back from SwrCreateContext
1111 /// @param topology - Specifies topology for draw.
1112 /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
1113 /// @param numInstances - How many instances to render.
1114 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1115 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1116 void SwrDrawInstanced(
1118 PRIMITIVE_TOPOLOGY topology
,
1119 uint32_t numVertsPerInstance
,
1120 uint32_t numInstances
,
1121 uint32_t startVertex
,
1122 uint32_t startInstance
1125 DrawInstanced(hContext
, topology
, numVertsPerInstance
, startVertex
, numInstances
, startInstance
);
1128 //////////////////////////////////////////////////////////////////////////
1129 /// @brief DrawIndexedInstanced
1130 /// @param hContext - Handle passed back from SwrCreateContext
1131 /// @param topology - Specifies topology for draw.
1132 /// @param numIndices - Number of indices to read sequentially from index buffer.
1133 /// @param indexOffset - Starting index into index buffer.
1134 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1135 /// @param numInstances - Number of instances to render.
1136 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1137 void DrawIndexedInstance(
1139 PRIMITIVE_TOPOLOGY topology
,
1140 uint32_t numIndices
,
1141 uint32_t indexOffset
,
1143 uint32_t numInstances
= 1,
1144 uint32_t startInstance
= 0)
1151 RDTSC_START(APIDrawIndexed
);
1153 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1154 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1155 API_STATE
* pState
= &pDC
->pState
->state
;
1157 int32_t maxIndicesPerDraw
= MaxVertsPerDraw(pDC
, numIndices
, topology
);
1158 uint32_t primsPerDraw
= GetNumPrims(topology
, maxIndicesPerDraw
);
1159 int32_t remainingIndices
= numIndices
;
1161 uint32_t indexSize
= 0;
1162 switch (pState
->indexBuffer
.format
)
1164 case R32_UINT
: indexSize
= sizeof(uint32_t); break;
1165 case R16_UINT
: indexSize
= sizeof(uint16_t); break;
1166 case R8_UINT
: indexSize
= sizeof(uint8_t); break;
1172 uint8_t *pIB
= (uint8_t*)pState
->indexBuffer
.pIndices
;
1173 pIB
+= (uint64_t)indexOffset
* (uint64_t)indexSize
;
1175 pState
->topology
= topology
;
1176 pState
->forceFront
= false;
1178 // disable culling for points/lines
1179 uint32_t oldCullMode
= pState
->rastState
.cullMode
;
1180 if (topology
== TOP_POINT_LIST
)
1182 pState
->rastState
.cullMode
= SWR_CULLMODE_NONE
;
1183 pState
->forceFront
= true;
1186 while (remainingIndices
)
1188 uint32_t numIndicesForDraw
= (remainingIndices
< maxIndicesPerDraw
) ?
1189 remainingIndices
: maxIndicesPerDraw
;
1191 // When breaking up draw, we need to obtain new draw context for each iteration.
1192 bool isSplitDraw
= (draw
> 0) ? true : false;
1193 pDC
= GetDrawContext(pContext
, isSplitDraw
);
1194 InitDraw(pDC
, isSplitDraw
);
1196 pDC
->FeWork
.type
= DRAW
;
1197 pDC
->FeWork
.pfnWork
= GetFEDrawFunc(
1199 pState
->tsState
.tsEnable
,
1200 pState
->gsState
.gsEnable
,
1201 pState
->soState
.soEnable
,
1202 pDC
->pState
->pfnProcessPrims
!= nullptr);
1203 pDC
->FeWork
.desc
.draw
.pDC
= pDC
;
1204 pDC
->FeWork
.desc
.draw
.numIndices
= numIndicesForDraw
;
1205 pDC
->FeWork
.desc
.draw
.pIB
= (int*)pIB
;
1206 pDC
->FeWork
.desc
.draw
.type
= pDC
->pState
->state
.indexBuffer
.format
;
1208 pDC
->FeWork
.desc
.draw
.numInstances
= numInstances
;
1209 pDC
->FeWork
.desc
.draw
.startInstance
= startInstance
;
1210 pDC
->FeWork
.desc
.draw
.baseVertex
= baseVertex
;
1211 pDC
->FeWork
.desc
.draw
.startPrimID
= draw
* primsPerDraw
;
1214 QueueDraw(pContext
);
1216 pIB
+= maxIndicesPerDraw
* indexSize
;
1217 remainingIndices
-= numIndicesForDraw
;
1221 // restore culling state
1222 pDC
= GetDrawContext(pContext
);
1223 pDC
->pState
->state
.rastState
.cullMode
= oldCullMode
;
1225 RDTSC_STOP(APIDrawIndexed
, numIndices
* numInstances
, 0);
1229 //////////////////////////////////////////////////////////////////////////
1230 /// @brief DrawIndexed
1231 /// @param hContext - Handle passed back from SwrCreateContext
1232 /// @param topology - Specifies topology for draw.
1233 /// @param numIndices - Number of indices to read sequentially from index buffer.
1234 /// @param indexOffset - Starting index into index buffer.
1235 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1236 void SwrDrawIndexed(
1238 PRIMITIVE_TOPOLOGY topology
,
1239 uint32_t numIndices
,
1240 uint32_t indexOffset
,
1244 DrawIndexedInstance(hContext
, topology
, numIndices
, indexOffset
, baseVertex
);
1247 //////////////////////////////////////////////////////////////////////////
1248 /// @brief SwrDrawIndexedInstanced
1249 /// @param hContext - Handle passed back from SwrCreateContext
1250 /// @param topology - Specifies topology for draw.
1251 /// @param numIndices - Number of indices to read sequentially from index buffer.
1252 /// @param numInstances - Number of instances to render.
1253 /// @param indexOffset - Starting index into index buffer.
1254 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1255 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1256 void SwrDrawIndexedInstanced(
1258 PRIMITIVE_TOPOLOGY topology
,
1259 uint32_t numIndices
,
1260 uint32_t numInstances
,
1261 uint32_t indexOffset
,
1263 uint32_t startInstance
)
1265 DrawIndexedInstance(hContext
, topology
, numIndices
, indexOffset
, baseVertex
, numInstances
, startInstance
);
1268 //////////////////////////////////////////////////////////////////////////
1269 /// @brief SwrInvalidateTiles
1270 /// @param hContext - Handle passed back from SwrCreateContext
1271 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate.
1272 void SwrInvalidateTiles(
1274 uint32_t attachmentMask
)
1276 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
1277 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1279 pDC
->FeWork
.type
= DISCARDINVALIDATETILES
;
1280 pDC
->FeWork
.pfnWork
= ProcessDiscardInvalidateTiles
;
1281 pDC
->FeWork
.desc
.discardInvalidateTiles
.attachmentMask
= attachmentMask
;
1282 memset(&pDC
->FeWork
.desc
.discardInvalidateTiles
.rect
, 0, sizeof(SWR_RECT
));
1283 pDC
->FeWork
.desc
.discardInvalidateTiles
.newTileState
= SWR_TILE_INVALID
;
1284 pDC
->FeWork
.desc
.discardInvalidateTiles
.createNewTiles
= false;
1285 pDC
->FeWork
.desc
.discardInvalidateTiles
.fullTilesOnly
= false;
1288 QueueDraw(pContext
);
1291 //////////////////////////////////////////////////////////////////////////
1292 /// @brief SwrDiscardRect
1293 /// @param hContext - Handle passed back from SwrCreateContext
1294 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
1295 /// @param rect - if rect is all zeros, the entire attachment surface will be discarded
1296 void SwrDiscardRect(
1298 uint32_t attachmentMask
,
1301 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
1302 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1304 // Queue a load to the hottile
1305 pDC
->FeWork
.type
= DISCARDINVALIDATETILES
;
1306 pDC
->FeWork
.pfnWork
= ProcessDiscardInvalidateTiles
;
1307 pDC
->FeWork
.desc
.discardInvalidateTiles
.attachmentMask
= attachmentMask
;
1308 pDC
->FeWork
.desc
.discardInvalidateTiles
.rect
= rect
;
1309 pDC
->FeWork
.desc
.discardInvalidateTiles
.newTileState
= SWR_TILE_RESOLVED
;
1310 pDC
->FeWork
.desc
.discardInvalidateTiles
.createNewTiles
= true;
1311 pDC
->FeWork
.desc
.discardInvalidateTiles
.fullTilesOnly
= true;
1314 QueueDraw(pContext
);
1317 //////////////////////////////////////////////////////////////////////////
1318 /// @brief SwrDispatch
1319 /// @param hContext - Handle passed back from SwrCreateContext
1320 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
1321 /// @param threadGroupCountY - Number of thread groups dispatched in Y direction
1322 /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
1325 uint32_t threadGroupCountX
,
1326 uint32_t threadGroupCountY
,
1327 uint32_t threadGroupCountZ
)
1334 RDTSC_START(APIDispatch
);
1335 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
1336 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1338 pDC
->isCompute
= true; // This is a compute context.
1340 // Ensure spill fill pointers are initialized to nullptr.
1341 memset(pDC
->pSpillFill
, 0, sizeof(pDC
->pSpillFill
));
1343 COMPUTE_DESC
* pTaskData
= (COMPUTE_DESC
*)pDC
->pArena
->AllocAligned(sizeof(COMPUTE_DESC
), 64);
1345 pTaskData
->threadGroupCountX
= threadGroupCountX
;
1346 pTaskData
->threadGroupCountY
= threadGroupCountY
;
1347 pTaskData
->threadGroupCountZ
= threadGroupCountZ
;
1349 uint32_t totalThreadGroups
= threadGroupCountX
* threadGroupCountY
* threadGroupCountZ
;
1350 pDC
->pDispatch
->initialize(totalThreadGroups
, pTaskData
);
1352 QueueDispatch(pContext
);
1353 RDTSC_STOP(APIDispatch
, threadGroupCountX
* threadGroupCountY
* threadGroupCountZ
, 0);
1356 // Deswizzles, converts and stores current contents of the hot tiles to surface
1357 // described by pState
1360 SWR_RENDERTARGET_ATTACHMENT attachment
,
1361 SWR_TILE_STATE postStoreTileState
)
1363 RDTSC_START(APIStoreTiles
);
1365 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
1366 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1368 SetupMacroTileScissors(pDC
);
1370 pDC
->FeWork
.type
= STORETILES
;
1371 pDC
->FeWork
.pfnWork
= ProcessStoreTiles
;
1372 pDC
->FeWork
.desc
.storeTiles
.attachment
= attachment
;
1373 pDC
->FeWork
.desc
.storeTiles
.postStoreTileState
= postStoreTileState
;
1376 QueueDraw(pContext
);
1378 RDTSC_STOP(APIStoreTiles
, 0, 0);
// Queues a CLEAR work item on the current draw context. The visible body
// sets up the macro tile scissors, then fills FeWork.desc.clear with the
// clear mask (via flags.mask), the four clearColor components, z as the
// clear depth and stencil as the clear stencil, and calls QueueDraw.
// NOTE(review): this copy of the function is damaged. The declarations of
// hContext, clearMask, z, stencil and the local 'flags' are not present
// here, and neither are the braces; restore them from the upstream file
// rather than guessing their types.
1381 void SwrClearRenderTarget(
1384 const float clearColor
[4],
1388 RDTSC_START(APIClearRenderTarget
);
// The handle is cast directly to the context pointer here.
1390 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
1392 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1394 SetupMacroTileScissors(pDC
);
// 'flags' carries the caller's clear mask into the work descriptor.
1397 flags
.mask
= clearMask
;
// Describe the clear as front-end work handled by ProcessClear.
1399 pDC
->FeWork
.type
= CLEAR
;
1400 pDC
->FeWork
.pfnWork
= ProcessClear
;
1401 pDC
->FeWork
.desc
.clear
.flags
= flags
;
1402 pDC
->FeWork
.desc
.clear
.clearDepth
= z
;
// The four color components are copied one by one.
1403 pDC
->FeWork
.desc
.clear
.clearRTColor
[0] = clearColor
[0];
1404 pDC
->FeWork
.desc
.clear
.clearRTColor
[1] = clearColor
[1];
1405 pDC
->FeWork
.desc
.clear
.clearRTColor
[2] = clearColor
[2];
1406 pDC
->FeWork
.desc
.clear
.clearRTColor
[3] = clearColor
[3];
1407 pDC
->FeWork
.desc
.clear
.clearStencil
= stencil
;
1410 QueueDraw(pContext
);
// NOTE(review): pDC->drawId is read after the draw context has been queued;
// confirm the context is still safe to read at this point.
1412 RDTSC_STOP(APIClearRenderTarget
, 0, pDC
->drawId
);
1415 //////////////////////////////////////////////////////////////////////////
1416 /// @brief Returns a pointer to the private context state for the current
1417 /// draw operation. This is used for external componets such as the
1419 /// SWR is responsible for the allocation of the private context state.
1420 /// @param hContext - Handle passed back from SwrCreateContext
1421 VOID
* SwrGetPrivateContextState(
1424 SWR_CONTEXT
* pContext
= GetContext(hContext
);
1425 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1426 DRAW_STATE
* pState
= pDC
->pState
;
1428 if (pState
->pPrivateState
== nullptr)
1430 pState
->pPrivateState
= pState
->pArena
->AllocAligned(pContext
->privateStateSize
, KNOB_SIMD_WIDTH
*sizeof(float));
1433 return pState
->pPrivateState
;
1436 //////////////////////////////////////////////////////////////////////////
1437 /// @brief Clients can use this to allocate memory for draw/dispatch
1438 /// operations. The memory will automatically be freed once operation
1439 /// has completed. Client can use this to allocate binding tables,
1440 /// etc. needed for shader execution.
1441 /// @param hContext - Handle passed back from SwrCreateContext
1442 /// @param size - Size of allocation
1443 /// @param align - Alignment needed for allocation.
1444 VOID
* SwrAllocDrawContextMemory(
1449 SWR_CONTEXT
* pContext
= GetContext(hContext
);
1450 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1452 return pDC
->pState
->pArena
->AllocAligned(size
, align
);
1455 //////////////////////////////////////////////////////////////////////////
1456 /// @brief Returns pointer to SWR stats.
1457 /// @note The counters are atomically incremented by multiple threads.
1458 /// When calling this, you need to ensure all previous operations have completed.
1460 /// @todo If necessary, add a callback to avoid stalling the pipe to
1461 /// sample the counters.
1462 /// @param hContext - Handle passed back from SwrCreateContext
1463 /// @param pStats - SWR will fill this out for caller.
// NOTE(review): this copy of the function is damaged. The line with the
// function name and return type, the declarations of hContext and pStats,
// and the braces are not present here; restore them from the upstream file
// rather than guessing.
1468 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1469 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// The query is queued as front-end work handled by ProcessQueryStats,
// with the caller's pStats pointer stored in the descriptor.
1471 pDC
->FeWork
.type
= QUERYSTATS
;
1472 pDC
->FeWork
.pfnWork
= ProcessQueryStats
;
1473 pDC
->FeWork
.desc
.queryStats
.pStats
= pStats
;
1475 // cannot execute until all previous draws have completed
// The dependency is set to the id of the draw just before this one.
1476 pDC
->dependency
= pDC
->drawId
- 1;
1479 QueueDraw(pContext
);
1482 //////////////////////////////////////////////////////////////////////////
1483 /// @brief Enables stats counting
1484 /// @param hContext - Handle passed back from SwrCreateContext
1485 /// @param enable - If true then counts are incremented.
1486 void SwrEnableStats(
1490 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1491 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1493 pDC
->pState
->state
.enableStats
= enable
;
1496 //////////////////////////////////////////////////////////////////////////
1497 /// @brief Mark end of frame - used for performance profiling
1498 /// @param hContext - Handle passed back from SwrCreateContext
1499 void SWR_API
SwrEndFrame(