1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief API implementation
27 ******************************************************************************/
34 #include "core/backend.h"
35 #include "core/context.h"
36 #include "core/frontend.h"
37 #include "core/rasterizer.h"
38 #include "core/rdtsc_core.h"
39 #include "core/threads.h"
40 #include "core/tilemgr.h"
41 #include "core/clip.h"
43 #include "common/simdintrin.h"
44 #include "common/os.h"
46 void SetupDefaultState(SWR_CONTEXT
*pContext
);
48 //////////////////////////////////////////////////////////////////////////
49 /// @brief Create SWR Context.
50 /// @param pCreateInfo - pointer to creation info.
// Allocates and initializes an SWR_CONTEXT and returns it as an opaque HANDLE.
// Reads driver, privateStateSize and the load/store/clear tile callbacks from
// pCreateInfo, and writes contextSaveSize (plus pBucketMgr when
// KNOB_ENABLE_RDTSC is defined) back into it.
// NOTE(review): this view of the function is missing lines (braces and
// preprocessor directives at least); the notes below flag the places where
// that matters.
51 HANDLE
SwrCreateContext(
52 SWR_CREATECONTEXT_INFO
* pCreateInfo
)
// Context storage is aligned to KNOB_SIMD_WIDTH * 4, zero-filled, then
// constructed in place with placement new.
// NOTE(review): the allocation result is not checked before the memset in the
// lines visible here.
57 void* pContextMem
= _aligned_malloc(sizeof(SWR_CONTEXT
), KNOB_SIMD_WIDTH
* 4);
58 memset(pContextMem
, 0, sizeof(SWR_CONTEXT
));
59 SWR_CONTEXT
*pContext
= new (pContextMem
) SWR_CONTEXT();
61 pContext
->driverType
= pCreateInfo
->driver
;
62 pContext
->privateStateSize
= pCreateInfo
->privateStateSize
;
// Both rings are sized to KNOB_MAX_DRAWS_IN_FLIGHT entries.
64 pContext
->dcRing
.Init(KNOB_MAX_DRAWS_IN_FLIGHT
);
65 pContext
->dsRing
.Init(KNOB_MAX_DRAWS_IN_FLIGHT
);
// Every ring slot gets its own CachingArena; draw-context slots additionally
// get a MacroTileMgr (built on that arena) and a DispatchQueue.
67 for (uint32_t dc
= 0; dc
< KNOB_MAX_DRAWS_IN_FLIGHT
; ++dc
)
69 pContext
->dcRing
[dc
].pArena
= new CachingArena(pContext
->cachingArenaAllocator
);
70 pContext
->dcRing
[dc
].pTileMgr
= new MacroTileMgr(*(pContext
->dcRing
[dc
].pArena
));
71 pContext
->dcRing
[dc
].pDispatch
= new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen.
73 pContext
->dsRing
[dc
].pArena
= new CachingArena(pContext
->cachingArenaAllocator
);
// Multi-threaded setup: the wait lock and condition variable are zeroed and
// re-created in place before the worker pool is started.
76 if (!KNOB_SINGLE_THREADED
)
78 memset(&pContext
->WaitLock
, 0, sizeof(pContext
->WaitLock
));
79 memset(&pContext
->FifosNotEmpty
, 0, sizeof(pContext
->FifosNotEmpty
));
80 new (&pContext
->WaitLock
) std::mutex();
81 new (&pContext
->FifosNotEmpty
) std::condition_variable();
83 CreateThreadPool(pContext
, &pContext
->threadPool
);
86 // Calling createThreadPool() above can set SINGLE_THREADED
87 if (KNOB_SINGLE_THREADED
)
89 pContext
->NumWorkerThreads
= 1;
92 // Allocate scratch space for workers.
93 ///@note We could lazily allocate this but its rather small amount of memory.
94 for (uint32_t i
= 0; i
< pContext
->NumWorkerThreads
; ++i
)
// numaNode falls back to 0 when the thread pool has no per-thread data.
97 uint32_t numaNode
= pContext
->threadPool
.pThreadData
?
98 pContext
->threadPool
.pThreadData
[i
].numaId
: 0;
// NOTE(review): two assignments to pScratch[i] appear back to back below
// (VirtualAllocExNuma, then _aligned_malloc), and the VirtualAllocExNuma call
// has no visible final argument. Presumably these are alternative platform
// paths separated by preprocessor lines missing from this view - confirm.
99 pContext
->pScratch
[i
] = (uint8_t*)VirtualAllocExNuma(
100 GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE
),
101 MEM_RESERVE
| MEM_COMMIT
, PAGE_READWRITE
,
104 pContext
->pScratch
[i
] = (uint8_t*)_aligned_malloc(32 * sizeof(KILOBYTE
), KNOB_SIMD_WIDTH
* 4);
108 // State setup AFTER context is fully initialized
109 SetupDefaultState(pContext
);
111 // initialize hot tile manager
112 pContext
->pHotTileMgr
= new HotTileMgr();
114 // initialize function pointer tables
115 InitClearTilesTable();
117 // initialize store tiles function
118 pContext
->pfnLoadTile
= pCreateInfo
->pfnLoadTile
;
119 pContext
->pfnStoreTile
= pCreateInfo
->pfnStoreTile
;
120 pContext
->pfnClearTile
= pCreateInfo
->pfnClearTile
;
122 // pass pointer to bucket manager back to caller
// NOTE(review): the #endif matching the #ifdef below is not visible in this
// view.
123 #ifdef KNOB_ENABLE_RDTSC
124 pCreateInfo
->pBucketMgr
= &gBucketMgr
;
// Tells the caller how large a buffer a saved API_STATE needs.
127 pCreateInfo
->contextSaveSize
= sizeof(API_STATE
);
129 return (HANDLE
)pContext
;
132 void SwrDestroyContext(HANDLE hContext
)
134 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
135 DestroyThreadPool(pContext
, &pContext
->threadPool
);
138 for (uint32_t i
= 0; i
< KNOB_MAX_DRAWS_IN_FLIGHT
; ++i
)
140 delete pContext
->dcRing
[i
].pArena
;
141 delete pContext
->dsRing
[i
].pArena
;
142 delete(pContext
->dcRing
[i
].pTileMgr
);
143 delete(pContext
->dcRing
[i
].pDispatch
);
146 // Free scratch space.
147 for (uint32_t i
= 0; i
< pContext
->NumWorkerThreads
; ++i
)
150 VirtualFree(pContext
->pScratch
[i
], 0, MEM_RELEASE
);
152 _aligned_free(pContext
->pScratch
[i
]);
156 delete(pContext
->pHotTileMgr
);
158 pContext
->~SWR_CONTEXT();
159 _aligned_free((SWR_CONTEXT
*)hContext
);
162 void CopyState(DRAW_STATE
& dst
, const DRAW_STATE
& src
)
164 memcpy(&dst
.state
, &src
.state
, sizeof(API_STATE
));
167 void WakeAllThreads(SWR_CONTEXT
*pContext
)
169 pContext
->FifosNotEmpty
.notify_all();
// Enqueues the context's current draw context on dcRing and then either
// processes it inline (KNOB_SINGLE_THREADED) or wakes the worker threads.
// Afterwards the current DC becomes pPrevDrawContext and pCurDrawContext is
// cleared.
// NOTE(review): IsDraw is not referenced in the lines visible here; presumably
// it selects between the FE/BE path and the compute path below, and the
// wake-up path is the else-branch of the single-threaded check - confirm
// against the full file.
172 template<bool IsDraw
>
173 void QueueWork(SWR_CONTEXT
*pContext
)
175 // Each worker thread looks at a DC for both FE and BE work at different times and so we
176 // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers
177 // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
178 // then moved on if all work is done.)
179 pContext
->pCurDrawContext
->threadsDone
=
180 pContext
->NumWorkerThreads
? pContext
->NumWorkerThreads
* 2 : 2;
// The ring is advanced while holding WaitLock.
184 std::unique_lock
<std::mutex
> lock(pContext
->WaitLock
);
185 pContext
->dcRing
.Enqueue();
188 if (KNOB_SINGLE_THREADED
)
190 // flush denormals to 0
// NOTE(review): the previous MXCSR value is saved in mxcsr, but no restore is
// visible in this view.
191 uint32_t mxcsr
= _mm_getcsr();
192 _mm_setcsr(mxcsr
| _MM_FLUSH_ZERO_ON
| _MM_DENORMALS_ZERO_ON
);
// Draw path: front-end then back-end work, both starting from this DC's drawId.
196 static TileSet lockedTiles
;
197 uint64_t curDraw
[2] = { pContext
->pCurDrawContext
->drawId
, pContext
->pCurDrawContext
->drawId
};
198 WorkOnFifoFE(pContext
, 0, curDraw
[0], 0);
199 WorkOnFifoBE(pContext
, 0, curDraw
[1], lockedTiles
, 0, 0);
// Compute path: run the dispatch for this DC's drawId.
203 uint64_t curDispatch
= pContext
->pCurDrawContext
->drawId
;
204 WorkOnCompute(pContext
, 0, curDispatch
);
207 // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
208 while (CompleteDrawContext(pContext
, pContext
->pCurDrawContext
) > 0) {}
215 RDTSC_START(APIDrawWakeAllThreads
);
216 WakeAllThreads(pContext
);
217 RDTSC_STOP(APIDrawWakeAllThreads
, 1, 0);
220 // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
221 pContext
->pPrevDrawContext
= pContext
->pCurDrawContext
;
222 pContext
->pCurDrawContext
= nullptr;
225 INLINE
void QueueDraw(SWR_CONTEXT
* pContext
)
227 QueueWork
<true>(pContext
);
230 INLINE
void QueueDispatch(SWR_CONTEXT
* pContext
)
232 QueueWork
<false>(pContext
);
// Returns the context's current DRAW_CONTEXT. When pCurDrawContext is null a
// new entry is taken from dcRing, bound to a draw state, reset, and given a
// drawId equal to the ring head.
// isSplitDraw: when true (and a previous DC exists) the new DC reuses the
// previous DC's state pointer instead of copying state into the next DS ring
// slot.
// NOTE(review): braces, else lines and the body of the wait loop are not
// visible in this view; the branch structure described in the comments below
// is inferred from the existing comments and should be confirmed.
235 DRAW_CONTEXT
* GetDrawContext(SWR_CONTEXT
*pContext
, bool isSplitDraw
= false)
237 RDTSC_START(APIGetDrawContext
);
238 // If current draw context is null then need to obtain a new draw context to use from ring.
239 if (pContext
->pCurDrawContext
== nullptr)
241 // Need to wait for a free entry.
242 while (pContext
->dcRing
.IsFull())
// The ring head, modulo the ring size, selects the slot to use.
247 uint32_t dcIndex
= pContext
->dcRing
.GetHead() % KNOB_MAX_DRAWS_IN_FLIGHT
;
249 DRAW_CONTEXT
* pCurDrawContext
= &pContext
->dcRing
[dcIndex
];
250 pContext
->pCurDrawContext
= pCurDrawContext
;
252 // Assign next available entry in DS ring to this DC.
253 uint32_t dsIndex
= pContext
->curStateId
% KNOB_MAX_DRAWS_IN_FLIGHT
;
254 pCurDrawContext
->pState
= &pContext
->dsRing
[dsIndex
];
256 // Copy previous state to current state.
257 if (pContext
->pPrevDrawContext
)
259 DRAW_CONTEXT
* pPrevDrawContext
= pContext
->pPrevDrawContext
;
261 // If we're splitting our draw then we can just use the same state from the previous
262 // draw. In this case, we won't increment the DS ring index so the next non-split
263 // draw can receive the state.
264 if (isSplitDraw
== false)
266 CopyState(*pCurDrawContext
->pState
, *pPrevDrawContext
->pState
);
268 // Should have been cleaned up previously
269 SWR_ASSERT(pCurDrawContext
->pState
->pArena
->IsEmpty() == true);
271 pCurDrawContext
->pState
->pPrivateState
= nullptr;
273 pContext
->curStateId
++; // Progress state ring index forward.
277 // If its a split draw then just copy the state pointer over
278 // since its the same draw.
279 pCurDrawContext
->pState
= pPrevDrawContext
->pState
;
280 SWR_ASSERT(pPrevDrawContext
->cleanupState
== false);
// No previous draw context: the fresh DS slot is used as-is.
285 SWR_ASSERT(pCurDrawContext
->pState
->pArena
->IsEmpty() == true);
286 pContext
->curStateId
++; // Progress state ring index forward.
289 SWR_ASSERT(pCurDrawContext
->pArena
->IsEmpty() == true);
// Reset per-draw bookkeeping on the newly acquired DC.
291 pCurDrawContext
->dependency
= 0;
292 pCurDrawContext
->pContext
= pContext
;
293 pCurDrawContext
->isCompute
= false; // Dispatch has to set this to true.
295 pCurDrawContext
->doneFE
= false;
296 pCurDrawContext
->FeLock
= 0;
297 pCurDrawContext
->threadsDone
= 0;
299 pCurDrawContext
->pTileMgr
->initialize();
301 // Assign unique drawId for this DC
302 pCurDrawContext
->drawId
= pContext
->dcRing
.GetHead();
304 pCurDrawContext
->cleanupState
= true;
// A current DC already exists here, so a split draw is not expected.
308 SWR_ASSERT(isSplitDraw
== false, "Split draw should only be used when obtaining a new DC");
311 RDTSC_STOP(APIGetDrawContext
, 0, 0);
312 return pContext
->pCurDrawContext
;
315 API_STATE
* GetDrawState(SWR_CONTEXT
*pContext
)
317 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
318 SWR_ASSERT(pDC
->pState
!= nullptr);
320 return &pDC
->pState
->state
;
323 void SWR_API
SwrSaveState(
325 void* pOutputStateBlock
,
328 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
329 auto pSrc
= GetDrawState(pContext
);
330 SWR_ASSERT(pOutputStateBlock
&& memSize
>= sizeof(*pSrc
));
332 memcpy(pOutputStateBlock
, pSrc
, sizeof(*pSrc
));
335 void SWR_API
SwrRestoreState(
337 const void* pStateBlock
,
340 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
341 auto pDst
= GetDrawState(pContext
);
342 SWR_ASSERT(pStateBlock
&& memSize
>= sizeof(*pDst
));
344 memcpy(pDst
, pStateBlock
, sizeof(*pDst
));
347 void SetupDefaultState(SWR_CONTEXT
*pContext
)
349 API_STATE
* pState
= GetDrawState(pContext
);
351 pState
->rastState
.cullMode
= SWR_CULLMODE_NONE
;
352 pState
->rastState
.frontWinding
= SWR_FRONTWINDING_CCW
;
355 static INLINE SWR_CONTEXT
* GetContext(HANDLE hContext
)
357 return (SWR_CONTEXT
*)hContext
;
// Records a SYNC work item on the current draw context: ProcessSync becomes the
// front-end work function and the callback plus three user-data values are
// stored in the sync descriptor.
// hContext  - handle passed back from SwrCreateContext.
// pfnFunc   - callback stored in the sync descriptor; asserted non-null.
// userData, userData2, userData3 - values stored alongside the callback.
// NOTE(review): no call that enqueues the draw context is visible between the
// dependency assignment and RDTSC_STOP in this view - confirm against the full
// file.
360 void SwrSync(HANDLE hContext
, PFN_CALLBACK_FUNC pfnFunc
, uint64_t userData
, uint64_t userData2
, uint64_t userData3
)
362 RDTSC_START(APISync
);
364 SWR_ASSERT(pfnFunc
!= nullptr);
366 SWR_CONTEXT
*pContext
= GetContext(hContext
);
367 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
369 pDC
->FeWork
.type
= SYNC
;
370 pDC
->FeWork
.pfnWork
= ProcessSync
;
371 pDC
->FeWork
.desc
.sync
.pfnCallbackFunc
= pfnFunc
;
372 pDC
->FeWork
.desc
.sync
.userData
= userData
;
373 pDC
->FeWork
.desc
.sync
.userData2
= userData2
;
374 pDC
->FeWork
.desc
.sync
.userData3
= userData3
;
376 // cannot execute until all previous draws have completed
377 pDC
->dependency
= pDC
->drawId
- 1;
382 RDTSC_STOP(APISync
, 1, 0);
// Blocks the caller by polling until the context's draw-context ring reports
// empty.
// NOTE(review): the body of the polling loop is not visible in this view.
385 void SwrWaitForIdle(HANDLE hContext
)
387 SWR_CONTEXT
*pContext
= GetContext(hContext
);
389 RDTSC_START(APIWaitForIdle
);
391 while (!pContext
->dcRing
.IsEmpty())
396 RDTSC_STOP(APIWaitForIdle
, 1, 0);
399 void SwrSetVertexBuffers(
402 const SWR_VERTEX_BUFFER_STATE
* pVertexBuffers
)
404 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
406 for (uint32_t i
= 0; i
< numBuffers
; ++i
)
408 const SWR_VERTEX_BUFFER_STATE
*pVB
= &pVertexBuffers
[i
];
409 pState
->vertexBuffers
[pVB
->index
] = *pVB
;
413 void SwrSetIndexBuffer(
415 const SWR_INDEX_BUFFER_STATE
* pIndexBuffer
)
417 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
419 pState
->indexBuffer
= *pIndexBuffer
;
422 void SwrSetFetchFunc(
424 PFN_FETCH_FUNC pfnFetchFunc
)
426 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
428 pState
->pfnFetchFunc
= pfnFetchFunc
;
// Stores a stream-output function pointer for one stream in the current draw
// state; streamIndex is asserted to be below MAX_SO_STREAMS.
// NOTE(review): the opening lines of this function (its name and the hContext
// parameter used below) are not visible in this view.
433 PFN_SO_FUNC pfnSoFunc
,
434 uint32_t streamIndex
)
436 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
438 SWR_ASSERT(streamIndex
< MAX_SO_STREAMS
);
440 pState
->pfnSoFunc
[streamIndex
] = pfnSoFunc
;
// Copies the caller's stream-output state into the current draw state.
// NOTE(review): the opening lines of this function (its name and the hContext
// parameter used below) are not visible in this view.
445 SWR_STREAMOUT_STATE
* pSoState
)
447 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
449 pState
->soState
= *pSoState
;
452 void SwrSetSoBuffers(
454 SWR_STREAMOUT_BUFFER
* pSoBuffer
,
457 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
459 SWR_ASSERT((slot
< 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot
);
461 pState
->soBuffer
[slot
] = *pSoBuffer
;
464 void SwrSetVertexFunc(
466 PFN_VERTEX_FUNC pfnVertexFunc
)
468 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
470 pState
->pfnVertexFunc
= pfnVertexFunc
;
473 void SwrSetFrontendState(
475 SWR_FRONTEND_STATE
*pFEState
)
477 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
478 pState
->frontendState
= *pFEState
;
// Copies the caller's geometry-shader state into the current draw state.
// NOTE(review): the opening lines of this function (its name and the hContext
// parameter used below) are not visible in this view.
483 SWR_GS_STATE
*pGSState
)
485 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
486 pState
->gsState
= *pGSState
;
// Stores the geometry-shader function pointer in the current draw state.
// NOTE(review): the opening lines of this function (its name and the hContext
// parameter used below) are not visible in this view.
491 PFN_GS_FUNC pfnGsFunc
)
493 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
494 pState
->pfnGsFunc
= pfnGsFunc
;
// Stores the compute-shader function pointer and the thread-group size in the
// current draw state.
// NOTE(review): the opening lines of this function (its name and the hContext
// parameter used below) are not visible in this view.
499 PFN_CS_FUNC pfnCsFunc
,
500 uint32_t totalThreadsInGroup
)
502 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
503 pState
->pfnCsFunc
= pfnCsFunc
;
504 pState
->totalThreadsInGroup
= totalThreadsInGroup
;
// Copies the caller's tessellation state into the current draw state. The
// parameter is named pState here, so the API state local is pApiState.
// NOTE(review): the opening lines of this function (its name and the hContext
// parameter used below) are not visible in this view.
509 SWR_TS_STATE
*pState
)
511 API_STATE
* pApiState
= GetDrawState(GetContext(hContext
));
512 pApiState
->tsState
= *pState
;
// Stores pfnFunc as the hull-shader function pointer (pfnHsFunc) in the current
// draw state.
// NOTE(review): the whole signature of this function (its name and the
// hContext / pfnFunc parameters used below) is not visible in this view.
519 API_STATE
* pApiState
= GetDrawState(GetContext(hContext
));
520 pApiState
->pfnHsFunc
= pfnFunc
;
// Stores pfnFunc as the domain-shader function pointer (pfnDsFunc) in the
// current draw state.
// NOTE(review): the whole signature of this function (its name and the
// hContext / pfnFunc parameters used below) is not visible in this view.
527 API_STATE
* pApiState
= GetDrawState(GetContext(hContext
));
528 pApiState
->pfnDsFunc
= pfnFunc
;
531 void SwrSetDepthStencilState(
533 SWR_DEPTH_STENCIL_STATE
*pDSState
)
535 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
537 pState
->depthStencilState
= *pDSState
;
540 void SwrSetBackendState(
542 SWR_BACKEND_STATE
*pBEState
)
544 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
546 pState
->backendState
= *pBEState
;
549 void SwrSetPixelShaderState(
551 SWR_PS_STATE
*pPSState
)
553 API_STATE
*pState
= GetDrawState(GetContext(hContext
));
554 pState
->psState
= *pPSState
;
557 void SwrSetBlendState(
559 SWR_BLEND_STATE
*pBlendState
)
561 API_STATE
*pState
= GetDrawState(GetContext(hContext
));
562 memcpy(&pState
->blendState
, pBlendState
, sizeof(SWR_BLEND_STATE
));
565 void SwrSetBlendFunc(
567 uint32_t renderTarget
,
568 PFN_BLEND_JIT_FUNC pfnBlendFunc
)
570 SWR_ASSERT(renderTarget
< SWR_NUM_RENDERTARGETS
);
571 API_STATE
*pState
= GetDrawState(GetContext(hContext
));
572 pState
->pfnBlendFunc
[renderTarget
] = pfnBlendFunc
;
// Records the attribute linkage mask in the current draw state, derives the
// linkage count from the number of set bits in the mask, and copies that many
// bytes of pMap into linkageMap.
// NOTE(review): the signature of this function (its name and the hContext /
// mask / pMap parameters used below) is not visible in this view.
580 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
// IDENTITY_MAP holds 0..31 and is statically checked to match the size of
// linkageMap.
// NOTE(review): IDENTITY_MAP is not used at runtime in the lines visible here;
// presumably it is the fallback when pMap is null - confirm.
582 static const uint8_t IDENTITY_MAP
[] =
584 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
585 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
587 static_assert(sizeof(IDENTITY_MAP
) == sizeof(pState
->linkageMap
),
588 "Update for new value of MAX_ATTRIBUTES");
590 pState
->linkageMask
= mask
;
591 pState
->linkageCount
= _mm_popcnt_u32(mask
);
597 memcpy(pState
->linkageMap
, pMap
, pState
->linkageCount
);
600 // update guardband multipliers for the viewport
601 void updateGuardband(API_STATE
*pState
)
603 // guardband center is viewport center
604 pState
->gbState
.left
= KNOB_GUARDBAND_WIDTH
/ pState
->vp
[0].width
;
605 pState
->gbState
.right
= KNOB_GUARDBAND_WIDTH
/ pState
->vp
[0].width
;
606 pState
->gbState
.top
= KNOB_GUARDBAND_HEIGHT
/ pState
->vp
[0].height
;
607 pState
->gbState
.bottom
= KNOB_GUARDBAND_HEIGHT
/ pState
->vp
[0].height
;
610 void SwrSetRastState(
612 const SWR_RASTSTATE
*pRastState
)
614 SWR_CONTEXT
*pContext
= GetContext(hContext
);
615 API_STATE
* pState
= GetDrawState(pContext
);
617 memcpy(&pState
->rastState
, pRastState
, sizeof(SWR_RASTSTATE
));
620 void SwrSetViewports(
622 uint32_t numViewports
,
623 const SWR_VIEWPORT
* pViewports
,
624 const SWR_VIEWPORT_MATRIX
* pMatrices
)
626 SWR_ASSERT(numViewports
<= KNOB_NUM_VIEWPORTS_SCISSORS
,
627 "Invalid number of viewports.");
629 SWR_CONTEXT
*pContext
= GetContext(hContext
);
630 API_STATE
* pState
= GetDrawState(pContext
);
632 memcpy(&pState
->vp
[0], pViewports
, sizeof(SWR_VIEWPORT
) * numViewports
);
634 if (pMatrices
!= nullptr)
636 memcpy(&pState
->vpMatrix
[0], pMatrices
, sizeof(SWR_VIEWPORT_MATRIX
) * numViewports
);
640 // Compute default viewport transform.
641 for (uint32_t i
= 0; i
< numViewports
; ++i
)
643 if (pContext
->driverType
== DX
)
645 pState
->vpMatrix
[i
].m00
= pState
->vp
[i
].width
/ 2.0f
;
646 pState
->vpMatrix
[i
].m11
= -pState
->vp
[i
].height
/ 2.0f
;
647 pState
->vpMatrix
[i
].m22
= pState
->vp
[i
].maxZ
- pState
->vp
[i
].minZ
;
648 pState
->vpMatrix
[i
].m30
= pState
->vp
[i
].x
+ pState
->vpMatrix
[i
].m00
;
649 pState
->vpMatrix
[i
].m31
= pState
->vp
[i
].y
- pState
->vpMatrix
[i
].m11
;
650 pState
->vpMatrix
[i
].m32
= pState
->vp
[i
].minZ
;
654 // Standard, with the exception that Y is inverted.
655 pState
->vpMatrix
[i
].m00
= (pState
->vp
[i
].width
- pState
->vp
[i
].x
) / 2.0f
;
656 pState
->vpMatrix
[i
].m11
= (pState
->vp
[i
].y
- pState
->vp
[i
].height
) / 2.0f
;
657 pState
->vpMatrix
[i
].m22
= (pState
->vp
[i
].maxZ
- pState
->vp
[i
].minZ
) / 2.0f
;
658 pState
->vpMatrix
[i
].m30
= pState
->vp
[i
].x
+ pState
->vpMatrix
[i
].m00
;
659 pState
->vpMatrix
[i
].m31
= pState
->vp
[i
].height
+ pState
->vpMatrix
[i
].m11
;
660 pState
->vpMatrix
[i
].m32
= pState
->vp
[i
].minZ
+ pState
->vpMatrix
[i
].m22
;
662 // Now that the matrix is calculated, clip the view coords to screen size.
663 // OpenGL allows for -ve x,y in the viewport.
664 pState
->vp
[i
].x
= std::max(pState
->vp
[i
].x
, 0.0f
);
665 pState
->vp
[i
].y
= std::max(pState
->vp
[i
].y
, 0.0f
);
670 updateGuardband(pState
);
673 void SwrSetScissorRects(
675 uint32_t numScissors
,
676 const BBOX
* pScissors
)
678 SWR_ASSERT(numScissors
<= KNOB_NUM_VIEWPORTS_SCISSORS
,
679 "Invalid number of scissor rects.");
681 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
682 memcpy(&pState
->scissorRects
[0], pScissors
, numScissors
* sizeof(BBOX
));
685 void SetupMacroTileScissors(DRAW_CONTEXT
*pDC
)
687 API_STATE
*pState
= &pDC
->pState
->state
;
688 uint32_t left
, right
, top
, bottom
;
690 // Set up scissor dimensions based on scissor or viewport
691 if (pState
->rastState
.scissorEnable
)
693 // scissor rect right/bottom edge are exclusive, core expects scissor dimensions to be inclusive, so subtract one pixel from right/bottom edges
694 left
= pState
->scissorRects
[0].left
;
695 right
= pState
->scissorRects
[0].right
;
696 top
= pState
->scissorRects
[0].top
;
697 bottom
= pState
->scissorRects
[0].bottom
;
701 left
= (int32_t)pState
->vp
[0].x
;
702 right
= (int32_t)pState
->vp
[0].x
+ (int32_t)pState
->vp
[0].width
;
703 top
= (int32_t)pState
->vp
[0].y
;
704 bottom
= (int32_t)pState
->vp
[0].y
+ (int32_t)pState
->vp
[0].height
;
707 right
= std::min
<uint32_t>(right
, KNOB_MAX_SCISSOR_X
);
708 bottom
= std::min
<uint32_t>(bottom
, KNOB_MAX_SCISSOR_Y
);
710 if (left
> KNOB_MAX_SCISSOR_X
|| top
> KNOB_MAX_SCISSOR_Y
)
712 pState
->scissorInFixedPoint
.left
= 0;
713 pState
->scissorInFixedPoint
.right
= 0;
714 pState
->scissorInFixedPoint
.top
= 0;
715 pState
->scissorInFixedPoint
.bottom
= 0;
719 pState
->scissorInFixedPoint
.left
= left
* FIXED_POINT_SCALE
;
720 pState
->scissorInFixedPoint
.right
= right
* FIXED_POINT_SCALE
- 1;
721 pState
->scissorInFixedPoint
.top
= top
* FIXED_POINT_SCALE
;
722 pState
->scissorInFixedPoint
.bottom
= bottom
* FIXED_POINT_SCALE
- 1;
725 // templated backend function tables
726 extern PFN_BACKEND_FUNC gBackendNullPs
[SWR_MULTISAMPLE_TYPE_MAX
];
727 extern PFN_BACKEND_FUNC gBackendSingleSample
[2][2];
728 extern PFN_BACKEND_FUNC gBackendPixelRateTable
[SWR_MULTISAMPLE_TYPE_MAX
][SWR_MSAA_SAMPLE_PATTERN_MAX
][SWR_INPUT_COVERAGE_MAX
][2][2];
729 extern PFN_BACKEND_FUNC gBackendSampleRateTable
[SWR_MULTISAMPLE_TYPE_MAX
][SWR_INPUT_COVERAGE_MAX
][2];
730 extern PFN_OUTPUT_MERGER gBackendOutputMergerTable
[SWR_NUM_RENDERTARGETS
+ 1][SWR_MULTISAMPLE_TYPE_MAX
];
731 extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable
[2];
732 extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable
[2];
733 extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable
[SWR_MULTISAMPLE_TYPE_MAX
][2][2][2];
// Derives per-draw pipeline selections from the API state attached to pDC:
// the backend / output-merger / barycentric function pointers, the primitive
// processing function (clipper or binner), the front-end attribute mask, and
// the depth / stencil / color hot-tile enables.
// NOTE(review): braces, else lines, break statements and several case labels
// are not visible in this view; comments below describe only what the visible
// lines show.
734 void SetupPipeline(DRAW_CONTEXT
*pDC
)
736 DRAW_STATE
* pState
= pDC
->pState
;
737 const SWR_RASTSTATE
&rastState
= pState
->state
.rastState
;
738 const SWR_PS_STATE
&psState
= pState
->state
.psState
;
739 BACKEND_FUNCS
& backendFuncs
= pState
->backendFuncs
;
// forcedSampleCount is used as a 0/1 table index below.
740 const uint32_t forcedSampleCount
= (rastState
.bForcedSampleCount
) ? 1 : 0;
// No pixel shader: use the null-PS backend for the current sample count.
743 if (psState
.pfnPixelShader
== nullptr)
745 backendFuncs
.pfnBackend
= gBackendNullPs
[pState
->state
.rastState
.sampleCount
];
746 // always need to generate I & J per sample for Z interpolation
747 backendFuncs
.pfnCalcSampleBarycentrics
= gSampleBarycentricTable
[1];
// Pixel shader present: select the backend by shading rate.
751 const bool bMultisampleEnable
= ((rastState
.sampleCount
> SWR_MULTISAMPLE_1X
) || rastState
.bForcedSampleCount
) ? 1 : 0;
752 const uint32_t centroid
= ((psState
.barycentricsMask
& SWR_BARYCENTRIC_CENTROID_MASK
) > 0) ? 1 : 0;
754 // currently only support 'normal' input coverage
755 SWR_ASSERT(psState
.inputCoverage
== SWR_INPUT_COVERAGE_NORMAL
||
756 psState
.inputCoverage
== SWR_INPUT_COVERAGE_NONE
);
758 SWR_BARYCENTRICS_MASK barycentricsMask
= (SWR_BARYCENTRICS_MASK
)psState
.barycentricsMask
;
760 // select backend function
761 switch(psState
.shadingRate
)
763 case SWR_SHADING_RATE_PIXEL
:
764 if(bMultisampleEnable
)
766 // always need to generate I & J per sample for Z interpolation
767 barycentricsMask
= (SWR_BARYCENTRICS_MASK
)(barycentricsMask
| SWR_BARYCENTRIC_PER_SAMPLE_MASK
);
768 backendFuncs
.pfnBackend
= gBackendPixelRateTable
[rastState
.sampleCount
][rastState
.samplePattern
][psState
.inputCoverage
][centroid
][forcedSampleCount
];
769 backendFuncs
.pfnOutputMerger
= gBackendOutputMergerTable
[psState
.numRenderTargets
][pState
->state
.blendState
.sampleCount
];
773 // always need to generate I & J per pixel for Z interpolation
774 barycentricsMask
= (SWR_BARYCENTRICS_MASK
)(barycentricsMask
| SWR_BARYCENTRIC_PER_PIXEL_MASK
);
775 backendFuncs
.pfnBackend
= gBackendSingleSample
[psState
.inputCoverage
][centroid
];
776 backendFuncs
.pfnOutputMerger
= gBackendOutputMergerTable
[psState
.numRenderTargets
][SWR_MULTISAMPLE_1X
];
779 case SWR_SHADING_RATE_SAMPLE
:
780 SWR_ASSERT(rastState
.samplePattern
== SWR_MSAA_STANDARD_PATTERN
);
781 // always need to generate I & J per sample for Z interpolation
782 barycentricsMask
= (SWR_BARYCENTRICS_MASK
)(barycentricsMask
| SWR_BARYCENTRIC_PER_SAMPLE_MASK
);
783 backendFuncs
.pfnBackend
= gBackendSampleRateTable
[rastState
.sampleCount
][psState
.inputCoverage
][centroid
];
784 backendFuncs
.pfnOutputMerger
= gBackendOutputMergerTable
[psState
.numRenderTargets
][pState
->state
.blendState
.sampleCount
];
787 SWR_ASSERT(0 && "Invalid shading rate");
791 // setup pointer to function that generates necessary barycentrics required by the PS
// bBarycentrics is reused as the 0/1 index for each of the three tables.
792 bool bBarycentrics
= (barycentricsMask
& SWR_BARYCENTRIC_PER_PIXEL_MASK
) > 0 ? 1 : 0;
793 backendFuncs
.pfnCalcPixelBarycentrics
= gPixelBarycentricTable
[bBarycentrics
];
795 bBarycentrics
= (barycentricsMask
& SWR_BARYCENTRIC_PER_SAMPLE_MASK
) > 0 ? 1 : 0;
796 backendFuncs
.pfnCalcSampleBarycentrics
= gSampleBarycentricTable
[bBarycentrics
];
798 bBarycentrics
= (barycentricsMask
& SWR_BARYCENTRIC_CENTROID_MASK
) > 0 ? 1 : 0;
799 backendFuncs
.pfnCalcCentroidBarycentrics
= gCentroidBarycentricTable
[rastState
.sampleCount
][bBarycentrics
][rastState
.samplePattern
][forcedSampleCount
];
// Choose the clipper (pfnProcessPrims) and matching binner by topology.
// NOTE(review): most case labels of this switch are not visible in this view.
802 PFN_PROCESS_PRIMS pfnBinner
;
803 switch (pState
->state
.topology
)
806 pState
->pfnProcessPrims
= ClipPoints
;
807 pfnBinner
= BinPoints
;
812 case TOP_LINE_LIST_ADJ
:
813 case TOP_LISTSTRIP_ADJ
:
814 pState
->pfnProcessPrims
= ClipLines
;
815 pfnBinner
= BinLines
;
818 pState
->pfnProcessPrims
= ClipTriangles
;
819 pfnBinner
= BinTriangles
;
823 // disable clipper if viewport transform is disabled
824 if (pState
->state
.frontendState
.vpTransformDisable
)
826 pState
->pfnProcessPrims
= pfnBinner
;
// With no pixel shader, no depth/stencil test or write, and no linked
// attributes, primitive processing is switched off entirely.
829 if ((pState
->state
.psState
.pfnPixelShader
== nullptr) &&
830 (pState
->state
.depthStencilState
.depthTestEnable
== FALSE
) &&
831 (pState
->state
.depthStencilState
.depthWriteEnable
== FALSE
) &&
832 (pState
->state
.depthStencilState
.stencilTestEnable
== FALSE
) &&
833 (pState
->state
.depthStencilState
.stencilWriteEnable
== FALSE
) &&
834 (pState
->state
.linkageCount
== 0))
836 pState
->pfnProcessPrims
= nullptr;
837 pState
->state
.linkageMask
= 0;
// Same when stream-out state disables the rasterizer.
840 if (pState
->state
.soState
.rasterizerDisable
== true)
842 pState
->pfnProcessPrims
= nullptr;
843 pState
->state
.linkageMask
= 0;
846 // set up the frontend attrib mask
847 pState
->state
.feAttribMask
= pState
->state
.linkageMask
;
848 if (pState
->state
.soState
.soEnable
)
850 for (uint32_t i
= 0; i
< 4; ++i
)
852 pState
->state
.feAttribMask
|= pState
->state
.soState
.streamMasks
[i
];
856 // complicated logic to test for cases where we don't need backing hottile memory for a draw
857 // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled.
858 pState
->state
.depthHottileEnable
= ((!(pState
->state
.depthStencilState
.depthTestEnable
&&
859 !pState
->state
.depthStencilState
.depthWriteEnable
&&
860 pState
->state
.depthStencilState
.depthTestFunc
== ZFUNC_ALWAYS
)) &&
861 (pState
->state
.depthStencilState
.depthTestEnable
||
862 pState
->state
.depthStencilState
.depthWriteEnable
)) ? true : false;
864 pState
->state
.stencilHottileEnable
= (((!(pState
->state
.depthStencilState
.stencilTestEnable
&&
865 !pState
->state
.depthStencilState
.stencilWriteEnable
&&
866 pState
->state
.depthStencilState
.stencilTestFunc
== ZFUNC_ALWAYS
)) ||
867 // for stencil we have to check the double sided state as well
868 (!(pState
->state
.depthStencilState
.doubleSidedStencilTestEnable
&&
869 !pState
->state
.depthStencilState
.stencilWriteEnable
&&
870 pState
->state
.depthStencilState
.backfaceStencilTestFunc
== ZFUNC_ALWAYS
))) &&
871 (pState
->state
.depthStencilState
.stencilTestEnable
||
872 pState
->state
.depthStencilState
.stencilWriteEnable
)) ? true : false;
// colorHottileEnable is a bitmask with one bit per render target; a bit is set
// when at least one color channel of that target is not write-disabled.
874 uint32_t numRTs
= pState
->state
.psState
.numRenderTargets
;
875 pState
->state
.colorHottileEnable
= 0;
876 if (psState
.pfnPixelShader
!= nullptr)
878 for (uint32_t rt
= 0; rt
< numRTs
; ++rt
)
880 pState
->state
.colorHottileEnable
|=
881 (!pState
->state
.blendState
.renderTarget
[rt
].writeDisableAlpha
||
882 !pState
->state
.blendState
.renderTarget
[rt
].writeDisableRed
||
883 !pState
->state
.blendState
.renderTarget
[rt
].writeDisableGreen
||
884 !pState
->state
.blendState
.renderTarget
[rt
].writeDisableBlue
) ? (1 << rt
) : 0;
// Per-draw initialization: for a non-split draw the macro-tile scissors are
// (re)computed from the draw context's state.
// NOTE(review): the signature of this function and any statements after the
// SetupMacroTileScissors call are not visible in this view; isSplitDraw is
// presumably a bool parameter - confirm.
889 //////////////////////////////////////////////////////////////////////////
891 /// @param pDC - Draw context to initialize for this draw.
896 // We don't need to re-setup the scissors/pipeline state again for split draw.
897 if (isSplitDraw
== false)
899 SetupMacroTileScissors(pDC
);
904 //////////////////////////////////////////////////////////////////////////
905 /// @brief We can split the draw for certain topologies for better performance.
906 /// @param totalVerts - Total vertices for draw
907 /// @param topology - Topology used for draw
// Chooses how many vertices a single queued draw may carry for the given
// topology; vertsPerDraw starts at totalVerts and is lowered for topologies
// that are split.
// NOTE(review): the pDC and totalVerts parameters, the switch header, the
// return statements and several case bodies are not visible in this view.
908 uint32_t MaxVertsPerDraw(
911 PRIMITIVE_TOPOLOGY topology
)
913 API_STATE
& state
= pDC
->pState
->state
;
915 uint32_t vertsPerDraw
= totalVerts
;
// NOTE(review): the body of this stream-out check is not visible here.
917 if (state
.soState
.soEnable
)
925 case TOP_TRIANGLE_LIST
:
926 vertsPerDraw
= KNOB_MAX_PRIMS_PER_DRAW
;
// Patch lists are only split when tessellation is enabled.
929 case TOP_PATCHLIST_1
:
930 case TOP_PATCHLIST_2
:
931 case TOP_PATCHLIST_3
:
932 case TOP_PATCHLIST_4
:
933 case TOP_PATCHLIST_5
:
934 case TOP_PATCHLIST_6
:
935 case TOP_PATCHLIST_7
:
936 case TOP_PATCHLIST_8
:
937 case TOP_PATCHLIST_9
:
938 case TOP_PATCHLIST_10
:
939 case TOP_PATCHLIST_11
:
940 case TOP_PATCHLIST_12
:
941 case TOP_PATCHLIST_13
:
942 case TOP_PATCHLIST_14
:
943 case TOP_PATCHLIST_15
:
944 case TOP_PATCHLIST_16
:
945 case TOP_PATCHLIST_17
:
946 case TOP_PATCHLIST_18
:
947 case TOP_PATCHLIST_19
:
948 case TOP_PATCHLIST_20
:
949 case TOP_PATCHLIST_21
:
950 case TOP_PATCHLIST_22
:
951 case TOP_PATCHLIST_23
:
952 case TOP_PATCHLIST_24
:
953 case TOP_PATCHLIST_25
:
954 case TOP_PATCHLIST_26
:
955 case TOP_PATCHLIST_27
:
956 case TOP_PATCHLIST_28
:
957 case TOP_PATCHLIST_29
:
958 case TOP_PATCHLIST_30
:
959 case TOP_PATCHLIST_31
:
960 case TOP_PATCHLIST_32
:
961 if (pDC
->pState
->state
.tsState
.tsEnable
)
// The control-point count is taken as the topology's offset from
// TOP_PATCHLIST_BASE.
963 uint32_t vertsPerPrim
= topology
- TOP_PATCHLIST_BASE
;
964 vertsPerDraw
= vertsPerPrim
* KNOB_MAX_TESS_PRIMS_PER_DRAW
;
968 // The Primitive Assembly code can only handle 1 RECT at a time.
974 // We are not splitting up draws for other topologies.
// Maps a sequence of runtime bools onto a ProcessDraw<...> instantiation: each
// GetFunc call appends one bool to the ArgsB pack, recursing on the remaining
// arguments until the single-argument overload returns the function pointer.
// NOTE(review): the struct header line and the conditionals on bArg that choose
// between the true/false returns are not visible in this view.
981 // Recursive template used to auto-nest conditionals. Converts dynamic boolean function
982 // arguments to static template arguments.
983 template <bool... ArgsB
>
986 // Last Arg Terminator
987 static PFN_FE_WORK_FUNC
GetFunc(bool bArg
)
991 return ProcessDraw
<ArgsB
..., true>;
994 return ProcessDraw
<ArgsB
..., false>;
997 // Recursively parse args
998 template <typename
... TArgsT
>
999 static PFN_FE_WORK_FUNC
GetFunc(bool bArg
, TArgsT
... remainingArgs
)
1003 return FEDrawChooser
<ArgsB
..., true>::GetFunc(remainingArgs
...);
1006 return FEDrawChooser
<ArgsB
..., false>::GetFunc(remainingArgs
...);
1010 // Selector for correct templated Draw front-end function
1012 static PFN_FE_WORK_FUNC
GetFEDrawFunc(bool IsIndexed
, bool HasTessellation
, bool HasGeometryShader
, bool HasStreamOut
, bool RasterizerEnabled
)
1014 return FEDrawChooser
<>::GetFunc(IsIndexed
, HasTessellation
, HasGeometryShader
, HasStreamOut
, RasterizerEnabled
);
// Splits a (possibly instanced) draw into chunks of at most maxVertsPerDraw
// vertices and queues one draw context per chunk; cullMode is forced to none
// for point lists and restored afterwards.
// NOTE(review): the function's name line and hContext parameter, the
// declaration and increment of `draw`, and the first argument of the
// GetFEDrawFunc call are not visible in this view.
1018 //////////////////////////////////////////////////////////////////////////
1019 /// @brief DrawInstanced
1020 /// @param hContext - Handle passed back from SwrCreateContext
1021 /// @param topology - Specifies topology for draw.
1022 /// @param numVertices - How many vertices to read sequentially from vertex data (per instance).
1023 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1024 /// @param numInstances - How many instances to render.
1025 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1028 PRIMITIVE_TOPOLOGY topology
,
1029 uint32_t numVertices
,
1030 uint32_t startVertex
,
1031 uint32_t numInstances
= 1,
1032 uint32_t startInstance
= 0)
1039 RDTSC_START(APIDraw
);
1041 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1042 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// NOTE(review): the uint32_t results are stored in int32_t here, so counts
// above INT32_MAX would become negative - confirm callers never pass them.
1044 int32_t maxVertsPerDraw
= MaxVertsPerDraw(pDC
, numVertices
, topology
);
1045 uint32_t primsPerDraw
= GetNumPrims(topology
, maxVertsPerDraw
);
1046 int32_t remainingVerts
= numVertices
;
1048 API_STATE
*pState
= &pDC
->pState
->state
;
1049 pState
->topology
= topology
;
1050 pState
->forceFront
= false;
1052 // disable culling for points/lines
// NOTE(review): only TOP_POINT_LIST is checked in the visible condition.
1053 uint32_t oldCullMode
= pState
->rastState
.cullMode
;
1054 if (topology
== TOP_POINT_LIST
)
1056 pState
->rastState
.cullMode
= SWR_CULLMODE_NONE
;
1057 pState
->forceFront
= true;
// One iteration per chunk; every chunk after the first is a split draw.
1061 while (remainingVerts
)
1063 uint32_t numVertsForDraw
= (remainingVerts
< maxVertsPerDraw
) ?
1064 remainingVerts
: maxVertsPerDraw
;
1066 bool isSplitDraw
= (draw
> 0) ? true : false;
1067 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
, isSplitDraw
);
1068 InitDraw(pDC
, isSplitDraw
);
1070 pDC
->FeWork
.type
= DRAW
;
1071 pDC
->FeWork
.pfnWork
= GetFEDrawFunc(
1073 pState
->tsState
.tsEnable
,
1074 pState
->gsState
.gsEnable
,
1075 pState
->soState
.soEnable
,
1076 pDC
->pState
->pfnProcessPrims
!= nullptr);
1077 pDC
->FeWork
.desc
.draw
.numVerts
= numVertsForDraw
;
1078 pDC
->FeWork
.desc
.draw
.startVertex
= startVertex
;
1079 pDC
->FeWork
.desc
.draw
.numInstances
= numInstances
;
1080 pDC
->FeWork
.desc
.draw
.startInstance
= startInstance
;
// Primitive and vertex IDs continue from where the previous chunk stopped.
1081 pDC
->FeWork
.desc
.draw
.startPrimID
= draw
* primsPerDraw
;
1082 pDC
->FeWork
.desc
.draw
.startVertexID
= draw
* maxVertsPerDraw
;
// Only the final chunk is flagged to clean up the shared draw state.
1084 pDC
->cleanupState
= (remainingVerts
== numVertsForDraw
);
1087 QueueDraw(pContext
);
1089 remainingVerts
-= numVertsForDraw
;
1093 // restore culling state
1094 pDC
= GetDrawContext(pContext
);
1095 pDC
->pState
->state
.rastState
.cullMode
= oldCullMode
;
1097 RDTSC_STOP(APIDraw
, numVertices
* numInstances
, 0);
1100 //////////////////////////////////////////////////////////////////////////
/// Non-instanced draw: forwards to DrawInstanced without passing
/// numInstances/startInstance, so that function's defaults (1 and 0) apply.
1102 /// @param hContext - Handle passed back from SwrCreateContext
1103 /// @param topology - Specifies topology for draw.
1104 /// @param startVertex - Specifies start vertex in vertex buffer for draw.
1105 /// @param numVertices - Number of vertices.
1108 PRIMITIVE_TOPOLOGY topology
,
1109 uint32_t startVertex
,
1110 uint32_t numVertices
)
// Argument order differs from this function's own signature:
// DrawInstanced takes numVertices before startVertex.
1112 DrawInstanced(hContext
, topology
, numVertices
, startVertex
);
1115 //////////////////////////////////////////////////////////////////////////
1116 /// @brief SwrDrawInstanced
/// Forwards all arguments to DrawInstanced.
1117 /// @param hContext - Handle passed back from SwrCreateContext
1118 /// @param topology - Specifies topology for draw.
1119 /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
1120 /// @param numInstances - How many instances to render.
1121 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1122 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1123 void SwrDrawInstanced(
1125 PRIMITIVE_TOPOLOGY topology
,
1126 uint32_t numVertsPerInstance
,
1127 uint32_t numInstances
,
1128 uint32_t startVertex
,
1129 uint32_t startInstance
// Argument order differs from this function's own signature:
// DrawInstanced takes startVertex before numInstances.
1132 DrawInstanced(hContext
, topology
, numVertsPerInstance
, startVertex
, numInstances
, startInstance
);
1135 //////////////////////////////////////////////////////////////////////////
1136 /// @brief DrawIndexedInstance
/// Queues the indexed draw in chunks of at most MaxVertsPerDraw() indices;
/// every chunk gets its own draw context and its own QueueDraw() call.
1137 /// @param hContext - Handle passed back from SwrCreateContext
1138 /// @param topology - Specifies topology for draw.
1139 /// @param numIndices - Number of indices to read sequentially from index buffer.
1140 /// @param indexOffset - Starting index into index buffer.
1141 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1142 /// @param numInstances - Number of instances to render.
1143 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1144 void DrawIndexedInstance(
1146 PRIMITIVE_TOPOLOGY topology
,
1147 uint32_t numIndices
,
1148 uint32_t indexOffset
,
1150 uint32_t numInstances
= 1,
1151 uint32_t startInstance
= 0)
1158 RDTSC_START(APIDrawIndexed
);
1160 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1161 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1162 API_STATE
* pState
= &pDC
->pState
->state
;
// Chunk size in indices, the primitive count one full chunk produces, and
// the number of indices still to be queued.
1164 int32_t maxIndicesPerDraw
= MaxVertsPerDraw(pDC
, numIndices
, topology
);
1165 uint32_t primsPerDraw
= GetNumPrims(topology
, maxIndicesPerDraw
);
1166 int32_t remainingIndices
= numIndices
;
// Size in bytes of one index, derived from the bound index buffer's format.
1168 uint32_t indexSize
= 0;
1169 switch (pState
->indexBuffer
.format
)
1171 case R32_UINT
: indexSize
= sizeof(uint32_t); break;
1172 case R16_UINT
: indexSize
= sizeof(uint16_t); break;
1173 case R8_UINT
: indexSize
= sizeof(uint8_t); break;
// Byte pointer to the first index to read: buffer base plus indexOffset
// indices, with the multiply done in 64 bits.
1179 uint8_t *pIB
= (uint8_t*)pState
->indexBuffer
.pIndices
;
1180 pIB
+= (uint64_t)indexOffset
* (uint64_t)indexSize
;
1182 pState
->topology
= topology
;
1183 pState
->forceFront
= false;
1185 // disable culling and force front-facing for point lists (only TOP_POINT_LIST is checked here)
// The previous cull mode is saved so it can be written back after the loop.
1186 uint32_t oldCullMode
= pState
->rastState
.cullMode
;
1187 if (topology
== TOP_POINT_LIST
)
1189 pState
->rastState
.cullMode
= SWR_CULLMODE_NONE
;
1190 pState
->forceFront
= true;
// One iteration per chunk; the loop ends when no indices remain.
1193 while (remainingIndices
)
// Take a full chunk, or whatever is left if that is smaller.
1195 uint32_t numIndicesForDraw
= (remainingIndices
< maxIndicesPerDraw
) ?
1196 remainingIndices
: maxIndicesPerDraw
;
1198 // When breaking up draw, we need to obtain new draw context for each iteration.
// Any chunk with a non-zero draw index is flagged as a split draw.
1199 bool isSplitDraw
= (draw
> 0) ? true : false;
1200 pDC
= GetDrawContext(pContext
, isSplitDraw
);
1201 InitDraw(pDC
, isSplitDraw
);
1203 pDC
->FeWork
.type
= DRAW
;
1204 pDC
->FeWork
.pfnWork
= GetFEDrawFunc(
1206 pState
->tsState
.tsEnable
,
1207 pState
->gsState
.gsEnable
,
1208 pState
->soState
.soEnable
,
1209 pDC
->pState
->pfnProcessPrims
!= nullptr);
1210 pDC
->FeWork
.desc
.draw
.pDC
= pDC
;
1211 pDC
->FeWork
.desc
.draw
.numIndices
= numIndicesForDraw
;
// The pointer is stored as int* whatever the index size is; the actual
// index format is recorded separately in draw.type just below.
1212 pDC
->FeWork
.desc
.draw
.pIB
= (int*)pIB
;
1213 pDC
->FeWork
.desc
.draw
.type
= pDC
->pState
->state
.indexBuffer
.format
;
1215 pDC
->FeWork
.desc
.draw
.numInstances
= numInstances
;
1216 pDC
->FeWork
.desc
.draw
.startInstance
= startInstance
;
1217 pDC
->FeWork
.desc
.draw
.baseVertex
= baseVertex
;
1218 pDC
->FeWork
.desc
.draw
.startPrimID
= draw
* primsPerDraw
;
// True only for the chunk that consumes all remaining indices, i.e. the last one.
1220 pDC
->cleanupState
= (remainingIndices
== numIndicesForDraw
);
1223 QueueDraw(pContext
);
// Step the index pointer by one full chunk (in bytes) for the next iteration.
1225 pIB
+= maxIndicesPerDraw
* indexSize
;
1226 remainingIndices
-= numIndicesForDraw
;
1230 // restore culling state
// Re-fetches the draw context and writes the saved cull mode into its state.
1231 pDC
= GetDrawContext(pContext
);
1232 pDC
->pState
->state
.rastState
.cullMode
= oldCullMode
;
1234 RDTSC_STOP(APIDrawIndexed
, numIndices
* numInstances
, 0);
1238 //////////////////////////////////////////////////////////////////////////
1239 /// @brief SwrDrawIndexed
/// Non-instanced indexed draw: forwards to DrawIndexedInstance without
/// passing numInstances/startInstance, so that function's defaults (1 and 0) apply.
1240 /// @param hContext - Handle passed back from SwrCreateContext
1241 /// @param topology - Specifies topology for draw.
1242 /// @param numIndices - Number of indices to read sequentially from index buffer.
1243 /// @param indexOffset - Starting index into index buffer.
1244 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1245 void SwrDrawIndexed(
1247 PRIMITIVE_TOPOLOGY topology
,
1248 uint32_t numIndices
,
1249 uint32_t indexOffset
,
1253 DrawIndexedInstance(hContext
, topology
, numIndices
, indexOffset
, baseVertex
);
1256 //////////////////////////////////////////////////////////////////////////
1257 /// @brief SwrDrawIndexedInstanced
/// Forwards all arguments to DrawIndexedInstance.
1258 /// @param hContext - Handle passed back from SwrCreateContext
1259 /// @param topology - Specifies topology for draw.
1260 /// @param numIndices - Number of indices to read sequentially from index buffer.
1261 /// @param numInstances - Number of instances to render.
1262 /// @param indexOffset - Starting index into index buffer.
1263 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1264 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1265 void SwrDrawIndexedInstanced(
1267 PRIMITIVE_TOPOLOGY topology
,
1268 uint32_t numIndices
,
1269 uint32_t numInstances
,
1270 uint32_t indexOffset
,
1272 uint32_t startInstance
)
// Argument order differs from this function's own signature:
// DrawIndexedInstance takes indexOffset and baseVertex before numInstances.
1274 DrawIndexedInstance(hContext
, topology
, numIndices
, indexOffset
, baseVertex
, numInstances
, startInstance
);
1277 //////////////////////////////////////////////////////////////////////////
1278 /// @brief SwrInvalidateTiles
/// Fills in a DISCARDINVALIDATETILES front-end work item on the current draw
/// context and queues it.
1279 /// @param hContext - Handle passed back from SwrCreateContext
1280 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate.
1281 void SwrInvalidateTiles(
1283 uint32_t attachmentMask
)
1285 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
1286 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1288 pDC
->FeWork
.type
= DISCARDINVALIDATETILES
;
1289 pDC
->FeWork
.pfnWork
= ProcessDiscardInvalidateTiles
;
1290 pDC
->FeWork
.desc
.discardInvalidateTiles
.attachmentMask
= attachmentMask
;
// The rect is zero-filled here; presumably an all-zero rect selects the whole
// surface, as the SwrDiscardRect parameter docs describe -- TODO confirm.
1291 memset(&pDC
->FeWork
.desc
.discardInvalidateTiles
.rect
, 0, sizeof(SWR_RECT
));
// Tiles are marked SWR_TILE_INVALID; no new tiles are created and partial
// tiles are not excluded.
1292 pDC
->FeWork
.desc
.discardInvalidateTiles
.newTileState
= SWR_TILE_INVALID
;
1293 pDC
->FeWork
.desc
.discardInvalidateTiles
.createNewTiles
= false;
1294 pDC
->FeWork
.desc
.discardInvalidateTiles
.fullTilesOnly
= false;
1297 QueueDraw(pContext
);
1300 //////////////////////////////////////////////////////////////////////////
1301 /// @brief SwrDiscardRect
/// Fills in a DISCARDINVALIDATETILES front-end work item for the given rect
/// on the current draw context and queues it.
1302 /// @param hContext - Handle passed back from SwrCreateContext
1303 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
1304 /// @param rect - if rect is all zeros, the entire attachment surface will be discarded
1305 void SwrDiscardRect(
1307 uint32_t attachmentMask
,
1310 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
1311 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1313 // Queue a discard of the hot tiles covered by rect
1314 pDC
->FeWork
.type
= DISCARDINVALIDATETILES
;
1315 pDC
->FeWork
.pfnWork
= ProcessDiscardInvalidateTiles
;
1316 pDC
->FeWork
.desc
.discardInvalidateTiles
.attachmentMask
= attachmentMask
;
1317 pDC
->FeWork
.desc
.discardInvalidateTiles
.rect
= rect
;
// Unlike SwrInvalidateTiles, tiles are marked SWR_TILE_RESOLVED, new tiles
// are created, and the work is restricted to full tiles.
1318 pDC
->FeWork
.desc
.discardInvalidateTiles
.newTileState
= SWR_TILE_RESOLVED
;
1319 pDC
->FeWork
.desc
.discardInvalidateTiles
.createNewTiles
= true;
1320 pDC
->FeWork
.desc
.discardInvalidateTiles
.fullTilesOnly
= true;
1323 QueueDraw(pContext
);
1326 //////////////////////////////////////////////////////////////////////////
1327 /// @brief SwrDispatch
/// Marks the current draw context as compute, records the thread group
/// counts in an arena-allocated COMPUTE_DESC, and queues the dispatch.
1328 /// @param hContext - Handle passed back from SwrCreateContext
1329 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
1330 /// @param threadGroupCountY - Number of thread groups dispatched in Y direction
1331 /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
1334 uint32_t threadGroupCountX
,
1335 uint32_t threadGroupCountY
,
1336 uint32_t threadGroupCountZ
)
1343 RDTSC_START(APIDispatch
);
1344 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
1345 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1347 pDC
->isCompute
= true; // This is a compute context.
1349 // Ensure spill fill pointers are initialized to nullptr.
// NOTE(review): sizeof(pDC->pSpillFill) covers every entry only if pSpillFill
// is an array member rather than a pointer -- confirm against DRAW_CONTEXT.
1350 memset(pDC
->pSpillFill
, 0, sizeof(pDC
->pSpillFill
));
// Task descriptor comes from the draw context's arena, 64-byte aligned.
1352 COMPUTE_DESC
* pTaskData
= (COMPUTE_DESC
*)pDC
->pArena
->AllocAligned(sizeof(COMPUTE_DESC
), 64);
1354 pTaskData
->threadGroupCountX
= threadGroupCountX
;
1355 pTaskData
->threadGroupCountY
= threadGroupCountY
;
1356 pTaskData
->threadGroupCountZ
= threadGroupCountZ
;
// NOTE(review): the product is computed in uint32_t and would wrap for very
// large group counts -- confirm callers bound these values.
1358 uint32_t totalThreadGroups
= threadGroupCountX
* threadGroupCountY
* threadGroupCountZ
;
1359 pDC
->pDispatch
->initialize(totalThreadGroups
, pTaskData
);
1361 QueueDispatch(pContext
);
1362 RDTSC_STOP(APIDispatch
, threadGroupCountX
* threadGroupCountY
* threadGroupCountZ
, 0);
1365 // Deswizzles, converts and stores current contents of the hot tiles to surface
1366 // described by pState
// Sets up macro tile scissors for the current draw context, then queues a
// STORETILES front-end work item carrying the attachment and the tile state
// to apply after the store.
1369 SWR_RENDERTARGET_ATTACHMENT attachment
,
1370 SWR_TILE_STATE postStoreTileState
)
1372 RDTSC_START(APIStoreTiles
);
1374 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
1375 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1377 SetupMacroTileScissors(pDC
);
1379 pDC
->FeWork
.type
= STORETILES
;
1380 pDC
->FeWork
.pfnWork
= ProcessStoreTiles
;
1381 pDC
->FeWork
.desc
.storeTiles
.attachment
= attachment
;
1382 pDC
->FeWork
.desc
.storeTiles
.postStoreTileState
= postStoreTileState
;
1385 QueueDraw(pContext
);
1387 RDTSC_STOP(APIStoreTiles
, 0, 0);
/// @brief SwrClearRenderTarget
/// Sets up macro tile scissors for the current draw context, then queues a
/// CLEAR front-end work item carrying the clear mask, the four color
/// components, the depth value and the stencil value.
1390 void SwrClearRenderTarget(
1393 const float clearColor
[4],
1397 RDTSC_START(APIClearRenderTarget
);
1399 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
1401 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1403 SetupMacroTileScissors(pDC
);
1406 flags
.mask
= clearMask
;
1408 pDC
->FeWork
.type
= CLEAR
;
1409 pDC
->FeWork
.pfnWork
= ProcessClear
;
1410 pDC
->FeWork
.desc
.clear
.flags
= flags
;
1411 pDC
->FeWork
.desc
.clear
.clearDepth
= z
;
// Copy the four color components one by one into the work descriptor.
1412 pDC
->FeWork
.desc
.clear
.clearRTColor
[0] = clearColor
[0];
1413 pDC
->FeWork
.desc
.clear
.clearRTColor
[1] = clearColor
[1];
1414 pDC
->FeWork
.desc
.clear
.clearRTColor
[2] = clearColor
[2];
1415 pDC
->FeWork
.desc
.clear
.clearRTColor
[3] = clearColor
[3];
1416 pDC
->FeWork
.desc
.clear
.clearStencil
= stencil
;
1419 QueueDraw(pContext
);
1421 RDTSC_STOP(APIClearRenderTarget
, 0, pDC
->drawId
);
1424 //////////////////////////////////////////////////////////////////////////
1425 /// @brief Returns a pointer to the private context state for the current
1426 /// draw operation. This is used for external components such as the
1428 /// SWR is responsible for the allocation of the private context state.
1429 /// @param hContext - Handle passed back from SwrCreateContext
1430 VOID
* SwrGetPrivateContextState(
1433 SWR_CONTEXT
* pContext
= GetContext(hContext
);
1434 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1435 DRAW_STATE
* pState
= pDC
->pState
;
// Allocate lazily on first request: privateStateSize bytes from the draw
// state's arena, aligned to KNOB_SIMD_WIDTH * sizeof(float). Later calls on
// the same draw state return the same pointer.
1437 if (pState
->pPrivateState
== nullptr)
1439 pState
->pPrivateState
= pState
->pArena
->AllocAligned(pContext
->privateStateSize
, KNOB_SIMD_WIDTH
*sizeof(float));
1442 return pState
->pPrivateState
;
1445 //////////////////////////////////////////////////////////////////////////
1446 /// @brief Clients can use this to allocate memory for draw/dispatch
1447 /// operations. The memory will automatically be freed once operation
1448 /// has completed. Client can use this to allocate binding tables,
1449 /// etc. needed for shader execution.
1450 /// @param hContext - Handle passed back from SwrCreateContext
1451 /// @param size - Size of allocation
1452 /// @param align - Alignment needed for allocation.
/// @return Pointer returned by the current draw state's arena AllocAligned().
1453 VOID
* SwrAllocDrawContextMemory(
1458 SWR_CONTEXT
* pContext
= GetContext(hContext
);
1459 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// The allocation comes from the arena owned by the current draw state.
1461 return pDC
->pState
->pArena
->AllocAligned(size
, align
);
1464 //////////////////////////////////////////////////////////////////////////
1465 /// @brief Returns pointer to SWR stats.
1466 /// @note The counters are atomically incremented by multiple threads.
1467 /// When calling this, you need to ensure all previous operations
1469 /// @todo If necessary, add a callback to avoid stalling the pipe to
1470 /// sample the counters.
1471 /// @param hContext - Handle passed back from SwrCreateContext
1472 /// @param pStats - SWR will fill this out for caller.
1477 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1478 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// The query is queued as a QUERYSTATS front-end work item; pStats is stored
// in the work descriptor for ProcessQueryStats.
1480 pDC
->FeWork
.type
= QUERYSTATS
;
1481 pDC
->FeWork
.pfnWork
= ProcessQueryStats
;
1482 pDC
->FeWork
.desc
.queryStats
.pStats
= pStats
;
1484 // cannot execute until all previous draws have completed
// The dependency is set to the id just below this work item's own drawId.
1485 pDC
->dependency
= pDC
->drawId
- 1;
1488 QueueDraw(pContext
);
1491 //////////////////////////////////////////////////////////////////////////
1492 /// @brief Enables stats counting
1493 /// @param hContext - Handle passed back from SwrCreateContext
1494 /// @param enable - If true then counts are incremented.
1495 void SwrEnableStats(
1499 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1500 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// Stores the flag in the current draw context's API state (state.enableStats).
1502 pDC
->pState
->state
.enableStats
= enable
;
1505 //////////////////////////////////////////////////////////////////////////
1506 /// @brief Mark end of frame - used for performance profiling
1507 /// @param hContext - Handle passed back from SwrCreateContext
1508 void SWR_API
SwrEndFrame(