1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief API implementation
27 ******************************************************************************/
35 #include "core/backend.h"
36 #include "core/context.h"
37 #include "core/depthstencil.h"
38 #include "core/frontend.h"
39 #include "core/rasterizer.h"
40 #include "core/rdtsc_core.h"
41 #include "core/threads.h"
42 #include "core/tilemgr.h"
43 #include "core/clip.h"
44 #include "core/utils.h"
46 #include "common/simdintrin.h"
47 #include "common/os.h"
49 void SetupDefaultState(SWR_CONTEXT
*pContext
);
51 //////////////////////////////////////////////////////////////////////////
52 /// @brief Create SWR Context.
/// Allocates aligned storage for an SWR_CONTEXT, zeroes it and constructs the
/// context in place. It then initializes the draw-context ring (dcRing) and
/// draw-state ring (dsRing) with one CachingArena per slot, constructs the
/// MacroTileMgr / DispatchQueue arrays, allocates per-worker scratch memory,
/// installs the default state and the hot tile manager, and copies the tile
/// load / store / clear callbacks out of pCreateInfo.
53 /// @param pCreateInfo - pointer to creation info.
/// It is also written to: pBucketMgr (under KNOB_ENABLE_RDTSC) and
/// contextSaveSize are passed back to the caller through it.
/// @return The new context, cast to an opaque HANDLE.
/// NOTE(review): no allocation result is checked for null in the lines
/// visible here - confirm that this is intentional.
54 HANDLE
SwrCreateContext(
55 SWR_CREATECONTEXT_INFO
* pCreateInfo
)
// Zero the raw storage first, then run the SWR_CONTEXT constructor in place.
60 void* pContextMem
= _aligned_malloc(sizeof(SWR_CONTEXT
), KNOB_SIMD_WIDTH
* 4);
61 memset(pContextMem
, 0, sizeof(SWR_CONTEXT
));
62 SWR_CONTEXT
*pContext
= new (pContextMem
) SWR_CONTEXT();
64 pContext
->driverType
= pCreateInfo
->driver
;
65 pContext
->privateStateSize
= pCreateInfo
->privateStateSize
;
// Both rings are sized to KNOB_MAX_DRAWS_IN_FLIGHT entries.
67 pContext
->dcRing
.Init(KNOB_MAX_DRAWS_IN_FLIGHT
);
68 pContext
->dsRing
.Init(KNOB_MAX_DRAWS_IN_FLIGHT
);
// Raw 64-byte aligned storage; the elements are placement-constructed in the loop below.
70 pContext
->pMacroTileManagerArray
= (MacroTileMgr
*)_aligned_malloc(sizeof(MacroTileMgr
) * KNOB_MAX_DRAWS_IN_FLIGHT
, 64);
71 pContext
->pDispatchQueueArray
= (DispatchQueue
*)_aligned_malloc(sizeof(DispatchQueue
) * KNOB_MAX_DRAWS_IN_FLIGHT
, 64);
// Per ring slot: one arena for the draw context (also handed to that slot's
// MacroTileMgr) and a separate arena for the draw state.
73 for (uint32_t dc
= 0; dc
< KNOB_MAX_DRAWS_IN_FLIGHT
; ++dc
)
75 pContext
->dcRing
[dc
].pArena
= new CachingArena(pContext
->cachingArenaAllocator
);
76 new (&pContext
->pMacroTileManagerArray
[dc
]) MacroTileMgr(*pContext
->dcRing
[dc
].pArena
);
77 new (&pContext
->pDispatchQueueArray
[dc
]) DispatchQueue();
79 pContext
->dsRing
[dc
].pArena
= new CachingArena(pContext
->cachingArenaAllocator
);
82 if (!KNOB_SINGLE_THREADED
)
// The lock and condition variable are zeroed and then constructed in place.
84 memset(&pContext
->WaitLock
, 0, sizeof(pContext
->WaitLock
));
85 memset(&pContext
->FifosNotEmpty
, 0, sizeof(pContext
->FifosNotEmpty
));
86 new (&pContext
->WaitLock
) std::mutex();
87 new (&pContext
->FifosNotEmpty
) std::condition_variable();
89 CreateThreadPool(pContext
, &pContext
->threadPool
);
92 // Calling createThreadPool() above can set SINGLE_THREADED
93 if (KNOB_SINGLE_THREADED
)
95 SET_KNOB(HYPERTHREADED_FE
, false);
96 pContext
->NumWorkerThreads
= 1;
97 pContext
->NumFEThreads
= 1;
98 pContext
->NumBEThreads
= 1;
101 // Allocate scratch space for workers.
102 ///@note We could lazily allocate this but it's a rather small amount of memory.
103 for (uint32_t i
= 0; i
< pContext
->NumWorkerThreads
; ++i
)
// Use the worker's NUMA node when thread data exists, otherwise node 0.
106 uint32_t numaNode
= pContext
->threadPool
.pThreadData
?
107 pContext
->threadPool
.pThreadData
[i
].numaId
: 0;
// Two allocation paths follow (VirtualAllocExNuma and _aligned_malloc);
// presumably they are selected by a platform #if - TODO confirm.
// NOTE(review): the size is written as 32 * sizeof(KILOBYTE). If KILOBYTE is a
// numeric constant, sizeof() yields the size of its type, not 1024 - confirm
// the intended scratch size.
108 pContext
->pScratch
[i
] = (uint8_t*)VirtualAllocExNuma(
109 GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE
),
110 MEM_RESERVE
| MEM_COMMIT
, PAGE_READWRITE
,
113 pContext
->pScratch
[i
] = (uint8_t*)_aligned_malloc(32 * sizeof(KILOBYTE
), KNOB_SIMD_WIDTH
* 4);
117 // State setup AFTER context is fully initialized
118 SetupDefaultState(pContext
);
120 // initialize hot tile manager
121 pContext
->pHotTileMgr
= new HotTileMgr();
123 // initialize function pointer tables
124 InitClearTilesTable();
126 // initialize load / store / clear tile functions supplied by the caller
127 pContext
->pfnLoadTile
= pCreateInfo
->pfnLoadTile
;
128 pContext
->pfnStoreTile
= pCreateInfo
->pfnStoreTile
;
129 pContext
->pfnClearTile
= pCreateInfo
->pfnClearTile
;
131 // pass pointer to bucket manager back to caller
132 #ifdef KNOB_ENABLE_RDTSC
133 pCreateInfo
->pBucketMgr
= &gBucketMgr
;
// Tell the caller how large a block SwrSaveState / SwrRestoreState will copy.
136 pCreateInfo
->contextSaveSize
= sizeof(API_STATE
);
138 return (HANDLE
)pContext
;
// Tears down a context created by SwrCreateContext.
// Shuts down the thread pool, deletes the per-slot arenas, runs the destructors
// of the MacroTileMgr / DispatchQueue elements and frees their arrays, releases
// the per-worker scratch buffers and the hot tile manager, then destroys the
// SWR_CONTEXT in place and frees its aligned storage.
// @param hContext - Handle passed back from SwrCreateContext
141 void SwrDestroyContext(HANDLE hContext
)
143 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
144 DestroyThreadPool(pContext
, &pContext
->threadPool
);
// The array elements were placement-constructed, so they are destroyed
// explicitly here and the raw storage is freed afterwards.
147 for (uint32_t i
= 0; i
< KNOB_MAX_DRAWS_IN_FLIGHT
; ++i
)
149 delete pContext
->dcRing
[i
].pArena
;
150 delete pContext
->dsRing
[i
].pArena
;
151 pContext
->pMacroTileManagerArray
[i
].~MacroTileMgr();
152 pContext
->pDispatchQueueArray
[i
].~DispatchQueue();
155 _aligned_free(pContext
->pDispatchQueueArray
);
156 _aligned_free(pContext
->pMacroTileManagerArray
);
158 // Free scratch space.
159 for (uint32_t i
= 0; i
< pContext
->NumWorkerThreads
; ++i
)
// Two release paths (VirtualFree and _aligned_free) mirror the two allocation
// paths in SwrCreateContext; presumably selected by a platform #if - TODO confirm.
162 VirtualFree(pContext
->pScratch
[i
], 0, MEM_RELEASE
);
164 _aligned_free(pContext
->pScratch
[i
]);
168 delete(pContext
->pHotTileMgr
);
// The context was constructed with placement new, so destroy it explicitly
// before releasing the aligned block.
170 pContext
->~SWR_CONTEXT();
171 _aligned_free((SWR_CONTEXT
*)hContext
);
174 void CopyState(DRAW_STATE
& dst
, const DRAW_STATE
& src
)
176 memcpy(&dst
.state
, &src
.state
, sizeof(API_STATE
));
179 void WakeAllThreads(SWR_CONTEXT
*pContext
)
181 pContext
->FifosNotEmpty
.notify_all();
// Submits the current draw context (pContext->pCurDrawContext) to the DC ring.
// @tparam IsDraw - selected by the callers: QueueDraw passes true, QueueDispatch passes false.
// @param pContext - context whose current draw context is queued.
// When KNOB_SINGLE_THREADED is set the calling thread runs the FE / BE (or compute)
// work itself and then completes the draw context; the RDTSC-bracketed
// WakeAllThreads call is presumably the multi-threaded branch - TODO confirm.
// On exit pCurDrawContext has been moved to pPrevDrawContext and reset to nullptr.
184 template<bool IsDraw
>
185 void QueueWork(SWR_CONTEXT
*pContext
)
187 DRAW_CONTEXT
* pDC
= pContext
->pCurDrawContext
;
// Ring slot for this draw: the draw id wrapped to the number of draws in flight.
188 uint32_t dcIndex
= pDC
->drawId
% KNOB_MAX_DRAWS_IN_FLIGHT
;
192 pDC
->pTileMgr
= &pContext
->pMacroTileManagerArray
[dcIndex
];
193 pDC
->pTileMgr
->initialize();
196 // Each worker thread looks at a DC for both FE and BE work at different times, so
197 // threadsDone starts at NumFEThreads + NumBEThreads. When the threadsDone counter has reached 0 then all workers
198 // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
199 // then moved on if all work is done.)
200 pContext
->pCurDrawContext
->threadsDone
= pContext
->NumFEThreads
+ pContext
->NumBEThreads
;
// The enqueue is performed while holding WaitLock.
204 std::unique_lock
<std::mutex
> lock(pContext
->WaitLock
);
205 pContext
->dcRing
.Enqueue();
208 if (KNOB_SINGLE_THREADED
)
210 // flush denormals to 0
211 uint32_t mxcsr
= _mm_getcsr();
212 _mm_setcsr(mxcsr
| _MM_FLUSH_ZERO_ON
| _MM_DENORMALS_ZERO_ON
);
// Function-local static: one TileSet is reused by every call of this
// instantiation, regardless of which context is passed in.
216 static TileSet lockedTiles
;
217 uint64_t curDraw
[2] = { pContext
->pCurDrawContext
->drawId
, pContext
->pCurDrawContext
->drawId
};
218 WorkOnFifoFE(pContext
, 0, curDraw
[0]);
219 WorkOnFifoBE(pContext
, 0, curDraw
[1], lockedTiles
, 0, 0);
223 uint64_t curDispatch
= pContext
->pCurDrawContext
->drawId
;
224 WorkOnCompute(pContext
, 0, curDispatch
);
227 // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
228 while (CompleteDrawContext(pContext
, pContext
->pCurDrawContext
) > 0) {}
235 RDTSC_START(APIDrawWakeAllThreads
);
236 WakeAllThreads(pContext
);
237 RDTSC_STOP(APIDrawWakeAllThreads
, 1, 0);
240 // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
241 pContext
->pPrevDrawContext
= pContext
->pCurDrawContext
;
242 pContext
->pCurDrawContext
= nullptr;
245 INLINE
void QueueDraw(SWR_CONTEXT
* pContext
)
247 QueueWork
<true>(pContext
);
250 INLINE
void QueueDispatch(SWR_CONTEXT
* pContext
)
252 QueueWork
<false>(pContext
);
// Returns the draw context that is currently being built, acquiring a new
// entry from the DC ring when pContext->pCurDrawContext is null.
// @param pContext - context to fetch the draw context for.
// @param isSplitDraw - only meaningful when a new DC is acquired: when true the
//        new DC reuses the previous DC's state pointer instead of copying the
//        state into its own DS ring entry.
// @return pContext->pCurDrawContext (non-null on return).
255 DRAW_CONTEXT
* GetDrawContext(SWR_CONTEXT
*pContext
, bool isSplitDraw
= false)
257 RDTSC_START(APIGetDrawContext
);
258 // If current draw context is null then need to obtain a new draw context to use from ring.
259 if (pContext
->pCurDrawContext
== nullptr)
261 // Need to wait for a free entry.
262 while (pContext
->dcRing
.IsFull())
267 uint64_t curDraw
= pContext
->dcRing
.GetHead();
268 uint32_t dcIndex
= curDraw
% KNOB_MAX_DRAWS_IN_FLIGHT
;
// Function-local statics: these trackers are shared by every context that
// calls this function, not kept per context.
270 static uint64_t lastDrawChecked
;
271 static uint32_t lastFrameChecked
;
// Trigger a cleanup when more than 2 frames or more than 0x10000 draws have
// passed since the last one.
272 if ((pContext
->frameCount
- lastFrameChecked
) > 2 ||
273 (curDraw
- lastDrawChecked
) > 0x10000)
275 // Take this opportunity to clean-up old arena allocations
276 pContext
->cachingArenaAllocator
.FreeOldBlocks();
278 lastFrameChecked
= pContext
->frameCount
;
279 lastDrawChecked
= curDraw
;
282 DRAW_CONTEXT
* pCurDrawContext
= &pContext
->dcRing
[dcIndex
];
283 pContext
->pCurDrawContext
= pCurDrawContext
;
285 // Assign next available entry in DS ring to this DC.
286 uint32_t dsIndex
= pContext
->curStateId
% KNOB_MAX_DRAWS_IN_FLIGHT
;
287 pCurDrawContext
->pState
= &pContext
->dsRing
[dsIndex
];
289 // Copy previous state to current state.
290 if (pContext
->pPrevDrawContext
)
292 DRAW_CONTEXT
* pPrevDrawContext
= pContext
->pPrevDrawContext
;
294 // If we're splitting our draw then we can just use the same state from the previous
295 // draw. In this case, we won't increment the DS ring index so the next non-split
296 // draw can receive the state.
297 if (isSplitDraw
== false)
299 CopyState(*pCurDrawContext
->pState
, *pPrevDrawContext
->pState
);
301 // Should have been cleaned up previously
302 SWR_ASSERT(pCurDrawContext
->pState
->pArena
->IsEmpty() == true);
304 pCurDrawContext
->pState
->pPrivateState
= nullptr;
306 pContext
->curStateId
++; // Progress state ring index forward.
310 // If it's a split draw then just copy the state pointer over
311 // since it's the same draw.
312 pCurDrawContext
->pState
= pPrevDrawContext
->pState
;
313 SWR_ASSERT(pPrevDrawContext
->cleanupState
== false);
// No previous draw context: the DS ring entry is used as is.
318 SWR_ASSERT(pCurDrawContext
->pState
->pArena
->IsEmpty() == true);
319 pContext
->curStateId
++; // Progress state ring index forward.
322 SWR_ASSERT(pCurDrawContext
->pArena
->IsEmpty() == true);
// Reset the per-draw bookkeeping of the recycled ring entry.
324 pCurDrawContext
->dependency
= 0;
325 pCurDrawContext
->pContext
= pContext
;
326 pCurDrawContext
->isCompute
= false; // Dispatch has to set this to true.
328 pCurDrawContext
->doneFE
= false;
329 pCurDrawContext
->FeLock
= 0;
330 pCurDrawContext
->threadsDone
= 0;
332 // Assign unique drawId for this DC
333 pCurDrawContext
->drawId
= pContext
->dcRing
.GetHead();
335 pCurDrawContext
->cleanupState
= true;
339 SWR_ASSERT(isSplitDraw
== false, "Split draw should only be used when obtaining a new DC");
342 RDTSC_STOP(APIGetDrawContext
, 0, 0);
343 return pContext
->pCurDrawContext
;
346 API_STATE
* GetDrawState(SWR_CONTEXT
*pContext
)
348 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
349 SWR_ASSERT(pDC
->pState
!= nullptr);
351 return &pDC
->pState
->state
;
// Copies the current API_STATE into a caller-provided block.
// @param hContext - context handle; cast directly to SWR_CONTEXT*.
// @param pOutputStateBlock - destination; must hold at least sizeof(API_STATE) bytes.
// @param memSize - size of the destination block in bytes.
// NOTE(review): the pointer and size are validated by SWR_ASSERT only; if the
// assert compiles out, an undersized block would be overrun - confirm.
354 void SWR_API
SwrSaveState(
356 void* pOutputStateBlock
,
359 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
360 auto pSrc
= GetDrawState(pContext
);
361 SWR_ASSERT(pOutputStateBlock
&& memSize
>= sizeof(*pSrc
));
363 memcpy(pOutputStateBlock
, pSrc
, sizeof(*pSrc
));
// Overwrites the current API_STATE with the contents of a caller-provided block.
// @param hContext - context handle; cast directly to SWR_CONTEXT*.
// @param pStateBlock - source; sizeof(API_STATE) bytes are read from it.
// @param memSize - size of the source block in bytes.
// NOTE(review): the pointer and size are validated by SWR_ASSERT only; if the
// assert compiles out, an undersized block would be over-read - confirm.
366 void SWR_API
SwrRestoreState(
368 const void* pStateBlock
,
371 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
372 auto pDst
= GetDrawState(pContext
);
373 SWR_ASSERT(pStateBlock
&& memSize
>= sizeof(*pDst
));
375 memcpy(pDst
, pStateBlock
, sizeof(*pDst
));
378 void SetupDefaultState(SWR_CONTEXT
*pContext
)
380 API_STATE
* pState
= GetDrawState(pContext
);
382 pState
->rastState
.cullMode
= SWR_CULLMODE_NONE
;
383 pState
->rastState
.frontWinding
= SWR_FRONTWINDING_CCW
;
386 static INLINE SWR_CONTEXT
* GetContext(HANDLE hContext
)
388 return (SWR_CONTEXT
*)hContext
;
// Sets up a SYNC work item on the current draw context.
// The front-end work entry is given the ProcessSync handler together with the
// caller's callback and three user data values, and the draw context is made
// dependent on the previous draw id.
// @param hContext - Handle passed back from SwrCreateContext
// @param pfnFunc - callback stored in the sync descriptor; must not be null (asserted).
// @param userData, userData2, userData3 - opaque values stored alongside the callback.
// NOTE(review): no call that enqueues this draw context is visible in the
// lines here - confirm where the sync gets queued.
391 void SwrSync(HANDLE hContext
, PFN_CALLBACK_FUNC pfnFunc
, uint64_t userData
, uint64_t userData2
, uint64_t userData3
)
393 RDTSC_START(APISync
);
395 SWR_ASSERT(pfnFunc
!= nullptr);
397 SWR_CONTEXT
*pContext
= GetContext(hContext
);
398 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
400 pDC
->FeWork
.type
= SYNC
;
401 pDC
->FeWork
.pfnWork
= ProcessSync
;
402 pDC
->FeWork
.desc
.sync
.pfnCallbackFunc
= pfnFunc
;
403 pDC
->FeWork
.desc
.sync
.userData
= userData
;
404 pDC
->FeWork
.desc
.sync
.userData2
= userData2
;
405 pDC
->FeWork
.desc
.sync
.userData3
= userData3
;
407 // cannot execute until all previous draws have completed
408 pDC
->dependency
= pDC
->drawId
- 1;
413 RDTSC_STOP(APISync
, 1, 0);
// Loops for as long as the context's DC ring is not empty, i.e. until every
// queued draw context has been retired.
// @param hContext - Handle passed back from SwrCreateContext
416 void SwrWaitForIdle(HANDLE hContext
)
418 SWR_CONTEXT
*pContext
= GetContext(hContext
);
420 RDTSC_START(APIWaitForIdle
);
422 while (!pContext
->dcRing
.IsEmpty())
427 RDTSC_STOP(APIWaitForIdle
, 1, 0);
// Copies numBuffers vertex buffer descriptions into the current draw state.
// Each entry is stored at the slot named by its own index field, not at its
// position in the input array.
// @param hContext - context handle, resolved with GetContext.
// @param numBuffers - number of entries in pVertexBuffers.
// @param pVertexBuffers - array of vertex buffer states to install.
// NOTE(review): pVB->index is used as an array subscript with no visible
// bounds check - confirm that callers guarantee a valid slot.
430 void SwrSetVertexBuffers(
433 const SWR_VERTEX_BUFFER_STATE
* pVertexBuffers
)
435 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
437 for (uint32_t i
= 0; i
< numBuffers
; ++i
)
439 const SWR_VERTEX_BUFFER_STATE
*pVB
= &pVertexBuffers
[i
];
440 pState
->vertexBuffers
[pVB
->index
] = *pVB
;
// Copies *pIndexBuffer into the indexBuffer member of the current draw state.
// @param hContext - context handle, resolved with GetContext.
// @param pIndexBuffer - index buffer state to install; dereferenced unconditionally.
444 void SwrSetIndexBuffer(
446 const SWR_INDEX_BUFFER_STATE
* pIndexBuffer
)
448 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
450 pState
->indexBuffer
= *pIndexBuffer
;
// Stores the fetch function pointer in the current draw state.
// @param hContext - context handle, resolved with GetContext.
// @param pfnFetchFunc - function pointer to store; not validated here.
453 void SwrSetFetchFunc(
455 PFN_FETCH_FUNC pfnFetchFunc
)
457 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
459 pState
->pfnFetchFunc
= pfnFetchFunc
;
464 PFN_SO_FUNC pfnSoFunc
,
465 uint32_t streamIndex
)
467 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
469 SWR_ASSERT(streamIndex
< MAX_SO_STREAMS
);
471 pState
->pfnSoFunc
[streamIndex
] = pfnSoFunc
;
476 SWR_STREAMOUT_STATE
* pSoState
)
478 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
480 pState
->soState
= *pSoState
;
// Copies *pSoBuffer into the given stream-out buffer slot of the current draw state.
// @param hContext - context handle, resolved with GetContext.
// @param pSoBuffer - stream-out buffer description to install; dereferenced unconditionally.
// @param slot - destination slot; asserted to be in [0, 3].
// NOTE(review): the slot range is enforced by SWR_ASSERT only - confirm
// behaviour when asserts are disabled.
483 void SwrSetSoBuffers(
485 SWR_STREAMOUT_BUFFER
* pSoBuffer
,
488 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
490 SWR_ASSERT((slot
< 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot
);
492 pState
->soBuffer
[slot
] = *pSoBuffer
;
// Stores the vertex function pointer in the current draw state.
// @param hContext - context handle, resolved with GetContext.
// @param pfnVertexFunc - function pointer to store; not validated here.
495 void SwrSetVertexFunc(
497 PFN_VERTEX_FUNC pfnVertexFunc
)
499 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
501 pState
->pfnVertexFunc
= pfnVertexFunc
;
// Copies *pFEState into the frontendState member of the current draw state.
// @param hContext - context handle, resolved with GetContext.
// @param pFEState - frontend state to install; dereferenced unconditionally.
504 void SwrSetFrontendState(
506 SWR_FRONTEND_STATE
*pFEState
)
508 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
509 pState
->frontendState
= *pFEState
;
514 SWR_GS_STATE
*pGSState
)
516 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
517 pState
->gsState
= *pGSState
;
522 PFN_GS_FUNC pfnGsFunc
)
524 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
525 pState
->pfnGsFunc
= pfnGsFunc
;
530 PFN_CS_FUNC pfnCsFunc
,
531 uint32_t totalThreadsInGroup
,
532 uint32_t totalSpillFillSize
)
534 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
535 pState
->pfnCsFunc
= pfnCsFunc
;
536 pState
->totalThreadsInGroup
= totalThreadsInGroup
;
537 pState
->totalSpillFillSize
= totalSpillFillSize
;
542 SWR_TS_STATE
*pState
)
544 API_STATE
* pApiState
= GetDrawState(GetContext(hContext
));
545 pApiState
->tsState
= *pState
;
552 API_STATE
* pApiState
= GetDrawState(GetContext(hContext
));
553 pApiState
->pfnHsFunc
= pfnFunc
;
560 API_STATE
* pApiState
= GetDrawState(GetContext(hContext
));
561 pApiState
->pfnDsFunc
= pfnFunc
;
// Copies *pDSState into the depthStencilState member of the current draw state.
// @param hContext - context handle, resolved with GetContext.
// @param pDSState - depth/stencil state to install; dereferenced unconditionally.
564 void SwrSetDepthStencilState(
566 SWR_DEPTH_STENCIL_STATE
*pDSState
)
568 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
570 pState
->depthStencilState
= *pDSState
;
// Copies *pBEState into the backendState member of the current draw state.
// @param hContext - context handle, resolved with GetContext.
// @param pBEState - backend state to install; dereferenced unconditionally.
573 void SwrSetBackendState(
575 SWR_BACKEND_STATE
*pBEState
)
577 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
579 pState
->backendState
= *pBEState
;
// Copies *pPSState into the psState member of the current draw state.
// @param hContext - context handle, resolved with GetContext.
// @param pPSState - pixel shader state to install; dereferenced unconditionally.
582 void SwrSetPixelShaderState(
584 SWR_PS_STATE
*pPSState
)
586 API_STATE
*pState
= GetDrawState(GetContext(hContext
));
587 pState
->psState
= *pPSState
;
// Copies the caller's blend state into the current draw state.
// Unlike most setters in this file, the copy is a bytewise memcpy of
// sizeof(SWR_BLEND_STATE) rather than a struct assignment.
// @param hContext - context handle, resolved with GetContext.
// @param pBlendState - blend state to install; read unconditionally.
590 void SwrSetBlendState(
592 SWR_BLEND_STATE
*pBlendState
)
594 API_STATE
*pState
= GetDrawState(GetContext(hContext
));
595 memcpy(&pState
->blendState
, pBlendState
, sizeof(SWR_BLEND_STATE
));
// Stores the blend function for one render target in the current draw state.
// @param hContext - context handle, resolved with GetContext.
// @param renderTarget - render target index; asserted to be below SWR_NUM_RENDERTARGETS.
// @param pfnBlendFunc - blend function pointer to store for that render target.
// NOTE(review): the index is range-checked by SWR_ASSERT only - confirm
// behaviour when asserts are disabled.
598 void SwrSetBlendFunc(
600 uint32_t renderTarget
,
601 PFN_BLEND_JIT_FUNC pfnBlendFunc
)
603 SWR_ASSERT(renderTarget
< SWR_NUM_RENDERTARGETS
);
604 API_STATE
*pState
= GetDrawState(GetContext(hContext
));
605 pState
->pfnBlendFunc
[renderTarget
] = pfnBlendFunc
;
613 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
615 static const uint8_t IDENTITY_MAP
[] =
617 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
618 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
620 static_assert(sizeof(IDENTITY_MAP
) == sizeof(pState
->linkageMap
),
621 "Update for new value of MAX_ATTRIBUTES");
623 pState
->linkageMask
= mask
;
624 pState
->linkageCount
= _mm_popcnt_u32(mask
);
630 memcpy(pState
->linkageMap
, pMap
, pState
->linkageCount
);
633 // update guardband multipliers for the viewport
634 void updateGuardband(API_STATE
*pState
)
636 // guardband center is viewport center
637 pState
->gbState
.left
= KNOB_GUARDBAND_WIDTH
/ pState
->vp
[0].width
;
638 pState
->gbState
.right
= KNOB_GUARDBAND_WIDTH
/ pState
->vp
[0].width
;
639 pState
->gbState
.top
= KNOB_GUARDBAND_HEIGHT
/ pState
->vp
[0].height
;
640 pState
->gbState
.bottom
= KNOB_GUARDBAND_HEIGHT
/ pState
->vp
[0].height
;
// Copies the caller's rasterizer state into the current draw state using a
// bytewise memcpy of sizeof(SWR_RASTSTATE).
// @param hContext - context handle, resolved with GetContext.
// @param pRastState - rasterizer state to install; read unconditionally.
643 void SwrSetRastState(
645 const SWR_RASTSTATE
*pRastState
)
647 SWR_CONTEXT
*pContext
= GetContext(hContext
);
648 API_STATE
* pState
= GetDrawState(pContext
);
650 memcpy(&pState
->rastState
, pRastState
, sizeof(SWR_RASTSTATE
));
// Installs numViewports viewports into the current draw state, starting at slot 0.
// The viewport matrices are copied from pMatrices when it is non-null; the
// "default viewport transform" computation below is presumably the fallback
// for a null pMatrices - TODO confirm. The guardband is refreshed at the end.
// @param hContext - context handle, resolved with GetContext.
// @param numViewports - number of entries in pViewports (and pMatrices);
//        asserted to be at most KNOB_NUM_VIEWPORTS_SCISSORS.
// @param pViewports - viewports to copy.
// @param pMatrices - optional viewport matrices.
// NOTE(review): the count is range-checked by SWR_ASSERT only; the memcpy
// sizes below scale with numViewports - confirm behaviour when asserts are
// disabled.
653 void SwrSetViewports(
655 uint32_t numViewports
,
656 const SWR_VIEWPORT
* pViewports
,
657 const SWR_VIEWPORT_MATRIX
* pMatrices
)
659 SWR_ASSERT(numViewports
<= KNOB_NUM_VIEWPORTS_SCISSORS
,
660 "Invalid number of viewports.");
662 SWR_CONTEXT
*pContext
= GetContext(hContext
);
663 API_STATE
* pState
= GetDrawState(pContext
);
665 memcpy(&pState
->vp
[0], pViewports
, sizeof(SWR_VIEWPORT
) * numViewports
);
667 if (pMatrices
!= nullptr)
669 memcpy(&pState
->vpMatrix
[0], pMatrices
, sizeof(SWR_VIEWPORT_MATRIX
) * numViewports
);
673 // Compute default viewport transform.
674 for (uint32_t i
= 0; i
< numViewports
; ++i
)
676 if (pContext
->driverType
== DX
)
// DX: half-extent scales with Y negated; offsets place the origin at the
// viewport center and depth maps onto [minZ, maxZ].
678 pState
->vpMatrix
[i
].m00
= pState
->vp
[i
].width
/ 2.0f
;
679 pState
->vpMatrix
[i
].m11
= -pState
->vp
[i
].height
/ 2.0f
;
680 pState
->vpMatrix
[i
].m22
= pState
->vp
[i
].maxZ
- pState
->vp
[i
].minZ
;
681 pState
->vpMatrix
[i
].m30
= pState
->vp
[i
].x
+ pState
->vpMatrix
[i
].m00
;
682 pState
->vpMatrix
[i
].m31
= pState
->vp
[i
].y
- pState
->vpMatrix
[i
].m11
;
683 pState
->vpMatrix
[i
].m32
= pState
->vp
[i
].minZ
;
687 // Standard, with the exception that Y is inverted.
// NOTE(review): m00 is built from (width - x) and m11 from (y - height), i.e.
// the viewport origin is folded into the scale terms - confirm this matches
// the intended transform.
688 pState
->vpMatrix
[i
].m00
= (pState
->vp
[i
].width
- pState
->vp
[i
].x
) / 2.0f
;
689 pState
->vpMatrix
[i
].m11
= (pState
->vp
[i
].y
- pState
->vp
[i
].height
) / 2.0f
;
690 pState
->vpMatrix
[i
].m22
= (pState
->vp
[i
].maxZ
- pState
->vp
[i
].minZ
) / 2.0f
;
691 pState
->vpMatrix
[i
].m30
= pState
->vp
[i
].x
+ pState
->vpMatrix
[i
].m00
;
692 pState
->vpMatrix
[i
].m31
= pState
->vp
[i
].height
+ pState
->vpMatrix
[i
].m11
;
693 pState
->vpMatrix
[i
].m32
= pState
->vp
[i
].minZ
+ pState
->vpMatrix
[i
].m22
;
695 // Now that the matrix is calculated, clip the view coords to screen size.
696 // OpenGL allows for negative x,y in the viewport.
697 pState
->vp
[i
].x
= std::max(pState
->vp
[i
].x
, 0.0f
);
698 pState
->vp
[i
].y
= std::max(pState
->vp
[i
].y
, 0.0f
);
// The guardband is always derived from viewport 0.
703 updateGuardband(pState
);
// Copies numScissors scissor rectangles into the current draw state, starting at slot 0.
// @param hContext - context handle, resolved with GetContext.
// @param numScissors - number of entries in pScissors; asserted to be at most
//        KNOB_NUM_VIEWPORTS_SCISSORS.
// @param pScissors - scissor rectangles to copy.
// NOTE(review): the count is range-checked by SWR_ASSERT only; the memcpy size
// scales with numScissors - confirm behaviour when asserts are disabled.
706 void SwrSetScissorRects(
708 uint32_t numScissors
,
709 const BBOX
* pScissors
)
711 SWR_ASSERT(numScissors
<= KNOB_NUM_VIEWPORTS_SCISSORS
,
712 "Invalid number of scissor rects.");
714 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
715 memcpy(&pState
->scissorRects
[0], pScissors
, numScissors
* sizeof(BBOX
));
718 void SetupMacroTileScissors(DRAW_CONTEXT
*pDC
)
720 API_STATE
*pState
= &pDC
->pState
->state
;
721 uint32_t left
, right
, top
, bottom
;
723 // Set up scissor dimensions based on scissor or viewport
724 if (pState
->rastState
.scissorEnable
)
726 // scissor rect right/bottom edge are exclusive, core expects scissor dimensions to be inclusive, so subtract one pixel from right/bottom edges
727 left
= pState
->scissorRects
[0].left
;
728 right
= pState
->scissorRects
[0].right
;
729 top
= pState
->scissorRects
[0].top
;
730 bottom
= pState
->scissorRects
[0].bottom
;
734 left
= (int32_t)pState
->vp
[0].x
;
735 right
= (int32_t)pState
->vp
[0].x
+ (int32_t)pState
->vp
[0].width
;
736 top
= (int32_t)pState
->vp
[0].y
;
737 bottom
= (int32_t)pState
->vp
[0].y
+ (int32_t)pState
->vp
[0].height
;
740 right
= std::min
<uint32_t>(right
, KNOB_MAX_SCISSOR_X
);
741 bottom
= std::min
<uint32_t>(bottom
, KNOB_MAX_SCISSOR_Y
);
743 if (left
> KNOB_MAX_SCISSOR_X
|| top
> KNOB_MAX_SCISSOR_Y
)
745 pState
->scissorInFixedPoint
.left
= 0;
746 pState
->scissorInFixedPoint
.right
= 0;
747 pState
->scissorInFixedPoint
.top
= 0;
748 pState
->scissorInFixedPoint
.bottom
= 0;
752 pState
->scissorInFixedPoint
.left
= left
* FIXED_POINT_SCALE
;
753 pState
->scissorInFixedPoint
.right
= right
* FIXED_POINT_SCALE
- 1;
754 pState
->scissorInFixedPoint
.top
= top
* FIXED_POINT_SCALE
;
755 pState
->scissorInFixedPoint
.bottom
= bottom
* FIXED_POINT_SCALE
- 1;
758 // templated backend function tables
759 extern PFN_BACKEND_FUNC gBackendNullPs
[SWR_MULTISAMPLE_TYPE_MAX
];
760 extern PFN_BACKEND_FUNC gBackendSingleSample
[2][2];
761 extern PFN_BACKEND_FUNC gBackendPixelRateTable
[SWR_MULTISAMPLE_TYPE_MAX
][SWR_MSAA_SAMPLE_PATTERN_MAX
][SWR_INPUT_COVERAGE_MAX
][2][2];
762 extern PFN_BACKEND_FUNC gBackendSampleRateTable
[SWR_MULTISAMPLE_TYPE_MAX
][SWR_INPUT_COVERAGE_MAX
][2];
763 extern PFN_OUTPUT_MERGER gBackendOutputMergerTable
[SWR_NUM_RENDERTARGETS
+ 1][SWR_MULTISAMPLE_TYPE_MAX
];
764 extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable
[2];
765 extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable
[2];
// Derives the per-draw pipeline selections from the API state of pDC and
// stores them back into the draw state:
//  - backend, output-merger and barycentric functions (backendFuncs)
//  - the primitive processing function (clipper or binner)
//  - the frontend attribute mask
//  - the depth / stencil / color hot-tile enables
//  - the depth quantization function
// @param pDC - draw context whose state (pDC->pState) is read and updated.
766 void SetupPipeline(DRAW_CONTEXT
*pDC
)
768 DRAW_STATE
* pState
= pDC
->pState
;
769 const SWR_RASTSTATE
&rastState
= pState
->state
.rastState
;
770 const SWR_PS_STATE
&psState
= pState
->state
.psState
;
771 BACKEND_FUNCS
& backendFuncs
= pState
->backendFuncs
;
// Used as a 0/1 index into the pixel-rate backend table.
772 const uint32_t forcedSampleCount
= (rastState
.bForcedSampleCount
) ? 1 : 0;
// No pixel shader bound: use the null-PS backend for the current sample count.
775 if (psState
.pfnPixelShader
== nullptr)
777 backendFuncs
.pfnBackend
= gBackendNullPs
[pState
->state
.rastState
.sampleCount
];
778 // always need to generate I & J per sample for Z interpolation
779 backendFuncs
.pfnCalcSampleBarycentrics
= gSampleBarycentricTable
[1];
783 const bool bMultisampleEnable
= ((rastState
.sampleCount
> SWR_MULTISAMPLE_1X
) || rastState
.bForcedSampleCount
) ? 1 : 0;
784 const uint32_t centroid
= ((psState
.barycentricsMask
& SWR_BARYCENTRIC_CENTROID_MASK
) > 0) ? 1 : 0;
786 // currently only support 'normal' input coverage
787 SWR_ASSERT(psState
.inputCoverage
== SWR_INPUT_COVERAGE_NORMAL
||
788 psState
.inputCoverage
== SWR_INPUT_COVERAGE_NONE
);
790 SWR_BARYCENTRICS_MASK barycentricsMask
= (SWR_BARYCENTRICS_MASK
)psState
.barycentricsMask
;
792 // select backend function
793 switch(psState
.shadingRate
)
795 case SWR_SHADING_RATE_PIXEL
:
796 if(bMultisampleEnable
)
798 // always need to generate I & J per sample for Z interpolation
799 barycentricsMask
= (SWR_BARYCENTRICS_MASK
)(barycentricsMask
| SWR_BARYCENTRIC_PER_SAMPLE_MASK
);
800 backendFuncs
.pfnBackend
= gBackendPixelRateTable
[rastState
.sampleCount
][rastState
.samplePattern
][psState
.inputCoverage
][centroid
][forcedSampleCount
];
801 backendFuncs
.pfnOutputMerger
= gBackendOutputMergerTable
[psState
.numRenderTargets
][pState
->state
.blendState
.sampleCount
];
805 // always need to generate I & J per pixel for Z interpolation
806 barycentricsMask
= (SWR_BARYCENTRICS_MASK
)(barycentricsMask
| SWR_BARYCENTRIC_PER_PIXEL_MASK
);
807 backendFuncs
.pfnBackend
= gBackendSingleSample
[psState
.inputCoverage
][centroid
];
808 backendFuncs
.pfnOutputMerger
= gBackendOutputMergerTable
[psState
.numRenderTargets
][SWR_MULTISAMPLE_1X
];
811 case SWR_SHADING_RATE_SAMPLE
:
812 SWR_ASSERT(rastState
.samplePattern
== SWR_MSAA_STANDARD_PATTERN
);
813 // always need to generate I & J per sample for Z interpolation
814 barycentricsMask
= (SWR_BARYCENTRICS_MASK
)(barycentricsMask
| SWR_BARYCENTRIC_PER_SAMPLE_MASK
);
815 backendFuncs
.pfnBackend
= gBackendSampleRateTable
[rastState
.sampleCount
][psState
.inputCoverage
][centroid
];
816 backendFuncs
.pfnOutputMerger
= gBackendOutputMergerTable
[psState
.numRenderTargets
][pState
->state
.blendState
.sampleCount
];
819 SWR_ASSERT(0 && "Invalid shading rate");
823 // setup pointer to function that generates necessary barycentrics required by the PS
824 bool bBarycentrics
= (barycentricsMask
& SWR_BARYCENTRIC_PER_PIXEL_MASK
) > 0 ? 1 : 0;
825 backendFuncs
.pfnCalcPixelBarycentrics
= gPixelBarycentricTable
[bBarycentrics
];
827 bBarycentrics
= (barycentricsMask
& SWR_BARYCENTRIC_PER_SAMPLE_MASK
) > 0 ? 1 : 0;
828 backendFuncs
.pfnCalcSampleBarycentrics
= gSampleBarycentricTable
[bBarycentrics
];
// Choose the clipper and the matching binner from the primitive topology.
831 PFN_PROCESS_PRIMS pfnBinner
;
832 switch (pState
->state
.topology
)
835 pState
->pfnProcessPrims
= ClipPoints
;
836 pfnBinner
= BinPoints
;
841 case TOP_LINE_LIST_ADJ
:
842 case TOP_LISTSTRIP_ADJ
:
843 pState
->pfnProcessPrims
= ClipLines
;
844 pfnBinner
= BinLines
;
847 pState
->pfnProcessPrims
= ClipTriangles
;
848 pfnBinner
= BinTriangles
;
852 // disable clipper if viewport transform is disabled
853 if (pState
->state
.frontendState
.vpTransformDisable
)
855 pState
->pfnProcessPrims
= pfnBinner
;
// Nothing downstream can observe the primitives (no PS, no depth/stencil
// test or write, no linked attributes): skip primitive processing entirely.
858 if ((pState
->state
.psState
.pfnPixelShader
== nullptr) &&
859 (pState
->state
.depthStencilState
.depthTestEnable
== FALSE
) &&
860 (pState
->state
.depthStencilState
.depthWriteEnable
== FALSE
) &&
861 (pState
->state
.depthStencilState
.stencilTestEnable
== FALSE
) &&
862 (pState
->state
.depthStencilState
.stencilWriteEnable
== FALSE
) &&
863 (pState
->state
.linkageCount
== 0))
865 pState
->pfnProcessPrims
= nullptr;
866 pState
->state
.linkageMask
= 0;
// Rasterizer disabled through the stream-out state: same treatment.
869 if (pState
->state
.soState
.rasterizerDisable
== true)
871 pState
->pfnProcessPrims
= nullptr;
872 pState
->state
.linkageMask
= 0;
875 // set up the frontend attrib mask
876 pState
->state
.feAttribMask
= pState
->state
.linkageMask
;
877 if (pState
->state
.soState
.soEnable
)
// NOTE(review): the stream count is hard-coded as 4 here, while
// MAX_SO_STREAMS is used as the stream bound elsewhere in this file -
// confirm whether the constant should be used.
879 for (uint32_t i
= 0; i
< 4; ++i
)
881 pState
->state
.feAttribMask
|= pState
->state
.soState
.streamMasks
[i
];
885 // complicated logic to test for cases where we don't need backing hottile memory for a draw
886 // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled.
887 pState
->state
.depthHottileEnable
= ((!(pState
->state
.depthStencilState
.depthTestEnable
&&
888 !pState
->state
.depthStencilState
.depthWriteEnable
&&
889 pState
->state
.depthStencilState
.depthTestFunc
== ZFUNC_ALWAYS
)) &&
890 (pState
->state
.depthStencilState
.depthTestEnable
||
891 pState
->state
.depthStencilState
.depthWriteEnable
)) ? true : false;
893 pState
->state
.stencilHottileEnable
= (((!(pState
->state
.depthStencilState
.stencilTestEnable
&&
894 !pState
->state
.depthStencilState
.stencilWriteEnable
&&
895 pState
->state
.depthStencilState
.stencilTestFunc
== ZFUNC_ALWAYS
)) ||
896 // for stencil we have to check the double sided state as well
897 (!(pState
->state
.depthStencilState
.doubleSidedStencilTestEnable
&&
898 !pState
->state
.depthStencilState
.stencilWriteEnable
&&
899 pState
->state
.depthStencilState
.backfaceStencilTestFunc
== ZFUNC_ALWAYS
))) &&
900 (pState
->state
.depthStencilState
.stencilTestEnable
||
901 pState
->state
.depthStencilState
.stencilWriteEnable
)) ? true : false;
// Color hot tiles: one bit per render target that has at least one channel
// with writes enabled; only considered when a pixel shader is bound.
903 uint32_t numRTs
= pState
->state
.psState
.numRenderTargets
;
904 pState
->state
.colorHottileEnable
= 0;
905 if (psState
.pfnPixelShader
!= nullptr)
907 for (uint32_t rt
= 0; rt
< numRTs
; ++rt
)
909 pState
->state
.colorHottileEnable
|=
910 (!pState
->state
.blendState
.renderTarget
[rt
].writeDisableAlpha
||
911 !pState
->state
.blendState
.renderTarget
[rt
].writeDisableRed
||
912 !pState
->state
.blendState
.renderTarget
[rt
].writeDisableGreen
||
913 !pState
->state
.blendState
.renderTarget
[rt
].writeDisableBlue
) ? (1 << rt
) : 0;
917 // Setup depth quantization function
918 if (pState
->state
.depthHottileEnable
)
920 switch (pState
->state
.rastState
.depthFormat
)
922 case R32_FLOAT_X8X24_TYPELESS
: pState
->state
.pfnQuantizeDepth
= QuantizeDepth
< R32_FLOAT_X8X24_TYPELESS
> ; break;
923 case R32_FLOAT
: pState
->state
.pfnQuantizeDepth
= QuantizeDepth
< R32_FLOAT
> ; break;
924 case R24_UNORM_X8_TYPELESS
: pState
->state
.pfnQuantizeDepth
= QuantizeDepth
< R24_UNORM_X8_TYPELESS
> ; break;
925 case R16_UNORM
: pState
->state
.pfnQuantizeDepth
= QuantizeDepth
< R16_UNORM
> ; break;
// Unknown formats assert and then fall back to the R32_FLOAT quantizer.
926 default: SWR_ASSERT(false, "Unsupported depth format for depth quantiztion.");
927 pState
->state
.pfnQuantizeDepth
= QuantizeDepth
< R32_FLOAT
> ;
932 // set up pass-through quantize if depth isn't enabled
933 pState
->state
.pfnQuantizeDepth
= QuantizeDepth
< R32_FLOAT
> ;
937 //////////////////////////////////////////////////////////////////////////
939 /// @param pDC - Draw context to initialize for this draw.
944 // We don't need to re-setup the scissors/pipeline state again for split draw.
945 if (isSplitDraw
== false)
947 SetupMacroTileScissors(pDC
);
952 //////////////////////////////////////////////////////////////////////////
953 /// @brief We can split the draw for certain topologies for better performance.
/// @param pDC - Draw context; its stream-out and tessellation state are consulted.
954 /// @param totalVerts - Total vertices for draw
955 /// @param topology - Topology used for draw
/// @return Maximum number of vertices to submit per (split) draw. Starts out
///         as totalVerts and is reduced for the topologies handled below.
956 uint32_t MaxVertsPerDraw(
959 PRIMITIVE_TOPOLOGY topology
)
961 API_STATE
& state
= pDC
->pState
->state
;
963 uint32_t vertsPerDraw
= totalVerts
;
965 if (state
.soState
.soEnable
)
// NOTE(review): a constant named KNOB_MAX_PRIMS_PER_DRAW is assigned to a
// vertex count here - confirm the intended unit.
973 case TOP_TRIANGLE_LIST
:
974 vertsPerDraw
= KNOB_MAX_PRIMS_PER_DRAW
;
977 case TOP_PATCHLIST_1
:
978 case TOP_PATCHLIST_2
:
979 case TOP_PATCHLIST_3
:
980 case TOP_PATCHLIST_4
:
981 case TOP_PATCHLIST_5
:
982 case TOP_PATCHLIST_6
:
983 case TOP_PATCHLIST_7
:
984 case TOP_PATCHLIST_8
:
985 case TOP_PATCHLIST_9
:
986 case TOP_PATCHLIST_10
:
987 case TOP_PATCHLIST_11
:
988 case TOP_PATCHLIST_12
:
989 case TOP_PATCHLIST_13
:
990 case TOP_PATCHLIST_14
:
991 case TOP_PATCHLIST_15
:
992 case TOP_PATCHLIST_16
:
993 case TOP_PATCHLIST_17
:
994 case TOP_PATCHLIST_18
:
995 case TOP_PATCHLIST_19
:
996 case TOP_PATCHLIST_20
:
997 case TOP_PATCHLIST_21
:
998 case TOP_PATCHLIST_22
:
999 case TOP_PATCHLIST_23
:
1000 case TOP_PATCHLIST_24
:
1001 case TOP_PATCHLIST_25
:
1002 case TOP_PATCHLIST_26
:
1003 case TOP_PATCHLIST_27
:
1004 case TOP_PATCHLIST_28
:
1005 case TOP_PATCHLIST_29
:
1006 case TOP_PATCHLIST_30
:
1007 case TOP_PATCHLIST_31
:
1008 case TOP_PATCHLIST_32
:
// Patch lists are only split when tessellation is enabled. The control point
// count per patch is the topology's offset from TOP_PATCHLIST_BASE.
1009 if (pDC
->pState
->state
.tsState
.tsEnable
)
1011 uint32_t vertsPerPrim
= topology
- TOP_PATCHLIST_BASE
;
1012 vertsPerDraw
= vertsPerPrim
* KNOB_MAX_TESS_PRIMS_PER_DRAW
;
1016 // The Primitive Assembly code can only handle 1 RECT at a time.
1022 // We are not splitting up draws for other topologies.
1026 return vertsPerDraw
;
1030 //////////////////////////////////////////////////////////////////////////
1031 /// @brief DrawInstanced - queues a non-indexed draw, breaking it into several queued draws when numVertices exceeds the per-draw maximum.
1032 /// @param hContext - Handle passed back from SwrCreateContext
1033 /// @param topology - Specifies topology for draw.
1034 /// @param numVertices - How many vertices to read sequentially from vertex data (per instance).
1035 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1036 /// @param numInstances - How many instances to render.
1037 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1040 PRIMITIVE_TOPOLOGY topology
,
1041 uint32_t numVertices
,
1042 uint32_t startVertex
,
1043 uint32_t numInstances
= 1,
1044 uint32_t startInstance
= 0)
1051 RDTSC_START(APIDraw
);
1053 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1054 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// maxVertsPerDraw caps the vertex count of each queued draw; primsPerDraw is the
// matching primitive count and is used below to compute startPrimID per chunk.
1056 uint32_t maxVertsPerDraw
= MaxVertsPerDraw(pDC
, numVertices
, topology
);
1057 uint32_t primsPerDraw
= GetNumPrims(topology
, maxVertsPerDraw
);
1058 uint32_t remainingVerts
= numVertices
;
1060 API_STATE
*pState
= &pDC
->pState
->state
;
1061 pState
->topology
= topology
;
1062 pState
->forceFront
= false;
1064 // disable culling for point lists (only TOP_POINT_LIST is checked here); the previous cull mode is restored after the loop
1065 uint32_t oldCullMode
= pState
->rastState
.cullMode
;
1066 if (topology
== TOP_POINT_LIST
)
1068 pState
->rastState
.cullMode
= SWR_CULLMODE_NONE
;
1069 pState
->forceFront
= true;
// Queue one draw per chunk of at most maxVertsPerDraw vertices.
// NOTE(review): 'draw' (the chunk index used below) is neither declared nor
// incremented in the lines visible here - confirm against the full file.
1073 while (remainingVerts
)
1075 uint32_t numVertsForDraw
= (remainingVerts
< maxVertsPerDraw
) ?
1076 remainingVerts
: maxVertsPerDraw
;
// Every chunk after the first is treated as a split draw when the draw
// context is fetched and initialized.
1078 bool isSplitDraw
= (draw
> 0) ? true : false;
// NOTE(review): this declaration appears to shadow the pDC declared above - confirm.
1079 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
, isSplitDraw
);
1080 InitDraw(pDC
, isSplitDraw
);
1082 pDC
->FeWork
.type
= DRAW
;
1083 pDC
->FeWork
.pfnWork
= GetProcessDrawFunc(
1085 pState
->tsState
.tsEnable
,
1086 pState
->gsState
.gsEnable
,
1087 pState
->soState
.soEnable
,
1088 pDC
->pState
->pfnProcessPrims
!= nullptr);
1089 pDC
->FeWork
.desc
.draw
.numVerts
= numVertsForDraw
;
1090 pDC
->FeWork
.desc
.draw
.startVertex
= startVertex
;
1091 pDC
->FeWork
.desc
.draw
.numInstances
= numInstances
;
1092 pDC
->FeWork
.desc
.draw
.startInstance
= startInstance
;
// Primitive and vertex IDs continue from where the previous chunk ended.
1093 pDC
->FeWork
.desc
.draw
.startPrimID
= draw
* primsPerDraw
;
1094 pDC
->FeWork
.desc
.draw
.startVertexID
= draw
* maxVertsPerDraw
;
// cleanupState is true only for the chunk that consumes all remaining vertices (the last one).
1096 pDC
->cleanupState
= (remainingVerts
== numVertsForDraw
);
1099 QueueDraw(pContext
);
1101 remainingVerts
-= numVertsForDraw
;
1105 // restore culling state
1106 pDC
= GetDrawContext(pContext
);
1107 pDC
->pState
->state
.rastState
.cullMode
= oldCullMode
;
1109 RDTSC_STOP(APIDraw
, numVertices
* numInstances
, 0);
1112 //////////////////////////////////////////////////////////////////////////
1114 /// @param hContext - Handle passed back from SwrCreateContext
1115 /// @param topology - Specifies topology for draw.
1116 /// @param startVertex - Specifies start vertex in vertex buffer for draw.
1117 /// @param numVertices - Number of vertices.
1120 PRIMITIVE_TOPOLOGY topology
,
1121 uint32_t startVertex
,
1122 uint32_t numVertices
)
// Forwards to DrawInstanced. Note the argument order differs from this
// function's parameter order: numVertices is passed before startVertex.
// No instance arguments are passed here.
1124 DrawInstanced(hContext
, topology
, numVertices
, startVertex
);
1127 //////////////////////////////////////////////////////////////////////////
1128 /// @brief SwrDrawInstanced - instanced, non-indexed draw; forwards to DrawInstanced.
1129 /// @param hContext - Handle passed back from SwrCreateContext
1130 /// @param topology - Specifies topology for draw.
1131 /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
1132 /// @param numInstances - How many instances to render.
1133 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1134 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1135 void SwrDrawInstanced(
1137 PRIMITIVE_TOPOLOGY topology
,
1138 uint32_t numVertsPerInstance
,
1139 uint32_t numInstances
,
1140 uint32_t startVertex
,
1141 uint32_t startInstance
// Arguments are reordered for the call: startVertex is passed before numInstances.
1144 DrawInstanced(hContext
, topology
, numVertsPerInstance
, startVertex
, numInstances
, startInstance
);
1147 //////////////////////////////////////////////////////////////////////////
1148 /// @brief DrawIndexedInstance - queues an indexed draw, breaking it into several queued draws when numIndices exceeds the per-draw maximum.
1149 /// @param hContext - Handle passed back from SwrCreateContext
1150 /// @param topology - Specifies topology for draw.
1151 /// @param numIndices - Number of indices to read sequentially from index buffer.
1152 /// @param indexOffset - Starting index into index buffer.
1153 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1154 /// @param numInstances - Number of instances to render.
1155 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1156 void DrawIndexedInstance(
1158 PRIMITIVE_TOPOLOGY topology
,
1159 uint32_t numIndices
,
1160 uint32_t indexOffset
,
// NOTE(review): baseVertex is used in the body but its parameter line is not
// among the lines visible here - confirm against the full file.
1162 uint32_t numInstances
= 1,
1163 uint32_t startInstance
= 0)
1170 RDTSC_START(APIDrawIndexed
);
1172 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1173 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1174 API_STATE
* pState
= &pDC
->pState
->state
;
// maxIndicesPerDraw caps the index count of each queued draw; primsPerDraw is
// the matching primitive count used for startPrimID per chunk.
1176 uint32_t maxIndicesPerDraw
= MaxVertsPerDraw(pDC
, numIndices
, topology
);
1177 uint32_t primsPerDraw
= GetNumPrims(topology
, maxIndicesPerDraw
);
1178 uint32_t remainingIndices
= numIndices
;
// Size in bytes of one index, taken from the index buffer format.
// NOTE(review): indexSize stays 0 for any format not handled by the cases
// visible below - confirm how other formats are treated.
1180 uint32_t indexSize
= 0;
1181 switch (pState
->indexBuffer
.format
)
1183 case R32_UINT
: indexSize
= sizeof(uint32_t); break;
1184 case R16_UINT
: indexSize
= sizeof(uint16_t); break;
1185 case R8_UINT
: indexSize
= sizeof(uint8_t); break;
// Byte pointer to the first index to read: buffer start plus indexOffset
// indices, with the byte offset computed in 64 bits.
1191 uint8_t *pIB
= (uint8_t*)pState
->indexBuffer
.pIndices
;
1192 pIB
+= (uint64_t)indexOffset
* (uint64_t)indexSize
;
1194 pState
->topology
= topology
;
1195 pState
->forceFront
= false;
1197 // disable culling for point lists (only TOP_POINT_LIST is checked here); the previous cull mode is restored after the loop
1198 uint32_t oldCullMode
= pState
->rastState
.cullMode
;
1199 if (topology
== TOP_POINT_LIST
)
1201 pState
->rastState
.cullMode
= SWR_CULLMODE_NONE
;
1202 pState
->forceFront
= true;
// Queue one draw per chunk of at most maxIndicesPerDraw indices.
// NOTE(review): 'draw' (the chunk index used below) is neither declared nor
// incremented in the lines visible here - confirm against the full file.
1205 while (remainingIndices
)
1207 uint32_t numIndicesForDraw
= (remainingIndices
< maxIndicesPerDraw
) ?
1208 remainingIndices
: maxIndicesPerDraw
;
1210 // When breaking up draw, we need to obtain new draw context for each iteration.
1211 bool isSplitDraw
= (draw
> 0) ? true : false;
1212 pDC
= GetDrawContext(pContext
, isSplitDraw
);
1213 InitDraw(pDC
, isSplitDraw
);
1215 pDC
->FeWork
.type
= DRAW
;
1216 pDC
->FeWork
.pfnWork
= GetProcessDrawFunc(
1218 pState
->tsState
.tsEnable
,
1219 pState
->gsState
.gsEnable
,
1220 pState
->soState
.soEnable
,
1221 pDC
->pState
->pfnProcessPrims
!= nullptr);
1222 pDC
->FeWork
.desc
.draw
.pDC
= pDC
;
1223 pDC
->FeWork
.desc
.draw
.numIndices
= numIndicesForDraw
;
1224 pDC
->FeWork
.desc
.draw
.pIB
= (int*)pIB
;
1225 pDC
->FeWork
.desc
.draw
.type
= pDC
->pState
->state
.indexBuffer
.format
;
1227 pDC
->FeWork
.desc
.draw
.numInstances
= numInstances
;
1228 pDC
->FeWork
.desc
.draw
.startInstance
= startInstance
;
1229 pDC
->FeWork
.desc
.draw
.baseVertex
= baseVertex
;
// Primitive IDs continue from where the previous chunk ended.
1230 pDC
->FeWork
.desc
.draw
.startPrimID
= draw
* primsPerDraw
;
// cleanupState is true only for the chunk that consumes all remaining indices (the last one).
1232 pDC
->cleanupState
= (remainingIndices
== numIndicesForDraw
);
1235 QueueDraw(pContext
);
// Step the index pointer past a full chunk; on the final (possibly shorter)
// chunk the loop ends, so the overshoot is never read.
1237 pIB
+= maxIndicesPerDraw
* indexSize
;
1238 remainingIndices
-= numIndicesForDraw
;
1242 // restore culling state
1243 pDC
= GetDrawContext(pContext
);
1244 pDC
->pState
->state
.rastState
.cullMode
= oldCullMode
;
1246 RDTSC_STOP(APIDrawIndexed
, numIndices
* numInstances
, 0);
1250 //////////////////////////////////////////////////////////////////////////
1251 /// @brief SwrDrawIndexed - indexed draw; forwards to DrawIndexedInstance without instance arguments.
1252 /// @param hContext - Handle passed back from SwrCreateContext
1253 /// @param topology - Specifies topology for draw.
1254 /// @param numIndices - Number of indices to read sequentially from index buffer.
1255 /// @param indexOffset - Starting index into index buffer.
1256 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1257 void SwrDrawIndexed(
1259 PRIMITIVE_TOPOLOGY topology
,
1260 uint32_t numIndices
,
1261 uint32_t indexOffset
,
// numInstances/startInstance are omitted, so DrawIndexedInstance's defaults apply.
1265 DrawIndexedInstance(hContext
, topology
, numIndices
, indexOffset
, baseVertex
);
1268 //////////////////////////////////////////////////////////////////////////
1269 /// @brief SwrDrawIndexedInstanced - instanced, indexed draw; forwards to DrawIndexedInstance.
1270 /// @param hContext - Handle passed back from SwrCreateContext
1271 /// @param topology - Specifies topology for draw.
1272 /// @param numIndices - Number of indices to read sequentially from index buffer.
1273 /// @param numInstances - Number of instances to render.
1274 /// @param indexOffset - Starting index into index buffer.
1275 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1276 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1277 void SwrDrawIndexedInstanced(
1279 PRIMITIVE_TOPOLOGY topology
,
1280 uint32_t numIndices
,
1281 uint32_t numInstances
,
1282 uint32_t indexOffset
,
1284 uint32_t startInstance
)
// Arguments are reordered for the call: indexOffset and baseVertex are passed before numInstances.
1286 DrawIndexedInstance(hContext
, topology
, numIndices
, indexOffset
, baseVertex
, numInstances
, startInstance
);
1289 //////////////////////////////////////////////////////////////////////////
1290 /// @brief SwrInvalidateTiles - queues a DISCARDINVALIDATETILES work item that sets the new tile state to SWR_TILE_INVALID.
1291 /// @param hContext - Handle passed back from SwrCreateContext
1292 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate.
1293 void SwrInvalidateTiles(
1295 uint32_t attachmentMask
)
// The handle is cast directly here, whereas the draw entry points in this file use GetContext(hContext).
1297 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
1298 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1300 pDC
->FeWork
.type
= DISCARDINVALIDATETILES
;
1301 pDC
->FeWork
.pfnWork
= ProcessDiscardInvalidateTiles
;
1302 pDC
->FeWork
.desc
.discardInvalidateTiles
.attachmentMask
= attachmentMask
;
// The rect is zero-filled; existing tiles only (createNewTiles = false), partial tiles included (fullTilesOnly = false).
1303 memset(&pDC
->FeWork
.desc
.discardInvalidateTiles
.rect
, 0, sizeof(SWR_RECT
));
1304 pDC
->FeWork
.desc
.discardInvalidateTiles
.newTileState
= SWR_TILE_INVALID
;
1305 pDC
->FeWork
.desc
.discardInvalidateTiles
.createNewTiles
= false;
1306 pDC
->FeWork
.desc
.discardInvalidateTiles
.fullTilesOnly
= false;
1309 QueueDraw(pContext
);
1312 //////////////////////////////////////////////////////////////////////////
1313 /// @brief SwrDiscardRect - queues a DISCARDINVALIDATETILES work item that sets the new tile state to SWR_TILE_RESOLVED for the given rect.
1314 /// @param hContext - Handle passed back from SwrCreateContext
1315 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
1316 /// @param rect - if rect is all zeros, the entire attachment surface will be discarded
1317 void SwrDiscardRect(
1319 uint32_t attachmentMask
,
// NOTE(review): the 'rect' parameter line is not among the lines visible here; it is used below.
1322 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
1323 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1325 // Queue a discard/invalidate work item for the hot tiles
1326 pDC
->FeWork
.type
= DISCARDINVALIDATETILES
;
1327 pDC
->FeWork
.pfnWork
= ProcessDiscardInvalidateTiles
;
1328 pDC
->FeWork
.desc
.discardInvalidateTiles
.attachmentMask
= attachmentMask
;
1329 pDC
->FeWork
.desc
.discardInvalidateTiles
.rect
= rect
;
// Unlike SwrInvalidateTiles, this requests tile creation and restricts the operation to full tiles.
1330 pDC
->FeWork
.desc
.discardInvalidateTiles
.newTileState
= SWR_TILE_RESOLVED
;
1331 pDC
->FeWork
.desc
.discardInvalidateTiles
.createNewTiles
= true;
1332 pDC
->FeWork
.desc
.discardInvalidateTiles
.fullTilesOnly
= true;
1335 QueueDraw(pContext
);
1338 //////////////////////////////////////////////////////////////////////////
1339 /// @brief SwrDispatch - queues a compute dispatch of threadGroupCountX * threadGroupCountY * threadGroupCountZ thread groups.
1340 /// @param hContext - Handle passed back from SwrCreateContext
1341 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
1342 /// @param threadGroupCountY - Number of thread groups dispatched in Y direction
1343 /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
1346 uint32_t threadGroupCountX
,
1347 uint32_t threadGroupCountY
,
1348 uint32_t threadGroupCountZ
)
1355 RDTSC_START(APIDispatch
);
1356 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
1357 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1359 pDC
->isCompute
= true; // This is a compute context.
// The per-dispatch descriptor is allocated from the draw context's arena with 64-byte alignment.
// NOTE(review): the allocation result is written to without a null check - confirm AllocAligned cannot fail.
1361 COMPUTE_DESC
* pTaskData
= (COMPUTE_DESC
*)pDC
->pArena
->AllocAligned(sizeof(COMPUTE_DESC
), 64);
1363 pTaskData
->threadGroupCountX
= threadGroupCountX
;
1364 pTaskData
->threadGroupCountY
= threadGroupCountY
;
1365 pTaskData
->threadGroupCountZ
= threadGroupCountZ
;
// NOTE(review): the product is computed in 32-bit unsigned arithmetic and wraps
// if it exceeds UINT32_MAX - confirm the expected range of the counts.
1367 uint32_t totalThreadGroups
= threadGroupCountX
* threadGroupCountY
* threadGroupCountZ
;
// The dispatch queue slot is selected by drawId modulo KNOB_MAX_DRAWS_IN_FLIGHT.
1368 uint32_t dcIndex
= pDC
->drawId
% KNOB_MAX_DRAWS_IN_FLIGHT
;
1369 pDC
->pDispatch
= &pContext
->pDispatchQueueArray
[dcIndex
];
1370 pDC
->pDispatch
->initialize(totalThreadGroups
, pTaskData
);
1372 QueueDispatch(pContext
);
1373 RDTSC_STOP(APIDispatch
, threadGroupCountX
* threadGroupCountY
* threadGroupCountZ
, 0);
1376 // Deswizzles, converts and stores current contents of the hot tiles to surface
1377 // described by pState
// Queues a STORETILES work item for the given attachment; postStoreTileState is
// passed through in the work descriptor.
1380 SWR_RENDERTARGET_ATTACHMENT attachment
,
1381 SWR_TILE_STATE postStoreTileState
)
1383 RDTSC_START(APIStoreTiles
);
1385 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
1386 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// Macro tile scissors are set up on the draw context before the work item is filled in.
1388 SetupMacroTileScissors(pDC
);
1390 pDC
->FeWork
.type
= STORETILES
;
1391 pDC
->FeWork
.pfnWork
= ProcessStoreTiles
;
1392 pDC
->FeWork
.desc
.storeTiles
.attachment
= attachment
;
1393 pDC
->FeWork
.desc
.storeTiles
.postStoreTileState
= postStoreTileState
;
1396 QueueDraw(pContext
);
1398 RDTSC_STOP(APIStoreTiles
, 0, 0);
// Queues a CLEAR work item carrying the clear mask, the four clear color
// components, the depth clear value and the stencil clear value.
// NOTE(review): the parameter lines for clearMask, z and stencil, and the
// declaration of 'flags', are not among the lines visible here.
1401 void SwrClearRenderTarget(
1404 const float clearColor
[4],
1408 RDTSC_START(APIClearRenderTarget
);
1410 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
1412 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// Macro tile scissors are set up on the draw context before the work item is filled in.
1414 SetupMacroTileScissors(pDC
);
1417 flags
.mask
= clearMask
;
1419 pDC
->FeWork
.type
= CLEAR
;
1420 pDC
->FeWork
.pfnWork
= ProcessClear
;
1421 pDC
->FeWork
.desc
.clear
.flags
= flags
;
1422 pDC
->FeWork
.desc
.clear
.clearDepth
= z
;
// Color components are copied individually, in index order 0..3.
1423 pDC
->FeWork
.desc
.clear
.clearRTColor
[0] = clearColor
[0];
1424 pDC
->FeWork
.desc
.clear
.clearRTColor
[1] = clearColor
[1];
1425 pDC
->FeWork
.desc
.clear
.clearRTColor
[2] = clearColor
[2];
1426 pDC
->FeWork
.desc
.clear
.clearRTColor
[3] = clearColor
[3];
1427 pDC
->FeWork
.desc
.clear
.clearStencil
= stencil
;
1430 QueueDraw(pContext
);
// NOTE(review): pDC->drawId is read after the draw context has been queued - confirm this is safe.
1432 RDTSC_STOP(APIClearRenderTarget
, 0, pDC
->drawId
);
1435 //////////////////////////////////////////////////////////////////////////
1436 /// @brief Returns a pointer to the private context state for the current
1437 /// draw operation. This is used for external components such as the
1439 /// SWR is responsible for the allocation of the private context state.
1440 /// @param hContext - Handle passed back from SwrCreateContext
1441 VOID
* SwrGetPrivateContextState(
1444 SWR_CONTEXT
* pContext
= GetContext(hContext
);
1445 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1446 DRAW_STATE
* pState
= pDC
->pState
;
// Allocate on first request only: privateStateSize bytes from the draw state's
// arena, aligned to KNOB_SIMD_WIDTH * sizeof(float) bytes. Later calls for the
// same draw state return the same pointer.
1448 if (pState
->pPrivateState
== nullptr)
1450 pState
->pPrivateState
= pState
->pArena
->AllocAligned(pContext
->privateStateSize
, KNOB_SIMD_WIDTH
*sizeof(float));
1453 return pState
->pPrivateState
;
1456 //////////////////////////////////////////////////////////////////////////
1457 /// @brief Clients can use this to allocate memory for draw/dispatch
1458 /// operations. The memory will automatically be freed once operation
1459 /// has completed. Client can use this to allocate binding tables,
1460 /// etc. needed for shader execution.
1461 /// @param hContext - Handle passed back from SwrCreateContext
1462 /// @param size - Size of allocation
1463 /// @param align - Alignment needed for allocation.
1464 VOID
* SwrAllocDrawContextMemory(
1469 SWR_CONTEXT
* pContext
= GetContext(hContext
);
1470 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// The allocation comes from the arena of the current draw context's state
// (pDC->pState->pArena), not from pDC->pArena.
1472 return pDC
->pState
->pArena
->AllocAligned(size
, align
);
1475 //////////////////////////////////////////////////////////////////////////
1476 /// @brief Returns pointer to SWR stats.
1477 /// @note The counters are atomically incremented by multiple threads.
1478 /// When calling this, you need to ensure all previous operations
1480 /// @todo If necessary, add a callback to avoid stalling the pipe to
1481 /// sample the counters.
1482 /// @param hContext - Handle passed back from SwrCreateContext
1483 /// @param pStats - SWR will fill this out for caller.
1488 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1489 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// The query is queued as a QUERYSTATS work item; pStats is only stored in the
// work descriptor here, not written by this function.
1491 pDC
->FeWork
.type
= QUERYSTATS
;
1492 pDC
->FeWork
.pfnWork
= ProcessQueryStats
;
1493 pDC
->FeWork
.desc
.queryStats
.pStats
= pStats
;
1495 // cannot execute until all previous draws have completed
// The dependency is set to the draw ID immediately preceding this one.
1496 pDC
->dependency
= pDC
->drawId
- 1;
1499 QueueDraw(pContext
);
1502 //////////////////////////////////////////////////////////////////////////
1503 /// @brief Enables stats counting
1504 /// @param hContext - Handle passed back from SwrCreateContext
1505 /// @param enable - If true then counts are incremented.
1506 void SwrEnableStats(
1510 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1511 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// Stored in the API state of the current draw context; nothing is queued by this call.
1513 pDC
->pState
->state
.enableStats
= enable
;
1516 //////////////////////////////////////////////////////////////////////////
1517 /// @brief Mark end of frame - used for performance profiling
1518 /// @param hContext - Handle passed back from SwrCreateContext
1519 void SWR_API
SwrEndFrame(
1523 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1524 pContext
->frameCount
++;