/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @brief API implementation
*
******************************************************************************/
34 #include "core/backend.h"
35 #include "core/context.h"
36 #include "core/frontend.h"
37 #include "core/rasterizer.h"
38 #include "core/rdtsc_core.h"
39 #include "core/threads.h"
40 #include "core/tilemgr.h"
41 #include "core/clip.h"
43 #include "common/simdintrin.h"
44 #include "common/os.h"
46 void SetupDefaultState(SWR_CONTEXT
*pContext
);
48 //////////////////////////////////////////////////////////////////////////
49 /// @brief Create SWR Context.
50 /// @param pCreateInfo - pointer to creation info.
51 HANDLE
SwrCreateContext(
52 const SWR_CREATECONTEXT_INFO
* pCreateInfo
)
57 void* pContextMem
= _aligned_malloc(sizeof(SWR_CONTEXT
), KNOB_SIMD_WIDTH
* 4);
58 memset(pContextMem
, 0, sizeof(SWR_CONTEXT
));
59 SWR_CONTEXT
*pContext
= new (pContextMem
) SWR_CONTEXT();
61 pContext
->driverType
= pCreateInfo
->driver
;
62 pContext
->privateStateSize
= pCreateInfo
->privateStateSize
;
64 pContext
->dcRing
.Init(KNOB_MAX_DRAWS_IN_FLIGHT
);
65 pContext
->dsRing
.Init(KNOB_MAX_DRAWS_IN_FLIGHT
);
67 pContext
->numSubContexts
= pCreateInfo
->maxSubContexts
;
68 if (pContext
->numSubContexts
> 1)
70 pContext
->subCtxSave
= (DRAW_STATE
*)_aligned_malloc(sizeof(DRAW_STATE
) * pContext
->numSubContexts
, 64);
71 memset(pContext
->subCtxSave
, 0, sizeof(DRAW_STATE
) * pContext
->numSubContexts
);
74 for (uint32_t dc
= 0; dc
< KNOB_MAX_DRAWS_IN_FLIGHT
; ++dc
)
76 pContext
->dcRing
[dc
].pArena
= new Arena();
77 pContext
->dcRing
[dc
].pTileMgr
= new MacroTileMgr(*(pContext
->dcRing
[dc
].pArena
));
78 pContext
->dcRing
[dc
].pDispatch
= new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen.
80 pContext
->dsRing
[dc
].pArena
= new Arena();
83 if (!KNOB_SINGLE_THREADED
)
85 memset(&pContext
->WaitLock
, 0, sizeof(pContext
->WaitLock
));
86 memset(&pContext
->FifosNotEmpty
, 0, sizeof(pContext
->FifosNotEmpty
));
87 new (&pContext
->WaitLock
) std::mutex();
88 new (&pContext
->FifosNotEmpty
) std::condition_variable();
90 CreateThreadPool(pContext
, &pContext
->threadPool
);
93 // Calling createThreadPool() above can set SINGLE_THREADED
94 if (KNOB_SINGLE_THREADED
)
96 pContext
->NumWorkerThreads
= 1;
99 // Allocate scratch space for workers.
100 ///@note We could lazily allocate this but its rather small amount of memory.
101 for (uint32_t i
= 0; i
< pContext
->NumWorkerThreads
; ++i
)
103 ///@todo Use numa API for allocations using numa information from thread data (if exists).
104 pContext
->pScratch
[i
] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH
* 4);
107 // State setup AFTER context is fully initialized
108 SetupDefaultState(pContext
);
110 // initialize hot tile manager
111 pContext
->pHotTileMgr
= new HotTileMgr();
113 // initialize function pointer tables
114 InitClearTilesTable();
116 // initialize store tiles function
117 pContext
->pfnLoadTile
= pCreateInfo
->pfnLoadTile
;
118 pContext
->pfnStoreTile
= pCreateInfo
->pfnStoreTile
;
119 pContext
->pfnClearTile
= pCreateInfo
->pfnClearTile
;
121 return (HANDLE
)pContext
;
124 void SwrDestroyContext(HANDLE hContext
)
126 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
127 DestroyThreadPool(pContext
, &pContext
->threadPool
);
130 for (uint32_t i
= 0; i
< KNOB_MAX_DRAWS_IN_FLIGHT
; ++i
)
132 delete pContext
->dcRing
[i
].pArena
;
133 delete pContext
->dsRing
[i
].pArena
;
134 delete(pContext
->dcRing
[i
].pTileMgr
);
135 delete(pContext
->dcRing
[i
].pDispatch
);
138 // Free scratch space.
139 for (uint32_t i
= 0; i
< pContext
->NumWorkerThreads
; ++i
)
141 _aligned_free(pContext
->pScratch
[i
]);
144 _aligned_free(pContext
->subCtxSave
);
146 delete(pContext
->pHotTileMgr
);
148 pContext
->~SWR_CONTEXT();
149 _aligned_free((SWR_CONTEXT
*)hContext
);
152 void CopyState(DRAW_STATE
& dst
, const DRAW_STATE
& src
)
154 memcpy(&dst
.state
, &src
.state
, sizeof(API_STATE
));
157 void WakeAllThreads(SWR_CONTEXT
*pContext
)
159 pContext
->FifosNotEmpty
.notify_all();
162 template<bool IsDraw
>
163 void QueueWork(SWR_CONTEXT
*pContext
)
167 // Each worker thread looks at a DC for both FE and BE work at different times and so we
168 // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers
169 // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
170 // then moved on if all work is done.)
171 pContext
->pCurDrawContext
->threadsDone
=
172 pContext
->NumWorkerThreads
? pContext
->NumWorkerThreads
* 2 : 2;
176 pContext
->pCurDrawContext
->threadsDone
=
177 pContext
->NumWorkerThreads
? pContext
->NumWorkerThreads
: 1;
182 std::unique_lock
<std::mutex
> lock(pContext
->WaitLock
);
183 pContext
->dcRing
.Enqueue();
186 if (KNOB_SINGLE_THREADED
)
188 // flush denormals to 0
189 uint32_t mxcsr
= _mm_getcsr();
190 _mm_setcsr(mxcsr
| _MM_FLUSH_ZERO_ON
| _MM_DENORMALS_ZERO_ON
);
194 std::unordered_set
<uint32_t> lockedTiles
;
195 uint64_t curDraw
[2] = { pContext
->pCurDrawContext
->drawId
, pContext
->pCurDrawContext
->drawId
};
196 WorkOnFifoFE(pContext
, 0, curDraw
[0], 0);
197 WorkOnFifoBE(pContext
, 0, curDraw
[1], lockedTiles
);
201 uint64_t curDispatch
= pContext
->pCurDrawContext
->drawId
;
202 WorkOnCompute(pContext
, 0, curDispatch
);
205 // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
206 if (!pContext
->dcRing
.IsEmpty())
208 pContext
->dcRing
.Dequeue();
216 RDTSC_START(APIDrawWakeAllThreads
);
217 WakeAllThreads(pContext
);
218 RDTSC_STOP(APIDrawWakeAllThreads
, 1, 0);
221 // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
222 pContext
->pPrevDrawContext
= pContext
->pCurDrawContext
;
223 pContext
->pCurDrawContext
= nullptr;
226 INLINE
void QueueDraw(SWR_CONTEXT
* pContext
)
228 QueueWork
<true>(pContext
);
231 INLINE
void QueueDispatch(SWR_CONTEXT
* pContext
)
233 QueueWork
<false>(pContext
);
236 DRAW_CONTEXT
* GetDrawContext(SWR_CONTEXT
*pContext
, bool isSplitDraw
= false)
238 RDTSC_START(APIGetDrawContext
);
239 // If current draw context is null then need to obtain a new draw context to use from ring.
240 if (pContext
->pCurDrawContext
== nullptr)
242 // Need to wait for a free entry.
243 while (pContext
->dcRing
.IsFull())
248 uint32_t dcIndex
= pContext
->dcRing
.GetHead() % KNOB_MAX_DRAWS_IN_FLIGHT
;
250 DRAW_CONTEXT
* pCurDrawContext
= &pContext
->dcRing
[dcIndex
];
251 pContext
->pCurDrawContext
= pCurDrawContext
;
253 // Assign next available entry in DS ring to this DC.
254 uint32_t dsIndex
= pContext
->curStateId
% KNOB_MAX_DRAWS_IN_FLIGHT
;
255 pCurDrawContext
->pState
= &pContext
->dsRing
[dsIndex
];
257 Arena
& stateArena
= *(pCurDrawContext
->pState
->pArena
);
259 // Copy previous state to current state.
260 if (pContext
->pPrevDrawContext
)
262 DRAW_CONTEXT
* pPrevDrawContext
= pContext
->pPrevDrawContext
;
264 // If we're splitting our draw then we can just use the same state from the previous
265 // draw. In this case, we won't increment the DS ring index so the next non-split
266 // draw can receive the state.
267 if (isSplitDraw
== false)
269 CopyState(*pCurDrawContext
->pState
, *pPrevDrawContext
->pState
);
271 stateArena
.Reset(true); // Reset memory.
272 pCurDrawContext
->pState
->pPrivateState
= nullptr;
274 pContext
->curStateId
++; // Progress state ring index forward.
278 // If its a split draw then just copy the state pointer over
279 // since its the same draw.
280 pCurDrawContext
->pState
= pPrevDrawContext
->pState
;
285 stateArena
.Reset(); // Reset memory.
286 pContext
->curStateId
++; // Progress state ring index forward.
289 pCurDrawContext
->dependency
= 0;
290 pCurDrawContext
->pArena
->Reset();
291 pCurDrawContext
->pContext
= pContext
;
292 pCurDrawContext
->isCompute
= false; // Dispatch has to set this to true.
294 pCurDrawContext
->doneFE
= false;
295 pCurDrawContext
->FeLock
= 0;
296 pCurDrawContext
->threadsDone
= 0;
298 pCurDrawContext
->pTileMgr
->initialize();
300 // Assign unique drawId for this DC
301 pCurDrawContext
->drawId
= pContext
->dcRing
.GetHead();
305 SWR_ASSERT(isSplitDraw
== false, "Split draw should only be used when obtaining a new DC");
308 RDTSC_STOP(APIGetDrawContext
, 0, 0);
309 return pContext
->pCurDrawContext
;
312 void SWR_API
SwrSetActiveSubContext(
314 uint32_t subContextIndex
)
316 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
317 if (subContextIndex
>= pContext
->numSubContexts
)
322 if (subContextIndex
!= pContext
->curSubCtxId
)
324 // Save and restore draw state
325 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
327 pContext
->subCtxSave
[pContext
->curSubCtxId
],
332 pContext
->subCtxSave
[subContextIndex
]);
334 pContext
->curSubCtxId
= subContextIndex
;
338 API_STATE
* GetDrawState(SWR_CONTEXT
*pContext
)
340 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
341 SWR_ASSERT(pDC
->pState
!= nullptr);
343 return &pDC
->pState
->state
;
346 void SetupDefaultState(SWR_CONTEXT
*pContext
)
348 API_STATE
* pState
= GetDrawState(pContext
);
350 pState
->rastState
.cullMode
= SWR_CULLMODE_NONE
;
351 pState
->rastState
.frontWinding
= SWR_FRONTWINDING_CCW
;
354 static INLINE SWR_CONTEXT
* GetContext(HANDLE hContext
)
356 return (SWR_CONTEXT
*)hContext
;
359 void SwrSync(HANDLE hContext
, PFN_CALLBACK_FUNC pfnFunc
, uint64_t userData
, uint64_t userData2
, uint64_t userData3
)
361 RDTSC_START(APISync
);
363 SWR_ASSERT(pfnFunc
!= nullptr);
365 SWR_CONTEXT
*pContext
= GetContext(hContext
);
366 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
368 pDC
->FeWork
.type
= SYNC
;
369 pDC
->FeWork
.pfnWork
= ProcessSync
;
370 pDC
->FeWork
.desc
.sync
.pfnCallbackFunc
= pfnFunc
;
371 pDC
->FeWork
.desc
.sync
.userData
= userData
;
372 pDC
->FeWork
.desc
.sync
.userData2
= userData2
;
373 pDC
->FeWork
.desc
.sync
.userData3
= userData3
;
375 // cannot execute until all previous draws have completed
376 pDC
->dependency
= pDC
->drawId
- 1;
381 RDTSC_STOP(APISync
, 1, 0);
384 void SwrWaitForIdle(HANDLE hContext
)
386 SWR_CONTEXT
*pContext
= GetContext(hContext
);
388 RDTSC_START(APIWaitForIdle
);
390 while (!pContext
->dcRing
.IsEmpty())
395 RDTSC_STOP(APIWaitForIdle
, 1, 0);
398 void SwrSetVertexBuffers(
401 const SWR_VERTEX_BUFFER_STATE
* pVertexBuffers
)
403 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
405 for (uint32_t i
= 0; i
< numBuffers
; ++i
)
407 const SWR_VERTEX_BUFFER_STATE
*pVB
= &pVertexBuffers
[i
];
408 pState
->vertexBuffers
[pVB
->index
] = *pVB
;
412 void SwrSetIndexBuffer(
414 const SWR_INDEX_BUFFER_STATE
* pIndexBuffer
)
416 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
418 pState
->indexBuffer
= *pIndexBuffer
;
421 void SwrSetFetchFunc(
423 PFN_FETCH_FUNC pfnFetchFunc
)
425 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
427 pState
->pfnFetchFunc
= pfnFetchFunc
;
432 PFN_SO_FUNC pfnSoFunc
,
433 uint32_t streamIndex
)
435 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
437 SWR_ASSERT(streamIndex
< MAX_SO_STREAMS
);
439 pState
->pfnSoFunc
[streamIndex
] = pfnSoFunc
;
444 SWR_STREAMOUT_STATE
* pSoState
)
446 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
448 pState
->soState
= *pSoState
;
451 void SwrSetSoBuffers(
453 SWR_STREAMOUT_BUFFER
* pSoBuffer
,
456 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
458 SWR_ASSERT((slot
< 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot
);
460 pState
->soBuffer
[slot
] = *pSoBuffer
;
463 void SwrSetVertexFunc(
465 PFN_VERTEX_FUNC pfnVertexFunc
)
467 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
469 pState
->pfnVertexFunc
= pfnVertexFunc
;
472 void SwrSetFrontendState(
474 SWR_FRONTEND_STATE
*pFEState
)
476 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
477 pState
->frontendState
= *pFEState
;
482 SWR_GS_STATE
*pGSState
)
484 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
485 pState
->gsState
= *pGSState
;
490 PFN_GS_FUNC pfnGsFunc
)
492 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
493 pState
->pfnGsFunc
= pfnGsFunc
;
498 PFN_CS_FUNC pfnCsFunc
,
499 uint32_t totalThreadsInGroup
)
501 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
502 pState
->pfnCsFunc
= pfnCsFunc
;
503 pState
->totalThreadsInGroup
= totalThreadsInGroup
;
508 SWR_TS_STATE
*pState
)
510 API_STATE
* pApiState
= GetDrawState(GetContext(hContext
));
511 pApiState
->tsState
= *pState
;
518 API_STATE
* pApiState
= GetDrawState(GetContext(hContext
));
519 pApiState
->pfnHsFunc
= pfnFunc
;
526 API_STATE
* pApiState
= GetDrawState(GetContext(hContext
));
527 pApiState
->pfnDsFunc
= pfnFunc
;
530 void SwrSetDepthStencilState(
532 SWR_DEPTH_STENCIL_STATE
*pDSState
)
534 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
536 pState
->depthStencilState
= *pDSState
;
539 void SwrSetBackendState(
541 SWR_BACKEND_STATE
*pBEState
)
543 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
545 pState
->backendState
= *pBEState
;
548 void SwrSetPixelShaderState(
550 SWR_PS_STATE
*pPSState
)
552 API_STATE
*pState
= GetDrawState(GetContext(hContext
));
553 pState
->psState
= *pPSState
;
556 void SwrSetBlendState(
558 SWR_BLEND_STATE
*pBlendState
)
560 API_STATE
*pState
= GetDrawState(GetContext(hContext
));
561 memcpy(&pState
->blendState
, pBlendState
, sizeof(SWR_BLEND_STATE
));
564 void SwrSetBlendFunc(
566 uint32_t renderTarget
,
567 PFN_BLEND_JIT_FUNC pfnBlendFunc
)
569 SWR_ASSERT(renderTarget
< SWR_NUM_RENDERTARGETS
);
570 API_STATE
*pState
= GetDrawState(GetContext(hContext
));
571 pState
->pfnBlendFunc
[renderTarget
] = pfnBlendFunc
;
579 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
581 static const uint8_t IDENTITY_MAP
[] =
583 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
584 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
586 static_assert(sizeof(IDENTITY_MAP
) == sizeof(pState
->linkageMap
),
587 "Update for new value of MAX_ATTRIBUTES");
589 pState
->linkageMask
= mask
;
590 pState
->linkageCount
= _mm_popcnt_u32(mask
);
596 memcpy(pState
->linkageMap
, pMap
, pState
->linkageCount
);
599 // update guardband multipliers for the viewport
600 void updateGuardband(API_STATE
*pState
)
602 // guardband center is viewport center
603 pState
->gbState
.left
= KNOB_GUARDBAND_WIDTH
/ pState
->vp
[0].width
;
604 pState
->gbState
.right
= KNOB_GUARDBAND_WIDTH
/ pState
->vp
[0].width
;
605 pState
->gbState
.top
= KNOB_GUARDBAND_HEIGHT
/ pState
->vp
[0].height
;
606 pState
->gbState
.bottom
= KNOB_GUARDBAND_HEIGHT
/ pState
->vp
[0].height
;
609 void SwrSetRastState(
611 const SWR_RASTSTATE
*pRastState
)
613 SWR_CONTEXT
*pContext
= GetContext(hContext
);
614 API_STATE
* pState
= GetDrawState(pContext
);
616 memcpy(&pState
->rastState
, pRastState
, sizeof(SWR_RASTSTATE
));
619 void SwrSetViewports(
621 uint32_t numViewports
,
622 const SWR_VIEWPORT
* pViewports
,
623 const SWR_VIEWPORT_MATRIX
* pMatrices
)
625 SWR_ASSERT(numViewports
<= KNOB_NUM_VIEWPORTS_SCISSORS
,
626 "Invalid number of viewports.");
628 SWR_CONTEXT
*pContext
= GetContext(hContext
);
629 API_STATE
* pState
= GetDrawState(pContext
);
631 memcpy(&pState
->vp
[0], pViewports
, sizeof(SWR_VIEWPORT
) * numViewports
);
633 if (pMatrices
!= nullptr)
635 memcpy(&pState
->vpMatrix
[0], pMatrices
, sizeof(SWR_VIEWPORT_MATRIX
) * numViewports
);
639 // Compute default viewport transform.
640 for (uint32_t i
= 0; i
< numViewports
; ++i
)
642 if (pContext
->driverType
== DX
)
644 pState
->vpMatrix
[i
].m00
= pState
->vp
[i
].width
/ 2.0f
;
645 pState
->vpMatrix
[i
].m11
= -pState
->vp
[i
].height
/ 2.0f
;
646 pState
->vpMatrix
[i
].m22
= pState
->vp
[i
].maxZ
- pState
->vp
[i
].minZ
;
647 pState
->vpMatrix
[i
].m30
= pState
->vp
[i
].x
+ pState
->vpMatrix
[i
].m00
;
648 pState
->vpMatrix
[i
].m31
= pState
->vp
[i
].y
- pState
->vpMatrix
[i
].m11
;
649 pState
->vpMatrix
[i
].m32
= pState
->vp
[i
].minZ
;
653 // Standard, with the exception that Y is inverted.
654 pState
->vpMatrix
[i
].m00
= (pState
->vp
[i
].width
- pState
->vp
[i
].x
) / 2.0f
;
655 pState
->vpMatrix
[i
].m11
= (pState
->vp
[i
].y
- pState
->vp
[i
].height
) / 2.0f
;
656 pState
->vpMatrix
[i
].m22
= (pState
->vp
[i
].maxZ
- pState
->vp
[i
].minZ
) / 2.0f
;
657 pState
->vpMatrix
[i
].m30
= pState
->vp
[i
].x
+ pState
->vpMatrix
[i
].m00
;
658 pState
->vpMatrix
[i
].m31
= pState
->vp
[i
].height
+ pState
->vpMatrix
[i
].m11
;
659 pState
->vpMatrix
[i
].m32
= pState
->vp
[i
].minZ
+ pState
->vpMatrix
[i
].m22
;
661 // Now that the matrix is calculated, clip the view coords to screen size.
662 // OpenGL allows for -ve x,y in the viewport.
663 pState
->vp
[i
].x
= std::max(pState
->vp
[i
].x
, 0.0f
);
664 pState
->vp
[i
].y
= std::max(pState
->vp
[i
].y
, 0.0f
);
669 updateGuardband(pState
);
672 void SwrSetScissorRects(
674 uint32_t numScissors
,
675 const BBOX
* pScissors
)
677 SWR_ASSERT(numScissors
<= KNOB_NUM_VIEWPORTS_SCISSORS
,
678 "Invalid number of scissor rects.");
680 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
681 memcpy(&pState
->scissorRects
[0], pScissors
, numScissors
* sizeof(BBOX
));
684 void SetupMacroTileScissors(DRAW_CONTEXT
*pDC
)
686 API_STATE
*pState
= &pDC
->pState
->state
;
687 uint32_t left
, right
, top
, bottom
;
689 // Set up scissor dimensions based on scissor or viewport
690 if (pState
->rastState
.scissorEnable
)
692 // scissor rect right/bottom edge are exclusive, core expects scissor dimensions to be inclusive, so subtract one pixel from right/bottom edges
693 left
= pState
->scissorRects
[0].left
;
694 right
= pState
->scissorRects
[0].right
;
695 top
= pState
->scissorRects
[0].top
;
696 bottom
= pState
->scissorRects
[0].bottom
;
700 left
= (int32_t)pState
->vp
[0].x
;
701 right
= (int32_t)pState
->vp
[0].x
+ (int32_t)pState
->vp
[0].width
;
702 top
= (int32_t)pState
->vp
[0].y
;
703 bottom
= (int32_t)pState
->vp
[0].y
+ (int32_t)pState
->vp
[0].height
;
706 right
= std::min
<uint32_t>(right
, KNOB_MAX_SCISSOR_X
);
707 bottom
= std::min
<uint32_t>(bottom
, KNOB_MAX_SCISSOR_Y
);
709 if (left
> KNOB_MAX_SCISSOR_X
|| top
> KNOB_MAX_SCISSOR_Y
)
711 pState
->scissorInFixedPoint
.left
= 0;
712 pState
->scissorInFixedPoint
.right
= 0;
713 pState
->scissorInFixedPoint
.top
= 0;
714 pState
->scissorInFixedPoint
.bottom
= 0;
718 pState
->scissorInFixedPoint
.left
= left
* FIXED_POINT_SCALE
;
719 pState
->scissorInFixedPoint
.right
= right
* FIXED_POINT_SCALE
- 1;
720 pState
->scissorInFixedPoint
.top
= top
* FIXED_POINT_SCALE
;
721 pState
->scissorInFixedPoint
.bottom
= bottom
* FIXED_POINT_SCALE
- 1;
725 void SetupPipeline(DRAW_CONTEXT
*pDC
)
727 DRAW_STATE
* pState
= pDC
->pState
;
728 const SWR_RASTSTATE
&rastState
= pState
->state
.rastState
;
729 BACKEND_FUNCS
& backendFuncs
= pState
->backendFuncs
;
730 const uint32_t forcedSampleCount
= (rastState
.bForcedSampleCount
) ? 1 : 0;
733 if (pState
->state
.psState
.pfnPixelShader
== nullptr)
735 backendFuncs
.pfnBackend
= gBackendNullPs
[pState
->state
.rastState
.sampleCount
];
736 // always need to generate I & J per sample for Z interpolation
737 backendFuncs
.pfnCalcSampleBarycentrics
= gSampleBarycentricTable
[1];
741 const bool bMultisampleEnable
= ((rastState
.sampleCount
> SWR_MULTISAMPLE_1X
) || rastState
.bForcedSampleCount
) ? 1 : 0;
742 const uint32_t centroid
= ((pState
->state
.psState
.barycentricsMask
& SWR_BARYCENTRIC_CENTROID_MASK
) > 0) ? 1 : 0;
744 // currently only support 'normal' input coverage
745 SWR_ASSERT(pState
->state
.psState
.inputCoverage
== SWR_INPUT_COVERAGE_NORMAL
||
746 pState
->state
.psState
.inputCoverage
== SWR_INPUT_COVERAGE_NONE
);
748 SWR_BARYCENTRICS_MASK barycentricsMask
= (SWR_BARYCENTRICS_MASK
)pState
->state
.psState
.barycentricsMask
;
750 // select backend function
751 switch(pState
->state
.psState
.shadingRate
)
753 case SWR_SHADING_RATE_PIXEL
:
754 if(bMultisampleEnable
)
756 // always need to generate I & J per sample for Z interpolation
757 barycentricsMask
= (SWR_BARYCENTRICS_MASK
)(barycentricsMask
| SWR_BARYCENTRIC_PER_SAMPLE_MASK
);
758 backendFuncs
.pfnBackend
= gBackendPixelRateTable
[rastState
.sampleCount
][rastState
.samplePattern
][pState
->state
.psState
.inputCoverage
][centroid
][forcedSampleCount
];
759 backendFuncs
.pfnOutputMerger
= gBackendOutputMergerTable
[pState
->state
.psState
.numRenderTargets
][pState
->state
.blendState
.sampleCount
];
763 // always need to generate I & J per pixel for Z interpolation
764 barycentricsMask
= (SWR_BARYCENTRICS_MASK
)(barycentricsMask
| SWR_BARYCENTRIC_PER_PIXEL_MASK
);
765 backendFuncs
.pfnBackend
= gBackendSingleSample
[pState
->state
.psState
.inputCoverage
][centroid
];
766 backendFuncs
.pfnOutputMerger
= gBackendOutputMergerTable
[pState
->state
.psState
.numRenderTargets
][SWR_MULTISAMPLE_1X
];
769 case SWR_SHADING_RATE_SAMPLE
:
770 SWR_ASSERT(rastState
.samplePattern
== SWR_MSAA_STANDARD_PATTERN
);
771 // always need to generate I & J per sample for Z interpolation
772 barycentricsMask
= (SWR_BARYCENTRICS_MASK
)(barycentricsMask
| SWR_BARYCENTRIC_PER_SAMPLE_MASK
);
773 backendFuncs
.pfnBackend
= gBackendSampleRateTable
[rastState
.sampleCount
][pState
->state
.psState
.inputCoverage
][centroid
];
774 backendFuncs
.pfnOutputMerger
= gBackendOutputMergerTable
[pState
->state
.psState
.numRenderTargets
][pState
->state
.blendState
.sampleCount
];
776 case SWR_SHADING_RATE_COARSE
:
778 SWR_ASSERT(0 && "Invalid shading rate");
782 // setup pointer to function that generates necessary barycentrics required by the PS
783 bool bBarycentrics
= (barycentricsMask
& SWR_BARYCENTRIC_PER_PIXEL_MASK
) > 0 ? 1 : 0;
784 backendFuncs
.pfnCalcPixelBarycentrics
= gPixelBarycentricTable
[bBarycentrics
];
786 bBarycentrics
= (barycentricsMask
& SWR_BARYCENTRIC_PER_SAMPLE_MASK
) > 0 ? 1 : 0;
787 backendFuncs
.pfnCalcSampleBarycentrics
= gSampleBarycentricTable
[bBarycentrics
];
789 bBarycentrics
= (barycentricsMask
& SWR_BARYCENTRIC_CENTROID_MASK
) > 0 ? 1 : 0;
790 backendFuncs
.pfnCalcCentroidBarycentrics
= gCentroidBarycentricTable
[rastState
.sampleCount
][bBarycentrics
][rastState
.samplePattern
][forcedSampleCount
];
793 PFN_PROCESS_PRIMS pfnBinner
;
794 switch (pState
->state
.topology
)
797 pState
->pfnProcessPrims
= ClipPoints
;
798 pfnBinner
= BinPoints
;
803 case TOP_LINE_LIST_ADJ
:
804 case TOP_LISTSTRIP_ADJ
:
805 pState
->pfnProcessPrims
= ClipLines
;
806 pfnBinner
= BinLines
;
809 pState
->pfnProcessPrims
= ClipTriangles
;
810 pfnBinner
= BinTriangles
;
814 // disable clipper if viewport transform is disabled
815 if (pState
->state
.frontendState
.vpTransformDisable
)
817 pState
->pfnProcessPrims
= pfnBinner
;
820 if ((pState
->state
.psState
.pfnPixelShader
== nullptr) &&
821 (pState
->state
.depthStencilState
.depthTestEnable
== FALSE
) &&
822 (pState
->state
.depthStencilState
.depthWriteEnable
== FALSE
) &&
823 (pState
->state
.depthStencilState
.stencilTestEnable
== FALSE
) &&
824 (pState
->state
.depthStencilState
.stencilWriteEnable
== FALSE
) &&
825 (pState
->state
.linkageCount
== 0))
827 pState
->pfnProcessPrims
= nullptr;
828 pState
->state
.linkageMask
= 0;
831 if (pState
->state
.soState
.rasterizerDisable
== true)
833 pState
->pfnProcessPrims
= nullptr;
834 pState
->state
.linkageMask
= 0;
837 // set up the frontend attrib mask
838 pState
->state
.feAttribMask
= pState
->state
.linkageMask
;
839 if (pState
->state
.soState
.soEnable
)
841 for (uint32_t i
= 0; i
< 4; ++i
)
843 pState
->state
.feAttribMask
|= pState
->state
.soState
.streamMasks
[i
];
847 // complicated logic to test for cases where we don't need backing hottile memory for a draw
848 // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled.
849 pState
->state
.depthHottileEnable
= ((!(pState
->state
.depthStencilState
.depthTestEnable
&&
850 !pState
->state
.depthStencilState
.depthWriteEnable
&&
851 pState
->state
.depthStencilState
.depthTestFunc
== ZFUNC_ALWAYS
)) &&
852 (pState
->state
.depthStencilState
.depthTestEnable
||
853 pState
->state
.depthStencilState
.depthWriteEnable
)) ? true : false;
855 pState
->state
.stencilHottileEnable
= (((!(pState
->state
.depthStencilState
.stencilTestEnable
&&
856 !pState
->state
.depthStencilState
.stencilWriteEnable
&&
857 pState
->state
.depthStencilState
.stencilTestFunc
== ZFUNC_ALWAYS
)) ||
858 // for stencil we have to check the double sided state as well
859 (!(pState
->state
.depthStencilState
.doubleSidedStencilTestEnable
&&
860 !pState
->state
.depthStencilState
.stencilWriteEnable
&&
861 pState
->state
.depthStencilState
.backfaceStencilTestFunc
== ZFUNC_ALWAYS
))) &&
862 (pState
->state
.depthStencilState
.stencilTestEnable
||
863 pState
->state
.depthStencilState
.stencilWriteEnable
)) ? true : false;
865 uint32_t numRTs
= pState
->state
.psState
.numRenderTargets
;
866 pState
->state
.colorHottileEnable
= 0;
867 if(pState
->state
.psState
.pfnPixelShader
!= nullptr)
869 for (uint32_t rt
= 0; rt
< numRTs
; ++rt
)
871 pState
->state
.colorHottileEnable
|=
872 (!pState
->state
.blendState
.renderTarget
[rt
].writeDisableAlpha
||
873 !pState
->state
.blendState
.renderTarget
[rt
].writeDisableRed
||
874 !pState
->state
.blendState
.renderTarget
[rt
].writeDisableGreen
||
875 !pState
->state
.blendState
.renderTarget
[rt
].writeDisableBlue
) ? (1 << rt
) : 0;
880 //////////////////////////////////////////////////////////////////////////
882 /// @param pDC - Draw context to initialize for this draw.
887 // We don't need to re-setup the scissors/pipeline state again for split draw.
888 if (isSplitDraw
== false)
890 SetupMacroTileScissors(pDC
);
895 //////////////////////////////////////////////////////////////////////////
896 /// @brief We can split the draw for certain topologies for better performance.
897 /// @param totalVerts - Total vertices for draw
898 /// @param topology - Topology used for draw
899 uint32_t MaxVertsPerDraw(
902 PRIMITIVE_TOPOLOGY topology
)
904 API_STATE
& state
= pDC
->pState
->state
;
906 uint32_t vertsPerDraw
= totalVerts
;
908 if (state
.soState
.soEnable
)
916 case TOP_TRIANGLE_LIST
:
917 vertsPerDraw
= KNOB_MAX_PRIMS_PER_DRAW
;
920 case TOP_PATCHLIST_1
:
921 case TOP_PATCHLIST_2
:
922 case TOP_PATCHLIST_3
:
923 case TOP_PATCHLIST_4
:
924 case TOP_PATCHLIST_5
:
925 case TOP_PATCHLIST_6
:
926 case TOP_PATCHLIST_7
:
927 case TOP_PATCHLIST_8
:
928 case TOP_PATCHLIST_9
:
929 case TOP_PATCHLIST_10
:
930 case TOP_PATCHLIST_11
:
931 case TOP_PATCHLIST_12
:
932 case TOP_PATCHLIST_13
:
933 case TOP_PATCHLIST_14
:
934 case TOP_PATCHLIST_15
:
935 case TOP_PATCHLIST_16
:
936 case TOP_PATCHLIST_17
:
937 case TOP_PATCHLIST_18
:
938 case TOP_PATCHLIST_19
:
939 case TOP_PATCHLIST_20
:
940 case TOP_PATCHLIST_21
:
941 case TOP_PATCHLIST_22
:
942 case TOP_PATCHLIST_23
:
943 case TOP_PATCHLIST_24
:
944 case TOP_PATCHLIST_25
:
945 case TOP_PATCHLIST_26
:
946 case TOP_PATCHLIST_27
:
947 case TOP_PATCHLIST_28
:
948 case TOP_PATCHLIST_29
:
949 case TOP_PATCHLIST_30
:
950 case TOP_PATCHLIST_31
:
951 case TOP_PATCHLIST_32
:
952 if (pDC
->pState
->state
.tsState
.tsEnable
)
954 uint32_t vertsPerPrim
= topology
- TOP_PATCHLIST_BASE
;
955 vertsPerDraw
= vertsPerPrim
* KNOB_MAX_TESS_PRIMS_PER_DRAW
;
959 // The Primitive Assembly code can only handle 1 RECT at a time.
965 // We are not splitting up draws for other topologies.
972 // Recursive template used to auto-nest conditionals. Converts dynamic boolean function
973 // arguments to static template arguments.
974 template <bool... ArgsB
>
977 // Last Arg Terminator
978 static PFN_FE_WORK_FUNC
GetFunc(bool bArg
)
982 return ProcessDraw
<ArgsB
..., true>;
985 return ProcessDraw
<ArgsB
..., false>;
988 // Recursively parse args
989 template <typename
... TArgsT
>
990 static PFN_FE_WORK_FUNC
GetFunc(bool bArg
, TArgsT
... remainingArgs
)
994 return FEDrawChooser
<ArgsB
..., true>::GetFunc(remainingArgs
...);
997 return FEDrawChooser
<ArgsB
..., false>::GetFunc(remainingArgs
...);
1001 // Selector for correct templated Draw front-end function
1003 static PFN_FE_WORK_FUNC
GetFEDrawFunc(bool IsIndexed
, bool HasTessellation
, bool HasGeometryShader
, bool HasStreamOut
, bool RasterizerEnabled
)
1005 return FEDrawChooser
<>::GetFunc(IsIndexed
, HasTessellation
, HasGeometryShader
, HasStreamOut
, RasterizerEnabled
);
1009 //////////////////////////////////////////////////////////////////////////
1010 /// @brief DrawInstanced
1011 /// @param hContext - Handle passed back from SwrCreateContext
1012 /// @param topology - Specifies topology for draw.
1013 /// @param numVerts - How many vertices to read sequentially from vertex data (per instance).
1014 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1015 /// @param numInstances - How many instances to render.
1016 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1019 PRIMITIVE_TOPOLOGY topology
,
1020 uint32_t numVertices
,
1021 uint32_t startVertex
,
1022 uint32_t numInstances
= 1,
1023 uint32_t startInstance
= 0)
// Queues a non-indexed draw. The request is issued as one front-end DRAW work
// item per pass of the loop below, each covering at most maxVertsPerDraw vertices.
// NOTE(review): the line naming this routine is absent from this listing; call
// sites in this file invoke it as DrawInstanced(hContext, topology, numVertices, startVertex, ...).
1030 RDTSC_START(APIDraw
);
1032 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1033 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// Per-draw vertex limit, the primitive count that limit corresponds to, and
// the number of vertices still to be queued.
1035 int32_t maxVertsPerDraw
= MaxVertsPerDraw(pDC
, numVertices
, topology
);
1036 uint32_t primsPerDraw
= GetNumPrims(topology
, maxVertsPerDraw
);
1037 int32_t remainingVerts
= numVertices
;
// Record the topology in the API state and start with forceFront cleared.
1039 API_STATE
*pState
= &pDC
->pState
->state
;
1040 pState
->topology
= topology
;
1041 pState
->forceFront
= false;
1043 // disable culling for point lists -- NOTE(review): line topologies are not tested below, confirm that is intended
// The previous cull mode is saved so it can be written back after the loop.
1044 uint32_t oldCullMode
= pState
->rastState
.cullMode
;
1045 if (topology
== TOP_POINT_LIST
)
1047 pState
->rastState
.cullMode
= SWR_CULLMODE_NONE
;
1048 pState
->forceFront
= true;
// Queue chunks until no vertices remain.
1052 while (remainingVerts
)
// This chunk takes whichever is smaller: what is left, or the per-draw limit.
1054 uint32_t numVertsForDraw
= (remainingVerts
< maxVertsPerDraw
) ?
1055 remainingVerts
: maxVertsPerDraw
;
// Any chunk with draw > 0 is flagged as a split draw.
// NOTE(review): `draw` is declared and updated on lines not visible here; presumably the chunk index.
1057 bool isSplitDraw
= (draw
> 0) ? true : false;
// NOTE(review): this declares a second pDC; assuming the loop braces missing from
// this listing, it shadows the pDC fetched before the loop.
1058 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
, isSplitDraw
);
1059 InitDraw(pDC
, isSplitDraw
);
// Fill in the front-end work descriptor. The work function is selected from the
// tessellation / geometry shader / stream-out enables and from whether a
// pfnProcessPrims callback is set.
1061 pDC
->FeWork
.type
= DRAW
;
1062 pDC
->FeWork
.pfnWork
= GetFEDrawFunc(
1064 pState
->tsState
.tsEnable
,
1065 pState
->gsState
.gsEnable
,
1066 pState
->soState
.soEnable
,
1067 pDC
->pState
->pfnProcessPrims
!= nullptr);
1068 pDC
->FeWork
.desc
.draw
.numVerts
= numVertsForDraw
;
1069 pDC
->FeWork
.desc
.draw
.startVertex
= startVertex
;
1070 pDC
->FeWork
.desc
.draw
.numInstances
= numInstances
;
1071 pDC
->FeWork
.desc
.draw
.startInstance
= startInstance
;
// Starting primitive and vertex IDs are scaled by `draw`, so each chunk gets its own offsets.
1072 pDC
->FeWork
.desc
.draw
.startPrimID
= draw
* primsPerDraw
;
1073 pDC
->FeWork
.desc
.draw
.startVertexID
= draw
* maxVertsPerDraw
;
1076 QueueDraw(pContext
);
1078 remainingVerts
-= numVertsForDraw
;
1082 // restore culling state
// The draw context is fetched again before the saved cull mode is written back.
1083 pDC
= GetDrawContext(pContext
);
1084 pDC
->pState
->state
.rastState
.cullMode
= oldCullMode
;
1086 RDTSC_STOP(APIDraw
, numVertices
* numInstances
, 0);
1089 //////////////////////////////////////////////////////////////////////////
1091 /// @param hContext - Handle passed back from SwrCreateContext
1092 /// @param topology - Specifies topology for draw.
1093 /// @param startVertex - Specifies start vertex in vertex buffer for draw.
1094 /// @param numVertices - Number of vertices.
1097 PRIMITIVE_TOPOLOGY topology
,
1098 uint32_t startVertex
,
1099 uint32_t numVertices
)
// Forwards to DrawInstanced without supplying any instance arguments.
// Note the swap: this entry point receives startVertex before numVertices,
// while the call below passes numVertices first.
1101 DrawInstanced(hContext
, topology
, numVertices
, startVertex
);
1104 //////////////////////////////////////////////////////////////////////////
1105 /// @brief SwrDrawInstanced - instanced, non-indexed draw; forwards to DrawInstanced.
1106 /// @param hContext - Handle passed back from SwrCreateContext
1107 /// @param topology - Specifies topology for draw.
1108 /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
1109 /// @param numInstances - How many instances to render.
1110 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1111 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1112 void SwrDrawInstanced(
1114 PRIMITIVE_TOPOLOGY topology
,
1115 uint32_t numVertsPerInstance
,
1116 uint32_t numInstances
,
1117 uint32_t startVertex
,
1118 uint32_t startInstance
// The arguments are reordered for the call: startVertex is passed ahead of
// numInstances, unlike the order in which this function receives them.
1121 DrawInstanced(hContext
, topology
, numVertsPerInstance
, startVertex
, numInstances
, startInstance
);
1124 //////////////////////////////////////////////////////////////////////////
1125 /// @brief DrawIndexedInstance - queues an indexed draw, issuing one front-end
///        DRAW work item per chunk of at most maxIndicesPerDraw indices.
1126 /// @param hContext - Handle passed back from SwrCreateContext
1127 /// @param topology - Specifies topology for draw.
1128 /// @param numIndices - Number of indices to read sequentially from index buffer.
1129 /// @param indexOffset - Starting index into index buffer.
1130 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1131 /// @param numInstances - Number of instances to render.
1132 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1133 void DrawIndexedInstance(
1135 PRIMITIVE_TOPOLOGY topology
,
1136 uint32_t numIndices
,
1137 uint32_t indexOffset
,
1139 uint32_t numInstances
= 1,
1140 uint32_t startInstance
= 0)
1147 RDTSC_START(APIDrawIndexed
);
1149 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1150 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1151 API_STATE
* pState
= &pDC
->pState
->state
;
// Per-draw index limit, the primitive count that limit corresponds to, and
// the number of indices still to be queued.
1153 int32_t maxIndicesPerDraw
= MaxVertsPerDraw(pDC
, numIndices
, topology
);
1154 uint32_t primsPerDraw
= GetNumPrims(topology
, maxIndicesPerDraw
);
1155 int32_t remainingIndices
= numIndices
;
// Size in bytes of a single index, chosen from the bound index buffer's format.
// NOTE(review): any default case is not visible in this listing; with only the
// cases shown, an unmatched format leaves indexSize at 0.
1157 uint32_t indexSize
= 0;
1158 switch (pState
->indexBuffer
.format
)
1160 case R32_UINT
: indexSize
= sizeof(uint32_t); break;
1161 case R16_UINT
: indexSize
= sizeof(uint16_t); break;
1162 case R8_UINT
: indexSize
= sizeof(uint8_t); break;
// Byte pointer to the first index to read: buffer base advanced by
// indexOffset indices (the offset is computed in 64 bits).
1168 uint8_t *pIB
= (uint8_t*)pState
->indexBuffer
.pIndices
;
1169 pIB
+= (uint64_t)indexOffset
* (uint64_t)indexSize
;
1171 pState
->topology
= topology
;
1172 pState
->forceFront
= false;
1174 // disable culling for point lists -- NOTE(review): line topologies are not tested below, confirm that is intended
// The previous cull mode is saved so it can be written back after the loop.
1175 uint32_t oldCullMode
= pState
->rastState
.cullMode
;
1176 if (topology
== TOP_POINT_LIST
)
1178 pState
->rastState
.cullMode
= SWR_CULLMODE_NONE
;
1179 pState
->forceFront
= true;
// Queue chunks until no indices remain.
1182 while (remainingIndices
)
// This chunk takes whichever is smaller: what is left, or the per-draw limit.
1184 uint32_t numIndicesForDraw
= (remainingIndices
< maxIndicesPerDraw
) ?
1185 remainingIndices
: maxIndicesPerDraw
;
1187 // When breaking up draw, we need to obtain new draw context for each iteration.
// NOTE(review): `draw` is declared and updated on lines not visible here; presumably the chunk index.
1188 bool isSplitDraw
= (draw
> 0) ? true : false;
1189 pDC
= GetDrawContext(pContext
, isSplitDraw
);
1190 InitDraw(pDC
, isSplitDraw
);
// Fill in the front-end work descriptor. The work function is selected from the
// tessellation / geometry shader / stream-out enables and from whether a
// pfnProcessPrims callback is set.
1192 pDC
->FeWork
.type
= DRAW
;
1193 pDC
->FeWork
.pfnWork
= GetFEDrawFunc(
1195 pState
->tsState
.tsEnable
,
1196 pState
->gsState
.gsEnable
,
1197 pState
->soState
.soEnable
,
1198 pDC
->pState
->pfnProcessPrims
!= nullptr);
1199 pDC
->FeWork
.desc
.draw
.pDC
= pDC
;
1200 pDC
->FeWork
.desc
.draw
.numIndices
= numIndicesForDraw
;
1201 pDC
->FeWork
.desc
.draw
.pIB
= (int*)pIB
;
// The descriptor's `type` field carries the index buffer format.
1202 pDC
->FeWork
.desc
.draw
.type
= pDC
->pState
->state
.indexBuffer
.format
;
1204 pDC
->FeWork
.desc
.draw
.numInstances
= numInstances
;
1205 pDC
->FeWork
.desc
.draw
.startInstance
= startInstance
;
1206 pDC
->FeWork
.desc
.draw
.baseVertex
= baseVertex
;
// Starting primitive ID is scaled by `draw`, so each chunk gets its own offset.
1207 pDC
->FeWork
.desc
.draw
.startPrimID
= draw
* primsPerDraw
;
1210 QueueDraw(pContext
);
// Step the index pointer past a full chunk and account for the indices just queued.
1212 pIB
+= maxIndicesPerDraw
* indexSize
;
1213 remainingIndices
-= numIndicesForDraw
;
1217 // restore culling state
// The draw context is fetched again before the saved cull mode is written back.
1218 pDC
= GetDrawContext(pContext
);
1219 pDC
->pState
->state
.rastState
.cullMode
= oldCullMode
;
1221 RDTSC_STOP(APIDrawIndexed
, numIndices
* numInstances
, 0);
1225 //////////////////////////////////////////////////////////////////////////
1226 /// @brief SwrDrawIndexed - non-instanced indexed draw; forwards to DrawIndexedInstance.
1227 /// @param hContext - Handle passed back from SwrCreateContext
1228 /// @param topology - Specifies topology for draw.
1229 /// @param numIndices - Number of indices to read sequentially from index buffer.
1230 /// @param indexOffset - Starting index into index buffer.
1231 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1232 void SwrDrawIndexed(
1234 PRIMITIVE_TOPOLOGY topology
,
1235 uint32_t numIndices
,
1236 uint32_t indexOffset
,
// numInstances and startInstance are not passed, so DrawIndexedInstance's
// declared defaults (1 and 0) apply.
1240 DrawIndexedInstance(hContext
, topology
, numIndices
, indexOffset
, baseVertex
);
1243 //////////////////////////////////////////////////////////////////////////
1244 /// @brief SwrDrawIndexedInstanced - instanced indexed draw; forwards to DrawIndexedInstance.
1245 /// @param hContext - Handle passed back from SwrCreateContext
1246 /// @param topology - Specifies topology for draw.
1247 /// @param numIndices - Number of indices to read sequentially from index buffer.
1248 /// @param numInstances - Number of instances to render.
1249 /// @param indexOffset - Starting index into index buffer.
1250 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1251 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1252 void SwrDrawIndexedInstanced(
1254 PRIMITIVE_TOPOLOGY topology
,
1255 uint32_t numIndices
,
1256 uint32_t numInstances
,
1257 uint32_t indexOffset
,
1259 uint32_t startInstance
)
// The arguments are reordered for the call: indexOffset and baseVertex are
// passed ahead of numInstances, unlike the order in which this function receives them.
1261 DrawIndexedInstance(hContext
, topology
, numIndices
, indexOffset
, baseVertex
, numInstances
, startInstance
);
1264 // Queues an INVALIDATETILES front-end work item for the attachments selected by attachmentMask.
1265 void SwrInvalidateTiles(
1267 uint32_t attachmentMask
)
// NOTE(review): the handle is cast directly here, whereas the draw entry points
// in this file go through GetContext(hContext) -- confirm the two are equivalent.
1269 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
1270 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1272 // Describe the invalidate work: ProcessInvalidateTiles is the work function and receives the attachment mask
1273 pDC
->FeWork
.type
= INVALIDATETILES
;
1274 pDC
->FeWork
.pfnWork
= ProcessInvalidateTiles
;
1275 pDC
->FeWork
.desc
.invalidateTiles
.attachmentMask
= attachmentMask
;
1278 QueueDraw(pContext
);
1281 //////////////////////////////////////////////////////////////////////////
1282 /// @brief SwrDispatch - queues a compute dispatch covering X * Y * Z thread groups.
1283 /// @param hContext - Handle passed back from SwrCreateContext
1284 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
1285 /// @param threadGroupCountY - Number of thread groups dispatched in Y direction
1286 /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
1289 uint32_t threadGroupCountX
,
1290 uint32_t threadGroupCountY
,
1291 uint32_t threadGroupCountZ
)
1298 RDTSC_START(APIDispatch
);
1299 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
1300 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1302 pDC
->isCompute
= true; // This is a compute context.
1304 // Ensure spill fill pointers are initialized to nullptr.
// NOTE(review): sizeof(pDC->pSpillFill) only covers every entry if pSpillFill is an
// array member rather than a pointer -- its declaration is not visible here.
1305 memset(pDC
->pSpillFill
, 0, sizeof(pDC
->pSpillFill
));
// The dispatch parameters are stored in a COMPUTE_DESC obtained from the draw
// context's arena (AllocAligned called with sizeof(COMPUTE_DESC) and 64).
1307 COMPUTE_DESC
* pTaskData
= (COMPUTE_DESC
*)pDC
->pArena
->AllocAligned(sizeof(COMPUTE_DESC
), 64);
1309 pTaskData
->threadGroupCountX
= threadGroupCountX
;
1310 pTaskData
->threadGroupCountY
= threadGroupCountY
;
1311 pTaskData
->threadGroupCountZ
= threadGroupCountZ
;
// NOTE(review): the product is computed in uint32_t, so very large group counts wrap around.
1313 uint32_t totalThreadGroups
= threadGroupCountX
* threadGroupCountY
* threadGroupCountZ
;
1314 pDC
->pDispatch
->initialize(totalThreadGroups
, pTaskData
);
1316 QueueDispatch(pContext
);
1317 RDTSC_STOP(APIDispatch
, threadGroupCountX
* threadGroupCountY
* threadGroupCountZ
, 0);
1320 // Deswizzles, converts and stores current contents of the hot tiles to surface
1321 // described by pState
// Implemented by queueing a STORETILES front-end work item for the given attachment.
1324 SWR_RENDERTARGET_ATTACHMENT attachment
,
1325 SWR_TILE_STATE postStoreTileState
)
1327 RDTSC_START(APIStoreTiles
);
1329 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
1330 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// Macro tile scissors are set up on the draw context before the work is described.
1332 SetupMacroTileScissors(pDC
);
// ProcessStoreTiles is the work function; it receives the attachment and the
// tile state to apply after the store through the descriptor.
1334 pDC
->FeWork
.type
= STORETILES
;
1335 pDC
->FeWork
.pfnWork
= ProcessStoreTiles
;
1336 pDC
->FeWork
.desc
.storeTiles
.attachment
= attachment
;
1337 pDC
->FeWork
.desc
.storeTiles
.postStoreTileState
= postStoreTileState
;
1340 QueueDraw(pContext
);
1342 RDTSC_STOP(APIStoreTiles
, 0, 0);
// Queues a CLEAR front-end work item carrying the clear mask, the four-component
// clear color, the depth clear value and the stencil clear value.
// NOTE(review): several parameter lines and the declaration of `flags` are absent from this listing.
1345 void SwrClearRenderTarget(
1348 const float clearColor
[4],
1352 RDTSC_START(APIClearRenderTarget
);
1354 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
1356 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// Macro tile scissors are set up on the draw context before the work is described.
1358 SetupMacroTileScissors(pDC
);
1361 flags
.mask
= clearMask
;
// ProcessClear is the work function; the clear parameters travel in the descriptor.
1363 pDC
->FeWork
.type
= CLEAR
;
1364 pDC
->FeWork
.pfnWork
= ProcessClear
;
1365 pDC
->FeWork
.desc
.clear
.flags
= flags
;
1366 pDC
->FeWork
.desc
.clear
.clearDepth
= z
;
// The color is copied one component at a time.
1367 pDC
->FeWork
.desc
.clear
.clearRTColor
[0] = clearColor
[0];
1368 pDC
->FeWork
.desc
.clear
.clearRTColor
[1] = clearColor
[1];
1369 pDC
->FeWork
.desc
.clear
.clearRTColor
[2] = clearColor
[2];
1370 pDC
->FeWork
.desc
.clear
.clearRTColor
[3] = clearColor
[3];
1371 pDC
->FeWork
.desc
.clear
.clearStencil
= stencil
;
1374 QueueDraw(pContext
);
// NOTE(review): pDC->drawId is read after QueueDraw -- confirm the draw context is still valid to read here.
1376 RDTSC_STOP(APIClearRenderTarget
, 0, pDC
->drawId
);
1379 //////////////////////////////////////////////////////////////////////////
1380 /// @brief Returns a pointer to the private context state for the current
1381 /// draw operation. This is used for external components such as the
1383 /// SWR is responsible for the allocation of the private context state.
1384 /// @param hContext - Handle passed back from SwrCreateContext
1385 VOID
* SwrGetPrivateContextState(
1388 SWR_CONTEXT
* pContext
= GetContext(hContext
);
1389 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1390 DRAW_STATE
* pState
= pDC
->pState
;
// Allocate on first request: privateStateSize bytes from the draw state's arena,
// with KNOB_SIMD_WIDTH * sizeof(float) passed as the second AllocAligned argument.
// Later calls for the same draw state return the pointer stored here.
1392 if (pState
->pPrivateState
== nullptr)
1394 pState
->pPrivateState
= pState
->pArena
->AllocAligned(pContext
->privateStateSize
, KNOB_SIMD_WIDTH
*sizeof(float));
1397 return pState
->pPrivateState
;
1400 //////////////////////////////////////////////////////////////////////////
1401 /// @brief Clients can use this to allocate memory for draw/dispatch
1402 /// operations. The memory will automatically be freed once operation
1403 /// has completed. Client can use this to allocate binding tables,
1404 /// etc. needed for shader execution.
1405 /// @param hContext - Handle passed back from SwrCreateContext
1406 /// @param size - Size of allocation
1407 /// @param align - Alignment needed for allocation.
/// @return Pointer returned by the arena's AllocAligned(size, align).
1408 VOID
* SwrAllocDrawContextMemory(
1413 SWR_CONTEXT
* pContext
= GetContext(hContext
);
1414 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// The allocation comes from the arena attached to the current draw context's state.
1416 return pDC
->pState
->pArena
->AllocAligned(size
, align
);
1419 //////////////////////////////////////////////////////////////////////////
1420 /// @brief Returns pointer to SWR stats.
1421 /// @note The counters are atomically incremented by multiple threads.
1422 /// When calling this, you need to ensure all previous operations
1424 /// @todo If necessary, add a callback to avoid stalling the pipe to
1425 /// sample the counters.
1426 /// @param hContext - Handle passed back from SwrCreateContext
1427 /// @param pStats - SWR will fill this out for caller.
1432 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1433 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// Describe a QUERYSTATS work item; ProcessQueryStats is the work function and
// receives the caller's pStats pointer through the descriptor.
1435 pDC
->FeWork
.type
= QUERYSTATS
;
1436 pDC
->FeWork
.pfnWork
= ProcessQueryStats
;
1437 pDC
->FeWork
.desc
.queryStats
.pStats
= pStats
;
1439 // cannot execute until all previous draws have completed
// The dependency is set to the draw id immediately preceding this one.
1440 pDC
->dependency
= pDC
->drawId
- 1;
1443 QueueDraw(pContext
);
1446 //////////////////////////////////////////////////////////////////////////
1447 /// @brief Enables stats counting
1448 /// @param hContext - Handle passed back from SwrCreateContext
1449 /// @param enable - If true then counts are incremented.
1450 void SwrEnableStats(
1454 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1455 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// The flag is stored in the API state of the current draw context; nothing is queued here.
1457 pDC
->pState
->state
.enableStats
= enable
;
1460 //////////////////////////////////////////////////////////////////////////
1461 /// @brief Mark end of frame - used for performance profiling
1462 /// @param hContext - Handle passed back from SwrCreateContext
1463 void SWR_API
SwrEndFrame(