1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief API implementation
27 ******************************************************************************/
34 #include "core/backend.h"
35 #include "core/context.h"
36 #include "core/frontend.h"
37 #include "core/rasterizer.h"
38 #include "core/rdtsc_core.h"
39 #include "core/threads.h"
40 #include "core/tilemgr.h"
41 #include "core/clip.h"
43 #include "common/simdintrin.h"
44 #include "common/os.h"
// Forward declaration: SwrCreateContext calls this before its definition further down.
46 void SetupDefaultState(SWR_CONTEXT
*pContext
);
48 //////////////////////////////////////////////////////////////////////////
49 /// @brief Create SWR Context.
50 /// @param pCreateInfo - pointer to creation info.
/// @return The newly constructed SWR_CONTEXT, cast to an opaque HANDLE.
/// @note Writes back into pCreateInfo: contextSaveSize is set to sizeof(API_STATE),
///       and pBucketMgr is set when KNOB_ENABLE_RDTSC is defined.
/// @note NOTE(review): the result of _aligned_malloc is used without a null check.
51 HANDLE
SwrCreateContext(
52 SWR_CREATECONTEXT_INFO
* pCreateInfo
)
// Allocate aligned storage, zero it, then placement-construct the context in it.
57 void* pContextMem
= _aligned_malloc(sizeof(SWR_CONTEXT
), KNOB_SIMD_WIDTH
* 4);
58 memset(pContextMem
, 0, sizeof(SWR_CONTEXT
));
59 SWR_CONTEXT
*pContext
= new (pContextMem
) SWR_CONTEXT();
61 pContext
->driverType
= pCreateInfo
->driver
;
62 pContext
->privateStateSize
= pCreateInfo
->privateStateSize
;
// Both the draw-context ring and the draw-state ring hold KNOB_MAX_DRAWS_IN_FLIGHT entries.
64 pContext
->dcRing
.Init(KNOB_MAX_DRAWS_IN_FLIGHT
);
65 pContext
->dsRing
.Init(KNOB_MAX_DRAWS_IN_FLIGHT
);
// Each draw context gets an arena, a tile manager built on that arena, and a dispatch
// queue; each draw-state entry gets its own arena. SwrDestroyContext deletes all of them.
67 for (uint32_t dc
= 0; dc
< KNOB_MAX_DRAWS_IN_FLIGHT
; ++dc
)
69 pContext
->dcRing
[dc
].pArena
= new CachingArena(pContext
->cachingArenaAllocator
);
70 pContext
->dcRing
[dc
].pTileMgr
= new MacroTileMgr(*(pContext
->dcRing
[dc
].pArena
));
71 pContext
->dcRing
[dc
].pDispatch
= new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen.
73 pContext
->dsRing
[dc
].pArena
= new CachingArena(pContext
->cachingArenaAllocator
);
76 if (!KNOB_SINGLE_THREADED
)
// Zero the mutex / condition-variable storage, then construct both in place.
78 memset(&pContext
->WaitLock
, 0, sizeof(pContext
->WaitLock
));
79 memset(&pContext
->FifosNotEmpty
, 0, sizeof(pContext
->FifosNotEmpty
));
80 new (&pContext
->WaitLock
) std::mutex();
81 new (&pContext
->FifosNotEmpty
) std::condition_variable();
83 CreateThreadPool(pContext
, &pContext
->threadPool
);
86 // Calling createThreadPool() above can set SINGLE_THREADED
87 if (KNOB_SINGLE_THREADED
)
89 pContext
->NumWorkerThreads
= 1;
92 // Allocate scratch space for workers.
93 ///@note We could lazily allocate this but its rather small amount of memory.
94 for (uint32_t i
= 0; i
< pContext
->NumWorkerThreads
; ++i
)
96 ///@todo Use numa API for allocations using numa information from thread data (if exists).
97 pContext
->pScratch
[i
] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH
* 4);
100 // State setup AFTER context is fully initialized
101 SetupDefaultState(pContext
);
103 // initialize hot tile manager
104 pContext
->pHotTileMgr
= new HotTileMgr();
106 // initialize function pointer tables
107 InitClearTilesTable();
109 // initialize store tiles function
// All three tile callbacks (load, store, clear) are taken from the creation info.
110 pContext
->pfnLoadTile
= pCreateInfo
->pfnLoadTile
;
111 pContext
->pfnStoreTile
= pCreateInfo
->pfnStoreTile
;
112 pContext
->pfnClearTile
= pCreateInfo
->pfnClearTile
;
114 // pass pointer to bucket manager back to caller
115 #ifdef KNOB_ENABLE_RDTSC
116 pCreateInfo
->pBucketMgr
= &gBucketMgr
;
// Size of the state block that SwrSaveState / SwrRestoreState copy.
119 pCreateInfo
->contextSaveSize
= sizeof(API_STATE
);
121 return (HANDLE
)pContext
;
/// @brief Destroy a context: tears down the thread pool, deletes the per-draw arenas,
///        tile managers and dispatch queues, frees worker scratch buffers and the hot
///        tile manager, then runs the SWR_CONTEXT destructor and frees its storage.
/// @param hContext - handle that is cast back to SWR_CONTEXT*.
124 void SwrDestroyContext(HANDLE hContext
)
126 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
127 DestroyThreadPool(pContext
, &pContext
->threadPool
);
// Release everything that SwrCreateContext allocated per ring entry.
130 for (uint32_t i
= 0; i
< KNOB_MAX_DRAWS_IN_FLIGHT
; ++i
)
132 delete pContext
->dcRing
[i
].pArena
;
133 delete pContext
->dsRing
[i
].pArena
;
134 delete(pContext
->dcRing
[i
].pTileMgr
);
135 delete(pContext
->dcRing
[i
].pDispatch
);
138 // Free scratch space.
139 for (uint32_t i
= 0; i
< pContext
->NumWorkerThreads
; ++i
)
141 _aligned_free(pContext
->pScratch
[i
]);
144 delete(pContext
->pHotTileMgr
);
// The context was placement-constructed in _aligned_malloc storage, so it is destroyed
// explicitly and the storage released with _aligned_free.
146 pContext
->~SWR_CONTEXT();
147 _aligned_free((SWR_CONTEXT
*)hContext
);
150 void CopyState(DRAW_STATE
& dst
, const DRAW_STATE
& src
)
152 memcpy(&dst
.state
, &src
.state
, sizeof(API_STATE
));
155 void WakeAllThreads(SWR_CONTEXT
*pContext
)
157 pContext
->FifosNotEmpty
.notify_all();
/// @brief Publish the current draw context on the DC ring, then either process it on the
///        calling thread (KNOB_SINGLE_THREADED) or wake the worker threads. Afterwards the
///        current draw context becomes the previous one and the current pointer is cleared.
/// @tparam IsDraw - QueueDraw instantiates this with true, QueueDispatch with false.
/// @param pContext - context whose pCurDrawContext is being queued.
/// @note NOTE(review): the lines that branch on IsDraw and on the threading mode are not
///       visible in this listing. Presumably draws run WorkOnFifoFE/WorkOnFifoBE, dispatches
///       run WorkOnCompute, and the wake-up happens only when not single threaded — confirm.
160 template<bool IsDraw
>
161 void QueueWork(SWR_CONTEXT
*pContext
)
163 // Each worker thread looks at a DC for both FE and BE work at different times and so we
164 // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers
165 // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
166 // then moved on if all work is done.)
167 pContext
->pCurDrawContext
->threadsDone
=
168 pContext
->NumWorkerThreads
? pContext
->NumWorkerThreads
* 2 : 2;
// WaitLock is taken before the draw context is enqueued on the ring.
172 std::unique_lock
<std::mutex
> lock(pContext
->WaitLock
);
173 pContext
->dcRing
.Enqueue();
176 if (KNOB_SINGLE_THREADED
)
178 // flush denormals to 0
179 uint32_t mxcsr
= _mm_getcsr();
180 _mm_setcsr(mxcsr
| _MM_FLUSH_ZERO_ON
| _MM_DENORMALS_ZERO_ON
);
// Front-end and back-end work both start from this draw's id.
184 static TileSet lockedTiles
;
185 uint64_t curDraw
[2] = { pContext
->pCurDrawContext
->drawId
, pContext
->pCurDrawContext
->drawId
};
186 WorkOnFifoFE(pContext
, 0, curDraw
[0], 0);
187 WorkOnFifoBE(pContext
, 0, curDraw
[1], lockedTiles
, 0, 0);
191 uint64_t curDispatch
= pContext
->pCurDrawContext
->drawId
;
192 WorkOnCompute(pContext
, 0, curDispatch
);
195 // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
196 while (CompleteDrawContext(pContext
, pContext
->pCurDrawContext
) > 0) {}
203 RDTSC_START(APIDrawWakeAllThreads
);
204 WakeAllThreads(pContext
);
205 RDTSC_STOP(APIDrawWakeAllThreads
, 1, 0);
208 // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
209 pContext
->pPrevDrawContext
= pContext
->pCurDrawContext
;
210 pContext
->pCurDrawContext
= nullptr;
213 INLINE
void QueueDraw(SWR_CONTEXT
* pContext
)
215 QueueWork
<true>(pContext
);
218 INLINE
void QueueDispatch(SWR_CONTEXT
* pContext
)
220 QueueWork
<false>(pContext
);
/// @brief Return the context's current draw context, taking a new entry from the DC ring
///        (and resetting its bookkeeping) when there is none.
/// @param pContext - context that owns the DC and DS rings.
/// @param isSplitDraw - when a previous draw context exists, true reuses its state pointer
///        and false copies its state into the next DS ring entry.
/// @return pContext->pCurDrawContext.
/// @note NOTE(review): the `else` lines and the body of the ring-full wait loop are not
///       visible in this listing; the branch structure is inferred from the comments.
223 DRAW_CONTEXT
* GetDrawContext(SWR_CONTEXT
*pContext
, bool isSplitDraw
= false)
225 RDTSC_START(APIGetDrawContext
);
226 // If current draw context is null then need to obtain a new draw context to use from ring.
227 if (pContext
->pCurDrawContext
== nullptr)
229 // Need to wait for a free entry.
230 while (pContext
->dcRing
.IsFull())
// The ring head is reduced modulo the ring size to pick the slot.
235 uint32_t dcIndex
= pContext
->dcRing
.GetHead() % KNOB_MAX_DRAWS_IN_FLIGHT
;
237 DRAW_CONTEXT
* pCurDrawContext
= &pContext
->dcRing
[dcIndex
];
238 pContext
->pCurDrawContext
= pCurDrawContext
;
240 // Assign next available entry in DS ring to this DC.
241 uint32_t dsIndex
= pContext
->curStateId
% KNOB_MAX_DRAWS_IN_FLIGHT
;
242 pCurDrawContext
->pState
= &pContext
->dsRing
[dsIndex
];
244 // Copy previous state to current state.
245 if (pContext
->pPrevDrawContext
)
247 DRAW_CONTEXT
* pPrevDrawContext
= pContext
->pPrevDrawContext
;
249 // If we're splitting our draw then we can just use the same state from the previous
250 // draw. In this case, we won't increment the DS ring index so the next non-split
251 // draw can receive the state.
252 if (isSplitDraw
== false)
254 CopyState(*pCurDrawContext
->pState
, *pPrevDrawContext
->pState
);
256 // Should have been cleaned up previously
257 SWR_ASSERT(pCurDrawContext
->pState
->pArena
->IsEmpty() == true);
259 pCurDrawContext
->pState
->pPrivateState
= nullptr;
261 pContext
->curStateId
++; // Progress state ring index forward.
265 // If its a split draw then just copy the state pointer over
266 // since its the same draw.
267 pCurDrawContext
->pState
= pPrevDrawContext
->pState
;
268 SWR_ASSERT(pPrevDrawContext
->cleanupState
== false);
273 SWR_ASSERT(pCurDrawContext
->pState
->pArena
->IsEmpty() == true);
274 pContext
->curStateId
++; // Progress state ring index forward.
277 SWR_ASSERT(pCurDrawContext
->pArena
->IsEmpty() == true);
// Reset per-draw bookkeeping on the freshly acquired draw context.
279 pCurDrawContext
->dependency
= 0;
280 pCurDrawContext
->pContext
= pContext
;
281 pCurDrawContext
->isCompute
= false; // Dispatch has to set this to true.
283 pCurDrawContext
->doneFE
= false;
284 pCurDrawContext
->FeLock
= 0;
285 pCurDrawContext
->threadsDone
= 0;
287 pCurDrawContext
->pTileMgr
->initialize();
289 // Assign unique drawId for this DC
290 pCurDrawContext
->drawId
= pContext
->dcRing
.GetHead();
292 pCurDrawContext
->cleanupState
= true;
296 SWR_ASSERT(isSplitDraw
== false, "Split draw should only be used when obtaining a new DC");
299 RDTSC_STOP(APIGetDrawContext
, 0, 0);
300 return pContext
->pCurDrawContext
;
303 API_STATE
* GetDrawState(SWR_CONTEXT
*pContext
)
305 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
306 SWR_ASSERT(pDC
->pState
!= nullptr);
308 return &pDC
->pState
->state
;
/// @brief Copy the current API_STATE into a caller-supplied block.
/// @param pOutputStateBlock - destination; asserted non-null.
/// @note The assert also requires memSize >= sizeof(API_STATE); sizeof(API_STATE) bytes are
///       copied.
/// @note NOTE(review): hContext and memSize are used below but their parameter lines are
///       not visible in this listing.
311 void SWR_API
SwrSaveState(
313 void* pOutputStateBlock
,
316 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
317 auto pSrc
= GetDrawState(pContext
);
318 SWR_ASSERT(pOutputStateBlock
&& memSize
>= sizeof(*pSrc
));
320 memcpy(pOutputStateBlock
, pSrc
, sizeof(*pSrc
));
/// @brief Overwrite the current API_STATE from a caller-supplied block.
/// @param pStateBlock - source; asserted non-null.
/// @note The assert also requires memSize >= sizeof(API_STATE); sizeof(API_STATE) bytes are
///       copied.
/// @note NOTE(review): hContext and memSize are used below but their parameter lines are
///       not visible in this listing.
323 void SWR_API
SwrRestoreState(
325 const void* pStateBlock
,
328 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
329 auto pDst
= GetDrawState(pContext
);
330 SWR_ASSERT(pStateBlock
&& memSize
>= sizeof(*pDst
));
332 memcpy(pDst
, pStateBlock
, sizeof(*pDst
));
335 void SetupDefaultState(SWR_CONTEXT
*pContext
)
337 API_STATE
* pState
= GetDrawState(pContext
);
339 pState
->rastState
.cullMode
= SWR_CULLMODE_NONE
;
340 pState
->rastState
.frontWinding
= SWR_FRONTWINDING_CCW
;
343 static INLINE SWR_CONTEXT
* GetContext(HANDLE hContext
)
345 return (SWR_CONTEXT
*)hContext
;
/// @brief Set up the current draw context as a SYNC work item that carries a user callback
///        and three user data words, dependent on the preceding draw id.
/// @param hContext - context handle.
/// @param pfnFunc - callback stored in the sync descriptor; asserted non-null.
/// @param userData, userData2, userData3 - stored verbatim in the sync descriptor.
/// @note NOTE(review): the call that enqueues the work item is not visible in this listing
///       (presumably QueueDraw) — confirm.
348 void SwrSync(HANDLE hContext
, PFN_CALLBACK_FUNC pfnFunc
, uint64_t userData
, uint64_t userData2
, uint64_t userData3
)
350 RDTSC_START(APISync
);
352 SWR_ASSERT(pfnFunc
!= nullptr);
354 SWR_CONTEXT
*pContext
= GetContext(hContext
);
355 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
357 pDC
->FeWork
.type
= SYNC
;
358 pDC
->FeWork
.pfnWork
= ProcessSync
;
359 pDC
->FeWork
.desc
.sync
.pfnCallbackFunc
= pfnFunc
;
360 pDC
->FeWork
.desc
.sync
.userData
= userData
;
361 pDC
->FeWork
.desc
.sync
.userData2
= userData2
;
362 pDC
->FeWork
.desc
.sync
.userData3
= userData3
;
364 // cannot execute until all previous draws have completed
365 pDC
->dependency
= pDC
->drawId
- 1;
370 RDTSC_STOP(APISync
, 1, 0);
/// @brief Loop until the context's draw-context ring reports empty.
/// @param hContext - context handle.
/// @note NOTE(review): the loop body is not visible in this listing (presumably a
///       pause/yield) — confirm.
373 void SwrWaitForIdle(HANDLE hContext
)
375 SWR_CONTEXT
*pContext
= GetContext(hContext
);
377 RDTSC_START(APIWaitForIdle
);
379 while (!pContext
->dcRing
.IsEmpty())
384 RDTSC_STOP(APIWaitForIdle
, 1, 0);
/// @brief Copy each supplied vertex buffer description into the current draw state, using
///        the description's own `index` field as the slot.
/// @param pVertexBuffers - array of descriptions; numBuffers entries are read.
/// @note NOTE(review): the hContext and numBuffers parameter lines are not visible in this
///       listing. pVB->index is not range-checked here — presumably callers guarantee it
///       fits vertexBuffers; confirm.
387 void SwrSetVertexBuffers(
390 const SWR_VERTEX_BUFFER_STATE
* pVertexBuffers
)
392 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
394 for (uint32_t i
= 0; i
< numBuffers
; ++i
)
396 const SWR_VERTEX_BUFFER_STATE
*pVB
= &pVertexBuffers
[i
];
397 pState
->vertexBuffers
[pVB
->index
] = *pVB
;
/// @brief Copy the index buffer description into the current draw state.
/// @param pIndexBuffer - description to copy; dereferenced without a null check.
/// @note NOTE(review): the hContext parameter line is not visible in this listing.
401 void SwrSetIndexBuffer(
403 const SWR_INDEX_BUFFER_STATE
* pIndexBuffer
)
405 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
407 pState
->indexBuffer
= *pIndexBuffer
;
/// @brief Store the fetch function pointer in the current draw state.
/// @param pfnFetchFunc - function pointer to store.
/// @note NOTE(review): the hContext parameter line is not visible in this listing.
410 void SwrSetFetchFunc(
412 PFN_FETCH_FUNC pfnFetchFunc
)
414 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
416 pState
->pfnFetchFunc
= pfnFetchFunc
;
// Stream-out function setter. NOTE(review): the function-name line and hContext parameter
// are not visible in this listing.
// Stores pfnSoFunc into the per-stream table slot selected by streamIndex, which is
// asserted to be below MAX_SO_STREAMS.
421 PFN_SO_FUNC pfnSoFunc
,
422 uint32_t streamIndex
)
424 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
426 SWR_ASSERT(streamIndex
< MAX_SO_STREAMS
);
428 pState
->pfnSoFunc
[streamIndex
] = pfnSoFunc
;
// Stream-out state setter. NOTE(review): the function-name line and hContext parameter are
// not visible in this listing.
// Copies *pSoState into the current draw state's soState.
433 SWR_STREAMOUT_STATE
* pSoState
)
435 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
437 pState
->soState
= *pSoState
;
/// @brief Copy one stream-out buffer description into the given slot of the current state.
/// @param pSoBuffer - description to copy.
/// @note slot is asserted to be below 4.
/// @note NOTE(review): the hContext and slot parameter lines are not visible in this
///       listing.
440 void SwrSetSoBuffers(
442 SWR_STREAMOUT_BUFFER
* pSoBuffer
,
445 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
447 SWR_ASSERT((slot
< 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot
);
449 pState
->soBuffer
[slot
] = *pSoBuffer
;
/// @brief Store the vertex function pointer in the current draw state.
/// @param pfnVertexFunc - function pointer to store.
/// @note NOTE(review): the hContext parameter line is not visible in this listing.
452 void SwrSetVertexFunc(
454 PFN_VERTEX_FUNC pfnVertexFunc
)
456 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
458 pState
->pfnVertexFunc
= pfnVertexFunc
;
/// @brief Copy the front-end state into the current draw state.
/// @param pFEState - state to copy; dereferenced without a null check.
/// @note NOTE(review): the hContext parameter line is not visible in this listing.
461 void SwrSetFrontendState(
463 SWR_FRONTEND_STATE
*pFEState
)
465 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
466 pState
->frontendState
= *pFEState
;
// Geometry-shader state setter. NOTE(review): the function-name line and hContext
// parameter are not visible in this listing.
// Copies *pGSState into the current draw state's gsState.
471 SWR_GS_STATE
*pGSState
)
473 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
474 pState
->gsState
= *pGSState
;
// Geometry-shader function setter. NOTE(review): the function-name line and hContext
// parameter are not visible in this listing.
// Stores pfnGsFunc in the current draw state.
479 PFN_GS_FUNC pfnGsFunc
)
481 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
482 pState
->pfnGsFunc
= pfnGsFunc
;
// Compute-shader function setter. NOTE(review): the function-name line and hContext
// parameter are not visible in this listing.
// Stores pfnCsFunc and totalThreadsInGroup in the current draw state.
487 PFN_CS_FUNC pfnCsFunc
,
488 uint32_t totalThreadsInGroup
)
490 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
491 pState
->pfnCsFunc
= pfnCsFunc
;
492 pState
->totalThreadsInGroup
= totalThreadsInGroup
;
// Tessellation state setter. NOTE(review): the function-name line and hContext parameter
// are not visible in this listing.
// Copies *pState (the parameter) into the current draw state's tsState. The local is
// named pApiState so it does not shadow the parameter.
497 SWR_TS_STATE
*pState
)
499 API_STATE
* pApiState
= GetDrawState(GetContext(hContext
));
500 pApiState
->tsState
= *pState
;
// Body of the setter for pfnHsFunc. NOTE(review): the whole declaration (name, hContext,
// pfnFunc) is not visible in this listing.
// Stores pfnFunc in the current draw state.
507 API_STATE
* pApiState
= GetDrawState(GetContext(hContext
));
508 pApiState
->pfnHsFunc
= pfnFunc
;
// Body of the setter for pfnDsFunc. NOTE(review): the whole declaration (name, hContext,
// pfnFunc) is not visible in this listing.
// Stores pfnFunc in the current draw state.
515 API_STATE
* pApiState
= GetDrawState(GetContext(hContext
));
516 pApiState
->pfnDsFunc
= pfnFunc
;
/// @brief Copy the depth/stencil state into the current draw state.
/// @param pDSState - state to copy; dereferenced without a null check.
/// @note NOTE(review): the hContext parameter line is not visible in this listing.
519 void SwrSetDepthStencilState(
521 SWR_DEPTH_STENCIL_STATE
*pDSState
)
523 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
525 pState
->depthStencilState
= *pDSState
;
/// @brief Copy the back-end state into the current draw state.
/// @param pBEState - state to copy; dereferenced without a null check.
/// @note NOTE(review): the hContext parameter line is not visible in this listing.
528 void SwrSetBackendState(
530 SWR_BACKEND_STATE
*pBEState
)
532 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
534 pState
->backendState
= *pBEState
;
/// @brief Copy the pixel shader state into the current draw state.
/// @param pPSState - state to copy; dereferenced without a null check.
/// @note NOTE(review): the hContext parameter line is not visible in this listing.
537 void SwrSetPixelShaderState(
539 SWR_PS_STATE
*pPSState
)
541 API_STATE
*pState
= GetDrawState(GetContext(hContext
));
542 pState
->psState
= *pPSState
;
/// @brief Copy the blend state into the current draw state with memcpy.
/// @param pBlendState - source; sizeof(SWR_BLEND_STATE) bytes are read from it.
/// @note NOTE(review): the hContext parameter line is not visible in this listing.
545 void SwrSetBlendState(
547 SWR_BLEND_STATE
*pBlendState
)
549 API_STATE
*pState
= GetDrawState(GetContext(hContext
));
550 memcpy(&pState
->blendState
, pBlendState
, sizeof(SWR_BLEND_STATE
));
/// @brief Store the blend function for one render target in the current draw state.
/// @param renderTarget - table index; asserted to be below SWR_NUM_RENDERTARGETS.
/// @param pfnBlendFunc - function pointer to store.
/// @note NOTE(review): the hContext parameter line is not visible in this listing.
553 void SwrSetBlendFunc(
555 uint32_t renderTarget
,
556 PFN_BLEND_JIT_FUNC pfnBlendFunc
)
558 SWR_ASSERT(renderTarget
< SWR_NUM_RENDERTARGETS
);
559 API_STATE
*pState
= GetDrawState(GetContext(hContext
));
560 pState
->pfnBlendFunc
[renderTarget
] = pfnBlendFunc
;
// Body of the linkage setter. NOTE(review): the declaration (name, hContext, mask, pMap)
// is not visible in this listing.
// Stores the attribute mask, derives linkageCount as the population count of the mask,
// and copies linkageCount bytes from pMap into linkageMap.
// NOTE(review): IDENTITY_MAP is presumably substituted when pMap is null; that check is
// not visible here — confirm.
568 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
570 static const uint8_t IDENTITY_MAP
[] =
572 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
573 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
// Compile-time guard: the identity table must have the same size as linkageMap.
575 static_assert(sizeof(IDENTITY_MAP
) == sizeof(pState
->linkageMap
),
576 "Update for new value of MAX_ATTRIBUTES");
578 pState
->linkageMask
= mask
;
579 pState
->linkageCount
= _mm_popcnt_u32(mask
);
585 memcpy(pState
->linkageMap
, pMap
, pState
->linkageCount
);
588 // update guardband multipliers for the viewport
589 void updateGuardband(API_STATE
*pState
)
591 // guardband center is viewport center
592 pState
->gbState
.left
= KNOB_GUARDBAND_WIDTH
/ pState
->vp
[0].width
;
593 pState
->gbState
.right
= KNOB_GUARDBAND_WIDTH
/ pState
->vp
[0].width
;
594 pState
->gbState
.top
= KNOB_GUARDBAND_HEIGHT
/ pState
->vp
[0].height
;
595 pState
->gbState
.bottom
= KNOB_GUARDBAND_HEIGHT
/ pState
->vp
[0].height
;
/// @brief Copy the rasterizer state into the current draw state with memcpy.
/// @param pRastState - source; sizeof(SWR_RASTSTATE) bytes are read from it.
/// @note NOTE(review): the hContext parameter line is not visible in this listing.
598 void SwrSetRastState(
600 const SWR_RASTSTATE
*pRastState
)
602 SWR_CONTEXT
*pContext
= GetContext(hContext
);
603 API_STATE
* pState
= GetDrawState(pContext
);
605 memcpy(&pState
->rastState
, pRastState
, sizeof(SWR_RASTSTATE
));
/// @brief Store viewports and their transform matrices in the current draw state, then
///        refresh the guardband multipliers.
/// @param numViewports - number of entries; asserted <= KNOB_NUM_VIEWPORTS_SCISSORS.
/// @param pViewports - viewports copied into state->vp.
/// @param pMatrices - when non-null, copied into state->vpMatrix.
/// @note NOTE(review): the hContext parameter line and the `else` lines are not visible in
///       this listing. Presumably the default matrices are computed only when pMatrices is
///       null, with the first set of formulas for DX and the second otherwise — confirm.
608 void SwrSetViewports(
610 uint32_t numViewports
,
611 const SWR_VIEWPORT
* pViewports
,
612 const SWR_VIEWPORT_MATRIX
* pMatrices
)
614 SWR_ASSERT(numViewports
<= KNOB_NUM_VIEWPORTS_SCISSORS
,
615 "Invalid number of viewports.");
617 SWR_CONTEXT
*pContext
= GetContext(hContext
);
618 API_STATE
* pState
= GetDrawState(pContext
);
620 memcpy(&pState
->vp
[0], pViewports
, sizeof(SWR_VIEWPORT
) * numViewports
);
622 if (pMatrices
!= nullptr)
624 memcpy(&pState
->vpMatrix
[0], pMatrices
, sizeof(SWR_VIEWPORT_MATRIX
) * numViewports
);
628 // Compute default viewport transform.
629 for (uint32_t i
= 0; i
< numViewports
; ++i
)
631 if (pContext
->driverType
== DX
)
// DX: m11 is negated, and depth maps with scale (maxZ - minZ) and offset minZ.
633 pState
->vpMatrix
[i
].m00
= pState
->vp
[i
].width
/ 2.0f
;
634 pState
->vpMatrix
[i
].m11
= -pState
->vp
[i
].height
/ 2.0f
;
635 pState
->vpMatrix
[i
].m22
= pState
->vp
[i
].maxZ
- pState
->vp
[i
].minZ
;
636 pState
->vpMatrix
[i
].m30
= pState
->vp
[i
].x
+ pState
->vpMatrix
[i
].m00
;
637 pState
->vpMatrix
[i
].m31
= pState
->vp
[i
].y
- pState
->vpMatrix
[i
].m11
;
638 pState
->vpMatrix
[i
].m32
= pState
->vp
[i
].minZ
;
642 // Standard, with the exception that Y is inverted.
// NOTE(review): these formulas subtract x from width and height from y, which reads as if
// width/height hold far-edge coordinates rather than extents — confirm the units of vp.
643 pState
->vpMatrix
[i
].m00
= (pState
->vp
[i
].width
- pState
->vp
[i
].x
) / 2.0f
;
644 pState
->vpMatrix
[i
].m11
= (pState
->vp
[i
].y
- pState
->vp
[i
].height
) / 2.0f
;
645 pState
->vpMatrix
[i
].m22
= (pState
->vp
[i
].maxZ
- pState
->vp
[i
].minZ
) / 2.0f
;
646 pState
->vpMatrix
[i
].m30
= pState
->vp
[i
].x
+ pState
->vpMatrix
[i
].m00
;
647 pState
->vpMatrix
[i
].m31
= pState
->vp
[i
].height
+ pState
->vpMatrix
[i
].m11
;
648 pState
->vpMatrix
[i
].m32
= pState
->vp
[i
].minZ
+ pState
->vpMatrix
[i
].m22
;
650 // Now that the matrix is calculated, clip the view coords to screen size.
651 // OpenGL allows for -ve x,y in the viewport.
652 pState
->vp
[i
].x
= std::max(pState
->vp
[i
].x
, 0.0f
);
653 pState
->vp
[i
].y
= std::max(pState
->vp
[i
].y
, 0.0f
);
// The guardband multipliers are derived from viewport 0, so refresh them.
658 updateGuardband(pState
);
/// @brief Copy scissor rectangles into the current draw state.
/// @param numScissors - number of rectangles; asserted <= KNOB_NUM_VIEWPORTS_SCISSORS.
/// @param pScissors - source array; numScissors BBOX entries are read.
/// @note NOTE(review): the hContext parameter line is not visible in this listing.
661 void SwrSetScissorRects(
663 uint32_t numScissors
,
664 const BBOX
* pScissors
)
666 SWR_ASSERT(numScissors
<= KNOB_NUM_VIEWPORTS_SCISSORS
,
667 "Invalid number of scissor rects.");
669 API_STATE
* pState
= GetDrawState(GetContext(hContext
));
670 memcpy(&pState
->scissorRects
[0], pScissors
, numScissors
* sizeof(BBOX
));
/// @brief Derive the fixed-point scissor rectangle for a draw, from scissor rect 0 when
///        scissoring is enabled and from viewport 0 otherwise.
/// @param pDC - draw context whose state->scissorInFixedPoint is written.
/// @note NOTE(review): the `else` lines are not visible in this listing; the two-way
///       branches are inferred from the comments and the duplicated assignments.
673 void SetupMacroTileScissors(DRAW_CONTEXT
*pDC
)
675 API_STATE
*pState
= &pDC
->pState
->state
;
676 uint32_t left
, right
, top
, bottom
;
678 // Set up scissor dimensions based on scissor or viewport
679 if (pState
->rastState
.scissorEnable
)
681 // scissor rect right/bottom edge are exclusive, core expects scissor dimensions to be inclusive, so subtract one pixel from right/bottom edges
// NOTE(review): no subtraction happens at this point; the "- 1" is applied further down,
// after scaling by FIXED_POINT_SCALE.
682 left
= pState
->scissorRects
[0].left
;
683 right
= pState
->scissorRects
[0].right
;
684 top
= pState
->scissorRects
[0].top
;
685 bottom
= pState
->scissorRects
[0].bottom
;
// Viewport path: viewport fields are cast to int32_t before being stored in the uint32_t
// locals.
689 left
= (int32_t)pState
->vp
[0].x
;
690 right
= (int32_t)pState
->vp
[0].x
+ (int32_t)pState
->vp
[0].width
;
691 top
= (int32_t)pState
->vp
[0].y
;
692 bottom
= (int32_t)pState
->vp
[0].y
+ (int32_t)pState
->vp
[0].height
;
// Clamp the far edges to the maximum supported scissor extent.
695 right
= std::min
<uint32_t>(right
, KNOB_MAX_SCISSOR_X
);
696 bottom
= std::min
<uint32_t>(bottom
, KNOB_MAX_SCISSOR_Y
);
// A rectangle that starts beyond the maximum extent is zeroed out.
698 if (left
> KNOB_MAX_SCISSOR_X
|| top
> KNOB_MAX_SCISSOR_Y
)
700 pState
->scissorInFixedPoint
.left
= 0;
701 pState
->scissorInFixedPoint
.right
= 0;
702 pState
->scissorInFixedPoint
.top
= 0;
703 pState
->scissorInFixedPoint
.bottom
= 0;
// Scale to fixed point; right/bottom become inclusive by subtracting one fixed-point unit.
707 pState
->scissorInFixedPoint
.left
= left
* FIXED_POINT_SCALE
;
708 pState
->scissorInFixedPoint
.right
= right
* FIXED_POINT_SCALE
- 1;
709 pState
->scissorInFixedPoint
.top
= top
* FIXED_POINT_SCALE
;
710 pState
->scissorInFixedPoint
.bottom
= bottom
* FIXED_POINT_SCALE
- 1;
713 // templated backend function tables
// Declared extern here and indexed by SetupPipeline; the comments below give the index
// order SetupPipeline uses for each table.
// gBackendNullPs: [sampleCount]
714 extern PFN_BACKEND_FUNC gBackendNullPs
[SWR_MULTISAMPLE_TYPE_MAX
];
// gBackendSingleSample: [inputCoverage][centroid]
715 extern PFN_BACKEND_FUNC gBackendSingleSample
[2][2];
// gBackendPixelRateTable: [sampleCount][samplePattern][inputCoverage][centroid][forcedSampleCount]
716 extern PFN_BACKEND_FUNC gBackendPixelRateTable
[SWR_MULTISAMPLE_TYPE_MAX
][SWR_MSAA_SAMPLE_PATTERN_MAX
][SWR_INPUT_COVERAGE_MAX
][2][2];
// gBackendSampleRateTable: [sampleCount][inputCoverage][centroid]
717 extern PFN_BACKEND_FUNC gBackendSampleRateTable
[SWR_MULTISAMPLE_TYPE_MAX
][SWR_INPUT_COVERAGE_MAX
][2];
// gBackendOutputMergerTable: [numRenderTargets][sampleCount]
718 extern PFN_OUTPUT_MERGER gBackendOutputMergerTable
[SWR_NUM_RENDERTARGETS
+ 1][SWR_MULTISAMPLE_TYPE_MAX
];
// The barycentric tables are indexed by 0/1 flags derived from the barycentrics mask.
719 extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable
[2];
720 extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable
[2];
// gCentroidBarycentricTable: [sampleCount][centroid enabled][samplePattern][forcedSampleCount]
721 extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable
[SWR_MULTISAMPLE_TYPE_MAX
][2][2][2];
/// @brief Resolve the function pointers and derived flags a draw needs from its API state:
///        back-end / output-merger / barycentric functions, the primitive processing
///        function, the front-end attribute mask, and the depth/stencil/color hot-tile
///        enables.
/// @param pDC - draw context whose pState is read and updated.
/// @note NOTE(review): `else`, `break`, `default` and several `case` lines are not visible
///       in this listing; the comments below describe only what is visible.
722 void SetupPipeline(DRAW_CONTEXT
*pDC
)
724 DRAW_STATE
* pState
= pDC
->pState
;
725 const SWR_RASTSTATE
&rastState
= pState
->state
.rastState
;
726 const SWR_PS_STATE
&psState
= pState
->state
.psState
;
727 BACKEND_FUNCS
& backendFuncs
= pState
->backendFuncs
;
// 0/1 flag used as an index into the function tables.
728 const uint32_t forcedSampleCount
= (rastState
.bForcedSampleCount
) ? 1 : 0;
// Without a pixel shader, the null-PS back end is selected by sample count.
731 if (psState
.pfnPixelShader
== nullptr)
733 backendFuncs
.pfnBackend
= gBackendNullPs
[pState
->state
.rastState
.sampleCount
];
734 // always need to generate I & J per sample for Z interpolation
735 backendFuncs
.pfnCalcSampleBarycentrics
= gSampleBarycentricTable
[1];
739 const bool bMultisampleEnable
= ((rastState
.sampleCount
> SWR_MULTISAMPLE_1X
) || rastState
.bForcedSampleCount
) ? 1 : 0;
740 const uint32_t centroid
= ((psState
.barycentricsMask
& SWR_BARYCENTRIC_CENTROID_MASK
) > 0) ? 1 : 0;
742 // currently only support 'normal' input coverage
743 SWR_ASSERT(psState
.inputCoverage
== SWR_INPUT_COVERAGE_NORMAL
||
744 psState
.inputCoverage
== SWR_INPUT_COVERAGE_NONE
);
746 SWR_BARYCENTRICS_MASK barycentricsMask
= (SWR_BARYCENTRICS_MASK
)psState
.barycentricsMask
;
748 // select backend function
749 switch(psState
.shadingRate
)
751 case SWR_SHADING_RATE_PIXEL
:
752 if(bMultisampleEnable
)
754 // always need to generate I & J per sample for Z interpolation
755 barycentricsMask
= (SWR_BARYCENTRICS_MASK
)(barycentricsMask
| SWR_BARYCENTRIC_PER_SAMPLE_MASK
);
756 backendFuncs
.pfnBackend
= gBackendPixelRateTable
[rastState
.sampleCount
][rastState
.samplePattern
][psState
.inputCoverage
][centroid
][forcedSampleCount
];
// NOTE(review): the output merger is indexed by blendState.sampleCount while the back end
// uses rastState.sampleCount — presumably kept in sync by the caller; confirm.
757 backendFuncs
.pfnOutputMerger
= gBackendOutputMergerTable
[psState
.numRenderTargets
][pState
->state
.blendState
.sampleCount
];
761 // always need to generate I & J per pixel for Z interpolation
762 barycentricsMask
= (SWR_BARYCENTRICS_MASK
)(barycentricsMask
| SWR_BARYCENTRIC_PER_PIXEL_MASK
);
763 backendFuncs
.pfnBackend
= gBackendSingleSample
[psState
.inputCoverage
][centroid
];
764 backendFuncs
.pfnOutputMerger
= gBackendOutputMergerTable
[psState
.numRenderTargets
][SWR_MULTISAMPLE_1X
];
767 case SWR_SHADING_RATE_SAMPLE
:
768 SWR_ASSERT(rastState
.samplePattern
== SWR_MSAA_STANDARD_PATTERN
);
769 // always need to generate I & J per sample for Z interpolation
770 barycentricsMask
= (SWR_BARYCENTRICS_MASK
)(barycentricsMask
| SWR_BARYCENTRIC_PER_SAMPLE_MASK
);
771 backendFuncs
.pfnBackend
= gBackendSampleRateTable
[rastState
.sampleCount
][psState
.inputCoverage
][centroid
];
772 backendFuncs
.pfnOutputMerger
= gBackendOutputMergerTable
[psState
.numRenderTargets
][pState
->state
.blendState
.sampleCount
];
775 SWR_ASSERT(0 && "Invalid shading rate");
779 // setup pointer to function that generates necessary barycentrics required by the PS
780 bool bBarycentrics
= (barycentricsMask
& SWR_BARYCENTRIC_PER_PIXEL_MASK
) > 0 ? 1 : 0;
781 backendFuncs
.pfnCalcPixelBarycentrics
= gPixelBarycentricTable
[bBarycentrics
];
783 bBarycentrics
= (barycentricsMask
& SWR_BARYCENTRIC_PER_SAMPLE_MASK
) > 0 ? 1 : 0;
784 backendFuncs
.pfnCalcSampleBarycentrics
= gSampleBarycentricTable
[bBarycentrics
];
786 bBarycentrics
= (barycentricsMask
& SWR_BARYCENTRIC_CENTROID_MASK
) > 0 ? 1 : 0;
787 backendFuncs
.pfnCalcCentroidBarycentrics
= gCentroidBarycentricTable
[rastState
.sampleCount
][bBarycentrics
][rastState
.samplePattern
][forcedSampleCount
];
// Choose the clipper and binner pair by topology. Only some case labels are visible.
790 PFN_PROCESS_PRIMS pfnBinner
;
791 switch (pState
->state
.topology
)
794 pState
->pfnProcessPrims
= ClipPoints
;
795 pfnBinner
= BinPoints
;
800 case TOP_LINE_LIST_ADJ
:
801 case TOP_LISTSTRIP_ADJ
:
802 pState
->pfnProcessPrims
= ClipLines
;
803 pfnBinner
= BinLines
;
806 pState
->pfnProcessPrims
= ClipTriangles
;
807 pfnBinner
= BinTriangles
;
811 // disable clipper if viewport transform is disabled
812 if (pState
->state
.frontendState
.vpTransformDisable
)
814 pState
->pfnProcessPrims
= pfnBinner
;
// With no pixel shader, no depth/stencil test or write, and no linked attributes, primitive
// processing is cleared and the linkage mask zeroed.
817 if ((pState
->state
.psState
.pfnPixelShader
== nullptr) &&
818 (pState
->state
.depthStencilState
.depthTestEnable
== FALSE
) &&
819 (pState
->state
.depthStencilState
.depthWriteEnable
== FALSE
) &&
820 (pState
->state
.depthStencilState
.stencilTestEnable
== FALSE
) &&
821 (pState
->state
.depthStencilState
.stencilWriteEnable
== FALSE
) &&
822 (pState
->state
.linkageCount
== 0))
824 pState
->pfnProcessPrims
= nullptr;
825 pState
->state
.linkageMask
= 0;
// The same clearing applies when stream-out state disables the rasterizer.
828 if (pState
->state
.soState
.rasterizerDisable
== true)
830 pState
->pfnProcessPrims
= nullptr;
831 pState
->state
.linkageMask
= 0;
834 // set up the frontend attrib mask
835 pState
->state
.feAttribMask
= pState
->state
.linkageMask
;
836 if (pState
->state
.soState
.soEnable
)
// OR in the stream mask of each of the four stream-out streams.
838 for (uint32_t i
= 0; i
< 4; ++i
)
840 pState
->state
.feAttribMask
|= pState
->state
.soState
.streamMasks
[i
];
844 // complicated logic to test for cases where we don't need backing hottile memory for a draw
845 // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled.
846 pState
->state
.depthHottileEnable
= ((!(pState
->state
.depthStencilState
.depthTestEnable
&&
847 !pState
->state
.depthStencilState
.depthWriteEnable
&&
848 pState
->state
.depthStencilState
.depthTestFunc
== ZFUNC_ALWAYS
)) &&
849 (pState
->state
.depthStencilState
.depthTestEnable
||
850 pState
->state
.depthStencilState
.depthWriteEnable
)) ? true : false;
852 pState
->state
.stencilHottileEnable
= (((!(pState
->state
.depthStencilState
.stencilTestEnable
&&
853 !pState
->state
.depthStencilState
.stencilWriteEnable
&&
854 pState
->state
.depthStencilState
.stencilTestFunc
== ZFUNC_ALWAYS
)) ||
855 // for stencil we have to check the double sided state as well
856 (!(pState
->state
.depthStencilState
.doubleSidedStencilTestEnable
&&
857 !pState
->state
.depthStencilState
.stencilWriteEnable
&&
858 pState
->state
.depthStencilState
.backfaceStencilTestFunc
== ZFUNC_ALWAYS
))) &&
859 (pState
->state
.depthStencilState
.stencilTestEnable
||
860 pState
->state
.depthStencilState
.stencilWriteEnable
)) ? true : false;
// colorHottileEnable is a bitmask: bit rt is set when render target rt has at least one
// color channel write-enabled. It stays 0 when there is no pixel shader.
862 uint32_t numRTs
= pState
->state
.psState
.numRenderTargets
;
863 pState
->state
.colorHottileEnable
= 0;
864 if (psState
.pfnPixelShader
!= nullptr)
866 for (uint32_t rt
= 0; rt
< numRTs
; ++rt
)
868 pState
->state
.colorHottileEnable
|=
869 (!pState
->state
.blendState
.renderTarget
[rt
].writeDisableAlpha
||
870 !pState
->state
.blendState
.renderTarget
[rt
].writeDisableRed
||
871 !pState
->state
.blendState
.renderTarget
[rt
].writeDisableGreen
||
872 !pState
->state
.blendState
.renderTarget
[rt
].writeDisableBlue
) ? (1 << rt
) : 0;
877 //////////////////////////////////////////////////////////////////////////
// NOTE(review): the @brief line, the function signature and part of the body are not
// visible in this listing. DrawInstanced calls InitDraw(pDC, isSplitDraw), which matches
// the names used here.
879 /// @param pDC - Draw context to initialize for this draw.
884 // We don't need to re-setup the scissors/pipeline state again for split draw.
885 if (isSplitDraw
== false)
887 SetupMacroTileScissors(pDC
);
892 //////////////////////////////////////////////////////////////////////////
893 /// @brief We can split the draw for certain topologies for better performance.
894 /// @param totalVerts - Total vertices for draw
895 /// @param topology - Topology used for draw
/// @note The per-draw count starts at totalVerts (no split) and is lowered for some
///       topologies.
/// @note NOTE(review): the pDC and totalVerts parameter lines, the `switch`, `break` and
///       `return` lines, and the body of the stream-out check are not visible in this
///       listing.
896 uint32_t MaxVertsPerDraw(
899 PRIMITIVE_TOPOLOGY topology
)
901 API_STATE
& state
= pDC
->pState
->state
;
903 uint32_t vertsPerDraw
= totalVerts
;
905 if (state
.soState
.soEnable
)
// NOTE(review): a primitive-count knob is assigned to a vertex count here — confirm units.
913 case TOP_TRIANGLE_LIST
:
914 vertsPerDraw
= KNOB_MAX_PRIMS_PER_DRAW
;
917 case TOP_PATCHLIST_1
:
918 case TOP_PATCHLIST_2
:
919 case TOP_PATCHLIST_3
:
920 case TOP_PATCHLIST_4
:
921 case TOP_PATCHLIST_5
:
922 case TOP_PATCHLIST_6
:
923 case TOP_PATCHLIST_7
:
924 case TOP_PATCHLIST_8
:
925 case TOP_PATCHLIST_9
:
926 case TOP_PATCHLIST_10
:
927 case TOP_PATCHLIST_11
:
928 case TOP_PATCHLIST_12
:
929 case TOP_PATCHLIST_13
:
930 case TOP_PATCHLIST_14
:
931 case TOP_PATCHLIST_15
:
932 case TOP_PATCHLIST_16
:
933 case TOP_PATCHLIST_17
:
934 case TOP_PATCHLIST_18
:
935 case TOP_PATCHLIST_19
:
936 case TOP_PATCHLIST_20
:
937 case TOP_PATCHLIST_21
:
938 case TOP_PATCHLIST_22
:
939 case TOP_PATCHLIST_23
:
940 case TOP_PATCHLIST_24
:
941 case TOP_PATCHLIST_25
:
942 case TOP_PATCHLIST_26
:
943 case TOP_PATCHLIST_27
:
944 case TOP_PATCHLIST_28
:
945 case TOP_PATCHLIST_29
:
946 case TOP_PATCHLIST_30
:
947 case TOP_PATCHLIST_31
:
948 case TOP_PATCHLIST_32
:
// With tessellation enabled, the patch size is the topology's offset from
// TOP_PATCHLIST_BASE, and whole patches are kept together per draw.
949 if (pDC
->pState
->state
.tsState
.tsEnable
)
951 uint32_t vertsPerPrim
= topology
- TOP_PATCHLIST_BASE
;
952 vertsPerDraw
= vertsPerPrim
* KNOB_MAX_TESS_PRIMS_PER_DRAW
;
956 // The Primitive Assembly code can only handle 1 RECT at a time.
962 // We are not splitting up draws for other topologies.
969 // Recursive template used to auto-nest conditionals. Converts dynamic boolean function
970 // arguments to static template arguments.
// NOTE(review): the struct declaration line and the `if (bArg)` / `else` lines are not
// visible in this listing. Each GetFunc overload has a `true` and a `false` return,
// presumably selected by bArg — confirm.
971 template <bool... ArgsB
>
974 // Last Arg Terminator
// Base case: the last runtime bool completes ProcessDraw's template argument list.
975 static PFN_FE_WORK_FUNC
GetFunc(bool bArg
)
979 return ProcessDraw
<ArgsB
..., true>;
982 return ProcessDraw
<ArgsB
..., false>;
985 // Recursively parse args
// Recursive case: append one bool to the pack and forward the remaining arguments.
986 template <typename
... TArgsT
>
987 static PFN_FE_WORK_FUNC
GetFunc(bool bArg
, TArgsT
... remainingArgs
)
991 return FEDrawChooser
<ArgsB
..., true>::GetFunc(remainingArgs
...);
994 return FEDrawChooser
<ArgsB
..., false>::GetFunc(remainingArgs
...);
998 // Selector for correct templated Draw front-end function
// The five runtime flags are forwarded to FEDrawChooser<>::GetFunc in the order given.
1000 static PFN_FE_WORK_FUNC
GetFEDrawFunc(bool IsIndexed
, bool HasTessellation
, bool HasGeometryShader
, bool HasStreamOut
, bool RasterizerEnabled
)
1002 return FEDrawChooser
<>::GetFunc(IsIndexed
, HasTessellation
, HasGeometryShader
, HasStreamOut
, RasterizerEnabled
);
1006 //////////////////////////////////////////////////////////////////////////
1007 /// @brief DrawInstanced
1008 /// @param hContext - Handle passed back from SwrCreateContext
1009 /// @param topology - Specifies topology for draw.
1010 /// @param numVertices - How many vertices to read sequentially from vertex data (per instance).
1011 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1012 /// @param numInstances - How many instances to render.
1013 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
/// @note The draw is submitted as one or more sub-draws of at most MaxVertsPerDraw()
///       vertices each; sub-draws after the first are flagged as split draws.
/// @note NOTE(review): the function-name line, the hContext parameter, the declaration and
///       increment of `draw`, and the first argument of GetFEDrawFunc are not visible in
///       this listing.
1016 PRIMITIVE_TOPOLOGY topology
,
1017 uint32_t numVertices
,
1018 uint32_t startVertex
,
1019 uint32_t numInstances
= 1,
1020 uint32_t startInstance
= 0)
1027 RDTSC_START(APIDraw
);
1029 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1030 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// NOTE(review): vertex counts are held in int32_t here while numVertices is uint32_t.
1032 int32_t maxVertsPerDraw
= MaxVertsPerDraw(pDC
, numVertices
, topology
);
1033 uint32_t primsPerDraw
= GetNumPrims(topology
, maxVertsPerDraw
);
1034 int32_t remainingVerts
= numVertices
;
1036 API_STATE
*pState
= &pDC
->pState
->state
;
1037 pState
->topology
= topology
;
1038 pState
->forceFront
= false;
1040 // disable culling for points/lines
// The cull mode is saved here and written back after the loop.
1041 uint32_t oldCullMode
= pState
->rastState
.cullMode
;
1042 if (topology
== TOP_POINT_LIST
)
1044 pState
->rastState
.cullMode
= SWR_CULLMODE_NONE
;
1045 pState
->forceFront
= true;
1049 while (remainingVerts
)
1051 uint32_t numVertsForDraw
= (remainingVerts
< maxVertsPerDraw
) ?
1052 remainingVerts
: maxVertsPerDraw
;
// This pDC shadows the outer one. pState still points at the state fetched before the loop;
// split draws reuse the previous draw's state pointer (see GetDrawContext).
1054 bool isSplitDraw
= (draw
> 0) ? true : false;
1055 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
, isSplitDraw
);
1056 InitDraw(pDC
, isSplitDraw
);
1058 pDC
->FeWork
.type
= DRAW
;
1059 pDC
->FeWork
.pfnWork
= GetFEDrawFunc(
1061 pState
->tsState
.tsEnable
,
1062 pState
->gsState
.gsEnable
,
1063 pState
->soState
.soEnable
,
1064 pDC
->pState
->pfnProcessPrims
!= nullptr);
1065 pDC
->FeWork
.desc
.draw
.numVerts
= numVertsForDraw
;
1066 pDC
->FeWork
.desc
.draw
.startVertex
= startVertex
;
1067 pDC
->FeWork
.desc
.draw
.numInstances
= numInstances
;
1068 pDC
->FeWork
.desc
.draw
.startInstance
= startInstance
;
// Primitive and vertex ids continue across sub-draws.
1069 pDC
->FeWork
.desc
.draw
.startPrimID
= draw
* primsPerDraw
;
1070 pDC
->FeWork
.desc
.draw
.startVertexID
= draw
* maxVertsPerDraw
;
// Only the last sub-draw is flagged to clean up the shared state.
1072 pDC
->cleanupState
= (remainingVerts
== numVertsForDraw
);
1075 QueueDraw(pContext
);
1077 remainingVerts
-= numVertsForDraw
;
1081 // restore culling state
// QueueWork cleared the current draw context, so this call acquires a new one (with state
// copied from the previous draw) before the cull mode is written back.
1082 pDC
= GetDrawContext(pContext
);
1083 pDC
->pState
->state
.rastState
.cullMode
= oldCullMode
;
1085 RDTSC_STOP(APIDraw
, numVertices
* numInstances
, 0);
1088 //////////////////////////////////////////////////////////////////////////
/// Non-instanced draw. Forwards to DrawInstanced() without instance
/// arguments, so DrawInstanced's defaults (numInstances = 1,
/// startInstance = 0) apply.
1090 /// @param hContext - Handle passed back from SwrCreateContext
1091 /// @param topology - Specifies topology for draw.
1092 /// @param startVertex - Specifies start vertex in vertex buffer for draw.
1093 /// @param numVertices - Number of vertices.
1096 PRIMITIVE_TOPOLOGY topology
,
1097 uint32_t startVertex
,
1098 uint32_t numVertices
)
// Note the argument order: DrawInstanced takes the vertex count before the
// start vertex, the reverse of this function's parameter order.
1100 DrawInstanced(hContext
, topology
, numVertices
, startVertex
);
1103 //////////////////////////////////////////////////////////////////////////
1104 /// @brief SwrDrawInstanced
/// Thin wrapper that forwards to DrawInstanced().
1105 /// @param hContext - Handle passed back from SwrCreateContext
1106 /// @param topology - Specifies topology for draw.
1107 /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
1108 /// @param numInstances - How many instances to render.
1109 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1110 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1111 void SwrDrawInstanced(
1113 PRIMITIVE_TOPOLOGY topology
,
1114 uint32_t numVertsPerInstance
,
1115 uint32_t numInstances
,
1116 uint32_t startVertex
,
1117 uint32_t startInstance
// Arguments are reordered for DrawInstanced: startVertex is passed before
// numInstances, unlike in this function's parameter list.
1120 DrawInstanced(hContext
, topology
, numVertsPerInstance
, startVertex
, numInstances
, startInstance
);
1123 //////////////////////////////////////////////////////////////////////////
1124 /// @brief DrawIndexedInstance
/// Breaks the request into chunks of at most MaxVertsPerDraw() indices and
/// queues one front-end DRAW work item per chunk via QueueDraw().
1125 /// @param hContext - Handle passed back from SwrCreateContext
1126 /// @param topology - Specifies topology for draw.
1127 /// @param numIndices - Number of indices to read sequentially from index buffer.
1128 /// @param indexOffset - Starting index into index buffer.
1129 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1130 /// @param numInstances - Number of instances to render. Defaults to 1.
1131 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data). Defaults to 0.
1132 void DrawIndexedInstance(
1134 PRIMITIVE_TOPOLOGY topology
,
1135 uint32_t numIndices
,
1136 uint32_t indexOffset
,
1138 uint32_t numInstances
= 1,
1139 uint32_t startInstance
= 0)
1146 RDTSC_START(APIDrawIndexed
);
1148 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1149 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1150 API_STATE
* pState
= &pDC
->pState
->state
;
// Per-chunk index limit, and the primitive count that limit corresponds to.
// NOTE(review): the loop below only makes progress if maxIndicesPerDraw > 0;
// MaxVertsPerDraw() is defined elsewhere - confirm it never returns 0.
1152 int32_t maxIndicesPerDraw
= MaxVertsPerDraw(pDC
, numIndices
, topology
);
1153 uint32_t primsPerDraw
= GetNumPrims(topology
, maxIndicesPerDraw
);
1154 int32_t remainingIndices
= numIndices
;
// Size in bytes of one index, chosen from the bound index buffer's format.
// NOTE(review): indexSize stays 0 if none of the cases shown here match; any
// default handling is not shown - confirm.
1156 uint32_t indexSize
= 0;
1157 switch (pState
->indexBuffer
.format
)
1159 case R32_UINT
: indexSize
= sizeof(uint32_t); break;
1160 case R16_UINT
: indexSize
= sizeof(uint16_t); break;
1161 case R8_UINT
: indexSize
= sizeof(uint8_t); break;
// Byte pointer to the first index to read: buffer start plus
// indexOffset * indexSize, with both operands widened to 64 bits first.
1167 uint8_t *pIB
= (uint8_t*)pState
->indexBuffer
.pIndices
;
1168 pIB
+= (uint64_t)indexOffset
* (uint64_t)indexSize
;
// Record the topology in the API state; forceFront is reset here and only
// set again for point lists below.
1170 pState
->topology
= topology
;
1171 pState
->forceFront
= false;
1173 // disable culling for point lists (only TOP_POINT_LIST is tested below); the previous cull mode is saved for restoring
1174 uint32_t oldCullMode
= pState
->rastState
.cullMode
;
1175 if (topology
== TOP_POINT_LIST
)
1177 pState
->rastState
.cullMode
= SWR_CULLMODE_NONE
;
1178 pState
->forceFront
= true;
// One iteration per chunk; nothing is queued when numIndices is 0.
1181 while (remainingIndices
)
// This chunk covers min(remainingIndices, maxIndicesPerDraw) indices.
1183 uint32_t numIndicesForDraw
= (remainingIndices
< maxIndicesPerDraw
) ?
1184 remainingIndices
: maxIndicesPerDraw
;
1186 // When breaking up draw, we need to obtain new draw context for each iteration.
// NOTE(review): 'draw' presumably counts the chunks already queued by this
// call; its declaration and increment are not shown here - confirm.
1187 bool isSplitDraw
= (draw
> 0) ? true : false;
1188 pDC
= GetDrawContext(pContext
, isSplitDraw
);
1189 InitDraw(pDC
, isSplitDraw
);
// Fill in the front-end work item for this chunk. The draw function is
// selected from the tessellation / geometry-shader / stream-out enables and
// from whether pfnProcessPrims is set.
1191 pDC
->FeWork
.type
= DRAW
;
1192 pDC
->FeWork
.pfnWork
= GetFEDrawFunc(
1194 pState
->tsState
.tsEnable
,
1195 pState
->gsState
.gsEnable
,
1196 pState
->soState
.soEnable
,
1197 pDC
->pState
->pfnProcessPrims
!= nullptr);
1198 pDC
->FeWork
.desc
.draw
.pDC
= pDC
;
1199 pDC
->FeWork
.desc
.draw
.numIndices
= numIndicesForDraw
;
// The byte pointer is handed over as int* regardless of the index format;
// the format itself travels in the 'type' field set just below.
1200 pDC
->FeWork
.desc
.draw
.pIB
= (int*)pIB
;
1201 pDC
->FeWork
.desc
.draw
.type
= pDC
->pState
->state
.indexBuffer
.format
;
1203 pDC
->FeWork
.desc
.draw
.numInstances
= numInstances
;
1204 pDC
->FeWork
.desc
.draw
.startInstance
= startInstance
;
1205 pDC
->FeWork
.desc
.draw
.baseVertex
= baseVertex
;
// Offset primitive IDs by what the preceding chunks covered.
1206 pDC
->FeWork
.desc
.draw
.startPrimID
= draw
* primsPerDraw
;
// cleanupState is true only for the chunk that consumes all remaining indices.
1208 pDC
->cleanupState
= (remainingIndices
== numIndicesForDraw
);
1211 QueueDraw(pContext
);
// Step to the next chunk's indices.
// NOTE(review): unlike the initial offset above, this product is not widened
// to 64 bits before the multiply.
1213 pIB
+= maxIndicesPerDraw
* indexSize
;
1214 remainingIndices
-= numIndicesForDraw
;
1218 // restore culling state
// The draw context is fetched again before the saved cull mode is written back.
1219 pDC
= GetDrawContext(pContext
);
1220 pDC
->pState
->state
.rastState
.cullMode
= oldCullMode
;
1222 RDTSC_STOP(APIDrawIndexed
, numIndices
* numInstances
, 0);
1226 //////////////////////////////////////////////////////////////////////////
1227 /// @brief SwrDrawIndexed
/// Non-instanced indexed draw. Forwards to DrawIndexedInstance() without
/// instance arguments, so its defaults (numInstances = 1, startInstance = 0)
/// apply.
1228 /// @param hContext - Handle passed back from SwrCreateContext
1229 /// @param topology - Specifies topology for draw.
1230 /// @param numIndices - Number of indices to read sequentially from index buffer.
1231 /// @param indexOffset - Starting index into index buffer.
1232 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1233 void SwrDrawIndexed(
1235 PRIMITIVE_TOPOLOGY topology
,
1236 uint32_t numIndices
,
1237 uint32_t indexOffset
,
// Arguments are passed through in the same order they are documented above.
1241 DrawIndexedInstance(hContext
, topology
, numIndices
, indexOffset
, baseVertex
);
1244 //////////////////////////////////////////////////////////////////////////
1245 /// @brief SwrDrawIndexedInstanced
/// Thin wrapper that forwards to DrawIndexedInstance().
1246 /// @param hContext - Handle passed back from SwrCreateContext
1247 /// @param topology - Specifies topology for draw.
1248 /// @param numIndices - Number of indices to read sequentially from index buffer.
1249 /// @param numInstances - Number of instances to render.
1250 /// @param indexOffset - Starting index into index buffer.
1251 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1252 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1253 void SwrDrawIndexedInstanced(
1255 PRIMITIVE_TOPOLOGY topology
,
1256 uint32_t numIndices
,
1257 uint32_t numInstances
,
1258 uint32_t indexOffset
,
1260 uint32_t startInstance
)
// Arguments are reordered for DrawIndexedInstance: numInstances moves after
// indexOffset and baseVertex.
1262 DrawIndexedInstance(hContext
, topology
, numIndices
, indexOffset
, baseVertex
, numInstances
, startInstance
);
1265 //////////////////////////////////////////////////////////////////////////
1266 /// @brief SwrInvalidateTiles
/// Queues a DISCARDINVALIDATETILES front-end work item that sets the
/// requested tile state to SWR_TILE_INVALID.
1267 /// @param hContext - Handle passed back from SwrCreateContext
1268 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate.
1269 void SwrInvalidateTiles(
1271 uint32_t attachmentMask
)
// NOTE(review): the handle is cast directly here, whereas the draw entry
// points in this file go through GetContext(hContext) - confirm the two are
// equivalent.
1273 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
1274 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1276 pDC
->FeWork
.type
= DISCARDINVALIDATETILES
;
1277 pDC
->FeWork
.pfnWork
= ProcessDiscardInvalidateTiles
;
1278 pDC
->FeWork
.desc
.discardInvalidateTiles
.attachmentMask
= attachmentMask
;
// The rect is zero-filled; presumably an all-zero rect selects the whole
// surface, as documented for SwrDiscardRect - confirm.
1279 memset(&pDC
->FeWork
.desc
.discardInvalidateTiles
.rect
, 0, sizeof(SWR_RECT
));
1280 pDC
->FeWork
.desc
.discardInvalidateTiles
.newTileState
= SWR_TILE_INVALID
;
// Unlike SwrDiscardRect, both createNewTiles and fullTilesOnly are false here.
1281 pDC
->FeWork
.desc
.discardInvalidateTiles
.createNewTiles
= false;
1282 pDC
->FeWork
.desc
.discardInvalidateTiles
.fullTilesOnly
= false;
1285 QueueDraw(pContext
);
1288 //////////////////////////////////////////////////////////////////////////
1289 /// @brief SwrDiscardRect
/// Queues a DISCARDINVALIDATETILES front-end work item that sets the
/// requested tile state to SWR_TILE_RESOLVED for the given rect.
1290 /// @param hContext - Handle passed back from SwrCreateContext
1291 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
1292 /// @param rect - if rect is all zeros, the entire attachment surface will be discarded
1293 void SwrDiscardRect(
1295 uint32_t attachmentMask
,
// NOTE(review): the handle is cast directly here, whereas the draw entry
// points in this file go through GetContext(hContext) - confirm the two are
// equivalent.
1298 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
1299 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1301 // Queue a discard/invalidate work item for the hottiles
1302 pDC
->FeWork
.type
= DISCARDINVALIDATETILES
;
1303 pDC
->FeWork
.pfnWork
= ProcessDiscardInvalidateTiles
;
1304 pDC
->FeWork
.desc
.discardInvalidateTiles
.attachmentMask
= attachmentMask
;
1305 pDC
->FeWork
.desc
.discardInvalidateTiles
.rect
= rect
;
1306 pDC
->FeWork
.desc
.discardInvalidateTiles
.newTileState
= SWR_TILE_RESOLVED
;
// Unlike SwrInvalidateTiles, both createNewTiles and fullTilesOnly are true here.
1307 pDC
->FeWork
.desc
.discardInvalidateTiles
.createNewTiles
= true;
1308 pDC
->FeWork
.desc
.discardInvalidateTiles
.fullTilesOnly
= true;
1311 QueueDraw(pContext
);
1314 //////////////////////////////////////////////////////////////////////////
1315 /// @brief SwrDispatch
/// Marks the current draw context as compute, stores the three group counts
/// in an arena-allocated COMPUTE_DESC, initializes the dispatch object with
/// the total group count and queues the dispatch.
1316 /// @param hContext - Handle passed back from SwrCreateContext
1317 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
1318 /// @param threadGroupCountY - Number of thread groups dispatched in Y direction
1319 /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
1322 uint32_t threadGroupCountX
,
1323 uint32_t threadGroupCountY
,
1324 uint32_t threadGroupCountZ
)
1331 RDTSC_START(APIDispatch
);
1332 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
1333 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1335 pDC
->isCompute
= true; // This is a compute context.
1337 // Ensure spill fill pointers are initialized to nullptr.
// NOTE(review): sizeof(pDC->pSpillFill) covers every entry only if pSpillFill
// is an array member rather than a pointer - confirm against DRAW_CONTEXT.
1338 memset(pDC
->pSpillFill
, 0, sizeof(pDC
->pSpillFill
));
// The descriptor is allocated from the draw context's arena with an alignment
// argument of 64. The result is used without a null check in the code shown.
1340 COMPUTE_DESC
* pTaskData
= (COMPUTE_DESC
*)pDC
->pArena
->AllocAligned(sizeof(COMPUTE_DESC
), 64);
1342 pTaskData
->threadGroupCountX
= threadGroupCountX
;
1343 pTaskData
->threadGroupCountY
= threadGroupCountY
;
1344 pTaskData
->threadGroupCountZ
= threadGroupCountZ
;
// NOTE(review): the product is computed in 32-bit unsigned arithmetic and
// wraps if X * Y * Z exceeds UINT32_MAX.
1346 uint32_t totalThreadGroups
= threadGroupCountX
* threadGroupCountY
* threadGroupCountZ
;
1347 pDC
->pDispatch
->initialize(totalThreadGroups
, pTaskData
);
1349 QueueDispatch(pContext
);
1350 RDTSC_STOP(APIDispatch
, threadGroupCountX
* threadGroupCountY
* threadGroupCountZ
, 0);
1353 // Deswizzles, converts and stores current contents of the hot tiles to surface
1354 // described by pState
// NOTE(review): no pState parameter appears in the signature shown here; the
// body only forwards 'attachment' and 'postStoreTileState' - confirm which
// state the sentence above refers to.
// Implemented by queueing a STORETILES front-end work item (ProcessStoreTiles).
1357 SWR_RENDERTARGET_ATTACHMENT attachment
,
1358 SWR_TILE_STATE postStoreTileState
)
1360 RDTSC_START(APIStoreTiles
);
1362 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
1363 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// Macrotile scissors are set up on the draw context before the work item is filled in.
1365 SetupMacroTileScissors(pDC
);
1367 pDC
->FeWork
.type
= STORETILES
;
1368 pDC
->FeWork
.pfnWork
= ProcessStoreTiles
;
1369 pDC
->FeWork
.desc
.storeTiles
.attachment
= attachment
;
1370 pDC
->FeWork
.desc
.storeTiles
.postStoreTileState
= postStoreTileState
;
1373 QueueDraw(pContext
);
1375 RDTSC_STOP(APIStoreTiles
, 0, 0);
//////////////////////////////////////////////////////////////////////////
/// @brief SwrClearRenderTarget
/// Queues a CLEAR front-end work item (ProcessClear) carrying the clear
/// mask, the four color components, the depth value and the stencil value.
/// @param clearColor - Four floats copied into clearRTColor[0..3].
1378 void SwrClearRenderTarget(
1381 const float clearColor
[4],
1385 RDTSC_START(APIClearRenderTarget
);
1387 SWR_CONTEXT
*pContext
= (SWR_CONTEXT
*)hContext
;
1389 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// Macrotile scissors are set up on the draw context before the work item is filled in.
1391 SetupMacroTileScissors(pDC
);
// 'flags' wraps the caller's clear mask; it is copied into the work item below.
1394 flags
.mask
= clearMask
;
1396 pDC
->FeWork
.type
= CLEAR
;
1397 pDC
->FeWork
.pfnWork
= ProcessClear
;
1398 pDC
->FeWork
.desc
.clear
.flags
= flags
;
1399 pDC
->FeWork
.desc
.clear
.clearDepth
= z
;
1400 pDC
->FeWork
.desc
.clear
.clearRTColor
[0] = clearColor
[0];
1401 pDC
->FeWork
.desc
.clear
.clearRTColor
[1] = clearColor
[1];
1402 pDC
->FeWork
.desc
.clear
.clearRTColor
[2] = clearColor
[2];
1403 pDC
->FeWork
.desc
.clear
.clearRTColor
[3] = clearColor
[3];
1404 pDC
->FeWork
.desc
.clear
.clearStencil
= stencil
;
1407 QueueDraw(pContext
);
// NOTE(review): pDC->drawId is read after QueueDraw(); confirm the draw
// context is still safe to read at this point.
1409 RDTSC_STOP(APIClearRenderTarget
, 0, pDC
->drawId
);
1412 //////////////////////////////////////////////////////////////////////////
1413 /// @brief Returns a pointer to the private context state for the current
1414 /// draw operation. This is used for external components such as the
1416 /// SWR is responsible for the allocation of the private context state.
/// The state is allocated lazily: the first call for a given draw state
/// allocates privateStateSize bytes from that state's arena, and later
/// calls return the same pointer.
1417 /// @param hContext - Handle passed back from SwrCreateContext
1418 VOID
* SwrGetPrivateContextState(
1421 SWR_CONTEXT
* pContext
= GetContext(hContext
);
1422 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1423 DRAW_STATE
* pState
= pDC
->pState
;
// Allocate only if nothing has been allocated for this draw state yet.
1425 if (pState
->pPrivateState
== nullptr)
// The alignment argument is KNOB_SIMD_WIDTH floats, in bytes.
1427 pState
->pPrivateState
= pState
->pArena
->AllocAligned(pContext
->privateStateSize
, KNOB_SIMD_WIDTH
*sizeof(float));
1430 return pState
->pPrivateState
;
1433 //////////////////////////////////////////////////////////////////////////
1434 /// @brief Clients can use this to allocate memory for draw/dispatch
1435 /// operations. The memory will automatically be freed once operation
1436 /// has completed. Client can use this to allocate binding tables,
1437 /// etc. needed for shader execution.
1438 /// @param hContext - Handle passed back from SwrCreateContext
1439 /// @param size - Size of allocation
1440 /// @param align - Alignment needed for allocation.
/// @return Whatever the arena's AllocAligned(size, align) returns.
1441 VOID
* SwrAllocDrawContextMemory(
1446 SWR_CONTEXT
* pContext
= GetContext(hContext
);
1447 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// The allocation comes from the arena owned by the current draw state.
1449 return pDC
->pState
->pArena
->AllocAligned(size
, align
);
1452 //////////////////////////////////////////////////////////////////////////
1453 /// @brief Returns pointer to SWR stats.
/// Implemented by queueing a QUERYSTATS front-end work item
/// (ProcessQueryStats) that carries pStats.
1454 /// @note The counters are atomically incremented by multiple threads.
1455 /// When calling this, you need to ensure all previous operations
1457 /// @todo If necessary, add a callback to avoid stalling the pipe to
1458 /// sample the counters.
1459 /// @param hContext - Handle passed back from SwrCreateContext
1460 /// @param pStats - SWR will fill this out for caller.
1465 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1466 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
1468 pDC
->FeWork
.type
= QUERYSTATS
;
1469 pDC
->FeWork
.pfnWork
= ProcessQueryStats
;
1470 pDC
->FeWork
.desc
.queryStats
.pStats
= pStats
;
1472 // cannot execute until all previous draws have completed
// The dependency is set to the ID immediately preceding this draw context's own drawId.
1473 pDC
->dependency
= pDC
->drawId
- 1;
1476 QueueDraw(pContext
);
1479 //////////////////////////////////////////////////////////////////////////
1480 /// @brief Enables stats counting
1481 /// @param hContext - Handle passed back from SwrCreateContext
1482 /// @param enable - If true then counts are incremented.
1483 void SwrEnableStats(
1487 SWR_CONTEXT
*pContext
= GetContext(hContext
);
1488 DRAW_CONTEXT
* pDC
= GetDrawContext(pContext
);
// The flag is written into the current draw state's API state; no work item is queued here.
1490 pDC
->pState
->state
.enableStats
= enable
;
1493 //////////////////////////////////////////////////////////////////////////
1494 /// @brief Mark end of frame - used for performance profiling
1495 /// @param hContext - Handle passed back from SwrCreateContext
1496 void SWR_API
SwrEndFrame(