swr: [rasterizer core] Use CS spill/fill size in core
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / api.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file api.cpp
24 *
25 * @brief API implementation
26 *
27 ******************************************************************************/
28
29 #include <cfloat>
30 #include <cmath>
31 #include <cstdio>
32 #include <new>
33
34 #include "core/api.h"
35 #include "core/backend.h"
36 #include "core/context.h"
37 #include "core/depthstencil.h"
38 #include "core/frontend.h"
39 #include "core/rasterizer.h"
40 #include "core/rdtsc_core.h"
41 #include "core/threads.h"
42 #include "core/tilemgr.h"
43 #include "core/clip.h"
44
45 #include "common/simdintrin.h"
46 #include "common/os.h"
47
48 void SetupDefaultState(SWR_CONTEXT *pContext);
49
50 //////////////////////////////////////////////////////////////////////////
51 /// @brief Create SWR Context.
52 /// @param pCreateInfo - pointer to creation info.
53 HANDLE SwrCreateContext(
54 SWR_CREATECONTEXT_INFO* pCreateInfo)
55 {
56 RDTSC_RESET();
57 RDTSC_INIT(0);
58
59 void* pContextMem = _aligned_malloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4);
60 memset(pContextMem, 0, sizeof(SWR_CONTEXT));
61 SWR_CONTEXT *pContext = new (pContextMem) SWR_CONTEXT();
62
63 pContext->driverType = pCreateInfo->driver;
64 pContext->privateStateSize = pCreateInfo->privateStateSize;
65
66 pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
67 pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
68
69 pContext->pMacroTileManagerArray = (MacroTileMgr*)_aligned_malloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
70 pContext->pDispatchQueueArray = (DispatchQueue*)_aligned_malloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
71
72 for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
73 {
74 pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
75 new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena);
76 new (&pContext->pDispatchQueueArray[dc]) DispatchQueue();
77
78 pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
79 }
80
81 if (!KNOB_SINGLE_THREADED)
82 {
83 memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
84 memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
85 new (&pContext->WaitLock) std::mutex();
86 new (&pContext->FifosNotEmpty) std::condition_variable();
87
88 CreateThreadPool(pContext, &pContext->threadPool);
89 }
90
91 // Calling createThreadPool() above can set SINGLE_THREADED
92 if (KNOB_SINGLE_THREADED)
93 {
94 SET_KNOB(HYPERTHREADED_FE, false);
95 pContext->NumWorkerThreads = 1;
96 pContext->NumFEThreads = 1;
97 pContext->NumBEThreads = 1;
98 }
99
100 // Allocate scratch space for workers.
101 ///@note We could lazily allocate this but its rather small amount of memory.
102 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
103 {
104 #if defined(_WIN32)
105 uint32_t numaNode = pContext->threadPool.pThreadData ?
106 pContext->threadPool.pThreadData[i].numaId : 0;
107 pContext->pScratch[i] = (uint8_t*)VirtualAllocExNuma(
108 GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE),
109 MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE,
110 numaNode);
111 #else
112 pContext->pScratch[i] = (uint8_t*)_aligned_malloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4);
113 #endif
114 }
115
116 // State setup AFTER context is fully initialized
117 SetupDefaultState(pContext);
118
119 // initialize hot tile manager
120 pContext->pHotTileMgr = new HotTileMgr();
121
122 // initialize function pointer tables
123 InitClearTilesTable();
124
125 // initialize store tiles function
126 pContext->pfnLoadTile = pCreateInfo->pfnLoadTile;
127 pContext->pfnStoreTile = pCreateInfo->pfnStoreTile;
128 pContext->pfnClearTile = pCreateInfo->pfnClearTile;
129
130 // pass pointer to bucket manager back to caller
131 #ifdef KNOB_ENABLE_RDTSC
132 pCreateInfo->pBucketMgr = &gBucketMgr;
133 #endif
134
135 pCreateInfo->contextSaveSize = sizeof(API_STATE);
136
137 return (HANDLE)pContext;
138 }
139
140 void SwrDestroyContext(HANDLE hContext)
141 {
142 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
143 DestroyThreadPool(pContext, &pContext->threadPool);
144
145 // free the fifos
146 for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i)
147 {
148 delete pContext->dcRing[i].pArena;
149 delete pContext->dsRing[i].pArena;
150 pContext->pMacroTileManagerArray[i].~MacroTileMgr();
151 pContext->pDispatchQueueArray[i].~DispatchQueue();
152 }
153
154 _aligned_free(pContext->pDispatchQueueArray);
155 _aligned_free(pContext->pMacroTileManagerArray);
156
157 // Free scratch space.
158 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
159 {
160 #if defined(_WIN32)
161 VirtualFree(pContext->pScratch[i], 0, MEM_RELEASE);
162 #else
163 _aligned_free(pContext->pScratch[i]);
164 #endif
165 }
166
167 delete(pContext->pHotTileMgr);
168
169 pContext->~SWR_CONTEXT();
170 _aligned_free((SWR_CONTEXT*)hContext);
171 }
172
173 void CopyState(DRAW_STATE& dst, const DRAW_STATE& src)
174 {
175 memcpy(&dst.state, &src.state, sizeof(API_STATE));
176 }
177
178 void WakeAllThreads(SWR_CONTEXT *pContext)
179 {
180 pContext->FifosNotEmpty.notify_all();
181 }
182
183 template<bool IsDraw>
184 void QueueWork(SWR_CONTEXT *pContext)
185 {
186 DRAW_CONTEXT* pDC = pContext->pCurDrawContext;
187 uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
188
189 if (IsDraw)
190 {
191 pDC->pTileMgr = &pContext->pMacroTileManagerArray[dcIndex];
192 pDC->pTileMgr->initialize();
193 }
194
195 // Each worker thread looks at a DC for both FE and BE work at different times and so we
196 // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers
197 // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
198 // then moved on if all work is done.)
199 pContext->pCurDrawContext->threadsDone = pContext->NumFEThreads + pContext->NumBEThreads;
200
201 _ReadWriteBarrier();
202 {
203 std::unique_lock<std::mutex> lock(pContext->WaitLock);
204 pContext->dcRing.Enqueue();
205 }
206
207 if (KNOB_SINGLE_THREADED)
208 {
209 // flush denormals to 0
210 uint32_t mxcsr = _mm_getcsr();
211 _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
212
213 if (IsDraw)
214 {
215 static TileSet lockedTiles;
216 uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
217 WorkOnFifoFE(pContext, 0, curDraw[0]);
218 WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles, 0, 0);
219 }
220 else
221 {
222 uint64_t curDispatch = pContext->pCurDrawContext->drawId;
223 WorkOnCompute(pContext, 0, curDispatch);
224 }
225
226 // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
227 while (CompleteDrawContext(pContext, pContext->pCurDrawContext) > 0) {}
228
229 // restore csr
230 _mm_setcsr(mxcsr);
231 }
232 else
233 {
234 RDTSC_START(APIDrawWakeAllThreads);
235 WakeAllThreads(pContext);
236 RDTSC_STOP(APIDrawWakeAllThreads, 1, 0);
237 }
238
239 // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
240 pContext->pPrevDrawContext = pContext->pCurDrawContext;
241 pContext->pCurDrawContext = nullptr;
242 }
243
244 INLINE void QueueDraw(SWR_CONTEXT* pContext)
245 {
246 QueueWork<true>(pContext);
247 }
248
249 INLINE void QueueDispatch(SWR_CONTEXT* pContext)
250 {
251 QueueWork<false>(pContext);
252 }
253
254 DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
255 {
256 RDTSC_START(APIGetDrawContext);
257 // If current draw context is null then need to obtain a new draw context to use from ring.
258 if (pContext->pCurDrawContext == nullptr)
259 {
260 // Need to wait for a free entry.
261 while (pContext->dcRing.IsFull())
262 {
263 _mm_pause();
264 }
265
266 uint64_t curDraw = pContext->dcRing.GetHead();
267 uint32_t dcIndex = curDraw % KNOB_MAX_DRAWS_IN_FLIGHT;
268
269 static uint64_t lastDrawChecked;
270 static uint32_t lastFrameChecked;
271 if ((pContext->frameCount - lastFrameChecked) > 2 ||
272 (curDraw - lastDrawChecked) > 0x10000)
273 {
274 // Take this opportunity to clean-up old arena allocations
275 pContext->cachingArenaAllocator.FreeOldBlocks();
276
277 lastFrameChecked = pContext->frameCount;
278 lastDrawChecked = curDraw;
279 }
280
281 DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
282 pContext->pCurDrawContext = pCurDrawContext;
283
284 // Assign next available entry in DS ring to this DC.
285 uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
286 pCurDrawContext->pState = &pContext->dsRing[dsIndex];
287
288 // Copy previous state to current state.
289 if (pContext->pPrevDrawContext)
290 {
291 DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext;
292
293 // If we're splitting our draw then we can just use the same state from the previous
294 // draw. In this case, we won't increment the DS ring index so the next non-split
295 // draw can receive the state.
296 if (isSplitDraw == false)
297 {
298 CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState);
299
300 // Should have been cleaned up previously
301 SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
302
303 pCurDrawContext->pState->pPrivateState = nullptr;
304
305 pContext->curStateId++; // Progress state ring index forward.
306 }
307 else
308 {
309 // If its a split draw then just copy the state pointer over
310 // since its the same draw.
311 pCurDrawContext->pState = pPrevDrawContext->pState;
312 SWR_ASSERT(pPrevDrawContext->cleanupState == false);
313 }
314 }
315 else
316 {
317 SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
318 pContext->curStateId++; // Progress state ring index forward.
319 }
320
321 SWR_ASSERT(pCurDrawContext->pArena->IsEmpty() == true);
322
323 pCurDrawContext->dependency = 0;
324 pCurDrawContext->pContext = pContext;
325 pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
326
327 pCurDrawContext->doneFE = false;
328 pCurDrawContext->FeLock = 0;
329 pCurDrawContext->threadsDone = 0;
330
331 // Assign unique drawId for this DC
332 pCurDrawContext->drawId = pContext->dcRing.GetHead();
333
334 pCurDrawContext->cleanupState = true;
335 }
336 else
337 {
338 SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC");
339 }
340
341 RDTSC_STOP(APIGetDrawContext, 0, 0);
342 return pContext->pCurDrawContext;
343 }
344
345 API_STATE* GetDrawState(SWR_CONTEXT *pContext)
346 {
347 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
348 SWR_ASSERT(pDC->pState != nullptr);
349
350 return &pDC->pState->state;
351 }
352
353 void SWR_API SwrSaveState(
354 HANDLE hContext,
355 void* pOutputStateBlock,
356 size_t memSize)
357 {
358 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
359 auto pSrc = GetDrawState(pContext);
360 SWR_ASSERT(pOutputStateBlock && memSize >= sizeof(*pSrc));
361
362 memcpy(pOutputStateBlock, pSrc, sizeof(*pSrc));
363 }
364
365 void SWR_API SwrRestoreState(
366 HANDLE hContext,
367 const void* pStateBlock,
368 size_t memSize)
369 {
370 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
371 auto pDst = GetDrawState(pContext);
372 SWR_ASSERT(pStateBlock && memSize >= sizeof(*pDst));
373
374 memcpy(pDst, pStateBlock, sizeof(*pDst));
375 }
376
377 void SetupDefaultState(SWR_CONTEXT *pContext)
378 {
379 API_STATE* pState = GetDrawState(pContext);
380
381 pState->rastState.cullMode = SWR_CULLMODE_NONE;
382 pState->rastState.frontWinding = SWR_FRONTWINDING_CCW;
383 }
384
385 static INLINE SWR_CONTEXT* GetContext(HANDLE hContext)
386 {
387 return (SWR_CONTEXT*)hContext;
388 }
389
390 void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint64_t userData2, uint64_t userData3)
391 {
392 RDTSC_START(APISync);
393
394 SWR_ASSERT(pfnFunc != nullptr);
395
396 SWR_CONTEXT *pContext = GetContext(hContext);
397 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
398
399 pDC->FeWork.type = SYNC;
400 pDC->FeWork.pfnWork = ProcessSync;
401 pDC->FeWork.desc.sync.pfnCallbackFunc = pfnFunc;
402 pDC->FeWork.desc.sync.userData = userData;
403 pDC->FeWork.desc.sync.userData2 = userData2;
404 pDC->FeWork.desc.sync.userData3 = userData3;
405
406 // cannot execute until all previous draws have completed
407 pDC->dependency = pDC->drawId - 1;
408
409 //enqueue
410 QueueDraw(pContext);
411
412 RDTSC_STOP(APISync, 1, 0);
413 }
414
415 void SwrWaitForIdle(HANDLE hContext)
416 {
417 SWR_CONTEXT *pContext = GetContext(hContext);
418
419 RDTSC_START(APIWaitForIdle);
420
421 while (!pContext->dcRing.IsEmpty())
422 {
423 _mm_pause();
424 }
425
426 RDTSC_STOP(APIWaitForIdle, 1, 0);
427 }
428
429 void SwrSetVertexBuffers(
430 HANDLE hContext,
431 uint32_t numBuffers,
432 const SWR_VERTEX_BUFFER_STATE* pVertexBuffers)
433 {
434 API_STATE* pState = GetDrawState(GetContext(hContext));
435
436 for (uint32_t i = 0; i < numBuffers; ++i)
437 {
438 const SWR_VERTEX_BUFFER_STATE *pVB = &pVertexBuffers[i];
439 pState->vertexBuffers[pVB->index] = *pVB;
440 }
441 }
442
443 void SwrSetIndexBuffer(
444 HANDLE hContext,
445 const SWR_INDEX_BUFFER_STATE* pIndexBuffer)
446 {
447 API_STATE* pState = GetDrawState(GetContext(hContext));
448
449 pState->indexBuffer = *pIndexBuffer;
450 }
451
452 void SwrSetFetchFunc(
453 HANDLE hContext,
454 PFN_FETCH_FUNC pfnFetchFunc)
455 {
456 API_STATE* pState = GetDrawState(GetContext(hContext));
457
458 pState->pfnFetchFunc = pfnFetchFunc;
459 }
460
461 void SwrSetSoFunc(
462 HANDLE hContext,
463 PFN_SO_FUNC pfnSoFunc,
464 uint32_t streamIndex)
465 {
466 API_STATE* pState = GetDrawState(GetContext(hContext));
467
468 SWR_ASSERT(streamIndex < MAX_SO_STREAMS);
469
470 pState->pfnSoFunc[streamIndex] = pfnSoFunc;
471 }
472
473 void SwrSetSoState(
474 HANDLE hContext,
475 SWR_STREAMOUT_STATE* pSoState)
476 {
477 API_STATE* pState = GetDrawState(GetContext(hContext));
478
479 pState->soState = *pSoState;
480 }
481
482 void SwrSetSoBuffers(
483 HANDLE hContext,
484 SWR_STREAMOUT_BUFFER* pSoBuffer,
485 uint32_t slot)
486 {
487 API_STATE* pState = GetDrawState(GetContext(hContext));
488
489 SWR_ASSERT((slot < 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot);
490
491 pState->soBuffer[slot] = *pSoBuffer;
492 }
493
494 void SwrSetVertexFunc(
495 HANDLE hContext,
496 PFN_VERTEX_FUNC pfnVertexFunc)
497 {
498 API_STATE* pState = GetDrawState(GetContext(hContext));
499
500 pState->pfnVertexFunc = pfnVertexFunc;
501 }
502
503 void SwrSetFrontendState(
504 HANDLE hContext,
505 SWR_FRONTEND_STATE *pFEState)
506 {
507 API_STATE* pState = GetDrawState(GetContext(hContext));
508 pState->frontendState = *pFEState;
509 }
510
511 void SwrSetGsState(
512 HANDLE hContext,
513 SWR_GS_STATE *pGSState)
514 {
515 API_STATE* pState = GetDrawState(GetContext(hContext));
516 pState->gsState = *pGSState;
517 }
518
519 void SwrSetGsFunc(
520 HANDLE hContext,
521 PFN_GS_FUNC pfnGsFunc)
522 {
523 API_STATE* pState = GetDrawState(GetContext(hContext));
524 pState->pfnGsFunc = pfnGsFunc;
525 }
526
527 void SwrSetCsFunc(
528 HANDLE hContext,
529 PFN_CS_FUNC pfnCsFunc,
530 uint32_t totalThreadsInGroup,
531 uint32_t totalSpillFillSize)
532 {
533 API_STATE* pState = GetDrawState(GetContext(hContext));
534 pState->pfnCsFunc = pfnCsFunc;
535 pState->totalThreadsInGroup = totalThreadsInGroup;
536 pState->totalSpillFillSize = totalSpillFillSize;
537 }
538
539 void SwrSetTsState(
540 HANDLE hContext,
541 SWR_TS_STATE *pState)
542 {
543 API_STATE* pApiState = GetDrawState(GetContext(hContext));
544 pApiState->tsState = *pState;
545 }
546
547 void SwrSetHsFunc(
548 HANDLE hContext,
549 PFN_HS_FUNC pfnFunc)
550 {
551 API_STATE* pApiState = GetDrawState(GetContext(hContext));
552 pApiState->pfnHsFunc = pfnFunc;
553 }
554
555 void SwrSetDsFunc(
556 HANDLE hContext,
557 PFN_DS_FUNC pfnFunc)
558 {
559 API_STATE* pApiState = GetDrawState(GetContext(hContext));
560 pApiState->pfnDsFunc = pfnFunc;
561 }
562
563 void SwrSetDepthStencilState(
564 HANDLE hContext,
565 SWR_DEPTH_STENCIL_STATE *pDSState)
566 {
567 API_STATE* pState = GetDrawState(GetContext(hContext));
568
569 pState->depthStencilState = *pDSState;
570 }
571
572 void SwrSetBackendState(
573 HANDLE hContext,
574 SWR_BACKEND_STATE *pBEState)
575 {
576 API_STATE* pState = GetDrawState(GetContext(hContext));
577
578 pState->backendState = *pBEState;
579 }
580
581 void SwrSetPixelShaderState(
582 HANDLE hContext,
583 SWR_PS_STATE *pPSState)
584 {
585 API_STATE *pState = GetDrawState(GetContext(hContext));
586 pState->psState = *pPSState;
587 }
588
589 void SwrSetBlendState(
590 HANDLE hContext,
591 SWR_BLEND_STATE *pBlendState)
592 {
593 API_STATE *pState = GetDrawState(GetContext(hContext));
594 memcpy(&pState->blendState, pBlendState, sizeof(SWR_BLEND_STATE));
595 }
596
597 void SwrSetBlendFunc(
598 HANDLE hContext,
599 uint32_t renderTarget,
600 PFN_BLEND_JIT_FUNC pfnBlendFunc)
601 {
602 SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS);
603 API_STATE *pState = GetDrawState(GetContext(hContext));
604 pState->pfnBlendFunc[renderTarget] = pfnBlendFunc;
605 }
606
607 void SwrSetLinkage(
608 HANDLE hContext,
609 uint32_t mask,
610 const uint8_t* pMap)
611 {
612 API_STATE* pState = GetDrawState(GetContext(hContext));
613
614 static const uint8_t IDENTITY_MAP[] =
615 {
616 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
617 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
618 };
619 static_assert(sizeof(IDENTITY_MAP) == sizeof(pState->linkageMap),
620 "Update for new value of MAX_ATTRIBUTES");
621
622 pState->linkageMask = mask;
623 pState->linkageCount = _mm_popcnt_u32(mask);
624
625 if (!pMap)
626 {
627 pMap = IDENTITY_MAP;
628 }
629 memcpy(pState->linkageMap, pMap, pState->linkageCount);
630 }
631
632 // update guardband multipliers for the viewport
633 void updateGuardband(API_STATE *pState)
634 {
635 // guardband center is viewport center
636 pState->gbState.left = KNOB_GUARDBAND_WIDTH / pState->vp[0].width;
637 pState->gbState.right = KNOB_GUARDBAND_WIDTH / pState->vp[0].width;
638 pState->gbState.top = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height;
639 pState->gbState.bottom = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height;
640 }
641
642 void SwrSetRastState(
643 HANDLE hContext,
644 const SWR_RASTSTATE *pRastState)
645 {
646 SWR_CONTEXT *pContext = GetContext(hContext);
647 API_STATE* pState = GetDrawState(pContext);
648
649 memcpy(&pState->rastState, pRastState, sizeof(SWR_RASTSTATE));
650 }
651
652 void SwrSetViewports(
653 HANDLE hContext,
654 uint32_t numViewports,
655 const SWR_VIEWPORT* pViewports,
656 const SWR_VIEWPORT_MATRIX* pMatrices)
657 {
658 SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS,
659 "Invalid number of viewports.");
660
661 SWR_CONTEXT *pContext = GetContext(hContext);
662 API_STATE* pState = GetDrawState(pContext);
663
664 memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports);
665
666 if (pMatrices != nullptr)
667 {
668 memcpy(&pState->vpMatrix[0], pMatrices, sizeof(SWR_VIEWPORT_MATRIX) * numViewports);
669 }
670 else
671 {
672 // Compute default viewport transform.
673 for (uint32_t i = 0; i < numViewports; ++i)
674 {
675 if (pContext->driverType == DX)
676 {
677 pState->vpMatrix[i].m00 = pState->vp[i].width / 2.0f;
678 pState->vpMatrix[i].m11 = -pState->vp[i].height / 2.0f;
679 pState->vpMatrix[i].m22 = pState->vp[i].maxZ - pState->vp[i].minZ;
680 pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00;
681 pState->vpMatrix[i].m31 = pState->vp[i].y - pState->vpMatrix[i].m11;
682 pState->vpMatrix[i].m32 = pState->vp[i].minZ;
683 }
684 else
685 {
686 // Standard, with the exception that Y is inverted.
687 pState->vpMatrix[i].m00 = (pState->vp[i].width - pState->vp[i].x) / 2.0f;
688 pState->vpMatrix[i].m11 = (pState->vp[i].y - pState->vp[i].height) / 2.0f;
689 pState->vpMatrix[i].m22 = (pState->vp[i].maxZ - pState->vp[i].minZ) / 2.0f;
690 pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00;
691 pState->vpMatrix[i].m31 = pState->vp[i].height + pState->vpMatrix[i].m11;
692 pState->vpMatrix[i].m32 = pState->vp[i].minZ + pState->vpMatrix[i].m22;
693
694 // Now that the matrix is calculated, clip the view coords to screen size.
695 // OpenGL allows for -ve x,y in the viewport.
696 pState->vp[i].x = std::max(pState->vp[i].x, 0.0f);
697 pState->vp[i].y = std::max(pState->vp[i].y, 0.0f);
698 }
699 }
700 }
701
702 updateGuardband(pState);
703 }
704
705 void SwrSetScissorRects(
706 HANDLE hContext,
707 uint32_t numScissors,
708 const BBOX* pScissors)
709 {
710 SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS,
711 "Invalid number of scissor rects.");
712
713 API_STATE* pState = GetDrawState(GetContext(hContext));
714 memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(BBOX));
715 };
716
717 void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
718 {
719 API_STATE *pState = &pDC->pState->state;
720 uint32_t left, right, top, bottom;
721
722 // Set up scissor dimensions based on scissor or viewport
723 if (pState->rastState.scissorEnable)
724 {
725 // scissor rect right/bottom edge are exclusive, core expects scissor dimensions to be inclusive, so subtract one pixel from right/bottom edges
726 left = pState->scissorRects[0].left;
727 right = pState->scissorRects[0].right;
728 top = pState->scissorRects[0].top;
729 bottom = pState->scissorRects[0].bottom;
730 }
731 else
732 {
733 left = (int32_t)pState->vp[0].x;
734 right = (int32_t)pState->vp[0].x + (int32_t)pState->vp[0].width;
735 top = (int32_t)pState->vp[0].y;
736 bottom = (int32_t)pState->vp[0].y + (int32_t)pState->vp[0].height;
737 }
738
739 right = std::min<uint32_t>(right, KNOB_MAX_SCISSOR_X);
740 bottom = std::min<uint32_t>(bottom, KNOB_MAX_SCISSOR_Y);
741
742 if (left > KNOB_MAX_SCISSOR_X || top > KNOB_MAX_SCISSOR_Y)
743 {
744 pState->scissorInFixedPoint.left = 0;
745 pState->scissorInFixedPoint.right = 0;
746 pState->scissorInFixedPoint.top = 0;
747 pState->scissorInFixedPoint.bottom = 0;
748 }
749 else
750 {
751 pState->scissorInFixedPoint.left = left * FIXED_POINT_SCALE;
752 pState->scissorInFixedPoint.right = right * FIXED_POINT_SCALE - 1;
753 pState->scissorInFixedPoint.top = top * FIXED_POINT_SCALE;
754 pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1;
755 }
756 }
757 // templated backend function tables
758 extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
759 extern PFN_BACKEND_FUNC gBackendSingleSample[2][2];
760 extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2];
761 extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2];
762 extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS + 1][SWR_MULTISAMPLE_TYPE_MAX];
763 extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2];
764 extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2];
765 extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2];
766 void SetupPipeline(DRAW_CONTEXT *pDC)
767 {
768 DRAW_STATE* pState = pDC->pState;
769 const SWR_RASTSTATE &rastState = pState->state.rastState;
770 const SWR_PS_STATE &psState = pState->state.psState;
771 BACKEND_FUNCS& backendFuncs = pState->backendFuncs;
772 const uint32_t forcedSampleCount = (rastState.bForcedSampleCount) ? 1 : 0;
773
774 // setup backend
775 if (psState.pfnPixelShader == nullptr)
776 {
777 backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount];
778 // always need to generate I & J per sample for Z interpolation
779 backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[1];
780 }
781 else
782 {
783 const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.bForcedSampleCount) ? 1 : 0;
784 const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
785
786 // currently only support 'normal' input coverage
787 SWR_ASSERT(psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL ||
788 psState.inputCoverage == SWR_INPUT_COVERAGE_NONE);
789
790 SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask;
791
792 // select backend function
793 switch(psState.shadingRate)
794 {
795 case SWR_SHADING_RATE_PIXEL:
796 if(bMultisampleEnable)
797 {
798 // always need to generate I & J per sample for Z interpolation
799 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
800 backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount];
801 backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount];
802 }
803 else
804 {
805 // always need to generate I & J per pixel for Z interpolation
806 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK);
807 backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid];
808 backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][SWR_MULTISAMPLE_1X];
809 }
810 break;
811 case SWR_SHADING_RATE_SAMPLE:
812 SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN);
813 // always need to generate I & J per sample for Z interpolation
814 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
815 backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid];
816 backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount];
817 break;
818 default:
819 SWR_ASSERT(0 && "Invalid shading rate");
820 break;
821 }
822
823 // setup pointer to function that generates necessary barycentrics required by the PS
824 bool bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_PER_PIXEL_MASK) > 0 ? 1 : 0;
825 backendFuncs.pfnCalcPixelBarycentrics = gPixelBarycentricTable[bBarycentrics];
826
827 bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_PER_SAMPLE_MASK) > 0 ? 1 : 0;
828 backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[bBarycentrics];
829
830 bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0 ? 1 : 0;
831 backendFuncs.pfnCalcCentroidBarycentrics = gCentroidBarycentricTable[rastState.sampleCount][bBarycentrics][rastState.samplePattern][forcedSampleCount];
832 }
833
834 PFN_PROCESS_PRIMS pfnBinner;
835 switch (pState->state.topology)
836 {
837 case TOP_POINT_LIST:
838 pState->pfnProcessPrims = ClipPoints;
839 pfnBinner = BinPoints;
840 break;
841 case TOP_LINE_LIST:
842 case TOP_LINE_STRIP:
843 case TOP_LINE_LOOP:
844 case TOP_LINE_LIST_ADJ:
845 case TOP_LISTSTRIP_ADJ:
846 pState->pfnProcessPrims = ClipLines;
847 pfnBinner = BinLines;
848 break;
849 default:
850 pState->pfnProcessPrims = ClipTriangles;
851 pfnBinner = BinTriangles;
852 break;
853 };
854
855 // disable clipper if viewport transform is disabled
856 if (pState->state.frontendState.vpTransformDisable)
857 {
858 pState->pfnProcessPrims = pfnBinner;
859 }
860
861 if ((pState->state.psState.pfnPixelShader == nullptr) &&
862 (pState->state.depthStencilState.depthTestEnable == FALSE) &&
863 (pState->state.depthStencilState.depthWriteEnable == FALSE) &&
864 (pState->state.depthStencilState.stencilTestEnable == FALSE) &&
865 (pState->state.depthStencilState.stencilWriteEnable == FALSE) &&
866 (pState->state.linkageCount == 0))
867 {
868 pState->pfnProcessPrims = nullptr;
869 pState->state.linkageMask = 0;
870 }
871
872 if (pState->state.soState.rasterizerDisable == true)
873 {
874 pState->pfnProcessPrims = nullptr;
875 pState->state.linkageMask = 0;
876 }
877
878 // set up the frontend attrib mask
879 pState->state.feAttribMask = pState->state.linkageMask;
880 if (pState->state.soState.soEnable)
881 {
882 for (uint32_t i = 0; i < 4; ++i)
883 {
884 pState->state.feAttribMask |= pState->state.soState.streamMasks[i];
885 }
886 }
887
888 // complicated logic to test for cases where we don't need backing hottile memory for a draw
889 // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled.
890 pState->state.depthHottileEnable = ((!(pState->state.depthStencilState.depthTestEnable &&
891 !pState->state.depthStencilState.depthWriteEnable &&
892 pState->state.depthStencilState.depthTestFunc == ZFUNC_ALWAYS)) &&
893 (pState->state.depthStencilState.depthTestEnable ||
894 pState->state.depthStencilState.depthWriteEnable)) ? true : false;
895
896 pState->state.stencilHottileEnable = (((!(pState->state.depthStencilState.stencilTestEnable &&
897 !pState->state.depthStencilState.stencilWriteEnable &&
898 pState->state.depthStencilState.stencilTestFunc == ZFUNC_ALWAYS)) ||
899 // for stencil we have to check the double sided state as well
900 (!(pState->state.depthStencilState.doubleSidedStencilTestEnable &&
901 !pState->state.depthStencilState.stencilWriteEnable &&
902 pState->state.depthStencilState.backfaceStencilTestFunc == ZFUNC_ALWAYS))) &&
903 (pState->state.depthStencilState.stencilTestEnable ||
904 pState->state.depthStencilState.stencilWriteEnable)) ? true : false;
905
906 uint32_t numRTs = pState->state.psState.numRenderTargets;
907 pState->state.colorHottileEnable = 0;
908 if (psState.pfnPixelShader != nullptr)
909 {
910 for (uint32_t rt = 0; rt < numRTs; ++rt)
911 {
912 pState->state.colorHottileEnable |=
913 (!pState->state.blendState.renderTarget[rt].writeDisableAlpha ||
914 !pState->state.blendState.renderTarget[rt].writeDisableRed ||
915 !pState->state.blendState.renderTarget[rt].writeDisableGreen ||
916 !pState->state.blendState.renderTarget[rt].writeDisableBlue) ? (1 << rt) : 0;
917 }
918 }
919
920 // Setup depth quantization function
921 if (pState->state.depthHottileEnable)
922 {
923 switch (pState->state.rastState.depthFormat)
924 {
925 case R32_FLOAT_X8X24_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT_X8X24_TYPELESS > ; break;
926 case R32_FLOAT: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ; break;
927 case R24_UNORM_X8_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R24_UNORM_X8_TYPELESS > ; break;
928 case R16_UNORM: pState->state.pfnQuantizeDepth = QuantizeDepth < R16_UNORM > ; break;
929 default: SWR_ASSERT(false, "Unsupported depth format for depth quantiztion.");
930 pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ;
931 }
932 }
933 else
934 {
935 // set up pass-through quantize if depth isn't enabled
936 pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ;
937 }
938 }
939
940 //////////////////////////////////////////////////////////////////////////
941 /// @brief InitDraw
942 /// @param pDC - Draw context to initialize for this draw.
943 void InitDraw(
944 DRAW_CONTEXT *pDC,
945 bool isSplitDraw)
946 {
947 // We don't need to re-setup the scissors/pipeline state again for split draw.
948 if (isSplitDraw == false)
949 {
950 SetupMacroTileScissors(pDC);
951 SetupPipeline(pDC);
952 }
953 }
954
955 //////////////////////////////////////////////////////////////////////////
956 /// @brief We can split the draw for certain topologies for better performance.
957 /// @param totalVerts - Total vertices for draw
958 /// @param topology - Topology used for draw
959 uint32_t MaxVertsPerDraw(
960 DRAW_CONTEXT* pDC,
961 uint32_t totalVerts,
962 PRIMITIVE_TOPOLOGY topology)
963 {
964 API_STATE& state = pDC->pState->state;
965
966 uint32_t vertsPerDraw = totalVerts;
967
968 if (state.soState.soEnable)
969 {
970 return totalVerts;
971 }
972
973 switch (topology)
974 {
975 case TOP_POINT_LIST:
976 case TOP_TRIANGLE_LIST:
977 vertsPerDraw = KNOB_MAX_PRIMS_PER_DRAW;
978 break;
979
980 case TOP_PATCHLIST_1:
981 case TOP_PATCHLIST_2:
982 case TOP_PATCHLIST_3:
983 case TOP_PATCHLIST_4:
984 case TOP_PATCHLIST_5:
985 case TOP_PATCHLIST_6:
986 case TOP_PATCHLIST_7:
987 case TOP_PATCHLIST_8:
988 case TOP_PATCHLIST_9:
989 case TOP_PATCHLIST_10:
990 case TOP_PATCHLIST_11:
991 case TOP_PATCHLIST_12:
992 case TOP_PATCHLIST_13:
993 case TOP_PATCHLIST_14:
994 case TOP_PATCHLIST_15:
995 case TOP_PATCHLIST_16:
996 case TOP_PATCHLIST_17:
997 case TOP_PATCHLIST_18:
998 case TOP_PATCHLIST_19:
999 case TOP_PATCHLIST_20:
1000 case TOP_PATCHLIST_21:
1001 case TOP_PATCHLIST_22:
1002 case TOP_PATCHLIST_23:
1003 case TOP_PATCHLIST_24:
1004 case TOP_PATCHLIST_25:
1005 case TOP_PATCHLIST_26:
1006 case TOP_PATCHLIST_27:
1007 case TOP_PATCHLIST_28:
1008 case TOP_PATCHLIST_29:
1009 case TOP_PATCHLIST_30:
1010 case TOP_PATCHLIST_31:
1011 case TOP_PATCHLIST_32:
1012 if (pDC->pState->state.tsState.tsEnable)
1013 {
1014 uint32_t vertsPerPrim = topology - TOP_PATCHLIST_BASE;
1015 vertsPerDraw = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW;
1016 }
1017 break;
1018
1019 // The Primitive Assembly code can only handle 1 RECT at a time.
1020 case TOP_RECT_LIST:
1021 vertsPerDraw = 3;
1022 break;
1023
1024 default:
1025 // We are not splitting up draws for other topologies.
1026 break;
1027 }
1028
1029 return vertsPerDraw;
1030 }
1031
1032 // Recursive template used to auto-nest conditionals. Converts dynamic boolean function
1033 // arguments to static template arguments.
1034 template <bool... ArgsB>
1035 struct FEDrawChooser
1036 {
1037 // Last Arg Terminator
1038 static PFN_FE_WORK_FUNC GetFunc(bool bArg)
1039 {
1040 if (bArg)
1041 {
1042 return ProcessDraw<ArgsB..., true>;
1043 }
1044
1045 return ProcessDraw<ArgsB..., false>;
1046 }
1047
1048 // Recursively parse args
1049 template <typename... TArgsT>
1050 static PFN_FE_WORK_FUNC GetFunc(bool bArg, TArgsT... remainingArgs)
1051 {
1052 if (bArg)
1053 {
1054 return FEDrawChooser<ArgsB..., true>::GetFunc(remainingArgs...);
1055 }
1056
1057 return FEDrawChooser<ArgsB..., false>::GetFunc(remainingArgs...);
1058 }
1059 };
1060
1061 // Selector for correct templated Draw front-end function
1062 INLINE
1063 static PFN_FE_WORK_FUNC GetFEDrawFunc(bool IsIndexed, bool HasTessellation, bool HasGeometryShader, bool HasStreamOut, bool RasterizerEnabled)
1064 {
1065 return FEDrawChooser<>::GetFunc(IsIndexed, HasTessellation, HasGeometryShader, HasStreamOut, RasterizerEnabled);
1066 }
1067
1068
1069 //////////////////////////////////////////////////////////////////////////
1070 /// @brief DrawInstanced
1071 /// @param hContext - Handle passed back from SwrCreateContext
1072 /// @param topology - Specifies topology for draw.
1073 /// @param numVerts - How many vertices to read sequentially from vertex data (per instance).
1074 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1075 /// @param numInstances - How many instances to render.
1076 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1077 void DrawInstanced(
1078 HANDLE hContext,
1079 PRIMITIVE_TOPOLOGY topology,
1080 uint32_t numVertices,
1081 uint32_t startVertex,
1082 uint32_t numInstances = 1,
1083 uint32_t startInstance = 0)
1084 {
1085 if (KNOB_TOSS_DRAW)
1086 {
1087 return;
1088 }
1089
1090 RDTSC_START(APIDraw);
1091
1092 SWR_CONTEXT *pContext = GetContext(hContext);
1093 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1094
1095 uint32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
1096 uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw);
1097 uint32_t remainingVerts = numVertices;
1098
1099 API_STATE *pState = &pDC->pState->state;
1100 pState->topology = topology;
1101 pState->forceFront = false;
1102
1103 // disable culling for points/lines
1104 uint32_t oldCullMode = pState->rastState.cullMode;
1105 if (topology == TOP_POINT_LIST)
1106 {
1107 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1108 pState->forceFront = true;
1109 }
1110
1111 int draw = 0;
1112 while (remainingVerts)
1113 {
1114 uint32_t numVertsForDraw = (remainingVerts < maxVertsPerDraw) ?
1115 remainingVerts : maxVertsPerDraw;
1116
1117 bool isSplitDraw = (draw > 0) ? true : false;
1118 DRAW_CONTEXT* pDC = GetDrawContext(pContext, isSplitDraw);
1119 InitDraw(pDC, isSplitDraw);
1120
1121 pDC->FeWork.type = DRAW;
1122 pDC->FeWork.pfnWork = GetFEDrawFunc(
1123 false, // IsIndexed
1124 pState->tsState.tsEnable,
1125 pState->gsState.gsEnable,
1126 pState->soState.soEnable,
1127 pDC->pState->pfnProcessPrims != nullptr);
1128 pDC->FeWork.desc.draw.numVerts = numVertsForDraw;
1129 pDC->FeWork.desc.draw.startVertex = startVertex;
1130 pDC->FeWork.desc.draw.numInstances = numInstances;
1131 pDC->FeWork.desc.draw.startInstance = startInstance;
1132 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
1133 pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw;
1134
1135 pDC->cleanupState = (remainingVerts == numVertsForDraw);
1136
1137 //enqueue DC
1138 QueueDraw(pContext);
1139
1140 remainingVerts -= numVertsForDraw;
1141 draw++;
1142 }
1143
1144 // restore culling state
1145 pDC = GetDrawContext(pContext);
1146 pDC->pState->state.rastState.cullMode = oldCullMode;
1147
1148 RDTSC_STOP(APIDraw, numVertices * numInstances, 0);
1149 }
1150
1151 //////////////////////////////////////////////////////////////////////////
1152 /// @brief SwrDraw
1153 /// @param hContext - Handle passed back from SwrCreateContext
1154 /// @param topology - Specifies topology for draw.
1155 /// @param startVertex - Specifies start vertex in vertex buffer for draw.
1156 /// @param primCount - Number of vertices.
1157 void SwrDraw(
1158 HANDLE hContext,
1159 PRIMITIVE_TOPOLOGY topology,
1160 uint32_t startVertex,
1161 uint32_t numVertices)
1162 {
1163 DrawInstanced(hContext, topology, numVertices, startVertex);
1164 }
1165
1166 //////////////////////////////////////////////////////////////////////////
1167 /// @brief SwrDrawInstanced
1168 /// @param hContext - Handle passed back from SwrCreateContext
1169 /// @param topology - Specifies topology for draw.
1170 /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
1171 /// @param numInstances - How many instances to render.
1172 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1173 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1174 void SwrDrawInstanced(
1175 HANDLE hContext,
1176 PRIMITIVE_TOPOLOGY topology,
1177 uint32_t numVertsPerInstance,
1178 uint32_t numInstances,
1179 uint32_t startVertex,
1180 uint32_t startInstance
1181 )
1182 {
1183 DrawInstanced(hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance);
1184 }
1185
1186 //////////////////////////////////////////////////////////////////////////
1187 /// @brief DrawIndexedInstanced
1188 /// @param hContext - Handle passed back from SwrCreateContext
1189 /// @param topology - Specifies topology for draw.
1190 /// @param numIndices - Number of indices to read sequentially from index buffer.
1191 /// @param indexOffset - Starting index into index buffer.
1192 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1193 /// @param numInstances - Number of instances to render.
1194 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1195 void DrawIndexedInstance(
1196 HANDLE hContext,
1197 PRIMITIVE_TOPOLOGY topology,
1198 uint32_t numIndices,
1199 uint32_t indexOffset,
1200 int32_t baseVertex,
1201 uint32_t numInstances = 1,
1202 uint32_t startInstance = 0)
1203 {
1204 if (KNOB_TOSS_DRAW)
1205 {
1206 return;
1207 }
1208
1209 RDTSC_START(APIDrawIndexed);
1210
1211 SWR_CONTEXT *pContext = GetContext(hContext);
1212 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1213 API_STATE* pState = &pDC->pState->state;
1214
1215 uint32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
1216 uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw);
1217 uint32_t remainingIndices = numIndices;
1218
1219 uint32_t indexSize = 0;
1220 switch (pState->indexBuffer.format)
1221 {
1222 case R32_UINT: indexSize = sizeof(uint32_t); break;
1223 case R16_UINT: indexSize = sizeof(uint16_t); break;
1224 case R8_UINT: indexSize = sizeof(uint8_t); break;
1225 default:
1226 SWR_ASSERT(0);
1227 }
1228
1229 int draw = 0;
1230 uint8_t *pIB = (uint8_t*)pState->indexBuffer.pIndices;
1231 pIB += (uint64_t)indexOffset * (uint64_t)indexSize;
1232
1233 pState->topology = topology;
1234 pState->forceFront = false;
1235
1236 // disable culling for points/lines
1237 uint32_t oldCullMode = pState->rastState.cullMode;
1238 if (topology == TOP_POINT_LIST)
1239 {
1240 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1241 pState->forceFront = true;
1242 }
1243
1244 while (remainingIndices)
1245 {
1246 uint32_t numIndicesForDraw = (remainingIndices < maxIndicesPerDraw) ?
1247 remainingIndices : maxIndicesPerDraw;
1248
1249 // When breaking up draw, we need to obtain new draw context for each iteration.
1250 bool isSplitDraw = (draw > 0) ? true : false;
1251 pDC = GetDrawContext(pContext, isSplitDraw);
1252 InitDraw(pDC, isSplitDraw);
1253
1254 pDC->FeWork.type = DRAW;
1255 pDC->FeWork.pfnWork = GetFEDrawFunc(
1256 true, // IsIndexed
1257 pState->tsState.tsEnable,
1258 pState->gsState.gsEnable,
1259 pState->soState.soEnable,
1260 pDC->pState->pfnProcessPrims != nullptr);
1261 pDC->FeWork.desc.draw.pDC = pDC;
1262 pDC->FeWork.desc.draw.numIndices = numIndicesForDraw;
1263 pDC->FeWork.desc.draw.pIB = (int*)pIB;
1264 pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format;
1265
1266 pDC->FeWork.desc.draw.numInstances = numInstances;
1267 pDC->FeWork.desc.draw.startInstance = startInstance;
1268 pDC->FeWork.desc.draw.baseVertex = baseVertex;
1269 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
1270
1271 pDC->cleanupState = (remainingIndices == numIndicesForDraw);
1272
1273 //enqueue DC
1274 QueueDraw(pContext);
1275
1276 pIB += maxIndicesPerDraw * indexSize;
1277 remainingIndices -= numIndicesForDraw;
1278 draw++;
1279 }
1280
1281 // restore culling state
1282 pDC = GetDrawContext(pContext);
1283 pDC->pState->state.rastState.cullMode = oldCullMode;
1284
1285 RDTSC_STOP(APIDrawIndexed, numIndices * numInstances, 0);
1286 }
1287
1288
1289 //////////////////////////////////////////////////////////////////////////
1290 /// @brief DrawIndexed
1291 /// @param hContext - Handle passed back from SwrCreateContext
1292 /// @param topology - Specifies topology for draw.
1293 /// @param numIndices - Number of indices to read sequentially from index buffer.
1294 /// @param indexOffset - Starting index into index buffer.
1295 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1296 void SwrDrawIndexed(
1297 HANDLE hContext,
1298 PRIMITIVE_TOPOLOGY topology,
1299 uint32_t numIndices,
1300 uint32_t indexOffset,
1301 int32_t baseVertex
1302 )
1303 {
1304 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex);
1305 }
1306
1307 //////////////////////////////////////////////////////////////////////////
1308 /// @brief SwrDrawIndexedInstanced
1309 /// @param hContext - Handle passed back from SwrCreateContext
1310 /// @param topology - Specifies topology for draw.
1311 /// @param numIndices - Number of indices to read sequentially from index buffer.
1312 /// @param numInstances - Number of instances to render.
1313 /// @param indexOffset - Starting index into index buffer.
1314 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1315 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1316 void SwrDrawIndexedInstanced(
1317 HANDLE hContext,
1318 PRIMITIVE_TOPOLOGY topology,
1319 uint32_t numIndices,
1320 uint32_t numInstances,
1321 uint32_t indexOffset,
1322 int32_t baseVertex,
1323 uint32_t startInstance)
1324 {
1325 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance);
1326 }
1327
1328 //////////////////////////////////////////////////////////////////////////
1329 /// @brief SwrInvalidateTiles
1330 /// @param hContext - Handle passed back from SwrCreateContext
1331 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate.
1332 void SwrInvalidateTiles(
1333 HANDLE hContext,
1334 uint32_t attachmentMask)
1335 {
1336 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1337 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1338
1339 pDC->FeWork.type = DISCARDINVALIDATETILES;
1340 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
1341 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
1342 memset(&pDC->FeWork.desc.discardInvalidateTiles.rect, 0, sizeof(SWR_RECT));
1343 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_INVALID;
1344 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false;
1345 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = false;
1346
1347 //enqueue
1348 QueueDraw(pContext);
1349 }
1350
1351 //////////////////////////////////////////////////////////////////////////
1352 /// @brief SwrDiscardRect
1353 /// @param hContext - Handle passed back from SwrCreateContext
1354 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
1355 /// @param rect - if rect is all zeros, the entire attachment surface will be discarded
1356 void SwrDiscardRect(
1357 HANDLE hContext,
1358 uint32_t attachmentMask,
1359 SWR_RECT rect)
1360 {
1361 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1362 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1363
1364 // Queue a load to the hottile
1365 pDC->FeWork.type = DISCARDINVALIDATETILES;
1366 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
1367 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
1368 pDC->FeWork.desc.discardInvalidateTiles.rect = rect;
1369 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_RESOLVED;
1370 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true;
1371 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = true;
1372
1373 //enqueue
1374 QueueDraw(pContext);
1375 }
1376
1377 //////////////////////////////////////////////////////////////////////////
1378 /// @brief SwrDispatch
1379 /// @param hContext - Handle passed back from SwrCreateContext
1380 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
1381 /// @param threadGroupCountY - Number of thread groups dispatched in Y direction
1382 /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
1383 void SwrDispatch(
1384 HANDLE hContext,
1385 uint32_t threadGroupCountX,
1386 uint32_t threadGroupCountY,
1387 uint32_t threadGroupCountZ)
1388 {
1389 if (KNOB_TOSS_DRAW)
1390 {
1391 return;
1392 }
1393
1394 RDTSC_START(APIDispatch);
1395 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1396 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1397
1398 pDC->isCompute = true; // This is a compute context.
1399
1400 COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64);
1401
1402 pTaskData->threadGroupCountX = threadGroupCountX;
1403 pTaskData->threadGroupCountY = threadGroupCountY;
1404 pTaskData->threadGroupCountZ = threadGroupCountZ;
1405
1406 uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
1407 uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
1408 pDC->pDispatch = &pContext->pDispatchQueueArray[dcIndex];
1409 pDC->pDispatch->initialize(totalThreadGroups, pTaskData);
1410
1411 QueueDispatch(pContext);
1412 RDTSC_STOP(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ, 0);
1413 }
1414
1415 // Deswizzles, converts and stores current contents of the hot tiles to surface
1416 // described by pState
1417 void SwrStoreTiles(
1418 HANDLE hContext,
1419 SWR_RENDERTARGET_ATTACHMENT attachment,
1420 SWR_TILE_STATE postStoreTileState)
1421 {
1422 RDTSC_START(APIStoreTiles);
1423
1424 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1425 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1426
1427 SetupMacroTileScissors(pDC);
1428
1429 pDC->FeWork.type = STORETILES;
1430 pDC->FeWork.pfnWork = ProcessStoreTiles;
1431 pDC->FeWork.desc.storeTiles.attachment = attachment;
1432 pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState;
1433
1434 //enqueue
1435 QueueDraw(pContext);
1436
1437 RDTSC_STOP(APIStoreTiles, 0, 0);
1438 }
1439
1440 void SwrClearRenderTarget(
1441 HANDLE hContext,
1442 uint32_t clearMask,
1443 const float clearColor[4],
1444 float z,
1445 uint8_t stencil)
1446 {
1447 RDTSC_START(APIClearRenderTarget);
1448
1449 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1450
1451 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1452
1453 SetupMacroTileScissors(pDC);
1454
1455 CLEAR_FLAGS flags;
1456 flags.mask = clearMask;
1457
1458 pDC->FeWork.type = CLEAR;
1459 pDC->FeWork.pfnWork = ProcessClear;
1460 pDC->FeWork.desc.clear.flags = flags;
1461 pDC->FeWork.desc.clear.clearDepth = z;
1462 pDC->FeWork.desc.clear.clearRTColor[0] = clearColor[0];
1463 pDC->FeWork.desc.clear.clearRTColor[1] = clearColor[1];
1464 pDC->FeWork.desc.clear.clearRTColor[2] = clearColor[2];
1465 pDC->FeWork.desc.clear.clearRTColor[3] = clearColor[3];
1466 pDC->FeWork.desc.clear.clearStencil = stencil;
1467
1468 // enqueue draw
1469 QueueDraw(pContext);
1470
1471 RDTSC_STOP(APIClearRenderTarget, 0, pDC->drawId);
1472 }
1473
1474 //////////////////////////////////////////////////////////////////////////
1475 /// @brief Returns a pointer to the private context state for the current
1476 /// draw operation. This is used for external componets such as the
1477 /// sampler.
1478 /// SWR is responsible for the allocation of the private context state.
1479 /// @param hContext - Handle passed back from SwrCreateContext
1480 VOID* SwrGetPrivateContextState(
1481 HANDLE hContext)
1482 {
1483 SWR_CONTEXT* pContext = GetContext(hContext);
1484 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1485 DRAW_STATE* pState = pDC->pState;
1486
1487 if (pState->pPrivateState == nullptr)
1488 {
1489 pState->pPrivateState = pState->pArena->AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float));
1490 }
1491
1492 return pState->pPrivateState;
1493 }
1494
1495 //////////////////////////////////////////////////////////////////////////
1496 /// @brief Clients can use this to allocate memory for draw/dispatch
1497 /// operations. The memory will automatically be freed once operation
1498 /// has completed. Client can use this to allocate binding tables,
1499 /// etc. needed for shader execution.
1500 /// @param hContext - Handle passed back from SwrCreateContext
1501 /// @param size - Size of allocation
1502 /// @param align - Alignment needed for allocation.
1503 VOID* SwrAllocDrawContextMemory(
1504 HANDLE hContext,
1505 uint32_t size,
1506 uint32_t align)
1507 {
1508 SWR_CONTEXT* pContext = GetContext(hContext);
1509 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1510
1511 return pDC->pState->pArena->AllocAligned(size, align);
1512 }
1513
1514 //////////////////////////////////////////////////////////////////////////
1515 /// @brief Returns pointer to SWR stats.
1516 /// @note The counters are atomically incremented by multiple threads.
1517 /// When calling this, you need to ensure all previous operations
1518 /// have completed.
1519 /// @todo If necessary, add a callback to avoid stalling the pipe to
1520 /// sample the counters.
1521 /// @param hContext - Handle passed back from SwrCreateContext
1522 /// @param pStats - SWR will fill this out for caller.
1523 void SwrGetStats(
1524 HANDLE hContext,
1525 SWR_STATS* pStats)
1526 {
1527 SWR_CONTEXT *pContext = GetContext(hContext);
1528 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1529
1530 pDC->FeWork.type = QUERYSTATS;
1531 pDC->FeWork.pfnWork = ProcessQueryStats;
1532 pDC->FeWork.desc.queryStats.pStats = pStats;
1533
1534 // cannot execute until all previous draws have completed
1535 pDC->dependency = pDC->drawId - 1;
1536
1537 //enqueue
1538 QueueDraw(pContext);
1539 }
1540
1541 //////////////////////////////////////////////////////////////////////////
1542 /// @brief Enables stats counting
1543 /// @param hContext - Handle passed back from SwrCreateContext
1544 /// @param enable - If true then counts are incremented.
1545 void SwrEnableStats(
1546 HANDLE hContext,
1547 bool enable)
1548 {
1549 SWR_CONTEXT *pContext = GetContext(hContext);
1550 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1551
1552 pDC->pState->state.enableStats = enable;
1553 }
1554
1555 //////////////////////////////////////////////////////////////////////////
1556 /// @brief Mark end of frame - used for performance profiling
1557 /// @param hContext - Handle passed back from SwrCreateContext
1558 void SWR_API SwrEndFrame(
1559 HANDLE hContext)
1560 {
1561 RDTSC_ENDFRAME();
1562 SWR_CONTEXT *pContext = GetContext(hContext);
1563 pContext->frameCount++;
1564 }