swr: [rasterizer core] Put DRAW_CONTEXT on a diet
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / api.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file api.cpp
24 *
25 * @brief API implementation
26 *
27 ******************************************************************************/
28
29 #include <cfloat>
30 #include <cmath>
31 #include <cstdio>
32 #include <new>
33
34 #include "core/api.h"
35 #include "core/backend.h"
36 #include "core/context.h"
37 #include "core/depthstencil.h"
38 #include "core/frontend.h"
39 #include "core/rasterizer.h"
40 #include "core/rdtsc_core.h"
41 #include "core/threads.h"
42 #include "core/tilemgr.h"
43 #include "core/clip.h"
44
45 #include "common/simdintrin.h"
46 #include "common/os.h"
47
48 void SetupDefaultState(SWR_CONTEXT *pContext);
49
50 //////////////////////////////////////////////////////////////////////////
51 /// @brief Create SWR Context.
52 /// @param pCreateInfo - pointer to creation info.
53 HANDLE SwrCreateContext(
54 SWR_CREATECONTEXT_INFO* pCreateInfo)
55 {
56 RDTSC_RESET();
57 RDTSC_INIT(0);
58
59 void* pContextMem = _aligned_malloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4);
60 memset(pContextMem, 0, sizeof(SWR_CONTEXT));
61 SWR_CONTEXT *pContext = new (pContextMem) SWR_CONTEXT();
62
63 pContext->driverType = pCreateInfo->driver;
64 pContext->privateStateSize = pCreateInfo->privateStateSize;
65
66 pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
67 pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
68
69 pContext->pMacroTileManagerArray = (MacroTileMgr*)_aligned_malloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
70 pContext->pDispatchQueueArray = (DispatchQueue*)_aligned_malloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
71
72 for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
73 {
74 pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
75 new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena);
76 new (&pContext->pDispatchQueueArray[dc]) DispatchQueue();
77
78 pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
79 }
80
81 if (!KNOB_SINGLE_THREADED)
82 {
83 memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
84 memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
85 new (&pContext->WaitLock) std::mutex();
86 new (&pContext->FifosNotEmpty) std::condition_variable();
87
88 CreateThreadPool(pContext, &pContext->threadPool);
89 }
90
91 // Calling createThreadPool() above can set SINGLE_THREADED
92 if (KNOB_SINGLE_THREADED)
93 {
94 SET_KNOB(HYPERTHREADED_FE, false);
95 pContext->NumWorkerThreads = 1;
96 pContext->NumFEThreads = 1;
97 pContext->NumBEThreads = 1;
98 }
99
100 // Allocate scratch space for workers.
101 ///@note We could lazily allocate this but its rather small amount of memory.
102 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
103 {
104 #if defined(_WIN32)
105 uint32_t numaNode = pContext->threadPool.pThreadData ?
106 pContext->threadPool.pThreadData[i].numaId : 0;
107 pContext->pScratch[i] = (uint8_t*)VirtualAllocExNuma(
108 GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE),
109 MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE,
110 numaNode);
111 #else
112 pContext->pScratch[i] = (uint8_t*)_aligned_malloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4);
113 #endif
114 }
115
116 // State setup AFTER context is fully initialized
117 SetupDefaultState(pContext);
118
119 // initialize hot tile manager
120 pContext->pHotTileMgr = new HotTileMgr();
121
122 // initialize function pointer tables
123 InitClearTilesTable();
124
125 // initialize store tiles function
126 pContext->pfnLoadTile = pCreateInfo->pfnLoadTile;
127 pContext->pfnStoreTile = pCreateInfo->pfnStoreTile;
128 pContext->pfnClearTile = pCreateInfo->pfnClearTile;
129
130 // pass pointer to bucket manager back to caller
131 #ifdef KNOB_ENABLE_RDTSC
132 pCreateInfo->pBucketMgr = &gBucketMgr;
133 #endif
134
135 pCreateInfo->contextSaveSize = sizeof(API_STATE);
136
137 return (HANDLE)pContext;
138 }
139
140 void SwrDestroyContext(HANDLE hContext)
141 {
142 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
143 DestroyThreadPool(pContext, &pContext->threadPool);
144
145 // free the fifos
146 for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i)
147 {
148 delete pContext->dcRing[i].pArena;
149 delete pContext->dsRing[i].pArena;
150 pContext->pMacroTileManagerArray[i].~MacroTileMgr();
151 pContext->pDispatchQueueArray[i].~DispatchQueue();
152 }
153
154 _aligned_free(pContext->pDispatchQueueArray);
155 _aligned_free(pContext->pMacroTileManagerArray);
156
157 // Free scratch space.
158 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
159 {
160 #if defined(_WIN32)
161 VirtualFree(pContext->pScratch[i], 0, MEM_RELEASE);
162 #else
163 _aligned_free(pContext->pScratch[i]);
164 #endif
165 }
166
167 delete(pContext->pHotTileMgr);
168
169 pContext->~SWR_CONTEXT();
170 _aligned_free((SWR_CONTEXT*)hContext);
171 }
172
173 void CopyState(DRAW_STATE& dst, const DRAW_STATE& src)
174 {
175 memcpy(&dst.state, &src.state, sizeof(API_STATE));
176 }
177
178 void WakeAllThreads(SWR_CONTEXT *pContext)
179 {
180 pContext->FifosNotEmpty.notify_all();
181 }
182
183 template<bool IsDraw>
184 void QueueWork(SWR_CONTEXT *pContext)
185 {
186 DRAW_CONTEXT* pDC = pContext->pCurDrawContext;
187 uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
188
189 if (IsDraw)
190 {
191 pDC->pTileMgr = &pContext->pMacroTileManagerArray[dcIndex];
192 pDC->pTileMgr->initialize();
193 }
194
195 // Each worker thread looks at a DC for both FE and BE work at different times and so we
196 // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers
197 // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
198 // then moved on if all work is done.)
199 pContext->pCurDrawContext->threadsDone = pContext->NumFEThreads + pContext->NumBEThreads;
200
201 _ReadWriteBarrier();
202 {
203 std::unique_lock<std::mutex> lock(pContext->WaitLock);
204 pContext->dcRing.Enqueue();
205 }
206
207 if (KNOB_SINGLE_THREADED)
208 {
209 // flush denormals to 0
210 uint32_t mxcsr = _mm_getcsr();
211 _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
212
213 if (IsDraw)
214 {
215 static TileSet lockedTiles;
216 uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
217 WorkOnFifoFE(pContext, 0, curDraw[0]);
218 WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles, 0, 0);
219 }
220 else
221 {
222 uint64_t curDispatch = pContext->pCurDrawContext->drawId;
223 WorkOnCompute(pContext, 0, curDispatch);
224 }
225
226 // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
227 while (CompleteDrawContext(pContext, pContext->pCurDrawContext) > 0) {}
228
229 // restore csr
230 _mm_setcsr(mxcsr);
231 }
232 else
233 {
234 RDTSC_START(APIDrawWakeAllThreads);
235 WakeAllThreads(pContext);
236 RDTSC_STOP(APIDrawWakeAllThreads, 1, 0);
237 }
238
239 // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
240 pContext->pPrevDrawContext = pContext->pCurDrawContext;
241 pContext->pCurDrawContext = nullptr;
242 }
243
244 INLINE void QueueDraw(SWR_CONTEXT* pContext)
245 {
246 QueueWork<true>(pContext);
247 }
248
249 INLINE void QueueDispatch(SWR_CONTEXT* pContext)
250 {
251 QueueWork<false>(pContext);
252 }
253
254 DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
255 {
256 RDTSC_START(APIGetDrawContext);
257 // If current draw context is null then need to obtain a new draw context to use from ring.
258 if (pContext->pCurDrawContext == nullptr)
259 {
260 // Need to wait for a free entry.
261 while (pContext->dcRing.IsFull())
262 {
263 _mm_pause();
264 }
265
266 uint32_t dcIndex = pContext->dcRing.GetHead() % KNOB_MAX_DRAWS_IN_FLIGHT;
267
268 DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
269 pContext->pCurDrawContext = pCurDrawContext;
270
271 // Assign next available entry in DS ring to this DC.
272 uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
273 pCurDrawContext->pState = &pContext->dsRing[dsIndex];
274
275 // Copy previous state to current state.
276 if (pContext->pPrevDrawContext)
277 {
278 DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext;
279
280 // If we're splitting our draw then we can just use the same state from the previous
281 // draw. In this case, we won't increment the DS ring index so the next non-split
282 // draw can receive the state.
283 if (isSplitDraw == false)
284 {
285 CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState);
286
287 // Should have been cleaned up previously
288 SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
289
290 pCurDrawContext->pState->pPrivateState = nullptr;
291
292 pContext->curStateId++; // Progress state ring index forward.
293 }
294 else
295 {
296 // If its a split draw then just copy the state pointer over
297 // since its the same draw.
298 pCurDrawContext->pState = pPrevDrawContext->pState;
299 SWR_ASSERT(pPrevDrawContext->cleanupState == false);
300 }
301 }
302 else
303 {
304 SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
305 pContext->curStateId++; // Progress state ring index forward.
306 }
307
308 SWR_ASSERT(pCurDrawContext->pArena->IsEmpty() == true);
309
310 pCurDrawContext->dependency = 0;
311 pCurDrawContext->pContext = pContext;
312 pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
313
314 pCurDrawContext->doneFE = false;
315 pCurDrawContext->FeLock = 0;
316 pCurDrawContext->threadsDone = 0;
317
318 // Assign unique drawId for this DC
319 pCurDrawContext->drawId = pContext->dcRing.GetHead();
320
321 pCurDrawContext->cleanupState = true;
322 }
323 else
324 {
325 SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC");
326 }
327
328 RDTSC_STOP(APIGetDrawContext, 0, 0);
329 return pContext->pCurDrawContext;
330 }
331
332 API_STATE* GetDrawState(SWR_CONTEXT *pContext)
333 {
334 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
335 SWR_ASSERT(pDC->pState != nullptr);
336
337 return &pDC->pState->state;
338 }
339
340 void SWR_API SwrSaveState(
341 HANDLE hContext,
342 void* pOutputStateBlock,
343 size_t memSize)
344 {
345 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
346 auto pSrc = GetDrawState(pContext);
347 SWR_ASSERT(pOutputStateBlock && memSize >= sizeof(*pSrc));
348
349 memcpy(pOutputStateBlock, pSrc, sizeof(*pSrc));
350 }
351
352 void SWR_API SwrRestoreState(
353 HANDLE hContext,
354 const void* pStateBlock,
355 size_t memSize)
356 {
357 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
358 auto pDst = GetDrawState(pContext);
359 SWR_ASSERT(pStateBlock && memSize >= sizeof(*pDst));
360
361 memcpy(pDst, pStateBlock, sizeof(*pDst));
362 }
363
364 void SetupDefaultState(SWR_CONTEXT *pContext)
365 {
366 API_STATE* pState = GetDrawState(pContext);
367
368 pState->rastState.cullMode = SWR_CULLMODE_NONE;
369 pState->rastState.frontWinding = SWR_FRONTWINDING_CCW;
370 }
371
372 static INLINE SWR_CONTEXT* GetContext(HANDLE hContext)
373 {
374 return (SWR_CONTEXT*)hContext;
375 }
376
377 void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint64_t userData2, uint64_t userData3)
378 {
379 RDTSC_START(APISync);
380
381 SWR_ASSERT(pfnFunc != nullptr);
382
383 SWR_CONTEXT *pContext = GetContext(hContext);
384 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
385
386 pDC->FeWork.type = SYNC;
387 pDC->FeWork.pfnWork = ProcessSync;
388 pDC->FeWork.desc.sync.pfnCallbackFunc = pfnFunc;
389 pDC->FeWork.desc.sync.userData = userData;
390 pDC->FeWork.desc.sync.userData2 = userData2;
391 pDC->FeWork.desc.sync.userData3 = userData3;
392
393 // cannot execute until all previous draws have completed
394 pDC->dependency = pDC->drawId - 1;
395
396 //enqueue
397 QueueDraw(pContext);
398
399 RDTSC_STOP(APISync, 1, 0);
400 }
401
402 void SwrWaitForIdle(HANDLE hContext)
403 {
404 SWR_CONTEXT *pContext = GetContext(hContext);
405
406 RDTSC_START(APIWaitForIdle);
407
408 while (!pContext->dcRing.IsEmpty())
409 {
410 _mm_pause();
411 }
412
413 RDTSC_STOP(APIWaitForIdle, 1, 0);
414 }
415
416 void SwrSetVertexBuffers(
417 HANDLE hContext,
418 uint32_t numBuffers,
419 const SWR_VERTEX_BUFFER_STATE* pVertexBuffers)
420 {
421 API_STATE* pState = GetDrawState(GetContext(hContext));
422
423 for (uint32_t i = 0; i < numBuffers; ++i)
424 {
425 const SWR_VERTEX_BUFFER_STATE *pVB = &pVertexBuffers[i];
426 pState->vertexBuffers[pVB->index] = *pVB;
427 }
428 }
429
430 void SwrSetIndexBuffer(
431 HANDLE hContext,
432 const SWR_INDEX_BUFFER_STATE* pIndexBuffer)
433 {
434 API_STATE* pState = GetDrawState(GetContext(hContext));
435
436 pState->indexBuffer = *pIndexBuffer;
437 }
438
439 void SwrSetFetchFunc(
440 HANDLE hContext,
441 PFN_FETCH_FUNC pfnFetchFunc)
442 {
443 API_STATE* pState = GetDrawState(GetContext(hContext));
444
445 pState->pfnFetchFunc = pfnFetchFunc;
446 }
447
448 void SwrSetSoFunc(
449 HANDLE hContext,
450 PFN_SO_FUNC pfnSoFunc,
451 uint32_t streamIndex)
452 {
453 API_STATE* pState = GetDrawState(GetContext(hContext));
454
455 SWR_ASSERT(streamIndex < MAX_SO_STREAMS);
456
457 pState->pfnSoFunc[streamIndex] = pfnSoFunc;
458 }
459
460 void SwrSetSoState(
461 HANDLE hContext,
462 SWR_STREAMOUT_STATE* pSoState)
463 {
464 API_STATE* pState = GetDrawState(GetContext(hContext));
465
466 pState->soState = *pSoState;
467 }
468
469 void SwrSetSoBuffers(
470 HANDLE hContext,
471 SWR_STREAMOUT_BUFFER* pSoBuffer,
472 uint32_t slot)
473 {
474 API_STATE* pState = GetDrawState(GetContext(hContext));
475
476 SWR_ASSERT((slot < 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot);
477
478 pState->soBuffer[slot] = *pSoBuffer;
479 }
480
481 void SwrSetVertexFunc(
482 HANDLE hContext,
483 PFN_VERTEX_FUNC pfnVertexFunc)
484 {
485 API_STATE* pState = GetDrawState(GetContext(hContext));
486
487 pState->pfnVertexFunc = pfnVertexFunc;
488 }
489
490 void SwrSetFrontendState(
491 HANDLE hContext,
492 SWR_FRONTEND_STATE *pFEState)
493 {
494 API_STATE* pState = GetDrawState(GetContext(hContext));
495 pState->frontendState = *pFEState;
496 }
497
498 void SwrSetGsState(
499 HANDLE hContext,
500 SWR_GS_STATE *pGSState)
501 {
502 API_STATE* pState = GetDrawState(GetContext(hContext));
503 pState->gsState = *pGSState;
504 }
505
506 void SwrSetGsFunc(
507 HANDLE hContext,
508 PFN_GS_FUNC pfnGsFunc)
509 {
510 API_STATE* pState = GetDrawState(GetContext(hContext));
511 pState->pfnGsFunc = pfnGsFunc;
512 }
513
514 void SwrSetCsFunc(
515 HANDLE hContext,
516 PFN_CS_FUNC pfnCsFunc,
517 uint32_t totalThreadsInGroup)
518 {
519 API_STATE* pState = GetDrawState(GetContext(hContext));
520 pState->pfnCsFunc = pfnCsFunc;
521 pState->totalThreadsInGroup = totalThreadsInGroup;
522 }
523
524 void SwrSetTsState(
525 HANDLE hContext,
526 SWR_TS_STATE *pState)
527 {
528 API_STATE* pApiState = GetDrawState(GetContext(hContext));
529 pApiState->tsState = *pState;
530 }
531
532 void SwrSetHsFunc(
533 HANDLE hContext,
534 PFN_HS_FUNC pfnFunc)
535 {
536 API_STATE* pApiState = GetDrawState(GetContext(hContext));
537 pApiState->pfnHsFunc = pfnFunc;
538 }
539
540 void SwrSetDsFunc(
541 HANDLE hContext,
542 PFN_DS_FUNC pfnFunc)
543 {
544 API_STATE* pApiState = GetDrawState(GetContext(hContext));
545 pApiState->pfnDsFunc = pfnFunc;
546 }
547
548 void SwrSetDepthStencilState(
549 HANDLE hContext,
550 SWR_DEPTH_STENCIL_STATE *pDSState)
551 {
552 API_STATE* pState = GetDrawState(GetContext(hContext));
553
554 pState->depthStencilState = *pDSState;
555 }
556
557 void SwrSetBackendState(
558 HANDLE hContext,
559 SWR_BACKEND_STATE *pBEState)
560 {
561 API_STATE* pState = GetDrawState(GetContext(hContext));
562
563 pState->backendState = *pBEState;
564 }
565
566 void SwrSetPixelShaderState(
567 HANDLE hContext,
568 SWR_PS_STATE *pPSState)
569 {
570 API_STATE *pState = GetDrawState(GetContext(hContext));
571 pState->psState = *pPSState;
572 }
573
574 void SwrSetBlendState(
575 HANDLE hContext,
576 SWR_BLEND_STATE *pBlendState)
577 {
578 API_STATE *pState = GetDrawState(GetContext(hContext));
579 memcpy(&pState->blendState, pBlendState, sizeof(SWR_BLEND_STATE));
580 }
581
582 void SwrSetBlendFunc(
583 HANDLE hContext,
584 uint32_t renderTarget,
585 PFN_BLEND_JIT_FUNC pfnBlendFunc)
586 {
587 SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS);
588 API_STATE *pState = GetDrawState(GetContext(hContext));
589 pState->pfnBlendFunc[renderTarget] = pfnBlendFunc;
590 }
591
592 void SwrSetLinkage(
593 HANDLE hContext,
594 uint32_t mask,
595 const uint8_t* pMap)
596 {
597 API_STATE* pState = GetDrawState(GetContext(hContext));
598
599 static const uint8_t IDENTITY_MAP[] =
600 {
601 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
602 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
603 };
604 static_assert(sizeof(IDENTITY_MAP) == sizeof(pState->linkageMap),
605 "Update for new value of MAX_ATTRIBUTES");
606
607 pState->linkageMask = mask;
608 pState->linkageCount = _mm_popcnt_u32(mask);
609
610 if (!pMap)
611 {
612 pMap = IDENTITY_MAP;
613 }
614 memcpy(pState->linkageMap, pMap, pState->linkageCount);
615 }
616
617 // update guardband multipliers for the viewport
618 void updateGuardband(API_STATE *pState)
619 {
620 // guardband center is viewport center
621 pState->gbState.left = KNOB_GUARDBAND_WIDTH / pState->vp[0].width;
622 pState->gbState.right = KNOB_GUARDBAND_WIDTH / pState->vp[0].width;
623 pState->gbState.top = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height;
624 pState->gbState.bottom = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height;
625 }
626
627 void SwrSetRastState(
628 HANDLE hContext,
629 const SWR_RASTSTATE *pRastState)
630 {
631 SWR_CONTEXT *pContext = GetContext(hContext);
632 API_STATE* pState = GetDrawState(pContext);
633
634 memcpy(&pState->rastState, pRastState, sizeof(SWR_RASTSTATE));
635 }
636
637 void SwrSetViewports(
638 HANDLE hContext,
639 uint32_t numViewports,
640 const SWR_VIEWPORT* pViewports,
641 const SWR_VIEWPORT_MATRIX* pMatrices)
642 {
643 SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS,
644 "Invalid number of viewports.");
645
646 SWR_CONTEXT *pContext = GetContext(hContext);
647 API_STATE* pState = GetDrawState(pContext);
648
649 memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports);
650
651 if (pMatrices != nullptr)
652 {
653 memcpy(&pState->vpMatrix[0], pMatrices, sizeof(SWR_VIEWPORT_MATRIX) * numViewports);
654 }
655 else
656 {
657 // Compute default viewport transform.
658 for (uint32_t i = 0; i < numViewports; ++i)
659 {
660 if (pContext->driverType == DX)
661 {
662 pState->vpMatrix[i].m00 = pState->vp[i].width / 2.0f;
663 pState->vpMatrix[i].m11 = -pState->vp[i].height / 2.0f;
664 pState->vpMatrix[i].m22 = pState->vp[i].maxZ - pState->vp[i].minZ;
665 pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00;
666 pState->vpMatrix[i].m31 = pState->vp[i].y - pState->vpMatrix[i].m11;
667 pState->vpMatrix[i].m32 = pState->vp[i].minZ;
668 }
669 else
670 {
671 // Standard, with the exception that Y is inverted.
672 pState->vpMatrix[i].m00 = (pState->vp[i].width - pState->vp[i].x) / 2.0f;
673 pState->vpMatrix[i].m11 = (pState->vp[i].y - pState->vp[i].height) / 2.0f;
674 pState->vpMatrix[i].m22 = (pState->vp[i].maxZ - pState->vp[i].minZ) / 2.0f;
675 pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00;
676 pState->vpMatrix[i].m31 = pState->vp[i].height + pState->vpMatrix[i].m11;
677 pState->vpMatrix[i].m32 = pState->vp[i].minZ + pState->vpMatrix[i].m22;
678
679 // Now that the matrix is calculated, clip the view coords to screen size.
680 // OpenGL allows for -ve x,y in the viewport.
681 pState->vp[i].x = std::max(pState->vp[i].x, 0.0f);
682 pState->vp[i].y = std::max(pState->vp[i].y, 0.0f);
683 }
684 }
685 }
686
687 updateGuardband(pState);
688 }
689
690 void SwrSetScissorRects(
691 HANDLE hContext,
692 uint32_t numScissors,
693 const BBOX* pScissors)
694 {
695 SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS,
696 "Invalid number of scissor rects.");
697
698 API_STATE* pState = GetDrawState(GetContext(hContext));
699 memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(BBOX));
700 };
701
702 void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
703 {
704 API_STATE *pState = &pDC->pState->state;
705 uint32_t left, right, top, bottom;
706
707 // Set up scissor dimensions based on scissor or viewport
708 if (pState->rastState.scissorEnable)
709 {
710 // scissor rect right/bottom edge are exclusive, core expects scissor dimensions to be inclusive, so subtract one pixel from right/bottom edges
711 left = pState->scissorRects[0].left;
712 right = pState->scissorRects[0].right;
713 top = pState->scissorRects[0].top;
714 bottom = pState->scissorRects[0].bottom;
715 }
716 else
717 {
718 left = (int32_t)pState->vp[0].x;
719 right = (int32_t)pState->vp[0].x + (int32_t)pState->vp[0].width;
720 top = (int32_t)pState->vp[0].y;
721 bottom = (int32_t)pState->vp[0].y + (int32_t)pState->vp[0].height;
722 }
723
724 right = std::min<uint32_t>(right, KNOB_MAX_SCISSOR_X);
725 bottom = std::min<uint32_t>(bottom, KNOB_MAX_SCISSOR_Y);
726
727 if (left > KNOB_MAX_SCISSOR_X || top > KNOB_MAX_SCISSOR_Y)
728 {
729 pState->scissorInFixedPoint.left = 0;
730 pState->scissorInFixedPoint.right = 0;
731 pState->scissorInFixedPoint.top = 0;
732 pState->scissorInFixedPoint.bottom = 0;
733 }
734 else
735 {
736 pState->scissorInFixedPoint.left = left * FIXED_POINT_SCALE;
737 pState->scissorInFixedPoint.right = right * FIXED_POINT_SCALE - 1;
738 pState->scissorInFixedPoint.top = top * FIXED_POINT_SCALE;
739 pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1;
740 }
741 }
742 // templated backend function tables
743 extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
744 extern PFN_BACKEND_FUNC gBackendSingleSample[2][2];
745 extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2];
746 extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2];
747 extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS + 1][SWR_MULTISAMPLE_TYPE_MAX];
748 extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2];
749 extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2];
750 extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2];
751 void SetupPipeline(DRAW_CONTEXT *pDC)
752 {
753 DRAW_STATE* pState = pDC->pState;
754 const SWR_RASTSTATE &rastState = pState->state.rastState;
755 const SWR_PS_STATE &psState = pState->state.psState;
756 BACKEND_FUNCS& backendFuncs = pState->backendFuncs;
757 const uint32_t forcedSampleCount = (rastState.bForcedSampleCount) ? 1 : 0;
758
759 // setup backend
760 if (psState.pfnPixelShader == nullptr)
761 {
762 backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount];
763 // always need to generate I & J per sample for Z interpolation
764 backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[1];
765 }
766 else
767 {
768 const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.bForcedSampleCount) ? 1 : 0;
769 const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
770
771 // currently only support 'normal' input coverage
772 SWR_ASSERT(psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL ||
773 psState.inputCoverage == SWR_INPUT_COVERAGE_NONE);
774
775 SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask;
776
777 // select backend function
778 switch(psState.shadingRate)
779 {
780 case SWR_SHADING_RATE_PIXEL:
781 if(bMultisampleEnable)
782 {
783 // always need to generate I & J per sample for Z interpolation
784 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
785 backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount];
786 backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount];
787 }
788 else
789 {
790 // always need to generate I & J per pixel for Z interpolation
791 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK);
792 backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid];
793 backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][SWR_MULTISAMPLE_1X];
794 }
795 break;
796 case SWR_SHADING_RATE_SAMPLE:
797 SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN);
798 // always need to generate I & J per sample for Z interpolation
799 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
800 backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid];
801 backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount];
802 break;
803 default:
804 SWR_ASSERT(0 && "Invalid shading rate");
805 break;
806 }
807
808 // setup pointer to function that generates necessary barycentrics required by the PS
809 bool bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_PER_PIXEL_MASK) > 0 ? 1 : 0;
810 backendFuncs.pfnCalcPixelBarycentrics = gPixelBarycentricTable[bBarycentrics];
811
812 bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_PER_SAMPLE_MASK) > 0 ? 1 : 0;
813 backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[bBarycentrics];
814
815 bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0 ? 1 : 0;
816 backendFuncs.pfnCalcCentroidBarycentrics = gCentroidBarycentricTable[rastState.sampleCount][bBarycentrics][rastState.samplePattern][forcedSampleCount];
817 }
818
819 PFN_PROCESS_PRIMS pfnBinner;
820 switch (pState->state.topology)
821 {
822 case TOP_POINT_LIST:
823 pState->pfnProcessPrims = ClipPoints;
824 pfnBinner = BinPoints;
825 break;
826 case TOP_LINE_LIST:
827 case TOP_LINE_STRIP:
828 case TOP_LINE_LOOP:
829 case TOP_LINE_LIST_ADJ:
830 case TOP_LISTSTRIP_ADJ:
831 pState->pfnProcessPrims = ClipLines;
832 pfnBinner = BinLines;
833 break;
834 default:
835 pState->pfnProcessPrims = ClipTriangles;
836 pfnBinner = BinTriangles;
837 break;
838 };
839
840 // disable clipper if viewport transform is disabled
841 if (pState->state.frontendState.vpTransformDisable)
842 {
843 pState->pfnProcessPrims = pfnBinner;
844 }
845
846 if ((pState->state.psState.pfnPixelShader == nullptr) &&
847 (pState->state.depthStencilState.depthTestEnable == FALSE) &&
848 (pState->state.depthStencilState.depthWriteEnable == FALSE) &&
849 (pState->state.depthStencilState.stencilTestEnable == FALSE) &&
850 (pState->state.depthStencilState.stencilWriteEnable == FALSE) &&
851 (pState->state.linkageCount == 0))
852 {
853 pState->pfnProcessPrims = nullptr;
854 pState->state.linkageMask = 0;
855 }
856
857 if (pState->state.soState.rasterizerDisable == true)
858 {
859 pState->pfnProcessPrims = nullptr;
860 pState->state.linkageMask = 0;
861 }
862
863 // set up the frontend attrib mask
864 pState->state.feAttribMask = pState->state.linkageMask;
865 if (pState->state.soState.soEnable)
866 {
867 for (uint32_t i = 0; i < 4; ++i)
868 {
869 pState->state.feAttribMask |= pState->state.soState.streamMasks[i];
870 }
871 }
872
873 // complicated logic to test for cases where we don't need backing hottile memory for a draw
874 // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled.
875 pState->state.depthHottileEnable = ((!(pState->state.depthStencilState.depthTestEnable &&
876 !pState->state.depthStencilState.depthWriteEnable &&
877 pState->state.depthStencilState.depthTestFunc == ZFUNC_ALWAYS)) &&
878 (pState->state.depthStencilState.depthTestEnable ||
879 pState->state.depthStencilState.depthWriteEnable)) ? true : false;
880
881 pState->state.stencilHottileEnable = (((!(pState->state.depthStencilState.stencilTestEnable &&
882 !pState->state.depthStencilState.stencilWriteEnable &&
883 pState->state.depthStencilState.stencilTestFunc == ZFUNC_ALWAYS)) ||
884 // for stencil we have to check the double sided state as well
885 (!(pState->state.depthStencilState.doubleSidedStencilTestEnable &&
886 !pState->state.depthStencilState.stencilWriteEnable &&
887 pState->state.depthStencilState.backfaceStencilTestFunc == ZFUNC_ALWAYS))) &&
888 (pState->state.depthStencilState.stencilTestEnable ||
889 pState->state.depthStencilState.stencilWriteEnable)) ? true : false;
890
891 uint32_t numRTs = pState->state.psState.numRenderTargets;
892 pState->state.colorHottileEnable = 0;
893 if (psState.pfnPixelShader != nullptr)
894 {
895 for (uint32_t rt = 0; rt < numRTs; ++rt)
896 {
897 pState->state.colorHottileEnable |=
898 (!pState->state.blendState.renderTarget[rt].writeDisableAlpha ||
899 !pState->state.blendState.renderTarget[rt].writeDisableRed ||
900 !pState->state.blendState.renderTarget[rt].writeDisableGreen ||
901 !pState->state.blendState.renderTarget[rt].writeDisableBlue) ? (1 << rt) : 0;
902 }
903 }
904
905 // Setup depth quantization function
906 if (pState->state.depthHottileEnable)
907 {
908 switch (pState->state.rastState.depthFormat)
909 {
910 case R32_FLOAT_X8X24_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT_X8X24_TYPELESS > ; break;
911 case R32_FLOAT: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ; break;
912 case R24_UNORM_X8_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R24_UNORM_X8_TYPELESS > ; break;
913 case R16_UNORM: pState->state.pfnQuantizeDepth = QuantizeDepth < R16_UNORM > ; break;
914 default: SWR_ASSERT(false, "Unsupported depth format for depth quantiztion.");
915 pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ;
916 }
917 }
918 else
919 {
920 // set up pass-through quantize if depth isn't enabled
921 pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ;
922 }
923 }
924
925 //////////////////////////////////////////////////////////////////////////
926 /// @brief InitDraw
927 /// @param pDC - Draw context to initialize for this draw.
928 void InitDraw(
929 DRAW_CONTEXT *pDC,
930 bool isSplitDraw)
931 {
932 // We don't need to re-setup the scissors/pipeline state again for split draw.
933 if (isSplitDraw == false)
934 {
935 SetupMacroTileScissors(pDC);
936 SetupPipeline(pDC);
937 }
938 }
939
940 //////////////////////////////////////////////////////////////////////////
941 /// @brief We can split the draw for certain topologies for better performance.
942 /// @param totalVerts - Total vertices for draw
943 /// @param topology - Topology used for draw
944 uint32_t MaxVertsPerDraw(
945 DRAW_CONTEXT* pDC,
946 uint32_t totalVerts,
947 PRIMITIVE_TOPOLOGY topology)
948 {
949 API_STATE& state = pDC->pState->state;
950
951 uint32_t vertsPerDraw = totalVerts;
952
953 if (state.soState.soEnable)
954 {
955 return totalVerts;
956 }
957
958 switch (topology)
959 {
960 case TOP_POINT_LIST:
961 case TOP_TRIANGLE_LIST:
962 vertsPerDraw = KNOB_MAX_PRIMS_PER_DRAW;
963 break;
964
965 case TOP_PATCHLIST_1:
966 case TOP_PATCHLIST_2:
967 case TOP_PATCHLIST_3:
968 case TOP_PATCHLIST_4:
969 case TOP_PATCHLIST_5:
970 case TOP_PATCHLIST_6:
971 case TOP_PATCHLIST_7:
972 case TOP_PATCHLIST_8:
973 case TOP_PATCHLIST_9:
974 case TOP_PATCHLIST_10:
975 case TOP_PATCHLIST_11:
976 case TOP_PATCHLIST_12:
977 case TOP_PATCHLIST_13:
978 case TOP_PATCHLIST_14:
979 case TOP_PATCHLIST_15:
980 case TOP_PATCHLIST_16:
981 case TOP_PATCHLIST_17:
982 case TOP_PATCHLIST_18:
983 case TOP_PATCHLIST_19:
984 case TOP_PATCHLIST_20:
985 case TOP_PATCHLIST_21:
986 case TOP_PATCHLIST_22:
987 case TOP_PATCHLIST_23:
988 case TOP_PATCHLIST_24:
989 case TOP_PATCHLIST_25:
990 case TOP_PATCHLIST_26:
991 case TOP_PATCHLIST_27:
992 case TOP_PATCHLIST_28:
993 case TOP_PATCHLIST_29:
994 case TOP_PATCHLIST_30:
995 case TOP_PATCHLIST_31:
996 case TOP_PATCHLIST_32:
997 if (pDC->pState->state.tsState.tsEnable)
998 {
999 uint32_t vertsPerPrim = topology - TOP_PATCHLIST_BASE;
1000 vertsPerDraw = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW;
1001 }
1002 break;
1003
1004 // The Primitive Assembly code can only handle 1 RECT at a time.
1005 case TOP_RECT_LIST:
1006 vertsPerDraw = 3;
1007 break;
1008
1009 default:
1010 // We are not splitting up draws for other topologies.
1011 break;
1012 }
1013
1014 return vertsPerDraw;
1015 }
1016
1017 // Recursive template used to auto-nest conditionals. Converts dynamic boolean function
1018 // arguments to static template arguments.
1019 template <bool... ArgsB>
1020 struct FEDrawChooser
1021 {
1022 // Last Arg Terminator
1023 static PFN_FE_WORK_FUNC GetFunc(bool bArg)
1024 {
1025 if (bArg)
1026 {
1027 return ProcessDraw<ArgsB..., true>;
1028 }
1029
1030 return ProcessDraw<ArgsB..., false>;
1031 }
1032
1033 // Recursively parse args
1034 template <typename... TArgsT>
1035 static PFN_FE_WORK_FUNC GetFunc(bool bArg, TArgsT... remainingArgs)
1036 {
1037 if (bArg)
1038 {
1039 return FEDrawChooser<ArgsB..., true>::GetFunc(remainingArgs...);
1040 }
1041
1042 return FEDrawChooser<ArgsB..., false>::GetFunc(remainingArgs...);
1043 }
1044 };
1045
1046 // Selector for correct templated Draw front-end function
1047 INLINE
1048 static PFN_FE_WORK_FUNC GetFEDrawFunc(bool IsIndexed, bool HasTessellation, bool HasGeometryShader, bool HasStreamOut, bool RasterizerEnabled)
1049 {
1050 return FEDrawChooser<>::GetFunc(IsIndexed, HasTessellation, HasGeometryShader, HasStreamOut, RasterizerEnabled);
1051 }
1052
1053
1054 //////////////////////////////////////////////////////////////////////////
1055 /// @brief DrawInstanced
1056 /// @param hContext - Handle passed back from SwrCreateContext
1057 /// @param topology - Specifies topology for draw.
1058 /// @param numVerts - How many vertices to read sequentially from vertex data (per instance).
1059 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1060 /// @param numInstances - How many instances to render.
1061 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1062 void DrawInstanced(
1063 HANDLE hContext,
1064 PRIMITIVE_TOPOLOGY topology,
1065 uint32_t numVertices,
1066 uint32_t startVertex,
1067 uint32_t numInstances = 1,
1068 uint32_t startInstance = 0)
1069 {
1070 if (KNOB_TOSS_DRAW)
1071 {
1072 return;
1073 }
1074
1075 RDTSC_START(APIDraw);
1076
1077 SWR_CONTEXT *pContext = GetContext(hContext);
1078 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1079
1080 int32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
1081 uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw);
1082 int32_t remainingVerts = numVertices;
1083
1084 API_STATE *pState = &pDC->pState->state;
1085 pState->topology = topology;
1086 pState->forceFront = false;
1087
1088 // disable culling for points/lines
1089 uint32_t oldCullMode = pState->rastState.cullMode;
1090 if (topology == TOP_POINT_LIST)
1091 {
1092 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1093 pState->forceFront = true;
1094 }
1095
1096 int draw = 0;
1097 while (remainingVerts)
1098 {
1099 uint32_t numVertsForDraw = (remainingVerts < maxVertsPerDraw) ?
1100 remainingVerts : maxVertsPerDraw;
1101
1102 bool isSplitDraw = (draw > 0) ? true : false;
1103 DRAW_CONTEXT* pDC = GetDrawContext(pContext, isSplitDraw);
1104 InitDraw(pDC, isSplitDraw);
1105
1106 pDC->FeWork.type = DRAW;
1107 pDC->FeWork.pfnWork = GetFEDrawFunc(
1108 false, // IsIndexed
1109 pState->tsState.tsEnable,
1110 pState->gsState.gsEnable,
1111 pState->soState.soEnable,
1112 pDC->pState->pfnProcessPrims != nullptr);
1113 pDC->FeWork.desc.draw.numVerts = numVertsForDraw;
1114 pDC->FeWork.desc.draw.startVertex = startVertex;
1115 pDC->FeWork.desc.draw.numInstances = numInstances;
1116 pDC->FeWork.desc.draw.startInstance = startInstance;
1117 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
1118 pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw;
1119
1120 pDC->cleanupState = (remainingVerts == numVertsForDraw);
1121
1122 //enqueue DC
1123 QueueDraw(pContext);
1124
1125 remainingVerts -= numVertsForDraw;
1126 draw++;
1127 }
1128
1129 // restore culling state
1130 pDC = GetDrawContext(pContext);
1131 pDC->pState->state.rastState.cullMode = oldCullMode;
1132
1133 RDTSC_STOP(APIDraw, numVertices * numInstances, 0);
1134 }
1135
1136 //////////////////////////////////////////////////////////////////////////
1137 /// @brief SwrDraw
1138 /// @param hContext - Handle passed back from SwrCreateContext
1139 /// @param topology - Specifies topology for draw.
1140 /// @param startVertex - Specifies start vertex in vertex buffer for draw.
1141 /// @param primCount - Number of vertices.
1142 void SwrDraw(
1143 HANDLE hContext,
1144 PRIMITIVE_TOPOLOGY topology,
1145 uint32_t startVertex,
1146 uint32_t numVertices)
1147 {
1148 DrawInstanced(hContext, topology, numVertices, startVertex);
1149 }
1150
1151 //////////////////////////////////////////////////////////////////////////
1152 /// @brief SwrDrawInstanced
1153 /// @param hContext - Handle passed back from SwrCreateContext
1154 /// @param topology - Specifies topology for draw.
1155 /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
1156 /// @param numInstances - How many instances to render.
1157 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1158 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1159 void SwrDrawInstanced(
1160 HANDLE hContext,
1161 PRIMITIVE_TOPOLOGY topology,
1162 uint32_t numVertsPerInstance,
1163 uint32_t numInstances,
1164 uint32_t startVertex,
1165 uint32_t startInstance
1166 )
1167 {
1168 DrawInstanced(hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance);
1169 }
1170
1171 //////////////////////////////////////////////////////////////////////////
1172 /// @brief DrawIndexedInstanced
1173 /// @param hContext - Handle passed back from SwrCreateContext
1174 /// @param topology - Specifies topology for draw.
1175 /// @param numIndices - Number of indices to read sequentially from index buffer.
1176 /// @param indexOffset - Starting index into index buffer.
1177 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1178 /// @param numInstances - Number of instances to render.
1179 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1180 void DrawIndexedInstance(
1181 HANDLE hContext,
1182 PRIMITIVE_TOPOLOGY topology,
1183 uint32_t numIndices,
1184 uint32_t indexOffset,
1185 int32_t baseVertex,
1186 uint32_t numInstances = 1,
1187 uint32_t startInstance = 0)
1188 {
1189 if (KNOB_TOSS_DRAW)
1190 {
1191 return;
1192 }
1193
1194 RDTSC_START(APIDrawIndexed);
1195
1196 SWR_CONTEXT *pContext = GetContext(hContext);
1197 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1198 API_STATE* pState = &pDC->pState->state;
1199
1200 int32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
1201 uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw);
1202 int32_t remainingIndices = numIndices;
1203
1204 uint32_t indexSize = 0;
1205 switch (pState->indexBuffer.format)
1206 {
1207 case R32_UINT: indexSize = sizeof(uint32_t); break;
1208 case R16_UINT: indexSize = sizeof(uint16_t); break;
1209 case R8_UINT: indexSize = sizeof(uint8_t); break;
1210 default:
1211 SWR_ASSERT(0);
1212 }
1213
1214 int draw = 0;
1215 uint8_t *pIB = (uint8_t*)pState->indexBuffer.pIndices;
1216 pIB += (uint64_t)indexOffset * (uint64_t)indexSize;
1217
1218 pState->topology = topology;
1219 pState->forceFront = false;
1220
1221 // disable culling for points/lines
1222 uint32_t oldCullMode = pState->rastState.cullMode;
1223 if (topology == TOP_POINT_LIST)
1224 {
1225 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1226 pState->forceFront = true;
1227 }
1228
1229 while (remainingIndices)
1230 {
1231 uint32_t numIndicesForDraw = (remainingIndices < maxIndicesPerDraw) ?
1232 remainingIndices : maxIndicesPerDraw;
1233
1234 // When breaking up draw, we need to obtain new draw context for each iteration.
1235 bool isSplitDraw = (draw > 0) ? true : false;
1236 pDC = GetDrawContext(pContext, isSplitDraw);
1237 InitDraw(pDC, isSplitDraw);
1238
1239 pDC->FeWork.type = DRAW;
1240 pDC->FeWork.pfnWork = GetFEDrawFunc(
1241 true, // IsIndexed
1242 pState->tsState.tsEnable,
1243 pState->gsState.gsEnable,
1244 pState->soState.soEnable,
1245 pDC->pState->pfnProcessPrims != nullptr);
1246 pDC->FeWork.desc.draw.pDC = pDC;
1247 pDC->FeWork.desc.draw.numIndices = numIndicesForDraw;
1248 pDC->FeWork.desc.draw.pIB = (int*)pIB;
1249 pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format;
1250
1251 pDC->FeWork.desc.draw.numInstances = numInstances;
1252 pDC->FeWork.desc.draw.startInstance = startInstance;
1253 pDC->FeWork.desc.draw.baseVertex = baseVertex;
1254 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
1255
1256 pDC->cleanupState = (remainingIndices == numIndicesForDraw);
1257
1258 //enqueue DC
1259 QueueDraw(pContext);
1260
1261 pIB += maxIndicesPerDraw * indexSize;
1262 remainingIndices -= numIndicesForDraw;
1263 draw++;
1264 }
1265
1266 // restore culling state
1267 pDC = GetDrawContext(pContext);
1268 pDC->pState->state.rastState.cullMode = oldCullMode;
1269
1270 RDTSC_STOP(APIDrawIndexed, numIndices * numInstances, 0);
1271 }
1272
1273
1274 //////////////////////////////////////////////////////////////////////////
1275 /// @brief DrawIndexed
1276 /// @param hContext - Handle passed back from SwrCreateContext
1277 /// @param topology - Specifies topology for draw.
1278 /// @param numIndices - Number of indices to read sequentially from index buffer.
1279 /// @param indexOffset - Starting index into index buffer.
1280 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1281 void SwrDrawIndexed(
1282 HANDLE hContext,
1283 PRIMITIVE_TOPOLOGY topology,
1284 uint32_t numIndices,
1285 uint32_t indexOffset,
1286 int32_t baseVertex
1287 )
1288 {
1289 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex);
1290 }
1291
1292 //////////////////////////////////////////////////////////////////////////
1293 /// @brief SwrDrawIndexedInstanced
1294 /// @param hContext - Handle passed back from SwrCreateContext
1295 /// @param topology - Specifies topology for draw.
1296 /// @param numIndices - Number of indices to read sequentially from index buffer.
1297 /// @param numInstances - Number of instances to render.
1298 /// @param indexOffset - Starting index into index buffer.
1299 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1300 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1301 void SwrDrawIndexedInstanced(
1302 HANDLE hContext,
1303 PRIMITIVE_TOPOLOGY topology,
1304 uint32_t numIndices,
1305 uint32_t numInstances,
1306 uint32_t indexOffset,
1307 int32_t baseVertex,
1308 uint32_t startInstance)
1309 {
1310 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance);
1311 }
1312
1313 //////////////////////////////////////////////////////////////////////////
1314 /// @brief SwrInvalidateTiles
1315 /// @param hContext - Handle passed back from SwrCreateContext
1316 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate.
1317 void SwrInvalidateTiles(
1318 HANDLE hContext,
1319 uint32_t attachmentMask)
1320 {
1321 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1322 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1323
1324 pDC->FeWork.type = DISCARDINVALIDATETILES;
1325 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
1326 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
1327 memset(&pDC->FeWork.desc.discardInvalidateTiles.rect, 0, sizeof(SWR_RECT));
1328 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_INVALID;
1329 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false;
1330 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = false;
1331
1332 //enqueue
1333 QueueDraw(pContext);
1334 }
1335
1336 //////////////////////////////////////////////////////////////////////////
1337 /// @brief SwrDiscardRect
1338 /// @param hContext - Handle passed back from SwrCreateContext
1339 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
1340 /// @param rect - if rect is all zeros, the entire attachment surface will be discarded
1341 void SwrDiscardRect(
1342 HANDLE hContext,
1343 uint32_t attachmentMask,
1344 SWR_RECT rect)
1345 {
1346 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1347 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1348
1349 // Queue a load to the hottile
1350 pDC->FeWork.type = DISCARDINVALIDATETILES;
1351 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
1352 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
1353 pDC->FeWork.desc.discardInvalidateTiles.rect = rect;
1354 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_RESOLVED;
1355 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true;
1356 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = true;
1357
1358 //enqueue
1359 QueueDraw(pContext);
1360 }
1361
1362 //////////////////////////////////////////////////////////////////////////
1363 /// @brief SwrDispatch
1364 /// @param hContext - Handle passed back from SwrCreateContext
1365 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
1366 /// @param threadGroupCountY - Number of thread groups dispatched in Y direction
1367 /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
1368 void SwrDispatch(
1369 HANDLE hContext,
1370 uint32_t threadGroupCountX,
1371 uint32_t threadGroupCountY,
1372 uint32_t threadGroupCountZ)
1373 {
1374 if (KNOB_TOSS_DRAW)
1375 {
1376 return;
1377 }
1378
1379 RDTSC_START(APIDispatch);
1380 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1381 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1382
1383 pDC->isCompute = true; // This is a compute context.
1384
1385 COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64);
1386
1387 pTaskData->threadGroupCountX = threadGroupCountX;
1388 pTaskData->threadGroupCountY = threadGroupCountY;
1389 pTaskData->threadGroupCountZ = threadGroupCountZ;
1390
1391 uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
1392 uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
1393 pDC->pDispatch = &pContext->pDispatchQueueArray[dcIndex];
1394 pDC->pDispatch->initialize(totalThreadGroups, pTaskData);
1395
1396 QueueDispatch(pContext);
1397 RDTSC_STOP(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ, 0);
1398 }
1399
1400 // Deswizzles, converts and stores current contents of the hot tiles to surface
1401 // described by pState
1402 void SwrStoreTiles(
1403 HANDLE hContext,
1404 SWR_RENDERTARGET_ATTACHMENT attachment,
1405 SWR_TILE_STATE postStoreTileState)
1406 {
1407 RDTSC_START(APIStoreTiles);
1408
1409 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1410 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1411
1412 SetupMacroTileScissors(pDC);
1413
1414 pDC->FeWork.type = STORETILES;
1415 pDC->FeWork.pfnWork = ProcessStoreTiles;
1416 pDC->FeWork.desc.storeTiles.attachment = attachment;
1417 pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState;
1418
1419 //enqueue
1420 QueueDraw(pContext);
1421
1422 RDTSC_STOP(APIStoreTiles, 0, 0);
1423 }
1424
1425 void SwrClearRenderTarget(
1426 HANDLE hContext,
1427 uint32_t clearMask,
1428 const float clearColor[4],
1429 float z,
1430 uint8_t stencil)
1431 {
1432 RDTSC_START(APIClearRenderTarget);
1433
1434 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1435
1436 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1437
1438 SetupMacroTileScissors(pDC);
1439
1440 CLEAR_FLAGS flags;
1441 flags.mask = clearMask;
1442
1443 pDC->FeWork.type = CLEAR;
1444 pDC->FeWork.pfnWork = ProcessClear;
1445 pDC->FeWork.desc.clear.flags = flags;
1446 pDC->FeWork.desc.clear.clearDepth = z;
1447 pDC->FeWork.desc.clear.clearRTColor[0] = clearColor[0];
1448 pDC->FeWork.desc.clear.clearRTColor[1] = clearColor[1];
1449 pDC->FeWork.desc.clear.clearRTColor[2] = clearColor[2];
1450 pDC->FeWork.desc.clear.clearRTColor[3] = clearColor[3];
1451 pDC->FeWork.desc.clear.clearStencil = stencil;
1452
1453 // enqueue draw
1454 QueueDraw(pContext);
1455
1456 RDTSC_STOP(APIClearRenderTarget, 0, pDC->drawId);
1457 }
1458
1459 //////////////////////////////////////////////////////////////////////////
1460 /// @brief Returns a pointer to the private context state for the current
1461 /// draw operation. This is used for external componets such as the
1462 /// sampler.
1463 /// SWR is responsible for the allocation of the private context state.
1464 /// @param hContext - Handle passed back from SwrCreateContext
1465 VOID* SwrGetPrivateContextState(
1466 HANDLE hContext)
1467 {
1468 SWR_CONTEXT* pContext = GetContext(hContext);
1469 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1470 DRAW_STATE* pState = pDC->pState;
1471
1472 if (pState->pPrivateState == nullptr)
1473 {
1474 pState->pPrivateState = pState->pArena->AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float));
1475 }
1476
1477 return pState->pPrivateState;
1478 }
1479
1480 //////////////////////////////////////////////////////////////////////////
1481 /// @brief Clients can use this to allocate memory for draw/dispatch
1482 /// operations. The memory will automatically be freed once operation
1483 /// has completed. Client can use this to allocate binding tables,
1484 /// etc. needed for shader execution.
1485 /// @param hContext - Handle passed back from SwrCreateContext
1486 /// @param size - Size of allocation
1487 /// @param align - Alignment needed for allocation.
1488 VOID* SwrAllocDrawContextMemory(
1489 HANDLE hContext,
1490 uint32_t size,
1491 uint32_t align)
1492 {
1493 SWR_CONTEXT* pContext = GetContext(hContext);
1494 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1495
1496 return pDC->pState->pArena->AllocAligned(size, align);
1497 }
1498
1499 //////////////////////////////////////////////////////////////////////////
1500 /// @brief Returns pointer to SWR stats.
1501 /// @note The counters are atomically incremented by multiple threads.
1502 /// When calling this, you need to ensure all previous operations
1503 /// have completed.
1504 /// @todo If necessary, add a callback to avoid stalling the pipe to
1505 /// sample the counters.
1506 /// @param hContext - Handle passed back from SwrCreateContext
1507 /// @param pStats - SWR will fill this out for caller.
1508 void SwrGetStats(
1509 HANDLE hContext,
1510 SWR_STATS* pStats)
1511 {
1512 SWR_CONTEXT *pContext = GetContext(hContext);
1513 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1514
1515 pDC->FeWork.type = QUERYSTATS;
1516 pDC->FeWork.pfnWork = ProcessQueryStats;
1517 pDC->FeWork.desc.queryStats.pStats = pStats;
1518
1519 // cannot execute until all previous draws have completed
1520 pDC->dependency = pDC->drawId - 1;
1521
1522 //enqueue
1523 QueueDraw(pContext);
1524 }
1525
1526 //////////////////////////////////////////////////////////////////////////
1527 /// @brief Enables stats counting
1528 /// @param hContext - Handle passed back from SwrCreateContext
1529 /// @param enable - If true then counts are incremented.
1530 void SwrEnableStats(
1531 HANDLE hContext,
1532 bool enable)
1533 {
1534 SWR_CONTEXT *pContext = GetContext(hContext);
1535 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1536
1537 pDC->pState->state.enableStats = enable;
1538 }
1539
1540 //////////////////////////////////////////////////////////////////////////
1541 /// @brief Mark end of frame - used for performance profiling
1542 /// @param hContext - Handle passed back from SwrCreateContext
1543 void SWR_API SwrEndFrame(
1544 HANDLE hContext)
1545 {
1546 RDTSC_ENDFRAME();
1547 }