swr: [rasterizer core] TemplateArgUnroller
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / api.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file api.cpp
24 *
25 * @brief API implementation
26 *
27 ******************************************************************************/
28
29 #include <cfloat>
30 #include <cmath>
31 #include <cstdio>
32 #include <new>
33
34 #include "core/api.h"
35 #include "core/backend.h"
36 #include "core/context.h"
37 #include "core/depthstencil.h"
38 #include "core/frontend.h"
39 #include "core/rasterizer.h"
40 #include "core/rdtsc_core.h"
41 #include "core/threads.h"
42 #include "core/tilemgr.h"
43 #include "core/clip.h"
44 #include "core/utils.h"
45
46 #include "common/simdintrin.h"
47 #include "common/os.h"
48
49 void SetupDefaultState(SWR_CONTEXT *pContext);
50
51 //////////////////////////////////////////////////////////////////////////
52 /// @brief Create SWR Context.
53 /// @param pCreateInfo - pointer to creation info.
54 HANDLE SwrCreateContext(
55 SWR_CREATECONTEXT_INFO* pCreateInfo)
56 {
57 RDTSC_RESET();
58 RDTSC_INIT(0);
59
60 void* pContextMem = _aligned_malloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4);
61 memset(pContextMem, 0, sizeof(SWR_CONTEXT));
62 SWR_CONTEXT *pContext = new (pContextMem) SWR_CONTEXT();
63
64 pContext->driverType = pCreateInfo->driver;
65 pContext->privateStateSize = pCreateInfo->privateStateSize;
66
67 pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
68 pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
69
70 pContext->pMacroTileManagerArray = (MacroTileMgr*)_aligned_malloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
71 pContext->pDispatchQueueArray = (DispatchQueue*)_aligned_malloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
72
73 for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
74 {
75 pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
76 new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena);
77 new (&pContext->pDispatchQueueArray[dc]) DispatchQueue();
78
79 pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
80 }
81
82 if (!KNOB_SINGLE_THREADED)
83 {
84 memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
85 memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
86 new (&pContext->WaitLock) std::mutex();
87 new (&pContext->FifosNotEmpty) std::condition_variable();
88
89 CreateThreadPool(pContext, &pContext->threadPool);
90 }
91
92 // Calling createThreadPool() above can set SINGLE_THREADED
93 if (KNOB_SINGLE_THREADED)
94 {
95 SET_KNOB(HYPERTHREADED_FE, false);
96 pContext->NumWorkerThreads = 1;
97 pContext->NumFEThreads = 1;
98 pContext->NumBEThreads = 1;
99 }
100
101 // Allocate scratch space for workers.
102 ///@note We could lazily allocate this but its rather small amount of memory.
103 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
104 {
105 #if defined(_WIN32)
106 uint32_t numaNode = pContext->threadPool.pThreadData ?
107 pContext->threadPool.pThreadData[i].numaId : 0;
108 pContext->pScratch[i] = (uint8_t*)VirtualAllocExNuma(
109 GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE),
110 MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE,
111 numaNode);
112 #else
113 pContext->pScratch[i] = (uint8_t*)_aligned_malloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4);
114 #endif
115 }
116
117 // State setup AFTER context is fully initialized
118 SetupDefaultState(pContext);
119
120 // initialize hot tile manager
121 pContext->pHotTileMgr = new HotTileMgr();
122
123 // initialize function pointer tables
124 InitClearTilesTable();
125
126 // initialize store tiles function
127 pContext->pfnLoadTile = pCreateInfo->pfnLoadTile;
128 pContext->pfnStoreTile = pCreateInfo->pfnStoreTile;
129 pContext->pfnClearTile = pCreateInfo->pfnClearTile;
130
131 // pass pointer to bucket manager back to caller
132 #ifdef KNOB_ENABLE_RDTSC
133 pCreateInfo->pBucketMgr = &gBucketMgr;
134 #endif
135
136 pCreateInfo->contextSaveSize = sizeof(API_STATE);
137
138 return (HANDLE)pContext;
139 }
140
141 void SwrDestroyContext(HANDLE hContext)
142 {
143 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
144 DestroyThreadPool(pContext, &pContext->threadPool);
145
146 // free the fifos
147 for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i)
148 {
149 delete pContext->dcRing[i].pArena;
150 delete pContext->dsRing[i].pArena;
151 pContext->pMacroTileManagerArray[i].~MacroTileMgr();
152 pContext->pDispatchQueueArray[i].~DispatchQueue();
153 }
154
155 _aligned_free(pContext->pDispatchQueueArray);
156 _aligned_free(pContext->pMacroTileManagerArray);
157
158 // Free scratch space.
159 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
160 {
161 #if defined(_WIN32)
162 VirtualFree(pContext->pScratch[i], 0, MEM_RELEASE);
163 #else
164 _aligned_free(pContext->pScratch[i]);
165 #endif
166 }
167
168 delete(pContext->pHotTileMgr);
169
170 pContext->~SWR_CONTEXT();
171 _aligned_free((SWR_CONTEXT*)hContext);
172 }
173
174 void CopyState(DRAW_STATE& dst, const DRAW_STATE& src)
175 {
176 memcpy(&dst.state, &src.state, sizeof(API_STATE));
177 }
178
179 void WakeAllThreads(SWR_CONTEXT *pContext)
180 {
181 pContext->FifosNotEmpty.notify_all();
182 }
183
184 template<bool IsDraw>
185 void QueueWork(SWR_CONTEXT *pContext)
186 {
187 DRAW_CONTEXT* pDC = pContext->pCurDrawContext;
188 uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
189
190 if (IsDraw)
191 {
192 pDC->pTileMgr = &pContext->pMacroTileManagerArray[dcIndex];
193 pDC->pTileMgr->initialize();
194 }
195
196 // Each worker thread looks at a DC for both FE and BE work at different times and so we
197 // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers
198 // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
199 // then moved on if all work is done.)
200 pContext->pCurDrawContext->threadsDone = pContext->NumFEThreads + pContext->NumBEThreads;
201
202 _ReadWriteBarrier();
203 {
204 std::unique_lock<std::mutex> lock(pContext->WaitLock);
205 pContext->dcRing.Enqueue();
206 }
207
208 if (KNOB_SINGLE_THREADED)
209 {
210 // flush denormals to 0
211 uint32_t mxcsr = _mm_getcsr();
212 _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
213
214 if (IsDraw)
215 {
216 static TileSet lockedTiles;
217 uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
218 WorkOnFifoFE(pContext, 0, curDraw[0]);
219 WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles, 0, 0);
220 }
221 else
222 {
223 uint64_t curDispatch = pContext->pCurDrawContext->drawId;
224 WorkOnCompute(pContext, 0, curDispatch);
225 }
226
227 // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
228 while (CompleteDrawContext(pContext, pContext->pCurDrawContext) > 0) {}
229
230 // restore csr
231 _mm_setcsr(mxcsr);
232 }
233 else
234 {
235 RDTSC_START(APIDrawWakeAllThreads);
236 WakeAllThreads(pContext);
237 RDTSC_STOP(APIDrawWakeAllThreads, 1, 0);
238 }
239
240 // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
241 pContext->pPrevDrawContext = pContext->pCurDrawContext;
242 pContext->pCurDrawContext = nullptr;
243 }
244
245 INLINE void QueueDraw(SWR_CONTEXT* pContext)
246 {
247 QueueWork<true>(pContext);
248 }
249
250 INLINE void QueueDispatch(SWR_CONTEXT* pContext)
251 {
252 QueueWork<false>(pContext);
253 }
254
255 DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
256 {
257 RDTSC_START(APIGetDrawContext);
258 // If current draw context is null then need to obtain a new draw context to use from ring.
259 if (pContext->pCurDrawContext == nullptr)
260 {
261 // Need to wait for a free entry.
262 while (pContext->dcRing.IsFull())
263 {
264 _mm_pause();
265 }
266
267 uint64_t curDraw = pContext->dcRing.GetHead();
268 uint32_t dcIndex = curDraw % KNOB_MAX_DRAWS_IN_FLIGHT;
269
270 static uint64_t lastDrawChecked;
271 static uint32_t lastFrameChecked;
272 if ((pContext->frameCount - lastFrameChecked) > 2 ||
273 (curDraw - lastDrawChecked) > 0x10000)
274 {
275 // Take this opportunity to clean-up old arena allocations
276 pContext->cachingArenaAllocator.FreeOldBlocks();
277
278 lastFrameChecked = pContext->frameCount;
279 lastDrawChecked = curDraw;
280 }
281
282 DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
283 pContext->pCurDrawContext = pCurDrawContext;
284
285 // Assign next available entry in DS ring to this DC.
286 uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
287 pCurDrawContext->pState = &pContext->dsRing[dsIndex];
288
289 // Copy previous state to current state.
290 if (pContext->pPrevDrawContext)
291 {
292 DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext;
293
294 // If we're splitting our draw then we can just use the same state from the previous
295 // draw. In this case, we won't increment the DS ring index so the next non-split
296 // draw can receive the state.
297 if (isSplitDraw == false)
298 {
299 CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState);
300
301 // Should have been cleaned up previously
302 SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
303
304 pCurDrawContext->pState->pPrivateState = nullptr;
305
306 pContext->curStateId++; // Progress state ring index forward.
307 }
308 else
309 {
310 // If its a split draw then just copy the state pointer over
311 // since its the same draw.
312 pCurDrawContext->pState = pPrevDrawContext->pState;
313 SWR_ASSERT(pPrevDrawContext->cleanupState == false);
314 }
315 }
316 else
317 {
318 SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
319 pContext->curStateId++; // Progress state ring index forward.
320 }
321
322 SWR_ASSERT(pCurDrawContext->pArena->IsEmpty() == true);
323
324 pCurDrawContext->dependency = 0;
325 pCurDrawContext->pContext = pContext;
326 pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
327
328 pCurDrawContext->doneFE = false;
329 pCurDrawContext->FeLock = 0;
330 pCurDrawContext->threadsDone = 0;
331
332 // Assign unique drawId for this DC
333 pCurDrawContext->drawId = pContext->dcRing.GetHead();
334
335 pCurDrawContext->cleanupState = true;
336 }
337 else
338 {
339 SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC");
340 }
341
342 RDTSC_STOP(APIGetDrawContext, 0, 0);
343 return pContext->pCurDrawContext;
344 }
345
346 API_STATE* GetDrawState(SWR_CONTEXT *pContext)
347 {
348 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
349 SWR_ASSERT(pDC->pState != nullptr);
350
351 return &pDC->pState->state;
352 }
353
354 void SWR_API SwrSaveState(
355 HANDLE hContext,
356 void* pOutputStateBlock,
357 size_t memSize)
358 {
359 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
360 auto pSrc = GetDrawState(pContext);
361 SWR_ASSERT(pOutputStateBlock && memSize >= sizeof(*pSrc));
362
363 memcpy(pOutputStateBlock, pSrc, sizeof(*pSrc));
364 }
365
366 void SWR_API SwrRestoreState(
367 HANDLE hContext,
368 const void* pStateBlock,
369 size_t memSize)
370 {
371 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
372 auto pDst = GetDrawState(pContext);
373 SWR_ASSERT(pStateBlock && memSize >= sizeof(*pDst));
374
375 memcpy(pDst, pStateBlock, sizeof(*pDst));
376 }
377
378 void SetupDefaultState(SWR_CONTEXT *pContext)
379 {
380 API_STATE* pState = GetDrawState(pContext);
381
382 pState->rastState.cullMode = SWR_CULLMODE_NONE;
383 pState->rastState.frontWinding = SWR_FRONTWINDING_CCW;
384 }
385
386 static INLINE SWR_CONTEXT* GetContext(HANDLE hContext)
387 {
388 return (SWR_CONTEXT*)hContext;
389 }
390
391 void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint64_t userData2, uint64_t userData3)
392 {
393 RDTSC_START(APISync);
394
395 SWR_ASSERT(pfnFunc != nullptr);
396
397 SWR_CONTEXT *pContext = GetContext(hContext);
398 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
399
400 pDC->FeWork.type = SYNC;
401 pDC->FeWork.pfnWork = ProcessSync;
402 pDC->FeWork.desc.sync.pfnCallbackFunc = pfnFunc;
403 pDC->FeWork.desc.sync.userData = userData;
404 pDC->FeWork.desc.sync.userData2 = userData2;
405 pDC->FeWork.desc.sync.userData3 = userData3;
406
407 // cannot execute until all previous draws have completed
408 pDC->dependency = pDC->drawId - 1;
409
410 //enqueue
411 QueueDraw(pContext);
412
413 RDTSC_STOP(APISync, 1, 0);
414 }
415
416 void SwrWaitForIdle(HANDLE hContext)
417 {
418 SWR_CONTEXT *pContext = GetContext(hContext);
419
420 RDTSC_START(APIWaitForIdle);
421
422 while (!pContext->dcRing.IsEmpty())
423 {
424 _mm_pause();
425 }
426
427 RDTSC_STOP(APIWaitForIdle, 1, 0);
428 }
429
430 void SwrSetVertexBuffers(
431 HANDLE hContext,
432 uint32_t numBuffers,
433 const SWR_VERTEX_BUFFER_STATE* pVertexBuffers)
434 {
435 API_STATE* pState = GetDrawState(GetContext(hContext));
436
437 for (uint32_t i = 0; i < numBuffers; ++i)
438 {
439 const SWR_VERTEX_BUFFER_STATE *pVB = &pVertexBuffers[i];
440 pState->vertexBuffers[pVB->index] = *pVB;
441 }
442 }
443
444 void SwrSetIndexBuffer(
445 HANDLE hContext,
446 const SWR_INDEX_BUFFER_STATE* pIndexBuffer)
447 {
448 API_STATE* pState = GetDrawState(GetContext(hContext));
449
450 pState->indexBuffer = *pIndexBuffer;
451 }
452
453 void SwrSetFetchFunc(
454 HANDLE hContext,
455 PFN_FETCH_FUNC pfnFetchFunc)
456 {
457 API_STATE* pState = GetDrawState(GetContext(hContext));
458
459 pState->pfnFetchFunc = pfnFetchFunc;
460 }
461
462 void SwrSetSoFunc(
463 HANDLE hContext,
464 PFN_SO_FUNC pfnSoFunc,
465 uint32_t streamIndex)
466 {
467 API_STATE* pState = GetDrawState(GetContext(hContext));
468
469 SWR_ASSERT(streamIndex < MAX_SO_STREAMS);
470
471 pState->pfnSoFunc[streamIndex] = pfnSoFunc;
472 }
473
474 void SwrSetSoState(
475 HANDLE hContext,
476 SWR_STREAMOUT_STATE* pSoState)
477 {
478 API_STATE* pState = GetDrawState(GetContext(hContext));
479
480 pState->soState = *pSoState;
481 }
482
483 void SwrSetSoBuffers(
484 HANDLE hContext,
485 SWR_STREAMOUT_BUFFER* pSoBuffer,
486 uint32_t slot)
487 {
488 API_STATE* pState = GetDrawState(GetContext(hContext));
489
490 SWR_ASSERT((slot < 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot);
491
492 pState->soBuffer[slot] = *pSoBuffer;
493 }
494
495 void SwrSetVertexFunc(
496 HANDLE hContext,
497 PFN_VERTEX_FUNC pfnVertexFunc)
498 {
499 API_STATE* pState = GetDrawState(GetContext(hContext));
500
501 pState->pfnVertexFunc = pfnVertexFunc;
502 }
503
504 void SwrSetFrontendState(
505 HANDLE hContext,
506 SWR_FRONTEND_STATE *pFEState)
507 {
508 API_STATE* pState = GetDrawState(GetContext(hContext));
509 pState->frontendState = *pFEState;
510 }
511
512 void SwrSetGsState(
513 HANDLE hContext,
514 SWR_GS_STATE *pGSState)
515 {
516 API_STATE* pState = GetDrawState(GetContext(hContext));
517 pState->gsState = *pGSState;
518 }
519
520 void SwrSetGsFunc(
521 HANDLE hContext,
522 PFN_GS_FUNC pfnGsFunc)
523 {
524 API_STATE* pState = GetDrawState(GetContext(hContext));
525 pState->pfnGsFunc = pfnGsFunc;
526 }
527
528 void SwrSetCsFunc(
529 HANDLE hContext,
530 PFN_CS_FUNC pfnCsFunc,
531 uint32_t totalThreadsInGroup,
532 uint32_t totalSpillFillSize)
533 {
534 API_STATE* pState = GetDrawState(GetContext(hContext));
535 pState->pfnCsFunc = pfnCsFunc;
536 pState->totalThreadsInGroup = totalThreadsInGroup;
537 pState->totalSpillFillSize = totalSpillFillSize;
538 }
539
540 void SwrSetTsState(
541 HANDLE hContext,
542 SWR_TS_STATE *pState)
543 {
544 API_STATE* pApiState = GetDrawState(GetContext(hContext));
545 pApiState->tsState = *pState;
546 }
547
548 void SwrSetHsFunc(
549 HANDLE hContext,
550 PFN_HS_FUNC pfnFunc)
551 {
552 API_STATE* pApiState = GetDrawState(GetContext(hContext));
553 pApiState->pfnHsFunc = pfnFunc;
554 }
555
556 void SwrSetDsFunc(
557 HANDLE hContext,
558 PFN_DS_FUNC pfnFunc)
559 {
560 API_STATE* pApiState = GetDrawState(GetContext(hContext));
561 pApiState->pfnDsFunc = pfnFunc;
562 }
563
564 void SwrSetDepthStencilState(
565 HANDLE hContext,
566 SWR_DEPTH_STENCIL_STATE *pDSState)
567 {
568 API_STATE* pState = GetDrawState(GetContext(hContext));
569
570 pState->depthStencilState = *pDSState;
571 }
572
573 void SwrSetBackendState(
574 HANDLE hContext,
575 SWR_BACKEND_STATE *pBEState)
576 {
577 API_STATE* pState = GetDrawState(GetContext(hContext));
578
579 pState->backendState = *pBEState;
580 }
581
582 void SwrSetPixelShaderState(
583 HANDLE hContext,
584 SWR_PS_STATE *pPSState)
585 {
586 API_STATE *pState = GetDrawState(GetContext(hContext));
587 pState->psState = *pPSState;
588 }
589
590 void SwrSetBlendState(
591 HANDLE hContext,
592 SWR_BLEND_STATE *pBlendState)
593 {
594 API_STATE *pState = GetDrawState(GetContext(hContext));
595 memcpy(&pState->blendState, pBlendState, sizeof(SWR_BLEND_STATE));
596 }
597
598 void SwrSetBlendFunc(
599 HANDLE hContext,
600 uint32_t renderTarget,
601 PFN_BLEND_JIT_FUNC pfnBlendFunc)
602 {
603 SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS);
604 API_STATE *pState = GetDrawState(GetContext(hContext));
605 pState->pfnBlendFunc[renderTarget] = pfnBlendFunc;
606 }
607
608 void SwrSetLinkage(
609 HANDLE hContext,
610 uint32_t mask,
611 const uint8_t* pMap)
612 {
613 API_STATE* pState = GetDrawState(GetContext(hContext));
614
615 static const uint8_t IDENTITY_MAP[] =
616 {
617 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
618 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
619 };
620 static_assert(sizeof(IDENTITY_MAP) == sizeof(pState->linkageMap),
621 "Update for new value of MAX_ATTRIBUTES");
622
623 pState->linkageMask = mask;
624 pState->linkageCount = _mm_popcnt_u32(mask);
625
626 if (!pMap)
627 {
628 pMap = IDENTITY_MAP;
629 }
630 memcpy(pState->linkageMap, pMap, pState->linkageCount);
631 }
632
633 // update guardband multipliers for the viewport
634 void updateGuardband(API_STATE *pState)
635 {
636 // guardband center is viewport center
637 pState->gbState.left = KNOB_GUARDBAND_WIDTH / pState->vp[0].width;
638 pState->gbState.right = KNOB_GUARDBAND_WIDTH / pState->vp[0].width;
639 pState->gbState.top = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height;
640 pState->gbState.bottom = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height;
641 }
642
643 void SwrSetRastState(
644 HANDLE hContext,
645 const SWR_RASTSTATE *pRastState)
646 {
647 SWR_CONTEXT *pContext = GetContext(hContext);
648 API_STATE* pState = GetDrawState(pContext);
649
650 memcpy(&pState->rastState, pRastState, sizeof(SWR_RASTSTATE));
651 }
652
653 void SwrSetViewports(
654 HANDLE hContext,
655 uint32_t numViewports,
656 const SWR_VIEWPORT* pViewports,
657 const SWR_VIEWPORT_MATRIX* pMatrices)
658 {
659 SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS,
660 "Invalid number of viewports.");
661
662 SWR_CONTEXT *pContext = GetContext(hContext);
663 API_STATE* pState = GetDrawState(pContext);
664
665 memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports);
666
667 if (pMatrices != nullptr)
668 {
669 memcpy(&pState->vpMatrix[0], pMatrices, sizeof(SWR_VIEWPORT_MATRIX) * numViewports);
670 }
671 else
672 {
673 // Compute default viewport transform.
674 for (uint32_t i = 0; i < numViewports; ++i)
675 {
676 if (pContext->driverType == DX)
677 {
678 pState->vpMatrix[i].m00 = pState->vp[i].width / 2.0f;
679 pState->vpMatrix[i].m11 = -pState->vp[i].height / 2.0f;
680 pState->vpMatrix[i].m22 = pState->vp[i].maxZ - pState->vp[i].minZ;
681 pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00;
682 pState->vpMatrix[i].m31 = pState->vp[i].y - pState->vpMatrix[i].m11;
683 pState->vpMatrix[i].m32 = pState->vp[i].minZ;
684 }
685 else
686 {
687 // Standard, with the exception that Y is inverted.
688 pState->vpMatrix[i].m00 = (pState->vp[i].width - pState->vp[i].x) / 2.0f;
689 pState->vpMatrix[i].m11 = (pState->vp[i].y - pState->vp[i].height) / 2.0f;
690 pState->vpMatrix[i].m22 = (pState->vp[i].maxZ - pState->vp[i].minZ) / 2.0f;
691 pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00;
692 pState->vpMatrix[i].m31 = pState->vp[i].height + pState->vpMatrix[i].m11;
693 pState->vpMatrix[i].m32 = pState->vp[i].minZ + pState->vpMatrix[i].m22;
694
695 // Now that the matrix is calculated, clip the view coords to screen size.
696 // OpenGL allows for -ve x,y in the viewport.
697 pState->vp[i].x = std::max(pState->vp[i].x, 0.0f);
698 pState->vp[i].y = std::max(pState->vp[i].y, 0.0f);
699 }
700 }
701 }
702
703 updateGuardband(pState);
704 }
705
706 void SwrSetScissorRects(
707 HANDLE hContext,
708 uint32_t numScissors,
709 const BBOX* pScissors)
710 {
711 SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS,
712 "Invalid number of scissor rects.");
713
714 API_STATE* pState = GetDrawState(GetContext(hContext));
715 memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(BBOX));
716 };
717
718 void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
719 {
720 API_STATE *pState = &pDC->pState->state;
721 uint32_t left, right, top, bottom;
722
723 // Set up scissor dimensions based on scissor or viewport
724 if (pState->rastState.scissorEnable)
725 {
726 // scissor rect right/bottom edge are exclusive, core expects scissor dimensions to be inclusive, so subtract one pixel from right/bottom edges
727 left = pState->scissorRects[0].left;
728 right = pState->scissorRects[0].right;
729 top = pState->scissorRects[0].top;
730 bottom = pState->scissorRects[0].bottom;
731 }
732 else
733 {
734 left = (int32_t)pState->vp[0].x;
735 right = (int32_t)pState->vp[0].x + (int32_t)pState->vp[0].width;
736 top = (int32_t)pState->vp[0].y;
737 bottom = (int32_t)pState->vp[0].y + (int32_t)pState->vp[0].height;
738 }
739
740 right = std::min<uint32_t>(right, KNOB_MAX_SCISSOR_X);
741 bottom = std::min<uint32_t>(bottom, KNOB_MAX_SCISSOR_Y);
742
743 if (left > KNOB_MAX_SCISSOR_X || top > KNOB_MAX_SCISSOR_Y)
744 {
745 pState->scissorInFixedPoint.left = 0;
746 pState->scissorInFixedPoint.right = 0;
747 pState->scissorInFixedPoint.top = 0;
748 pState->scissorInFixedPoint.bottom = 0;
749 }
750 else
751 {
752 pState->scissorInFixedPoint.left = left * FIXED_POINT_SCALE;
753 pState->scissorInFixedPoint.right = right * FIXED_POINT_SCALE - 1;
754 pState->scissorInFixedPoint.top = top * FIXED_POINT_SCALE;
755 pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1;
756 }
757 }
758 // templated backend function tables
759 extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
760 extern PFN_BACKEND_FUNC gBackendSingleSample[2][2];
761 extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2];
762 extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2];
763 extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS + 1][SWR_MULTISAMPLE_TYPE_MAX];
764 extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2];
765 extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2];
766 extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2];
767 void SetupPipeline(DRAW_CONTEXT *pDC)
768 {
769 DRAW_STATE* pState = pDC->pState;
770 const SWR_RASTSTATE &rastState = pState->state.rastState;
771 const SWR_PS_STATE &psState = pState->state.psState;
772 BACKEND_FUNCS& backendFuncs = pState->backendFuncs;
773 const uint32_t forcedSampleCount = (rastState.bForcedSampleCount) ? 1 : 0;
774
775 // setup backend
776 if (psState.pfnPixelShader == nullptr)
777 {
778 backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount];
779 // always need to generate I & J per sample for Z interpolation
780 backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[1];
781 }
782 else
783 {
784 const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.bForcedSampleCount) ? 1 : 0;
785 const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
786
787 // currently only support 'normal' input coverage
788 SWR_ASSERT(psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL ||
789 psState.inputCoverage == SWR_INPUT_COVERAGE_NONE);
790
791 SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask;
792
793 // select backend function
794 switch(psState.shadingRate)
795 {
796 case SWR_SHADING_RATE_PIXEL:
797 if(bMultisampleEnable)
798 {
799 // always need to generate I & J per sample for Z interpolation
800 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
801 backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount];
802 backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount];
803 }
804 else
805 {
806 // always need to generate I & J per pixel for Z interpolation
807 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK);
808 backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid];
809 backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][SWR_MULTISAMPLE_1X];
810 }
811 break;
812 case SWR_SHADING_RATE_SAMPLE:
813 SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN);
814 // always need to generate I & J per sample for Z interpolation
815 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
816 backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid];
817 backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount];
818 break;
819 default:
820 SWR_ASSERT(0 && "Invalid shading rate");
821 break;
822 }
823
824 // setup pointer to function that generates necessary barycentrics required by the PS
825 bool bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_PER_PIXEL_MASK) > 0 ? 1 : 0;
826 backendFuncs.pfnCalcPixelBarycentrics = gPixelBarycentricTable[bBarycentrics];
827
828 bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_PER_SAMPLE_MASK) > 0 ? 1 : 0;
829 backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[bBarycentrics];
830
831 bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0 ? 1 : 0;
832 backendFuncs.pfnCalcCentroidBarycentrics = gCentroidBarycentricTable[rastState.sampleCount][bBarycentrics][rastState.samplePattern][forcedSampleCount];
833 }
834
835 PFN_PROCESS_PRIMS pfnBinner;
836 switch (pState->state.topology)
837 {
838 case TOP_POINT_LIST:
839 pState->pfnProcessPrims = ClipPoints;
840 pfnBinner = BinPoints;
841 break;
842 case TOP_LINE_LIST:
843 case TOP_LINE_STRIP:
844 case TOP_LINE_LOOP:
845 case TOP_LINE_LIST_ADJ:
846 case TOP_LISTSTRIP_ADJ:
847 pState->pfnProcessPrims = ClipLines;
848 pfnBinner = BinLines;
849 break;
850 default:
851 pState->pfnProcessPrims = ClipTriangles;
852 pfnBinner = BinTriangles;
853 break;
854 };
855
856 // disable clipper if viewport transform is disabled
857 if (pState->state.frontendState.vpTransformDisable)
858 {
859 pState->pfnProcessPrims = pfnBinner;
860 }
861
862 if ((pState->state.psState.pfnPixelShader == nullptr) &&
863 (pState->state.depthStencilState.depthTestEnable == FALSE) &&
864 (pState->state.depthStencilState.depthWriteEnable == FALSE) &&
865 (pState->state.depthStencilState.stencilTestEnable == FALSE) &&
866 (pState->state.depthStencilState.stencilWriteEnable == FALSE) &&
867 (pState->state.linkageCount == 0))
868 {
869 pState->pfnProcessPrims = nullptr;
870 pState->state.linkageMask = 0;
871 }
872
873 if (pState->state.soState.rasterizerDisable == true)
874 {
875 pState->pfnProcessPrims = nullptr;
876 pState->state.linkageMask = 0;
877 }
878
879 // set up the frontend attrib mask
880 pState->state.feAttribMask = pState->state.linkageMask;
881 if (pState->state.soState.soEnable)
882 {
883 for (uint32_t i = 0; i < 4; ++i)
884 {
885 pState->state.feAttribMask |= pState->state.soState.streamMasks[i];
886 }
887 }
888
889 // complicated logic to test for cases where we don't need backing hottile memory for a draw
890 // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled.
891 pState->state.depthHottileEnable = ((!(pState->state.depthStencilState.depthTestEnable &&
892 !pState->state.depthStencilState.depthWriteEnable &&
893 pState->state.depthStencilState.depthTestFunc == ZFUNC_ALWAYS)) &&
894 (pState->state.depthStencilState.depthTestEnable ||
895 pState->state.depthStencilState.depthWriteEnable)) ? true : false;
896
897 pState->state.stencilHottileEnable = (((!(pState->state.depthStencilState.stencilTestEnable &&
898 !pState->state.depthStencilState.stencilWriteEnable &&
899 pState->state.depthStencilState.stencilTestFunc == ZFUNC_ALWAYS)) ||
900 // for stencil we have to check the double sided state as well
901 (!(pState->state.depthStencilState.doubleSidedStencilTestEnable &&
902 !pState->state.depthStencilState.stencilWriteEnable &&
903 pState->state.depthStencilState.backfaceStencilTestFunc == ZFUNC_ALWAYS))) &&
904 (pState->state.depthStencilState.stencilTestEnable ||
905 pState->state.depthStencilState.stencilWriteEnable)) ? true : false;
906
907 uint32_t numRTs = pState->state.psState.numRenderTargets;
908 pState->state.colorHottileEnable = 0;
909 if (psState.pfnPixelShader != nullptr)
910 {
911 for (uint32_t rt = 0; rt < numRTs; ++rt)
912 {
913 pState->state.colorHottileEnable |=
914 (!pState->state.blendState.renderTarget[rt].writeDisableAlpha ||
915 !pState->state.blendState.renderTarget[rt].writeDisableRed ||
916 !pState->state.blendState.renderTarget[rt].writeDisableGreen ||
917 !pState->state.blendState.renderTarget[rt].writeDisableBlue) ? (1 << rt) : 0;
918 }
919 }
920
921 // Setup depth quantization function
922 if (pState->state.depthHottileEnable)
923 {
924 switch (pState->state.rastState.depthFormat)
925 {
926 case R32_FLOAT_X8X24_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT_X8X24_TYPELESS > ; break;
927 case R32_FLOAT: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ; break;
928 case R24_UNORM_X8_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R24_UNORM_X8_TYPELESS > ; break;
929 case R16_UNORM: pState->state.pfnQuantizeDepth = QuantizeDepth < R16_UNORM > ; break;
930 default: SWR_ASSERT(false, "Unsupported depth format for depth quantiztion.");
931 pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ;
932 }
933 }
934 else
935 {
936 // set up pass-through quantize if depth isn't enabled
937 pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ;
938 }
939 }
940
941 //////////////////////////////////////////////////////////////////////////
942 /// @brief InitDraw
943 /// @param pDC - Draw context to initialize for this draw.
944 void InitDraw(
945 DRAW_CONTEXT *pDC,
946 bool isSplitDraw)
947 {
948 // We don't need to re-setup the scissors/pipeline state again for split draw.
949 if (isSplitDraw == false)
950 {
951 SetupMacroTileScissors(pDC);
952 SetupPipeline(pDC);
953 }
954 }
955
956 //////////////////////////////////////////////////////////////////////////
957 /// @brief We can split the draw for certain topologies for better performance.
958 /// @param totalVerts - Total vertices for draw
959 /// @param topology - Topology used for draw
960 uint32_t MaxVertsPerDraw(
961 DRAW_CONTEXT* pDC,
962 uint32_t totalVerts,
963 PRIMITIVE_TOPOLOGY topology)
964 {
965 API_STATE& state = pDC->pState->state;
966
967 uint32_t vertsPerDraw = totalVerts;
968
969 if (state.soState.soEnable)
970 {
971 return totalVerts;
972 }
973
974 switch (topology)
975 {
976 case TOP_POINT_LIST:
977 case TOP_TRIANGLE_LIST:
978 vertsPerDraw = KNOB_MAX_PRIMS_PER_DRAW;
979 break;
980
981 case TOP_PATCHLIST_1:
982 case TOP_PATCHLIST_2:
983 case TOP_PATCHLIST_3:
984 case TOP_PATCHLIST_4:
985 case TOP_PATCHLIST_5:
986 case TOP_PATCHLIST_6:
987 case TOP_PATCHLIST_7:
988 case TOP_PATCHLIST_8:
989 case TOP_PATCHLIST_9:
990 case TOP_PATCHLIST_10:
991 case TOP_PATCHLIST_11:
992 case TOP_PATCHLIST_12:
993 case TOP_PATCHLIST_13:
994 case TOP_PATCHLIST_14:
995 case TOP_PATCHLIST_15:
996 case TOP_PATCHLIST_16:
997 case TOP_PATCHLIST_17:
998 case TOP_PATCHLIST_18:
999 case TOP_PATCHLIST_19:
1000 case TOP_PATCHLIST_20:
1001 case TOP_PATCHLIST_21:
1002 case TOP_PATCHLIST_22:
1003 case TOP_PATCHLIST_23:
1004 case TOP_PATCHLIST_24:
1005 case TOP_PATCHLIST_25:
1006 case TOP_PATCHLIST_26:
1007 case TOP_PATCHLIST_27:
1008 case TOP_PATCHLIST_28:
1009 case TOP_PATCHLIST_29:
1010 case TOP_PATCHLIST_30:
1011 case TOP_PATCHLIST_31:
1012 case TOP_PATCHLIST_32:
1013 if (pDC->pState->state.tsState.tsEnable)
1014 {
1015 uint32_t vertsPerPrim = topology - TOP_PATCHLIST_BASE;
1016 vertsPerDraw = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW;
1017 }
1018 break;
1019
1020 // The Primitive Assembly code can only handle 1 RECT at a time.
1021 case TOP_RECT_LIST:
1022 vertsPerDraw = 3;
1023 break;
1024
1025 default:
1026 // We are not splitting up draws for other topologies.
1027 break;
1028 }
1029
1030 return vertsPerDraw;
1031 }
1032
1033
1034 //////////////////////////////////////////////////////////////////////////
1035 /// @brief DrawInstanced
1036 /// @param hContext - Handle passed back from SwrCreateContext
1037 /// @param topology - Specifies topology for draw.
1038 /// @param numVerts - How many vertices to read sequentially from vertex data (per instance).
1039 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1040 /// @param numInstances - How many instances to render.
1041 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1042 void DrawInstanced(
1043 HANDLE hContext,
1044 PRIMITIVE_TOPOLOGY topology,
1045 uint32_t numVertices,
1046 uint32_t startVertex,
1047 uint32_t numInstances = 1,
1048 uint32_t startInstance = 0)
1049 {
1050 if (KNOB_TOSS_DRAW)
1051 {
1052 return;
1053 }
1054
1055 RDTSC_START(APIDraw);
1056
1057 SWR_CONTEXT *pContext = GetContext(hContext);
1058 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1059
1060 uint32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
1061 uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw);
1062 uint32_t remainingVerts = numVertices;
1063
1064 API_STATE *pState = &pDC->pState->state;
1065 pState->topology = topology;
1066 pState->forceFront = false;
1067
1068 // disable culling for points/lines
1069 uint32_t oldCullMode = pState->rastState.cullMode;
1070 if (topology == TOP_POINT_LIST)
1071 {
1072 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1073 pState->forceFront = true;
1074 }
1075
1076 int draw = 0;
1077 while (remainingVerts)
1078 {
1079 uint32_t numVertsForDraw = (remainingVerts < maxVertsPerDraw) ?
1080 remainingVerts : maxVertsPerDraw;
1081
1082 bool isSplitDraw = (draw > 0) ? true : false;
1083 DRAW_CONTEXT* pDC = GetDrawContext(pContext, isSplitDraw);
1084 InitDraw(pDC, isSplitDraw);
1085
1086 pDC->FeWork.type = DRAW;
1087 pDC->FeWork.pfnWork = GetProcessDrawFunc(
1088 false, // IsIndexed
1089 pState->tsState.tsEnable,
1090 pState->gsState.gsEnable,
1091 pState->soState.soEnable,
1092 pDC->pState->pfnProcessPrims != nullptr);
1093 pDC->FeWork.desc.draw.numVerts = numVertsForDraw;
1094 pDC->FeWork.desc.draw.startVertex = startVertex;
1095 pDC->FeWork.desc.draw.numInstances = numInstances;
1096 pDC->FeWork.desc.draw.startInstance = startInstance;
1097 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
1098 pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw;
1099
1100 pDC->cleanupState = (remainingVerts == numVertsForDraw);
1101
1102 //enqueue DC
1103 QueueDraw(pContext);
1104
1105 remainingVerts -= numVertsForDraw;
1106 draw++;
1107 }
1108
1109 // restore culling state
1110 pDC = GetDrawContext(pContext);
1111 pDC->pState->state.rastState.cullMode = oldCullMode;
1112
1113 RDTSC_STOP(APIDraw, numVertices * numInstances, 0);
1114 }
1115
1116 //////////////////////////////////////////////////////////////////////////
1117 /// @brief SwrDraw
1118 /// @param hContext - Handle passed back from SwrCreateContext
1119 /// @param topology - Specifies topology for draw.
1120 /// @param startVertex - Specifies start vertex in vertex buffer for draw.
1121 /// @param primCount - Number of vertices.
1122 void SwrDraw(
1123 HANDLE hContext,
1124 PRIMITIVE_TOPOLOGY topology,
1125 uint32_t startVertex,
1126 uint32_t numVertices)
1127 {
1128 DrawInstanced(hContext, topology, numVertices, startVertex);
1129 }
1130
1131 //////////////////////////////////////////////////////////////////////////
1132 /// @brief SwrDrawInstanced
1133 /// @param hContext - Handle passed back from SwrCreateContext
1134 /// @param topology - Specifies topology for draw.
1135 /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
1136 /// @param numInstances - How many instances to render.
1137 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1138 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1139 void SwrDrawInstanced(
1140 HANDLE hContext,
1141 PRIMITIVE_TOPOLOGY topology,
1142 uint32_t numVertsPerInstance,
1143 uint32_t numInstances,
1144 uint32_t startVertex,
1145 uint32_t startInstance
1146 )
1147 {
1148 DrawInstanced(hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance);
1149 }
1150
1151 //////////////////////////////////////////////////////////////////////////
1152 /// @brief DrawIndexedInstanced
1153 /// @param hContext - Handle passed back from SwrCreateContext
1154 /// @param topology - Specifies topology for draw.
1155 /// @param numIndices - Number of indices to read sequentially from index buffer.
1156 /// @param indexOffset - Starting index into index buffer.
1157 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1158 /// @param numInstances - Number of instances to render.
1159 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1160 void DrawIndexedInstance(
1161 HANDLE hContext,
1162 PRIMITIVE_TOPOLOGY topology,
1163 uint32_t numIndices,
1164 uint32_t indexOffset,
1165 int32_t baseVertex,
1166 uint32_t numInstances = 1,
1167 uint32_t startInstance = 0)
1168 {
1169 if (KNOB_TOSS_DRAW)
1170 {
1171 return;
1172 }
1173
1174 RDTSC_START(APIDrawIndexed);
1175
1176 SWR_CONTEXT *pContext = GetContext(hContext);
1177 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1178 API_STATE* pState = &pDC->pState->state;
1179
1180 uint32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
1181 uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw);
1182 uint32_t remainingIndices = numIndices;
1183
1184 uint32_t indexSize = 0;
1185 switch (pState->indexBuffer.format)
1186 {
1187 case R32_UINT: indexSize = sizeof(uint32_t); break;
1188 case R16_UINT: indexSize = sizeof(uint16_t); break;
1189 case R8_UINT: indexSize = sizeof(uint8_t); break;
1190 default:
1191 SWR_ASSERT(0);
1192 }
1193
1194 int draw = 0;
1195 uint8_t *pIB = (uint8_t*)pState->indexBuffer.pIndices;
1196 pIB += (uint64_t)indexOffset * (uint64_t)indexSize;
1197
1198 pState->topology = topology;
1199 pState->forceFront = false;
1200
1201 // disable culling for points/lines
1202 uint32_t oldCullMode = pState->rastState.cullMode;
1203 if (topology == TOP_POINT_LIST)
1204 {
1205 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1206 pState->forceFront = true;
1207 }
1208
1209 while (remainingIndices)
1210 {
1211 uint32_t numIndicesForDraw = (remainingIndices < maxIndicesPerDraw) ?
1212 remainingIndices : maxIndicesPerDraw;
1213
1214 // When breaking up draw, we need to obtain new draw context for each iteration.
1215 bool isSplitDraw = (draw > 0) ? true : false;
1216 pDC = GetDrawContext(pContext, isSplitDraw);
1217 InitDraw(pDC, isSplitDraw);
1218
1219 pDC->FeWork.type = DRAW;
1220 pDC->FeWork.pfnWork = GetProcessDrawFunc(
1221 true, // IsIndexed
1222 pState->tsState.tsEnable,
1223 pState->gsState.gsEnable,
1224 pState->soState.soEnable,
1225 pDC->pState->pfnProcessPrims != nullptr);
1226 pDC->FeWork.desc.draw.pDC = pDC;
1227 pDC->FeWork.desc.draw.numIndices = numIndicesForDraw;
1228 pDC->FeWork.desc.draw.pIB = (int*)pIB;
1229 pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format;
1230
1231 pDC->FeWork.desc.draw.numInstances = numInstances;
1232 pDC->FeWork.desc.draw.startInstance = startInstance;
1233 pDC->FeWork.desc.draw.baseVertex = baseVertex;
1234 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
1235
1236 pDC->cleanupState = (remainingIndices == numIndicesForDraw);
1237
1238 //enqueue DC
1239 QueueDraw(pContext);
1240
1241 pIB += maxIndicesPerDraw * indexSize;
1242 remainingIndices -= numIndicesForDraw;
1243 draw++;
1244 }
1245
1246 // restore culling state
1247 pDC = GetDrawContext(pContext);
1248 pDC->pState->state.rastState.cullMode = oldCullMode;
1249
1250 RDTSC_STOP(APIDrawIndexed, numIndices * numInstances, 0);
1251 }
1252
1253
1254 //////////////////////////////////////////////////////////////////////////
1255 /// @brief DrawIndexed
1256 /// @param hContext - Handle passed back from SwrCreateContext
1257 /// @param topology - Specifies topology for draw.
1258 /// @param numIndices - Number of indices to read sequentially from index buffer.
1259 /// @param indexOffset - Starting index into index buffer.
1260 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1261 void SwrDrawIndexed(
1262 HANDLE hContext,
1263 PRIMITIVE_TOPOLOGY topology,
1264 uint32_t numIndices,
1265 uint32_t indexOffset,
1266 int32_t baseVertex
1267 )
1268 {
1269 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex);
1270 }
1271
1272 //////////////////////////////////////////////////////////////////////////
1273 /// @brief SwrDrawIndexedInstanced
1274 /// @param hContext - Handle passed back from SwrCreateContext
1275 /// @param topology - Specifies topology for draw.
1276 /// @param numIndices - Number of indices to read sequentially from index buffer.
1277 /// @param numInstances - Number of instances to render.
1278 /// @param indexOffset - Starting index into index buffer.
1279 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1280 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1281 void SwrDrawIndexedInstanced(
1282 HANDLE hContext,
1283 PRIMITIVE_TOPOLOGY topology,
1284 uint32_t numIndices,
1285 uint32_t numInstances,
1286 uint32_t indexOffset,
1287 int32_t baseVertex,
1288 uint32_t startInstance)
1289 {
1290 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance);
1291 }
1292
1293 //////////////////////////////////////////////////////////////////////////
1294 /// @brief SwrInvalidateTiles
1295 /// @param hContext - Handle passed back from SwrCreateContext
1296 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate.
1297 void SwrInvalidateTiles(
1298 HANDLE hContext,
1299 uint32_t attachmentMask)
1300 {
1301 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1302 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1303
1304 pDC->FeWork.type = DISCARDINVALIDATETILES;
1305 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
1306 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
1307 memset(&pDC->FeWork.desc.discardInvalidateTiles.rect, 0, sizeof(SWR_RECT));
1308 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_INVALID;
1309 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false;
1310 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = false;
1311
1312 //enqueue
1313 QueueDraw(pContext);
1314 }
1315
1316 //////////////////////////////////////////////////////////////////////////
1317 /// @brief SwrDiscardRect
1318 /// @param hContext - Handle passed back from SwrCreateContext
1319 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
1320 /// @param rect - if rect is all zeros, the entire attachment surface will be discarded
1321 void SwrDiscardRect(
1322 HANDLE hContext,
1323 uint32_t attachmentMask,
1324 SWR_RECT rect)
1325 {
1326 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1327 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1328
1329 // Queue a load to the hottile
1330 pDC->FeWork.type = DISCARDINVALIDATETILES;
1331 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
1332 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
1333 pDC->FeWork.desc.discardInvalidateTiles.rect = rect;
1334 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_RESOLVED;
1335 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true;
1336 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = true;
1337
1338 //enqueue
1339 QueueDraw(pContext);
1340 }
1341
1342 //////////////////////////////////////////////////////////////////////////
1343 /// @brief SwrDispatch
1344 /// @param hContext - Handle passed back from SwrCreateContext
1345 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
1346 /// @param threadGroupCountY - Number of thread groups dispatched in Y direction
1347 /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
1348 void SwrDispatch(
1349 HANDLE hContext,
1350 uint32_t threadGroupCountX,
1351 uint32_t threadGroupCountY,
1352 uint32_t threadGroupCountZ)
1353 {
1354 if (KNOB_TOSS_DRAW)
1355 {
1356 return;
1357 }
1358
1359 RDTSC_START(APIDispatch);
1360 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1361 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1362
1363 pDC->isCompute = true; // This is a compute context.
1364
1365 COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64);
1366
1367 pTaskData->threadGroupCountX = threadGroupCountX;
1368 pTaskData->threadGroupCountY = threadGroupCountY;
1369 pTaskData->threadGroupCountZ = threadGroupCountZ;
1370
1371 uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
1372 uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
1373 pDC->pDispatch = &pContext->pDispatchQueueArray[dcIndex];
1374 pDC->pDispatch->initialize(totalThreadGroups, pTaskData);
1375
1376 QueueDispatch(pContext);
1377 RDTSC_STOP(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ, 0);
1378 }
1379
1380 // Deswizzles, converts and stores current contents of the hot tiles to surface
1381 // described by pState
1382 void SwrStoreTiles(
1383 HANDLE hContext,
1384 SWR_RENDERTARGET_ATTACHMENT attachment,
1385 SWR_TILE_STATE postStoreTileState)
1386 {
1387 RDTSC_START(APIStoreTiles);
1388
1389 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1390 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1391
1392 SetupMacroTileScissors(pDC);
1393
1394 pDC->FeWork.type = STORETILES;
1395 pDC->FeWork.pfnWork = ProcessStoreTiles;
1396 pDC->FeWork.desc.storeTiles.attachment = attachment;
1397 pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState;
1398
1399 //enqueue
1400 QueueDraw(pContext);
1401
1402 RDTSC_STOP(APIStoreTiles, 0, 0);
1403 }
1404
1405 void SwrClearRenderTarget(
1406 HANDLE hContext,
1407 uint32_t clearMask,
1408 const float clearColor[4],
1409 float z,
1410 uint8_t stencil)
1411 {
1412 RDTSC_START(APIClearRenderTarget);
1413
1414 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1415
1416 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1417
1418 SetupMacroTileScissors(pDC);
1419
1420 CLEAR_FLAGS flags;
1421 flags.mask = clearMask;
1422
1423 pDC->FeWork.type = CLEAR;
1424 pDC->FeWork.pfnWork = ProcessClear;
1425 pDC->FeWork.desc.clear.flags = flags;
1426 pDC->FeWork.desc.clear.clearDepth = z;
1427 pDC->FeWork.desc.clear.clearRTColor[0] = clearColor[0];
1428 pDC->FeWork.desc.clear.clearRTColor[1] = clearColor[1];
1429 pDC->FeWork.desc.clear.clearRTColor[2] = clearColor[2];
1430 pDC->FeWork.desc.clear.clearRTColor[3] = clearColor[3];
1431 pDC->FeWork.desc.clear.clearStencil = stencil;
1432
1433 // enqueue draw
1434 QueueDraw(pContext);
1435
1436 RDTSC_STOP(APIClearRenderTarget, 0, pDC->drawId);
1437 }
1438
1439 //////////////////////////////////////////////////////////////////////////
1440 /// @brief Returns a pointer to the private context state for the current
1441 /// draw operation. This is used for external componets such as the
1442 /// sampler.
1443 /// SWR is responsible for the allocation of the private context state.
1444 /// @param hContext - Handle passed back from SwrCreateContext
1445 VOID* SwrGetPrivateContextState(
1446 HANDLE hContext)
1447 {
1448 SWR_CONTEXT* pContext = GetContext(hContext);
1449 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1450 DRAW_STATE* pState = pDC->pState;
1451
1452 if (pState->pPrivateState == nullptr)
1453 {
1454 pState->pPrivateState = pState->pArena->AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float));
1455 }
1456
1457 return pState->pPrivateState;
1458 }
1459
1460 //////////////////////////////////////////////////////////////////////////
1461 /// @brief Clients can use this to allocate memory for draw/dispatch
1462 /// operations. The memory will automatically be freed once operation
1463 /// has completed. Client can use this to allocate binding tables,
1464 /// etc. needed for shader execution.
1465 /// @param hContext - Handle passed back from SwrCreateContext
1466 /// @param size - Size of allocation
1467 /// @param align - Alignment needed for allocation.
1468 VOID* SwrAllocDrawContextMemory(
1469 HANDLE hContext,
1470 uint32_t size,
1471 uint32_t align)
1472 {
1473 SWR_CONTEXT* pContext = GetContext(hContext);
1474 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1475
1476 return pDC->pState->pArena->AllocAligned(size, align);
1477 }
1478
1479 //////////////////////////////////////////////////////////////////////////
1480 /// @brief Returns pointer to SWR stats.
1481 /// @note The counters are atomically incremented by multiple threads.
1482 /// When calling this, you need to ensure all previous operations
1483 /// have completed.
1484 /// @todo If necessary, add a callback to avoid stalling the pipe to
1485 /// sample the counters.
1486 /// @param hContext - Handle passed back from SwrCreateContext
1487 /// @param pStats - SWR will fill this out for caller.
1488 void SwrGetStats(
1489 HANDLE hContext,
1490 SWR_STATS* pStats)
1491 {
1492 SWR_CONTEXT *pContext = GetContext(hContext);
1493 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1494
1495 pDC->FeWork.type = QUERYSTATS;
1496 pDC->FeWork.pfnWork = ProcessQueryStats;
1497 pDC->FeWork.desc.queryStats.pStats = pStats;
1498
1499 // cannot execute until all previous draws have completed
1500 pDC->dependency = pDC->drawId - 1;
1501
1502 //enqueue
1503 QueueDraw(pContext);
1504 }
1505
1506 //////////////////////////////////////////////////////////////////////////
1507 /// @brief Enables stats counting
1508 /// @param hContext - Handle passed back from SwrCreateContext
1509 /// @param enable - If true then counts are incremented.
1510 void SwrEnableStats(
1511 HANDLE hContext,
1512 bool enable)
1513 {
1514 SWR_CONTEXT *pContext = GetContext(hContext);
1515 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1516
1517 pDC->pState->state.enableStats = enable;
1518 }
1519
1520 //////////////////////////////////////////////////////////////////////////
1521 /// @brief Mark end of frame - used for performance profiling
1522 /// @param hContext - Handle passed back from SwrCreateContext
1523 void SWR_API SwrEndFrame(
1524 HANDLE hContext)
1525 {
1526 RDTSC_ENDFRAME();
1527 SWR_CONTEXT *pContext = GetContext(hContext);
1528 pContext->frameCount++;
1529 }