swr: [rasterizer core] conservative rast backend changes
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / api.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file api.cpp
24 *
25 * @brief API implementation
26 *
27 ******************************************************************************/
28
29 #include <cfloat>
30 #include <cmath>
31 #include <cstdio>
32 #include <new>
33
34 #include "core/api.h"
35 #include "core/backend.h"
36 #include "core/context.h"
37 #include "core/depthstencil.h"
38 #include "core/frontend.h"
39 #include "core/rasterizer.h"
40 #include "core/rdtsc_core.h"
41 #include "core/threads.h"
42 #include "core/tilemgr.h"
43 #include "core/clip.h"
44 #include "core/utils.h"
45
46 #include "common/simdintrin.h"
47 #include "common/os.h"
48
49 void SetupDefaultState(SWR_CONTEXT *pContext);
50
51 static INLINE SWR_CONTEXT* GetContext(HANDLE hContext)
52 {
53 return (SWR_CONTEXT*)hContext;
54 }
55
56 //////////////////////////////////////////////////////////////////////////
57 /// @brief Create SWR Context.
58 /// @param pCreateInfo - pointer to creation info.
59 HANDLE SwrCreateContext(
60 SWR_CREATECONTEXT_INFO* pCreateInfo)
61 {
62 RDTSC_RESET();
63 RDTSC_INIT(0);
64
65 void* pContextMem = AlignedMalloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4);
66 memset(pContextMem, 0, sizeof(SWR_CONTEXT));
67 SWR_CONTEXT *pContext = new (pContextMem) SWR_CONTEXT();
68
69 pContext->driverType = pCreateInfo->driver;
70 pContext->privateStateSize = pCreateInfo->privateStateSize;
71
72 pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
73 pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
74
75 pContext->pMacroTileManagerArray = (MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
76 pContext->pDispatchQueueArray = (DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
77
78 for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
79 {
80 pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
81 new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena);
82 new (&pContext->pDispatchQueueArray[dc]) DispatchQueue();
83
84 pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
85 }
86
87 if (!KNOB_SINGLE_THREADED)
88 {
89 memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
90 memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
91 new (&pContext->WaitLock) std::mutex();
92 new (&pContext->FifosNotEmpty) std::condition_variable();
93
94 CreateThreadPool(pContext, &pContext->threadPool);
95 }
96
97 // Calling createThreadPool() above can set SINGLE_THREADED
98 if (KNOB_SINGLE_THREADED)
99 {
100 SET_KNOB(HYPERTHREADED_FE, false);
101 pContext->NumWorkerThreads = 1;
102 pContext->NumFEThreads = 1;
103 pContext->NumBEThreads = 1;
104 }
105
106 // Allocate scratch space for workers.
107 ///@note We could lazily allocate this but its rather small amount of memory.
108 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
109 {
110 #if defined(_WIN32)
111 uint32_t numaNode = pContext->threadPool.pThreadData ?
112 pContext->threadPool.pThreadData[i].numaId : 0;
113 pContext->pScratch[i] = (uint8_t*)VirtualAllocExNuma(
114 GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE),
115 MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE,
116 numaNode);
117 #else
118 pContext->pScratch[i] = (uint8_t*)AlignedMalloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4);
119 #endif
120 }
121
122 // State setup AFTER context is fully initialized
123 SetupDefaultState(pContext);
124
125 // initialize hot tile manager
126 pContext->pHotTileMgr = new HotTileMgr();
127
128 // initialize function pointer tables
129 InitClearTilesTable();
130
131 // initialize store tiles function
132 pContext->pfnLoadTile = pCreateInfo->pfnLoadTile;
133 pContext->pfnStoreTile = pCreateInfo->pfnStoreTile;
134 pContext->pfnClearTile = pCreateInfo->pfnClearTile;
135
136 // pass pointer to bucket manager back to caller
137 #ifdef KNOB_ENABLE_RDTSC
138 pCreateInfo->pBucketMgr = &gBucketMgr;
139 #endif
140
141 pCreateInfo->contextSaveSize = sizeof(API_STATE);
142
143 return (HANDLE)pContext;
144 }
145
146 void SwrDestroyContext(HANDLE hContext)
147 {
148 SWR_CONTEXT *pContext = GetContext(hContext);
149 DestroyThreadPool(pContext, &pContext->threadPool);
150
151 // free the fifos
152 for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i)
153 {
154 delete pContext->dcRing[i].pArena;
155 delete pContext->dsRing[i].pArena;
156 pContext->pMacroTileManagerArray[i].~MacroTileMgr();
157 pContext->pDispatchQueueArray[i].~DispatchQueue();
158 }
159
160 AlignedFree(pContext->pDispatchQueueArray);
161 AlignedFree(pContext->pMacroTileManagerArray);
162
163 // Free scratch space.
164 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
165 {
166 #if defined(_WIN32)
167 VirtualFree(pContext->pScratch[i], 0, MEM_RELEASE);
168 #else
169 AlignedFree(pContext->pScratch[i]);
170 #endif
171 }
172
173 delete(pContext->pHotTileMgr);
174
175 pContext->~SWR_CONTEXT();
176 AlignedFree(GetContext(hContext));
177 }
178
179 void CopyState(DRAW_STATE& dst, const DRAW_STATE& src)
180 {
181 memcpy(&dst.state, &src.state, sizeof(API_STATE));
182 }
183
184 void WakeAllThreads(SWR_CONTEXT *pContext)
185 {
186 pContext->FifosNotEmpty.notify_all();
187 }
188
189 static TileSet gSingleThreadLockedTiles;
190
191 template<bool IsDraw>
192 void QueueWork(SWR_CONTEXT *pContext)
193 {
194 DRAW_CONTEXT* pDC = pContext->pCurDrawContext;
195 uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
196
197 if (IsDraw)
198 {
199 pDC->pTileMgr = &pContext->pMacroTileManagerArray[dcIndex];
200 pDC->pTileMgr->initialize();
201 }
202
203 // Each worker thread looks at a DC for both FE and BE work at different times and so we
204 // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers
205 // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
206 // then moved on if all work is done.)
207 pContext->pCurDrawContext->threadsDone = pContext->NumFEThreads + pContext->NumBEThreads;
208
209 _ReadWriteBarrier();
210 {
211 std::unique_lock<std::mutex> lock(pContext->WaitLock);
212 pContext->dcRing.Enqueue();
213 }
214
215 if (KNOB_SINGLE_THREADED)
216 {
217 // flush denormals to 0
218 uint32_t mxcsr = _mm_getcsr();
219 _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
220
221 if (IsDraw)
222 {
223 uint32_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
224 WorkOnFifoFE(pContext, 0, curDraw[0]);
225 WorkOnFifoBE(pContext, 0, curDraw[1], gSingleThreadLockedTiles, 0, 0);
226 }
227 else
228 {
229 uint32_t curDispatch = pContext->pCurDrawContext->drawId;
230 WorkOnCompute(pContext, 0, curDispatch);
231 }
232
233 // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
234 while (CompleteDrawContext(pContext, pContext->pCurDrawContext) > 0) {}
235
236 // restore csr
237 _mm_setcsr(mxcsr);
238 }
239 else
240 {
241 RDTSC_START(APIDrawWakeAllThreads);
242 WakeAllThreads(pContext);
243 RDTSC_STOP(APIDrawWakeAllThreads, 1, 0);
244 }
245
246 // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
247 pContext->pPrevDrawContext = pContext->pCurDrawContext;
248 pContext->pCurDrawContext = nullptr;
249 }
250
251 INLINE void QueueDraw(SWR_CONTEXT* pContext)
252 {
253 QueueWork<true>(pContext);
254 }
255
256 INLINE void QueueDispatch(SWR_CONTEXT* pContext)
257 {
258 QueueWork<false>(pContext);
259 }
260
261 DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
262 {
263 RDTSC_START(APIGetDrawContext);
264 // If current draw context is null then need to obtain a new draw context to use from ring.
265 if (pContext->pCurDrawContext == nullptr)
266 {
267 // Need to wait for a free entry.
268 while (pContext->dcRing.IsFull())
269 {
270 _mm_pause();
271 }
272
273 uint64_t curDraw = pContext->dcRing.GetHead();
274 uint32_t dcIndex = curDraw % KNOB_MAX_DRAWS_IN_FLIGHT;
275
276 static uint64_t lastDrawChecked;
277 static uint32_t lastFrameChecked;
278 if ((pContext->frameCount - lastFrameChecked) > 2 ||
279 (curDraw - lastDrawChecked) > 0x10000)
280 {
281 // Take this opportunity to clean-up old arena allocations
282 pContext->cachingArenaAllocator.FreeOldBlocks();
283
284 lastFrameChecked = pContext->frameCount;
285 lastDrawChecked = curDraw;
286 }
287
288 DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
289 pContext->pCurDrawContext = pCurDrawContext;
290
291 // Assign next available entry in DS ring to this DC.
292 uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
293 pCurDrawContext->pState = &pContext->dsRing[dsIndex];
294
295 // Copy previous state to current state.
296 if (pContext->pPrevDrawContext)
297 {
298 DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext;
299
300 // If we're splitting our draw then we can just use the same state from the previous
301 // draw. In this case, we won't increment the DS ring index so the next non-split
302 // draw can receive the state.
303 if (isSplitDraw == false)
304 {
305 CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState);
306
307 // Should have been cleaned up previously
308 SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
309
310 pCurDrawContext->pState->pPrivateState = nullptr;
311
312 pContext->curStateId++; // Progress state ring index forward.
313 }
314 else
315 {
316 // If its a split draw then just copy the state pointer over
317 // since its the same draw.
318 pCurDrawContext->pState = pPrevDrawContext->pState;
319 SWR_ASSERT(pPrevDrawContext->cleanupState == false);
320 }
321 }
322 else
323 {
324 SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
325 pContext->curStateId++; // Progress state ring index forward.
326 }
327
328 SWR_ASSERT(pCurDrawContext->pArena->IsEmpty() == true);
329
330 pCurDrawContext->dependent = false;
331 pCurDrawContext->pContext = pContext;
332 pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
333
334 pCurDrawContext->doneFE = false;
335 pCurDrawContext->FeLock = 0;
336 pCurDrawContext->threadsDone = 0;
337
338 // Assign unique drawId for this DC
339 pCurDrawContext->drawId = pContext->dcRing.GetHead();
340
341 pCurDrawContext->cleanupState = true;
342 }
343 else
344 {
345 SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC");
346 }
347
348 RDTSC_STOP(APIGetDrawContext, 0, 0);
349 return pContext->pCurDrawContext;
350 }
351
352 API_STATE* GetDrawState(SWR_CONTEXT *pContext)
353 {
354 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
355 SWR_ASSERT(pDC->pState != nullptr);
356
357 return &pDC->pState->state;
358 }
359
360 void SWR_API SwrSaveState(
361 HANDLE hContext,
362 void* pOutputStateBlock,
363 size_t memSize)
364 {
365 SWR_CONTEXT *pContext = GetContext(hContext);
366 auto pSrc = GetDrawState(pContext);
367 SWR_ASSERT(pOutputStateBlock && memSize >= sizeof(*pSrc));
368
369 memcpy(pOutputStateBlock, pSrc, sizeof(*pSrc));
370 }
371
372 void SWR_API SwrRestoreState(
373 HANDLE hContext,
374 const void* pStateBlock,
375 size_t memSize)
376 {
377 SWR_CONTEXT *pContext = GetContext(hContext);
378 auto pDst = GetDrawState(pContext);
379 SWR_ASSERT(pStateBlock && memSize >= sizeof(*pDst));
380
381 memcpy(pDst, pStateBlock, sizeof(*pDst));
382 }
383
384 void SetupDefaultState(SWR_CONTEXT *pContext)
385 {
386 API_STATE* pState = GetDrawState(pContext);
387
388 pState->rastState.cullMode = SWR_CULLMODE_NONE;
389 pState->rastState.frontWinding = SWR_FRONTWINDING_CCW;
390 }
391
392 void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint64_t userData2, uint64_t userData3)
393 {
394 RDTSC_START(APISync);
395
396 SWR_ASSERT(pfnFunc != nullptr);
397
398 SWR_CONTEXT *pContext = GetContext(hContext);
399 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
400
401 pDC->FeWork.type = SYNC;
402 pDC->FeWork.pfnWork = ProcessSync;
403 pDC->FeWork.desc.sync.pfnCallbackFunc = pfnFunc;
404 pDC->FeWork.desc.sync.userData = userData;
405 pDC->FeWork.desc.sync.userData2 = userData2;
406 pDC->FeWork.desc.sync.userData3 = userData3;
407
408 // cannot execute until all previous draws have completed
409 pDC->dependent = true;
410
411 //enqueue
412 QueueDraw(pContext);
413
414 RDTSC_STOP(APISync, 1, 0);
415 }
416
417 void SwrWaitForIdle(HANDLE hContext)
418 {
419 SWR_CONTEXT *pContext = GetContext(hContext);
420
421 RDTSC_START(APIWaitForIdle);
422
423 while (!pContext->dcRing.IsEmpty())
424 {
425 _mm_pause();
426 }
427
428 RDTSC_STOP(APIWaitForIdle, 1, 0);
429 }
430
431 void SwrSetVertexBuffers(
432 HANDLE hContext,
433 uint32_t numBuffers,
434 const SWR_VERTEX_BUFFER_STATE* pVertexBuffers)
435 {
436 API_STATE* pState = GetDrawState(GetContext(hContext));
437
438 for (uint32_t i = 0; i < numBuffers; ++i)
439 {
440 const SWR_VERTEX_BUFFER_STATE *pVB = &pVertexBuffers[i];
441 pState->vertexBuffers[pVB->index] = *pVB;
442 }
443 }
444
445 void SwrSetIndexBuffer(
446 HANDLE hContext,
447 const SWR_INDEX_BUFFER_STATE* pIndexBuffer)
448 {
449 API_STATE* pState = GetDrawState(GetContext(hContext));
450
451 pState->indexBuffer = *pIndexBuffer;
452 }
453
454 void SwrSetFetchFunc(
455 HANDLE hContext,
456 PFN_FETCH_FUNC pfnFetchFunc)
457 {
458 API_STATE* pState = GetDrawState(GetContext(hContext));
459
460 pState->pfnFetchFunc = pfnFetchFunc;
461 }
462
463 void SwrSetSoFunc(
464 HANDLE hContext,
465 PFN_SO_FUNC pfnSoFunc,
466 uint32_t streamIndex)
467 {
468 API_STATE* pState = GetDrawState(GetContext(hContext));
469
470 SWR_ASSERT(streamIndex < MAX_SO_STREAMS);
471
472 pState->pfnSoFunc[streamIndex] = pfnSoFunc;
473 }
474
475 void SwrSetSoState(
476 HANDLE hContext,
477 SWR_STREAMOUT_STATE* pSoState)
478 {
479 API_STATE* pState = GetDrawState(GetContext(hContext));
480
481 pState->soState = *pSoState;
482 }
483
484 void SwrSetSoBuffers(
485 HANDLE hContext,
486 SWR_STREAMOUT_BUFFER* pSoBuffer,
487 uint32_t slot)
488 {
489 API_STATE* pState = GetDrawState(GetContext(hContext));
490
491 SWR_ASSERT((slot < 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot);
492
493 pState->soBuffer[slot] = *pSoBuffer;
494 }
495
496 void SwrSetVertexFunc(
497 HANDLE hContext,
498 PFN_VERTEX_FUNC pfnVertexFunc)
499 {
500 API_STATE* pState = GetDrawState(GetContext(hContext));
501
502 pState->pfnVertexFunc = pfnVertexFunc;
503 }
504
505 void SwrSetFrontendState(
506 HANDLE hContext,
507 SWR_FRONTEND_STATE *pFEState)
508 {
509 API_STATE* pState = GetDrawState(GetContext(hContext));
510 pState->frontendState = *pFEState;
511 }
512
513 void SwrSetGsState(
514 HANDLE hContext,
515 SWR_GS_STATE *pGSState)
516 {
517 API_STATE* pState = GetDrawState(GetContext(hContext));
518 pState->gsState = *pGSState;
519 }
520
521 void SwrSetGsFunc(
522 HANDLE hContext,
523 PFN_GS_FUNC pfnGsFunc)
524 {
525 API_STATE* pState = GetDrawState(GetContext(hContext));
526 pState->pfnGsFunc = pfnGsFunc;
527 }
528
529 void SwrSetCsFunc(
530 HANDLE hContext,
531 PFN_CS_FUNC pfnCsFunc,
532 uint32_t totalThreadsInGroup,
533 uint32_t totalSpillFillSize)
534 {
535 API_STATE* pState = GetDrawState(GetContext(hContext));
536 pState->pfnCsFunc = pfnCsFunc;
537 pState->totalThreadsInGroup = totalThreadsInGroup;
538 pState->totalSpillFillSize = totalSpillFillSize;
539 }
540
541 void SwrSetTsState(
542 HANDLE hContext,
543 SWR_TS_STATE *pState)
544 {
545 API_STATE* pApiState = GetDrawState(GetContext(hContext));
546 pApiState->tsState = *pState;
547 }
548
549 void SwrSetHsFunc(
550 HANDLE hContext,
551 PFN_HS_FUNC pfnFunc)
552 {
553 API_STATE* pApiState = GetDrawState(GetContext(hContext));
554 pApiState->pfnHsFunc = pfnFunc;
555 }
556
557 void SwrSetDsFunc(
558 HANDLE hContext,
559 PFN_DS_FUNC pfnFunc)
560 {
561 API_STATE* pApiState = GetDrawState(GetContext(hContext));
562 pApiState->pfnDsFunc = pfnFunc;
563 }
564
565 void SwrSetDepthStencilState(
566 HANDLE hContext,
567 SWR_DEPTH_STENCIL_STATE *pDSState)
568 {
569 API_STATE* pState = GetDrawState(GetContext(hContext));
570
571 pState->depthStencilState = *pDSState;
572 }
573
574 void SwrSetBackendState(
575 HANDLE hContext,
576 SWR_BACKEND_STATE *pBEState)
577 {
578 API_STATE* pState = GetDrawState(GetContext(hContext));
579
580 pState->backendState = *pBEState;
581 }
582
583 void SwrSetPixelShaderState(
584 HANDLE hContext,
585 SWR_PS_STATE *pPSState)
586 {
587 API_STATE *pState = GetDrawState(GetContext(hContext));
588 pState->psState = *pPSState;
589 }
590
591 void SwrSetBlendState(
592 HANDLE hContext,
593 SWR_BLEND_STATE *pBlendState)
594 {
595 API_STATE *pState = GetDrawState(GetContext(hContext));
596 memcpy(&pState->blendState, pBlendState, sizeof(SWR_BLEND_STATE));
597 }
598
599 void SwrSetBlendFunc(
600 HANDLE hContext,
601 uint32_t renderTarget,
602 PFN_BLEND_JIT_FUNC pfnBlendFunc)
603 {
604 SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS);
605 API_STATE *pState = GetDrawState(GetContext(hContext));
606 pState->pfnBlendFunc[renderTarget] = pfnBlendFunc;
607 }
608
609 void SwrSetLinkage(
610 HANDLE hContext,
611 uint32_t mask,
612 const uint8_t* pMap)
613 {
614 API_STATE* pState = GetDrawState(GetContext(hContext));
615
616 static const uint8_t IDENTITY_MAP[] =
617 {
618 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
619 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
620 };
621 static_assert(sizeof(IDENTITY_MAP) == sizeof(pState->linkageMap),
622 "Update for new value of MAX_ATTRIBUTES");
623
624 pState->linkageMask = mask;
625 pState->linkageCount = _mm_popcnt_u32(mask);
626
627 if (!pMap)
628 {
629 pMap = IDENTITY_MAP;
630 }
631 memcpy(pState->linkageMap, pMap, pState->linkageCount);
632 }
633
634 // update guardband multipliers for the viewport
635 void updateGuardband(API_STATE *pState)
636 {
637 // guardband center is viewport center
638 pState->gbState.left = KNOB_GUARDBAND_WIDTH / pState->vp[0].width;
639 pState->gbState.right = KNOB_GUARDBAND_WIDTH / pState->vp[0].width;
640 pState->gbState.top = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height;
641 pState->gbState.bottom = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height;
642 }
643
644 void SwrSetRastState(
645 HANDLE hContext,
646 const SWR_RASTSTATE *pRastState)
647 {
648 SWR_CONTEXT *pContext = GetContext(hContext);
649 API_STATE* pState = GetDrawState(pContext);
650
651 memcpy(&pState->rastState, pRastState, sizeof(SWR_RASTSTATE));
652 }
653
654 void SwrSetViewports(
655 HANDLE hContext,
656 uint32_t numViewports,
657 const SWR_VIEWPORT* pViewports,
658 const SWR_VIEWPORT_MATRIX* pMatrices)
659 {
660 SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS,
661 "Invalid number of viewports.");
662
663 SWR_CONTEXT *pContext = GetContext(hContext);
664 API_STATE* pState = GetDrawState(pContext);
665
666 memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports);
667
668 if (pMatrices != nullptr)
669 {
670 memcpy(&pState->vpMatrix[0], pMatrices, sizeof(SWR_VIEWPORT_MATRIX) * numViewports);
671 }
672 else
673 {
674 // Compute default viewport transform.
675 for (uint32_t i = 0; i < numViewports; ++i)
676 {
677 if (pContext->driverType == DX)
678 {
679 pState->vpMatrix[i].m00 = pState->vp[i].width / 2.0f;
680 pState->vpMatrix[i].m11 = -pState->vp[i].height / 2.0f;
681 pState->vpMatrix[i].m22 = pState->vp[i].maxZ - pState->vp[i].minZ;
682 pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00;
683 pState->vpMatrix[i].m31 = pState->vp[i].y - pState->vpMatrix[i].m11;
684 pState->vpMatrix[i].m32 = pState->vp[i].minZ;
685 }
686 else
687 {
688 // Standard, with the exception that Y is inverted.
689 pState->vpMatrix[i].m00 = (pState->vp[i].width - pState->vp[i].x) / 2.0f;
690 pState->vpMatrix[i].m11 = (pState->vp[i].y - pState->vp[i].height) / 2.0f;
691 pState->vpMatrix[i].m22 = (pState->vp[i].maxZ - pState->vp[i].minZ) / 2.0f;
692 pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00;
693 pState->vpMatrix[i].m31 = pState->vp[i].height + pState->vpMatrix[i].m11;
694 pState->vpMatrix[i].m32 = pState->vp[i].minZ + pState->vpMatrix[i].m22;
695
696 // Now that the matrix is calculated, clip the view coords to screen size.
697 // OpenGL allows for -ve x,y in the viewport.
698 pState->vp[i].x = std::max(pState->vp[i].x, 0.0f);
699 pState->vp[i].y = std::max(pState->vp[i].y, 0.0f);
700 }
701 }
702 }
703
704 updateGuardband(pState);
705 }
706
707 void SwrSetScissorRects(
708 HANDLE hContext,
709 uint32_t numScissors,
710 const BBOX* pScissors)
711 {
712 SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS,
713 "Invalid number of scissor rects.");
714
715 API_STATE* pState = GetDrawState(GetContext(hContext));
716 memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(BBOX));
717 };
718
719 void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
720 {
721 API_STATE *pState = &pDC->pState->state;
722 uint32_t left, right, top, bottom;
723
724 // Set up scissor dimensions based on scissor or viewport
725 if (pState->rastState.scissorEnable)
726 {
727 // scissor rect right/bottom edge are exclusive, core expects scissor dimensions to be inclusive, so subtract one pixel from right/bottom edges
728 left = pState->scissorRects[0].left;
729 right = pState->scissorRects[0].right;
730 top = pState->scissorRects[0].top;
731 bottom = pState->scissorRects[0].bottom;
732 }
733 else
734 {
735 left = (int32_t)pState->vp[0].x;
736 right = (int32_t)pState->vp[0].x + (int32_t)pState->vp[0].width;
737 top = (int32_t)pState->vp[0].y;
738 bottom = (int32_t)pState->vp[0].y + (int32_t)pState->vp[0].height;
739 }
740
741 right = std::min<uint32_t>(right, KNOB_MAX_SCISSOR_X);
742 bottom = std::min<uint32_t>(bottom, KNOB_MAX_SCISSOR_Y);
743
744 if (left > KNOB_MAX_SCISSOR_X || top > KNOB_MAX_SCISSOR_Y)
745 {
746 pState->scissorInFixedPoint.left = 0;
747 pState->scissorInFixedPoint.right = 0;
748 pState->scissorInFixedPoint.top = 0;
749 pState->scissorInFixedPoint.bottom = 0;
750 }
751 else
752 {
753 pState->scissorInFixedPoint.left = left * FIXED_POINT_SCALE;
754 pState->scissorInFixedPoint.right = right * FIXED_POINT_SCALE - 1;
755 pState->scissorInFixedPoint.top = top * FIXED_POINT_SCALE;
756 pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1;
757 }
758 }
759
760 // templated backend function tables
761 extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
762 extern PFN_BACKEND_FUNC gBackendSingleSample[2][2][2];
763 extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][2][2][2][2];
764 extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2];
765 void SetupPipeline(DRAW_CONTEXT *pDC)
766 {
767 DRAW_STATE* pState = pDC->pState;
768 const SWR_RASTSTATE &rastState = pState->state.rastState;
769 const SWR_PS_STATE &psState = pState->state.psState;
770 BACKEND_FUNCS& backendFuncs = pState->backendFuncs;
771 const uint32_t forcedSampleCount = (rastState.forcedSampleCount) ? 1 : 0;
772
773 // setup backend
774 if (psState.pfnPixelShader == nullptr)
775 {
776 backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount];
777 }
778 else
779 {
780 const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.forcedSampleCount) ? 1 : 0;
781 const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
782 const uint32_t canEarlyZ = (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesSourceDepth && !psState.usesUAV)) ? 1 : 0;
783 const uint32_t inputCoverage = (psState.inputCoverage != SWR_INPUT_COVERAGE_NONE) ? 1 : 0;
784
785 SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask;
786
787 // select backend function
788 switch(psState.shadingRate)
789 {
790 case SWR_SHADING_RATE_PIXEL:
791 if(bMultisampleEnable)
792 {
793 // always need to generate I & J per sample for Z interpolation
794 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
795 backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][inputCoverage][centroid][forcedSampleCount][canEarlyZ];
796 }
797 else
798 {
799 // always need to generate I & J per pixel for Z interpolation
800 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK);
801 backendFuncs.pfnBackend = gBackendSingleSample[inputCoverage][centroid][canEarlyZ];
802 }
803 break;
804 case SWR_SHADING_RATE_SAMPLE:
805 SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN);
806 // always need to generate I & J per sample for Z interpolation
807 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
808 backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][inputCoverage][centroid][canEarlyZ];
809 break;
810 default:
811 SWR_ASSERT(0 && "Invalid shading rate");
812 break;
813 }
814 }
815
816 PFN_PROCESS_PRIMS pfnBinner;
817 switch (pState->state.topology)
818 {
819 case TOP_POINT_LIST:
820 pState->pfnProcessPrims = ClipPoints;
821 pfnBinner = BinPoints;
822 break;
823 case TOP_LINE_LIST:
824 case TOP_LINE_STRIP:
825 case TOP_LINE_LOOP:
826 case TOP_LINE_LIST_ADJ:
827 case TOP_LISTSTRIP_ADJ:
828 pState->pfnProcessPrims = ClipLines;
829 pfnBinner = BinLines;
830 break;
831 default:
832 pState->pfnProcessPrims = ClipTriangles;
833 pfnBinner = GetBinTrianglesFunc((rastState.conservativeRast > 0));
834 break;
835 };
836
837 // disable clipper if viewport transform is disabled
838 if (pState->state.frontendState.vpTransformDisable)
839 {
840 pState->pfnProcessPrims = pfnBinner;
841 }
842
843 if ((pState->state.psState.pfnPixelShader == nullptr) &&
844 (pState->state.depthStencilState.depthTestEnable == FALSE) &&
845 (pState->state.depthStencilState.depthWriteEnable == FALSE) &&
846 (pState->state.depthStencilState.stencilTestEnable == FALSE) &&
847 (pState->state.depthStencilState.stencilWriteEnable == FALSE) &&
848 (pState->state.linkageCount == 0))
849 {
850 pState->pfnProcessPrims = nullptr;
851 pState->state.linkageMask = 0;
852 }
853
854 if (pState->state.soState.rasterizerDisable == true)
855 {
856 pState->pfnProcessPrims = nullptr;
857 pState->state.linkageMask = 0;
858 }
859
860 // set up the frontend attrib mask
861 pState->state.feAttribMask = pState->state.linkageMask;
862 if (pState->state.soState.soEnable)
863 {
864 for (uint32_t i = 0; i < 4; ++i)
865 {
866 pState->state.feAttribMask |= pState->state.soState.streamMasks[i];
867 }
868 }
869
870 // complicated logic to test for cases where we don't need backing hottile memory for a draw
871 // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled.
872 pState->state.depthHottileEnable = ((!(pState->state.depthStencilState.depthTestEnable &&
873 !pState->state.depthStencilState.depthWriteEnable &&
874 pState->state.depthStencilState.depthTestFunc == ZFUNC_ALWAYS)) &&
875 (pState->state.depthStencilState.depthTestEnable ||
876 pState->state.depthStencilState.depthWriteEnable)) ? true : false;
877
878 pState->state.stencilHottileEnable = (((!(pState->state.depthStencilState.stencilTestEnable &&
879 !pState->state.depthStencilState.stencilWriteEnable &&
880 pState->state.depthStencilState.stencilTestFunc == ZFUNC_ALWAYS)) ||
881 // for stencil we have to check the double sided state as well
882 (!(pState->state.depthStencilState.doubleSidedStencilTestEnable &&
883 !pState->state.depthStencilState.stencilWriteEnable &&
884 pState->state.depthStencilState.backfaceStencilTestFunc == ZFUNC_ALWAYS))) &&
885 (pState->state.depthStencilState.stencilTestEnable ||
886 pState->state.depthStencilState.stencilWriteEnable)) ? true : false;
887
888 uint32_t numRTs = pState->state.psState.numRenderTargets;
889 pState->state.colorHottileEnable = 0;
890 if (psState.pfnPixelShader != nullptr)
891 {
892 for (uint32_t rt = 0; rt < numRTs; ++rt)
893 {
894 pState->state.colorHottileEnable |=
895 (!pState->state.blendState.renderTarget[rt].writeDisableAlpha ||
896 !pState->state.blendState.renderTarget[rt].writeDisableRed ||
897 !pState->state.blendState.renderTarget[rt].writeDisableGreen ||
898 !pState->state.blendState.renderTarget[rt].writeDisableBlue) ? (1 << rt) : 0;
899 }
900 }
901
902 // Setup depth quantization function
903 if (pState->state.depthHottileEnable)
904 {
905 switch (pState->state.rastState.depthFormat)
906 {
907 case R32_FLOAT_X8X24_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT_X8X24_TYPELESS > ; break;
908 case R32_FLOAT: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ; break;
909 case R24_UNORM_X8_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R24_UNORM_X8_TYPELESS > ; break;
910 case R16_UNORM: pState->state.pfnQuantizeDepth = QuantizeDepth < R16_UNORM > ; break;
911 default: SWR_ASSERT(false, "Unsupported depth format for depth quantiztion.");
912 pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ;
913 }
914 }
915 else
916 {
917 // set up pass-through quantize if depth isn't enabled
918 pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ;
919 }
920 }
921
922 //////////////////////////////////////////////////////////////////////////
923 /// @brief InitDraw
924 /// @param pDC - Draw context to initialize for this draw.
925 void InitDraw(
926 DRAW_CONTEXT *pDC,
927 bool isSplitDraw)
928 {
929 // We don't need to re-setup the scissors/pipeline state again for split draw.
930 if (isSplitDraw == false)
931 {
932 SetupMacroTileScissors(pDC);
933 SetupPipeline(pDC);
934 }
935 }
936
937 //////////////////////////////////////////////////////////////////////////
938 /// @brief We can split the draw for certain topologies for better performance.
939 /// @param totalVerts - Total vertices for draw
940 /// @param topology - Topology used for draw
941 uint32_t MaxVertsPerDraw(
942 DRAW_CONTEXT* pDC,
943 uint32_t totalVerts,
944 PRIMITIVE_TOPOLOGY topology)
945 {
946 API_STATE& state = pDC->pState->state;
947
948 uint32_t vertsPerDraw = totalVerts;
949
950 if (state.soState.soEnable)
951 {
952 return totalVerts;
953 }
954
955 switch (topology)
956 {
957 case TOP_POINT_LIST:
958 case TOP_TRIANGLE_LIST:
959 vertsPerDraw = KNOB_MAX_PRIMS_PER_DRAW;
960 break;
961
962 case TOP_PATCHLIST_1:
963 case TOP_PATCHLIST_2:
964 case TOP_PATCHLIST_3:
965 case TOP_PATCHLIST_4:
966 case TOP_PATCHLIST_5:
967 case TOP_PATCHLIST_6:
968 case TOP_PATCHLIST_7:
969 case TOP_PATCHLIST_8:
970 case TOP_PATCHLIST_9:
971 case TOP_PATCHLIST_10:
972 case TOP_PATCHLIST_11:
973 case TOP_PATCHLIST_12:
974 case TOP_PATCHLIST_13:
975 case TOP_PATCHLIST_14:
976 case TOP_PATCHLIST_15:
977 case TOP_PATCHLIST_16:
978 case TOP_PATCHLIST_17:
979 case TOP_PATCHLIST_18:
980 case TOP_PATCHLIST_19:
981 case TOP_PATCHLIST_20:
982 case TOP_PATCHLIST_21:
983 case TOP_PATCHLIST_22:
984 case TOP_PATCHLIST_23:
985 case TOP_PATCHLIST_24:
986 case TOP_PATCHLIST_25:
987 case TOP_PATCHLIST_26:
988 case TOP_PATCHLIST_27:
989 case TOP_PATCHLIST_28:
990 case TOP_PATCHLIST_29:
991 case TOP_PATCHLIST_30:
992 case TOP_PATCHLIST_31:
993 case TOP_PATCHLIST_32:
994 if (pDC->pState->state.tsState.tsEnable)
995 {
996 uint32_t vertsPerPrim = topology - TOP_PATCHLIST_BASE;
997 vertsPerDraw = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW;
998 }
999 break;
1000
1001 // The Primitive Assembly code can only handle 1 RECT at a time.
1002 case TOP_RECT_LIST:
1003 vertsPerDraw = 3;
1004 break;
1005
1006 default:
1007 // We are not splitting up draws for other topologies.
1008 break;
1009 }
1010
1011 return vertsPerDraw;
1012 }
1013
1014
1015 //////////////////////////////////////////////////////////////////////////
1016 /// @brief DrawInstanced
1017 /// @param hContext - Handle passed back from SwrCreateContext
1018 /// @param topology - Specifies topology for draw.
1019 /// @param numVerts - How many vertices to read sequentially from vertex data (per instance).
1020 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1021 /// @param numInstances - How many instances to render.
1022 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1023 void DrawInstanced(
1024 HANDLE hContext,
1025 PRIMITIVE_TOPOLOGY topology,
1026 uint32_t numVertices,
1027 uint32_t startVertex,
1028 uint32_t numInstances = 1,
1029 uint32_t startInstance = 0)
1030 {
1031 if (KNOB_TOSS_DRAW)
1032 {
1033 return;
1034 }
1035
1036 RDTSC_START(APIDraw);
1037
1038 SWR_CONTEXT *pContext = GetContext(hContext);
1039 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1040
1041 uint32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
1042 uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw);
1043 uint32_t remainingVerts = numVertices;
1044
1045 API_STATE *pState = &pDC->pState->state;
1046 pState->topology = topology;
1047 pState->forceFront = false;
1048
1049 // disable culling for points/lines
1050 uint32_t oldCullMode = pState->rastState.cullMode;
1051 if (topology == TOP_POINT_LIST)
1052 {
1053 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1054 pState->forceFront = true;
1055 }
1056
1057 int draw = 0;
1058 while (remainingVerts)
1059 {
1060 uint32_t numVertsForDraw = (remainingVerts < maxVertsPerDraw) ?
1061 remainingVerts : maxVertsPerDraw;
1062
1063 bool isSplitDraw = (draw > 0) ? true : false;
1064 DRAW_CONTEXT* pDC = GetDrawContext(pContext, isSplitDraw);
1065 InitDraw(pDC, isSplitDraw);
1066
1067 pDC->FeWork.type = DRAW;
1068 pDC->FeWork.pfnWork = GetProcessDrawFunc(
1069 false, // IsIndexed
1070 false, // bEnableCutIndex
1071 pState->tsState.tsEnable,
1072 pState->gsState.gsEnable,
1073 pState->soState.soEnable,
1074 pDC->pState->pfnProcessPrims != nullptr);
1075 pDC->FeWork.desc.draw.numVerts = numVertsForDraw;
1076 pDC->FeWork.desc.draw.startVertex = startVertex;
1077 pDC->FeWork.desc.draw.numInstances = numInstances;
1078 pDC->FeWork.desc.draw.startInstance = startInstance;
1079 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
1080 pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw;
1081
1082 pDC->cleanupState = (remainingVerts == numVertsForDraw);
1083
1084 //enqueue DC
1085 QueueDraw(pContext);
1086
1087 remainingVerts -= numVertsForDraw;
1088 draw++;
1089 }
1090
1091 // restore culling state
1092 pDC = GetDrawContext(pContext);
1093 pDC->pState->state.rastState.cullMode = oldCullMode;
1094
1095 RDTSC_STOP(APIDraw, numVertices * numInstances, 0);
1096 }
1097
1098 //////////////////////////////////////////////////////////////////////////
1099 /// @brief SwrDraw
1100 /// @param hContext - Handle passed back from SwrCreateContext
1101 /// @param topology - Specifies topology for draw.
1102 /// @param startVertex - Specifies start vertex in vertex buffer for draw.
1103 /// @param primCount - Number of vertices.
1104 void SwrDraw(
1105 HANDLE hContext,
1106 PRIMITIVE_TOPOLOGY topology,
1107 uint32_t startVertex,
1108 uint32_t numVertices)
1109 {
1110 DrawInstanced(hContext, topology, numVertices, startVertex);
1111 }
1112
1113 //////////////////////////////////////////////////////////////////////////
1114 /// @brief SwrDrawInstanced
1115 /// @param hContext - Handle passed back from SwrCreateContext
1116 /// @param topology - Specifies topology for draw.
1117 /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
1118 /// @param numInstances - How many instances to render.
1119 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1120 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1121 void SwrDrawInstanced(
1122 HANDLE hContext,
1123 PRIMITIVE_TOPOLOGY topology,
1124 uint32_t numVertsPerInstance,
1125 uint32_t numInstances,
1126 uint32_t startVertex,
1127 uint32_t startInstance
1128 )
1129 {
1130 DrawInstanced(hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance);
1131 }
1132
1133 //////////////////////////////////////////////////////////////////////////
1134 /// @brief DrawIndexedInstanced
1135 /// @param hContext - Handle passed back from SwrCreateContext
1136 /// @param topology - Specifies topology for draw.
1137 /// @param numIndices - Number of indices to read sequentially from index buffer.
1138 /// @param indexOffset - Starting index into index buffer.
1139 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1140 /// @param numInstances - Number of instances to render.
1141 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1142 void DrawIndexedInstance(
1143 HANDLE hContext,
1144 PRIMITIVE_TOPOLOGY topology,
1145 uint32_t numIndices,
1146 uint32_t indexOffset,
1147 int32_t baseVertex,
1148 uint32_t numInstances = 1,
1149 uint32_t startInstance = 0)
1150 {
1151 if (KNOB_TOSS_DRAW)
1152 {
1153 return;
1154 }
1155
1156 RDTSC_START(APIDrawIndexed);
1157
1158 SWR_CONTEXT *pContext = GetContext(hContext);
1159 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1160 API_STATE* pState = &pDC->pState->state;
1161
1162 uint32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
1163 uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw);
1164 uint32_t remainingIndices = numIndices;
1165
1166 uint32_t indexSize = 0;
1167 switch (pState->indexBuffer.format)
1168 {
1169 case R32_UINT: indexSize = sizeof(uint32_t); break;
1170 case R16_UINT: indexSize = sizeof(uint16_t); break;
1171 case R8_UINT: indexSize = sizeof(uint8_t); break;
1172 default:
1173 SWR_ASSERT(0);
1174 }
1175
1176 int draw = 0;
1177 uint8_t *pIB = (uint8_t*)pState->indexBuffer.pIndices;
1178 pIB += (uint64_t)indexOffset * (uint64_t)indexSize;
1179
1180 pState->topology = topology;
1181 pState->forceFront = false;
1182
1183 // disable culling for points/lines
1184 uint32_t oldCullMode = pState->rastState.cullMode;
1185 if (topology == TOP_POINT_LIST)
1186 {
1187 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1188 pState->forceFront = true;
1189 }
1190
1191 while (remainingIndices)
1192 {
1193 uint32_t numIndicesForDraw = (remainingIndices < maxIndicesPerDraw) ?
1194 remainingIndices : maxIndicesPerDraw;
1195
1196 // When breaking up draw, we need to obtain new draw context for each iteration.
1197 bool isSplitDraw = (draw > 0) ? true : false;
1198 pDC = GetDrawContext(pContext, isSplitDraw);
1199 InitDraw(pDC, isSplitDraw);
1200
1201 pDC->FeWork.type = DRAW;
1202 pDC->FeWork.pfnWork = GetProcessDrawFunc(
1203 true, // IsIndexed
1204 pState->frontendState.bEnableCutIndex,
1205 pState->tsState.tsEnable,
1206 pState->gsState.gsEnable,
1207 pState->soState.soEnable,
1208 pDC->pState->pfnProcessPrims != nullptr);
1209 pDC->FeWork.desc.draw.pDC = pDC;
1210 pDC->FeWork.desc.draw.numIndices = numIndicesForDraw;
1211 pDC->FeWork.desc.draw.pIB = (int*)pIB;
1212 pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format;
1213
1214 pDC->FeWork.desc.draw.numInstances = numInstances;
1215 pDC->FeWork.desc.draw.startInstance = startInstance;
1216 pDC->FeWork.desc.draw.baseVertex = baseVertex;
1217 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
1218
1219 pDC->cleanupState = (remainingIndices == numIndicesForDraw);
1220
1221 //enqueue DC
1222 QueueDraw(pContext);
1223
1224 pIB += maxIndicesPerDraw * indexSize;
1225 remainingIndices -= numIndicesForDraw;
1226 draw++;
1227 }
1228
1229 // restore culling state
1230 pDC = GetDrawContext(pContext);
1231 pDC->pState->state.rastState.cullMode = oldCullMode;
1232
1233 RDTSC_STOP(APIDrawIndexed, numIndices * numInstances, 0);
1234 }
1235
1236
1237 //////////////////////////////////////////////////////////////////////////
1238 /// @brief DrawIndexed
1239 /// @param hContext - Handle passed back from SwrCreateContext
1240 /// @param topology - Specifies topology for draw.
1241 /// @param numIndices - Number of indices to read sequentially from index buffer.
1242 /// @param indexOffset - Starting index into index buffer.
1243 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1244 void SwrDrawIndexed(
1245 HANDLE hContext,
1246 PRIMITIVE_TOPOLOGY topology,
1247 uint32_t numIndices,
1248 uint32_t indexOffset,
1249 int32_t baseVertex
1250 )
1251 {
1252 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex);
1253 }
1254
1255 //////////////////////////////////////////////////////////////////////////
1256 /// @brief SwrDrawIndexedInstanced
1257 /// @param hContext - Handle passed back from SwrCreateContext
1258 /// @param topology - Specifies topology for draw.
1259 /// @param numIndices - Number of indices to read sequentially from index buffer.
1260 /// @param numInstances - Number of instances to render.
1261 /// @param indexOffset - Starting index into index buffer.
1262 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1263 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1264 void SwrDrawIndexedInstanced(
1265 HANDLE hContext,
1266 PRIMITIVE_TOPOLOGY topology,
1267 uint32_t numIndices,
1268 uint32_t numInstances,
1269 uint32_t indexOffset,
1270 int32_t baseVertex,
1271 uint32_t startInstance)
1272 {
1273 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance);
1274 }
1275
1276 //////////////////////////////////////////////////////////////////////////
1277 /// @brief SwrInvalidateTiles
1278 /// @param hContext - Handle passed back from SwrCreateContext
1279 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate.
1280 void SwrInvalidateTiles(
1281 HANDLE hContext,
1282 uint32_t attachmentMask)
1283 {
1284 if (KNOB_TOSS_DRAW)
1285 {
1286 return;
1287 }
1288
1289 SWR_CONTEXT *pContext = GetContext(hContext);
1290 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1291
1292 pDC->FeWork.type = DISCARDINVALIDATETILES;
1293 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
1294 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
1295 memset(&pDC->FeWork.desc.discardInvalidateTiles.rect, 0, sizeof(SWR_RECT));
1296 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_INVALID;
1297 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false;
1298 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = false;
1299
1300 //enqueue
1301 QueueDraw(pContext);
1302 }
1303
1304 //////////////////////////////////////////////////////////////////////////
1305 /// @brief SwrDiscardRect
1306 /// @param hContext - Handle passed back from SwrCreateContext
1307 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
1308 /// @param rect - if rect is all zeros, the entire attachment surface will be discarded
1309 void SwrDiscardRect(
1310 HANDLE hContext,
1311 uint32_t attachmentMask,
1312 SWR_RECT rect)
1313 {
1314 if (KNOB_TOSS_DRAW)
1315 {
1316 return;
1317 }
1318
1319 SWR_CONTEXT *pContext = GetContext(hContext);
1320 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1321
1322 // Queue a load to the hottile
1323 pDC->FeWork.type = DISCARDINVALIDATETILES;
1324 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
1325 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
1326 pDC->FeWork.desc.discardInvalidateTiles.rect = rect;
1327 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_RESOLVED;
1328 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true;
1329 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = true;
1330
1331 //enqueue
1332 QueueDraw(pContext);
1333 }
1334
1335 //////////////////////////////////////////////////////////////////////////
1336 /// @brief SwrDispatch
1337 /// @param hContext - Handle passed back from SwrCreateContext
1338 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
1339 /// @param threadGroupCountY - Number of thread groups dispatched in Y direction
1340 /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
1341 void SwrDispatch(
1342 HANDLE hContext,
1343 uint32_t threadGroupCountX,
1344 uint32_t threadGroupCountY,
1345 uint32_t threadGroupCountZ)
1346 {
1347 if (KNOB_TOSS_DRAW)
1348 {
1349 return;
1350 }
1351
1352 RDTSC_START(APIDispatch);
1353 SWR_CONTEXT *pContext = GetContext(hContext);
1354 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1355
1356 pDC->isCompute = true; // This is a compute context.
1357
1358 COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64);
1359
1360 pTaskData->threadGroupCountX = threadGroupCountX;
1361 pTaskData->threadGroupCountY = threadGroupCountY;
1362 pTaskData->threadGroupCountZ = threadGroupCountZ;
1363
1364 uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
1365 uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
1366 pDC->pDispatch = &pContext->pDispatchQueueArray[dcIndex];
1367 pDC->pDispatch->initialize(totalThreadGroups, pTaskData);
1368
1369 QueueDispatch(pContext);
1370 RDTSC_STOP(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ, 0);
1371 }
1372
1373 // Deswizzles, converts and stores current contents of the hot tiles to surface
1374 // described by pState
1375 void SwrStoreTiles(
1376 HANDLE hContext,
1377 SWR_RENDERTARGET_ATTACHMENT attachment,
1378 SWR_TILE_STATE postStoreTileState)
1379 {
1380 if (KNOB_TOSS_DRAW)
1381 {
1382 return;
1383 }
1384
1385 RDTSC_START(APIStoreTiles);
1386
1387 SWR_CONTEXT *pContext = GetContext(hContext);
1388 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1389
1390 SetupMacroTileScissors(pDC);
1391
1392 pDC->FeWork.type = STORETILES;
1393 pDC->FeWork.pfnWork = ProcessStoreTiles;
1394 pDC->FeWork.desc.storeTiles.attachment = attachment;
1395 pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState;
1396
1397 //enqueue
1398 QueueDraw(pContext);
1399
1400 RDTSC_STOP(APIStoreTiles, 0, 0);
1401 }
1402
1403 void SwrClearRenderTarget(
1404 HANDLE hContext,
1405 uint32_t clearMask,
1406 const float clearColor[4],
1407 float z,
1408 uint8_t stencil)
1409 {
1410 if (KNOB_TOSS_DRAW)
1411 {
1412 return;
1413 }
1414
1415 RDTSC_START(APIClearRenderTarget);
1416
1417 SWR_CONTEXT *pContext = GetContext(hContext);
1418
1419 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1420
1421 SetupMacroTileScissors(pDC);
1422
1423 CLEAR_FLAGS flags;
1424 flags.mask = clearMask;
1425
1426 pDC->FeWork.type = CLEAR;
1427 pDC->FeWork.pfnWork = ProcessClear;
1428 pDC->FeWork.desc.clear.flags = flags;
1429 pDC->FeWork.desc.clear.clearDepth = z;
1430 pDC->FeWork.desc.clear.clearRTColor[0] = clearColor[0];
1431 pDC->FeWork.desc.clear.clearRTColor[1] = clearColor[1];
1432 pDC->FeWork.desc.clear.clearRTColor[2] = clearColor[2];
1433 pDC->FeWork.desc.clear.clearRTColor[3] = clearColor[3];
1434 pDC->FeWork.desc.clear.clearStencil = stencil;
1435
1436 // enqueue draw
1437 QueueDraw(pContext);
1438
1439 RDTSC_STOP(APIClearRenderTarget, 0, pDC->drawId);
1440 }
1441
1442 //////////////////////////////////////////////////////////////////////////
1443 /// @brief Returns a pointer to the private context state for the current
1444 /// draw operation. This is used for external componets such as the
1445 /// sampler.
1446 /// SWR is responsible for the allocation of the private context state.
1447 /// @param hContext - Handle passed back from SwrCreateContext
1448 VOID* SwrGetPrivateContextState(
1449 HANDLE hContext)
1450 {
1451 SWR_CONTEXT* pContext = GetContext(hContext);
1452 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1453 DRAW_STATE* pState = pDC->pState;
1454
1455 if (pState->pPrivateState == nullptr)
1456 {
1457 pState->pPrivateState = pState->pArena->AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float));
1458 }
1459
1460 return pState->pPrivateState;
1461 }
1462
1463 //////////////////////////////////////////////////////////////////////////
1464 /// @brief Clients can use this to allocate memory for draw/dispatch
1465 /// operations. The memory will automatically be freed once operation
1466 /// has completed. Client can use this to allocate binding tables,
1467 /// etc. needed for shader execution.
1468 /// @param hContext - Handle passed back from SwrCreateContext
1469 /// @param size - Size of allocation
1470 /// @param align - Alignment needed for allocation.
1471 VOID* SwrAllocDrawContextMemory(
1472 HANDLE hContext,
1473 uint32_t size,
1474 uint32_t align)
1475 {
1476 SWR_CONTEXT* pContext = GetContext(hContext);
1477 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1478
1479 return pDC->pState->pArena->AllocAligned(size, align);
1480 }
1481
1482 //////////////////////////////////////////////////////////////////////////
1483 /// @brief Returns pointer to SWR stats.
1484 /// @note The counters are atomically incremented by multiple threads.
1485 /// When calling this, you need to ensure all previous operations
1486 /// have completed.
1487 /// @todo If necessary, add a callback to avoid stalling the pipe to
1488 /// sample the counters.
1489 /// @param hContext - Handle passed back from SwrCreateContext
1490 /// @param pStats - SWR will fill this out for caller.
1491 void SwrGetStats(
1492 HANDLE hContext,
1493 SWR_STATS* pStats)
1494 {
1495 SWR_CONTEXT *pContext = GetContext(hContext);
1496 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1497
1498 pDC->FeWork.type = QUERYSTATS;
1499 pDC->FeWork.pfnWork = ProcessQueryStats;
1500 pDC->FeWork.desc.queryStats.pStats = pStats;
1501
1502 // cannot execute until all previous draws have completed
1503 pDC->dependent = true;
1504
1505 //enqueue
1506 QueueDraw(pContext);
1507 }
1508
1509 //////////////////////////////////////////////////////////////////////////
1510 /// @brief Enables stats counting
1511 /// @param hContext - Handle passed back from SwrCreateContext
1512 /// @param enable - If true then counts are incremented.
1513 void SwrEnableStats(
1514 HANDLE hContext,
1515 bool enable)
1516 {
1517 SWR_CONTEXT *pContext = GetContext(hContext);
1518 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1519
1520 pDC->pState->state.enableStats = enable;
1521 }
1522
1523 //////////////////////////////////////////////////////////////////////////
1524 /// @brief Mark end of frame - used for performance profiling
1525 /// @param hContext - Handle passed back from SwrCreateContext
1526 void SWR_API SwrEndFrame(
1527 HANDLE hContext)
1528 {
1529 RDTSC_ENDFRAME();
1530 SWR_CONTEXT *pContext = GetContext(hContext);
1531 pContext->frameCount++;
1532 }