swr: [rasterizer core] stop single threaded crash exit crash
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / api.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file api.cpp
24 *
25 * @brief API implementation
26 *
27 ******************************************************************************/
28
29 #include <cfloat>
30 #include <cmath>
31 #include <cstdio>
32 #include <new>
33
34 #include "core/api.h"
35 #include "core/backend.h"
36 #include "core/context.h"
37 #include "core/depthstencil.h"
38 #include "core/frontend.h"
39 #include "core/rasterizer.h"
40 #include "core/rdtsc_core.h"
41 #include "core/threads.h"
42 #include "core/tilemgr.h"
43 #include "core/clip.h"
44 #include "core/utils.h"
45
46 #include "common/simdintrin.h"
47 #include "common/os.h"
48
// Forward declaration: SetupDefaultState is defined further down in this file
// but is called from SwrCreateContext, which precedes the definition.
49 void SetupDefaultState(SWR_CONTEXT *pContext);
50
51 //////////////////////////////////////////////////////////////////////////
52 /// @brief Create SWR Context.
53 /// @param pCreateInfo - pointer to creation info.
///        Read: driver, privateStateSize, pfnLoadTile, pfnStoreTile, pfnClearTile.
///        Written: contextSaveSize (always) and pBucketMgr (only when
///        KNOB_ENABLE_RDTSC is defined).
/// @return The newly created SWR_CONTEXT, cast to HANDLE.
/// NOTE(review): none of the allocations below are checked for failure, and
/// pCreateInfo is dereferenced without a null check.
54 HANDLE SwrCreateContext(
55 SWR_CREATECONTEXT_INFO* pCreateInfo)
56 {
57 RDTSC_RESET();
58 RDTSC_INIT(0);
59
// The context lives in aligned memory: zero the raw storage first, then
// construct the object in place.
60 void* pContextMem = AlignedMalloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4);
61 memset(pContextMem, 0, sizeof(SWR_CONTEXT));
62 SWR_CONTEXT *pContext = new (pContextMem) SWR_CONTEXT();
63
64 pContext->driverType = pCreateInfo->driver;
65 pContext->privateStateSize = pCreateInfo->privateStateSize;
66
// Draw-context ring and draw-state ring both get KNOB_MAX_DRAWS_IN_FLIGHT entries.
67 pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
68 pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
69
// Raw storage for one MacroTileMgr and one DispatchQueue per ring slot; the
// objects themselves are placement-constructed in the loop below.
70 pContext->pMacroTileManagerArray = (MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
71 pContext->pDispatchQueueArray = (DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
72
73 for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
74 {
// Each draw context and each draw state gets its own CachingArena, all backed
// by the context's shared cachingArenaAllocator. The tile manager for a slot
// is bound to that slot's draw-context arena.
75 pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
76 new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena);
77 new (&pContext->pDispatchQueueArray[dc]) DispatchQueue();
78
79 pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
80 }
81
82 if (!KNOB_SINGLE_THREADED)
83 {
// Multi-threaded path only: zero and re-construct the wait lock and the
// condition variable in place, then spin up the worker threads.
84 memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
85 memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
86 new (&pContext->WaitLock) std::mutex();
87 new (&pContext->FifosNotEmpty) std::condition_variable();
88
89 CreateThreadPool(pContext, &pContext->threadPool);
90 }
91
92 // Calling createThreadPool() above can set SINGLE_THREADED
// That is why this is a second, separate test rather than an else branch.
93 if (KNOB_SINGLE_THREADED)
94 {
95 SET_KNOB(HYPERTHREADED_FE, false);
96 pContext->NumWorkerThreads = 1;
97 pContext->NumFEThreads = 1;
98 pContext->NumBEThreads = 1;
99 }
100
101 // Allocate scratch space for workers.
102 ///@note We could lazily allocate this but its rather small amount of memory.
// One buffer of 32 * sizeof(KILOBYTE) bytes per worker thread.
103 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
104 {
105 #if defined(_WIN32)
// On Windows the buffer is allocated on the worker's NUMA node when thread
// data is available, otherwise on node 0.
106 uint32_t numaNode = pContext->threadPool.pThreadData ?
107 pContext->threadPool.pThreadData[i].numaId : 0;
108 pContext->pScratch[i] = (uint8_t*)VirtualAllocExNuma(
109 GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE),
110 MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE,
111 numaNode);
112 #else
113 pContext->pScratch[i] = (uint8_t*)AlignedMalloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4);
114 #endif
115 }
116
117 // State setup AFTER context is fully initialized
// SetupDefaultState goes through GetDrawState, which pulls a draw context from
// the rings initialized above.
118 SetupDefaultState(pContext);
119
120 // initialize hot tile manager
121 pContext->pHotTileMgr = new HotTileMgr();
122
123 // initialize function pointer tables
124 InitClearTilesTable();
125
126 // initialize store tiles function
// (load, store and clear tile callbacks are all taken from the create info)
127 pContext->pfnLoadTile = pCreateInfo->pfnLoadTile;
128 pContext->pfnStoreTile = pCreateInfo->pfnStoreTile;
129 pContext->pfnClearTile = pCreateInfo->pfnClearTile;
130
131 // pass pointer to bucket manager back to caller
132 #ifdef KNOB_ENABLE_RDTSC
133 pCreateInfo->pBucketMgr = &gBucketMgr;
134 #endif
135
// Report the size of the state block handled by SwrSaveState / SwrRestoreState.
136 pCreateInfo->contextSaveSize = sizeof(API_STATE);
137
138 return (HANDLE)pContext;
139 }
140
141 void SwrDestroyContext(HANDLE hContext)
142 {
143 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
144 DestroyThreadPool(pContext, &pContext->threadPool);
145
146 // free the fifos
147 for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i)
148 {
149 delete pContext->dcRing[i].pArena;
150 delete pContext->dsRing[i].pArena;
151 pContext->pMacroTileManagerArray[i].~MacroTileMgr();
152 pContext->pDispatchQueueArray[i].~DispatchQueue();
153 }
154
155 AlignedFree(pContext->pDispatchQueueArray);
156 AlignedFree(pContext->pMacroTileManagerArray);
157
158 // Free scratch space.
159 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
160 {
161 #if defined(_WIN32)
162 VirtualFree(pContext->pScratch[i], 0, MEM_RELEASE);
163 #else
164 AlignedFree(pContext->pScratch[i]);
165 #endif
166 }
167
168 delete(pContext->pHotTileMgr);
169
170 pContext->~SWR_CONTEXT();
171 AlignedFree((SWR_CONTEXT*)hContext);
172 }
173
174 void CopyState(DRAW_STATE& dst, const DRAW_STATE& src)
175 {
176 memcpy(&dst.state, &src.state, sizeof(API_STATE));
177 }
178
179 void WakeAllThreads(SWR_CONTEXT *pContext)
180 {
181 pContext->FifosNotEmpty.notify_all();
182 }
183
// Tile set handed to WorkOnFifoBE on the single-threaded path of QueueWork.
184 static TileSet gSingleThreadLockedTiles;
185
//////////////////////////////////////////////////////////////////////////
/// @brief Enqueue the current draw context on the DC ring and get it
///        processed: inline on the calling thread when KNOB_SINGLE_THREADED is
///        set, otherwise by waking the waiting threads.
/// @tparam IsDraw - true for draws (FE + BE work), false for compute dispatches.
/// @param pContext - SWR context; pCurDrawContext must be non-null on entry
///        and is reset to nullptr on exit (after being saved in
///        pPrevDrawContext).
186 template<bool IsDraw>
187 void QueueWork(SWR_CONTEXT *pContext)
188 {
189 DRAW_CONTEXT* pDC = pContext->pCurDrawContext;
190 uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
191
192 if (IsDraw)
193 {
// Draws use the macro tile manager that belongs to this ring slot.
194 pDC->pTileMgr = &pContext->pMacroTileManagerArray[dcIndex];
195 pDC->pTileMgr->initialize();
196 }
197
198 // Each worker thread looks at a DC for both FE and BE work at different times and so we
199 // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers
200 // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
201 // then moved on if all work is done.)
// NOTE(review): the code below sets the counter to NumFEThreads + NumBEThreads
// rather than literally multiplying by 2.
202 pContext->pCurDrawContext->threadsDone = pContext->NumFEThreads + pContext->NumBEThreads;
203
204 _ReadWriteBarrier();
205 {
// NOTE(review): WaitLock is taken here on every path, while SwrCreateContext
// only re-initializes it in place when !KNOB_SINGLE_THREADED - confirm this
// is intended.
206 std::unique_lock<std::mutex> lock(pContext->WaitLock);
207 pContext->dcRing.Enqueue();
208 }
209
210 if (KNOB_SINGLE_THREADED)
211 {
212 // flush denormals to 0
// The original MXCSR value is saved here and restored at the end of this branch.
213 uint32_t mxcsr = _mm_getcsr();
214 _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
215
216 if (IsDraw)
217 {
// Front-end then back-end work for this draw, both run as worker id 0.
218 uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
219 WorkOnFifoFE(pContext, 0, curDraw[0]);
220 WorkOnFifoBE(pContext, 0, curDraw[1], gSingleThreadLockedTiles, 0, 0);
221 }
222 else
223 {
224 uint64_t curDispatch = pContext->pCurDrawContext->drawId;
225 WorkOnCompute(pContext, 0, curDispatch);
226 }
227
228 // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
229 while (CompleteDrawContext(pContext, pContext->pCurDrawContext) > 0) {}
230
231 // restore csr
232 _mm_setcsr(mxcsr);
233 }
234 else
235 {
236 RDTSC_START(APIDrawWakeAllThreads);
237 WakeAllThreads(pContext);
238 RDTSC_STOP(APIDrawWakeAllThreads, 1, 0);
239 }
240
241 // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
242 pContext->pPrevDrawContext = pContext->pCurDrawContext;
243 pContext->pCurDrawContext = nullptr;
244 }
245
246 INLINE void QueueDraw(SWR_CONTEXT* pContext)
247 {
248 QueueWork<true>(pContext);
249 }
250
251 INLINE void QueueDispatch(SWR_CONTEXT* pContext)
252 {
253 QueueWork<false>(pContext);
254 }
255
//////////////////////////////////////////////////////////////////////////
/// @brief Return the current draw context, acquiring and initializing a new
///        one from the DC ring if there is none.
/// @param pContext - SWR context.
/// @param isSplitDraw - When true and a previous draw context exists, the new
///        draw context shares the previous draw's state instead of receiving a
///        fresh copy. Only valid when a new DC is being obtained (asserted).
/// @return pContext->pCurDrawContext (never reassigned to null in here).
256 DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
257 {
258 RDTSC_START(APIGetDrawContext);
259 // If current draw context is null then need to obtain a new draw context to use from ring.
260 if (pContext->pCurDrawContext == nullptr)
261 {
262 // Need to wait for a free entry.
// Busy-wait: spins with _mm_pause until the ring reports a free slot.
263 while (pContext->dcRing.IsFull())
264 {
265 _mm_pause();
266 }
267
268 uint64_t curDraw = pContext->dcRing.GetHead();
269 uint32_t dcIndex = curDraw % KNOB_MAX_DRAWS_IN_FLIGHT;
270
// Function-local statics: one pair of values for the whole process, not one
// per SWR_CONTEXT.
271 static uint64_t lastDrawChecked;
272 static uint32_t lastFrameChecked;
// Trigger a cleanup when more than 2 frames or more than 0x10000 draws have
// gone by since the last one.
273 if ((pContext->frameCount - lastFrameChecked) > 2 ||
274 (curDraw - lastDrawChecked) > 0x10000)
275 {
276 // Take this opportunity to clean-up old arena allocations
277 pContext->cachingArenaAllocator.FreeOldBlocks();
278
279 lastFrameChecked = pContext->frameCount;
280 lastDrawChecked = curDraw;
281 }
282
283 DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
284 pContext->pCurDrawContext = pCurDrawContext;
285
286 // Assign next available entry in DS ring to this DC.
287 uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
288 pCurDrawContext->pState = &pContext->dsRing[dsIndex];
289
290 // Copy previous state to current state.
291 if (pContext->pPrevDrawContext)
292 {
293 DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext;
294
295 // If we're splitting our draw then we can just use the same state from the previous
296 // draw. In this case, we won't increment the DS ring index so the next non-split
297 // draw can receive the state.
298 if (isSplitDraw == false)
299 {
// CopyState copies only the API_STATE portion; the private state pointer is
// explicitly reset below.
300 CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState);
301
302 // Should have been cleaned up previously
303 SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
304
305 pCurDrawContext->pState->pPrivateState = nullptr;
306
307 pContext->curStateId++; // Progress state ring index forward.
308 }
309 else
310 {
311 // If its a split draw then just copy the state pointer over
312 // since its the same draw.
// This overrides the DS ring entry assigned above.
313 pCurDrawContext->pState = pPrevDrawContext->pState;
314 SWR_ASSERT(pPrevDrawContext->cleanupState == false);
315 }
316 }
317 else
318 {
// No previous draw context: nothing to copy, just consume the DS ring entry.
319 SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
320 pContext->curStateId++; // Progress state ring index forward.
321 }
322
323 SWR_ASSERT(pCurDrawContext->pArena->IsEmpty() == true);
324
// Reset the per-draw bookkeeping fields of the recycled ring entry.
325 pCurDrawContext->dependency = 0;
326 pCurDrawContext->pContext = pContext;
327 pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
328
329 pCurDrawContext->doneFE = false;
330 pCurDrawContext->FeLock = 0;
331 pCurDrawContext->threadsDone = 0;
332
333 // Assign unique drawId for this DC
334 pCurDrawContext->drawId = pContext->dcRing.GetHead();
335
336 pCurDrawContext->cleanupState = true;
337 }
338 else
339 {
340 SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC");
341 }
342
343 RDTSC_STOP(APIGetDrawContext, 0, 0);
344 return pContext->pCurDrawContext;
345 }
346
347 API_STATE* GetDrawState(SWR_CONTEXT *pContext)
348 {
349 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
350 SWR_ASSERT(pDC->pState != nullptr);
351
352 return &pDC->pState->state;
353 }
354
355 void SWR_API SwrSaveState(
356 HANDLE hContext,
357 void* pOutputStateBlock,
358 size_t memSize)
359 {
360 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
361 auto pSrc = GetDrawState(pContext);
362 SWR_ASSERT(pOutputStateBlock && memSize >= sizeof(*pSrc));
363
364 memcpy(pOutputStateBlock, pSrc, sizeof(*pSrc));
365 }
366
367 void SWR_API SwrRestoreState(
368 HANDLE hContext,
369 const void* pStateBlock,
370 size_t memSize)
371 {
372 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
373 auto pDst = GetDrawState(pContext);
374 SWR_ASSERT(pStateBlock && memSize >= sizeof(*pDst));
375
376 memcpy(pDst, pStateBlock, sizeof(*pDst));
377 }
378
379 void SetupDefaultState(SWR_CONTEXT *pContext)
380 {
381 API_STATE* pState = GetDrawState(pContext);
382
383 pState->rastState.cullMode = SWR_CULLMODE_NONE;
384 pState->rastState.frontWinding = SWR_FRONTWINDING_CCW;
385 }
386
387 static INLINE SWR_CONTEXT* GetContext(HANDLE hContext)
388 {
389 return (SWR_CONTEXT*)hContext;
390 }
391
392 void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint64_t userData2, uint64_t userData3)
393 {
394 RDTSC_START(APISync);
395
396 SWR_ASSERT(pfnFunc != nullptr);
397
398 SWR_CONTEXT *pContext = GetContext(hContext);
399 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
400
401 pDC->FeWork.type = SYNC;
402 pDC->FeWork.pfnWork = ProcessSync;
403 pDC->FeWork.desc.sync.pfnCallbackFunc = pfnFunc;
404 pDC->FeWork.desc.sync.userData = userData;
405 pDC->FeWork.desc.sync.userData2 = userData2;
406 pDC->FeWork.desc.sync.userData3 = userData3;
407
408 // cannot execute until all previous draws have completed
409 pDC->dependency = pDC->drawId - 1;
410
411 //enqueue
412 QueueDraw(pContext);
413
414 RDTSC_STOP(APISync, 1, 0);
415 }
416
417 void SwrWaitForIdle(HANDLE hContext)
418 {
419 SWR_CONTEXT *pContext = GetContext(hContext);
420
421 RDTSC_START(APIWaitForIdle);
422
423 while (!pContext->dcRing.IsEmpty())
424 {
425 _mm_pause();
426 }
427
428 RDTSC_STOP(APIWaitForIdle, 1, 0);
429 }
430
431 void SwrSetVertexBuffers(
432 HANDLE hContext,
433 uint32_t numBuffers,
434 const SWR_VERTEX_BUFFER_STATE* pVertexBuffers)
435 {
436 API_STATE* pState = GetDrawState(GetContext(hContext));
437
438 for (uint32_t i = 0; i < numBuffers; ++i)
439 {
440 const SWR_VERTEX_BUFFER_STATE *pVB = &pVertexBuffers[i];
441 pState->vertexBuffers[pVB->index] = *pVB;
442 }
443 }
444
445 void SwrSetIndexBuffer(
446 HANDLE hContext,
447 const SWR_INDEX_BUFFER_STATE* pIndexBuffer)
448 {
449 API_STATE* pState = GetDrawState(GetContext(hContext));
450
451 pState->indexBuffer = *pIndexBuffer;
452 }
453
454 void SwrSetFetchFunc(
455 HANDLE hContext,
456 PFN_FETCH_FUNC pfnFetchFunc)
457 {
458 API_STATE* pState = GetDrawState(GetContext(hContext));
459
460 pState->pfnFetchFunc = pfnFetchFunc;
461 }
462
463 void SwrSetSoFunc(
464 HANDLE hContext,
465 PFN_SO_FUNC pfnSoFunc,
466 uint32_t streamIndex)
467 {
468 API_STATE* pState = GetDrawState(GetContext(hContext));
469
470 SWR_ASSERT(streamIndex < MAX_SO_STREAMS);
471
472 pState->pfnSoFunc[streamIndex] = pfnSoFunc;
473 }
474
475 void SwrSetSoState(
476 HANDLE hContext,
477 SWR_STREAMOUT_STATE* pSoState)
478 {
479 API_STATE* pState = GetDrawState(GetContext(hContext));
480
481 pState->soState = *pSoState;
482 }
483
484 void SwrSetSoBuffers(
485 HANDLE hContext,
486 SWR_STREAMOUT_BUFFER* pSoBuffer,
487 uint32_t slot)
488 {
489 API_STATE* pState = GetDrawState(GetContext(hContext));
490
491 SWR_ASSERT((slot < 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot);
492
493 pState->soBuffer[slot] = *pSoBuffer;
494 }
495
496 void SwrSetVertexFunc(
497 HANDLE hContext,
498 PFN_VERTEX_FUNC pfnVertexFunc)
499 {
500 API_STATE* pState = GetDrawState(GetContext(hContext));
501
502 pState->pfnVertexFunc = pfnVertexFunc;
503 }
504
505 void SwrSetFrontendState(
506 HANDLE hContext,
507 SWR_FRONTEND_STATE *pFEState)
508 {
509 API_STATE* pState = GetDrawState(GetContext(hContext));
510 pState->frontendState = *pFEState;
511 }
512
513 void SwrSetGsState(
514 HANDLE hContext,
515 SWR_GS_STATE *pGSState)
516 {
517 API_STATE* pState = GetDrawState(GetContext(hContext));
518 pState->gsState = *pGSState;
519 }
520
521 void SwrSetGsFunc(
522 HANDLE hContext,
523 PFN_GS_FUNC pfnGsFunc)
524 {
525 API_STATE* pState = GetDrawState(GetContext(hContext));
526 pState->pfnGsFunc = pfnGsFunc;
527 }
528
529 void SwrSetCsFunc(
530 HANDLE hContext,
531 PFN_CS_FUNC pfnCsFunc,
532 uint32_t totalThreadsInGroup,
533 uint32_t totalSpillFillSize)
534 {
535 API_STATE* pState = GetDrawState(GetContext(hContext));
536 pState->pfnCsFunc = pfnCsFunc;
537 pState->totalThreadsInGroup = totalThreadsInGroup;
538 pState->totalSpillFillSize = totalSpillFillSize;
539 }
540
541 void SwrSetTsState(
542 HANDLE hContext,
543 SWR_TS_STATE *pState)
544 {
545 API_STATE* pApiState = GetDrawState(GetContext(hContext));
546 pApiState->tsState = *pState;
547 }
548
549 void SwrSetHsFunc(
550 HANDLE hContext,
551 PFN_HS_FUNC pfnFunc)
552 {
553 API_STATE* pApiState = GetDrawState(GetContext(hContext));
554 pApiState->pfnHsFunc = pfnFunc;
555 }
556
557 void SwrSetDsFunc(
558 HANDLE hContext,
559 PFN_DS_FUNC pfnFunc)
560 {
561 API_STATE* pApiState = GetDrawState(GetContext(hContext));
562 pApiState->pfnDsFunc = pfnFunc;
563 }
564
565 void SwrSetDepthStencilState(
566 HANDLE hContext,
567 SWR_DEPTH_STENCIL_STATE *pDSState)
568 {
569 API_STATE* pState = GetDrawState(GetContext(hContext));
570
571 pState->depthStencilState = *pDSState;
572 }
573
574 void SwrSetBackendState(
575 HANDLE hContext,
576 SWR_BACKEND_STATE *pBEState)
577 {
578 API_STATE* pState = GetDrawState(GetContext(hContext));
579
580 pState->backendState = *pBEState;
581 }
582
583 void SwrSetPixelShaderState(
584 HANDLE hContext,
585 SWR_PS_STATE *pPSState)
586 {
587 API_STATE *pState = GetDrawState(GetContext(hContext));
588 pState->psState = *pPSState;
589 }
590
591 void SwrSetBlendState(
592 HANDLE hContext,
593 SWR_BLEND_STATE *pBlendState)
594 {
595 API_STATE *pState = GetDrawState(GetContext(hContext));
596 memcpy(&pState->blendState, pBlendState, sizeof(SWR_BLEND_STATE));
597 }
598
599 void SwrSetBlendFunc(
600 HANDLE hContext,
601 uint32_t renderTarget,
602 PFN_BLEND_JIT_FUNC pfnBlendFunc)
603 {
604 SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS);
605 API_STATE *pState = GetDrawState(GetContext(hContext));
606 pState->pfnBlendFunc[renderTarget] = pfnBlendFunc;
607 }
608
609 void SwrSetLinkage(
610 HANDLE hContext,
611 uint32_t mask,
612 const uint8_t* pMap)
613 {
614 API_STATE* pState = GetDrawState(GetContext(hContext));
615
616 static const uint8_t IDENTITY_MAP[] =
617 {
618 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
619 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
620 };
621 static_assert(sizeof(IDENTITY_MAP) == sizeof(pState->linkageMap),
622 "Update for new value of MAX_ATTRIBUTES");
623
624 pState->linkageMask = mask;
625 pState->linkageCount = _mm_popcnt_u32(mask);
626
627 if (!pMap)
628 {
629 pMap = IDENTITY_MAP;
630 }
631 memcpy(pState->linkageMap, pMap, pState->linkageCount);
632 }
633
634 // update guardband multipliers for the viewport
635 void updateGuardband(API_STATE *pState)
636 {
637 // guardband center is viewport center
638 pState->gbState.left = KNOB_GUARDBAND_WIDTH / pState->vp[0].width;
639 pState->gbState.right = KNOB_GUARDBAND_WIDTH / pState->vp[0].width;
640 pState->gbState.top = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height;
641 pState->gbState.bottom = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height;
642 }
643
644 void SwrSetRastState(
645 HANDLE hContext,
646 const SWR_RASTSTATE *pRastState)
647 {
648 SWR_CONTEXT *pContext = GetContext(hContext);
649 API_STATE* pState = GetDrawState(pContext);
650
651 memcpy(&pState->rastState, pRastState, sizeof(SWR_RASTSTATE));
652 }
653
654 void SwrSetViewports(
655 HANDLE hContext,
656 uint32_t numViewports,
657 const SWR_VIEWPORT* pViewports,
658 const SWR_VIEWPORT_MATRIX* pMatrices)
659 {
660 SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS,
661 "Invalid number of viewports.");
662
663 SWR_CONTEXT *pContext = GetContext(hContext);
664 API_STATE* pState = GetDrawState(pContext);
665
666 memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports);
667
668 if (pMatrices != nullptr)
669 {
670 memcpy(&pState->vpMatrix[0], pMatrices, sizeof(SWR_VIEWPORT_MATRIX) * numViewports);
671 }
672 else
673 {
674 // Compute default viewport transform.
675 for (uint32_t i = 0; i < numViewports; ++i)
676 {
677 if (pContext->driverType == DX)
678 {
679 pState->vpMatrix[i].m00 = pState->vp[i].width / 2.0f;
680 pState->vpMatrix[i].m11 = -pState->vp[i].height / 2.0f;
681 pState->vpMatrix[i].m22 = pState->vp[i].maxZ - pState->vp[i].minZ;
682 pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00;
683 pState->vpMatrix[i].m31 = pState->vp[i].y - pState->vpMatrix[i].m11;
684 pState->vpMatrix[i].m32 = pState->vp[i].minZ;
685 }
686 else
687 {
688 // Standard, with the exception that Y is inverted.
689 pState->vpMatrix[i].m00 = (pState->vp[i].width - pState->vp[i].x) / 2.0f;
690 pState->vpMatrix[i].m11 = (pState->vp[i].y - pState->vp[i].height) / 2.0f;
691 pState->vpMatrix[i].m22 = (pState->vp[i].maxZ - pState->vp[i].minZ) / 2.0f;
692 pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00;
693 pState->vpMatrix[i].m31 = pState->vp[i].height + pState->vpMatrix[i].m11;
694 pState->vpMatrix[i].m32 = pState->vp[i].minZ + pState->vpMatrix[i].m22;
695
696 // Now that the matrix is calculated, clip the view coords to screen size.
697 // OpenGL allows for -ve x,y in the viewport.
698 pState->vp[i].x = std::max(pState->vp[i].x, 0.0f);
699 pState->vp[i].y = std::max(pState->vp[i].y, 0.0f);
700 }
701 }
702 }
703
704 updateGuardband(pState);
705 }
706
707 void SwrSetScissorRects(
708 HANDLE hContext,
709 uint32_t numScissors,
710 const BBOX* pScissors)
711 {
712 SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS,
713 "Invalid number of scissor rects.");
714
715 API_STATE* pState = GetDrawState(GetContext(hContext));
716 memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(BBOX));
717 };
718
719 void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
720 {
721 API_STATE *pState = &pDC->pState->state;
722 uint32_t left, right, top, bottom;
723
724 // Set up scissor dimensions based on scissor or viewport
725 if (pState->rastState.scissorEnable)
726 {
727 // scissor rect right/bottom edge are exclusive, core expects scissor dimensions to be inclusive, so subtract one pixel from right/bottom edges
728 left = pState->scissorRects[0].left;
729 right = pState->scissorRects[0].right;
730 top = pState->scissorRects[0].top;
731 bottom = pState->scissorRects[0].bottom;
732 }
733 else
734 {
735 left = (int32_t)pState->vp[0].x;
736 right = (int32_t)pState->vp[0].x + (int32_t)pState->vp[0].width;
737 top = (int32_t)pState->vp[0].y;
738 bottom = (int32_t)pState->vp[0].y + (int32_t)pState->vp[0].height;
739 }
740
741 right = std::min<uint32_t>(right, KNOB_MAX_SCISSOR_X);
742 bottom = std::min<uint32_t>(bottom, KNOB_MAX_SCISSOR_Y);
743
744 if (left > KNOB_MAX_SCISSOR_X || top > KNOB_MAX_SCISSOR_Y)
745 {
746 pState->scissorInFixedPoint.left = 0;
747 pState->scissorInFixedPoint.right = 0;
748 pState->scissorInFixedPoint.top = 0;
749 pState->scissorInFixedPoint.bottom = 0;
750 }
751 else
752 {
753 pState->scissorInFixedPoint.left = left * FIXED_POINT_SCALE;
754 pState->scissorInFixedPoint.right = right * FIXED_POINT_SCALE - 1;
755 pState->scissorInFixedPoint.top = top * FIXED_POINT_SCALE;
756 pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1;
757 }
758 }
759
760 // templated backend function tables
// Defined elsewhere; SetupPipeline indexes them to pick the backend function.
761 extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
762 extern PFN_BACKEND_FUNC gBackendSingleSample[2][2][2];
763 extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2][2];
764 extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2][2];
//////////////////////////////////////////////////////////////////////////
/// @brief Derive the per-draw pipeline configuration from the API state:
///        backend function, primitive processing function, frontend attribute
///        mask, hot-tile enables and the depth quantization function.
/// @param pDC - Draw context whose draw state (pDC->pState) is read and updated.
765 void SetupPipeline(DRAW_CONTEXT *pDC)
766 {
767 DRAW_STATE* pState = pDC->pState;
768 const SWR_RASTSTATE &rastState = pState->state.rastState;
769 const SWR_PS_STATE &psState = pState->state.psState;
770 BACKEND_FUNCS& backendFuncs = pState->backendFuncs;
// Collapsed to 0/1 so it can be used as a table index below.
771 const uint32_t forcedSampleCount = (rastState.forcedSampleCount) ? 1 : 0;
772
773 // setup backend
774 if (psState.pfnPixelShader == nullptr)
775 {
// No pixel shader: the null-PS backend is selected by sample count alone.
776 backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount];
777 }
778 else
779 {
// The three flags below are 0/1 values; centroid and canEarlyZ index the
// backend tables.
780 const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.forcedSampleCount) ? 1 : 0;
781 const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
782 const uint32_t canEarlyZ = (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesSourceDepth && !psState.usesUAV)) ? 1 : 0;
783
784 // currently only support 'normal' input coverage
785 SWR_ASSERT(psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL ||
786 psState.inputCoverage == SWR_INPUT_COVERAGE_NONE);
787
// NOTE(review): barycentricsMask is updated in the switch below but is not
// read again or written back anywhere in this function - confirm whether
// the result is meant to be stored.
788 SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask;
789
790 // select backend function
791 switch(psState.shadingRate)
792 {
793 case SWR_SHADING_RATE_PIXEL:
794 if(bMultisampleEnable)
795 {
796 // always need to generate I & J per sample for Z interpolation
797 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
798 backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount][canEarlyZ];
799 }
800 else
801 {
802 // always need to generate I & J per pixel for Z interpolation
803 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK);
804 backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid][canEarlyZ];
805 }
806 break;
807 case SWR_SHADING_RATE_SAMPLE:
808 SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN);
809 // always need to generate I & J per sample for Z interpolation
810 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
811 backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid][canEarlyZ];
812 break;
813 default:
// pfnBackend is left unchanged on this path.
814 SWR_ASSERT(0 && "Invalid shading rate");
815 break;
816 }
817 }
818
// Pick the clipper (pfnProcessPrims) and the matching binner by topology:
// points, lines, or triangles for everything else.
819 PFN_PROCESS_PRIMS pfnBinner;
820 switch (pState->state.topology)
821 {
822 case TOP_POINT_LIST:
823 pState->pfnProcessPrims = ClipPoints;
824 pfnBinner = BinPoints;
825 break;
826 case TOP_LINE_LIST:
827 case TOP_LINE_STRIP:
828 case TOP_LINE_LOOP:
829 case TOP_LINE_LIST_ADJ:
830 case TOP_LISTSTRIP_ADJ:
831 pState->pfnProcessPrims = ClipLines;
832 pfnBinner = BinLines;
833 break;
834 default:
835 pState->pfnProcessPrims = ClipTriangles;
836 pfnBinner = BinTriangles;
837 break;
838 };
839
840 // disable clipper if viewport transform is disabled
// (primitives then go straight to the binner)
841 if (pState->state.frontendState.vpTransformDisable)
842 {
843 pState->pfnProcessPrims = pfnBinner;
844 }
845
// With no pixel shader, no depth/stencil test or write and no linked
// attributes, primitive processing is disabled and the linkage mask cleared.
846 if ((pState->state.psState.pfnPixelShader == nullptr) &&
847 (pState->state.depthStencilState.depthTestEnable == FALSE) &&
848 (pState->state.depthStencilState.depthWriteEnable == FALSE) &&
849 (pState->state.depthStencilState.stencilTestEnable == FALSE) &&
850 (pState->state.depthStencilState.stencilWriteEnable == FALSE) &&
851 (pState->state.linkageCount == 0))
852 {
853 pState->pfnProcessPrims = nullptr;
854 pState->state.linkageMask = 0;
855 }
856
// Same outcome when stream-out has the rasterizer disabled.
857 if (pState->state.soState.rasterizerDisable == true)
858 {
859 pState->pfnProcessPrims = nullptr;
860 pState->state.linkageMask = 0;
861 }
862
863 // set up the frontend attrib mask
// Starts from the linkage mask and, when stream-out is enabled, adds the
// masks of all 4 streams.
864 pState->state.feAttribMask = pState->state.linkageMask;
865 if (pState->state.soState.soEnable)
866 {
867 for (uint32_t i = 0; i < 4; ++i)
868 {
869 pState->state.feAttribMask |= pState->state.soState.streamMasks[i];
870 }
871 }
872
873 // complicated logic to test for cases where we don't need backing hottile memory for a draw
874 // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled.
// Depth hot tile: enabled when depth test or write is on, except for the
// combination test-on / write-off / func ALWAYS.
875 pState->state.depthHottileEnable = ((!(pState->state.depthStencilState.depthTestEnable &&
876 !pState->state.depthStencilState.depthWriteEnable &&
877 pState->state.depthStencilState.depthTestFunc == ZFUNC_ALWAYS)) &&
878 (pState->state.depthStencilState.depthTestEnable ||
879 pState->state.depthStencilState.depthWriteEnable)) ? true : false;
880
// Stencil hot tile: same idea, but the front-face exclusion and the
// double-sided (back-face) exclusion are combined with ||, so the tile stays
// enabled unless both exclusions apply.
881 pState->state.stencilHottileEnable = (((!(pState->state.depthStencilState.stencilTestEnable &&
882 !pState->state.depthStencilState.stencilWriteEnable &&
883 pState->state.depthStencilState.stencilTestFunc == ZFUNC_ALWAYS)) ||
884 // for stencil we have to check the double sided state as well
885 (!(pState->state.depthStencilState.doubleSidedStencilTestEnable &&
886 !pState->state.depthStencilState.stencilWriteEnable &&
887 pState->state.depthStencilState.backfaceStencilTestFunc == ZFUNC_ALWAYS))) &&
888 (pState->state.depthStencilState.stencilTestEnable ||
889 pState->state.depthStencilState.stencilWriteEnable)) ? true : false;
890
// Color hot tiles: one bit per render target, set when a pixel shader is bound
// and at least one channel of that target is write-enabled.
891 uint32_t numRTs = pState->state.psState.numRenderTargets;
892 pState->state.colorHottileEnable = 0;
893 if (psState.pfnPixelShader != nullptr)
894 {
895 for (uint32_t rt = 0; rt < numRTs; ++rt)
896 {
897 pState->state.colorHottileEnable |=
898 (!pState->state.blendState.renderTarget[rt].writeDisableAlpha ||
899 !pState->state.blendState.renderTarget[rt].writeDisableRed ||
900 !pState->state.blendState.renderTarget[rt].writeDisableGreen ||
901 !pState->state.blendState.renderTarget[rt].writeDisableBlue) ? (1 << rt) : 0;
902 }
903 }
904
905 // Setup depth quantization function
906 if (pState->state.depthHottileEnable)
907 {
908 switch (pState->state.rastState.depthFormat)
909 {
910 case R32_FLOAT_X8X24_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT_X8X24_TYPELESS > ; break;
911 case R32_FLOAT: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ; break;
912 case R24_UNORM_X8_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R24_UNORM_X8_TYPELESS > ; break;
913 case R16_UNORM: pState->state.pfnQuantizeDepth = QuantizeDepth < R16_UNORM > ; break;
// Unknown formats assert and then fall back to the R32_FLOAT quantizer.
914 default: SWR_ASSERT(false, "Unsupported depth format for depth quantiztion.");
915 pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ;
916 }
917 }
918 else
919 {
920 // set up pass-through quantize if depth isn't enabled
921 pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ;
922 }
923 }
924
925 //////////////////////////////////////////////////////////////////////////
926 /// @brief InitDraw
927 /// @param pDC - Draw context to initialize for this draw.
928 void InitDraw(
929 DRAW_CONTEXT *pDC,
930 bool isSplitDraw)
931 {
932 // We don't need to re-setup the scissors/pipeline state again for split draw.
933 if (isSplitDraw == false)
934 {
935 SetupMacroTileScissors(pDC);
936 SetupPipeline(pDC);
937 }
938 }
939
940 //////////////////////////////////////////////////////////////////////////
941 /// @brief We can split the draw for certain topologies for better performance.
942 /// @param totalVerts - Total vertices for draw
943 /// @param topology - Topology used for draw
944 uint32_t MaxVertsPerDraw(
945 DRAW_CONTEXT* pDC,
946 uint32_t totalVerts,
947 PRIMITIVE_TOPOLOGY topology)
948 {
949 API_STATE& state = pDC->pState->state;
950
951 uint32_t vertsPerDraw = totalVerts;
952
953 if (state.soState.soEnable)
954 {
955 return totalVerts;
956 }
957
958 switch (topology)
959 {
960 case TOP_POINT_LIST:
961 case TOP_TRIANGLE_LIST:
962 vertsPerDraw = KNOB_MAX_PRIMS_PER_DRAW;
963 break;
964
965 case TOP_PATCHLIST_1:
966 case TOP_PATCHLIST_2:
967 case TOP_PATCHLIST_3:
968 case TOP_PATCHLIST_4:
969 case TOP_PATCHLIST_5:
970 case TOP_PATCHLIST_6:
971 case TOP_PATCHLIST_7:
972 case TOP_PATCHLIST_8:
973 case TOP_PATCHLIST_9:
974 case TOP_PATCHLIST_10:
975 case TOP_PATCHLIST_11:
976 case TOP_PATCHLIST_12:
977 case TOP_PATCHLIST_13:
978 case TOP_PATCHLIST_14:
979 case TOP_PATCHLIST_15:
980 case TOP_PATCHLIST_16:
981 case TOP_PATCHLIST_17:
982 case TOP_PATCHLIST_18:
983 case TOP_PATCHLIST_19:
984 case TOP_PATCHLIST_20:
985 case TOP_PATCHLIST_21:
986 case TOP_PATCHLIST_22:
987 case TOP_PATCHLIST_23:
988 case TOP_PATCHLIST_24:
989 case TOP_PATCHLIST_25:
990 case TOP_PATCHLIST_26:
991 case TOP_PATCHLIST_27:
992 case TOP_PATCHLIST_28:
993 case TOP_PATCHLIST_29:
994 case TOP_PATCHLIST_30:
995 case TOP_PATCHLIST_31:
996 case TOP_PATCHLIST_32:
997 if (pDC->pState->state.tsState.tsEnable)
998 {
999 uint32_t vertsPerPrim = topology - TOP_PATCHLIST_BASE;
1000 vertsPerDraw = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW;
1001 }
1002 break;
1003
1004 // The Primitive Assembly code can only handle 1 RECT at a time.
1005 case TOP_RECT_LIST:
1006 vertsPerDraw = 3;
1007 break;
1008
1009 default:
1010 // We are not splitting up draws for other topologies.
1011 break;
1012 }
1013
1014 return vertsPerDraw;
1015 }
1016
1017
1018 //////////////////////////////////////////////////////////////////////////
1019 /// @brief DrawInstanced
1020 /// @param hContext - Handle passed back from SwrCreateContext
1021 /// @param topology - Specifies topology for draw.
1022 /// @param numVerts - How many vertices to read sequentially from vertex data (per instance).
1023 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1024 /// @param numInstances - How many instances to render.
1025 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1026 void DrawInstanced(
1027 HANDLE hContext,
1028 PRIMITIVE_TOPOLOGY topology,
1029 uint32_t numVertices,
1030 uint32_t startVertex,
1031 uint32_t numInstances = 1,
1032 uint32_t startInstance = 0)
1033 {
1034 if (KNOB_TOSS_DRAW)
1035 {
1036 return;
1037 }
1038
1039 RDTSC_START(APIDraw);
1040
1041 SWR_CONTEXT *pContext = GetContext(hContext);
1042 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1043
1044 uint32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
1045 uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw);
1046 uint32_t remainingVerts = numVertices;
1047
1048 API_STATE *pState = &pDC->pState->state;
1049 pState->topology = topology;
1050 pState->forceFront = false;
1051
1052 // disable culling for points/lines
1053 uint32_t oldCullMode = pState->rastState.cullMode;
1054 if (topology == TOP_POINT_LIST)
1055 {
1056 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1057 pState->forceFront = true;
1058 }
1059
1060 int draw = 0;
1061 while (remainingVerts)
1062 {
1063 uint32_t numVertsForDraw = (remainingVerts < maxVertsPerDraw) ?
1064 remainingVerts : maxVertsPerDraw;
1065
1066 bool isSplitDraw = (draw > 0) ? true : false;
1067 DRAW_CONTEXT* pDC = GetDrawContext(pContext, isSplitDraw);
1068 InitDraw(pDC, isSplitDraw);
1069
1070 pDC->FeWork.type = DRAW;
1071 pDC->FeWork.pfnWork = GetProcessDrawFunc(
1072 false, // IsIndexed
1073 false, // bEnableCutIndex
1074 pState->tsState.tsEnable,
1075 pState->gsState.gsEnable,
1076 pState->soState.soEnable,
1077 pDC->pState->pfnProcessPrims != nullptr);
1078 pDC->FeWork.desc.draw.numVerts = numVertsForDraw;
1079 pDC->FeWork.desc.draw.startVertex = startVertex;
1080 pDC->FeWork.desc.draw.numInstances = numInstances;
1081 pDC->FeWork.desc.draw.startInstance = startInstance;
1082 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
1083 pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw;
1084
1085 pDC->cleanupState = (remainingVerts == numVertsForDraw);
1086
1087 //enqueue DC
1088 QueueDraw(pContext);
1089
1090 remainingVerts -= numVertsForDraw;
1091 draw++;
1092 }
1093
1094 // restore culling state
1095 pDC = GetDrawContext(pContext);
1096 pDC->pState->state.rastState.cullMode = oldCullMode;
1097
1098 RDTSC_STOP(APIDraw, numVertices * numInstances, 0);
1099 }
1100
1101 //////////////////////////////////////////////////////////////////////////
1102 /// @brief SwrDraw
1103 /// @param hContext - Handle passed back from SwrCreateContext
1104 /// @param topology - Specifies topology for draw.
1105 /// @param startVertex - Specifies start vertex in vertex buffer for draw.
1106 /// @param primCount - Number of vertices.
1107 void SwrDraw(
1108 HANDLE hContext,
1109 PRIMITIVE_TOPOLOGY topology,
1110 uint32_t startVertex,
1111 uint32_t numVertices)
1112 {
1113 DrawInstanced(hContext, topology, numVertices, startVertex);
1114 }
1115
1116 //////////////////////////////////////////////////////////////////////////
1117 /// @brief SwrDrawInstanced
1118 /// @param hContext - Handle passed back from SwrCreateContext
1119 /// @param topology - Specifies topology for draw.
1120 /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
1121 /// @param numInstances - How many instances to render.
1122 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1123 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1124 void SwrDrawInstanced(
1125 HANDLE hContext,
1126 PRIMITIVE_TOPOLOGY topology,
1127 uint32_t numVertsPerInstance,
1128 uint32_t numInstances,
1129 uint32_t startVertex,
1130 uint32_t startInstance
1131 )
1132 {
1133 DrawInstanced(hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance);
1134 }
1135
1136 //////////////////////////////////////////////////////////////////////////
1137 /// @brief DrawIndexedInstanced
1138 /// @param hContext - Handle passed back from SwrCreateContext
1139 /// @param topology - Specifies topology for draw.
1140 /// @param numIndices - Number of indices to read sequentially from index buffer.
1141 /// @param indexOffset - Starting index into index buffer.
1142 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1143 /// @param numInstances - Number of instances to render.
1144 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1145 void DrawIndexedInstance(
1146 HANDLE hContext,
1147 PRIMITIVE_TOPOLOGY topology,
1148 uint32_t numIndices,
1149 uint32_t indexOffset,
1150 int32_t baseVertex,
1151 uint32_t numInstances = 1,
1152 uint32_t startInstance = 0)
1153 {
1154 if (KNOB_TOSS_DRAW)
1155 {
1156 return;
1157 }
1158
1159 RDTSC_START(APIDrawIndexed);
1160
1161 SWR_CONTEXT *pContext = GetContext(hContext);
1162 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1163 API_STATE* pState = &pDC->pState->state;
1164
1165 uint32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
1166 uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw);
1167 uint32_t remainingIndices = numIndices;
1168
1169 uint32_t indexSize = 0;
1170 switch (pState->indexBuffer.format)
1171 {
1172 case R32_UINT: indexSize = sizeof(uint32_t); break;
1173 case R16_UINT: indexSize = sizeof(uint16_t); break;
1174 case R8_UINT: indexSize = sizeof(uint8_t); break;
1175 default:
1176 SWR_ASSERT(0);
1177 }
1178
1179 int draw = 0;
1180 uint8_t *pIB = (uint8_t*)pState->indexBuffer.pIndices;
1181 pIB += (uint64_t)indexOffset * (uint64_t)indexSize;
1182
1183 pState->topology = topology;
1184 pState->forceFront = false;
1185
1186 // disable culling for points/lines
1187 uint32_t oldCullMode = pState->rastState.cullMode;
1188 if (topology == TOP_POINT_LIST)
1189 {
1190 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1191 pState->forceFront = true;
1192 }
1193
1194 while (remainingIndices)
1195 {
1196 uint32_t numIndicesForDraw = (remainingIndices < maxIndicesPerDraw) ?
1197 remainingIndices : maxIndicesPerDraw;
1198
1199 // When breaking up draw, we need to obtain new draw context for each iteration.
1200 bool isSplitDraw = (draw > 0) ? true : false;
1201 pDC = GetDrawContext(pContext, isSplitDraw);
1202 InitDraw(pDC, isSplitDraw);
1203
1204 pDC->FeWork.type = DRAW;
1205 pDC->FeWork.pfnWork = GetProcessDrawFunc(
1206 true, // IsIndexed
1207 pState->frontendState.bEnableCutIndex,
1208 pState->tsState.tsEnable,
1209 pState->gsState.gsEnable,
1210 pState->soState.soEnable,
1211 pDC->pState->pfnProcessPrims != nullptr);
1212 pDC->FeWork.desc.draw.pDC = pDC;
1213 pDC->FeWork.desc.draw.numIndices = numIndicesForDraw;
1214 pDC->FeWork.desc.draw.pIB = (int*)pIB;
1215 pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format;
1216
1217 pDC->FeWork.desc.draw.numInstances = numInstances;
1218 pDC->FeWork.desc.draw.startInstance = startInstance;
1219 pDC->FeWork.desc.draw.baseVertex = baseVertex;
1220 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
1221
1222 pDC->cleanupState = (remainingIndices == numIndicesForDraw);
1223
1224 //enqueue DC
1225 QueueDraw(pContext);
1226
1227 pIB += maxIndicesPerDraw * indexSize;
1228 remainingIndices -= numIndicesForDraw;
1229 draw++;
1230 }
1231
1232 // restore culling state
1233 pDC = GetDrawContext(pContext);
1234 pDC->pState->state.rastState.cullMode = oldCullMode;
1235
1236 RDTSC_STOP(APIDrawIndexed, numIndices * numInstances, 0);
1237 }
1238
1239
1240 //////////////////////////////////////////////////////////////////////////
1241 /// @brief DrawIndexed
1242 /// @param hContext - Handle passed back from SwrCreateContext
1243 /// @param topology - Specifies topology for draw.
1244 /// @param numIndices - Number of indices to read sequentially from index buffer.
1245 /// @param indexOffset - Starting index into index buffer.
1246 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1247 void SwrDrawIndexed(
1248 HANDLE hContext,
1249 PRIMITIVE_TOPOLOGY topology,
1250 uint32_t numIndices,
1251 uint32_t indexOffset,
1252 int32_t baseVertex
1253 )
1254 {
1255 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex);
1256 }
1257
1258 //////////////////////////////////////////////////////////////////////////
1259 /// @brief SwrDrawIndexedInstanced
1260 /// @param hContext - Handle passed back from SwrCreateContext
1261 /// @param topology - Specifies topology for draw.
1262 /// @param numIndices - Number of indices to read sequentially from index buffer.
1263 /// @param numInstances - Number of instances to render.
1264 /// @param indexOffset - Starting index into index buffer.
1265 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1266 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1267 void SwrDrawIndexedInstanced(
1268 HANDLE hContext,
1269 PRIMITIVE_TOPOLOGY topology,
1270 uint32_t numIndices,
1271 uint32_t numInstances,
1272 uint32_t indexOffset,
1273 int32_t baseVertex,
1274 uint32_t startInstance)
1275 {
1276 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance);
1277 }
1278
1279 //////////////////////////////////////////////////////////////////////////
1280 /// @brief SwrInvalidateTiles
1281 /// @param hContext - Handle passed back from SwrCreateContext
1282 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate.
1283 void SwrInvalidateTiles(
1284 HANDLE hContext,
1285 uint32_t attachmentMask)
1286 {
1287 if (KNOB_TOSS_DRAW)
1288 {
1289 return;
1290 }
1291
1292 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1293 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1294
1295 pDC->FeWork.type = DISCARDINVALIDATETILES;
1296 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
1297 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
1298 memset(&pDC->FeWork.desc.discardInvalidateTiles.rect, 0, sizeof(SWR_RECT));
1299 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_INVALID;
1300 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false;
1301 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = false;
1302
1303 //enqueue
1304 QueueDraw(pContext);
1305 }
1306
1307 //////////////////////////////////////////////////////////////////////////
1308 /// @brief SwrDiscardRect
1309 /// @param hContext - Handle passed back from SwrCreateContext
1310 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
1311 /// @param rect - if rect is all zeros, the entire attachment surface will be discarded
1312 void SwrDiscardRect(
1313 HANDLE hContext,
1314 uint32_t attachmentMask,
1315 SWR_RECT rect)
1316 {
1317 if (KNOB_TOSS_DRAW)
1318 {
1319 return;
1320 }
1321
1322 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1323 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1324
1325 // Queue a load to the hottile
1326 pDC->FeWork.type = DISCARDINVALIDATETILES;
1327 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
1328 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
1329 pDC->FeWork.desc.discardInvalidateTiles.rect = rect;
1330 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_RESOLVED;
1331 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true;
1332 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = true;
1333
1334 //enqueue
1335 QueueDraw(pContext);
1336 }
1337
1338 //////////////////////////////////////////////////////////////////////////
1339 /// @brief SwrDispatch
1340 /// @param hContext - Handle passed back from SwrCreateContext
1341 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
1342 /// @param threadGroupCountY - Number of thread groups dispatched in Y direction
1343 /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
1344 void SwrDispatch(
1345 HANDLE hContext,
1346 uint32_t threadGroupCountX,
1347 uint32_t threadGroupCountY,
1348 uint32_t threadGroupCountZ)
1349 {
1350 if (KNOB_TOSS_DRAW)
1351 {
1352 return;
1353 }
1354
1355 RDTSC_START(APIDispatch);
1356 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1357 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1358
1359 pDC->isCompute = true; // This is a compute context.
1360
1361 COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64);
1362
1363 pTaskData->threadGroupCountX = threadGroupCountX;
1364 pTaskData->threadGroupCountY = threadGroupCountY;
1365 pTaskData->threadGroupCountZ = threadGroupCountZ;
1366
1367 uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
1368 uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
1369 pDC->pDispatch = &pContext->pDispatchQueueArray[dcIndex];
1370 pDC->pDispatch->initialize(totalThreadGroups, pTaskData);
1371
1372 QueueDispatch(pContext);
1373 RDTSC_STOP(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ, 0);
1374 }
1375
1376 // Deswizzles, converts and stores current contents of the hot tiles to surface
1377 // described by pState
1378 void SwrStoreTiles(
1379 HANDLE hContext,
1380 SWR_RENDERTARGET_ATTACHMENT attachment,
1381 SWR_TILE_STATE postStoreTileState)
1382 {
1383 if (KNOB_TOSS_DRAW)
1384 {
1385 return;
1386 }
1387
1388 RDTSC_START(APIStoreTiles);
1389
1390 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1391 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1392
1393 SetupMacroTileScissors(pDC);
1394
1395 pDC->FeWork.type = STORETILES;
1396 pDC->FeWork.pfnWork = ProcessStoreTiles;
1397 pDC->FeWork.desc.storeTiles.attachment = attachment;
1398 pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState;
1399
1400 //enqueue
1401 QueueDraw(pContext);
1402
1403 RDTSC_STOP(APIStoreTiles, 0, 0);
1404 }
1405
1406 void SwrClearRenderTarget(
1407 HANDLE hContext,
1408 uint32_t clearMask,
1409 const float clearColor[4],
1410 float z,
1411 uint8_t stencil)
1412 {
1413 if (KNOB_TOSS_DRAW)
1414 {
1415 return;
1416 }
1417
1418 RDTSC_START(APIClearRenderTarget);
1419
1420 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1421
1422 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1423
1424 SetupMacroTileScissors(pDC);
1425
1426 CLEAR_FLAGS flags;
1427 flags.mask = clearMask;
1428
1429 pDC->FeWork.type = CLEAR;
1430 pDC->FeWork.pfnWork = ProcessClear;
1431 pDC->FeWork.desc.clear.flags = flags;
1432 pDC->FeWork.desc.clear.clearDepth = z;
1433 pDC->FeWork.desc.clear.clearRTColor[0] = clearColor[0];
1434 pDC->FeWork.desc.clear.clearRTColor[1] = clearColor[1];
1435 pDC->FeWork.desc.clear.clearRTColor[2] = clearColor[2];
1436 pDC->FeWork.desc.clear.clearRTColor[3] = clearColor[3];
1437 pDC->FeWork.desc.clear.clearStencil = stencil;
1438
1439 // enqueue draw
1440 QueueDraw(pContext);
1441
1442 RDTSC_STOP(APIClearRenderTarget, 0, pDC->drawId);
1443 }
1444
1445 //////////////////////////////////////////////////////////////////////////
1446 /// @brief Returns a pointer to the private context state for the current
1447 /// draw operation. This is used for external componets such as the
1448 /// sampler.
1449 /// SWR is responsible for the allocation of the private context state.
1450 /// @param hContext - Handle passed back from SwrCreateContext
1451 VOID* SwrGetPrivateContextState(
1452 HANDLE hContext)
1453 {
1454 SWR_CONTEXT* pContext = GetContext(hContext);
1455 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1456 DRAW_STATE* pState = pDC->pState;
1457
1458 if (pState->pPrivateState == nullptr)
1459 {
1460 pState->pPrivateState = pState->pArena->AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float));
1461 }
1462
1463 return pState->pPrivateState;
1464 }
1465
1466 //////////////////////////////////////////////////////////////////////////
1467 /// @brief Clients can use this to allocate memory for draw/dispatch
1468 /// operations. The memory will automatically be freed once operation
1469 /// has completed. Client can use this to allocate binding tables,
1470 /// etc. needed for shader execution.
1471 /// @param hContext - Handle passed back from SwrCreateContext
1472 /// @param size - Size of allocation
1473 /// @param align - Alignment needed for allocation.
1474 VOID* SwrAllocDrawContextMemory(
1475 HANDLE hContext,
1476 uint32_t size,
1477 uint32_t align)
1478 {
1479 SWR_CONTEXT* pContext = GetContext(hContext);
1480 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1481
1482 return pDC->pState->pArena->AllocAligned(size, align);
1483 }
1484
1485 //////////////////////////////////////////////////////////////////////////
1486 /// @brief Returns pointer to SWR stats.
1487 /// @note The counters are atomically incremented by multiple threads.
1488 /// When calling this, you need to ensure all previous operations
1489 /// have completed.
1490 /// @todo If necessary, add a callback to avoid stalling the pipe to
1491 /// sample the counters.
1492 /// @param hContext - Handle passed back from SwrCreateContext
1493 /// @param pStats - SWR will fill this out for caller.
1494 void SwrGetStats(
1495 HANDLE hContext,
1496 SWR_STATS* pStats)
1497 {
1498 SWR_CONTEXT *pContext = GetContext(hContext);
1499 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1500
1501 pDC->FeWork.type = QUERYSTATS;
1502 pDC->FeWork.pfnWork = ProcessQueryStats;
1503 pDC->FeWork.desc.queryStats.pStats = pStats;
1504
1505 // cannot execute until all previous draws have completed
1506 pDC->dependency = pDC->drawId - 1;
1507
1508 //enqueue
1509 QueueDraw(pContext);
1510 }
1511
1512 //////////////////////////////////////////////////////////////////////////
1513 /// @brief Enables stats counting
1514 /// @param hContext - Handle passed back from SwrCreateContext
1515 /// @param enable - If true then counts are incremented.
1516 void SwrEnableStats(
1517 HANDLE hContext,
1518 bool enable)
1519 {
1520 SWR_CONTEXT *pContext = GetContext(hContext);
1521 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1522
1523 pDC->pState->state.enableStats = enable;
1524 }
1525
1526 //////////////////////////////////////////////////////////////////////////
1527 /// @brief Mark end of frame - used for performance profiling
1528 /// @param hContext - Handle passed back from SwrCreateContext
1529 void SWR_API SwrEndFrame(
1530 HANDLE hContext)
1531 {
1532 RDTSC_ENDFRAME();
1533 SWR_CONTEXT *pContext = GetContext(hContext);
1534 pContext->frameCount++;
1535 }