swr: [rasterizer core] split FE and BE stats
[mesa.git] src/gallium/drivers/swr/rasterizer/core/api.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file api.cpp
24 *
25 * @brief API implementation
26 *
27 ******************************************************************************/
28
29 #include <cfloat>
30 #include <cmath>
31 #include <cstdio>
32 #include <new>
33
34 #include "core/api.h"
35 #include "core/backend.h"
36 #include "core/context.h"
37 #include "core/depthstencil.h"
38 #include "core/frontend.h"
39 #include "core/rasterizer.h"
40 #include "core/rdtsc_core.h"
41 #include "core/threads.h"
42 #include "core/tilemgr.h"
43 #include "core/clip.h"
44 #include "core/utils.h"
45
46 #include "common/simdintrin.h"
47 #include "common/os.h"
48
49 void SetupDefaultState(SWR_CONTEXT *pContext);
50
51 static INLINE SWR_CONTEXT* GetContext(HANDLE hContext)
52 {
53 return (SWR_CONTEXT*)hContext;
54 }
55
56 //////////////////////////////////////////////////////////////////////////
57 /// @brief Create SWR Context.
58 /// @param pCreateInfo - pointer to creation info.
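/// @return Handle to the new context.
///
/// Illustrative usage sketch (not from this file; the field names are the ones this
/// function reads from SWR_CREATECONTEXT_INFO, the values and callbacks are placeholders):
///
///     SWR_CREATECONTEXT_INFO info = {};
///     info.driver = DX;                          // driver type; see SwrSetViewports for DX vs. GL handling
///     info.privateStateSize = sizeof(MyState);   // hypothetical driver-private state size
///     info.pfnLoadTile = MyLoadTile;             // hypothetical hot tile load/store/clear callbacks
///     info.pfnStoreTile = MyStoreTile;
///     info.pfnClearTile = MyClearTile;
///     HANDLE hContext = SwrCreateContext(&info);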
59 HANDLE SwrCreateContext(
60 SWR_CREATECONTEXT_INFO* pCreateInfo)
61 {
62 RDTSC_RESET();
63 RDTSC_INIT(0);
64
65 void* pContextMem = AlignedMalloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4);
66 memset(pContextMem, 0, sizeof(SWR_CONTEXT));
67 SWR_CONTEXT *pContext = new (pContextMem) SWR_CONTEXT();
68
69 pContext->driverType = pCreateInfo->driver;
70 pContext->privateStateSize = pCreateInfo->privateStateSize;
71
72 pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
73 pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
74
75 pContext->pMacroTileManagerArray = (MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
76 pContext->pDispatchQueueArray = (DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
77
78 pContext->threadInfo.MAX_WORKER_THREADS = KNOB_MAX_WORKER_THREADS;
79 pContext->threadInfo.MAX_NUMA_NODES = KNOB_MAX_NUMA_NODES;
80 pContext->threadInfo.MAX_CORES_PER_NUMA_NODE = KNOB_MAX_CORES_PER_NUMA_NODE;
81 pContext->threadInfo.MAX_THREADS_PER_CORE = KNOB_MAX_THREADS_PER_CORE;
82 pContext->threadInfo.SINGLE_THREADED = KNOB_SINGLE_THREADED;
83
84 if (pCreateInfo->pThreadInfo)
85 {
86 pContext->threadInfo = *pCreateInfo->pThreadInfo;
87 }
88
89 for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
90 {
91 pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
92 new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena);
93 new (&pContext->pDispatchQueueArray[dc]) DispatchQueue();
94
95 pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
96 }
97
98 if (!pContext->threadInfo.SINGLE_THREADED)
99 {
100 memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
101 memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
102 new (&pContext->WaitLock) std::mutex();
103 new (&pContext->FifosNotEmpty) std::condition_variable();
104
105 CreateThreadPool(pContext, &pContext->threadPool);
106 }
107
108 // Calling CreateThreadPool() above can set SINGLE_THREADED
109 if (pContext->threadInfo.SINGLE_THREADED)
110 {
111 pContext->NumWorkerThreads = 1;
112 pContext->NumFEThreads = 1;
113 pContext->NumBEThreads = 1;
114 }
115
116 // Allocate scratch space for workers.
117 ///@note We could lazily allocate this, but it's a rather small amount of memory.
118 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
119 {
120 #if defined(_WIN32)
121 uint32_t numaNode = pContext->threadPool.pThreadData ?
122 pContext->threadPool.pThreadData[i].numaId : 0;
123 pContext->pScratch[i] = (uint8_t*)VirtualAllocExNuma(
124 GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE),
125 MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE,
126 numaNode);
127 #else
128 pContext->pScratch[i] = (uint8_t*)AlignedMalloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4);
129 #endif
130 }
131
132 // State setup AFTER context is fully initialized
133 SetupDefaultState(pContext);
134
135 // initialize hot tile manager
136 pContext->pHotTileMgr = new HotTileMgr();
137
138 // initialize function pointer tables
139 InitClearTilesTable();
140
141 // initialize callback functions
142 pContext->pfnLoadTile = pCreateInfo->pfnLoadTile;
143 pContext->pfnStoreTile = pCreateInfo->pfnStoreTile;
144 pContext->pfnClearTile = pCreateInfo->pfnClearTile;
145 pContext->pfnUpdateSoWriteOffset = pCreateInfo->pfnUpdateSoWriteOffset;
146 pContext->pfnUpdateStats = pCreateInfo->pfnUpdateStats;
147 pContext->pfnUpdateStatsFE = pCreateInfo->pfnUpdateStatsFE;
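// Note: backend stats are reported through pfnUpdateStats, while frontend stats are
// reported separately through pfnUpdateStatsFE (FE and BE stats are split).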
148
149 // pass pointer to bucket manager back to caller
150 #ifdef KNOB_ENABLE_RDTSC
151 pCreateInfo->pBucketMgr = &gBucketMgr;
152 #endif
153
154 pCreateInfo->contextSaveSize = sizeof(API_STATE);
155
156 return (HANDLE)pContext;
157 }
158
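//////////////////////////////////////////////////////////////////////////
/// @brief Destroy SWR Context: tears down the thread pool and frees all
///        per-draw-context resources owned by the context.
/// @param hContext - Handle passed back from SwrCreateContext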
159 void SwrDestroyContext(HANDLE hContext)
160 {
161 SWR_CONTEXT *pContext = GetContext(hContext);
162 DestroyThreadPool(pContext, &pContext->threadPool);
163
164 // free the fifos
165 for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i)
166 {
167 delete pContext->dcRing[i].pArena;
168 delete pContext->dsRing[i].pArena;
169 pContext->pMacroTileManagerArray[i].~MacroTileMgr();
170 pContext->pDispatchQueueArray[i].~DispatchQueue();
171 }
172
173 AlignedFree(pContext->pDispatchQueueArray);
174 AlignedFree(pContext->pMacroTileManagerArray);
175
176 // Free scratch space.
177 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
178 {
179 #if defined(_WIN32)
180 VirtualFree(pContext->pScratch[i], 0, MEM_RELEASE);
181 #else
182 AlignedFree(pContext->pScratch[i]);
183 #endif
184 }
185
186 delete(pContext->pHotTileMgr);
187
188 pContext->~SWR_CONTEXT();
189 AlignedFree(GetContext(hContext));
190 }
191
192 void CopyState(DRAW_STATE& dst, const DRAW_STATE& src)
193 {
194 memcpy(&dst.state, &src.state, sizeof(API_STATE));
195 }
196
197 void WakeAllThreads(SWR_CONTEXT *pContext)
198 {
199 pContext->FifosNotEmpty.notify_all();
200 }
201
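// Tile lock set used only by the single-threaded backend path in QueueWork() below.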
202 static TileSet gSingleThreadLockedTiles;
203
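//////////////////////////////////////////////////////////////////////////
/// @brief Enqueues the current draw context on the DC ring and either executes
///        it inline (single-threaded mode) or wakes the worker threads.
/// @tparam IsDraw - true for draw work (FE + BE), false for compute dispatch.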
204 template<bool IsDraw>
205 void QueueWork(SWR_CONTEXT *pContext)
206 {
207 DRAW_CONTEXT* pDC = pContext->pCurDrawContext;
208 uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
209
210 if (IsDraw)
211 {
212 pDC->pTileMgr = &pContext->pMacroTileManagerArray[dcIndex];
213 pDC->pTileMgr->initialize();
214 }
215
216 // Each worker thread looks at a DC for both FE and BE work at different times, so the
217 // threadsDone count starts at the total number of FE plus BE threads. When the threadsDone
218 // counter reaches 0, all workers have moved past this DC (i.e. each worker has checked this
219 // DC for both FE and BE work and then moved on once all work was done).
220 pContext->pCurDrawContext->threadsDone = pContext->NumFEThreads + pContext->NumBEThreads;
221
222 if (IsDraw)
223 {
224 InterlockedIncrement((volatile LONG*)&pContext->drawsOutstandingFE);
225 }
226
227 _ReadWriteBarrier();
228 {
229 std::unique_lock<std::mutex> lock(pContext->WaitLock);
230 pContext->dcRing.Enqueue();
231 }
232
233 if (pContext->threadInfo.SINGLE_THREADED)
234 {
235 // flush denormals to 0
236 uint32_t mxcsr = _mm_getcsr();
237 _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
238
239 if (IsDraw)
240 {
241 uint32_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
242 WorkOnFifoFE(pContext, 0, curDraw[0]);
243 WorkOnFifoBE(pContext, 0, curDraw[1], gSingleThreadLockedTiles, 0, 0);
244 }
245 else
246 {
247 uint32_t curDispatch = pContext->pCurDrawContext->drawId;
248 WorkOnCompute(pContext, 0, curDispatch);
249 }
250
251 // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
252 while (CompleteDrawContext(pContext, pContext->pCurDrawContext) > 0) {}
253
254 // restore csr
255 _mm_setcsr(mxcsr);
256 }
257 else
258 {
259 RDTSC_START(APIDrawWakeAllThreads);
260 WakeAllThreads(pContext);
261 RDTSC_STOP(APIDrawWakeAllThreads, 1, 0);
262 }
263
264 // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
265 pContext->pPrevDrawContext = pContext->pCurDrawContext;
266 pContext->pCurDrawContext = nullptr;
267 }
268
269 INLINE void QueueDraw(SWR_CONTEXT* pContext)
270 {
271 QueueWork<true>(pContext);
272 }
273
274 INLINE void QueueDispatch(SWR_CONTEXT* pContext)
275 {
276 QueueWork<false>(pContext);
277 }
278
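//////////////////////////////////////////////////////////////////////////
/// @brief Returns the current draw context, allocating a new DC from the ring
///        (and a draw-state slot from the DS ring) if none is active.
/// @param isSplitDraw - true when continuing a split draw; the new DC then
///        shares the previous DC's state instead of advancing the state ring.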
279 DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
280 {
281 RDTSC_START(APIGetDrawContext);
282 // If the current draw context is null, we need to obtain a new draw context from the ring.
283 if (pContext->pCurDrawContext == nullptr)
284 {
285 // Need to wait for a free entry.
286 while (pContext->dcRing.IsFull())
287 {
288 _mm_pause();
289 }
290
291 uint64_t curDraw = pContext->dcRing.GetHead();
292 uint32_t dcIndex = curDraw % KNOB_MAX_DRAWS_IN_FLIGHT;
293
294 static uint64_t lastDrawChecked;
295 static uint32_t lastFrameChecked;
296 if ((pContext->frameCount - lastFrameChecked) > 2 ||
297 (curDraw - lastDrawChecked) > 0x10000)
298 {
299 // Take this opportunity to clean up old arena allocations
300 pContext->cachingArenaAllocator.FreeOldBlocks();
301
302 lastFrameChecked = pContext->frameCount;
303 lastDrawChecked = curDraw;
304 }
305
306 DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
307 pContext->pCurDrawContext = pCurDrawContext;
308
309 // Assign next available entry in DS ring to this DC.
310 uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
311 pCurDrawContext->pState = &pContext->dsRing[dsIndex];
312
313 // Copy previous state to current state.
314 if (pContext->pPrevDrawContext)
315 {
316 DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext;
317
318 // If we're splitting our draw then we can just use the same state from the previous
319 // draw. In this case, we won't increment the DS ring index so the next non-split
320 // draw can receive the state.
321 if (isSplitDraw == false)
322 {
323 CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState);
324
325 // Should have been cleaned up previously
326 SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
327
328 pCurDrawContext->pState->pPrivateState = nullptr;
329
330 pContext->curStateId++; // Progress state ring index forward.
331 }
332 else
333 {
334 // If it's a split draw then just copy the state pointer over
335 // since it's the same draw.
336 pCurDrawContext->pState = pPrevDrawContext->pState;
337 SWR_ASSERT(pPrevDrawContext->cleanupState == false);
338 }
339 }
340 else
341 {
342 SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
343 pContext->curStateId++; // Progress state ring index forward.
344 }
345
346 SWR_ASSERT(pCurDrawContext->pArena->IsEmpty() == true);
347
348 pCurDrawContext->dependent = false;
349 pCurDrawContext->pContext = pContext;
350 pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
351
352 pCurDrawContext->doneFE = false;
353 pCurDrawContext->FeLock = 0;
354 pCurDrawContext->threadsDone = 0;
355 pCurDrawContext->retireCallback.pfnCallbackFunc = nullptr;
356
357 memset(&pCurDrawContext->dynState, 0, sizeof(pCurDrawContext->dynState));
358
359 // Assign unique drawId for this DC
360 pCurDrawContext->drawId = pContext->dcRing.GetHead();
361
362 pCurDrawContext->cleanupState = true;
363 }
364 else
365 {
366 SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC");
367 }
368
369 RDTSC_STOP(APIGetDrawContext, 0, 0);
370 return pContext->pCurDrawContext;
371 }
372
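//////////////////////////////////////////////////////////////////////////
/// @brief Returns the mutable API state for the current draw context,
///        allocating a draw context first if necessary.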
373 API_STATE* GetDrawState(SWR_CONTEXT *pContext)
374 {
375 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
376 SWR_ASSERT(pDC->pState != nullptr);
377
378 return &pDC->pState->state;
379 }
380
381 void SWR_API SwrSaveState(
382 HANDLE hContext,
383 void* pOutputStateBlock,
384 size_t memSize)
385 {
386 SWR_CONTEXT *pContext = GetContext(hContext);
387 auto pSrc = GetDrawState(pContext);
388 SWR_ASSERT(pOutputStateBlock && memSize >= sizeof(*pSrc));
389
390 memcpy(pOutputStateBlock, pSrc, sizeof(*pSrc));
391 }
392
393 void SWR_API SwrRestoreState(
394 HANDLE hContext,
395 const void* pStateBlock,
396 size_t memSize)
397 {
398 SWR_CONTEXT *pContext = GetContext(hContext);
399 auto pDst = GetDrawState(pContext);
400 SWR_ASSERT(pStateBlock && memSize >= sizeof(*pDst));
401
402 memcpy(pDst, pStateBlock, sizeof(*pDst));
403 }
404
405 void SetupDefaultState(SWR_CONTEXT *pContext)
406 {
407 API_STATE* pState = GetDrawState(pContext);
408
409 pState->rastState.cullMode = SWR_CULLMODE_NONE;
410 pState->rastState.frontWinding = SWR_FRONTWINDING_CCW;
411 }
412
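//////////////////////////////////////////////////////////////////////////
/// @brief Queues a sync point; pfnFunc is invoked with the three userData
///        values once the sync retires.
/// @param hContext - Handle passed back from SwrCreateContext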
413 void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint64_t userData2, uint64_t userData3)
414 {
415 RDTSC_START(APISync);
416
417 SWR_ASSERT(pfnFunc != nullptr);
418
419 SWR_CONTEXT *pContext = GetContext(hContext);
420 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
421
422 pDC->FeWork.type = SYNC;
423 pDC->FeWork.pfnWork = ProcessSync;
424
425 // Setup callback function
426 pDC->retireCallback.pfnCallbackFunc = pfnFunc;
427 pDC->retireCallback.userData = userData;
428 pDC->retireCallback.userData2 = userData2;
429 pDC->retireCallback.userData3 = userData3;
430
431 //enqueue
432 QueueDraw(pContext);
433
434 RDTSC_STOP(APISync, 1, 0);
435 }
436
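//////////////////////////////////////////////////////////////////////////
/// @brief Blocks until the DC ring is empty, i.e. all queued work has retired.
/// @param hContext - Handle passed back from SwrCreateContext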
437 void SwrWaitForIdle(HANDLE hContext)
438 {
439 SWR_CONTEXT *pContext = GetContext(hContext);
440
441 RDTSC_START(APIWaitForIdle);
442
443 while (!pContext->dcRing.IsEmpty())
444 {
445 _mm_pause();
446 }
447
448 RDTSC_STOP(APIWaitForIdle, 1, 0);
449 }
450
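//////////////////////////////////////////////////////////////////////////
/// @brief Blocks until all outstanding frontend (FE) work has completed;
///        backend work may still be in flight.
/// @param hContext - Handle passed back from SwrCreateContext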
451 void SwrWaitForIdleFE(HANDLE hContext)
452 {
453 SWR_CONTEXT *pContext = GetContext(hContext);
454
455 RDTSC_START(APIWaitForIdle);
456
457 while (pContext->drawsOutstandingFE > 0)
458 {
459 _mm_pause();
460 }
461
462 RDTSC_STOP(APIWaitForIdle, 1, 0);
463 }
464
465 void SwrSetVertexBuffers(
466 HANDLE hContext,
467 uint32_t numBuffers,
468 const SWR_VERTEX_BUFFER_STATE* pVertexBuffers)
469 {
470 API_STATE* pState = GetDrawState(GetContext(hContext));
471
472 for (uint32_t i = 0; i < numBuffers; ++i)
473 {
474 const SWR_VERTEX_BUFFER_STATE *pVB = &pVertexBuffers[i];
475 pState->vertexBuffers[pVB->index] = *pVB;
476 }
477 }
478
479 void SwrSetIndexBuffer(
480 HANDLE hContext,
481 const SWR_INDEX_BUFFER_STATE* pIndexBuffer)
482 {
483 API_STATE* pState = GetDrawState(GetContext(hContext));
484
485 pState->indexBuffer = *pIndexBuffer;
486 }
487
488 void SwrSetFetchFunc(
489 HANDLE hContext,
490 PFN_FETCH_FUNC pfnFetchFunc)
491 {
492 API_STATE* pState = GetDrawState(GetContext(hContext));
493
494 pState->pfnFetchFunc = pfnFetchFunc;
495 }
496
497 void SwrSetSoFunc(
498 HANDLE hContext,
499 PFN_SO_FUNC pfnSoFunc,
500 uint32_t streamIndex)
501 {
502 API_STATE* pState = GetDrawState(GetContext(hContext));
503
504 SWR_ASSERT(streamIndex < MAX_SO_STREAMS);
505
506 pState->pfnSoFunc[streamIndex] = pfnSoFunc;
507 }
508
509 void SwrSetSoState(
510 HANDLE hContext,
511 SWR_STREAMOUT_STATE* pSoState)
512 {
513 API_STATE* pState = GetDrawState(GetContext(hContext));
514
515 pState->soState = *pSoState;
516 }
517
518 void SwrSetSoBuffers(
519 HANDLE hContext,
520 SWR_STREAMOUT_BUFFER* pSoBuffer,
521 uint32_t slot)
522 {
523 API_STATE* pState = GetDrawState(GetContext(hContext));
524
525 SWR_ASSERT((slot < 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot);
526
527 pState->soBuffer[slot] = *pSoBuffer;
528 }
529
530 void SwrSetVertexFunc(
531 HANDLE hContext,
532 PFN_VERTEX_FUNC pfnVertexFunc)
533 {
534 API_STATE* pState = GetDrawState(GetContext(hContext));
535
536 pState->pfnVertexFunc = pfnVertexFunc;
537 }
538
539 void SwrSetFrontendState(
540 HANDLE hContext,
541 SWR_FRONTEND_STATE *pFEState)
542 {
543 API_STATE* pState = GetDrawState(GetContext(hContext));
544 pState->frontendState = *pFEState;
545 }
546
547 void SwrSetGsState(
548 HANDLE hContext,
549 SWR_GS_STATE *pGSState)
550 {
551 API_STATE* pState = GetDrawState(GetContext(hContext));
552 pState->gsState = *pGSState;
553 }
554
555 void SwrSetGsFunc(
556 HANDLE hContext,
557 PFN_GS_FUNC pfnGsFunc)
558 {
559 API_STATE* pState = GetDrawState(GetContext(hContext));
560 pState->pfnGsFunc = pfnGsFunc;
561 }
562
563 void SwrSetCsFunc(
564 HANDLE hContext,
565 PFN_CS_FUNC pfnCsFunc,
566 uint32_t totalThreadsInGroup,
567 uint32_t totalSpillFillSize)
568 {
569 API_STATE* pState = GetDrawState(GetContext(hContext));
570 pState->pfnCsFunc = pfnCsFunc;
571 pState->totalThreadsInGroup = totalThreadsInGroup;
572 pState->totalSpillFillSize = totalSpillFillSize;
573 }
574
575 void SwrSetTsState(
576 HANDLE hContext,
577 SWR_TS_STATE *pState)
578 {
579 API_STATE* pApiState = GetDrawState(GetContext(hContext));
580 pApiState->tsState = *pState;
581 }
582
583 void SwrSetHsFunc(
584 HANDLE hContext,
585 PFN_HS_FUNC pfnFunc)
586 {
587 API_STATE* pApiState = GetDrawState(GetContext(hContext));
588 pApiState->pfnHsFunc = pfnFunc;
589 }
590
591 void SwrSetDsFunc(
592 HANDLE hContext,
593 PFN_DS_FUNC pfnFunc)
594 {
595 API_STATE* pApiState = GetDrawState(GetContext(hContext));
596 pApiState->pfnDsFunc = pfnFunc;
597 }
598
599 void SwrSetDepthStencilState(
600 HANDLE hContext,
601 SWR_DEPTH_STENCIL_STATE *pDSState)
602 {
603 API_STATE* pState = GetDrawState(GetContext(hContext));
604
605 pState->depthStencilState = *pDSState;
606 }
607
608 void SwrSetBackendState(
609 HANDLE hContext,
610 SWR_BACKEND_STATE *pBEState)
611 {
612 API_STATE* pState = GetDrawState(GetContext(hContext));
613
614 pState->backendState = *pBEState;
615 }
616
617 void SwrSetPixelShaderState(
618 HANDLE hContext,
619 SWR_PS_STATE *pPSState)
620 {
621 API_STATE *pState = GetDrawState(GetContext(hContext));
622 pState->psState = *pPSState;
623 }
624
625 void SwrSetBlendState(
626 HANDLE hContext,
627 SWR_BLEND_STATE *pBlendState)
628 {
629 API_STATE *pState = GetDrawState(GetContext(hContext));
630 memcpy(&pState->blendState, pBlendState, sizeof(SWR_BLEND_STATE));
631 }
632
633 void SwrSetBlendFunc(
634 HANDLE hContext,
635 uint32_t renderTarget,
636 PFN_BLEND_JIT_FUNC pfnBlendFunc)
637 {
638 SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS);
639 API_STATE *pState = GetDrawState(GetContext(hContext));
640 pState->pfnBlendFunc[renderTarget] = pfnBlendFunc;
641 }
642
643 // update guardband multipliers for the viewport
644 void updateGuardband(API_STATE *pState)
645 {
646 // guardband center is viewport center
647 pState->gbState.left = KNOB_GUARDBAND_WIDTH / pState->vp[0].width;
648 pState->gbState.right = KNOB_GUARDBAND_WIDTH / pState->vp[0].width;
649 pState->gbState.top = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height;
650 pState->gbState.bottom = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height;
651 }
652
653 void SwrSetRastState(
654 HANDLE hContext,
655 const SWR_RASTSTATE *pRastState)
656 {
657 SWR_CONTEXT *pContext = GetContext(hContext);
658 API_STATE* pState = GetDrawState(pContext);
659
660 memcpy(&pState->rastState, pRastState, sizeof(SWR_RASTSTATE));
661 }
662
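//////////////////////////////////////////////////////////////////////////
/// @brief Sets the viewport(s) and, if no matrices are supplied, computes the
///        default viewport transform.
/// Worked example (non-DX path below): a viewport of x=0, y=0, width=640,
/// height=480, minZ=0, maxZ=1 yields m00=320, m11=-240, m22=0.5, m30=320,
/// m31=240, m32=0.5.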
663 void SwrSetViewports(
664 HANDLE hContext,
665 uint32_t numViewports,
666 const SWR_VIEWPORT* pViewports,
667 const SWR_VIEWPORT_MATRICES* pMatrices)
668 {
669 SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS,
670 "Invalid number of viewports.");
671
672 SWR_CONTEXT *pContext = GetContext(hContext);
673 API_STATE* pState = GetDrawState(pContext);
674
675 memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports);
676
677 if (pMatrices != nullptr)
678 {
679 //memcpy(&pState->vpMatrix[0], pMatrices, sizeof(SWR_VIEWPORT_MATRIX) * numViewports);
680 // @todo Faster to copy portions of the SOA or just copy all of it?
681 memcpy(&pState->vpMatrices, pMatrices, sizeof(SWR_VIEWPORT_MATRICES));
682 }
683 else
684 {
685 // Compute default viewport transform.
686 for (uint32_t i = 0; i < numViewports; ++i)
687 {
688 if (pContext->driverType == DX)
689 {
690 pState->vpMatrices.m00[i] = pState->vp[i].width / 2.0f;
691 pState->vpMatrices.m11[i] = -pState->vp[i].height / 2.0f;
692 pState->vpMatrices.m22[i] = pState->vp[i].maxZ - pState->vp[i].minZ;
693 pState->vpMatrices.m30[i] = pState->vp[i].x + pState->vpMatrices.m00[i];
694 pState->vpMatrices.m31[i] = pState->vp[i].y - pState->vpMatrices.m11[i];
695 pState->vpMatrices.m32[i] = pState->vp[i].minZ;
696 }
697 else
698 {
699 // Standard, with the exception that Y is inverted.
700 pState->vpMatrices.m00[i] = (pState->vp[i].width - pState->vp[i].x) / 2.0f;
701 pState->vpMatrices.m11[i] = (pState->vp[i].y - pState->vp[i].height) / 2.0f;
702 pState->vpMatrices.m22[i] = (pState->vp[i].maxZ - pState->vp[i].minZ) / 2.0f;
703 pState->vpMatrices.m30[i] = pState->vp[i].x + pState->vpMatrices.m00[i];
704 pState->vpMatrices.m31[i] = pState->vp[i].height + pState->vpMatrices.m11[i];
705 pState->vpMatrices.m32[i] = pState->vp[i].minZ + pState->vpMatrices.m22[i];
706
707 // Now that the matrix is calculated, clamp the viewport origin to be non-negative.
708 // OpenGL allows negative x,y in the viewport.
709 pState->vp[i].x = std::max(pState->vp[i].x, 0.0f);
710 pState->vp[i].y = std::max(pState->vp[i].y, 0.0f);
711 }
712 }
713 }
714
715 updateGuardband(pState);
716 }
717
718 void SwrSetScissorRects(
719 HANDLE hContext,
720 uint32_t numScissors,
721 const BBOX* pScissors)
722 {
723 SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS,
724 "Invalid number of scissor rects.");
725
726 API_STATE* pState = GetDrawState(GetContext(hContext));
727 memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(BBOX));
728 }
729
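//////////////////////////////////////////////////////////////////////////
/// @brief Converts the active scissor rect (or the viewport when scissor is
///        disabled) into the inclusive fixed-point rect used by the rasterizer.
/// Worked example: left=16, right=256 becomes left = 16 * FIXED_POINT_SCALE and
/// right = 256 * FIXED_POINT_SCALE - 1 (the -1 makes the exclusive edge inclusive).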
730 void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
731 {
732 API_STATE *pState = &pDC->pState->state;
733 uint32_t left, right, top, bottom;
734
735 // Set up scissor dimensions based on scissor or viewport
736 if (pState->rastState.scissorEnable)
737 {
738 // The scissor rect right/bottom edges are exclusive; the core expects inclusive scissor dimensions, so one pixel is subtracted from the right/bottom edges when converting to fixed point below.
739 left = pState->scissorRects[0].left;
740 right = pState->scissorRects[0].right;
741 top = pState->scissorRects[0].top;
742 bottom = pState->scissorRects[0].bottom;
743 }
744 else
745 {
746 // The viewport width and height must be added to the un-rounded origin, then the result rounded toward -inf.
747 // The cast to int performs that rounding, assuming all of [left, right, top, bottom] are positive.
748 left = (int32_t)pState->vp[0].x;
749 right = (int32_t)(pState->vp[0].x + pState->vp[0].width);
750 top = (int32_t)pState->vp[0].y;
751 bottom = (int32_t)(pState->vp[0].y + pState->vp[0].height);
752 }
753
754 right = std::min<uint32_t>(right, KNOB_MAX_SCISSOR_X);
755 bottom = std::min<uint32_t>(bottom, KNOB_MAX_SCISSOR_Y);
756
757 if (left > KNOB_MAX_SCISSOR_X || top > KNOB_MAX_SCISSOR_Y)
758 {
759 pState->scissorInFixedPoint.left = 0;
760 pState->scissorInFixedPoint.right = 0;
761 pState->scissorInFixedPoint.top = 0;
762 pState->scissorInFixedPoint.bottom = 0;
763 }
764 else
765 {
766 pState->scissorInFixedPoint.left = left * FIXED_POINT_SCALE;
767 pState->scissorInFixedPoint.right = right * FIXED_POINT_SCALE - 1;
768 pState->scissorInFixedPoint.top = top * FIXED_POINT_SCALE;
769 pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1;
770 }
771 }
772
773 // templated backend function tables
774 extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
775 extern PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT][2][2];
776 extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_MSAA_SAMPLE_PATTERN_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2][2];
777 extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2];
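//////////////////////////////////////////////////////////////////////////
/// @brief Selects the backend, binner and clipper functions for this draw from
///        the tables above based on PS / raster / depth-stencil / SO state, and
///        computes the hottile enables, FE attribute count and depth quantization.
/// @param pDC - Draw context being set up.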
778 void SetupPipeline(DRAW_CONTEXT *pDC)
779 {
780 DRAW_STATE* pState = pDC->pState;
781 const SWR_RASTSTATE &rastState = pState->state.rastState;
782 const SWR_PS_STATE &psState = pState->state.psState;
783 BACKEND_FUNCS& backendFuncs = pState->backendFuncs;
784 const uint32_t forcedSampleCount = (rastState.forcedSampleCount) ? 1 : 0;
785
786 // setup backend
787 if (psState.pfnPixelShader == nullptr)
788 {
789 backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount];
790 }
791 else
792 {
793 const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.forcedSampleCount) ? 1 : 0;
794 const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
795 const uint32_t canEarlyZ = (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesSourceDepth && !psState.usesUAV)) ? 1 : 0;
796
797 SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask;
798
799 // select backend function
800 switch(psState.shadingRate)
801 {
802 case SWR_SHADING_RATE_PIXEL:
803 if(bMultisampleEnable)
804 {
805 // always need to generate I & J per sample for Z interpolation
806 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
807 backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount][canEarlyZ];
808 }
809 else
810 {
811 // always need to generate I & J per pixel for Z interpolation
812 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK);
813 backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid][canEarlyZ];
814 }
815 break;
816 case SWR_SHADING_RATE_SAMPLE:
817 SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN);
818 // always need to generate I & J per sample for Z interpolation
819 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
820 backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid][canEarlyZ];
821 break;
822 default:
823 SWR_ASSERT(0 && "Invalid shading rate");
824 break;
825 }
826 }
827
828 PFN_PROCESS_PRIMS pfnBinner;
829 switch (pState->state.topology)
830 {
831 case TOP_POINT_LIST:
832 pState->pfnProcessPrims = ClipPoints;
833 pfnBinner = BinPoints;
834 break;
835 case TOP_LINE_LIST:
836 case TOP_LINE_STRIP:
837 case TOP_LINE_LOOP:
838 case TOP_LINE_LIST_ADJ:
839 case TOP_LISTSTRIP_ADJ:
840 pState->pfnProcessPrims = ClipLines;
841 pfnBinner = BinLines;
842 break;
843 default:
844 pState->pfnProcessPrims = ClipTriangles;
845 pfnBinner = GetBinTrianglesFunc((rastState.conservativeRast > 0));
846 break;
847 };
848
849 // disable clipper if viewport transform is disabled
850 if (pState->state.frontendState.vpTransformDisable)
851 {
852 pState->pfnProcessPrims = pfnBinner;
853 }
854
855 if ((pState->state.psState.pfnPixelShader == nullptr) &&
856 (pState->state.depthStencilState.depthTestEnable == FALSE) &&
857 (pState->state.depthStencilState.depthWriteEnable == FALSE) &&
858 (pState->state.depthStencilState.stencilTestEnable == FALSE) &&
859 (pState->state.depthStencilState.stencilWriteEnable == FALSE) &&
860 (pState->state.backendState.numAttributes == 0))
861 {
862 pState->pfnProcessPrims = nullptr;
863 }
864
865 if (pState->state.soState.rasterizerDisable == true)
866 {
867 pState->pfnProcessPrims = nullptr;
868 }
869
870 // set up the frontend attribute count
871 pState->state.feNumAttributes = 0;
872 const SWR_BACKEND_STATE& backendState = pState->state.backendState;
873 if (backendState.swizzleEnable)
874 {
875 // attribute swizzling is enabled, iterate over the map and record the max attribute used
876 for (uint32_t i = 0; i < backendState.numAttributes; ++i)
877 {
878 pState->state.feNumAttributes = std::max(pState->state.feNumAttributes, (uint32_t)backendState.swizzleMap[i].sourceAttrib + 1);
879 }
880 }
881 else
882 {
883 pState->state.feNumAttributes = pState->state.backendState.numAttributes;
884 }
885
886 if (pState->state.soState.soEnable)
887 {
888 uint32_t streamMasks = 0;
889 for (uint32_t i = 0; i < 4; ++i)
890 {
891 streamMasks |= pState->state.soState.streamMasks[i];
892 }
893
894 DWORD maxAttrib;
895 if (_BitScanReverse(&maxAttrib, streamMasks))
896 {
897 pState->state.feNumAttributes = std::max(pState->state.feNumAttributes, (uint32_t)(maxAttrib + 1));
898 }
899 }
900
901 // Determine the cases where we don't need backing hottile memory for a draw.
902 // We have to check for the special case where the depth/stencil test is enabled but the depth write is disabled.
903 pState->state.depthHottileEnable = ((!(pState->state.depthStencilState.depthTestEnable &&
904 !pState->state.depthStencilState.depthWriteEnable &&
905 pState->state.depthStencilState.depthTestFunc == ZFUNC_ALWAYS)) &&
906 (pState->state.depthStencilState.depthTestEnable ||
907 pState->state.depthStencilState.depthWriteEnable)) ? true : false;
908
909 pState->state.stencilHottileEnable = (((!(pState->state.depthStencilState.stencilTestEnable &&
910 !pState->state.depthStencilState.stencilWriteEnable &&
911 pState->state.depthStencilState.stencilTestFunc == ZFUNC_ALWAYS)) ||
912 // for stencil we have to check the double sided state as well
913 (!(pState->state.depthStencilState.doubleSidedStencilTestEnable &&
914 !pState->state.depthStencilState.stencilWriteEnable &&
915 pState->state.depthStencilState.backfaceStencilTestFunc == ZFUNC_ALWAYS))) &&
916 (pState->state.depthStencilState.stencilTestEnable ||
917 pState->state.depthStencilState.stencilWriteEnable)) ? true : false;
918
919 uint32_t numRTs = pState->state.psState.numRenderTargets;
920 pState->state.colorHottileEnable = 0;
921 if (psState.pfnPixelShader != nullptr)
922 {
923 for (uint32_t rt = 0; rt < numRTs; ++rt)
924 {
925 pState->state.colorHottileEnable |=
926 (!pState->state.blendState.renderTarget[rt].writeDisableAlpha ||
927 !pState->state.blendState.renderTarget[rt].writeDisableRed ||
928 !pState->state.blendState.renderTarget[rt].writeDisableGreen ||
929 !pState->state.blendState.renderTarget[rt].writeDisableBlue) ? (1 << rt) : 0;
930 }
931 }
932
933 // Setup depth quantization function
934 if (pState->state.depthHottileEnable)
935 {
936 switch (pState->state.rastState.depthFormat)
937 {
938 case R32_FLOAT_X8X24_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT_X8X24_TYPELESS>; break;
939 case R32_FLOAT: pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT>; break;
940 case R24_UNORM_X8_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth<R24_UNORM_X8_TYPELESS>; break;
941 case R16_UNORM: pState->state.pfnQuantizeDepth = QuantizeDepth<R16_UNORM>; break;
942 default: SWR_ASSERT(false, "Unsupported depth format for depth quantization.");
943 pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT>;
944 }
945 }
946 else
947 {
948 // set up pass-through quantize if depth isn't enabled
949 pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT>;
950 }
951 }
952
953 //////////////////////////////////////////////////////////////////////////
954 /// @brief InitDraw
955 /// @param pDC - Draw context to initialize for this draw.
956 void InitDraw(
957 DRAW_CONTEXT *pDC,
958 bool isSplitDraw)
959 {
960 // We don't need to set up the scissors/pipeline state again for a split draw.
961 if (isSplitDraw == false)
962 {
963 SetupMacroTileScissors(pDC);
964 SetupPipeline(pDC);
965 }
966 }
967
968 //////////////////////////////////////////////////////////////////////////
969 /// @brief Computes the maximum vertices per draw; we split draws for certain topologies for better performance.
970 /// @param totalVerts - Total vertices for draw
971 /// @param topology - Topology used for draw
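/// @return Maximum number of vertices to process in a single (split) draw.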
972 uint32_t MaxVertsPerDraw(
973 DRAW_CONTEXT* pDC,
974 uint32_t totalVerts,
975 PRIMITIVE_TOPOLOGY topology)
976 {
977 API_STATE& state = pDC->pState->state;
978
979 uint32_t vertsPerDraw = totalVerts;
980
981 if (state.soState.soEnable)
982 {
983 return totalVerts;
984 }
985
986 switch (topology)
987 {
988 case TOP_POINT_LIST:
989 case TOP_TRIANGLE_LIST:
990 vertsPerDraw = KNOB_MAX_PRIMS_PER_DRAW;
991 break;
992
993 case TOP_PATCHLIST_1:
994 case TOP_PATCHLIST_2:
995 case TOP_PATCHLIST_3:
996 case TOP_PATCHLIST_4:
997 case TOP_PATCHLIST_5:
998 case TOP_PATCHLIST_6:
999 case TOP_PATCHLIST_7:
1000 case TOP_PATCHLIST_8:
1001 case TOP_PATCHLIST_9:
1002 case TOP_PATCHLIST_10:
1003 case TOP_PATCHLIST_11:
1004 case TOP_PATCHLIST_12:
1005 case TOP_PATCHLIST_13:
1006 case TOP_PATCHLIST_14:
1007 case TOP_PATCHLIST_15:
1008 case TOP_PATCHLIST_16:
1009 case TOP_PATCHLIST_17:
1010 case TOP_PATCHLIST_18:
1011 case TOP_PATCHLIST_19:
1012 case TOP_PATCHLIST_20:
1013 case TOP_PATCHLIST_21:
1014 case TOP_PATCHLIST_22:
1015 case TOP_PATCHLIST_23:
1016 case TOP_PATCHLIST_24:
1017 case TOP_PATCHLIST_25:
1018 case TOP_PATCHLIST_26:
1019 case TOP_PATCHLIST_27:
1020 case TOP_PATCHLIST_28:
1021 case TOP_PATCHLIST_29:
1022 case TOP_PATCHLIST_30:
1023 case TOP_PATCHLIST_31:
1024 case TOP_PATCHLIST_32:
1025 if (pDC->pState->state.tsState.tsEnable)
1026 {
1027 uint32_t vertsPerPrim = topology - TOP_PATCHLIST_BASE;
1028 vertsPerDraw = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW;
1029 }
1030 break;
1031
1032 // The Primitive Assembly code can only handle 1 RECT at a time.
1033 case TOP_RECT_LIST:
1034 vertsPerDraw = 3;
1035 break;
1036
1037 default:
1038 // We are not splitting up draws for other topologies.
1039 break;
1040 }
1041
1042 return vertsPerDraw;
1043 }
1044
1045
1046 //////////////////////////////////////////////////////////////////////////
1047 /// @brief DrawInstanced
1048 /// @param hContext - Handle passed back from SwrCreateContext
1049 /// @param topology - Specifies topology for draw.
1050 /// @param numVertices - How many vertices to read sequentially from vertex data (per instance).
1051 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1052 /// @param numInstances - How many instances to render.
1053 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1054 void DrawInstanced(
1055 HANDLE hContext,
1056 PRIMITIVE_TOPOLOGY topology,
1057 uint32_t numVertices,
1058 uint32_t startVertex,
1059 uint32_t numInstances = 1,
1060 uint32_t startInstance = 0)
1061 {
1062 if (KNOB_TOSS_DRAW)
1063 {
1064 return;
1065 }
1066
1067 RDTSC_START(APIDraw);
1068
1069 SWR_CONTEXT *pContext = GetContext(hContext);
1070 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1071
1072 uint32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
1073 uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw);
1074 uint32_t remainingVerts = numVertices;
1075
1076 API_STATE *pState = &pDC->pState->state;
1077 pState->topology = topology;
1078 pState->forceFront = false;
1079
1080 // disable culling for points/lines
1081 uint32_t oldCullMode = pState->rastState.cullMode;
1082 if (topology == TOP_POINT_LIST)
1083 {
1084 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1085 pState->forceFront = true;
1086 }
1087
1088 int draw = 0;
1089 while (remainingVerts)
1090 {
1091 uint32_t numVertsForDraw = (remainingVerts < maxVertsPerDraw) ?
1092 remainingVerts : maxVertsPerDraw;
1093
1094 bool isSplitDraw = (draw > 0) ? true : false;
1095 DRAW_CONTEXT* pDC = GetDrawContext(pContext, isSplitDraw);
1096 InitDraw(pDC, isSplitDraw);
1097
1098 pDC->FeWork.type = DRAW;
1099 pDC->FeWork.pfnWork = GetProcessDrawFunc(
1100 false, // IsIndexed
1101 false, // bEnableCutIndex
1102 pState->tsState.tsEnable,
1103 pState->gsState.gsEnable,
1104 pState->soState.soEnable,
1105 pDC->pState->pfnProcessPrims != nullptr);
1106 pDC->FeWork.desc.draw.numVerts = numVertsForDraw;
1107 pDC->FeWork.desc.draw.startVertex = startVertex;
1108 pDC->FeWork.desc.draw.numInstances = numInstances;
1109 pDC->FeWork.desc.draw.startInstance = startInstance;
1110 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
1111 pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw;
1112
1113 pDC->cleanupState = (remainingVerts == numVertsForDraw);
1114
1115 //enqueue DC
1116 QueueDraw(pContext);
1117
1118 remainingVerts -= numVertsForDraw;
1119 draw++;
1120 }
1121
1122 // restore culling state
1123 pDC = GetDrawContext(pContext);
1124 pDC->pState->state.rastState.cullMode = oldCullMode;
1125
1126 RDTSC_STOP(APIDraw, numVertices * numInstances, 0);
1127 }
1128
1129 //////////////////////////////////////////////////////////////////////////
1130 /// @brief SwrDraw
1131 /// @param hContext - Handle passed back from SwrCreateContext
1132 /// @param topology - Specifies topology for draw.
1133 /// @param startVertex - Specifies start vertex in vertex buffer for draw.
1134 /// @param numVertices - Number of vertices to draw.
1135 void SwrDraw(
1136 HANDLE hContext,
1137 PRIMITIVE_TOPOLOGY topology,
1138 uint32_t startVertex,
1139 uint32_t numVertices)
1140 {
1141 DrawInstanced(hContext, topology, numVertices, startVertex);
1142 }
1143
1144 //////////////////////////////////////////////////////////////////////////
1145 /// @brief SwrDrawInstanced
1146 /// @param hContext - Handle passed back from SwrCreateContext
1147 /// @param topology - Specifies topology for draw.
1148 /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
1149 /// @param numInstances - How many instances to render.
1150 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1151 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1152 void SwrDrawInstanced(
1153 HANDLE hContext,
1154 PRIMITIVE_TOPOLOGY topology,
1155 uint32_t numVertsPerInstance,
1156 uint32_t numInstances,
1157 uint32_t startVertex,
1158 uint32_t startInstance
1159 )
1160 {
1161 DrawInstanced(hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance);
1162 }
1163
1164 //////////////////////////////////////////////////////////////////////////
1165 /// @brief DrawIndexedInstance
1166 /// @param hContext - Handle passed back from SwrCreateContext
1167 /// @param topology - Specifies topology for draw.
1168 /// @param numIndices - Number of indices to read sequentially from index buffer.
1169 /// @param indexOffset - Starting index into index buffer.
1170 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1171 /// @param numInstances - Number of instances to render.
1172 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1173 void DrawIndexedInstance(
1174 HANDLE hContext,
1175 PRIMITIVE_TOPOLOGY topology,
1176 uint32_t numIndices,
1177 uint32_t indexOffset,
1178 int32_t baseVertex,
1179 uint32_t numInstances = 1,
1180 uint32_t startInstance = 0)
1181 {
1182 if (KNOB_TOSS_DRAW)
1183 {
1184 return;
1185 }
1186
1187 RDTSC_START(APIDrawIndexed);
1188
1189 SWR_CONTEXT *pContext = GetContext(hContext);
1190 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1191 API_STATE* pState = &pDC->pState->state;
1192
1193 uint32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
1194 uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw);
1195 uint32_t remainingIndices = numIndices;
1196
1197 uint32_t indexSize = 0;
1198 switch (pState->indexBuffer.format)
1199 {
1200 case R32_UINT: indexSize = sizeof(uint32_t); break;
1201 case R16_UINT: indexSize = sizeof(uint16_t); break;
1202 case R8_UINT: indexSize = sizeof(uint8_t); break;
1203 default:
1204 SWR_ASSERT(0);
1205 }
1206
1207 int draw = 0;
1208 uint8_t *pIB = (uint8_t*)pState->indexBuffer.pIndices;
1209 pIB += (uint64_t)indexOffset * (uint64_t)indexSize;
1210
1211 pState->topology = topology;
1212 pState->forceFront = false;
1213
1214 // disable culling for points/lines
1215 uint32_t oldCullMode = pState->rastState.cullMode;
1216 if (topology == TOP_POINT_LIST)
1217 {
1218 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1219 pState->forceFront = true;
1220 }
1221
1222 while (remainingIndices)
1223 {
1224 uint32_t numIndicesForDraw = (remainingIndices < maxIndicesPerDraw) ?
1225 remainingIndices : maxIndicesPerDraw;
1226
1227 // When breaking up a draw, we need to obtain a new draw context for each iteration.
1228 bool isSplitDraw = (draw > 0) ? true : false;
1229 pDC = GetDrawContext(pContext, isSplitDraw);
1230 InitDraw(pDC, isSplitDraw);
1231
1232 pDC->FeWork.type = DRAW;
1233 pDC->FeWork.pfnWork = GetProcessDrawFunc(
1234 true, // IsIndexed
1235 pState->frontendState.bEnableCutIndex,
1236 pState->tsState.tsEnable,
1237 pState->gsState.gsEnable,
1238 pState->soState.soEnable,
1239 pDC->pState->pfnProcessPrims != nullptr);
1240 pDC->FeWork.desc.draw.pDC = pDC;
1241 pDC->FeWork.desc.draw.numIndices = numIndicesForDraw;
1242 pDC->FeWork.desc.draw.pIB = (int*)pIB;
1243 pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format;
1244
1245 pDC->FeWork.desc.draw.numInstances = numInstances;
1246 pDC->FeWork.desc.draw.startInstance = startInstance;
1247 pDC->FeWork.desc.draw.baseVertex = baseVertex;
1248 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
1249
1250 pDC->cleanupState = (remainingIndices == numIndicesForDraw);
1251
1252 //enqueue DC
1253 QueueDraw(pContext);
1254
1255 pIB += maxIndicesPerDraw * indexSize;
1256 remainingIndices -= numIndicesForDraw;
1257 draw++;
1258 }
1259
1260 // restore culling state
1261 pDC = GetDrawContext(pContext);
1262 pDC->pState->state.rastState.cullMode = oldCullMode;
1263
1264 RDTSC_STOP(APIDrawIndexed, numIndices * numInstances, 0);
1265 }
1266
1267
1268 //////////////////////////////////////////////////////////////////////////
1269 /// @brief DrawIndexed
1270 /// @param hContext - Handle passed back from SwrCreateContext
1271 /// @param topology - Specifies topology for draw.
1272 /// @param numIndices - Number of indices to read sequentially from index buffer.
1273 /// @param indexOffset - Starting index into index buffer.
1274 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1275 void SwrDrawIndexed(
1276 HANDLE hContext,
1277 PRIMITIVE_TOPOLOGY topology,
1278 uint32_t numIndices,
1279 uint32_t indexOffset,
1280 int32_t baseVertex
1281 )
1282 {
1283 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex);
1284 }
1285
1286 //////////////////////////////////////////////////////////////////////////
1287 /// @brief SwrDrawIndexedInstanced
1288 /// @param hContext - Handle passed back from SwrCreateContext
1289 /// @param topology - Specifies topology for draw.
1290 /// @param numIndices - Number of indices to read sequentially from index buffer.
1291 /// @param numInstances - Number of instances to render.
1292 /// @param indexOffset - Starting index into index buffer.
1293 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1294 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1295 void SwrDrawIndexedInstanced(
1296 HANDLE hContext,
1297 PRIMITIVE_TOPOLOGY topology,
1298 uint32_t numIndices,
1299 uint32_t numInstances,
1300 uint32_t indexOffset,
1301 int32_t baseVertex,
1302 uint32_t startInstance)
1303 {
1304 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance);
1305 }
1306
1307 //////////////////////////////////////////////////////////////////////////
1308 /// @brief SwrInvalidateTiles
1309 /// @param hContext - Handle passed back from SwrCreateContext
1310 /// @param attachmentMask - Mask specifying which hottile attachments to invalidate.
1311 void SwrInvalidateTiles(
1312 HANDLE hContext,
1313 uint32_t attachmentMask)
1314 {
1315 if (KNOB_TOSS_DRAW)
1316 {
1317 return;
1318 }
1319
1320 SWR_CONTEXT *pContext = GetContext(hContext);
1321 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1322
1323 pDC->FeWork.type = DISCARDINVALIDATETILES;
1324 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
1325 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
1326 memset(&pDC->FeWork.desc.discardInvalidateTiles.rect, 0, sizeof(SWR_RECT));
1327 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_INVALID;
1328 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false;
1329 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = false;
1330
1331 //enqueue
1332 QueueDraw(pContext);
1333 }
1334
1335 //////////////////////////////////////////////////////////////////////////
1336 /// @brief SwrDiscardRect
1337 /// @param hContext - Handle passed back from SwrCreateContext
1338 /// @param attachmentMask - Mask specifying which hottile attachments to discard.
1339 /// @param rect - if rect is all zeros, the entire attachment surface will be discarded
1340 void SwrDiscardRect(
1341 HANDLE hContext,
1342 uint32_t attachmentMask,
1343 SWR_RECT rect)
1344 {
1345 if (KNOB_TOSS_DRAW)
1346 {
1347 return;
1348 }
1349
1350 SWR_CONTEXT *pContext = GetContext(hContext);
1351 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1352
1353 // Queue a discard of the hot tiles covered by the rect
1354 pDC->FeWork.type = DISCARDINVALIDATETILES;
1355 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
1356 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
1357 pDC->FeWork.desc.discardInvalidateTiles.rect = rect;
1358 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_RESOLVED;
1359 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true;
1360 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = true;
1361
1362 //enqueue
1363 QueueDraw(pContext);
1364 }
1365
1366 //////////////////////////////////////////////////////////////////////////
1367 /// @brief SwrDispatch
1368 /// @param hContext - Handle passed back from SwrCreateContext
1369 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
1370 /// @param threadGroupCountY - Number of thread groups dispatched in Y direction
1371 /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
1372 void SwrDispatch(
1373 HANDLE hContext,
1374 uint32_t threadGroupCountX,
1375 uint32_t threadGroupCountY,
1376 uint32_t threadGroupCountZ)
1377 {
1378 if (KNOB_TOSS_DRAW)
1379 {
1380 return;
1381 }
1382
1383 RDTSC_START(APIDispatch);
1384 SWR_CONTEXT *pContext = GetContext(hContext);
1385 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1386
1387 pDC->isCompute = true; // This is a compute context.
1388
1389 COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64);
1390
1391 pTaskData->threadGroupCountX = threadGroupCountX;
1392 pTaskData->threadGroupCountY = threadGroupCountY;
1393 pTaskData->threadGroupCountZ = threadGroupCountZ;
1394
1395 uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
1396 uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
1397 pDC->pDispatch = &pContext->pDispatchQueueArray[dcIndex];
1398 pDC->pDispatch->initialize(totalThreadGroups, pTaskData);
1399
1400 QueueDispatch(pContext);
1401 RDTSC_STOP(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ, 0);
1402 }
1403
1404 // Deswizzles, converts and stores the current contents of the hot tiles back to the
1405 // surface bound to the given render target attachment.
1406 void SwrStoreTiles(
1407 HANDLE hContext,
1408 SWR_RENDERTARGET_ATTACHMENT attachment,
1409 SWR_TILE_STATE postStoreTileState)
1410 {
1411 if (KNOB_TOSS_DRAW)
1412 {
1413 return;
1414 }
1415
1416 RDTSC_START(APIStoreTiles);
1417
1418 SWR_CONTEXT *pContext = GetContext(hContext);
1419 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1420
1421 SetupMacroTileScissors(pDC);
1422
1423 pDC->FeWork.type = STORETILES;
1424 pDC->FeWork.pfnWork = ProcessStoreTiles;
1425 pDC->FeWork.desc.storeTiles.attachment = attachment;
1426 pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState;
1427
1428 //enqueue
1429 QueueDraw(pContext);
1430
1431 RDTSC_STOP(APIStoreTiles, 0, 0);
1432 }
1433
1434 void SwrClearRenderTarget(
1435 HANDLE hContext,
1436 uint32_t clearMask,
1437 const float clearColor[4],
1438 float z,
1439 uint8_t stencil)
1440 {
1441 if (KNOB_TOSS_DRAW)
1442 {
1443 return;
1444 }
1445
1446 RDTSC_START(APIClearRenderTarget);
1447
1448 SWR_CONTEXT *pContext = GetContext(hContext);
1449
1450 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1451
1452 SetupMacroTileScissors(pDC);
1453
1454 CLEAR_FLAGS flags;
1455 flags.mask = clearMask;
1456
1457 pDC->FeWork.type = CLEAR;
1458 pDC->FeWork.pfnWork = ProcessClear;
1459 pDC->FeWork.desc.clear.flags = flags;
1460 pDC->FeWork.desc.clear.clearDepth = z;
1461 pDC->FeWork.desc.clear.clearRTColor[0] = clearColor[0];
1462 pDC->FeWork.desc.clear.clearRTColor[1] = clearColor[1];
1463 pDC->FeWork.desc.clear.clearRTColor[2] = clearColor[2];
1464 pDC->FeWork.desc.clear.clearRTColor[3] = clearColor[3];
1465 pDC->FeWork.desc.clear.clearStencil = stencil;
1466
1467 // enqueue draw
1468 QueueDraw(pContext);
1469
1470 RDTSC_STOP(APIClearRenderTarget, 0, pDC->drawId);
1471 }
1472
1473 //////////////////////////////////////////////////////////////////////////
1474 /// @brief Returns a pointer to the private context state for the current
1475 /// draw operation. This is used for external components such as the
1476 /// sampler.
1477 /// SWR is responsible for the allocation of the private context state.
1478 /// @param hContext - Handle passed back from SwrCreateContext
1479 VOID* SwrGetPrivateContextState(
1480 HANDLE hContext)
1481 {
1482 SWR_CONTEXT* pContext = GetContext(hContext);
1483 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1484 DRAW_STATE* pState = pDC->pState;
1485
1486 if (pState->pPrivateState == nullptr)
1487 {
1488 pState->pPrivateState = pState->pArena->AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float));
1489 }
1490
1491 return pState->pPrivateState;
1492 }
1493
1494 //////////////////////////////////////////////////////////////////////////
1495 /// @brief Clients can use this to allocate memory for draw/dispatch
1496 /// operations. The memory will automatically be freed once the operation
1497 /// has completed. Clients can use this to allocate binding tables,
1498 /// etc., needed for shader execution.
1499 /// @param hContext - Handle passed back from SwrCreateContext
1500 /// @param size - Size of allocation
1501 /// @param align - Alignment needed for allocation.
1502 VOID* SwrAllocDrawContextMemory(
1503 HANDLE hContext,
1504 uint32_t size,
1505 uint32_t align)
1506 {
1507 SWR_CONTEXT* pContext = GetContext(hContext);
1508 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1509
1510 return pDC->pState->pArena->AllocAligned(size, align);
1511 }
1512
1513 //////////////////////////////////////////////////////////////////////////
1514 /// @brief Enables stats counting
1515 /// @param hContext - Handle passed back from SwrCreateContext
1516 /// @param enable - If true then counts are incremented.
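/// @note Stats gathered while enabled are reported back through the pfnUpdateStats /
///       pfnUpdateStatsFE callbacks registered in SwrCreateContext.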
1517 void SwrEnableStats(
1518 HANDLE hContext,
1519 bool enable)
1520 {
1521 SWR_CONTEXT *pContext = GetContext(hContext);
1522 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1523
1524 pDC->pState->state.enableStats = enable;
1525 }
1526
1527 //////////////////////////////////////////////////////////////////////////
1528 /// @brief Mark end of frame - used for performance profiling
1529 /// @param hContext - Handle passed back from SwrCreateContext
1530 void SWR_API SwrEndFrame(
1531 HANDLE hContext)
1532 {
1533 RDTSC_ENDFRAME();
1534 SWR_CONTEXT *pContext = GetContext(hContext);
1535 pContext->frameCount++;
1536 }