swr: [rasterizer core] Quantize depth to depth buffer precision prior to depth test...
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / api.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file api.cpp
24 *
25 * @brief API implementation
26 *
27 ******************************************************************************/
28
29 #include <cfloat>
30 #include <cmath>
31 #include <cstdio>
32
33 #include "core/api.h"
34 #include "core/backend.h"
35 #include "core/context.h"
36 #include "core/depthstencil.h"
37 #include "core/frontend.h"
38 #include "core/rasterizer.h"
39 #include "core/rdtsc_core.h"
40 #include "core/threads.h"
41 #include "core/tilemgr.h"
42 #include "core/clip.h"
43
44 #include "common/simdintrin.h"
45 #include "common/os.h"
46
47 void SetupDefaultState(SWR_CONTEXT *pContext);
48
49 //////////////////////////////////////////////////////////////////////////
50 /// @brief Create SWR Context.
51 /// @param pCreateInfo - pointer to creation info.
52 HANDLE SwrCreateContext(
53 SWR_CREATECONTEXT_INFO* pCreateInfo)
54 {
55 RDTSC_RESET();
56 RDTSC_INIT(0);
57
58 void* pContextMem = _aligned_malloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4);
59 memset(pContextMem, 0, sizeof(SWR_CONTEXT));
60 SWR_CONTEXT *pContext = new (pContextMem) SWR_CONTEXT();
61
62 pContext->driverType = pCreateInfo->driver;
63 pContext->privateStateSize = pCreateInfo->privateStateSize;
64
65 pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
66 pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
67
68 for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
69 {
70 pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
71 pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena));
72 pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen.
73
74 pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
75 }
76
77 if (!KNOB_SINGLE_THREADED)
78 {
79 memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
80 memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
81 new (&pContext->WaitLock) std::mutex();
82 new (&pContext->FifosNotEmpty) std::condition_variable();
83
84 CreateThreadPool(pContext, &pContext->threadPool);
85 }
86
87 // Calling createThreadPool() above can set SINGLE_THREADED
88 if (KNOB_SINGLE_THREADED)
89 {
90 pContext->NumWorkerThreads = 1;
91 }
92
93 // Allocate scratch space for workers.
94 ///@note We could lazily allocate this but its rather small amount of memory.
95 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
96 {
97 #if defined(_WIN32)
98 uint32_t numaNode = pContext->threadPool.pThreadData ?
99 pContext->threadPool.pThreadData[i].numaId : 0;
100 pContext->pScratch[i] = (uint8_t*)VirtualAllocExNuma(
101 GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE),
102 MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE,
103 numaNode);
104 #else
105 pContext->pScratch[i] = (uint8_t*)_aligned_malloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4);
106 #endif
107 }
108
109 // State setup AFTER context is fully initialized
110 SetupDefaultState(pContext);
111
112 // initialize hot tile manager
113 pContext->pHotTileMgr = new HotTileMgr();
114
115 // initialize function pointer tables
116 InitClearTilesTable();
117
118 // initialize store tiles function
119 pContext->pfnLoadTile = pCreateInfo->pfnLoadTile;
120 pContext->pfnStoreTile = pCreateInfo->pfnStoreTile;
121 pContext->pfnClearTile = pCreateInfo->pfnClearTile;
122
123 // pass pointer to bucket manager back to caller
124 #ifdef KNOB_ENABLE_RDTSC
125 pCreateInfo->pBucketMgr = &gBucketMgr;
126 #endif
127
128 pCreateInfo->contextSaveSize = sizeof(API_STATE);
129
130 return (HANDLE)pContext;
131 }
132
133 void SwrDestroyContext(HANDLE hContext)
134 {
135 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
136 DestroyThreadPool(pContext, &pContext->threadPool);
137
138 // free the fifos
139 for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i)
140 {
141 delete pContext->dcRing[i].pArena;
142 delete pContext->dsRing[i].pArena;
143 delete(pContext->dcRing[i].pTileMgr);
144 delete(pContext->dcRing[i].pDispatch);
145 }
146
147 // Free scratch space.
148 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
149 {
150 #if defined(_WIN32)
151 VirtualFree(pContext->pScratch[i], 0, MEM_RELEASE);
152 #else
153 _aligned_free(pContext->pScratch[i]);
154 #endif
155 }
156
157 delete(pContext->pHotTileMgr);
158
159 pContext->~SWR_CONTEXT();
160 _aligned_free((SWR_CONTEXT*)hContext);
161 }
162
163 void CopyState(DRAW_STATE& dst, const DRAW_STATE& src)
164 {
165 memcpy(&dst.state, &src.state, sizeof(API_STATE));
166 }
167
168 void WakeAllThreads(SWR_CONTEXT *pContext)
169 {
170 pContext->FifosNotEmpty.notify_all();
171 }
172
173 template<bool IsDraw>
174 void QueueWork(SWR_CONTEXT *pContext)
175 {
176 // Each worker thread looks at a DC for both FE and BE work at different times and so we
177 // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers
178 // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
179 // then moved on if all work is done.)
180 pContext->pCurDrawContext->threadsDone =
181 pContext->NumWorkerThreads ? pContext->NumWorkerThreads * 2 : 2;
182
183 _ReadWriteBarrier();
184 {
185 std::unique_lock<std::mutex> lock(pContext->WaitLock);
186 pContext->dcRing.Enqueue();
187 }
188
189 if (KNOB_SINGLE_THREADED)
190 {
191 // flush denormals to 0
192 uint32_t mxcsr = _mm_getcsr();
193 _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
194
195 if (IsDraw)
196 {
197 static TileSet lockedTiles;
198 uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
199 WorkOnFifoFE(pContext, 0, curDraw[0], 0);
200 WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles, 0, 0);
201 }
202 else
203 {
204 uint64_t curDispatch = pContext->pCurDrawContext->drawId;
205 WorkOnCompute(pContext, 0, curDispatch);
206 }
207
208 // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
209 while (CompleteDrawContext(pContext, pContext->pCurDrawContext) > 0) {}
210
211 // restore csr
212 _mm_setcsr(mxcsr);
213 }
214 else
215 {
216 RDTSC_START(APIDrawWakeAllThreads);
217 WakeAllThreads(pContext);
218 RDTSC_STOP(APIDrawWakeAllThreads, 1, 0);
219 }
220
221 // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
222 pContext->pPrevDrawContext = pContext->pCurDrawContext;
223 pContext->pCurDrawContext = nullptr;
224 }
225
226 INLINE void QueueDraw(SWR_CONTEXT* pContext)
227 {
228 QueueWork<true>(pContext);
229 }
230
231 INLINE void QueueDispatch(SWR_CONTEXT* pContext)
232 {
233 QueueWork<false>(pContext);
234 }
235
236 DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
237 {
238 RDTSC_START(APIGetDrawContext);
239 // If current draw context is null then need to obtain a new draw context to use from ring.
240 if (pContext->pCurDrawContext == nullptr)
241 {
242 // Need to wait for a free entry.
243 while (pContext->dcRing.IsFull())
244 {
245 _mm_pause();
246 }
247
248 uint32_t dcIndex = pContext->dcRing.GetHead() % KNOB_MAX_DRAWS_IN_FLIGHT;
249
250 DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
251 pContext->pCurDrawContext = pCurDrawContext;
252
253 // Assign next available entry in DS ring to this DC.
254 uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
255 pCurDrawContext->pState = &pContext->dsRing[dsIndex];
256
257 // Copy previous state to current state.
258 if (pContext->pPrevDrawContext)
259 {
260 DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext;
261
262 // If we're splitting our draw then we can just use the same state from the previous
263 // draw. In this case, we won't increment the DS ring index so the next non-split
264 // draw can receive the state.
265 if (isSplitDraw == false)
266 {
267 CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState);
268
269 // Should have been cleaned up previously
270 SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
271
272 pCurDrawContext->pState->pPrivateState = nullptr;
273
274 pContext->curStateId++; // Progress state ring index forward.
275 }
276 else
277 {
278 // If its a split draw then just copy the state pointer over
279 // since its the same draw.
280 pCurDrawContext->pState = pPrevDrawContext->pState;
281 SWR_ASSERT(pPrevDrawContext->cleanupState == false);
282 }
283 }
284 else
285 {
286 SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
287 pContext->curStateId++; // Progress state ring index forward.
288 }
289
290 SWR_ASSERT(pCurDrawContext->pArena->IsEmpty() == true);
291
292 pCurDrawContext->dependency = 0;
293 pCurDrawContext->pContext = pContext;
294 pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
295
296 pCurDrawContext->doneFE = false;
297 pCurDrawContext->FeLock = 0;
298 pCurDrawContext->threadsDone = 0;
299
300 pCurDrawContext->pTileMgr->initialize();
301
302 // Assign unique drawId for this DC
303 pCurDrawContext->drawId = pContext->dcRing.GetHead();
304
305 pCurDrawContext->cleanupState = true;
306 }
307 else
308 {
309 SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC");
310 }
311
312 RDTSC_STOP(APIGetDrawContext, 0, 0);
313 return pContext->pCurDrawContext;
314 }
315
316 API_STATE* GetDrawState(SWR_CONTEXT *pContext)
317 {
318 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
319 SWR_ASSERT(pDC->pState != nullptr);
320
321 return &pDC->pState->state;
322 }
323
324 void SWR_API SwrSaveState(
325 HANDLE hContext,
326 void* pOutputStateBlock,
327 size_t memSize)
328 {
329 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
330 auto pSrc = GetDrawState(pContext);
331 SWR_ASSERT(pOutputStateBlock && memSize >= sizeof(*pSrc));
332
333 memcpy(pOutputStateBlock, pSrc, sizeof(*pSrc));
334 }
335
336 void SWR_API SwrRestoreState(
337 HANDLE hContext,
338 const void* pStateBlock,
339 size_t memSize)
340 {
341 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
342 auto pDst = GetDrawState(pContext);
343 SWR_ASSERT(pStateBlock && memSize >= sizeof(*pDst));
344
345 memcpy(pDst, pStateBlock, sizeof(*pDst));
346 }
347
348 void SetupDefaultState(SWR_CONTEXT *pContext)
349 {
350 API_STATE* pState = GetDrawState(pContext);
351
352 pState->rastState.cullMode = SWR_CULLMODE_NONE;
353 pState->rastState.frontWinding = SWR_FRONTWINDING_CCW;
354 }
355
356 static INLINE SWR_CONTEXT* GetContext(HANDLE hContext)
357 {
358 return (SWR_CONTEXT*)hContext;
359 }
360
361 void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint64_t userData2, uint64_t userData3)
362 {
363 RDTSC_START(APISync);
364
365 SWR_ASSERT(pfnFunc != nullptr);
366
367 SWR_CONTEXT *pContext = GetContext(hContext);
368 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
369
370 pDC->FeWork.type = SYNC;
371 pDC->FeWork.pfnWork = ProcessSync;
372 pDC->FeWork.desc.sync.pfnCallbackFunc = pfnFunc;
373 pDC->FeWork.desc.sync.userData = userData;
374 pDC->FeWork.desc.sync.userData2 = userData2;
375 pDC->FeWork.desc.sync.userData3 = userData3;
376
377 // cannot execute until all previous draws have completed
378 pDC->dependency = pDC->drawId - 1;
379
380 //enqueue
381 QueueDraw(pContext);
382
383 RDTSC_STOP(APISync, 1, 0);
384 }
385
386 void SwrWaitForIdle(HANDLE hContext)
387 {
388 SWR_CONTEXT *pContext = GetContext(hContext);
389
390 RDTSC_START(APIWaitForIdle);
391
392 while (!pContext->dcRing.IsEmpty())
393 {
394 _mm_pause();
395 }
396
397 RDTSC_STOP(APIWaitForIdle, 1, 0);
398 }
399
400 void SwrSetVertexBuffers(
401 HANDLE hContext,
402 uint32_t numBuffers,
403 const SWR_VERTEX_BUFFER_STATE* pVertexBuffers)
404 {
405 API_STATE* pState = GetDrawState(GetContext(hContext));
406
407 for (uint32_t i = 0; i < numBuffers; ++i)
408 {
409 const SWR_VERTEX_BUFFER_STATE *pVB = &pVertexBuffers[i];
410 pState->vertexBuffers[pVB->index] = *pVB;
411 }
412 }
413
414 void SwrSetIndexBuffer(
415 HANDLE hContext,
416 const SWR_INDEX_BUFFER_STATE* pIndexBuffer)
417 {
418 API_STATE* pState = GetDrawState(GetContext(hContext));
419
420 pState->indexBuffer = *pIndexBuffer;
421 }
422
423 void SwrSetFetchFunc(
424 HANDLE hContext,
425 PFN_FETCH_FUNC pfnFetchFunc)
426 {
427 API_STATE* pState = GetDrawState(GetContext(hContext));
428
429 pState->pfnFetchFunc = pfnFetchFunc;
430 }
431
432 void SwrSetSoFunc(
433 HANDLE hContext,
434 PFN_SO_FUNC pfnSoFunc,
435 uint32_t streamIndex)
436 {
437 API_STATE* pState = GetDrawState(GetContext(hContext));
438
439 SWR_ASSERT(streamIndex < MAX_SO_STREAMS);
440
441 pState->pfnSoFunc[streamIndex] = pfnSoFunc;
442 }
443
444 void SwrSetSoState(
445 HANDLE hContext,
446 SWR_STREAMOUT_STATE* pSoState)
447 {
448 API_STATE* pState = GetDrawState(GetContext(hContext));
449
450 pState->soState = *pSoState;
451 }
452
453 void SwrSetSoBuffers(
454 HANDLE hContext,
455 SWR_STREAMOUT_BUFFER* pSoBuffer,
456 uint32_t slot)
457 {
458 API_STATE* pState = GetDrawState(GetContext(hContext));
459
460 SWR_ASSERT((slot < 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot);
461
462 pState->soBuffer[slot] = *pSoBuffer;
463 }
464
465 void SwrSetVertexFunc(
466 HANDLE hContext,
467 PFN_VERTEX_FUNC pfnVertexFunc)
468 {
469 API_STATE* pState = GetDrawState(GetContext(hContext));
470
471 pState->pfnVertexFunc = pfnVertexFunc;
472 }
473
474 void SwrSetFrontendState(
475 HANDLE hContext,
476 SWR_FRONTEND_STATE *pFEState)
477 {
478 API_STATE* pState = GetDrawState(GetContext(hContext));
479 pState->frontendState = *pFEState;
480 }
481
482 void SwrSetGsState(
483 HANDLE hContext,
484 SWR_GS_STATE *pGSState)
485 {
486 API_STATE* pState = GetDrawState(GetContext(hContext));
487 pState->gsState = *pGSState;
488 }
489
490 void SwrSetGsFunc(
491 HANDLE hContext,
492 PFN_GS_FUNC pfnGsFunc)
493 {
494 API_STATE* pState = GetDrawState(GetContext(hContext));
495 pState->pfnGsFunc = pfnGsFunc;
496 }
497
498 void SwrSetCsFunc(
499 HANDLE hContext,
500 PFN_CS_FUNC pfnCsFunc,
501 uint32_t totalThreadsInGroup)
502 {
503 API_STATE* pState = GetDrawState(GetContext(hContext));
504 pState->pfnCsFunc = pfnCsFunc;
505 pState->totalThreadsInGroup = totalThreadsInGroup;
506 }
507
508 void SwrSetTsState(
509 HANDLE hContext,
510 SWR_TS_STATE *pState)
511 {
512 API_STATE* pApiState = GetDrawState(GetContext(hContext));
513 pApiState->tsState = *pState;
514 }
515
516 void SwrSetHsFunc(
517 HANDLE hContext,
518 PFN_HS_FUNC pfnFunc)
519 {
520 API_STATE* pApiState = GetDrawState(GetContext(hContext));
521 pApiState->pfnHsFunc = pfnFunc;
522 }
523
524 void SwrSetDsFunc(
525 HANDLE hContext,
526 PFN_DS_FUNC pfnFunc)
527 {
528 API_STATE* pApiState = GetDrawState(GetContext(hContext));
529 pApiState->pfnDsFunc = pfnFunc;
530 }
531
532 void SwrSetDepthStencilState(
533 HANDLE hContext,
534 SWR_DEPTH_STENCIL_STATE *pDSState)
535 {
536 API_STATE* pState = GetDrawState(GetContext(hContext));
537
538 pState->depthStencilState = *pDSState;
539 }
540
541 void SwrSetBackendState(
542 HANDLE hContext,
543 SWR_BACKEND_STATE *pBEState)
544 {
545 API_STATE* pState = GetDrawState(GetContext(hContext));
546
547 pState->backendState = *pBEState;
548 }
549
550 void SwrSetPixelShaderState(
551 HANDLE hContext,
552 SWR_PS_STATE *pPSState)
553 {
554 API_STATE *pState = GetDrawState(GetContext(hContext));
555 pState->psState = *pPSState;
556 }
557
558 void SwrSetBlendState(
559 HANDLE hContext,
560 SWR_BLEND_STATE *pBlendState)
561 {
562 API_STATE *pState = GetDrawState(GetContext(hContext));
563 memcpy(&pState->blendState, pBlendState, sizeof(SWR_BLEND_STATE));
564 }
565
566 void SwrSetBlendFunc(
567 HANDLE hContext,
568 uint32_t renderTarget,
569 PFN_BLEND_JIT_FUNC pfnBlendFunc)
570 {
571 SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS);
572 API_STATE *pState = GetDrawState(GetContext(hContext));
573 pState->pfnBlendFunc[renderTarget] = pfnBlendFunc;
574 }
575
576 void SwrSetLinkage(
577 HANDLE hContext,
578 uint32_t mask,
579 const uint8_t* pMap)
580 {
581 API_STATE* pState = GetDrawState(GetContext(hContext));
582
583 static const uint8_t IDENTITY_MAP[] =
584 {
585 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
586 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
587 };
588 static_assert(sizeof(IDENTITY_MAP) == sizeof(pState->linkageMap),
589 "Update for new value of MAX_ATTRIBUTES");
590
591 pState->linkageMask = mask;
592 pState->linkageCount = _mm_popcnt_u32(mask);
593
594 if (!pMap)
595 {
596 pMap = IDENTITY_MAP;
597 }
598 memcpy(pState->linkageMap, pMap, pState->linkageCount);
599 }
600
601 // update guardband multipliers for the viewport
602 void updateGuardband(API_STATE *pState)
603 {
604 // guardband center is viewport center
605 pState->gbState.left = KNOB_GUARDBAND_WIDTH / pState->vp[0].width;
606 pState->gbState.right = KNOB_GUARDBAND_WIDTH / pState->vp[0].width;
607 pState->gbState.top = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height;
608 pState->gbState.bottom = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height;
609 }
610
611 void SwrSetRastState(
612 HANDLE hContext,
613 const SWR_RASTSTATE *pRastState)
614 {
615 SWR_CONTEXT *pContext = GetContext(hContext);
616 API_STATE* pState = GetDrawState(pContext);
617
618 memcpy(&pState->rastState, pRastState, sizeof(SWR_RASTSTATE));
619 }
620
621 void SwrSetViewports(
622 HANDLE hContext,
623 uint32_t numViewports,
624 const SWR_VIEWPORT* pViewports,
625 const SWR_VIEWPORT_MATRIX* pMatrices)
626 {
627 SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS,
628 "Invalid number of viewports.");
629
630 SWR_CONTEXT *pContext = GetContext(hContext);
631 API_STATE* pState = GetDrawState(pContext);
632
633 memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports);
634
635 if (pMatrices != nullptr)
636 {
637 memcpy(&pState->vpMatrix[0], pMatrices, sizeof(SWR_VIEWPORT_MATRIX) * numViewports);
638 }
639 else
640 {
641 // Compute default viewport transform.
642 for (uint32_t i = 0; i < numViewports; ++i)
643 {
644 if (pContext->driverType == DX)
645 {
646 pState->vpMatrix[i].m00 = pState->vp[i].width / 2.0f;
647 pState->vpMatrix[i].m11 = -pState->vp[i].height / 2.0f;
648 pState->vpMatrix[i].m22 = pState->vp[i].maxZ - pState->vp[i].minZ;
649 pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00;
650 pState->vpMatrix[i].m31 = pState->vp[i].y - pState->vpMatrix[i].m11;
651 pState->vpMatrix[i].m32 = pState->vp[i].minZ;
652 }
653 else
654 {
655 // Standard, with the exception that Y is inverted.
656 pState->vpMatrix[i].m00 = (pState->vp[i].width - pState->vp[i].x) / 2.0f;
657 pState->vpMatrix[i].m11 = (pState->vp[i].y - pState->vp[i].height) / 2.0f;
658 pState->vpMatrix[i].m22 = (pState->vp[i].maxZ - pState->vp[i].minZ) / 2.0f;
659 pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00;
660 pState->vpMatrix[i].m31 = pState->vp[i].height + pState->vpMatrix[i].m11;
661 pState->vpMatrix[i].m32 = pState->vp[i].minZ + pState->vpMatrix[i].m22;
662
663 // Now that the matrix is calculated, clip the view coords to screen size.
664 // OpenGL allows for -ve x,y in the viewport.
665 pState->vp[i].x = std::max(pState->vp[i].x, 0.0f);
666 pState->vp[i].y = std::max(pState->vp[i].y, 0.0f);
667 }
668 }
669 }
670
671 updateGuardband(pState);
672 }
673
674 void SwrSetScissorRects(
675 HANDLE hContext,
676 uint32_t numScissors,
677 const BBOX* pScissors)
678 {
679 SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS,
680 "Invalid number of scissor rects.");
681
682 API_STATE* pState = GetDrawState(GetContext(hContext));
683 memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(BBOX));
684 };
685
686 void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
687 {
688 API_STATE *pState = &pDC->pState->state;
689 uint32_t left, right, top, bottom;
690
691 // Set up scissor dimensions based on scissor or viewport
692 if (pState->rastState.scissorEnable)
693 {
694 // scissor rect right/bottom edge are exclusive, core expects scissor dimensions to be inclusive, so subtract one pixel from right/bottom edges
695 left = pState->scissorRects[0].left;
696 right = pState->scissorRects[0].right;
697 top = pState->scissorRects[0].top;
698 bottom = pState->scissorRects[0].bottom;
699 }
700 else
701 {
702 left = (int32_t)pState->vp[0].x;
703 right = (int32_t)pState->vp[0].x + (int32_t)pState->vp[0].width;
704 top = (int32_t)pState->vp[0].y;
705 bottom = (int32_t)pState->vp[0].y + (int32_t)pState->vp[0].height;
706 }
707
708 right = std::min<uint32_t>(right, KNOB_MAX_SCISSOR_X);
709 bottom = std::min<uint32_t>(bottom, KNOB_MAX_SCISSOR_Y);
710
711 if (left > KNOB_MAX_SCISSOR_X || top > KNOB_MAX_SCISSOR_Y)
712 {
713 pState->scissorInFixedPoint.left = 0;
714 pState->scissorInFixedPoint.right = 0;
715 pState->scissorInFixedPoint.top = 0;
716 pState->scissorInFixedPoint.bottom = 0;
717 }
718 else
719 {
720 pState->scissorInFixedPoint.left = left * FIXED_POINT_SCALE;
721 pState->scissorInFixedPoint.right = right * FIXED_POINT_SCALE - 1;
722 pState->scissorInFixedPoint.top = top * FIXED_POINT_SCALE;
723 pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1;
724 }
725 }
726 // templated backend function tables
727 extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
728 extern PFN_BACKEND_FUNC gBackendSingleSample[2][2];
729 extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2];
730 extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2];
731 extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS + 1][SWR_MULTISAMPLE_TYPE_MAX];
732 extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2];
733 extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2];
734 extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2];
735 void SetupPipeline(DRAW_CONTEXT *pDC)
736 {
737 DRAW_STATE* pState = pDC->pState;
738 const SWR_RASTSTATE &rastState = pState->state.rastState;
739 const SWR_PS_STATE &psState = pState->state.psState;
740 BACKEND_FUNCS& backendFuncs = pState->backendFuncs;
741 const uint32_t forcedSampleCount = (rastState.bForcedSampleCount) ? 1 : 0;
742
743 // setup backend
744 if (psState.pfnPixelShader == nullptr)
745 {
746 backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount];
747 // always need to generate I & J per sample for Z interpolation
748 backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[1];
749 }
750 else
751 {
752 const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.bForcedSampleCount) ? 1 : 0;
753 const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
754
755 // currently only support 'normal' input coverage
756 SWR_ASSERT(psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL ||
757 psState.inputCoverage == SWR_INPUT_COVERAGE_NONE);
758
759 SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask;
760
761 // select backend function
762 switch(psState.shadingRate)
763 {
764 case SWR_SHADING_RATE_PIXEL:
765 if(bMultisampleEnable)
766 {
767 // always need to generate I & J per sample for Z interpolation
768 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
769 backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount];
770 backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount];
771 }
772 else
773 {
774 // always need to generate I & J per pixel for Z interpolation
775 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK);
776 backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid];
777 backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][SWR_MULTISAMPLE_1X];
778 }
779 break;
780 case SWR_SHADING_RATE_SAMPLE:
781 SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN);
782 // always need to generate I & J per sample for Z interpolation
783 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
784 backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid];
785 backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount];
786 break;
787 default:
788 SWR_ASSERT(0 && "Invalid shading rate");
789 break;
790 }
791
792 // setup pointer to function that generates necessary barycentrics required by the PS
793 bool bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_PER_PIXEL_MASK) > 0 ? 1 : 0;
794 backendFuncs.pfnCalcPixelBarycentrics = gPixelBarycentricTable[bBarycentrics];
795
796 bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_PER_SAMPLE_MASK) > 0 ? 1 : 0;
797 backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[bBarycentrics];
798
799 bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0 ? 1 : 0;
800 backendFuncs.pfnCalcCentroidBarycentrics = gCentroidBarycentricTable[rastState.sampleCount][bBarycentrics][rastState.samplePattern][forcedSampleCount];
801 }
802
803 PFN_PROCESS_PRIMS pfnBinner;
804 switch (pState->state.topology)
805 {
806 case TOP_POINT_LIST:
807 pState->pfnProcessPrims = ClipPoints;
808 pfnBinner = BinPoints;
809 break;
810 case TOP_LINE_LIST:
811 case TOP_LINE_STRIP:
812 case TOP_LINE_LOOP:
813 case TOP_LINE_LIST_ADJ:
814 case TOP_LISTSTRIP_ADJ:
815 pState->pfnProcessPrims = ClipLines;
816 pfnBinner = BinLines;
817 break;
818 default:
819 pState->pfnProcessPrims = ClipTriangles;
820 pfnBinner = BinTriangles;
821 break;
822 };
823
824 // disable clipper if viewport transform is disabled
825 if (pState->state.frontendState.vpTransformDisable)
826 {
827 pState->pfnProcessPrims = pfnBinner;
828 }
829
830 if ((pState->state.psState.pfnPixelShader == nullptr) &&
831 (pState->state.depthStencilState.depthTestEnable == FALSE) &&
832 (pState->state.depthStencilState.depthWriteEnable == FALSE) &&
833 (pState->state.depthStencilState.stencilTestEnable == FALSE) &&
834 (pState->state.depthStencilState.stencilWriteEnable == FALSE) &&
835 (pState->state.linkageCount == 0))
836 {
837 pState->pfnProcessPrims = nullptr;
838 pState->state.linkageMask = 0;
839 }
840
841 if (pState->state.soState.rasterizerDisable == true)
842 {
843 pState->pfnProcessPrims = nullptr;
844 pState->state.linkageMask = 0;
845 }
846
847 // set up the frontend attrib mask
848 pState->state.feAttribMask = pState->state.linkageMask;
849 if (pState->state.soState.soEnable)
850 {
851 for (uint32_t i = 0; i < 4; ++i)
852 {
853 pState->state.feAttribMask |= pState->state.soState.streamMasks[i];
854 }
855 }
856
857 // complicated logic to test for cases where we don't need backing hottile memory for a draw
858 // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled.
859 pState->state.depthHottileEnable = ((!(pState->state.depthStencilState.depthTestEnable &&
860 !pState->state.depthStencilState.depthWriteEnable &&
861 pState->state.depthStencilState.depthTestFunc == ZFUNC_ALWAYS)) &&
862 (pState->state.depthStencilState.depthTestEnable ||
863 pState->state.depthStencilState.depthWriteEnable)) ? true : false;
864
865 pState->state.stencilHottileEnable = (((!(pState->state.depthStencilState.stencilTestEnable &&
866 !pState->state.depthStencilState.stencilWriteEnable &&
867 pState->state.depthStencilState.stencilTestFunc == ZFUNC_ALWAYS)) ||
868 // for stencil we have to check the double sided state as well
869 (!(pState->state.depthStencilState.doubleSidedStencilTestEnable &&
870 !pState->state.depthStencilState.stencilWriteEnable &&
871 pState->state.depthStencilState.backfaceStencilTestFunc == ZFUNC_ALWAYS))) &&
872 (pState->state.depthStencilState.stencilTestEnable ||
873 pState->state.depthStencilState.stencilWriteEnable)) ? true : false;
874
875 uint32_t numRTs = pState->state.psState.numRenderTargets;
876 pState->state.colorHottileEnable = 0;
877 if (psState.pfnPixelShader != nullptr)
878 {
879 for (uint32_t rt = 0; rt < numRTs; ++rt)
880 {
881 pState->state.colorHottileEnable |=
882 (!pState->state.blendState.renderTarget[rt].writeDisableAlpha ||
883 !pState->state.blendState.renderTarget[rt].writeDisableRed ||
884 !pState->state.blendState.renderTarget[rt].writeDisableGreen ||
885 !pState->state.blendState.renderTarget[rt].writeDisableBlue) ? (1 << rt) : 0;
886 }
887 }
888
889 // Setup depth quantization function
890 if (pState->state.depthHottileEnable)
891 {
892 switch (pState->state.rastState.depthFormat)
893 {
894 case R32_FLOAT_X8X24_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT_X8X24_TYPELESS > ; break;
895 case R32_FLOAT: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ; break;
896 case R24_UNORM_X8_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R24_UNORM_X8_TYPELESS > ; break;
897 case R16_UNORM: pState->state.pfnQuantizeDepth = QuantizeDepth < R16_UNORM > ; break;
898 default: SWR_ASSERT(false, "Unsupported depth format for depth quantiztion.");
899 pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ;
900 }
901 }
902 else
903 {
904 // set up pass-through quantize if depth isn't enabled
905 pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ;
906 }
907 }
908
909 //////////////////////////////////////////////////////////////////////////
910 /// @brief InitDraw
911 /// @param pDC - Draw context to initialize for this draw.
912 void InitDraw(
913 DRAW_CONTEXT *pDC,
914 bool isSplitDraw)
915 {
916 // We don't need to re-setup the scissors/pipeline state again for split draw.
917 if (isSplitDraw == false)
918 {
919 SetupMacroTileScissors(pDC);
920 SetupPipeline(pDC);
921 }
922 }
923
924 //////////////////////////////////////////////////////////////////////////
925 /// @brief We can split the draw for certain topologies for better performance.
926 /// @param totalVerts - Total vertices for draw
927 /// @param topology - Topology used for draw
928 uint32_t MaxVertsPerDraw(
929 DRAW_CONTEXT* pDC,
930 uint32_t totalVerts,
931 PRIMITIVE_TOPOLOGY topology)
932 {
933 API_STATE& state = pDC->pState->state;
934
935 uint32_t vertsPerDraw = totalVerts;
936
937 if (state.soState.soEnable)
938 {
939 return totalVerts;
940 }
941
942 switch (topology)
943 {
944 case TOP_POINT_LIST:
945 case TOP_TRIANGLE_LIST:
946 vertsPerDraw = KNOB_MAX_PRIMS_PER_DRAW;
947 break;
948
949 case TOP_PATCHLIST_1:
950 case TOP_PATCHLIST_2:
951 case TOP_PATCHLIST_3:
952 case TOP_PATCHLIST_4:
953 case TOP_PATCHLIST_5:
954 case TOP_PATCHLIST_6:
955 case TOP_PATCHLIST_7:
956 case TOP_PATCHLIST_8:
957 case TOP_PATCHLIST_9:
958 case TOP_PATCHLIST_10:
959 case TOP_PATCHLIST_11:
960 case TOP_PATCHLIST_12:
961 case TOP_PATCHLIST_13:
962 case TOP_PATCHLIST_14:
963 case TOP_PATCHLIST_15:
964 case TOP_PATCHLIST_16:
965 case TOP_PATCHLIST_17:
966 case TOP_PATCHLIST_18:
967 case TOP_PATCHLIST_19:
968 case TOP_PATCHLIST_20:
969 case TOP_PATCHLIST_21:
970 case TOP_PATCHLIST_22:
971 case TOP_PATCHLIST_23:
972 case TOP_PATCHLIST_24:
973 case TOP_PATCHLIST_25:
974 case TOP_PATCHLIST_26:
975 case TOP_PATCHLIST_27:
976 case TOP_PATCHLIST_28:
977 case TOP_PATCHLIST_29:
978 case TOP_PATCHLIST_30:
979 case TOP_PATCHLIST_31:
980 case TOP_PATCHLIST_32:
981 if (pDC->pState->state.tsState.tsEnable)
982 {
983 uint32_t vertsPerPrim = topology - TOP_PATCHLIST_BASE;
984 vertsPerDraw = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW;
985 }
986 break;
987
988 // The Primitive Assembly code can only handle 1 RECT at a time.
989 case TOP_RECT_LIST:
990 vertsPerDraw = 3;
991 break;
992
993 default:
994 // We are not splitting up draws for other topologies.
995 break;
996 }
997
998 return vertsPerDraw;
999 }
1000
1001 // Recursive template used to auto-nest conditionals. Converts dynamic boolean function
1002 // arguments to static template arguments.
1003 template <bool... ArgsB>
1004 struct FEDrawChooser
1005 {
1006 // Last Arg Terminator
1007 static PFN_FE_WORK_FUNC GetFunc(bool bArg)
1008 {
1009 if (bArg)
1010 {
1011 return ProcessDraw<ArgsB..., true>;
1012 }
1013
1014 return ProcessDraw<ArgsB..., false>;
1015 }
1016
1017 // Recursively parse args
1018 template <typename... TArgsT>
1019 static PFN_FE_WORK_FUNC GetFunc(bool bArg, TArgsT... remainingArgs)
1020 {
1021 if (bArg)
1022 {
1023 return FEDrawChooser<ArgsB..., true>::GetFunc(remainingArgs...);
1024 }
1025
1026 return FEDrawChooser<ArgsB..., false>::GetFunc(remainingArgs...);
1027 }
1028 };
1029
1030 // Selector for correct templated Draw front-end function
1031 INLINE
1032 static PFN_FE_WORK_FUNC GetFEDrawFunc(bool IsIndexed, bool HasTessellation, bool HasGeometryShader, bool HasStreamOut, bool RasterizerEnabled)
1033 {
1034 return FEDrawChooser<>::GetFunc(IsIndexed, HasTessellation, HasGeometryShader, HasStreamOut, RasterizerEnabled);
1035 }
1036
1037
1038 //////////////////////////////////////////////////////////////////////////
1039 /// @brief DrawInstanced
1040 /// @param hContext - Handle passed back from SwrCreateContext
1041 /// @param topology - Specifies topology for draw.
1042 /// @param numVerts - How many vertices to read sequentially from vertex data (per instance).
1043 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1044 /// @param numInstances - How many instances to render.
1045 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1046 void DrawInstanced(
1047 HANDLE hContext,
1048 PRIMITIVE_TOPOLOGY topology,
1049 uint32_t numVertices,
1050 uint32_t startVertex,
1051 uint32_t numInstances = 1,
1052 uint32_t startInstance = 0)
1053 {
1054 if (KNOB_TOSS_DRAW)
1055 {
1056 return;
1057 }
1058
1059 RDTSC_START(APIDraw);
1060
1061 SWR_CONTEXT *pContext = GetContext(hContext);
1062 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1063
1064 int32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
1065 uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw);
1066 int32_t remainingVerts = numVertices;
1067
1068 API_STATE *pState = &pDC->pState->state;
1069 pState->topology = topology;
1070 pState->forceFront = false;
1071
1072 // disable culling for points/lines
1073 uint32_t oldCullMode = pState->rastState.cullMode;
1074 if (topology == TOP_POINT_LIST)
1075 {
1076 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1077 pState->forceFront = true;
1078 }
1079
1080 int draw = 0;
1081 while (remainingVerts)
1082 {
1083 uint32_t numVertsForDraw = (remainingVerts < maxVertsPerDraw) ?
1084 remainingVerts : maxVertsPerDraw;
1085
1086 bool isSplitDraw = (draw > 0) ? true : false;
1087 DRAW_CONTEXT* pDC = GetDrawContext(pContext, isSplitDraw);
1088 InitDraw(pDC, isSplitDraw);
1089
1090 pDC->FeWork.type = DRAW;
1091 pDC->FeWork.pfnWork = GetFEDrawFunc(
1092 false, // IsIndexed
1093 pState->tsState.tsEnable,
1094 pState->gsState.gsEnable,
1095 pState->soState.soEnable,
1096 pDC->pState->pfnProcessPrims != nullptr);
1097 pDC->FeWork.desc.draw.numVerts = numVertsForDraw;
1098 pDC->FeWork.desc.draw.startVertex = startVertex;
1099 pDC->FeWork.desc.draw.numInstances = numInstances;
1100 pDC->FeWork.desc.draw.startInstance = startInstance;
1101 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
1102 pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw;
1103
1104 pDC->cleanupState = (remainingVerts == numVertsForDraw);
1105
1106 //enqueue DC
1107 QueueDraw(pContext);
1108
1109 remainingVerts -= numVertsForDraw;
1110 draw++;
1111 }
1112
1113 // restore culling state
1114 pDC = GetDrawContext(pContext);
1115 pDC->pState->state.rastState.cullMode = oldCullMode;
1116
1117 RDTSC_STOP(APIDraw, numVertices * numInstances, 0);
1118 }
1119
1120 //////////////////////////////////////////////////////////////////////////
1121 /// @brief SwrDraw
1122 /// @param hContext - Handle passed back from SwrCreateContext
1123 /// @param topology - Specifies topology for draw.
1124 /// @param startVertex - Specifies start vertex in vertex buffer for draw.
1125 /// @param primCount - Number of vertices.
1126 void SwrDraw(
1127 HANDLE hContext,
1128 PRIMITIVE_TOPOLOGY topology,
1129 uint32_t startVertex,
1130 uint32_t numVertices)
1131 {
1132 DrawInstanced(hContext, topology, numVertices, startVertex);
1133 }
1134
1135 //////////////////////////////////////////////////////////////////////////
1136 /// @brief SwrDrawInstanced
1137 /// @param hContext - Handle passed back from SwrCreateContext
1138 /// @param topology - Specifies topology for draw.
1139 /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
1140 /// @param numInstances - How many instances to render.
1141 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1142 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1143 void SwrDrawInstanced(
1144 HANDLE hContext,
1145 PRIMITIVE_TOPOLOGY topology,
1146 uint32_t numVertsPerInstance,
1147 uint32_t numInstances,
1148 uint32_t startVertex,
1149 uint32_t startInstance
1150 )
1151 {
1152 DrawInstanced(hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance);
1153 }
1154
1155 //////////////////////////////////////////////////////////////////////////
1156 /// @brief DrawIndexedInstanced
1157 /// @param hContext - Handle passed back from SwrCreateContext
1158 /// @param topology - Specifies topology for draw.
1159 /// @param numIndices - Number of indices to read sequentially from index buffer.
1160 /// @param indexOffset - Starting index into index buffer.
1161 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1162 /// @param numInstances - Number of instances to render.
1163 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1164 void DrawIndexedInstance(
1165 HANDLE hContext,
1166 PRIMITIVE_TOPOLOGY topology,
1167 uint32_t numIndices,
1168 uint32_t indexOffset,
1169 int32_t baseVertex,
1170 uint32_t numInstances = 1,
1171 uint32_t startInstance = 0)
1172 {
1173 if (KNOB_TOSS_DRAW)
1174 {
1175 return;
1176 }
1177
1178 RDTSC_START(APIDrawIndexed);
1179
1180 SWR_CONTEXT *pContext = GetContext(hContext);
1181 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1182 API_STATE* pState = &pDC->pState->state;
1183
1184 int32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
1185 uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw);
1186 int32_t remainingIndices = numIndices;
1187
1188 uint32_t indexSize = 0;
1189 switch (pState->indexBuffer.format)
1190 {
1191 case R32_UINT: indexSize = sizeof(uint32_t); break;
1192 case R16_UINT: indexSize = sizeof(uint16_t); break;
1193 case R8_UINT: indexSize = sizeof(uint8_t); break;
1194 default:
1195 SWR_ASSERT(0);
1196 }
1197
1198 int draw = 0;
1199 uint8_t *pIB = (uint8_t*)pState->indexBuffer.pIndices;
1200 pIB += (uint64_t)indexOffset * (uint64_t)indexSize;
1201
1202 pState->topology = topology;
1203 pState->forceFront = false;
1204
1205 // disable culling for points/lines
1206 uint32_t oldCullMode = pState->rastState.cullMode;
1207 if (topology == TOP_POINT_LIST)
1208 {
1209 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1210 pState->forceFront = true;
1211 }
1212
1213 while (remainingIndices)
1214 {
1215 uint32_t numIndicesForDraw = (remainingIndices < maxIndicesPerDraw) ?
1216 remainingIndices : maxIndicesPerDraw;
1217
1218 // When breaking up draw, we need to obtain new draw context for each iteration.
1219 bool isSplitDraw = (draw > 0) ? true : false;
1220 pDC = GetDrawContext(pContext, isSplitDraw);
1221 InitDraw(pDC, isSplitDraw);
1222
1223 pDC->FeWork.type = DRAW;
1224 pDC->FeWork.pfnWork = GetFEDrawFunc(
1225 true, // IsIndexed
1226 pState->tsState.tsEnable,
1227 pState->gsState.gsEnable,
1228 pState->soState.soEnable,
1229 pDC->pState->pfnProcessPrims != nullptr);
1230 pDC->FeWork.desc.draw.pDC = pDC;
1231 pDC->FeWork.desc.draw.numIndices = numIndicesForDraw;
1232 pDC->FeWork.desc.draw.pIB = (int*)pIB;
1233 pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format;
1234
1235 pDC->FeWork.desc.draw.numInstances = numInstances;
1236 pDC->FeWork.desc.draw.startInstance = startInstance;
1237 pDC->FeWork.desc.draw.baseVertex = baseVertex;
1238 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
1239
1240 pDC->cleanupState = (remainingIndices == numIndicesForDraw);
1241
1242 //enqueue DC
1243 QueueDraw(pContext);
1244
1245 pIB += maxIndicesPerDraw * indexSize;
1246 remainingIndices -= numIndicesForDraw;
1247 draw++;
1248 }
1249
1250 // restore culling state
1251 pDC = GetDrawContext(pContext);
1252 pDC->pState->state.rastState.cullMode = oldCullMode;
1253
1254 RDTSC_STOP(APIDrawIndexed, numIndices * numInstances, 0);
1255 }
1256
1257
1258 //////////////////////////////////////////////////////////////////////////
1259 /// @brief DrawIndexed
1260 /// @param hContext - Handle passed back from SwrCreateContext
1261 /// @param topology - Specifies topology for draw.
1262 /// @param numIndices - Number of indices to read sequentially from index buffer.
1263 /// @param indexOffset - Starting index into index buffer.
1264 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1265 void SwrDrawIndexed(
1266 HANDLE hContext,
1267 PRIMITIVE_TOPOLOGY topology,
1268 uint32_t numIndices,
1269 uint32_t indexOffset,
1270 int32_t baseVertex
1271 )
1272 {
1273 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex);
1274 }
1275
1276 //////////////////////////////////////////////////////////////////////////
1277 /// @brief SwrDrawIndexedInstanced
1278 /// @param hContext - Handle passed back from SwrCreateContext
1279 /// @param topology - Specifies topology for draw.
1280 /// @param numIndices - Number of indices to read sequentially from index buffer.
1281 /// @param numInstances - Number of instances to render.
1282 /// @param indexOffset - Starting index into index buffer.
1283 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1284 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1285 void SwrDrawIndexedInstanced(
1286 HANDLE hContext,
1287 PRIMITIVE_TOPOLOGY topology,
1288 uint32_t numIndices,
1289 uint32_t numInstances,
1290 uint32_t indexOffset,
1291 int32_t baseVertex,
1292 uint32_t startInstance)
1293 {
1294 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance);
1295 }
1296
1297 //////////////////////////////////////////////////////////////////////////
1298 /// @brief SwrInvalidateTiles
1299 /// @param hContext - Handle passed back from SwrCreateContext
1300 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate.
1301 void SwrInvalidateTiles(
1302 HANDLE hContext,
1303 uint32_t attachmentMask)
1304 {
1305 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1306 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1307
1308 pDC->FeWork.type = DISCARDINVALIDATETILES;
1309 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
1310 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
1311 memset(&pDC->FeWork.desc.discardInvalidateTiles.rect, 0, sizeof(SWR_RECT));
1312 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_INVALID;
1313 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false;
1314 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = false;
1315
1316 //enqueue
1317 QueueDraw(pContext);
1318 }
1319
1320 //////////////////////////////////////////////////////////////////////////
1321 /// @brief SwrDiscardRect
1322 /// @param hContext - Handle passed back from SwrCreateContext
1323 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
1324 /// @param rect - if rect is all zeros, the entire attachment surface will be discarded
1325 void SwrDiscardRect(
1326 HANDLE hContext,
1327 uint32_t attachmentMask,
1328 SWR_RECT rect)
1329 {
1330 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1331 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1332
1333 // Queue a load to the hottile
1334 pDC->FeWork.type = DISCARDINVALIDATETILES;
1335 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
1336 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
1337 pDC->FeWork.desc.discardInvalidateTiles.rect = rect;
1338 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_RESOLVED;
1339 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true;
1340 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = true;
1341
1342 //enqueue
1343 QueueDraw(pContext);
1344 }
1345
1346 //////////////////////////////////////////////////////////////////////////
1347 /// @brief SwrDispatch
1348 /// @param hContext - Handle passed back from SwrCreateContext
1349 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
1350 /// @param threadGroupCountY - Number of thread groups dispatched in Y direction
1351 /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
1352 void SwrDispatch(
1353 HANDLE hContext,
1354 uint32_t threadGroupCountX,
1355 uint32_t threadGroupCountY,
1356 uint32_t threadGroupCountZ)
1357 {
1358 if (KNOB_TOSS_DRAW)
1359 {
1360 return;
1361 }
1362
1363 RDTSC_START(APIDispatch);
1364 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1365 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1366
1367 pDC->isCompute = true; // This is a compute context.
1368
1369 // Ensure spill fill pointers are initialized to nullptr.
1370 memset(pDC->pSpillFill, 0, sizeof(pDC->pSpillFill));
1371
1372 COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64);
1373
1374 pTaskData->threadGroupCountX = threadGroupCountX;
1375 pTaskData->threadGroupCountY = threadGroupCountY;
1376 pTaskData->threadGroupCountZ = threadGroupCountZ;
1377
1378 uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
1379 pDC->pDispatch->initialize(totalThreadGroups, pTaskData);
1380
1381 QueueDispatch(pContext);
1382 RDTSC_STOP(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ, 0);
1383 }
1384
1385 // Deswizzles, converts and stores current contents of the hot tiles to surface
1386 // described by pState
1387 void SwrStoreTiles(
1388 HANDLE hContext,
1389 SWR_RENDERTARGET_ATTACHMENT attachment,
1390 SWR_TILE_STATE postStoreTileState)
1391 {
1392 RDTSC_START(APIStoreTiles);
1393
1394 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1395 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1396
1397 SetupMacroTileScissors(pDC);
1398
1399 pDC->FeWork.type = STORETILES;
1400 pDC->FeWork.pfnWork = ProcessStoreTiles;
1401 pDC->FeWork.desc.storeTiles.attachment = attachment;
1402 pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState;
1403
1404 //enqueue
1405 QueueDraw(pContext);
1406
1407 RDTSC_STOP(APIStoreTiles, 0, 0);
1408 }
1409
1410 void SwrClearRenderTarget(
1411 HANDLE hContext,
1412 uint32_t clearMask,
1413 const float clearColor[4],
1414 float z,
1415 uint8_t stencil)
1416 {
1417 RDTSC_START(APIClearRenderTarget);
1418
1419 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1420
1421 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1422
1423 SetupMacroTileScissors(pDC);
1424
1425 CLEAR_FLAGS flags;
1426 flags.mask = clearMask;
1427
1428 pDC->FeWork.type = CLEAR;
1429 pDC->FeWork.pfnWork = ProcessClear;
1430 pDC->FeWork.desc.clear.flags = flags;
1431 pDC->FeWork.desc.clear.clearDepth = z;
1432 pDC->FeWork.desc.clear.clearRTColor[0] = clearColor[0];
1433 pDC->FeWork.desc.clear.clearRTColor[1] = clearColor[1];
1434 pDC->FeWork.desc.clear.clearRTColor[2] = clearColor[2];
1435 pDC->FeWork.desc.clear.clearRTColor[3] = clearColor[3];
1436 pDC->FeWork.desc.clear.clearStencil = stencil;
1437
1438 // enqueue draw
1439 QueueDraw(pContext);
1440
1441 RDTSC_STOP(APIClearRenderTarget, 0, pDC->drawId);
1442 }
1443
1444 //////////////////////////////////////////////////////////////////////////
1445 /// @brief Returns a pointer to the private context state for the current
1446 /// draw operation. This is used for external componets such as the
1447 /// sampler.
1448 /// SWR is responsible for the allocation of the private context state.
1449 /// @param hContext - Handle passed back from SwrCreateContext
1450 VOID* SwrGetPrivateContextState(
1451 HANDLE hContext)
1452 {
1453 SWR_CONTEXT* pContext = GetContext(hContext);
1454 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1455 DRAW_STATE* pState = pDC->pState;
1456
1457 if (pState->pPrivateState == nullptr)
1458 {
1459 pState->pPrivateState = pState->pArena->AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float));
1460 }
1461
1462 return pState->pPrivateState;
1463 }
1464
1465 //////////////////////////////////////////////////////////////////////////
1466 /// @brief Clients can use this to allocate memory for draw/dispatch
1467 /// operations. The memory will automatically be freed once operation
1468 /// has completed. Client can use this to allocate binding tables,
1469 /// etc. needed for shader execution.
1470 /// @param hContext - Handle passed back from SwrCreateContext
1471 /// @param size - Size of allocation
1472 /// @param align - Alignment needed for allocation.
1473 VOID* SwrAllocDrawContextMemory(
1474 HANDLE hContext,
1475 uint32_t size,
1476 uint32_t align)
1477 {
1478 SWR_CONTEXT* pContext = GetContext(hContext);
1479 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1480
1481 return pDC->pState->pArena->AllocAligned(size, align);
1482 }
1483
1484 //////////////////////////////////////////////////////////////////////////
1485 /// @brief Returns pointer to SWR stats.
1486 /// @note The counters are atomically incremented by multiple threads.
1487 /// When calling this, you need to ensure all previous operations
1488 /// have completed.
1489 /// @todo If necessary, add a callback to avoid stalling the pipe to
1490 /// sample the counters.
1491 /// @param hContext - Handle passed back from SwrCreateContext
1492 /// @param pStats - SWR will fill this out for caller.
1493 void SwrGetStats(
1494 HANDLE hContext,
1495 SWR_STATS* pStats)
1496 {
1497 SWR_CONTEXT *pContext = GetContext(hContext);
1498 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1499
1500 pDC->FeWork.type = QUERYSTATS;
1501 pDC->FeWork.pfnWork = ProcessQueryStats;
1502 pDC->FeWork.desc.queryStats.pStats = pStats;
1503
1504 // cannot execute until all previous draws have completed
1505 pDC->dependency = pDC->drawId - 1;
1506
1507 //enqueue
1508 QueueDraw(pContext);
1509 }
1510
1511 //////////////////////////////////////////////////////////////////////////
1512 /// @brief Enables stats counting
1513 /// @param hContext - Handle passed back from SwrCreateContext
1514 /// @param enable - If true then counts are incremented.
1515 void SwrEnableStats(
1516 HANDLE hContext,
1517 bool enable)
1518 {
1519 SWR_CONTEXT *pContext = GetContext(hContext);
1520 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1521
1522 pDC->pState->state.enableStats = enable;
1523 }
1524
1525 //////////////////////////////////////////////////////////////////////////
1526 /// @brief Mark end of frame - used for performance profiling
1527 /// @param hContext - Handle passed back from SwrCreateContext
1528 void SWR_API SwrEndFrame(
1529 HANDLE hContext)
1530 {
1531 RDTSC_ENDFRAME();
1532 }