swr: [rasterizer core] Cleanup state ring arena after last draw that references it...
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / api.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file api.cpp
24 *
25 * @brief API implementation
26 *
27 ******************************************************************************/
28
29 #include <cfloat>
30 #include <cmath>
31 #include <cstdio>
32
33 #include "core/api.h"
34 #include "core/backend.h"
35 #include "core/context.h"
36 #include "core/frontend.h"
37 #include "core/rasterizer.h"
38 #include "core/rdtsc_core.h"
39 #include "core/threads.h"
40 #include "core/tilemgr.h"
41 #include "core/clip.h"
42
43 #include "common/simdintrin.h"
44 #include "common/os.h"
45
46 void SetupDefaultState(SWR_CONTEXT *pContext);
47
48 //////////////////////////////////////////////////////////////////////////
49 /// @brief Create SWR Context.
50 /// @param pCreateInfo - pointer to creation info.
51 HANDLE SwrCreateContext(
52 SWR_CREATECONTEXT_INFO* pCreateInfo)
53 {
54 RDTSC_RESET();
55 RDTSC_INIT(0);
56
57 void* pContextMem = _aligned_malloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4);
58 memset(pContextMem, 0, sizeof(SWR_CONTEXT));
59 SWR_CONTEXT *pContext = new (pContextMem) SWR_CONTEXT();
60
61 pContext->driverType = pCreateInfo->driver;
62 pContext->privateStateSize = pCreateInfo->privateStateSize;
63
64 pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
65 pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
66
67 for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
68 {
69 pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
70 pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena));
71 pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen.
72
73 pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
74 }
75
76 if (!KNOB_SINGLE_THREADED)
77 {
78 memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
79 memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
80 new (&pContext->WaitLock) std::mutex();
81 new (&pContext->FifosNotEmpty) std::condition_variable();
82
83 CreateThreadPool(pContext, &pContext->threadPool);
84 }
85
86 // Calling createThreadPool() above can set SINGLE_THREADED
87 if (KNOB_SINGLE_THREADED)
88 {
89 pContext->NumWorkerThreads = 1;
90 }
91
92 // Allocate scratch space for workers.
93 ///@note We could lazily allocate this but its rather small amount of memory.
94 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
95 {
96 ///@todo Use numa API for allocations using numa information from thread data (if exists).
97 pContext->pScratch[i] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH * 4);
98 }
99
100 // State setup AFTER context is fully initialized
101 SetupDefaultState(pContext);
102
103 // initialize hot tile manager
104 pContext->pHotTileMgr = new HotTileMgr();
105
106 // initialize function pointer tables
107 InitClearTilesTable();
108
109 // initialize store tiles function
110 pContext->pfnLoadTile = pCreateInfo->pfnLoadTile;
111 pContext->pfnStoreTile = pCreateInfo->pfnStoreTile;
112 pContext->pfnClearTile = pCreateInfo->pfnClearTile;
113
114 // pass pointer to bucket manager back to caller
115 #ifdef KNOB_ENABLE_RDTSC
116 pCreateInfo->pBucketMgr = &gBucketMgr;
117 #endif
118
119 pCreateInfo->contextSaveSize = sizeof(API_STATE);
120
121 return (HANDLE)pContext;
122 }
123
124 void SwrDestroyContext(HANDLE hContext)
125 {
126 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
127 DestroyThreadPool(pContext, &pContext->threadPool);
128
129 // free the fifos
130 for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i)
131 {
132 delete pContext->dcRing[i].pArena;
133 delete pContext->dsRing[i].pArena;
134 delete(pContext->dcRing[i].pTileMgr);
135 delete(pContext->dcRing[i].pDispatch);
136 }
137
138 // Free scratch space.
139 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
140 {
141 _aligned_free(pContext->pScratch[i]);
142 }
143
144 delete(pContext->pHotTileMgr);
145
146 pContext->~SWR_CONTEXT();
147 _aligned_free((SWR_CONTEXT*)hContext);
148 }
149
150 void CopyState(DRAW_STATE& dst, const DRAW_STATE& src)
151 {
152 memcpy(&dst.state, &src.state, sizeof(API_STATE));
153 }
154
155 void WakeAllThreads(SWR_CONTEXT *pContext)
156 {
157 pContext->FifosNotEmpty.notify_all();
158 }
159
160 template<bool IsDraw>
161 void QueueWork(SWR_CONTEXT *pContext)
162 {
163 if (IsDraw)
164 {
165 // Each worker thread looks at a DC for both FE and BE work at different times and so we
166 // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers
167 // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
168 // then moved on if all work is done.)
169 pContext->pCurDrawContext->threadsDone =
170 pContext->NumWorkerThreads ? pContext->NumWorkerThreads * 2 : 2;
171 }
172 else
173 {
174 pContext->pCurDrawContext->threadsDone =
175 pContext->NumWorkerThreads ? pContext->NumWorkerThreads : 1;
176 }
177
178 _ReadWriteBarrier();
179 {
180 std::unique_lock<std::mutex> lock(pContext->WaitLock);
181 pContext->dcRing.Enqueue();
182 }
183
184 if (KNOB_SINGLE_THREADED)
185 {
186 // flush denormals to 0
187 uint32_t mxcsr = _mm_getcsr();
188 _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
189
190 if (IsDraw)
191 {
192 static TileSet lockedTiles;
193 uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
194 WorkOnFifoFE(pContext, 0, curDraw[0], 0);
195 WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles);
196 }
197 else
198 {
199 uint64_t curDispatch = pContext->pCurDrawContext->drawId;
200 WorkOnCompute(pContext, 0, curDispatch);
201 }
202
203 // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
204 if (!pContext->dcRing.IsEmpty())
205 {
206 pContext->dcRing.Dequeue();
207 }
208
209 // restore csr
210 _mm_setcsr(mxcsr);
211 }
212 else
213 {
214 RDTSC_START(APIDrawWakeAllThreads);
215 WakeAllThreads(pContext);
216 RDTSC_STOP(APIDrawWakeAllThreads, 1, 0);
217 }
218
219 // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
220 pContext->pPrevDrawContext = pContext->pCurDrawContext;
221 pContext->pCurDrawContext = nullptr;
222 }
223
224 INLINE void QueueDraw(SWR_CONTEXT* pContext)
225 {
226 QueueWork<true>(pContext);
227 }
228
229 INLINE void QueueDispatch(SWR_CONTEXT* pContext)
230 {
231 QueueWork<false>(pContext);
232 }
233
234 DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
235 {
236 RDTSC_START(APIGetDrawContext);
237 // If current draw context is null then need to obtain a new draw context to use from ring.
238 if (pContext->pCurDrawContext == nullptr)
239 {
240 // Need to wait for a free entry.
241 while (pContext->dcRing.IsFull())
242 {
243 _mm_pause();
244 }
245
246 uint32_t dcIndex = pContext->dcRing.GetHead() % KNOB_MAX_DRAWS_IN_FLIGHT;
247
248 DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
249 pContext->pCurDrawContext = pCurDrawContext;
250
251 // Assign next available entry in DS ring to this DC.
252 uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
253 pCurDrawContext->pState = &pContext->dsRing[dsIndex];
254
255 auto& stateArena = *(pCurDrawContext->pState->pArena);
256
257 // Copy previous state to current state.
258 if (pContext->pPrevDrawContext)
259 {
260 DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext;
261
262 // If we're splitting our draw then we can just use the same state from the previous
263 // draw. In this case, we won't increment the DS ring index so the next non-split
264 // draw can receive the state.
265 if (isSplitDraw == false)
266 {
267 CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState);
268
269 stateArena.Reset(true); // Reset memory.
270 pCurDrawContext->pState->pPrivateState = nullptr;
271
272 pContext->curStateId++; // Progress state ring index forward.
273 }
274 else
275 {
276 // If its a split draw then just copy the state pointer over
277 // since its the same draw.
278 pCurDrawContext->pState = pPrevDrawContext->pState;
279 }
280 }
281 else
282 {
283 stateArena.Reset(); // Reset memory.
284 pContext->curStateId++; // Progress state ring index forward.
285 }
286
287 pCurDrawContext->dependency = 0;
288 pCurDrawContext->pArena->Reset();
289 pCurDrawContext->pContext = pContext;
290 pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
291
292 pCurDrawContext->doneFE = false;
293 pCurDrawContext->FeLock = 0;
294 pCurDrawContext->threadsDone = 0;
295
296 pCurDrawContext->pTileMgr->initialize();
297
298 // Assign unique drawId for this DC
299 pCurDrawContext->drawId = pContext->dcRing.GetHead();
300
301 pCurDrawContext->cleanupState = true;
302 }
303 else
304 {
305 SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC");
306 }
307
308 RDTSC_STOP(APIGetDrawContext, 0, 0);
309 return pContext->pCurDrawContext;
310 }
311
312 API_STATE* GetDrawState(SWR_CONTEXT *pContext)
313 {
314 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
315 SWR_ASSERT(pDC->pState != nullptr);
316
317 return &pDC->pState->state;
318 }
319
320 void SWR_API SwrSaveState(
321 HANDLE hContext,
322 void* pOutputStateBlock,
323 size_t memSize)
324 {
325 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
326 auto pSrc = GetDrawState(pContext);
327 SWR_ASSERT(pOutputStateBlock && memSize >= sizeof(*pSrc));
328
329 memcpy(pOutputStateBlock, pSrc, sizeof(*pSrc));
330 }
331
332 void SWR_API SwrRestoreState(
333 HANDLE hContext,
334 const void* pStateBlock,
335 size_t memSize)
336 {
337 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
338 auto pDst = GetDrawState(pContext);
339 SWR_ASSERT(pStateBlock && memSize >= sizeof(*pDst));
340
341 memcpy(pDst, pStateBlock, sizeof(*pDst));
342 }
343
344 void SetupDefaultState(SWR_CONTEXT *pContext)
345 {
346 API_STATE* pState = GetDrawState(pContext);
347
348 pState->rastState.cullMode = SWR_CULLMODE_NONE;
349 pState->rastState.frontWinding = SWR_FRONTWINDING_CCW;
350 }
351
352 static INLINE SWR_CONTEXT* GetContext(HANDLE hContext)
353 {
354 return (SWR_CONTEXT*)hContext;
355 }
356
357 void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint64_t userData2, uint64_t userData3)
358 {
359 RDTSC_START(APISync);
360
361 SWR_ASSERT(pfnFunc != nullptr);
362
363 SWR_CONTEXT *pContext = GetContext(hContext);
364 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
365
366 pDC->FeWork.type = SYNC;
367 pDC->FeWork.pfnWork = ProcessSync;
368 pDC->FeWork.desc.sync.pfnCallbackFunc = pfnFunc;
369 pDC->FeWork.desc.sync.userData = userData;
370 pDC->FeWork.desc.sync.userData2 = userData2;
371 pDC->FeWork.desc.sync.userData3 = userData3;
372
373 // cannot execute until all previous draws have completed
374 pDC->dependency = pDC->drawId - 1;
375
376 //enqueue
377 QueueDraw(pContext);
378
379 RDTSC_STOP(APISync, 1, 0);
380 }
381
382 void SwrWaitForIdle(HANDLE hContext)
383 {
384 SWR_CONTEXT *pContext = GetContext(hContext);
385
386 RDTSC_START(APIWaitForIdle);
387
388 while (!pContext->dcRing.IsEmpty())
389 {
390 _mm_pause();
391 }
392
393 RDTSC_STOP(APIWaitForIdle, 1, 0);
394 }
395
396 void SwrSetVertexBuffers(
397 HANDLE hContext,
398 uint32_t numBuffers,
399 const SWR_VERTEX_BUFFER_STATE* pVertexBuffers)
400 {
401 API_STATE* pState = GetDrawState(GetContext(hContext));
402
403 for (uint32_t i = 0; i < numBuffers; ++i)
404 {
405 const SWR_VERTEX_BUFFER_STATE *pVB = &pVertexBuffers[i];
406 pState->vertexBuffers[pVB->index] = *pVB;
407 }
408 }
409
410 void SwrSetIndexBuffer(
411 HANDLE hContext,
412 const SWR_INDEX_BUFFER_STATE* pIndexBuffer)
413 {
414 API_STATE* pState = GetDrawState(GetContext(hContext));
415
416 pState->indexBuffer = *pIndexBuffer;
417 }
418
419 void SwrSetFetchFunc(
420 HANDLE hContext,
421 PFN_FETCH_FUNC pfnFetchFunc)
422 {
423 API_STATE* pState = GetDrawState(GetContext(hContext));
424
425 pState->pfnFetchFunc = pfnFetchFunc;
426 }
427
428 void SwrSetSoFunc(
429 HANDLE hContext,
430 PFN_SO_FUNC pfnSoFunc,
431 uint32_t streamIndex)
432 {
433 API_STATE* pState = GetDrawState(GetContext(hContext));
434
435 SWR_ASSERT(streamIndex < MAX_SO_STREAMS);
436
437 pState->pfnSoFunc[streamIndex] = pfnSoFunc;
438 }
439
440 void SwrSetSoState(
441 HANDLE hContext,
442 SWR_STREAMOUT_STATE* pSoState)
443 {
444 API_STATE* pState = GetDrawState(GetContext(hContext));
445
446 pState->soState = *pSoState;
447 }
448
449 void SwrSetSoBuffers(
450 HANDLE hContext,
451 SWR_STREAMOUT_BUFFER* pSoBuffer,
452 uint32_t slot)
453 {
454 API_STATE* pState = GetDrawState(GetContext(hContext));
455
456 SWR_ASSERT((slot < 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot);
457
458 pState->soBuffer[slot] = *pSoBuffer;
459 }
460
461 void SwrSetVertexFunc(
462 HANDLE hContext,
463 PFN_VERTEX_FUNC pfnVertexFunc)
464 {
465 API_STATE* pState = GetDrawState(GetContext(hContext));
466
467 pState->pfnVertexFunc = pfnVertexFunc;
468 }
469
470 void SwrSetFrontendState(
471 HANDLE hContext,
472 SWR_FRONTEND_STATE *pFEState)
473 {
474 API_STATE* pState = GetDrawState(GetContext(hContext));
475 pState->frontendState = *pFEState;
476 }
477
478 void SwrSetGsState(
479 HANDLE hContext,
480 SWR_GS_STATE *pGSState)
481 {
482 API_STATE* pState = GetDrawState(GetContext(hContext));
483 pState->gsState = *pGSState;
484 }
485
486 void SwrSetGsFunc(
487 HANDLE hContext,
488 PFN_GS_FUNC pfnGsFunc)
489 {
490 API_STATE* pState = GetDrawState(GetContext(hContext));
491 pState->pfnGsFunc = pfnGsFunc;
492 }
493
494 void SwrSetCsFunc(
495 HANDLE hContext,
496 PFN_CS_FUNC pfnCsFunc,
497 uint32_t totalThreadsInGroup)
498 {
499 API_STATE* pState = GetDrawState(GetContext(hContext));
500 pState->pfnCsFunc = pfnCsFunc;
501 pState->totalThreadsInGroup = totalThreadsInGroup;
502 }
503
504 void SwrSetTsState(
505 HANDLE hContext,
506 SWR_TS_STATE *pState)
507 {
508 API_STATE* pApiState = GetDrawState(GetContext(hContext));
509 pApiState->tsState = *pState;
510 }
511
512 void SwrSetHsFunc(
513 HANDLE hContext,
514 PFN_HS_FUNC pfnFunc)
515 {
516 API_STATE* pApiState = GetDrawState(GetContext(hContext));
517 pApiState->pfnHsFunc = pfnFunc;
518 }
519
520 void SwrSetDsFunc(
521 HANDLE hContext,
522 PFN_DS_FUNC pfnFunc)
523 {
524 API_STATE* pApiState = GetDrawState(GetContext(hContext));
525 pApiState->pfnDsFunc = pfnFunc;
526 }
527
528 void SwrSetDepthStencilState(
529 HANDLE hContext,
530 SWR_DEPTH_STENCIL_STATE *pDSState)
531 {
532 API_STATE* pState = GetDrawState(GetContext(hContext));
533
534 pState->depthStencilState = *pDSState;
535 }
536
537 void SwrSetBackendState(
538 HANDLE hContext,
539 SWR_BACKEND_STATE *pBEState)
540 {
541 API_STATE* pState = GetDrawState(GetContext(hContext));
542
543 pState->backendState = *pBEState;
544 }
545
546 void SwrSetPixelShaderState(
547 HANDLE hContext,
548 SWR_PS_STATE *pPSState)
549 {
550 API_STATE *pState = GetDrawState(GetContext(hContext));
551 pState->psState = *pPSState;
552 }
553
554 void SwrSetBlendState(
555 HANDLE hContext,
556 SWR_BLEND_STATE *pBlendState)
557 {
558 API_STATE *pState = GetDrawState(GetContext(hContext));
559 memcpy(&pState->blendState, pBlendState, sizeof(SWR_BLEND_STATE));
560 }
561
562 void SwrSetBlendFunc(
563 HANDLE hContext,
564 uint32_t renderTarget,
565 PFN_BLEND_JIT_FUNC pfnBlendFunc)
566 {
567 SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS);
568 API_STATE *pState = GetDrawState(GetContext(hContext));
569 pState->pfnBlendFunc[renderTarget] = pfnBlendFunc;
570 }
571
572 void SwrSetLinkage(
573 HANDLE hContext,
574 uint32_t mask,
575 const uint8_t* pMap)
576 {
577 API_STATE* pState = GetDrawState(GetContext(hContext));
578
579 static const uint8_t IDENTITY_MAP[] =
580 {
581 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
582 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
583 };
584 static_assert(sizeof(IDENTITY_MAP) == sizeof(pState->linkageMap),
585 "Update for new value of MAX_ATTRIBUTES");
586
587 pState->linkageMask = mask;
588 pState->linkageCount = _mm_popcnt_u32(mask);
589
590 if (!pMap)
591 {
592 pMap = IDENTITY_MAP;
593 }
594 memcpy(pState->linkageMap, pMap, pState->linkageCount);
595 }
596
597 // update guardband multipliers for the viewport
598 void updateGuardband(API_STATE *pState)
599 {
600 // guardband center is viewport center
601 pState->gbState.left = KNOB_GUARDBAND_WIDTH / pState->vp[0].width;
602 pState->gbState.right = KNOB_GUARDBAND_WIDTH / pState->vp[0].width;
603 pState->gbState.top = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height;
604 pState->gbState.bottom = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height;
605 }
606
607 void SwrSetRastState(
608 HANDLE hContext,
609 const SWR_RASTSTATE *pRastState)
610 {
611 SWR_CONTEXT *pContext = GetContext(hContext);
612 API_STATE* pState = GetDrawState(pContext);
613
614 memcpy(&pState->rastState, pRastState, sizeof(SWR_RASTSTATE));
615 }
616
617 void SwrSetViewports(
618 HANDLE hContext,
619 uint32_t numViewports,
620 const SWR_VIEWPORT* pViewports,
621 const SWR_VIEWPORT_MATRIX* pMatrices)
622 {
623 SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS,
624 "Invalid number of viewports.");
625
626 SWR_CONTEXT *pContext = GetContext(hContext);
627 API_STATE* pState = GetDrawState(pContext);
628
629 memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports);
630
631 if (pMatrices != nullptr)
632 {
633 memcpy(&pState->vpMatrix[0], pMatrices, sizeof(SWR_VIEWPORT_MATRIX) * numViewports);
634 }
635 else
636 {
637 // Compute default viewport transform.
638 for (uint32_t i = 0; i < numViewports; ++i)
639 {
640 if (pContext->driverType == DX)
641 {
642 pState->vpMatrix[i].m00 = pState->vp[i].width / 2.0f;
643 pState->vpMatrix[i].m11 = -pState->vp[i].height / 2.0f;
644 pState->vpMatrix[i].m22 = pState->vp[i].maxZ - pState->vp[i].minZ;
645 pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00;
646 pState->vpMatrix[i].m31 = pState->vp[i].y - pState->vpMatrix[i].m11;
647 pState->vpMatrix[i].m32 = pState->vp[i].minZ;
648 }
649 else
650 {
651 // Standard, with the exception that Y is inverted.
652 pState->vpMatrix[i].m00 = (pState->vp[i].width - pState->vp[i].x) / 2.0f;
653 pState->vpMatrix[i].m11 = (pState->vp[i].y - pState->vp[i].height) / 2.0f;
654 pState->vpMatrix[i].m22 = (pState->vp[i].maxZ - pState->vp[i].minZ) / 2.0f;
655 pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00;
656 pState->vpMatrix[i].m31 = pState->vp[i].height + pState->vpMatrix[i].m11;
657 pState->vpMatrix[i].m32 = pState->vp[i].minZ + pState->vpMatrix[i].m22;
658
659 // Now that the matrix is calculated, clip the view coords to screen size.
660 // OpenGL allows for -ve x,y in the viewport.
661 pState->vp[i].x = std::max(pState->vp[i].x, 0.0f);
662 pState->vp[i].y = std::max(pState->vp[i].y, 0.0f);
663 }
664 }
665 }
666
667 updateGuardband(pState);
668 }
669
670 void SwrSetScissorRects(
671 HANDLE hContext,
672 uint32_t numScissors,
673 const BBOX* pScissors)
674 {
675 SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS,
676 "Invalid number of scissor rects.");
677
678 API_STATE* pState = GetDrawState(GetContext(hContext));
679 memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(BBOX));
680 };
681
682 void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
683 {
684 API_STATE *pState = &pDC->pState->state;
685 uint32_t left, right, top, bottom;
686
687 // Set up scissor dimensions based on scissor or viewport
688 if (pState->rastState.scissorEnable)
689 {
690 // scissor rect right/bottom edge are exclusive, core expects scissor dimensions to be inclusive, so subtract one pixel from right/bottom edges
691 left = pState->scissorRects[0].left;
692 right = pState->scissorRects[0].right;
693 top = pState->scissorRects[0].top;
694 bottom = pState->scissorRects[0].bottom;
695 }
696 else
697 {
698 left = (int32_t)pState->vp[0].x;
699 right = (int32_t)pState->vp[0].x + (int32_t)pState->vp[0].width;
700 top = (int32_t)pState->vp[0].y;
701 bottom = (int32_t)pState->vp[0].y + (int32_t)pState->vp[0].height;
702 }
703
704 right = std::min<uint32_t>(right, KNOB_MAX_SCISSOR_X);
705 bottom = std::min<uint32_t>(bottom, KNOB_MAX_SCISSOR_Y);
706
707 if (left > KNOB_MAX_SCISSOR_X || top > KNOB_MAX_SCISSOR_Y)
708 {
709 pState->scissorInFixedPoint.left = 0;
710 pState->scissorInFixedPoint.right = 0;
711 pState->scissorInFixedPoint.top = 0;
712 pState->scissorInFixedPoint.bottom = 0;
713 }
714 else
715 {
716 pState->scissorInFixedPoint.left = left * FIXED_POINT_SCALE;
717 pState->scissorInFixedPoint.right = right * FIXED_POINT_SCALE - 1;
718 pState->scissorInFixedPoint.top = top * FIXED_POINT_SCALE;
719 pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1;
720 }
721 }
722 // templated backend function tables
723 extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
724 extern PFN_BACKEND_FUNC gBackendSingleSample[2][2];
725 extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2];
726 extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2];
727 extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS + 1][SWR_MULTISAMPLE_TYPE_MAX];
728 extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2];
729 extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2];
730 extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2];
731 void SetupPipeline(DRAW_CONTEXT *pDC)
732 {
733 DRAW_STATE* pState = pDC->pState;
734 const SWR_RASTSTATE &rastState = pState->state.rastState;
735 const SWR_PS_STATE &psState = pState->state.psState;
736 BACKEND_FUNCS& backendFuncs = pState->backendFuncs;
737 const uint32_t forcedSampleCount = (rastState.bForcedSampleCount) ? 1 : 0;
738
739 // setup backend
740 if (psState.pfnPixelShader == nullptr)
741 {
742 backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount];
743 // always need to generate I & J per sample for Z interpolation
744 backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[1];
745 }
746 else
747 {
748 const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.bForcedSampleCount) ? 1 : 0;
749 const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
750
751 // currently only support 'normal' input coverage
752 SWR_ASSERT(psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL ||
753 psState.inputCoverage == SWR_INPUT_COVERAGE_NONE);
754
755 SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask;
756
757 // select backend function
758 switch(psState.shadingRate)
759 {
760 case SWR_SHADING_RATE_PIXEL:
761 if(bMultisampleEnable)
762 {
763 // always need to generate I & J per sample for Z interpolation
764 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
765 backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount];
766 backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount];
767 }
768 else
769 {
770 // always need to generate I & J per pixel for Z interpolation
771 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK);
772 backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid];
773 backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][SWR_MULTISAMPLE_1X];
774 }
775 break;
776 case SWR_SHADING_RATE_SAMPLE:
777 SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN);
778 // always need to generate I & J per sample for Z interpolation
779 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
780 backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid];
781 backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount];
782 break;
783 default:
784 SWR_ASSERT(0 && "Invalid shading rate");
785 break;
786 }
787
788 // setup pointer to function that generates necessary barycentrics required by the PS
789 bool bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_PER_PIXEL_MASK) > 0 ? 1 : 0;
790 backendFuncs.pfnCalcPixelBarycentrics = gPixelBarycentricTable[bBarycentrics];
791
792 bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_PER_SAMPLE_MASK) > 0 ? 1 : 0;
793 backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[bBarycentrics];
794
795 bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0 ? 1 : 0;
796 backendFuncs.pfnCalcCentroidBarycentrics = gCentroidBarycentricTable[rastState.sampleCount][bBarycentrics][rastState.samplePattern][forcedSampleCount];
797 }
798
799 PFN_PROCESS_PRIMS pfnBinner;
800 switch (pState->state.topology)
801 {
802 case TOP_POINT_LIST:
803 pState->pfnProcessPrims = ClipPoints;
804 pfnBinner = BinPoints;
805 break;
806 case TOP_LINE_LIST:
807 case TOP_LINE_STRIP:
808 case TOP_LINE_LOOP:
809 case TOP_LINE_LIST_ADJ:
810 case TOP_LISTSTRIP_ADJ:
811 pState->pfnProcessPrims = ClipLines;
812 pfnBinner = BinLines;
813 break;
814 default:
815 pState->pfnProcessPrims = ClipTriangles;
816 pfnBinner = BinTriangles;
817 break;
818 };
819
820 // disable clipper if viewport transform is disabled
821 if (pState->state.frontendState.vpTransformDisable)
822 {
823 pState->pfnProcessPrims = pfnBinner;
824 }
825
826 if ((pState->state.psState.pfnPixelShader == nullptr) &&
827 (pState->state.depthStencilState.depthTestEnable == FALSE) &&
828 (pState->state.depthStencilState.depthWriteEnable == FALSE) &&
829 (pState->state.depthStencilState.stencilTestEnable == FALSE) &&
830 (pState->state.depthStencilState.stencilWriteEnable == FALSE) &&
831 (pState->state.linkageCount == 0))
832 {
833 pState->pfnProcessPrims = nullptr;
834 pState->state.linkageMask = 0;
835 }
836
837 if (pState->state.soState.rasterizerDisable == true)
838 {
839 pState->pfnProcessPrims = nullptr;
840 pState->state.linkageMask = 0;
841 }
842
843 // set up the frontend attrib mask
844 pState->state.feAttribMask = pState->state.linkageMask;
845 if (pState->state.soState.soEnable)
846 {
847 for (uint32_t i = 0; i < 4; ++i)
848 {
849 pState->state.feAttribMask |= pState->state.soState.streamMasks[i];
850 }
851 }
852
853 // complicated logic to test for cases where we don't need backing hottile memory for a draw
854 // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled.
855 pState->state.depthHottileEnable = ((!(pState->state.depthStencilState.depthTestEnable &&
856 !pState->state.depthStencilState.depthWriteEnable &&
857 pState->state.depthStencilState.depthTestFunc == ZFUNC_ALWAYS)) &&
858 (pState->state.depthStencilState.depthTestEnable ||
859 pState->state.depthStencilState.depthWriteEnable)) ? true : false;
860
861 pState->state.stencilHottileEnable = (((!(pState->state.depthStencilState.stencilTestEnable &&
862 !pState->state.depthStencilState.stencilWriteEnable &&
863 pState->state.depthStencilState.stencilTestFunc == ZFUNC_ALWAYS)) ||
864 // for stencil we have to check the double sided state as well
865 (!(pState->state.depthStencilState.doubleSidedStencilTestEnable &&
866 !pState->state.depthStencilState.stencilWriteEnable &&
867 pState->state.depthStencilState.backfaceStencilTestFunc == ZFUNC_ALWAYS))) &&
868 (pState->state.depthStencilState.stencilTestEnable ||
869 pState->state.depthStencilState.stencilWriteEnable)) ? true : false;
870
871 uint32_t numRTs = pState->state.psState.numRenderTargets;
872 pState->state.colorHottileEnable = 0;
873 if (psState.pfnPixelShader != nullptr)
874 {
875 for (uint32_t rt = 0; rt < numRTs; ++rt)
876 {
877 pState->state.colorHottileEnable |=
878 (!pState->state.blendState.renderTarget[rt].writeDisableAlpha ||
879 !pState->state.blendState.renderTarget[rt].writeDisableRed ||
880 !pState->state.blendState.renderTarget[rt].writeDisableGreen ||
881 !pState->state.blendState.renderTarget[rt].writeDisableBlue) ? (1 << rt) : 0;
882 }
883 }
884 }
885
886 //////////////////////////////////////////////////////////////////////////
887 /// @brief InitDraw
888 /// @param pDC - Draw context to initialize for this draw.
889 void InitDraw(
890 DRAW_CONTEXT *pDC,
891 bool isSplitDraw)
892 {
893 // We don't need to re-setup the scissors/pipeline state again for split draw.
894 if (isSplitDraw == false)
895 {
896 SetupMacroTileScissors(pDC);
897 SetupPipeline(pDC);
898 }
899 }
900
901 //////////////////////////////////////////////////////////////////////////
902 /// @brief We can split the draw for certain topologies for better performance.
903 /// @param totalVerts - Total vertices for draw
904 /// @param topology - Topology used for draw
905 uint32_t MaxVertsPerDraw(
906 DRAW_CONTEXT* pDC,
907 uint32_t totalVerts,
908 PRIMITIVE_TOPOLOGY topology)
909 {
910 API_STATE& state = pDC->pState->state;
911
912 uint32_t vertsPerDraw = totalVerts;
913
914 if (state.soState.soEnable)
915 {
916 return totalVerts;
917 }
918
919 switch (topology)
920 {
921 case TOP_POINT_LIST:
922 case TOP_TRIANGLE_LIST:
923 vertsPerDraw = KNOB_MAX_PRIMS_PER_DRAW;
924 break;
925
926 case TOP_PATCHLIST_1:
927 case TOP_PATCHLIST_2:
928 case TOP_PATCHLIST_3:
929 case TOP_PATCHLIST_4:
930 case TOP_PATCHLIST_5:
931 case TOP_PATCHLIST_6:
932 case TOP_PATCHLIST_7:
933 case TOP_PATCHLIST_8:
934 case TOP_PATCHLIST_9:
935 case TOP_PATCHLIST_10:
936 case TOP_PATCHLIST_11:
937 case TOP_PATCHLIST_12:
938 case TOP_PATCHLIST_13:
939 case TOP_PATCHLIST_14:
940 case TOP_PATCHLIST_15:
941 case TOP_PATCHLIST_16:
942 case TOP_PATCHLIST_17:
943 case TOP_PATCHLIST_18:
944 case TOP_PATCHLIST_19:
945 case TOP_PATCHLIST_20:
946 case TOP_PATCHLIST_21:
947 case TOP_PATCHLIST_22:
948 case TOP_PATCHLIST_23:
949 case TOP_PATCHLIST_24:
950 case TOP_PATCHLIST_25:
951 case TOP_PATCHLIST_26:
952 case TOP_PATCHLIST_27:
953 case TOP_PATCHLIST_28:
954 case TOP_PATCHLIST_29:
955 case TOP_PATCHLIST_30:
956 case TOP_PATCHLIST_31:
957 case TOP_PATCHLIST_32:
958 if (pDC->pState->state.tsState.tsEnable)
959 {
960 uint32_t vertsPerPrim = topology - TOP_PATCHLIST_BASE;
961 vertsPerDraw = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW;
962 }
963 break;
964
965 // The Primitive Assembly code can only handle 1 RECT at a time.
966 case TOP_RECT_LIST:
967 vertsPerDraw = 3;
968 break;
969
970 default:
971 // We are not splitting up draws for other topologies.
972 break;
973 }
974
975 return vertsPerDraw;
976 }
977
978 // Recursive template used to auto-nest conditionals. Converts dynamic boolean function
979 // arguments to static template arguments.
980 template <bool... ArgsB>
981 struct FEDrawChooser
982 {
983 // Last Arg Terminator
984 static PFN_FE_WORK_FUNC GetFunc(bool bArg)
985 {
986 if (bArg)
987 {
988 return ProcessDraw<ArgsB..., true>;
989 }
990
991 return ProcessDraw<ArgsB..., false>;
992 }
993
994 // Recursively parse args
995 template <typename... TArgsT>
996 static PFN_FE_WORK_FUNC GetFunc(bool bArg, TArgsT... remainingArgs)
997 {
998 if (bArg)
999 {
1000 return FEDrawChooser<ArgsB..., true>::GetFunc(remainingArgs...);
1001 }
1002
1003 return FEDrawChooser<ArgsB..., false>::GetFunc(remainingArgs...);
1004 }
1005 };
1006
1007 // Selector for correct templated Draw front-end function
1008 INLINE
1009 static PFN_FE_WORK_FUNC GetFEDrawFunc(bool IsIndexed, bool HasTessellation, bool HasGeometryShader, bool HasStreamOut, bool RasterizerEnabled)
1010 {
1011 return FEDrawChooser<>::GetFunc(IsIndexed, HasTessellation, HasGeometryShader, HasStreamOut, RasterizerEnabled);
1012 }
1013
1014
1015 //////////////////////////////////////////////////////////////////////////
1016 /// @brief DrawInstanced
1017 /// @param hContext - Handle passed back from SwrCreateContext
1018 /// @param topology - Specifies topology for draw.
1019 /// @param numVerts - How many vertices to read sequentially from vertex data (per instance).
1020 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1021 /// @param numInstances - How many instances to render.
1022 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1023 void DrawInstanced(
1024 HANDLE hContext,
1025 PRIMITIVE_TOPOLOGY topology,
1026 uint32_t numVertices,
1027 uint32_t startVertex,
1028 uint32_t numInstances = 1,
1029 uint32_t startInstance = 0)
1030 {
1031 if (KNOB_TOSS_DRAW)
1032 {
1033 return;
1034 }
1035
1036 RDTSC_START(APIDraw);
1037
1038 SWR_CONTEXT *pContext = GetContext(hContext);
1039 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1040
1041 int32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
1042 uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw);
1043 int32_t remainingVerts = numVertices;
1044
1045 API_STATE *pState = &pDC->pState->state;
1046 pState->topology = topology;
1047 pState->forceFront = false;
1048
1049 // disable culling for points/lines
1050 uint32_t oldCullMode = pState->rastState.cullMode;
1051 if (topology == TOP_POINT_LIST)
1052 {
1053 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1054 pState->forceFront = true;
1055 }
1056
1057 int draw = 0;
1058 while (remainingVerts)
1059 {
1060 uint32_t numVertsForDraw = (remainingVerts < maxVertsPerDraw) ?
1061 remainingVerts : maxVertsPerDraw;
1062
1063 bool isSplitDraw = (draw > 0) ? true : false;
1064 DRAW_CONTEXT* pDC = GetDrawContext(pContext, isSplitDraw);
1065 InitDraw(pDC, isSplitDraw);
1066
1067 pDC->FeWork.type = DRAW;
1068 pDC->FeWork.pfnWork = GetFEDrawFunc(
1069 false, // IsIndexed
1070 pState->tsState.tsEnable,
1071 pState->gsState.gsEnable,
1072 pState->soState.soEnable,
1073 pDC->pState->pfnProcessPrims != nullptr);
1074 pDC->FeWork.desc.draw.numVerts = numVertsForDraw;
1075 pDC->FeWork.desc.draw.startVertex = startVertex;
1076 pDC->FeWork.desc.draw.numInstances = numInstances;
1077 pDC->FeWork.desc.draw.startInstance = startInstance;
1078 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
1079 pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw;
1080
1081 pDC->cleanupState = (remainingVerts == numVertsForDraw);
1082
1083 //enqueue DC
1084 QueueDraw(pContext);
1085
1086 remainingVerts -= numVertsForDraw;
1087 draw++;
1088 }
1089
1090 // restore culling state
1091 pDC = GetDrawContext(pContext);
1092 pDC->pState->state.rastState.cullMode = oldCullMode;
1093
1094 RDTSC_STOP(APIDraw, numVertices * numInstances, 0);
1095 }
1096
1097 //////////////////////////////////////////////////////////////////////////
1098 /// @brief SwrDraw
1099 /// @param hContext - Handle passed back from SwrCreateContext
1100 /// @param topology - Specifies topology for draw.
1101 /// @param startVertex - Specifies start vertex in vertex buffer for draw.
1102 /// @param primCount - Number of vertices.
1103 void SwrDraw(
1104 HANDLE hContext,
1105 PRIMITIVE_TOPOLOGY topology,
1106 uint32_t startVertex,
1107 uint32_t numVertices)
1108 {
1109 DrawInstanced(hContext, topology, numVertices, startVertex);
1110 }
1111
1112 //////////////////////////////////////////////////////////////////////////
1113 /// @brief SwrDrawInstanced
1114 /// @param hContext - Handle passed back from SwrCreateContext
1115 /// @param topology - Specifies topology for draw.
1116 /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
1117 /// @param numInstances - How many instances to render.
1118 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1119 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1120 void SwrDrawInstanced(
1121 HANDLE hContext,
1122 PRIMITIVE_TOPOLOGY topology,
1123 uint32_t numVertsPerInstance,
1124 uint32_t numInstances,
1125 uint32_t startVertex,
1126 uint32_t startInstance
1127 )
1128 {
1129 DrawInstanced(hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance);
1130 }
1131
1132 //////////////////////////////////////////////////////////////////////////
1133 /// @brief DrawIndexedInstanced
1134 /// @param hContext - Handle passed back from SwrCreateContext
1135 /// @param topology - Specifies topology for draw.
1136 /// @param numIndices - Number of indices to read sequentially from index buffer.
1137 /// @param indexOffset - Starting index into index buffer.
1138 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1139 /// @param numInstances - Number of instances to render.
1140 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1141 void DrawIndexedInstance(
1142 HANDLE hContext,
1143 PRIMITIVE_TOPOLOGY topology,
1144 uint32_t numIndices,
1145 uint32_t indexOffset,
1146 int32_t baseVertex,
1147 uint32_t numInstances = 1,
1148 uint32_t startInstance = 0)
1149 {
1150 if (KNOB_TOSS_DRAW)
1151 {
1152 return;
1153 }
1154
1155 RDTSC_START(APIDrawIndexed);
1156
1157 SWR_CONTEXT *pContext = GetContext(hContext);
1158 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1159 API_STATE* pState = &pDC->pState->state;
1160
1161 int32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
1162 uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw);
1163 int32_t remainingIndices = numIndices;
1164
1165 uint32_t indexSize = 0;
1166 switch (pState->indexBuffer.format)
1167 {
1168 case R32_UINT: indexSize = sizeof(uint32_t); break;
1169 case R16_UINT: indexSize = sizeof(uint16_t); break;
1170 case R8_UINT: indexSize = sizeof(uint8_t); break;
1171 default:
1172 SWR_ASSERT(0);
1173 }
1174
1175 int draw = 0;
1176 uint8_t *pIB = (uint8_t*)pState->indexBuffer.pIndices;
1177 pIB += (uint64_t)indexOffset * (uint64_t)indexSize;
1178
1179 pState->topology = topology;
1180 pState->forceFront = false;
1181
1182 // disable culling for points/lines
1183 uint32_t oldCullMode = pState->rastState.cullMode;
1184 if (topology == TOP_POINT_LIST)
1185 {
1186 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1187 pState->forceFront = true;
1188 }
1189
1190 while (remainingIndices)
1191 {
1192 uint32_t numIndicesForDraw = (remainingIndices < maxIndicesPerDraw) ?
1193 remainingIndices : maxIndicesPerDraw;
1194
1195 // When breaking up draw, we need to obtain new draw context for each iteration.
1196 bool isSplitDraw = (draw > 0) ? true : false;
1197 pDC = GetDrawContext(pContext, isSplitDraw);
1198 InitDraw(pDC, isSplitDraw);
1199
1200 pDC->FeWork.type = DRAW;
1201 pDC->FeWork.pfnWork = GetFEDrawFunc(
1202 true, // IsIndexed
1203 pState->tsState.tsEnable,
1204 pState->gsState.gsEnable,
1205 pState->soState.soEnable,
1206 pDC->pState->pfnProcessPrims != nullptr);
1207 pDC->FeWork.desc.draw.pDC = pDC;
1208 pDC->FeWork.desc.draw.numIndices = numIndicesForDraw;
1209 pDC->FeWork.desc.draw.pIB = (int*)pIB;
1210 pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format;
1211
1212 pDC->FeWork.desc.draw.numInstances = numInstances;
1213 pDC->FeWork.desc.draw.startInstance = startInstance;
1214 pDC->FeWork.desc.draw.baseVertex = baseVertex;
1215 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
1216
1217 pDC->cleanupState = (remainingIndices == numIndicesForDraw);
1218
1219 //enqueue DC
1220 QueueDraw(pContext);
1221
1222 pIB += maxIndicesPerDraw * indexSize;
1223 remainingIndices -= numIndicesForDraw;
1224 draw++;
1225 }
1226
1227 // restore culling state
1228 pDC = GetDrawContext(pContext);
1229 pDC->pState->state.rastState.cullMode = oldCullMode;
1230
1231 RDTSC_STOP(APIDrawIndexed, numIndices * numInstances, 0);
1232 }
1233
1234
1235 //////////////////////////////////////////////////////////////////////////
1236 /// @brief DrawIndexed
1237 /// @param hContext - Handle passed back from SwrCreateContext
1238 /// @param topology - Specifies topology for draw.
1239 /// @param numIndices - Number of indices to read sequentially from index buffer.
1240 /// @param indexOffset - Starting index into index buffer.
1241 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1242 void SwrDrawIndexed(
1243 HANDLE hContext,
1244 PRIMITIVE_TOPOLOGY topology,
1245 uint32_t numIndices,
1246 uint32_t indexOffset,
1247 int32_t baseVertex
1248 )
1249 {
1250 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex);
1251 }
1252
1253 //////////////////////////////////////////////////////////////////////////
1254 /// @brief SwrDrawIndexedInstanced
1255 /// @param hContext - Handle passed back from SwrCreateContext
1256 /// @param topology - Specifies topology for draw.
1257 /// @param numIndices - Number of indices to read sequentially from index buffer.
1258 /// @param numInstances - Number of instances to render.
1259 /// @param indexOffset - Starting index into index buffer.
1260 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1261 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1262 void SwrDrawIndexedInstanced(
1263 HANDLE hContext,
1264 PRIMITIVE_TOPOLOGY topology,
1265 uint32_t numIndices,
1266 uint32_t numInstances,
1267 uint32_t indexOffset,
1268 int32_t baseVertex,
1269 uint32_t startInstance)
1270 {
1271 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance);
1272 }
1273
1274 //////////////////////////////////////////////////////////////////////////
1275 /// @brief SwrInvalidateTiles
1276 /// @param hContext - Handle passed back from SwrCreateContext
1277 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate.
1278 void SwrInvalidateTiles(
1279 HANDLE hContext,
1280 uint32_t attachmentMask)
1281 {
1282 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1283 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1284
1285 pDC->FeWork.type = DISCARDINVALIDATETILES;
1286 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
1287 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
1288 memset(&pDC->FeWork.desc.discardInvalidateTiles.rect, 0, sizeof(SWR_RECT));
1289 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_INVALID;
1290 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false;
1291 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = false;
1292
1293 //enqueue
1294 QueueDraw(pContext);
1295 }
1296
1297 //////////////////////////////////////////////////////////////////////////
1298 /// @brief SwrDiscardRect
1299 /// @param hContext - Handle passed back from SwrCreateContext
1300 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
1301 /// @param rect - if rect is all zeros, the entire attachment surface will be discarded
1302 void SwrDiscardRect(
1303 HANDLE hContext,
1304 uint32_t attachmentMask,
1305 SWR_RECT rect)
1306 {
1307 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1308 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1309
1310 // Queue a load to the hottile
1311 pDC->FeWork.type = DISCARDINVALIDATETILES;
1312 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
1313 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
1314 pDC->FeWork.desc.discardInvalidateTiles.rect = rect;
1315 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_RESOLVED;
1316 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true;
1317 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = true;
1318
1319 //enqueue
1320 QueueDraw(pContext);
1321 }
1322
1323 //////////////////////////////////////////////////////////////////////////
1324 /// @brief SwrDispatch
1325 /// @param hContext - Handle passed back from SwrCreateContext
1326 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
1327 /// @param threadGroupCountY - Number of thread groups dispatched in Y direction
1328 /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
1329 void SwrDispatch(
1330 HANDLE hContext,
1331 uint32_t threadGroupCountX,
1332 uint32_t threadGroupCountY,
1333 uint32_t threadGroupCountZ)
1334 {
1335 if (KNOB_TOSS_DRAW)
1336 {
1337 return;
1338 }
1339
1340 RDTSC_START(APIDispatch);
1341 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1342 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1343
1344 pDC->isCompute = true; // This is a compute context.
1345
1346 // Ensure spill fill pointers are initialized to nullptr.
1347 memset(pDC->pSpillFill, 0, sizeof(pDC->pSpillFill));
1348
1349 COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64);
1350
1351 pTaskData->threadGroupCountX = threadGroupCountX;
1352 pTaskData->threadGroupCountY = threadGroupCountY;
1353 pTaskData->threadGroupCountZ = threadGroupCountZ;
1354
1355 uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
1356 pDC->pDispatch->initialize(totalThreadGroups, pTaskData);
1357
1358 QueueDispatch(pContext);
1359 RDTSC_STOP(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ, 0);
1360 }
1361
1362 // Deswizzles, converts and stores current contents of the hot tiles to surface
1363 // described by pState
1364 void SwrStoreTiles(
1365 HANDLE hContext,
1366 SWR_RENDERTARGET_ATTACHMENT attachment,
1367 SWR_TILE_STATE postStoreTileState)
1368 {
1369 RDTSC_START(APIStoreTiles);
1370
1371 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1372 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1373
1374 SetupMacroTileScissors(pDC);
1375
1376 pDC->FeWork.type = STORETILES;
1377 pDC->FeWork.pfnWork = ProcessStoreTiles;
1378 pDC->FeWork.desc.storeTiles.attachment = attachment;
1379 pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState;
1380
1381 //enqueue
1382 QueueDraw(pContext);
1383
1384 RDTSC_STOP(APIStoreTiles, 0, 0);
1385 }
1386
1387 void SwrClearRenderTarget(
1388 HANDLE hContext,
1389 uint32_t clearMask,
1390 const float clearColor[4],
1391 float z,
1392 uint8_t stencil)
1393 {
1394 RDTSC_START(APIClearRenderTarget);
1395
1396 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1397
1398 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1399
1400 SetupMacroTileScissors(pDC);
1401
1402 CLEAR_FLAGS flags;
1403 flags.mask = clearMask;
1404
1405 pDC->FeWork.type = CLEAR;
1406 pDC->FeWork.pfnWork = ProcessClear;
1407 pDC->FeWork.desc.clear.flags = flags;
1408 pDC->FeWork.desc.clear.clearDepth = z;
1409 pDC->FeWork.desc.clear.clearRTColor[0] = clearColor[0];
1410 pDC->FeWork.desc.clear.clearRTColor[1] = clearColor[1];
1411 pDC->FeWork.desc.clear.clearRTColor[2] = clearColor[2];
1412 pDC->FeWork.desc.clear.clearRTColor[3] = clearColor[3];
1413 pDC->FeWork.desc.clear.clearStencil = stencil;
1414
1415 // enqueue draw
1416 QueueDraw(pContext);
1417
1418 RDTSC_STOP(APIClearRenderTarget, 0, pDC->drawId);
1419 }
1420
1421 //////////////////////////////////////////////////////////////////////////
1422 /// @brief Returns a pointer to the private context state for the current
1423 /// draw operation. This is used for external componets such as the
1424 /// sampler.
1425 /// SWR is responsible for the allocation of the private context state.
1426 /// @param hContext - Handle passed back from SwrCreateContext
1427 VOID* SwrGetPrivateContextState(
1428 HANDLE hContext)
1429 {
1430 SWR_CONTEXT* pContext = GetContext(hContext);
1431 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1432 DRAW_STATE* pState = pDC->pState;
1433
1434 if (pState->pPrivateState == nullptr)
1435 {
1436 pState->pPrivateState = pState->pArena->AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float));
1437 }
1438
1439 return pState->pPrivateState;
1440 }
1441
1442 //////////////////////////////////////////////////////////////////////////
1443 /// @brief Clients can use this to allocate memory for draw/dispatch
1444 /// operations. The memory will automatically be freed once operation
1445 /// has completed. Client can use this to allocate binding tables,
1446 /// etc. needed for shader execution.
1447 /// @param hContext - Handle passed back from SwrCreateContext
1448 /// @param size - Size of allocation
1449 /// @param align - Alignment needed for allocation.
1450 VOID* SwrAllocDrawContextMemory(
1451 HANDLE hContext,
1452 uint32_t size,
1453 uint32_t align)
1454 {
1455 SWR_CONTEXT* pContext = GetContext(hContext);
1456 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1457
1458 return pDC->pState->pArena->AllocAligned(size, align);
1459 }
1460
1461 //////////////////////////////////////////////////////////////////////////
1462 /// @brief Returns pointer to SWR stats.
1463 /// @note The counters are atomically incremented by multiple threads.
1464 /// When calling this, you need to ensure all previous operations
1465 /// have completed.
1466 /// @todo If necessary, add a callback to avoid stalling the pipe to
1467 /// sample the counters.
1468 /// @param hContext - Handle passed back from SwrCreateContext
1469 /// @param pStats - SWR will fill this out for caller.
1470 void SwrGetStats(
1471 HANDLE hContext,
1472 SWR_STATS* pStats)
1473 {
1474 SWR_CONTEXT *pContext = GetContext(hContext);
1475 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1476
1477 pDC->FeWork.type = QUERYSTATS;
1478 pDC->FeWork.pfnWork = ProcessQueryStats;
1479 pDC->FeWork.desc.queryStats.pStats = pStats;
1480
1481 // cannot execute until all previous draws have completed
1482 pDC->dependency = pDC->drawId - 1;
1483
1484 //enqueue
1485 QueueDraw(pContext);
1486 }
1487
1488 //////////////////////////////////////////////////////////////////////////
1489 /// @brief Enables stats counting
1490 /// @param hContext - Handle passed back from SwrCreateContext
1491 /// @param enable - If true then counts are incremented.
1492 void SwrEnableStats(
1493 HANDLE hContext,
1494 bool enable)
1495 {
1496 SWR_CONTEXT *pContext = GetContext(hContext);
1497 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1498
1499 pDC->pState->state.enableStats = enable;
1500 }
1501
1502 //////////////////////////////////////////////////////////////////////////
1503 /// @brief Mark end of frame - used for performance profiling
1504 /// @param hContext - Handle passed back from SwrCreateContext
1505 void SWR_API SwrEndFrame(
1506 HANDLE hContext)
1507 {
1508 RDTSC_ENDFRAME();
1509 }