swr: [rasterizer core] Affinitize thread scratch space to numa node of worker
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / api.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file api.cpp
24 *
25 * @brief API implementation
26 *
27 ******************************************************************************/
28
29 #include <cfloat>
30 #include <cmath>
31 #include <cstdio>
32
33 #include "core/api.h"
34 #include "core/backend.h"
35 #include "core/context.h"
36 #include "core/frontend.h"
37 #include "core/rasterizer.h"
38 #include "core/rdtsc_core.h"
39 #include "core/threads.h"
40 #include "core/tilemgr.h"
41 #include "core/clip.h"
42
43 #include "common/simdintrin.h"
44 #include "common/os.h"
45
46 void SetupDefaultState(SWR_CONTEXT *pContext);
47
48 //////////////////////////////////////////////////////////////////////////
49 /// @brief Create SWR Context.
50 /// @param pCreateInfo - pointer to creation info.
51 HANDLE SwrCreateContext(
52 SWR_CREATECONTEXT_INFO* pCreateInfo)
53 {
54 RDTSC_RESET();
55 RDTSC_INIT(0);
56
57 void* pContextMem = _aligned_malloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4);
58 memset(pContextMem, 0, sizeof(SWR_CONTEXT));
59 SWR_CONTEXT *pContext = new (pContextMem) SWR_CONTEXT();
60
61 pContext->driverType = pCreateInfo->driver;
62 pContext->privateStateSize = pCreateInfo->privateStateSize;
63
64 pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
65 pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
66
67 for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
68 {
69 pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
70 pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena));
71 pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen.
72
73 pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
74 }
75
76 if (!KNOB_SINGLE_THREADED)
77 {
78 memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
79 memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
80 new (&pContext->WaitLock) std::mutex();
81 new (&pContext->FifosNotEmpty) std::condition_variable();
82
83 CreateThreadPool(pContext, &pContext->threadPool);
84 }
85
86 // Calling createThreadPool() above can set SINGLE_THREADED
87 if (KNOB_SINGLE_THREADED)
88 {
89 pContext->NumWorkerThreads = 1;
90 }
91
92 // Allocate scratch space for workers.
93 ///@note We could lazily allocate this but its rather small amount of memory.
94 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
95 {
96 #if defined(_WIN32)
97 uint32_t numaNode = pContext->threadPool.pThreadData ?
98 pContext->threadPool.pThreadData[i].numaId : 0;
99 pContext->pScratch[i] = (uint8_t*)VirtualAllocExNuma(
100 GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE),
101 MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE,
102 numaNode);
103 #else
104 pContext->pScratch[i] = (uint8_t*)_aligned_malloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4);
105 #endif
106 }
107
108 // State setup AFTER context is fully initialized
109 SetupDefaultState(pContext);
110
111 // initialize hot tile manager
112 pContext->pHotTileMgr = new HotTileMgr();
113
114 // initialize function pointer tables
115 InitClearTilesTable();
116
117 // initialize store tiles function
118 pContext->pfnLoadTile = pCreateInfo->pfnLoadTile;
119 pContext->pfnStoreTile = pCreateInfo->pfnStoreTile;
120 pContext->pfnClearTile = pCreateInfo->pfnClearTile;
121
122 // pass pointer to bucket manager back to caller
123 #ifdef KNOB_ENABLE_RDTSC
124 pCreateInfo->pBucketMgr = &gBucketMgr;
125 #endif
126
127 pCreateInfo->contextSaveSize = sizeof(API_STATE);
128
129 return (HANDLE)pContext;
130 }
131
132 void SwrDestroyContext(HANDLE hContext)
133 {
134 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
135 DestroyThreadPool(pContext, &pContext->threadPool);
136
137 // free the fifos
138 for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i)
139 {
140 delete pContext->dcRing[i].pArena;
141 delete pContext->dsRing[i].pArena;
142 delete(pContext->dcRing[i].pTileMgr);
143 delete(pContext->dcRing[i].pDispatch);
144 }
145
146 // Free scratch space.
147 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
148 {
149 #if defined(_WIN32)
150 VirtualFree(pContext->pScratch[i], 0, MEM_RELEASE);
151 #else
152 _aligned_free(pContext->pScratch[i]);
153 #endif
154 }
155
156 delete(pContext->pHotTileMgr);
157
158 pContext->~SWR_CONTEXT();
159 _aligned_free((SWR_CONTEXT*)hContext);
160 }
161
162 void CopyState(DRAW_STATE& dst, const DRAW_STATE& src)
163 {
164 memcpy(&dst.state, &src.state, sizeof(API_STATE));
165 }
166
167 void WakeAllThreads(SWR_CONTEXT *pContext)
168 {
169 pContext->FifosNotEmpty.notify_all();
170 }
171
172 template<bool IsDraw>
173 void QueueWork(SWR_CONTEXT *pContext)
174 {
175 // Each worker thread looks at a DC for both FE and BE work at different times and so we
176 // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers
177 // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
178 // then moved on if all work is done.)
179 pContext->pCurDrawContext->threadsDone =
180 pContext->NumWorkerThreads ? pContext->NumWorkerThreads * 2 : 2;
181
182 _ReadWriteBarrier();
183 {
184 std::unique_lock<std::mutex> lock(pContext->WaitLock);
185 pContext->dcRing.Enqueue();
186 }
187
188 if (KNOB_SINGLE_THREADED)
189 {
190 // flush denormals to 0
191 uint32_t mxcsr = _mm_getcsr();
192 _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
193
194 if (IsDraw)
195 {
196 static TileSet lockedTiles;
197 uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
198 WorkOnFifoFE(pContext, 0, curDraw[0], 0);
199 WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles, 0, 0);
200 }
201 else
202 {
203 uint64_t curDispatch = pContext->pCurDrawContext->drawId;
204 WorkOnCompute(pContext, 0, curDispatch);
205 }
206
207 // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
208 while (CompleteDrawContext(pContext, pContext->pCurDrawContext) > 0) {}
209
210 // restore csr
211 _mm_setcsr(mxcsr);
212 }
213 else
214 {
215 RDTSC_START(APIDrawWakeAllThreads);
216 WakeAllThreads(pContext);
217 RDTSC_STOP(APIDrawWakeAllThreads, 1, 0);
218 }
219
220 // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
221 pContext->pPrevDrawContext = pContext->pCurDrawContext;
222 pContext->pCurDrawContext = nullptr;
223 }
224
225 INLINE void QueueDraw(SWR_CONTEXT* pContext)
226 {
227 QueueWork<true>(pContext);
228 }
229
230 INLINE void QueueDispatch(SWR_CONTEXT* pContext)
231 {
232 QueueWork<false>(pContext);
233 }
234
235 DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
236 {
237 RDTSC_START(APIGetDrawContext);
238 // If current draw context is null then need to obtain a new draw context to use from ring.
239 if (pContext->pCurDrawContext == nullptr)
240 {
241 // Need to wait for a free entry.
242 while (pContext->dcRing.IsFull())
243 {
244 _mm_pause();
245 }
246
247 uint32_t dcIndex = pContext->dcRing.GetHead() % KNOB_MAX_DRAWS_IN_FLIGHT;
248
249 DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
250 pContext->pCurDrawContext = pCurDrawContext;
251
252 // Assign next available entry in DS ring to this DC.
253 uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
254 pCurDrawContext->pState = &pContext->dsRing[dsIndex];
255
256 // Copy previous state to current state.
257 if (pContext->pPrevDrawContext)
258 {
259 DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext;
260
261 // If we're splitting our draw then we can just use the same state from the previous
262 // draw. In this case, we won't increment the DS ring index so the next non-split
263 // draw can receive the state.
264 if (isSplitDraw == false)
265 {
266 CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState);
267
268 // Should have been cleaned up previously
269 SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
270
271 pCurDrawContext->pState->pPrivateState = nullptr;
272
273 pContext->curStateId++; // Progress state ring index forward.
274 }
275 else
276 {
277 // If its a split draw then just copy the state pointer over
278 // since its the same draw.
279 pCurDrawContext->pState = pPrevDrawContext->pState;
280 SWR_ASSERT(pPrevDrawContext->cleanupState == false);
281 }
282 }
283 else
284 {
285 SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
286 pContext->curStateId++; // Progress state ring index forward.
287 }
288
289 SWR_ASSERT(pCurDrawContext->pArena->IsEmpty() == true);
290
291 pCurDrawContext->dependency = 0;
292 pCurDrawContext->pContext = pContext;
293 pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
294
295 pCurDrawContext->doneFE = false;
296 pCurDrawContext->FeLock = 0;
297 pCurDrawContext->threadsDone = 0;
298
299 pCurDrawContext->pTileMgr->initialize();
300
301 // Assign unique drawId for this DC
302 pCurDrawContext->drawId = pContext->dcRing.GetHead();
303
304 pCurDrawContext->cleanupState = true;
305 }
306 else
307 {
308 SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC");
309 }
310
311 RDTSC_STOP(APIGetDrawContext, 0, 0);
312 return pContext->pCurDrawContext;
313 }
314
315 API_STATE* GetDrawState(SWR_CONTEXT *pContext)
316 {
317 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
318 SWR_ASSERT(pDC->pState != nullptr);
319
320 return &pDC->pState->state;
321 }
322
323 void SWR_API SwrSaveState(
324 HANDLE hContext,
325 void* pOutputStateBlock,
326 size_t memSize)
327 {
328 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
329 auto pSrc = GetDrawState(pContext);
330 SWR_ASSERT(pOutputStateBlock && memSize >= sizeof(*pSrc));
331
332 memcpy(pOutputStateBlock, pSrc, sizeof(*pSrc));
333 }
334
335 void SWR_API SwrRestoreState(
336 HANDLE hContext,
337 const void* pStateBlock,
338 size_t memSize)
339 {
340 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
341 auto pDst = GetDrawState(pContext);
342 SWR_ASSERT(pStateBlock && memSize >= sizeof(*pDst));
343
344 memcpy(pDst, pStateBlock, sizeof(*pDst));
345 }
346
347 void SetupDefaultState(SWR_CONTEXT *pContext)
348 {
349 API_STATE* pState = GetDrawState(pContext);
350
351 pState->rastState.cullMode = SWR_CULLMODE_NONE;
352 pState->rastState.frontWinding = SWR_FRONTWINDING_CCW;
353 }
354
355 static INLINE SWR_CONTEXT* GetContext(HANDLE hContext)
356 {
357 return (SWR_CONTEXT*)hContext;
358 }
359
360 void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint64_t userData2, uint64_t userData3)
361 {
362 RDTSC_START(APISync);
363
364 SWR_ASSERT(pfnFunc != nullptr);
365
366 SWR_CONTEXT *pContext = GetContext(hContext);
367 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
368
369 pDC->FeWork.type = SYNC;
370 pDC->FeWork.pfnWork = ProcessSync;
371 pDC->FeWork.desc.sync.pfnCallbackFunc = pfnFunc;
372 pDC->FeWork.desc.sync.userData = userData;
373 pDC->FeWork.desc.sync.userData2 = userData2;
374 pDC->FeWork.desc.sync.userData3 = userData3;
375
376 // cannot execute until all previous draws have completed
377 pDC->dependency = pDC->drawId - 1;
378
379 //enqueue
380 QueueDraw(pContext);
381
382 RDTSC_STOP(APISync, 1, 0);
383 }
384
385 void SwrWaitForIdle(HANDLE hContext)
386 {
387 SWR_CONTEXT *pContext = GetContext(hContext);
388
389 RDTSC_START(APIWaitForIdle);
390
391 while (!pContext->dcRing.IsEmpty())
392 {
393 _mm_pause();
394 }
395
396 RDTSC_STOP(APIWaitForIdle, 1, 0);
397 }
398
399 void SwrSetVertexBuffers(
400 HANDLE hContext,
401 uint32_t numBuffers,
402 const SWR_VERTEX_BUFFER_STATE* pVertexBuffers)
403 {
404 API_STATE* pState = GetDrawState(GetContext(hContext));
405
406 for (uint32_t i = 0; i < numBuffers; ++i)
407 {
408 const SWR_VERTEX_BUFFER_STATE *pVB = &pVertexBuffers[i];
409 pState->vertexBuffers[pVB->index] = *pVB;
410 }
411 }
412
413 void SwrSetIndexBuffer(
414 HANDLE hContext,
415 const SWR_INDEX_BUFFER_STATE* pIndexBuffer)
416 {
417 API_STATE* pState = GetDrawState(GetContext(hContext));
418
419 pState->indexBuffer = *pIndexBuffer;
420 }
421
422 void SwrSetFetchFunc(
423 HANDLE hContext,
424 PFN_FETCH_FUNC pfnFetchFunc)
425 {
426 API_STATE* pState = GetDrawState(GetContext(hContext));
427
428 pState->pfnFetchFunc = pfnFetchFunc;
429 }
430
431 void SwrSetSoFunc(
432 HANDLE hContext,
433 PFN_SO_FUNC pfnSoFunc,
434 uint32_t streamIndex)
435 {
436 API_STATE* pState = GetDrawState(GetContext(hContext));
437
438 SWR_ASSERT(streamIndex < MAX_SO_STREAMS);
439
440 pState->pfnSoFunc[streamIndex] = pfnSoFunc;
441 }
442
443 void SwrSetSoState(
444 HANDLE hContext,
445 SWR_STREAMOUT_STATE* pSoState)
446 {
447 API_STATE* pState = GetDrawState(GetContext(hContext));
448
449 pState->soState = *pSoState;
450 }
451
452 void SwrSetSoBuffers(
453 HANDLE hContext,
454 SWR_STREAMOUT_BUFFER* pSoBuffer,
455 uint32_t slot)
456 {
457 API_STATE* pState = GetDrawState(GetContext(hContext));
458
459 SWR_ASSERT((slot < 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot);
460
461 pState->soBuffer[slot] = *pSoBuffer;
462 }
463
464 void SwrSetVertexFunc(
465 HANDLE hContext,
466 PFN_VERTEX_FUNC pfnVertexFunc)
467 {
468 API_STATE* pState = GetDrawState(GetContext(hContext));
469
470 pState->pfnVertexFunc = pfnVertexFunc;
471 }
472
473 void SwrSetFrontendState(
474 HANDLE hContext,
475 SWR_FRONTEND_STATE *pFEState)
476 {
477 API_STATE* pState = GetDrawState(GetContext(hContext));
478 pState->frontendState = *pFEState;
479 }
480
481 void SwrSetGsState(
482 HANDLE hContext,
483 SWR_GS_STATE *pGSState)
484 {
485 API_STATE* pState = GetDrawState(GetContext(hContext));
486 pState->gsState = *pGSState;
487 }
488
489 void SwrSetGsFunc(
490 HANDLE hContext,
491 PFN_GS_FUNC pfnGsFunc)
492 {
493 API_STATE* pState = GetDrawState(GetContext(hContext));
494 pState->pfnGsFunc = pfnGsFunc;
495 }
496
497 void SwrSetCsFunc(
498 HANDLE hContext,
499 PFN_CS_FUNC pfnCsFunc,
500 uint32_t totalThreadsInGroup)
501 {
502 API_STATE* pState = GetDrawState(GetContext(hContext));
503 pState->pfnCsFunc = pfnCsFunc;
504 pState->totalThreadsInGroup = totalThreadsInGroup;
505 }
506
507 void SwrSetTsState(
508 HANDLE hContext,
509 SWR_TS_STATE *pState)
510 {
511 API_STATE* pApiState = GetDrawState(GetContext(hContext));
512 pApiState->tsState = *pState;
513 }
514
515 void SwrSetHsFunc(
516 HANDLE hContext,
517 PFN_HS_FUNC pfnFunc)
518 {
519 API_STATE* pApiState = GetDrawState(GetContext(hContext));
520 pApiState->pfnHsFunc = pfnFunc;
521 }
522
523 void SwrSetDsFunc(
524 HANDLE hContext,
525 PFN_DS_FUNC pfnFunc)
526 {
527 API_STATE* pApiState = GetDrawState(GetContext(hContext));
528 pApiState->pfnDsFunc = pfnFunc;
529 }
530
531 void SwrSetDepthStencilState(
532 HANDLE hContext,
533 SWR_DEPTH_STENCIL_STATE *pDSState)
534 {
535 API_STATE* pState = GetDrawState(GetContext(hContext));
536
537 pState->depthStencilState = *pDSState;
538 }
539
540 void SwrSetBackendState(
541 HANDLE hContext,
542 SWR_BACKEND_STATE *pBEState)
543 {
544 API_STATE* pState = GetDrawState(GetContext(hContext));
545
546 pState->backendState = *pBEState;
547 }
548
549 void SwrSetPixelShaderState(
550 HANDLE hContext,
551 SWR_PS_STATE *pPSState)
552 {
553 API_STATE *pState = GetDrawState(GetContext(hContext));
554 pState->psState = *pPSState;
555 }
556
557 void SwrSetBlendState(
558 HANDLE hContext,
559 SWR_BLEND_STATE *pBlendState)
560 {
561 API_STATE *pState = GetDrawState(GetContext(hContext));
562 memcpy(&pState->blendState, pBlendState, sizeof(SWR_BLEND_STATE));
563 }
564
565 void SwrSetBlendFunc(
566 HANDLE hContext,
567 uint32_t renderTarget,
568 PFN_BLEND_JIT_FUNC pfnBlendFunc)
569 {
570 SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS);
571 API_STATE *pState = GetDrawState(GetContext(hContext));
572 pState->pfnBlendFunc[renderTarget] = pfnBlendFunc;
573 }
574
575 void SwrSetLinkage(
576 HANDLE hContext,
577 uint32_t mask,
578 const uint8_t* pMap)
579 {
580 API_STATE* pState = GetDrawState(GetContext(hContext));
581
582 static const uint8_t IDENTITY_MAP[] =
583 {
584 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
585 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
586 };
587 static_assert(sizeof(IDENTITY_MAP) == sizeof(pState->linkageMap),
588 "Update for new value of MAX_ATTRIBUTES");
589
590 pState->linkageMask = mask;
591 pState->linkageCount = _mm_popcnt_u32(mask);
592
593 if (!pMap)
594 {
595 pMap = IDENTITY_MAP;
596 }
597 memcpy(pState->linkageMap, pMap, pState->linkageCount);
598 }
599
600 // update guardband multipliers for the viewport
601 void updateGuardband(API_STATE *pState)
602 {
603 // guardband center is viewport center
604 pState->gbState.left = KNOB_GUARDBAND_WIDTH / pState->vp[0].width;
605 pState->gbState.right = KNOB_GUARDBAND_WIDTH / pState->vp[0].width;
606 pState->gbState.top = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height;
607 pState->gbState.bottom = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height;
608 }
609
610 void SwrSetRastState(
611 HANDLE hContext,
612 const SWR_RASTSTATE *pRastState)
613 {
614 SWR_CONTEXT *pContext = GetContext(hContext);
615 API_STATE* pState = GetDrawState(pContext);
616
617 memcpy(&pState->rastState, pRastState, sizeof(SWR_RASTSTATE));
618 }
619
620 void SwrSetViewports(
621 HANDLE hContext,
622 uint32_t numViewports,
623 const SWR_VIEWPORT* pViewports,
624 const SWR_VIEWPORT_MATRIX* pMatrices)
625 {
626 SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS,
627 "Invalid number of viewports.");
628
629 SWR_CONTEXT *pContext = GetContext(hContext);
630 API_STATE* pState = GetDrawState(pContext);
631
632 memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports);
633
634 if (pMatrices != nullptr)
635 {
636 memcpy(&pState->vpMatrix[0], pMatrices, sizeof(SWR_VIEWPORT_MATRIX) * numViewports);
637 }
638 else
639 {
640 // Compute default viewport transform.
641 for (uint32_t i = 0; i < numViewports; ++i)
642 {
643 if (pContext->driverType == DX)
644 {
645 pState->vpMatrix[i].m00 = pState->vp[i].width / 2.0f;
646 pState->vpMatrix[i].m11 = -pState->vp[i].height / 2.0f;
647 pState->vpMatrix[i].m22 = pState->vp[i].maxZ - pState->vp[i].minZ;
648 pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00;
649 pState->vpMatrix[i].m31 = pState->vp[i].y - pState->vpMatrix[i].m11;
650 pState->vpMatrix[i].m32 = pState->vp[i].minZ;
651 }
652 else
653 {
654 // Standard, with the exception that Y is inverted.
655 pState->vpMatrix[i].m00 = (pState->vp[i].width - pState->vp[i].x) / 2.0f;
656 pState->vpMatrix[i].m11 = (pState->vp[i].y - pState->vp[i].height) / 2.0f;
657 pState->vpMatrix[i].m22 = (pState->vp[i].maxZ - pState->vp[i].minZ) / 2.0f;
658 pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00;
659 pState->vpMatrix[i].m31 = pState->vp[i].height + pState->vpMatrix[i].m11;
660 pState->vpMatrix[i].m32 = pState->vp[i].minZ + pState->vpMatrix[i].m22;
661
662 // Now that the matrix is calculated, clip the view coords to screen size.
663 // OpenGL allows for -ve x,y in the viewport.
664 pState->vp[i].x = std::max(pState->vp[i].x, 0.0f);
665 pState->vp[i].y = std::max(pState->vp[i].y, 0.0f);
666 }
667 }
668 }
669
670 updateGuardband(pState);
671 }
672
673 void SwrSetScissorRects(
674 HANDLE hContext,
675 uint32_t numScissors,
676 const BBOX* pScissors)
677 {
678 SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS,
679 "Invalid number of scissor rects.");
680
681 API_STATE* pState = GetDrawState(GetContext(hContext));
682 memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(BBOX));
683 };
684
685 void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
686 {
687 API_STATE *pState = &pDC->pState->state;
688 uint32_t left, right, top, bottom;
689
690 // Set up scissor dimensions based on scissor or viewport
691 if (pState->rastState.scissorEnable)
692 {
693 // scissor rect right/bottom edge are exclusive, core expects scissor dimensions to be inclusive, so subtract one pixel from right/bottom edges
694 left = pState->scissorRects[0].left;
695 right = pState->scissorRects[0].right;
696 top = pState->scissorRects[0].top;
697 bottom = pState->scissorRects[0].bottom;
698 }
699 else
700 {
701 left = (int32_t)pState->vp[0].x;
702 right = (int32_t)pState->vp[0].x + (int32_t)pState->vp[0].width;
703 top = (int32_t)pState->vp[0].y;
704 bottom = (int32_t)pState->vp[0].y + (int32_t)pState->vp[0].height;
705 }
706
707 right = std::min<uint32_t>(right, KNOB_MAX_SCISSOR_X);
708 bottom = std::min<uint32_t>(bottom, KNOB_MAX_SCISSOR_Y);
709
710 if (left > KNOB_MAX_SCISSOR_X || top > KNOB_MAX_SCISSOR_Y)
711 {
712 pState->scissorInFixedPoint.left = 0;
713 pState->scissorInFixedPoint.right = 0;
714 pState->scissorInFixedPoint.top = 0;
715 pState->scissorInFixedPoint.bottom = 0;
716 }
717 else
718 {
719 pState->scissorInFixedPoint.left = left * FIXED_POINT_SCALE;
720 pState->scissorInFixedPoint.right = right * FIXED_POINT_SCALE - 1;
721 pState->scissorInFixedPoint.top = top * FIXED_POINT_SCALE;
722 pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1;
723 }
724 }
725 // templated backend function tables
726 extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
727 extern PFN_BACKEND_FUNC gBackendSingleSample[2][2];
728 extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2];
729 extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2];
730 extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS + 1][SWR_MULTISAMPLE_TYPE_MAX];
731 extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2];
732 extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2];
733 extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2];
734 void SetupPipeline(DRAW_CONTEXT *pDC)
735 {
736 DRAW_STATE* pState = pDC->pState;
737 const SWR_RASTSTATE &rastState = pState->state.rastState;
738 const SWR_PS_STATE &psState = pState->state.psState;
739 BACKEND_FUNCS& backendFuncs = pState->backendFuncs;
740 const uint32_t forcedSampleCount = (rastState.bForcedSampleCount) ? 1 : 0;
741
742 // setup backend
743 if (psState.pfnPixelShader == nullptr)
744 {
745 backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount];
746 // always need to generate I & J per sample for Z interpolation
747 backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[1];
748 }
749 else
750 {
751 const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.bForcedSampleCount) ? 1 : 0;
752 const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
753
754 // currently only support 'normal' input coverage
755 SWR_ASSERT(psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL ||
756 psState.inputCoverage == SWR_INPUT_COVERAGE_NONE);
757
758 SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask;
759
760 // select backend function
761 switch(psState.shadingRate)
762 {
763 case SWR_SHADING_RATE_PIXEL:
764 if(bMultisampleEnable)
765 {
766 // always need to generate I & J per sample for Z interpolation
767 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
768 backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount];
769 backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount];
770 }
771 else
772 {
773 // always need to generate I & J per pixel for Z interpolation
774 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK);
775 backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid];
776 backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][SWR_MULTISAMPLE_1X];
777 }
778 break;
779 case SWR_SHADING_RATE_SAMPLE:
780 SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN);
781 // always need to generate I & J per sample for Z interpolation
782 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
783 backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid];
784 backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount];
785 break;
786 default:
787 SWR_ASSERT(0 && "Invalid shading rate");
788 break;
789 }
790
791 // setup pointer to function that generates necessary barycentrics required by the PS
792 bool bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_PER_PIXEL_MASK) > 0 ? 1 : 0;
793 backendFuncs.pfnCalcPixelBarycentrics = gPixelBarycentricTable[bBarycentrics];
794
795 bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_PER_SAMPLE_MASK) > 0 ? 1 : 0;
796 backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[bBarycentrics];
797
798 bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0 ? 1 : 0;
799 backendFuncs.pfnCalcCentroidBarycentrics = gCentroidBarycentricTable[rastState.sampleCount][bBarycentrics][rastState.samplePattern][forcedSampleCount];
800 }
801
802 PFN_PROCESS_PRIMS pfnBinner;
803 switch (pState->state.topology)
804 {
805 case TOP_POINT_LIST:
806 pState->pfnProcessPrims = ClipPoints;
807 pfnBinner = BinPoints;
808 break;
809 case TOP_LINE_LIST:
810 case TOP_LINE_STRIP:
811 case TOP_LINE_LOOP:
812 case TOP_LINE_LIST_ADJ:
813 case TOP_LISTSTRIP_ADJ:
814 pState->pfnProcessPrims = ClipLines;
815 pfnBinner = BinLines;
816 break;
817 default:
818 pState->pfnProcessPrims = ClipTriangles;
819 pfnBinner = BinTriangles;
820 break;
821 };
822
823 // disable clipper if viewport transform is disabled
824 if (pState->state.frontendState.vpTransformDisable)
825 {
826 pState->pfnProcessPrims = pfnBinner;
827 }
828
829 if ((pState->state.psState.pfnPixelShader == nullptr) &&
830 (pState->state.depthStencilState.depthTestEnable == FALSE) &&
831 (pState->state.depthStencilState.depthWriteEnable == FALSE) &&
832 (pState->state.depthStencilState.stencilTestEnable == FALSE) &&
833 (pState->state.depthStencilState.stencilWriteEnable == FALSE) &&
834 (pState->state.linkageCount == 0))
835 {
836 pState->pfnProcessPrims = nullptr;
837 pState->state.linkageMask = 0;
838 }
839
840 if (pState->state.soState.rasterizerDisable == true)
841 {
842 pState->pfnProcessPrims = nullptr;
843 pState->state.linkageMask = 0;
844 }
845
846 // set up the frontend attrib mask
847 pState->state.feAttribMask = pState->state.linkageMask;
848 if (pState->state.soState.soEnable)
849 {
850 for (uint32_t i = 0; i < 4; ++i)
851 {
852 pState->state.feAttribMask |= pState->state.soState.streamMasks[i];
853 }
854 }
855
856 // complicated logic to test for cases where we don't need backing hottile memory for a draw
857 // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled.
858 pState->state.depthHottileEnable = ((!(pState->state.depthStencilState.depthTestEnable &&
859 !pState->state.depthStencilState.depthWriteEnable &&
860 pState->state.depthStencilState.depthTestFunc == ZFUNC_ALWAYS)) &&
861 (pState->state.depthStencilState.depthTestEnable ||
862 pState->state.depthStencilState.depthWriteEnable)) ? true : false;
863
864 pState->state.stencilHottileEnable = (((!(pState->state.depthStencilState.stencilTestEnable &&
865 !pState->state.depthStencilState.stencilWriteEnable &&
866 pState->state.depthStencilState.stencilTestFunc == ZFUNC_ALWAYS)) ||
867 // for stencil we have to check the double sided state as well
868 (!(pState->state.depthStencilState.doubleSidedStencilTestEnable &&
869 !pState->state.depthStencilState.stencilWriteEnable &&
870 pState->state.depthStencilState.backfaceStencilTestFunc == ZFUNC_ALWAYS))) &&
871 (pState->state.depthStencilState.stencilTestEnable ||
872 pState->state.depthStencilState.stencilWriteEnable)) ? true : false;
873
874 uint32_t numRTs = pState->state.psState.numRenderTargets;
875 pState->state.colorHottileEnable = 0;
876 if (psState.pfnPixelShader != nullptr)
877 {
878 for (uint32_t rt = 0; rt < numRTs; ++rt)
879 {
880 pState->state.colorHottileEnable |=
881 (!pState->state.blendState.renderTarget[rt].writeDisableAlpha ||
882 !pState->state.blendState.renderTarget[rt].writeDisableRed ||
883 !pState->state.blendState.renderTarget[rt].writeDisableGreen ||
884 !pState->state.blendState.renderTarget[rt].writeDisableBlue) ? (1 << rt) : 0;
885 }
886 }
887 }
888
889 //////////////////////////////////////////////////////////////////////////
890 /// @brief InitDraw
891 /// @param pDC - Draw context to initialize for this draw.
892 void InitDraw(
893 DRAW_CONTEXT *pDC,
894 bool isSplitDraw)
895 {
896 // We don't need to re-setup the scissors/pipeline state again for split draw.
897 if (isSplitDraw == false)
898 {
899 SetupMacroTileScissors(pDC);
900 SetupPipeline(pDC);
901 }
902 }
903
904 //////////////////////////////////////////////////////////////////////////
905 /// @brief We can split the draw for certain topologies for better performance.
906 /// @param totalVerts - Total vertices for draw
907 /// @param topology - Topology used for draw
908 uint32_t MaxVertsPerDraw(
909 DRAW_CONTEXT* pDC,
910 uint32_t totalVerts,
911 PRIMITIVE_TOPOLOGY topology)
912 {
913 API_STATE& state = pDC->pState->state;
914
915 uint32_t vertsPerDraw = totalVerts;
916
917 if (state.soState.soEnable)
918 {
919 return totalVerts;
920 }
921
922 switch (topology)
923 {
924 case TOP_POINT_LIST:
925 case TOP_TRIANGLE_LIST:
926 vertsPerDraw = KNOB_MAX_PRIMS_PER_DRAW;
927 break;
928
929 case TOP_PATCHLIST_1:
930 case TOP_PATCHLIST_2:
931 case TOP_PATCHLIST_3:
932 case TOP_PATCHLIST_4:
933 case TOP_PATCHLIST_5:
934 case TOP_PATCHLIST_6:
935 case TOP_PATCHLIST_7:
936 case TOP_PATCHLIST_8:
937 case TOP_PATCHLIST_9:
938 case TOP_PATCHLIST_10:
939 case TOP_PATCHLIST_11:
940 case TOP_PATCHLIST_12:
941 case TOP_PATCHLIST_13:
942 case TOP_PATCHLIST_14:
943 case TOP_PATCHLIST_15:
944 case TOP_PATCHLIST_16:
945 case TOP_PATCHLIST_17:
946 case TOP_PATCHLIST_18:
947 case TOP_PATCHLIST_19:
948 case TOP_PATCHLIST_20:
949 case TOP_PATCHLIST_21:
950 case TOP_PATCHLIST_22:
951 case TOP_PATCHLIST_23:
952 case TOP_PATCHLIST_24:
953 case TOP_PATCHLIST_25:
954 case TOP_PATCHLIST_26:
955 case TOP_PATCHLIST_27:
956 case TOP_PATCHLIST_28:
957 case TOP_PATCHLIST_29:
958 case TOP_PATCHLIST_30:
959 case TOP_PATCHLIST_31:
960 case TOP_PATCHLIST_32:
961 if (pDC->pState->state.tsState.tsEnable)
962 {
963 uint32_t vertsPerPrim = topology - TOP_PATCHLIST_BASE;
964 vertsPerDraw = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW;
965 }
966 break;
967
968 // The Primitive Assembly code can only handle 1 RECT at a time.
969 case TOP_RECT_LIST:
970 vertsPerDraw = 3;
971 break;
972
973 default:
974 // We are not splitting up draws for other topologies.
975 break;
976 }
977
978 return vertsPerDraw;
979 }
980
981 // Recursive template used to auto-nest conditionals. Converts dynamic boolean function
982 // arguments to static template arguments.
983 template <bool... ArgsB>
984 struct FEDrawChooser
985 {
986 // Last Arg Terminator
987 static PFN_FE_WORK_FUNC GetFunc(bool bArg)
988 {
989 if (bArg)
990 {
991 return ProcessDraw<ArgsB..., true>;
992 }
993
994 return ProcessDraw<ArgsB..., false>;
995 }
996
997 // Recursively parse args
998 template <typename... TArgsT>
999 static PFN_FE_WORK_FUNC GetFunc(bool bArg, TArgsT... remainingArgs)
1000 {
1001 if (bArg)
1002 {
1003 return FEDrawChooser<ArgsB..., true>::GetFunc(remainingArgs...);
1004 }
1005
1006 return FEDrawChooser<ArgsB..., false>::GetFunc(remainingArgs...);
1007 }
1008 };
1009
1010 // Selector for correct templated Draw front-end function
1011 INLINE
1012 static PFN_FE_WORK_FUNC GetFEDrawFunc(bool IsIndexed, bool HasTessellation, bool HasGeometryShader, bool HasStreamOut, bool RasterizerEnabled)
1013 {
1014 return FEDrawChooser<>::GetFunc(IsIndexed, HasTessellation, HasGeometryShader, HasStreamOut, RasterizerEnabled);
1015 }
1016
1017
1018 //////////////////////////////////////////////////////////////////////////
1019 /// @brief DrawInstanced
1020 /// @param hContext - Handle passed back from SwrCreateContext
1021 /// @param topology - Specifies topology for draw.
1022 /// @param numVerts - How many vertices to read sequentially from vertex data (per instance).
1023 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1024 /// @param numInstances - How many instances to render.
1025 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1026 void DrawInstanced(
1027 HANDLE hContext,
1028 PRIMITIVE_TOPOLOGY topology,
1029 uint32_t numVertices,
1030 uint32_t startVertex,
1031 uint32_t numInstances = 1,
1032 uint32_t startInstance = 0)
1033 {
1034 if (KNOB_TOSS_DRAW)
1035 {
1036 return;
1037 }
1038
1039 RDTSC_START(APIDraw);
1040
1041 SWR_CONTEXT *pContext = GetContext(hContext);
1042 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1043
1044 int32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
1045 uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw);
1046 int32_t remainingVerts = numVertices;
1047
1048 API_STATE *pState = &pDC->pState->state;
1049 pState->topology = topology;
1050 pState->forceFront = false;
1051
1052 // disable culling for points/lines
1053 uint32_t oldCullMode = pState->rastState.cullMode;
1054 if (topology == TOP_POINT_LIST)
1055 {
1056 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1057 pState->forceFront = true;
1058 }
1059
1060 int draw = 0;
1061 while (remainingVerts)
1062 {
1063 uint32_t numVertsForDraw = (remainingVerts < maxVertsPerDraw) ?
1064 remainingVerts : maxVertsPerDraw;
1065
1066 bool isSplitDraw = (draw > 0) ? true : false;
1067 DRAW_CONTEXT* pDC = GetDrawContext(pContext, isSplitDraw);
1068 InitDraw(pDC, isSplitDraw);
1069
1070 pDC->FeWork.type = DRAW;
1071 pDC->FeWork.pfnWork = GetFEDrawFunc(
1072 false, // IsIndexed
1073 pState->tsState.tsEnable,
1074 pState->gsState.gsEnable,
1075 pState->soState.soEnable,
1076 pDC->pState->pfnProcessPrims != nullptr);
1077 pDC->FeWork.desc.draw.numVerts = numVertsForDraw;
1078 pDC->FeWork.desc.draw.startVertex = startVertex;
1079 pDC->FeWork.desc.draw.numInstances = numInstances;
1080 pDC->FeWork.desc.draw.startInstance = startInstance;
1081 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
1082 pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw;
1083
1084 pDC->cleanupState = (remainingVerts == numVertsForDraw);
1085
1086 //enqueue DC
1087 QueueDraw(pContext);
1088
1089 remainingVerts -= numVertsForDraw;
1090 draw++;
1091 }
1092
1093 // restore culling state
1094 pDC = GetDrawContext(pContext);
1095 pDC->pState->state.rastState.cullMode = oldCullMode;
1096
1097 RDTSC_STOP(APIDraw, numVertices * numInstances, 0);
1098 }
1099
1100 //////////////////////////////////////////////////////////////////////////
1101 /// @brief SwrDraw
1102 /// @param hContext - Handle passed back from SwrCreateContext
1103 /// @param topology - Specifies topology for draw.
1104 /// @param startVertex - Specifies start vertex in vertex buffer for draw.
1105 /// @param primCount - Number of vertices.
1106 void SwrDraw(
1107 HANDLE hContext,
1108 PRIMITIVE_TOPOLOGY topology,
1109 uint32_t startVertex,
1110 uint32_t numVertices)
1111 {
1112 DrawInstanced(hContext, topology, numVertices, startVertex);
1113 }
1114
1115 //////////////////////////////////////////////////////////////////////////
1116 /// @brief SwrDrawInstanced
1117 /// @param hContext - Handle passed back from SwrCreateContext
1118 /// @param topology - Specifies topology for draw.
1119 /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
1120 /// @param numInstances - How many instances to render.
1121 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1122 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1123 void SwrDrawInstanced(
1124 HANDLE hContext,
1125 PRIMITIVE_TOPOLOGY topology,
1126 uint32_t numVertsPerInstance,
1127 uint32_t numInstances,
1128 uint32_t startVertex,
1129 uint32_t startInstance
1130 )
1131 {
1132 DrawInstanced(hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance);
1133 }
1134
1135 //////////////////////////////////////////////////////////////////////////
1136 /// @brief DrawIndexedInstanced
1137 /// @param hContext - Handle passed back from SwrCreateContext
1138 /// @param topology - Specifies topology for draw.
1139 /// @param numIndices - Number of indices to read sequentially from index buffer.
1140 /// @param indexOffset - Starting index into index buffer.
1141 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1142 /// @param numInstances - Number of instances to render.
1143 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1144 void DrawIndexedInstance(
1145 HANDLE hContext,
1146 PRIMITIVE_TOPOLOGY topology,
1147 uint32_t numIndices,
1148 uint32_t indexOffset,
1149 int32_t baseVertex,
1150 uint32_t numInstances = 1,
1151 uint32_t startInstance = 0)
1152 {
1153 if (KNOB_TOSS_DRAW)
1154 {
1155 return;
1156 }
1157
1158 RDTSC_START(APIDrawIndexed);
1159
1160 SWR_CONTEXT *pContext = GetContext(hContext);
1161 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1162 API_STATE* pState = &pDC->pState->state;
1163
1164 int32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
1165 uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw);
1166 int32_t remainingIndices = numIndices;
1167
1168 uint32_t indexSize = 0;
1169 switch (pState->indexBuffer.format)
1170 {
1171 case R32_UINT: indexSize = sizeof(uint32_t); break;
1172 case R16_UINT: indexSize = sizeof(uint16_t); break;
1173 case R8_UINT: indexSize = sizeof(uint8_t); break;
1174 default:
1175 SWR_ASSERT(0);
1176 }
1177
1178 int draw = 0;
1179 uint8_t *pIB = (uint8_t*)pState->indexBuffer.pIndices;
1180 pIB += (uint64_t)indexOffset * (uint64_t)indexSize;
1181
1182 pState->topology = topology;
1183 pState->forceFront = false;
1184
1185 // disable culling for points/lines
1186 uint32_t oldCullMode = pState->rastState.cullMode;
1187 if (topology == TOP_POINT_LIST)
1188 {
1189 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1190 pState->forceFront = true;
1191 }
1192
1193 while (remainingIndices)
1194 {
1195 uint32_t numIndicesForDraw = (remainingIndices < maxIndicesPerDraw) ?
1196 remainingIndices : maxIndicesPerDraw;
1197
1198 // When breaking up draw, we need to obtain new draw context for each iteration.
1199 bool isSplitDraw = (draw > 0) ? true : false;
1200 pDC = GetDrawContext(pContext, isSplitDraw);
1201 InitDraw(pDC, isSplitDraw);
1202
1203 pDC->FeWork.type = DRAW;
1204 pDC->FeWork.pfnWork = GetFEDrawFunc(
1205 true, // IsIndexed
1206 pState->tsState.tsEnable,
1207 pState->gsState.gsEnable,
1208 pState->soState.soEnable,
1209 pDC->pState->pfnProcessPrims != nullptr);
1210 pDC->FeWork.desc.draw.pDC = pDC;
1211 pDC->FeWork.desc.draw.numIndices = numIndicesForDraw;
1212 pDC->FeWork.desc.draw.pIB = (int*)pIB;
1213 pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format;
1214
1215 pDC->FeWork.desc.draw.numInstances = numInstances;
1216 pDC->FeWork.desc.draw.startInstance = startInstance;
1217 pDC->FeWork.desc.draw.baseVertex = baseVertex;
1218 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
1219
1220 pDC->cleanupState = (remainingIndices == numIndicesForDraw);
1221
1222 //enqueue DC
1223 QueueDraw(pContext);
1224
1225 pIB += maxIndicesPerDraw * indexSize;
1226 remainingIndices -= numIndicesForDraw;
1227 draw++;
1228 }
1229
1230 // restore culling state
1231 pDC = GetDrawContext(pContext);
1232 pDC->pState->state.rastState.cullMode = oldCullMode;
1233
1234 RDTSC_STOP(APIDrawIndexed, numIndices * numInstances, 0);
1235 }
1236
1237
1238 //////////////////////////////////////////////////////////////////////////
1239 /// @brief DrawIndexed
1240 /// @param hContext - Handle passed back from SwrCreateContext
1241 /// @param topology - Specifies topology for draw.
1242 /// @param numIndices - Number of indices to read sequentially from index buffer.
1243 /// @param indexOffset - Starting index into index buffer.
1244 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1245 void SwrDrawIndexed(
1246 HANDLE hContext,
1247 PRIMITIVE_TOPOLOGY topology,
1248 uint32_t numIndices,
1249 uint32_t indexOffset,
1250 int32_t baseVertex
1251 )
1252 {
1253 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex);
1254 }
1255
1256 //////////////////////////////////////////////////////////////////////////
1257 /// @brief SwrDrawIndexedInstanced
1258 /// @param hContext - Handle passed back from SwrCreateContext
1259 /// @param topology - Specifies topology for draw.
1260 /// @param numIndices - Number of indices to read sequentially from index buffer.
1261 /// @param numInstances - Number of instances to render.
1262 /// @param indexOffset - Starting index into index buffer.
1263 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1264 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1265 void SwrDrawIndexedInstanced(
1266 HANDLE hContext,
1267 PRIMITIVE_TOPOLOGY topology,
1268 uint32_t numIndices,
1269 uint32_t numInstances,
1270 uint32_t indexOffset,
1271 int32_t baseVertex,
1272 uint32_t startInstance)
1273 {
1274 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance);
1275 }
1276
1277 //////////////////////////////////////////////////////////////////////////
1278 /// @brief SwrInvalidateTiles
1279 /// @param hContext - Handle passed back from SwrCreateContext
1280 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate.
1281 void SwrInvalidateTiles(
1282 HANDLE hContext,
1283 uint32_t attachmentMask)
1284 {
1285 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1286 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1287
1288 pDC->FeWork.type = DISCARDINVALIDATETILES;
1289 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
1290 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
1291 memset(&pDC->FeWork.desc.discardInvalidateTiles.rect, 0, sizeof(SWR_RECT));
1292 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_INVALID;
1293 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false;
1294 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = false;
1295
1296 //enqueue
1297 QueueDraw(pContext);
1298 }
1299
1300 //////////////////////////////////////////////////////////////////////////
1301 /// @brief SwrDiscardRect
1302 /// @param hContext - Handle passed back from SwrCreateContext
1303 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
1304 /// @param rect - if rect is all zeros, the entire attachment surface will be discarded
1305 void SwrDiscardRect(
1306 HANDLE hContext,
1307 uint32_t attachmentMask,
1308 SWR_RECT rect)
1309 {
1310 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1311 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1312
1313 // Queue a load to the hottile
1314 pDC->FeWork.type = DISCARDINVALIDATETILES;
1315 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
1316 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
1317 pDC->FeWork.desc.discardInvalidateTiles.rect = rect;
1318 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_RESOLVED;
1319 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true;
1320 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = true;
1321
1322 //enqueue
1323 QueueDraw(pContext);
1324 }
1325
1326 //////////////////////////////////////////////////////////////////////////
1327 /// @brief SwrDispatch
1328 /// @param hContext - Handle passed back from SwrCreateContext
1329 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
1330 /// @param threadGroupCountY - Number of thread groups dispatched in Y direction
1331 /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
1332 void SwrDispatch(
1333 HANDLE hContext,
1334 uint32_t threadGroupCountX,
1335 uint32_t threadGroupCountY,
1336 uint32_t threadGroupCountZ)
1337 {
1338 if (KNOB_TOSS_DRAW)
1339 {
1340 return;
1341 }
1342
1343 RDTSC_START(APIDispatch);
1344 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1345 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1346
1347 pDC->isCompute = true; // This is a compute context.
1348
1349 // Ensure spill fill pointers are initialized to nullptr.
1350 memset(pDC->pSpillFill, 0, sizeof(pDC->pSpillFill));
1351
1352 COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64);
1353
1354 pTaskData->threadGroupCountX = threadGroupCountX;
1355 pTaskData->threadGroupCountY = threadGroupCountY;
1356 pTaskData->threadGroupCountZ = threadGroupCountZ;
1357
1358 uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
1359 pDC->pDispatch->initialize(totalThreadGroups, pTaskData);
1360
1361 QueueDispatch(pContext);
1362 RDTSC_STOP(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ, 0);
1363 }
1364
1365 // Deswizzles, converts and stores current contents of the hot tiles to surface
1366 // described by pState
1367 void SwrStoreTiles(
1368 HANDLE hContext,
1369 SWR_RENDERTARGET_ATTACHMENT attachment,
1370 SWR_TILE_STATE postStoreTileState)
1371 {
1372 RDTSC_START(APIStoreTiles);
1373
1374 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1375 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1376
1377 SetupMacroTileScissors(pDC);
1378
1379 pDC->FeWork.type = STORETILES;
1380 pDC->FeWork.pfnWork = ProcessStoreTiles;
1381 pDC->FeWork.desc.storeTiles.attachment = attachment;
1382 pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState;
1383
1384 //enqueue
1385 QueueDraw(pContext);
1386
1387 RDTSC_STOP(APIStoreTiles, 0, 0);
1388 }
1389
1390 void SwrClearRenderTarget(
1391 HANDLE hContext,
1392 uint32_t clearMask,
1393 const float clearColor[4],
1394 float z,
1395 uint8_t stencil)
1396 {
1397 RDTSC_START(APIClearRenderTarget);
1398
1399 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1400
1401 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1402
1403 SetupMacroTileScissors(pDC);
1404
1405 CLEAR_FLAGS flags;
1406 flags.mask = clearMask;
1407
1408 pDC->FeWork.type = CLEAR;
1409 pDC->FeWork.pfnWork = ProcessClear;
1410 pDC->FeWork.desc.clear.flags = flags;
1411 pDC->FeWork.desc.clear.clearDepth = z;
1412 pDC->FeWork.desc.clear.clearRTColor[0] = clearColor[0];
1413 pDC->FeWork.desc.clear.clearRTColor[1] = clearColor[1];
1414 pDC->FeWork.desc.clear.clearRTColor[2] = clearColor[2];
1415 pDC->FeWork.desc.clear.clearRTColor[3] = clearColor[3];
1416 pDC->FeWork.desc.clear.clearStencil = stencil;
1417
1418 // enqueue draw
1419 QueueDraw(pContext);
1420
1421 RDTSC_STOP(APIClearRenderTarget, 0, pDC->drawId);
1422 }
1423
1424 //////////////////////////////////////////////////////////////////////////
1425 /// @brief Returns a pointer to the private context state for the current
1426 /// draw operation. This is used for external componets such as the
1427 /// sampler.
1428 /// SWR is responsible for the allocation of the private context state.
1429 /// @param hContext - Handle passed back from SwrCreateContext
1430 VOID* SwrGetPrivateContextState(
1431 HANDLE hContext)
1432 {
1433 SWR_CONTEXT* pContext = GetContext(hContext);
1434 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1435 DRAW_STATE* pState = pDC->pState;
1436
1437 if (pState->pPrivateState == nullptr)
1438 {
1439 pState->pPrivateState = pState->pArena->AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float));
1440 }
1441
1442 return pState->pPrivateState;
1443 }
1444
1445 //////////////////////////////////////////////////////////////////////////
1446 /// @brief Clients can use this to allocate memory for draw/dispatch
1447 /// operations. The memory will automatically be freed once operation
1448 /// has completed. Client can use this to allocate binding tables,
1449 /// etc. needed for shader execution.
1450 /// @param hContext - Handle passed back from SwrCreateContext
1451 /// @param size - Size of allocation
1452 /// @param align - Alignment needed for allocation.
1453 VOID* SwrAllocDrawContextMemory(
1454 HANDLE hContext,
1455 uint32_t size,
1456 uint32_t align)
1457 {
1458 SWR_CONTEXT* pContext = GetContext(hContext);
1459 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1460
1461 return pDC->pState->pArena->AllocAligned(size, align);
1462 }
1463
1464 //////////////////////////////////////////////////////////////////////////
1465 /// @brief Returns pointer to SWR stats.
1466 /// @note The counters are atomically incremented by multiple threads.
1467 /// When calling this, you need to ensure all previous operations
1468 /// have completed.
1469 /// @todo If necessary, add a callback to avoid stalling the pipe to
1470 /// sample the counters.
1471 /// @param hContext - Handle passed back from SwrCreateContext
1472 /// @param pStats - SWR will fill this out for caller.
1473 void SwrGetStats(
1474 HANDLE hContext,
1475 SWR_STATS* pStats)
1476 {
1477 SWR_CONTEXT *pContext = GetContext(hContext);
1478 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1479
1480 pDC->FeWork.type = QUERYSTATS;
1481 pDC->FeWork.pfnWork = ProcessQueryStats;
1482 pDC->FeWork.desc.queryStats.pStats = pStats;
1483
1484 // cannot execute until all previous draws have completed
1485 pDC->dependency = pDC->drawId - 1;
1486
1487 //enqueue
1488 QueueDraw(pContext);
1489 }
1490
1491 //////////////////////////////////////////////////////////////////////////
1492 /// @brief Enables stats counting
1493 /// @param hContext - Handle passed back from SwrCreateContext
1494 /// @param enable - If true then counts are incremented.
1495 void SwrEnableStats(
1496 HANDLE hContext,
1497 bool enable)
1498 {
1499 SWR_CONTEXT *pContext = GetContext(hContext);
1500 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1501
1502 pDC->pState->state.enableStats = enable;
1503 }
1504
1505 //////////////////////////////////////////////////////////////////////////
1506 /// @brief Mark end of frame - used for performance profiling
1507 /// @param hContext - Handle passed back from SwrCreateContext
1508 void SWR_API SwrEndFrame(
1509 HANDLE hContext)
1510 {
1511 RDTSC_ENDFRAME();
1512 }