swr: [rasterizer core] NUMA optimizations...
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / api.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file api.cpp
24 *
25 * @brief API implementation
26 *
27 ******************************************************************************/
28
29 #include <cfloat>
30 #include <cmath>
31 #include <cstdio>
32
33 #include "core/api.h"
34 #include "core/backend.h"
35 #include "core/context.h"
36 #include "core/frontend.h"
37 #include "core/rasterizer.h"
38 #include "core/rdtsc_core.h"
39 #include "core/threads.h"
40 #include "core/tilemgr.h"
41 #include "core/clip.h"
42
43 #include "common/simdintrin.h"
44 #include "common/os.h"
45
46 void SetupDefaultState(SWR_CONTEXT *pContext);
47
48 //////////////////////////////////////////////////////////////////////////
49 /// @brief Create SWR Context.
50 /// @param pCreateInfo - pointer to creation info.
51 HANDLE SwrCreateContext(
52 SWR_CREATECONTEXT_INFO* pCreateInfo)
53 {
54 RDTSC_RESET();
55 RDTSC_INIT(0);
56
57 void* pContextMem = _aligned_malloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4);
58 memset(pContextMem, 0, sizeof(SWR_CONTEXT));
59 SWR_CONTEXT *pContext = new (pContextMem) SWR_CONTEXT();
60
61 pContext->driverType = pCreateInfo->driver;
62 pContext->privateStateSize = pCreateInfo->privateStateSize;
63
64 pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
65 pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
66
67 for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
68 {
69 pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
70 pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena));
71 pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen.
72
73 pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
74 }
75
76 if (!KNOB_SINGLE_THREADED)
77 {
78 memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
79 memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
80 new (&pContext->WaitLock) std::mutex();
81 new (&pContext->FifosNotEmpty) std::condition_variable();
82
83 CreateThreadPool(pContext, &pContext->threadPool);
84 }
85
86 // Calling createThreadPool() above can set SINGLE_THREADED
87 if (KNOB_SINGLE_THREADED)
88 {
89 pContext->NumWorkerThreads = 1;
90 }
91
92 // Allocate scratch space for workers.
93 ///@note We could lazily allocate this but its rather small amount of memory.
94 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
95 {
96 ///@todo Use numa API for allocations using numa information from thread data (if exists).
97 pContext->pScratch[i] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH * 4);
98 }
99
100 // State setup AFTER context is fully initialized
101 SetupDefaultState(pContext);
102
103 // initialize hot tile manager
104 pContext->pHotTileMgr = new HotTileMgr();
105
106 // initialize function pointer tables
107 InitClearTilesTable();
108
109 // initialize store tiles function
110 pContext->pfnLoadTile = pCreateInfo->pfnLoadTile;
111 pContext->pfnStoreTile = pCreateInfo->pfnStoreTile;
112 pContext->pfnClearTile = pCreateInfo->pfnClearTile;
113
114 // pass pointer to bucket manager back to caller
115 #ifdef KNOB_ENABLE_RDTSC
116 pCreateInfo->pBucketMgr = &gBucketMgr;
117 #endif
118
119 pCreateInfo->contextSaveSize = sizeof(API_STATE);
120
121 return (HANDLE)pContext;
122 }
123
124 void SwrDestroyContext(HANDLE hContext)
125 {
126 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
127 DestroyThreadPool(pContext, &pContext->threadPool);
128
129 // free the fifos
130 for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i)
131 {
132 delete pContext->dcRing[i].pArena;
133 delete pContext->dsRing[i].pArena;
134 delete(pContext->dcRing[i].pTileMgr);
135 delete(pContext->dcRing[i].pDispatch);
136 }
137
138 // Free scratch space.
139 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
140 {
141 _aligned_free(pContext->pScratch[i]);
142 }
143
144 delete(pContext->pHotTileMgr);
145
146 pContext->~SWR_CONTEXT();
147 _aligned_free((SWR_CONTEXT*)hContext);
148 }
149
150 void CopyState(DRAW_STATE& dst, const DRAW_STATE& src)
151 {
152 memcpy(&dst.state, &src.state, sizeof(API_STATE));
153 }
154
155 void WakeAllThreads(SWR_CONTEXT *pContext)
156 {
157 pContext->FifosNotEmpty.notify_all();
158 }
159
160 template<bool IsDraw>
161 void QueueWork(SWR_CONTEXT *pContext)
162 {
163 // Each worker thread looks at a DC for both FE and BE work at different times and so we
164 // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers
165 // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
166 // then moved on if all work is done.)
167 pContext->pCurDrawContext->threadsDone =
168 pContext->NumWorkerThreads ? pContext->NumWorkerThreads * 2 : 2;
169
170 _ReadWriteBarrier();
171 {
172 std::unique_lock<std::mutex> lock(pContext->WaitLock);
173 pContext->dcRing.Enqueue();
174 }
175
176 if (KNOB_SINGLE_THREADED)
177 {
178 // flush denormals to 0
179 uint32_t mxcsr = _mm_getcsr();
180 _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
181
182 if (IsDraw)
183 {
184 static TileSet lockedTiles;
185 uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
186 WorkOnFifoFE(pContext, 0, curDraw[0], 0);
187 WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles, 0, 0);
188 }
189 else
190 {
191 uint64_t curDispatch = pContext->pCurDrawContext->drawId;
192 WorkOnCompute(pContext, 0, curDispatch);
193 }
194
195 // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
196 while (CompleteDrawContext(pContext, pContext->pCurDrawContext) > 0) {}
197
198 // restore csr
199 _mm_setcsr(mxcsr);
200 }
201 else
202 {
203 RDTSC_START(APIDrawWakeAllThreads);
204 WakeAllThreads(pContext);
205 RDTSC_STOP(APIDrawWakeAllThreads, 1, 0);
206 }
207
208 // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
209 pContext->pPrevDrawContext = pContext->pCurDrawContext;
210 pContext->pCurDrawContext = nullptr;
211 }
212
213 INLINE void QueueDraw(SWR_CONTEXT* pContext)
214 {
215 QueueWork<true>(pContext);
216 }
217
218 INLINE void QueueDispatch(SWR_CONTEXT* pContext)
219 {
220 QueueWork<false>(pContext);
221 }
222
223 DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
224 {
225 RDTSC_START(APIGetDrawContext);
226 // If current draw context is null then need to obtain a new draw context to use from ring.
227 if (pContext->pCurDrawContext == nullptr)
228 {
229 // Need to wait for a free entry.
230 while (pContext->dcRing.IsFull())
231 {
232 _mm_pause();
233 }
234
235 uint32_t dcIndex = pContext->dcRing.GetHead() % KNOB_MAX_DRAWS_IN_FLIGHT;
236
237 DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
238 pContext->pCurDrawContext = pCurDrawContext;
239
240 // Assign next available entry in DS ring to this DC.
241 uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
242 pCurDrawContext->pState = &pContext->dsRing[dsIndex];
243
244 // Copy previous state to current state.
245 if (pContext->pPrevDrawContext)
246 {
247 DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext;
248
249 // If we're splitting our draw then we can just use the same state from the previous
250 // draw. In this case, we won't increment the DS ring index so the next non-split
251 // draw can receive the state.
252 if (isSplitDraw == false)
253 {
254 CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState);
255
256 // Should have been cleaned up previously
257 SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
258
259 pCurDrawContext->pState->pPrivateState = nullptr;
260
261 pContext->curStateId++; // Progress state ring index forward.
262 }
263 else
264 {
265 // If its a split draw then just copy the state pointer over
266 // since its the same draw.
267 pCurDrawContext->pState = pPrevDrawContext->pState;
268 SWR_ASSERT(pPrevDrawContext->cleanupState == false);
269 }
270 }
271 else
272 {
273 SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
274 pContext->curStateId++; // Progress state ring index forward.
275 }
276
277 SWR_ASSERT(pCurDrawContext->pArena->IsEmpty() == true);
278
279 pCurDrawContext->dependency = 0;
280 pCurDrawContext->pContext = pContext;
281 pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
282
283 pCurDrawContext->doneFE = false;
284 pCurDrawContext->FeLock = 0;
285 pCurDrawContext->threadsDone = 0;
286
287 pCurDrawContext->pTileMgr->initialize();
288
289 // Assign unique drawId for this DC
290 pCurDrawContext->drawId = pContext->dcRing.GetHead();
291
292 pCurDrawContext->cleanupState = true;
293 }
294 else
295 {
296 SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC");
297 }
298
299 RDTSC_STOP(APIGetDrawContext, 0, 0);
300 return pContext->pCurDrawContext;
301 }
302
303 API_STATE* GetDrawState(SWR_CONTEXT *pContext)
304 {
305 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
306 SWR_ASSERT(pDC->pState != nullptr);
307
308 return &pDC->pState->state;
309 }
310
311 void SWR_API SwrSaveState(
312 HANDLE hContext,
313 void* pOutputStateBlock,
314 size_t memSize)
315 {
316 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
317 auto pSrc = GetDrawState(pContext);
318 SWR_ASSERT(pOutputStateBlock && memSize >= sizeof(*pSrc));
319
320 memcpy(pOutputStateBlock, pSrc, sizeof(*pSrc));
321 }
322
323 void SWR_API SwrRestoreState(
324 HANDLE hContext,
325 const void* pStateBlock,
326 size_t memSize)
327 {
328 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
329 auto pDst = GetDrawState(pContext);
330 SWR_ASSERT(pStateBlock && memSize >= sizeof(*pDst));
331
332 memcpy(pDst, pStateBlock, sizeof(*pDst));
333 }
334
335 void SetupDefaultState(SWR_CONTEXT *pContext)
336 {
337 API_STATE* pState = GetDrawState(pContext);
338
339 pState->rastState.cullMode = SWR_CULLMODE_NONE;
340 pState->rastState.frontWinding = SWR_FRONTWINDING_CCW;
341 }
342
343 static INLINE SWR_CONTEXT* GetContext(HANDLE hContext)
344 {
345 return (SWR_CONTEXT*)hContext;
346 }
347
348 void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint64_t userData2, uint64_t userData3)
349 {
350 RDTSC_START(APISync);
351
352 SWR_ASSERT(pfnFunc != nullptr);
353
354 SWR_CONTEXT *pContext = GetContext(hContext);
355 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
356
357 pDC->FeWork.type = SYNC;
358 pDC->FeWork.pfnWork = ProcessSync;
359 pDC->FeWork.desc.sync.pfnCallbackFunc = pfnFunc;
360 pDC->FeWork.desc.sync.userData = userData;
361 pDC->FeWork.desc.sync.userData2 = userData2;
362 pDC->FeWork.desc.sync.userData3 = userData3;
363
364 // cannot execute until all previous draws have completed
365 pDC->dependency = pDC->drawId - 1;
366
367 //enqueue
368 QueueDraw(pContext);
369
370 RDTSC_STOP(APISync, 1, 0);
371 }
372
373 void SwrWaitForIdle(HANDLE hContext)
374 {
375 SWR_CONTEXT *pContext = GetContext(hContext);
376
377 RDTSC_START(APIWaitForIdle);
378
379 while (!pContext->dcRing.IsEmpty())
380 {
381 _mm_pause();
382 }
383
384 RDTSC_STOP(APIWaitForIdle, 1, 0);
385 }
386
387 void SwrSetVertexBuffers(
388 HANDLE hContext,
389 uint32_t numBuffers,
390 const SWR_VERTEX_BUFFER_STATE* pVertexBuffers)
391 {
392 API_STATE* pState = GetDrawState(GetContext(hContext));
393
394 for (uint32_t i = 0; i < numBuffers; ++i)
395 {
396 const SWR_VERTEX_BUFFER_STATE *pVB = &pVertexBuffers[i];
397 pState->vertexBuffers[pVB->index] = *pVB;
398 }
399 }
400
401 void SwrSetIndexBuffer(
402 HANDLE hContext,
403 const SWR_INDEX_BUFFER_STATE* pIndexBuffer)
404 {
405 API_STATE* pState = GetDrawState(GetContext(hContext));
406
407 pState->indexBuffer = *pIndexBuffer;
408 }
409
410 void SwrSetFetchFunc(
411 HANDLE hContext,
412 PFN_FETCH_FUNC pfnFetchFunc)
413 {
414 API_STATE* pState = GetDrawState(GetContext(hContext));
415
416 pState->pfnFetchFunc = pfnFetchFunc;
417 }
418
419 void SwrSetSoFunc(
420 HANDLE hContext,
421 PFN_SO_FUNC pfnSoFunc,
422 uint32_t streamIndex)
423 {
424 API_STATE* pState = GetDrawState(GetContext(hContext));
425
426 SWR_ASSERT(streamIndex < MAX_SO_STREAMS);
427
428 pState->pfnSoFunc[streamIndex] = pfnSoFunc;
429 }
430
431 void SwrSetSoState(
432 HANDLE hContext,
433 SWR_STREAMOUT_STATE* pSoState)
434 {
435 API_STATE* pState = GetDrawState(GetContext(hContext));
436
437 pState->soState = *pSoState;
438 }
439
440 void SwrSetSoBuffers(
441 HANDLE hContext,
442 SWR_STREAMOUT_BUFFER* pSoBuffer,
443 uint32_t slot)
444 {
445 API_STATE* pState = GetDrawState(GetContext(hContext));
446
447 SWR_ASSERT((slot < 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot);
448
449 pState->soBuffer[slot] = *pSoBuffer;
450 }
451
452 void SwrSetVertexFunc(
453 HANDLE hContext,
454 PFN_VERTEX_FUNC pfnVertexFunc)
455 {
456 API_STATE* pState = GetDrawState(GetContext(hContext));
457
458 pState->pfnVertexFunc = pfnVertexFunc;
459 }
460
461 void SwrSetFrontendState(
462 HANDLE hContext,
463 SWR_FRONTEND_STATE *pFEState)
464 {
465 API_STATE* pState = GetDrawState(GetContext(hContext));
466 pState->frontendState = *pFEState;
467 }
468
469 void SwrSetGsState(
470 HANDLE hContext,
471 SWR_GS_STATE *pGSState)
472 {
473 API_STATE* pState = GetDrawState(GetContext(hContext));
474 pState->gsState = *pGSState;
475 }
476
477 void SwrSetGsFunc(
478 HANDLE hContext,
479 PFN_GS_FUNC pfnGsFunc)
480 {
481 API_STATE* pState = GetDrawState(GetContext(hContext));
482 pState->pfnGsFunc = pfnGsFunc;
483 }
484
485 void SwrSetCsFunc(
486 HANDLE hContext,
487 PFN_CS_FUNC pfnCsFunc,
488 uint32_t totalThreadsInGroup)
489 {
490 API_STATE* pState = GetDrawState(GetContext(hContext));
491 pState->pfnCsFunc = pfnCsFunc;
492 pState->totalThreadsInGroup = totalThreadsInGroup;
493 }
494
495 void SwrSetTsState(
496 HANDLE hContext,
497 SWR_TS_STATE *pState)
498 {
499 API_STATE* pApiState = GetDrawState(GetContext(hContext));
500 pApiState->tsState = *pState;
501 }
502
503 void SwrSetHsFunc(
504 HANDLE hContext,
505 PFN_HS_FUNC pfnFunc)
506 {
507 API_STATE* pApiState = GetDrawState(GetContext(hContext));
508 pApiState->pfnHsFunc = pfnFunc;
509 }
510
511 void SwrSetDsFunc(
512 HANDLE hContext,
513 PFN_DS_FUNC pfnFunc)
514 {
515 API_STATE* pApiState = GetDrawState(GetContext(hContext));
516 pApiState->pfnDsFunc = pfnFunc;
517 }
518
519 void SwrSetDepthStencilState(
520 HANDLE hContext,
521 SWR_DEPTH_STENCIL_STATE *pDSState)
522 {
523 API_STATE* pState = GetDrawState(GetContext(hContext));
524
525 pState->depthStencilState = *pDSState;
526 }
527
528 void SwrSetBackendState(
529 HANDLE hContext,
530 SWR_BACKEND_STATE *pBEState)
531 {
532 API_STATE* pState = GetDrawState(GetContext(hContext));
533
534 pState->backendState = *pBEState;
535 }
536
537 void SwrSetPixelShaderState(
538 HANDLE hContext,
539 SWR_PS_STATE *pPSState)
540 {
541 API_STATE *pState = GetDrawState(GetContext(hContext));
542 pState->psState = *pPSState;
543 }
544
545 void SwrSetBlendState(
546 HANDLE hContext,
547 SWR_BLEND_STATE *pBlendState)
548 {
549 API_STATE *pState = GetDrawState(GetContext(hContext));
550 memcpy(&pState->blendState, pBlendState, sizeof(SWR_BLEND_STATE));
551 }
552
553 void SwrSetBlendFunc(
554 HANDLE hContext,
555 uint32_t renderTarget,
556 PFN_BLEND_JIT_FUNC pfnBlendFunc)
557 {
558 SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS);
559 API_STATE *pState = GetDrawState(GetContext(hContext));
560 pState->pfnBlendFunc[renderTarget] = pfnBlendFunc;
561 }
562
563 void SwrSetLinkage(
564 HANDLE hContext,
565 uint32_t mask,
566 const uint8_t* pMap)
567 {
568 API_STATE* pState = GetDrawState(GetContext(hContext));
569
570 static const uint8_t IDENTITY_MAP[] =
571 {
572 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
573 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
574 };
575 static_assert(sizeof(IDENTITY_MAP) == sizeof(pState->linkageMap),
576 "Update for new value of MAX_ATTRIBUTES");
577
578 pState->linkageMask = mask;
579 pState->linkageCount = _mm_popcnt_u32(mask);
580
581 if (!pMap)
582 {
583 pMap = IDENTITY_MAP;
584 }
585 memcpy(pState->linkageMap, pMap, pState->linkageCount);
586 }
587
588 // update guardband multipliers for the viewport
589 void updateGuardband(API_STATE *pState)
590 {
591 // guardband center is viewport center
592 pState->gbState.left = KNOB_GUARDBAND_WIDTH / pState->vp[0].width;
593 pState->gbState.right = KNOB_GUARDBAND_WIDTH / pState->vp[0].width;
594 pState->gbState.top = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height;
595 pState->gbState.bottom = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height;
596 }
597
598 void SwrSetRastState(
599 HANDLE hContext,
600 const SWR_RASTSTATE *pRastState)
601 {
602 SWR_CONTEXT *pContext = GetContext(hContext);
603 API_STATE* pState = GetDrawState(pContext);
604
605 memcpy(&pState->rastState, pRastState, sizeof(SWR_RASTSTATE));
606 }
607
608 void SwrSetViewports(
609 HANDLE hContext,
610 uint32_t numViewports,
611 const SWR_VIEWPORT* pViewports,
612 const SWR_VIEWPORT_MATRIX* pMatrices)
613 {
614 SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS,
615 "Invalid number of viewports.");
616
617 SWR_CONTEXT *pContext = GetContext(hContext);
618 API_STATE* pState = GetDrawState(pContext);
619
620 memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports);
621
622 if (pMatrices != nullptr)
623 {
624 memcpy(&pState->vpMatrix[0], pMatrices, sizeof(SWR_VIEWPORT_MATRIX) * numViewports);
625 }
626 else
627 {
628 // Compute default viewport transform.
629 for (uint32_t i = 0; i < numViewports; ++i)
630 {
631 if (pContext->driverType == DX)
632 {
633 pState->vpMatrix[i].m00 = pState->vp[i].width / 2.0f;
634 pState->vpMatrix[i].m11 = -pState->vp[i].height / 2.0f;
635 pState->vpMatrix[i].m22 = pState->vp[i].maxZ - pState->vp[i].minZ;
636 pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00;
637 pState->vpMatrix[i].m31 = pState->vp[i].y - pState->vpMatrix[i].m11;
638 pState->vpMatrix[i].m32 = pState->vp[i].minZ;
639 }
640 else
641 {
642 // Standard, with the exception that Y is inverted.
643 pState->vpMatrix[i].m00 = (pState->vp[i].width - pState->vp[i].x) / 2.0f;
644 pState->vpMatrix[i].m11 = (pState->vp[i].y - pState->vp[i].height) / 2.0f;
645 pState->vpMatrix[i].m22 = (pState->vp[i].maxZ - pState->vp[i].minZ) / 2.0f;
646 pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00;
647 pState->vpMatrix[i].m31 = pState->vp[i].height + pState->vpMatrix[i].m11;
648 pState->vpMatrix[i].m32 = pState->vp[i].minZ + pState->vpMatrix[i].m22;
649
650 // Now that the matrix is calculated, clip the view coords to screen size.
651 // OpenGL allows for -ve x,y in the viewport.
652 pState->vp[i].x = std::max(pState->vp[i].x, 0.0f);
653 pState->vp[i].y = std::max(pState->vp[i].y, 0.0f);
654 }
655 }
656 }
657
658 updateGuardband(pState);
659 }
660
661 void SwrSetScissorRects(
662 HANDLE hContext,
663 uint32_t numScissors,
664 const BBOX* pScissors)
665 {
666 SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS,
667 "Invalid number of scissor rects.");
668
669 API_STATE* pState = GetDrawState(GetContext(hContext));
670 memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(BBOX));
671 };
672
673 void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
674 {
675 API_STATE *pState = &pDC->pState->state;
676 uint32_t left, right, top, bottom;
677
678 // Set up scissor dimensions based on scissor or viewport
679 if (pState->rastState.scissorEnable)
680 {
681 // scissor rect right/bottom edge are exclusive, core expects scissor dimensions to be inclusive, so subtract one pixel from right/bottom edges
682 left = pState->scissorRects[0].left;
683 right = pState->scissorRects[0].right;
684 top = pState->scissorRects[0].top;
685 bottom = pState->scissorRects[0].bottom;
686 }
687 else
688 {
689 left = (int32_t)pState->vp[0].x;
690 right = (int32_t)pState->vp[0].x + (int32_t)pState->vp[0].width;
691 top = (int32_t)pState->vp[0].y;
692 bottom = (int32_t)pState->vp[0].y + (int32_t)pState->vp[0].height;
693 }
694
695 right = std::min<uint32_t>(right, KNOB_MAX_SCISSOR_X);
696 bottom = std::min<uint32_t>(bottom, KNOB_MAX_SCISSOR_Y);
697
698 if (left > KNOB_MAX_SCISSOR_X || top > KNOB_MAX_SCISSOR_Y)
699 {
700 pState->scissorInFixedPoint.left = 0;
701 pState->scissorInFixedPoint.right = 0;
702 pState->scissorInFixedPoint.top = 0;
703 pState->scissorInFixedPoint.bottom = 0;
704 }
705 else
706 {
707 pState->scissorInFixedPoint.left = left * FIXED_POINT_SCALE;
708 pState->scissorInFixedPoint.right = right * FIXED_POINT_SCALE - 1;
709 pState->scissorInFixedPoint.top = top * FIXED_POINT_SCALE;
710 pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1;
711 }
712 }
713 // templated backend function tables
714 extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
715 extern PFN_BACKEND_FUNC gBackendSingleSample[2][2];
716 extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2];
717 extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2];
718 extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS + 1][SWR_MULTISAMPLE_TYPE_MAX];
719 extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2];
720 extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2];
721 extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2];
722 void SetupPipeline(DRAW_CONTEXT *pDC)
723 {
724 DRAW_STATE* pState = pDC->pState;
725 const SWR_RASTSTATE &rastState = pState->state.rastState;
726 const SWR_PS_STATE &psState = pState->state.psState;
727 BACKEND_FUNCS& backendFuncs = pState->backendFuncs;
728 const uint32_t forcedSampleCount = (rastState.bForcedSampleCount) ? 1 : 0;
729
730 // setup backend
731 if (psState.pfnPixelShader == nullptr)
732 {
733 backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount];
734 // always need to generate I & J per sample for Z interpolation
735 backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[1];
736 }
737 else
738 {
739 const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.bForcedSampleCount) ? 1 : 0;
740 const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
741
742 // currently only support 'normal' input coverage
743 SWR_ASSERT(psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL ||
744 psState.inputCoverage == SWR_INPUT_COVERAGE_NONE);
745
746 SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask;
747
748 // select backend function
749 switch(psState.shadingRate)
750 {
751 case SWR_SHADING_RATE_PIXEL:
752 if(bMultisampleEnable)
753 {
754 // always need to generate I & J per sample for Z interpolation
755 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
756 backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount];
757 backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount];
758 }
759 else
760 {
761 // always need to generate I & J per pixel for Z interpolation
762 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK);
763 backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid];
764 backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][SWR_MULTISAMPLE_1X];
765 }
766 break;
767 case SWR_SHADING_RATE_SAMPLE:
768 SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN);
769 // always need to generate I & J per sample for Z interpolation
770 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
771 backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid];
772 backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount];
773 break;
774 default:
775 SWR_ASSERT(0 && "Invalid shading rate");
776 break;
777 }
778
779 // setup pointer to function that generates necessary barycentrics required by the PS
780 bool bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_PER_PIXEL_MASK) > 0 ? 1 : 0;
781 backendFuncs.pfnCalcPixelBarycentrics = gPixelBarycentricTable[bBarycentrics];
782
783 bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_PER_SAMPLE_MASK) > 0 ? 1 : 0;
784 backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[bBarycentrics];
785
786 bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0 ? 1 : 0;
787 backendFuncs.pfnCalcCentroidBarycentrics = gCentroidBarycentricTable[rastState.sampleCount][bBarycentrics][rastState.samplePattern][forcedSampleCount];
788 }
789
790 PFN_PROCESS_PRIMS pfnBinner;
791 switch (pState->state.topology)
792 {
793 case TOP_POINT_LIST:
794 pState->pfnProcessPrims = ClipPoints;
795 pfnBinner = BinPoints;
796 break;
797 case TOP_LINE_LIST:
798 case TOP_LINE_STRIP:
799 case TOP_LINE_LOOP:
800 case TOP_LINE_LIST_ADJ:
801 case TOP_LISTSTRIP_ADJ:
802 pState->pfnProcessPrims = ClipLines;
803 pfnBinner = BinLines;
804 break;
805 default:
806 pState->pfnProcessPrims = ClipTriangles;
807 pfnBinner = BinTriangles;
808 break;
809 };
810
811 // disable clipper if viewport transform is disabled
812 if (pState->state.frontendState.vpTransformDisable)
813 {
814 pState->pfnProcessPrims = pfnBinner;
815 }
816
817 if ((pState->state.psState.pfnPixelShader == nullptr) &&
818 (pState->state.depthStencilState.depthTestEnable == FALSE) &&
819 (pState->state.depthStencilState.depthWriteEnable == FALSE) &&
820 (pState->state.depthStencilState.stencilTestEnable == FALSE) &&
821 (pState->state.depthStencilState.stencilWriteEnable == FALSE) &&
822 (pState->state.linkageCount == 0))
823 {
824 pState->pfnProcessPrims = nullptr;
825 pState->state.linkageMask = 0;
826 }
827
828 if (pState->state.soState.rasterizerDisable == true)
829 {
830 pState->pfnProcessPrims = nullptr;
831 pState->state.linkageMask = 0;
832 }
833
834 // set up the frontend attrib mask
835 pState->state.feAttribMask = pState->state.linkageMask;
836 if (pState->state.soState.soEnable)
837 {
838 for (uint32_t i = 0; i < 4; ++i)
839 {
840 pState->state.feAttribMask |= pState->state.soState.streamMasks[i];
841 }
842 }
843
844 // complicated logic to test for cases where we don't need backing hottile memory for a draw
845 // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled.
846 pState->state.depthHottileEnable = ((!(pState->state.depthStencilState.depthTestEnable &&
847 !pState->state.depthStencilState.depthWriteEnable &&
848 pState->state.depthStencilState.depthTestFunc == ZFUNC_ALWAYS)) &&
849 (pState->state.depthStencilState.depthTestEnable ||
850 pState->state.depthStencilState.depthWriteEnable)) ? true : false;
851
852 pState->state.stencilHottileEnable = (((!(pState->state.depthStencilState.stencilTestEnable &&
853 !pState->state.depthStencilState.stencilWriteEnable &&
854 pState->state.depthStencilState.stencilTestFunc == ZFUNC_ALWAYS)) ||
855 // for stencil we have to check the double sided state as well
856 (!(pState->state.depthStencilState.doubleSidedStencilTestEnable &&
857 !pState->state.depthStencilState.stencilWriteEnable &&
858 pState->state.depthStencilState.backfaceStencilTestFunc == ZFUNC_ALWAYS))) &&
859 (pState->state.depthStencilState.stencilTestEnable ||
860 pState->state.depthStencilState.stencilWriteEnable)) ? true : false;
861
862 uint32_t numRTs = pState->state.psState.numRenderTargets;
863 pState->state.colorHottileEnable = 0;
864 if (psState.pfnPixelShader != nullptr)
865 {
866 for (uint32_t rt = 0; rt < numRTs; ++rt)
867 {
868 pState->state.colorHottileEnable |=
869 (!pState->state.blendState.renderTarget[rt].writeDisableAlpha ||
870 !pState->state.blendState.renderTarget[rt].writeDisableRed ||
871 !pState->state.blendState.renderTarget[rt].writeDisableGreen ||
872 !pState->state.blendState.renderTarget[rt].writeDisableBlue) ? (1 << rt) : 0;
873 }
874 }
875 }
876
877 //////////////////////////////////////////////////////////////////////////
878 /// @brief InitDraw
879 /// @param pDC - Draw context to initialize for this draw.
880 void InitDraw(
881 DRAW_CONTEXT *pDC,
882 bool isSplitDraw)
883 {
884 // We don't need to re-setup the scissors/pipeline state again for split draw.
885 if (isSplitDraw == false)
886 {
887 SetupMacroTileScissors(pDC);
888 SetupPipeline(pDC);
889 }
890 }
891
892 //////////////////////////////////////////////////////////////////////////
893 /// @brief We can split the draw for certain topologies for better performance.
894 /// @param totalVerts - Total vertices for draw
895 /// @param topology - Topology used for draw
896 uint32_t MaxVertsPerDraw(
897 DRAW_CONTEXT* pDC,
898 uint32_t totalVerts,
899 PRIMITIVE_TOPOLOGY topology)
900 {
901 API_STATE& state = pDC->pState->state;
902
903 uint32_t vertsPerDraw = totalVerts;
904
905 if (state.soState.soEnable)
906 {
907 return totalVerts;
908 }
909
910 switch (topology)
911 {
912 case TOP_POINT_LIST:
913 case TOP_TRIANGLE_LIST:
914 vertsPerDraw = KNOB_MAX_PRIMS_PER_DRAW;
915 break;
916
917 case TOP_PATCHLIST_1:
918 case TOP_PATCHLIST_2:
919 case TOP_PATCHLIST_3:
920 case TOP_PATCHLIST_4:
921 case TOP_PATCHLIST_5:
922 case TOP_PATCHLIST_6:
923 case TOP_PATCHLIST_7:
924 case TOP_PATCHLIST_8:
925 case TOP_PATCHLIST_9:
926 case TOP_PATCHLIST_10:
927 case TOP_PATCHLIST_11:
928 case TOP_PATCHLIST_12:
929 case TOP_PATCHLIST_13:
930 case TOP_PATCHLIST_14:
931 case TOP_PATCHLIST_15:
932 case TOP_PATCHLIST_16:
933 case TOP_PATCHLIST_17:
934 case TOP_PATCHLIST_18:
935 case TOP_PATCHLIST_19:
936 case TOP_PATCHLIST_20:
937 case TOP_PATCHLIST_21:
938 case TOP_PATCHLIST_22:
939 case TOP_PATCHLIST_23:
940 case TOP_PATCHLIST_24:
941 case TOP_PATCHLIST_25:
942 case TOP_PATCHLIST_26:
943 case TOP_PATCHLIST_27:
944 case TOP_PATCHLIST_28:
945 case TOP_PATCHLIST_29:
946 case TOP_PATCHLIST_30:
947 case TOP_PATCHLIST_31:
948 case TOP_PATCHLIST_32:
949 if (pDC->pState->state.tsState.tsEnable)
950 {
951 uint32_t vertsPerPrim = topology - TOP_PATCHLIST_BASE;
952 vertsPerDraw = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW;
953 }
954 break;
955
956 // The Primitive Assembly code can only handle 1 RECT at a time.
957 case TOP_RECT_LIST:
958 vertsPerDraw = 3;
959 break;
960
961 default:
962 // We are not splitting up draws for other topologies.
963 break;
964 }
965
966 return vertsPerDraw;
967 }
968
969 // Recursive template used to auto-nest conditionals. Converts dynamic boolean function
970 // arguments to static template arguments.
971 template <bool... ArgsB>
972 struct FEDrawChooser
973 {
974 // Last Arg Terminator
975 static PFN_FE_WORK_FUNC GetFunc(bool bArg)
976 {
977 if (bArg)
978 {
979 return ProcessDraw<ArgsB..., true>;
980 }
981
982 return ProcessDraw<ArgsB..., false>;
983 }
984
985 // Recursively parse args
986 template <typename... TArgsT>
987 static PFN_FE_WORK_FUNC GetFunc(bool bArg, TArgsT... remainingArgs)
988 {
989 if (bArg)
990 {
991 return FEDrawChooser<ArgsB..., true>::GetFunc(remainingArgs...);
992 }
993
994 return FEDrawChooser<ArgsB..., false>::GetFunc(remainingArgs...);
995 }
996 };
997
998 // Selector for correct templated Draw front-end function
999 INLINE
1000 static PFN_FE_WORK_FUNC GetFEDrawFunc(bool IsIndexed, bool HasTessellation, bool HasGeometryShader, bool HasStreamOut, bool RasterizerEnabled)
1001 {
1002 return FEDrawChooser<>::GetFunc(IsIndexed, HasTessellation, HasGeometryShader, HasStreamOut, RasterizerEnabled);
1003 }
1004
1005
1006 //////////////////////////////////////////////////////////////////////////
1007 /// @brief DrawInstanced
1008 /// @param hContext - Handle passed back from SwrCreateContext
1009 /// @param topology - Specifies topology for draw.
1010 /// @param numVerts - How many vertices to read sequentially from vertex data (per instance).
1011 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1012 /// @param numInstances - How many instances to render.
1013 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1014 void DrawInstanced(
1015 HANDLE hContext,
1016 PRIMITIVE_TOPOLOGY topology,
1017 uint32_t numVertices,
1018 uint32_t startVertex,
1019 uint32_t numInstances = 1,
1020 uint32_t startInstance = 0)
1021 {
1022 if (KNOB_TOSS_DRAW)
1023 {
1024 return;
1025 }
1026
1027 RDTSC_START(APIDraw);
1028
1029 SWR_CONTEXT *pContext = GetContext(hContext);
1030 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1031
1032 int32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
1033 uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw);
1034 int32_t remainingVerts = numVertices;
1035
1036 API_STATE *pState = &pDC->pState->state;
1037 pState->topology = topology;
1038 pState->forceFront = false;
1039
1040 // disable culling for points/lines
1041 uint32_t oldCullMode = pState->rastState.cullMode;
1042 if (topology == TOP_POINT_LIST)
1043 {
1044 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1045 pState->forceFront = true;
1046 }
1047
1048 int draw = 0;
1049 while (remainingVerts)
1050 {
1051 uint32_t numVertsForDraw = (remainingVerts < maxVertsPerDraw) ?
1052 remainingVerts : maxVertsPerDraw;
1053
1054 bool isSplitDraw = (draw > 0) ? true : false;
1055 DRAW_CONTEXT* pDC = GetDrawContext(pContext, isSplitDraw);
1056 InitDraw(pDC, isSplitDraw);
1057
1058 pDC->FeWork.type = DRAW;
1059 pDC->FeWork.pfnWork = GetFEDrawFunc(
1060 false, // IsIndexed
1061 pState->tsState.tsEnable,
1062 pState->gsState.gsEnable,
1063 pState->soState.soEnable,
1064 pDC->pState->pfnProcessPrims != nullptr);
1065 pDC->FeWork.desc.draw.numVerts = numVertsForDraw;
1066 pDC->FeWork.desc.draw.startVertex = startVertex;
1067 pDC->FeWork.desc.draw.numInstances = numInstances;
1068 pDC->FeWork.desc.draw.startInstance = startInstance;
1069 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
1070 pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw;
1071
1072 pDC->cleanupState = (remainingVerts == numVertsForDraw);
1073
1074 //enqueue DC
1075 QueueDraw(pContext);
1076
1077 remainingVerts -= numVertsForDraw;
1078 draw++;
1079 }
1080
1081 // restore culling state
1082 pDC = GetDrawContext(pContext);
1083 pDC->pState->state.rastState.cullMode = oldCullMode;
1084
1085 RDTSC_STOP(APIDraw, numVertices * numInstances, 0);
1086 }
1087
1088 //////////////////////////////////////////////////////////////////////////
1089 /// @brief SwrDraw
1090 /// @param hContext - Handle passed back from SwrCreateContext
1091 /// @param topology - Specifies topology for draw.
1092 /// @param startVertex - Specifies start vertex in vertex buffer for draw.
1093 /// @param primCount - Number of vertices.
1094 void SwrDraw(
1095 HANDLE hContext,
1096 PRIMITIVE_TOPOLOGY topology,
1097 uint32_t startVertex,
1098 uint32_t numVertices)
1099 {
1100 DrawInstanced(hContext, topology, numVertices, startVertex);
1101 }
1102
1103 //////////////////////////////////////////////////////////////////////////
1104 /// @brief SwrDrawInstanced
1105 /// @param hContext - Handle passed back from SwrCreateContext
1106 /// @param topology - Specifies topology for draw.
1107 /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
1108 /// @param numInstances - How many instances to render.
1109 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1110 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1111 void SwrDrawInstanced(
1112 HANDLE hContext,
1113 PRIMITIVE_TOPOLOGY topology,
1114 uint32_t numVertsPerInstance,
1115 uint32_t numInstances,
1116 uint32_t startVertex,
1117 uint32_t startInstance
1118 )
1119 {
1120 DrawInstanced(hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance);
1121 }
1122
1123 //////////////////////////////////////////////////////////////////////////
1124 /// @brief DrawIndexedInstanced
1125 /// @param hContext - Handle passed back from SwrCreateContext
1126 /// @param topology - Specifies topology for draw.
1127 /// @param numIndices - Number of indices to read sequentially from index buffer.
1128 /// @param indexOffset - Starting index into index buffer.
1129 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1130 /// @param numInstances - Number of instances to render.
1131 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1132 void DrawIndexedInstance(
1133 HANDLE hContext,
1134 PRIMITIVE_TOPOLOGY topology,
1135 uint32_t numIndices,
1136 uint32_t indexOffset,
1137 int32_t baseVertex,
1138 uint32_t numInstances = 1,
1139 uint32_t startInstance = 0)
1140 {
1141 if (KNOB_TOSS_DRAW)
1142 {
1143 return;
1144 }
1145
1146 RDTSC_START(APIDrawIndexed);
1147
1148 SWR_CONTEXT *pContext = GetContext(hContext);
1149 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1150 API_STATE* pState = &pDC->pState->state;
1151
1152 int32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
1153 uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw);
1154 int32_t remainingIndices = numIndices;
1155
1156 uint32_t indexSize = 0;
1157 switch (pState->indexBuffer.format)
1158 {
1159 case R32_UINT: indexSize = sizeof(uint32_t); break;
1160 case R16_UINT: indexSize = sizeof(uint16_t); break;
1161 case R8_UINT: indexSize = sizeof(uint8_t); break;
1162 default:
1163 SWR_ASSERT(0);
1164 }
1165
1166 int draw = 0;
1167 uint8_t *pIB = (uint8_t*)pState->indexBuffer.pIndices;
1168 pIB += (uint64_t)indexOffset * (uint64_t)indexSize;
1169
1170 pState->topology = topology;
1171 pState->forceFront = false;
1172
1173 // disable culling for points/lines
1174 uint32_t oldCullMode = pState->rastState.cullMode;
1175 if (topology == TOP_POINT_LIST)
1176 {
1177 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1178 pState->forceFront = true;
1179 }
1180
1181 while (remainingIndices)
1182 {
1183 uint32_t numIndicesForDraw = (remainingIndices < maxIndicesPerDraw) ?
1184 remainingIndices : maxIndicesPerDraw;
1185
1186 // When breaking up draw, we need to obtain new draw context for each iteration.
1187 bool isSplitDraw = (draw > 0) ? true : false;
1188 pDC = GetDrawContext(pContext, isSplitDraw);
1189 InitDraw(pDC, isSplitDraw);
1190
1191 pDC->FeWork.type = DRAW;
1192 pDC->FeWork.pfnWork = GetFEDrawFunc(
1193 true, // IsIndexed
1194 pState->tsState.tsEnable,
1195 pState->gsState.gsEnable,
1196 pState->soState.soEnable,
1197 pDC->pState->pfnProcessPrims != nullptr);
1198 pDC->FeWork.desc.draw.pDC = pDC;
1199 pDC->FeWork.desc.draw.numIndices = numIndicesForDraw;
1200 pDC->FeWork.desc.draw.pIB = (int*)pIB;
1201 pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format;
1202
1203 pDC->FeWork.desc.draw.numInstances = numInstances;
1204 pDC->FeWork.desc.draw.startInstance = startInstance;
1205 pDC->FeWork.desc.draw.baseVertex = baseVertex;
1206 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
1207
1208 pDC->cleanupState = (remainingIndices == numIndicesForDraw);
1209
1210 //enqueue DC
1211 QueueDraw(pContext);
1212
1213 pIB += maxIndicesPerDraw * indexSize;
1214 remainingIndices -= numIndicesForDraw;
1215 draw++;
1216 }
1217
1218 // restore culling state
1219 pDC = GetDrawContext(pContext);
1220 pDC->pState->state.rastState.cullMode = oldCullMode;
1221
1222 RDTSC_STOP(APIDrawIndexed, numIndices * numInstances, 0);
1223 }
1224
1225
1226 //////////////////////////////////////////////////////////////////////////
1227 /// @brief DrawIndexed
1228 /// @param hContext - Handle passed back from SwrCreateContext
1229 /// @param topology - Specifies topology for draw.
1230 /// @param numIndices - Number of indices to read sequentially from index buffer.
1231 /// @param indexOffset - Starting index into index buffer.
1232 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1233 void SwrDrawIndexed(
1234 HANDLE hContext,
1235 PRIMITIVE_TOPOLOGY topology,
1236 uint32_t numIndices,
1237 uint32_t indexOffset,
1238 int32_t baseVertex
1239 )
1240 {
1241 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex);
1242 }
1243
1244 //////////////////////////////////////////////////////////////////////////
1245 /// @brief SwrDrawIndexedInstanced
1246 /// @param hContext - Handle passed back from SwrCreateContext
1247 /// @param topology - Specifies topology for draw.
1248 /// @param numIndices - Number of indices to read sequentially from index buffer.
1249 /// @param numInstances - Number of instances to render.
1250 /// @param indexOffset - Starting index into index buffer.
1251 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1252 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1253 void SwrDrawIndexedInstanced(
1254 HANDLE hContext,
1255 PRIMITIVE_TOPOLOGY topology,
1256 uint32_t numIndices,
1257 uint32_t numInstances,
1258 uint32_t indexOffset,
1259 int32_t baseVertex,
1260 uint32_t startInstance)
1261 {
1262 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance);
1263 }
1264
1265 //////////////////////////////////////////////////////////////////////////
1266 /// @brief SwrInvalidateTiles
1267 /// @param hContext - Handle passed back from SwrCreateContext
1268 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate.
1269 void SwrInvalidateTiles(
1270 HANDLE hContext,
1271 uint32_t attachmentMask)
1272 {
1273 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1274 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1275
1276 pDC->FeWork.type = DISCARDINVALIDATETILES;
1277 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
1278 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
1279 memset(&pDC->FeWork.desc.discardInvalidateTiles.rect, 0, sizeof(SWR_RECT));
1280 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_INVALID;
1281 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false;
1282 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = false;
1283
1284 //enqueue
1285 QueueDraw(pContext);
1286 }
1287
1288 //////////////////////////////////////////////////////////////////////////
1289 /// @brief SwrDiscardRect
1290 /// @param hContext - Handle passed back from SwrCreateContext
1291 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
1292 /// @param rect - if rect is all zeros, the entire attachment surface will be discarded
1293 void SwrDiscardRect(
1294 HANDLE hContext,
1295 uint32_t attachmentMask,
1296 SWR_RECT rect)
1297 {
1298 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1299 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1300
1301 // Queue a load to the hottile
1302 pDC->FeWork.type = DISCARDINVALIDATETILES;
1303 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
1304 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
1305 pDC->FeWork.desc.discardInvalidateTiles.rect = rect;
1306 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_RESOLVED;
1307 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true;
1308 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = true;
1309
1310 //enqueue
1311 QueueDraw(pContext);
1312 }
1313
1314 //////////////////////////////////////////////////////////////////////////
1315 /// @brief SwrDispatch
1316 /// @param hContext - Handle passed back from SwrCreateContext
1317 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
1318 /// @param threadGroupCountY - Number of thread groups dispatched in Y direction
1319 /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
1320 void SwrDispatch(
1321 HANDLE hContext,
1322 uint32_t threadGroupCountX,
1323 uint32_t threadGroupCountY,
1324 uint32_t threadGroupCountZ)
1325 {
1326 if (KNOB_TOSS_DRAW)
1327 {
1328 return;
1329 }
1330
1331 RDTSC_START(APIDispatch);
1332 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1333 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1334
1335 pDC->isCompute = true; // This is a compute context.
1336
1337 // Ensure spill fill pointers are initialized to nullptr.
1338 memset(pDC->pSpillFill, 0, sizeof(pDC->pSpillFill));
1339
1340 COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64);
1341
1342 pTaskData->threadGroupCountX = threadGroupCountX;
1343 pTaskData->threadGroupCountY = threadGroupCountY;
1344 pTaskData->threadGroupCountZ = threadGroupCountZ;
1345
1346 uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
1347 pDC->pDispatch->initialize(totalThreadGroups, pTaskData);
1348
1349 QueueDispatch(pContext);
1350 RDTSC_STOP(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ, 0);
1351 }
1352
1353 // Deswizzles, converts and stores current contents of the hot tiles to surface
1354 // described by pState
1355 void SwrStoreTiles(
1356 HANDLE hContext,
1357 SWR_RENDERTARGET_ATTACHMENT attachment,
1358 SWR_TILE_STATE postStoreTileState)
1359 {
1360 RDTSC_START(APIStoreTiles);
1361
1362 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1363 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1364
1365 SetupMacroTileScissors(pDC);
1366
1367 pDC->FeWork.type = STORETILES;
1368 pDC->FeWork.pfnWork = ProcessStoreTiles;
1369 pDC->FeWork.desc.storeTiles.attachment = attachment;
1370 pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState;
1371
1372 //enqueue
1373 QueueDraw(pContext);
1374
1375 RDTSC_STOP(APIStoreTiles, 0, 0);
1376 }
1377
1378 void SwrClearRenderTarget(
1379 HANDLE hContext,
1380 uint32_t clearMask,
1381 const float clearColor[4],
1382 float z,
1383 uint8_t stencil)
1384 {
1385 RDTSC_START(APIClearRenderTarget);
1386
1387 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1388
1389 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1390
1391 SetupMacroTileScissors(pDC);
1392
1393 CLEAR_FLAGS flags;
1394 flags.mask = clearMask;
1395
1396 pDC->FeWork.type = CLEAR;
1397 pDC->FeWork.pfnWork = ProcessClear;
1398 pDC->FeWork.desc.clear.flags = flags;
1399 pDC->FeWork.desc.clear.clearDepth = z;
1400 pDC->FeWork.desc.clear.clearRTColor[0] = clearColor[0];
1401 pDC->FeWork.desc.clear.clearRTColor[1] = clearColor[1];
1402 pDC->FeWork.desc.clear.clearRTColor[2] = clearColor[2];
1403 pDC->FeWork.desc.clear.clearRTColor[3] = clearColor[3];
1404 pDC->FeWork.desc.clear.clearStencil = stencil;
1405
1406 // enqueue draw
1407 QueueDraw(pContext);
1408
1409 RDTSC_STOP(APIClearRenderTarget, 0, pDC->drawId);
1410 }
1411
1412 //////////////////////////////////////////////////////////////////////////
1413 /// @brief Returns a pointer to the private context state for the current
1414 /// draw operation. This is used for external componets such as the
1415 /// sampler.
1416 /// SWR is responsible for the allocation of the private context state.
1417 /// @param hContext - Handle passed back from SwrCreateContext
1418 VOID* SwrGetPrivateContextState(
1419 HANDLE hContext)
1420 {
1421 SWR_CONTEXT* pContext = GetContext(hContext);
1422 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1423 DRAW_STATE* pState = pDC->pState;
1424
1425 if (pState->pPrivateState == nullptr)
1426 {
1427 pState->pPrivateState = pState->pArena->AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float));
1428 }
1429
1430 return pState->pPrivateState;
1431 }
1432
1433 //////////////////////////////////////////////////////////////////////////
1434 /// @brief Clients can use this to allocate memory for draw/dispatch
1435 /// operations. The memory will automatically be freed once operation
1436 /// has completed. Client can use this to allocate binding tables,
1437 /// etc. needed for shader execution.
1438 /// @param hContext - Handle passed back from SwrCreateContext
1439 /// @param size - Size of allocation
1440 /// @param align - Alignment needed for allocation.
1441 VOID* SwrAllocDrawContextMemory(
1442 HANDLE hContext,
1443 uint32_t size,
1444 uint32_t align)
1445 {
1446 SWR_CONTEXT* pContext = GetContext(hContext);
1447 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1448
1449 return pDC->pState->pArena->AllocAligned(size, align);
1450 }
1451
1452 //////////////////////////////////////////////////////////////////////////
1453 /// @brief Returns pointer to SWR stats.
1454 /// @note The counters are atomically incremented by multiple threads.
1455 /// When calling this, you need to ensure all previous operations
1456 /// have completed.
1457 /// @todo If necessary, add a callback to avoid stalling the pipe to
1458 /// sample the counters.
1459 /// @param hContext - Handle passed back from SwrCreateContext
1460 /// @param pStats - SWR will fill this out for caller.
1461 void SwrGetStats(
1462 HANDLE hContext,
1463 SWR_STATS* pStats)
1464 {
1465 SWR_CONTEXT *pContext = GetContext(hContext);
1466 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1467
1468 pDC->FeWork.type = QUERYSTATS;
1469 pDC->FeWork.pfnWork = ProcessQueryStats;
1470 pDC->FeWork.desc.queryStats.pStats = pStats;
1471
1472 // cannot execute until all previous draws have completed
1473 pDC->dependency = pDC->drawId - 1;
1474
1475 //enqueue
1476 QueueDraw(pContext);
1477 }
1478
1479 //////////////////////////////////////////////////////////////////////////
1480 /// @brief Enables stats counting
1481 /// @param hContext - Handle passed back from SwrCreateContext
1482 /// @param enable - If true then counts are incremented.
1483 void SwrEnableStats(
1484 HANDLE hContext,
1485 bool enable)
1486 {
1487 SWR_CONTEXT *pContext = GetContext(hContext);
1488 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1489
1490 pDC->pState->state.enableStats = enable;
1491 }
1492
1493 //////////////////////////////////////////////////////////////////////////
1494 /// @brief Mark end of frame - used for performance profiling
1495 /// @param hContext - Handle passed back from SwrCreateContext
1496 void SWR_API SwrEndFrame(
1497 HANDLE hContext)
1498 {
1499 RDTSC_ENDFRAME();
1500 }