swr: [rasterizer] more arena work
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / api.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file api.cpp
24 *
25 * @brief API implementation
26 *
27 ******************************************************************************/
28
29 #include <cfloat>
30 #include <cmath>
31 #include <cstdio>
32
33 #include "core/api.h"
34 #include "core/backend.h"
35 #include "core/context.h"
36 #include "core/frontend.h"
37 #include "core/rasterizer.h"
38 #include "core/rdtsc_core.h"
39 #include "core/threads.h"
40 #include "core/tilemgr.h"
41 #include "core/clip.h"
42
43 #include "common/simdintrin.h"
44 #include "common/os.h"
45
46 void SetupDefaultState(SWR_CONTEXT *pContext);
47
48 //////////////////////////////////////////////////////////////////////////
49 /// @brief Create SWR Context.
50 /// @param pCreateInfo - pointer to creation info.
51 HANDLE SwrCreateContext(
52 SWR_CREATECONTEXT_INFO* pCreateInfo)
53 {
54 RDTSC_RESET();
55 RDTSC_INIT(0);
56
57 void* pContextMem = _aligned_malloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4);
58 memset(pContextMem, 0, sizeof(SWR_CONTEXT));
59 SWR_CONTEXT *pContext = new (pContextMem) SWR_CONTEXT();
60
61 pContext->driverType = pCreateInfo->driver;
62 pContext->privateStateSize = pCreateInfo->privateStateSize;
63
64 pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
65 pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
66
67 for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
68 {
69 pContext->dcRing[dc].pArena = new Arena();
70 pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena));
71 pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen.
72
73 pContext->dsRing[dc].pArena = new Arena();
74 }
75
76 if (!KNOB_SINGLE_THREADED)
77 {
78 memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
79 memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
80 new (&pContext->WaitLock) std::mutex();
81 new (&pContext->FifosNotEmpty) std::condition_variable();
82
83 CreateThreadPool(pContext, &pContext->threadPool);
84 }
85
86 // Calling createThreadPool() above can set SINGLE_THREADED
87 if (KNOB_SINGLE_THREADED)
88 {
89 pContext->NumWorkerThreads = 1;
90 }
91
92 // Allocate scratch space for workers.
93 ///@note We could lazily allocate this but its rather small amount of memory.
94 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
95 {
96 ///@todo Use numa API for allocations using numa information from thread data (if exists).
97 pContext->pScratch[i] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH * 4);
98 }
99
100 // State setup AFTER context is fully initialized
101 SetupDefaultState(pContext);
102
103 // initialize hot tile manager
104 pContext->pHotTileMgr = new HotTileMgr();
105
106 // initialize function pointer tables
107 InitClearTilesTable();
108
109 // initialize store tiles function
110 pContext->pfnLoadTile = pCreateInfo->pfnLoadTile;
111 pContext->pfnStoreTile = pCreateInfo->pfnStoreTile;
112 pContext->pfnClearTile = pCreateInfo->pfnClearTile;
113
114 // pass pointer to bucket manager back to caller
115 #ifdef KNOB_ENABLE_RDTSC
116 pCreateInfo->pBucketMgr = &gBucketMgr;
117 #endif
118
119 pCreateInfo->contextSaveSize = sizeof(API_STATE);
120
121 return (HANDLE)pContext;
122 }
123
124 void SwrDestroyContext(HANDLE hContext)
125 {
126 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
127 DestroyThreadPool(pContext, &pContext->threadPool);
128
129 // free the fifos
130 for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i)
131 {
132 delete pContext->dcRing[i].pArena;
133 delete pContext->dsRing[i].pArena;
134 delete(pContext->dcRing[i].pTileMgr);
135 delete(pContext->dcRing[i].pDispatch);
136 }
137
138 // Free scratch space.
139 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
140 {
141 _aligned_free(pContext->pScratch[i]);
142 }
143
144 delete(pContext->pHotTileMgr);
145
146 pContext->~SWR_CONTEXT();
147 _aligned_free((SWR_CONTEXT*)hContext);
148 }
149
150 void CopyState(DRAW_STATE& dst, const DRAW_STATE& src)
151 {
152 memcpy(&dst.state, &src.state, sizeof(API_STATE));
153 }
154
155 void WakeAllThreads(SWR_CONTEXT *pContext)
156 {
157 pContext->FifosNotEmpty.notify_all();
158 }
159
160 template<bool IsDraw>
161 void QueueWork(SWR_CONTEXT *pContext)
162 {
163 if (IsDraw)
164 {
165 // Each worker thread looks at a DC for both FE and BE work at different times and so we
166 // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers
167 // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
168 // then moved on if all work is done.)
169 pContext->pCurDrawContext->threadsDone =
170 pContext->NumWorkerThreads ? pContext->NumWorkerThreads * 2 : 2;
171 }
172 else
173 {
174 pContext->pCurDrawContext->threadsDone =
175 pContext->NumWorkerThreads ? pContext->NumWorkerThreads : 1;
176 }
177
178 _ReadWriteBarrier();
179 {
180 std::unique_lock<std::mutex> lock(pContext->WaitLock);
181 pContext->dcRing.Enqueue();
182 }
183
184 if (KNOB_SINGLE_THREADED)
185 {
186 // flush denormals to 0
187 uint32_t mxcsr = _mm_getcsr();
188 _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
189
190 if (IsDraw)
191 {
192 static TileSet lockedTiles;
193 uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
194 WorkOnFifoFE(pContext, 0, curDraw[0], 0);
195 WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles);
196 }
197 else
198 {
199 uint64_t curDispatch = pContext->pCurDrawContext->drawId;
200 WorkOnCompute(pContext, 0, curDispatch);
201 }
202
203 // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
204 if (!pContext->dcRing.IsEmpty())
205 {
206 pContext->dcRing.Dequeue();
207 }
208
209 // restore csr
210 _mm_setcsr(mxcsr);
211 }
212 else
213 {
214 RDTSC_START(APIDrawWakeAllThreads);
215 WakeAllThreads(pContext);
216 RDTSC_STOP(APIDrawWakeAllThreads, 1, 0);
217 }
218
219 // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
220 pContext->pPrevDrawContext = pContext->pCurDrawContext;
221 pContext->pCurDrawContext = nullptr;
222 }
223
224 INLINE void QueueDraw(SWR_CONTEXT* pContext)
225 {
226 QueueWork<true>(pContext);
227 }
228
229 INLINE void QueueDispatch(SWR_CONTEXT* pContext)
230 {
231 QueueWork<false>(pContext);
232 }
233
234 DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
235 {
236 RDTSC_START(APIGetDrawContext);
237 // If current draw context is null then need to obtain a new draw context to use from ring.
238 if (pContext->pCurDrawContext == nullptr)
239 {
240 // Need to wait for a free entry.
241 while (pContext->dcRing.IsFull())
242 {
243 _mm_pause();
244 }
245
246 uint32_t dcIndex = pContext->dcRing.GetHead() % KNOB_MAX_DRAWS_IN_FLIGHT;
247
248 DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
249 pContext->pCurDrawContext = pCurDrawContext;
250
251 // Assign next available entry in DS ring to this DC.
252 uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
253 pCurDrawContext->pState = &pContext->dsRing[dsIndex];
254
255 Arena& stateArena = *(pCurDrawContext->pState->pArena);
256
257 // Copy previous state to current state.
258 if (pContext->pPrevDrawContext)
259 {
260 DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext;
261
262 // If we're splitting our draw then we can just use the same state from the previous
263 // draw. In this case, we won't increment the DS ring index so the next non-split
264 // draw can receive the state.
265 if (isSplitDraw == false)
266 {
267 CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState);
268
269 stateArena.Reset(true); // Reset memory.
270 pCurDrawContext->pState->pPrivateState = nullptr;
271
272 pContext->curStateId++; // Progress state ring index forward.
273 }
274 else
275 {
276 // If its a split draw then just copy the state pointer over
277 // since its the same draw.
278 pCurDrawContext->pState = pPrevDrawContext->pState;
279 }
280 }
281 else
282 {
283 stateArena.Reset(); // Reset memory.
284 pContext->curStateId++; // Progress state ring index forward.
285 }
286
287 pCurDrawContext->dependency = 0;
288 pCurDrawContext->pArena->Reset();
289 pCurDrawContext->pContext = pContext;
290 pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
291
292 pCurDrawContext->doneFE = false;
293 pCurDrawContext->FeLock = 0;
294 pCurDrawContext->threadsDone = 0;
295
296 pCurDrawContext->pTileMgr->initialize();
297
298 // Assign unique drawId for this DC
299 pCurDrawContext->drawId = pContext->dcRing.GetHead();
300 }
301 else
302 {
303 SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC");
304 }
305
306 RDTSC_STOP(APIGetDrawContext, 0, 0);
307 return pContext->pCurDrawContext;
308 }
309
310 API_STATE* GetDrawState(SWR_CONTEXT *pContext)
311 {
312 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
313 SWR_ASSERT(pDC->pState != nullptr);
314
315 return &pDC->pState->state;
316 }
317
318 void SWR_API SwrSaveState(
319 HANDLE hContext,
320 void* pOutputStateBlock,
321 size_t memSize)
322 {
323 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
324 auto pSrc = GetDrawState(pContext);
325 SWR_ASSERT(pOutputStateBlock && memSize >= sizeof(*pSrc));
326
327 memcpy(pOutputStateBlock, pSrc, sizeof(*pSrc));
328 }
329
330 void SWR_API SwrRestoreState(
331 HANDLE hContext,
332 const void* pStateBlock,
333 size_t memSize)
334 {
335 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
336 auto pDst = GetDrawState(pContext);
337 SWR_ASSERT(pStateBlock && memSize >= sizeof(*pDst));
338
339 memcpy(pDst, pStateBlock, sizeof(*pDst));
340 }
341
342 void SetupDefaultState(SWR_CONTEXT *pContext)
343 {
344 API_STATE* pState = GetDrawState(pContext);
345
346 pState->rastState.cullMode = SWR_CULLMODE_NONE;
347 pState->rastState.frontWinding = SWR_FRONTWINDING_CCW;
348 }
349
350 static INLINE SWR_CONTEXT* GetContext(HANDLE hContext)
351 {
352 return (SWR_CONTEXT*)hContext;
353 }
354
355 void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint64_t userData2, uint64_t userData3)
356 {
357 RDTSC_START(APISync);
358
359 SWR_ASSERT(pfnFunc != nullptr);
360
361 SWR_CONTEXT *pContext = GetContext(hContext);
362 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
363
364 pDC->FeWork.type = SYNC;
365 pDC->FeWork.pfnWork = ProcessSync;
366 pDC->FeWork.desc.sync.pfnCallbackFunc = pfnFunc;
367 pDC->FeWork.desc.sync.userData = userData;
368 pDC->FeWork.desc.sync.userData2 = userData2;
369 pDC->FeWork.desc.sync.userData3 = userData3;
370
371 // cannot execute until all previous draws have completed
372 pDC->dependency = pDC->drawId - 1;
373
374 //enqueue
375 QueueDraw(pContext);
376
377 RDTSC_STOP(APISync, 1, 0);
378 }
379
380 void SwrWaitForIdle(HANDLE hContext)
381 {
382 SWR_CONTEXT *pContext = GetContext(hContext);
383
384 RDTSC_START(APIWaitForIdle);
385
386 while (!pContext->dcRing.IsEmpty())
387 {
388 _mm_pause();
389 }
390
391 RDTSC_STOP(APIWaitForIdle, 1, 0);
392 }
393
394 void SwrSetVertexBuffers(
395 HANDLE hContext,
396 uint32_t numBuffers,
397 const SWR_VERTEX_BUFFER_STATE* pVertexBuffers)
398 {
399 API_STATE* pState = GetDrawState(GetContext(hContext));
400
401 for (uint32_t i = 0; i < numBuffers; ++i)
402 {
403 const SWR_VERTEX_BUFFER_STATE *pVB = &pVertexBuffers[i];
404 pState->vertexBuffers[pVB->index] = *pVB;
405 }
406 }
407
408 void SwrSetIndexBuffer(
409 HANDLE hContext,
410 const SWR_INDEX_BUFFER_STATE* pIndexBuffer)
411 {
412 API_STATE* pState = GetDrawState(GetContext(hContext));
413
414 pState->indexBuffer = *pIndexBuffer;
415 }
416
417 void SwrSetFetchFunc(
418 HANDLE hContext,
419 PFN_FETCH_FUNC pfnFetchFunc)
420 {
421 API_STATE* pState = GetDrawState(GetContext(hContext));
422
423 pState->pfnFetchFunc = pfnFetchFunc;
424 }
425
426 void SwrSetSoFunc(
427 HANDLE hContext,
428 PFN_SO_FUNC pfnSoFunc,
429 uint32_t streamIndex)
430 {
431 API_STATE* pState = GetDrawState(GetContext(hContext));
432
433 SWR_ASSERT(streamIndex < MAX_SO_STREAMS);
434
435 pState->pfnSoFunc[streamIndex] = pfnSoFunc;
436 }
437
438 void SwrSetSoState(
439 HANDLE hContext,
440 SWR_STREAMOUT_STATE* pSoState)
441 {
442 API_STATE* pState = GetDrawState(GetContext(hContext));
443
444 pState->soState = *pSoState;
445 }
446
447 void SwrSetSoBuffers(
448 HANDLE hContext,
449 SWR_STREAMOUT_BUFFER* pSoBuffer,
450 uint32_t slot)
451 {
452 API_STATE* pState = GetDrawState(GetContext(hContext));
453
454 SWR_ASSERT((slot < 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot);
455
456 pState->soBuffer[slot] = *pSoBuffer;
457 }
458
459 void SwrSetVertexFunc(
460 HANDLE hContext,
461 PFN_VERTEX_FUNC pfnVertexFunc)
462 {
463 API_STATE* pState = GetDrawState(GetContext(hContext));
464
465 pState->pfnVertexFunc = pfnVertexFunc;
466 }
467
468 void SwrSetFrontendState(
469 HANDLE hContext,
470 SWR_FRONTEND_STATE *pFEState)
471 {
472 API_STATE* pState = GetDrawState(GetContext(hContext));
473 pState->frontendState = *pFEState;
474 }
475
476 void SwrSetGsState(
477 HANDLE hContext,
478 SWR_GS_STATE *pGSState)
479 {
480 API_STATE* pState = GetDrawState(GetContext(hContext));
481 pState->gsState = *pGSState;
482 }
483
484 void SwrSetGsFunc(
485 HANDLE hContext,
486 PFN_GS_FUNC pfnGsFunc)
487 {
488 API_STATE* pState = GetDrawState(GetContext(hContext));
489 pState->pfnGsFunc = pfnGsFunc;
490 }
491
492 void SwrSetCsFunc(
493 HANDLE hContext,
494 PFN_CS_FUNC pfnCsFunc,
495 uint32_t totalThreadsInGroup)
496 {
497 API_STATE* pState = GetDrawState(GetContext(hContext));
498 pState->pfnCsFunc = pfnCsFunc;
499 pState->totalThreadsInGroup = totalThreadsInGroup;
500 }
501
502 void SwrSetTsState(
503 HANDLE hContext,
504 SWR_TS_STATE *pState)
505 {
506 API_STATE* pApiState = GetDrawState(GetContext(hContext));
507 pApiState->tsState = *pState;
508 }
509
510 void SwrSetHsFunc(
511 HANDLE hContext,
512 PFN_HS_FUNC pfnFunc)
513 {
514 API_STATE* pApiState = GetDrawState(GetContext(hContext));
515 pApiState->pfnHsFunc = pfnFunc;
516 }
517
518 void SwrSetDsFunc(
519 HANDLE hContext,
520 PFN_DS_FUNC pfnFunc)
521 {
522 API_STATE* pApiState = GetDrawState(GetContext(hContext));
523 pApiState->pfnDsFunc = pfnFunc;
524 }
525
526 void SwrSetDepthStencilState(
527 HANDLE hContext,
528 SWR_DEPTH_STENCIL_STATE *pDSState)
529 {
530 API_STATE* pState = GetDrawState(GetContext(hContext));
531
532 pState->depthStencilState = *pDSState;
533 }
534
535 void SwrSetBackendState(
536 HANDLE hContext,
537 SWR_BACKEND_STATE *pBEState)
538 {
539 API_STATE* pState = GetDrawState(GetContext(hContext));
540
541 pState->backendState = *pBEState;
542 }
543
544 void SwrSetPixelShaderState(
545 HANDLE hContext,
546 SWR_PS_STATE *pPSState)
547 {
548 API_STATE *pState = GetDrawState(GetContext(hContext));
549 pState->psState = *pPSState;
550 }
551
552 void SwrSetBlendState(
553 HANDLE hContext,
554 SWR_BLEND_STATE *pBlendState)
555 {
556 API_STATE *pState = GetDrawState(GetContext(hContext));
557 memcpy(&pState->blendState, pBlendState, sizeof(SWR_BLEND_STATE));
558 }
559
560 void SwrSetBlendFunc(
561 HANDLE hContext,
562 uint32_t renderTarget,
563 PFN_BLEND_JIT_FUNC pfnBlendFunc)
564 {
565 SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS);
566 API_STATE *pState = GetDrawState(GetContext(hContext));
567 pState->pfnBlendFunc[renderTarget] = pfnBlendFunc;
568 }
569
570 void SwrSetLinkage(
571 HANDLE hContext,
572 uint32_t mask,
573 const uint8_t* pMap)
574 {
575 API_STATE* pState = GetDrawState(GetContext(hContext));
576
577 static const uint8_t IDENTITY_MAP[] =
578 {
579 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
580 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
581 };
582 static_assert(sizeof(IDENTITY_MAP) == sizeof(pState->linkageMap),
583 "Update for new value of MAX_ATTRIBUTES");
584
585 pState->linkageMask = mask;
586 pState->linkageCount = _mm_popcnt_u32(mask);
587
588 if (!pMap)
589 {
590 pMap = IDENTITY_MAP;
591 }
592 memcpy(pState->linkageMap, pMap, pState->linkageCount);
593 }
594
595 // update guardband multipliers for the viewport
596 void updateGuardband(API_STATE *pState)
597 {
598 // guardband center is viewport center
599 pState->gbState.left = KNOB_GUARDBAND_WIDTH / pState->vp[0].width;
600 pState->gbState.right = KNOB_GUARDBAND_WIDTH / pState->vp[0].width;
601 pState->gbState.top = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height;
602 pState->gbState.bottom = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height;
603 }
604
605 void SwrSetRastState(
606 HANDLE hContext,
607 const SWR_RASTSTATE *pRastState)
608 {
609 SWR_CONTEXT *pContext = GetContext(hContext);
610 API_STATE* pState = GetDrawState(pContext);
611
612 memcpy(&pState->rastState, pRastState, sizeof(SWR_RASTSTATE));
613 }
614
615 void SwrSetViewports(
616 HANDLE hContext,
617 uint32_t numViewports,
618 const SWR_VIEWPORT* pViewports,
619 const SWR_VIEWPORT_MATRIX* pMatrices)
620 {
621 SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS,
622 "Invalid number of viewports.");
623
624 SWR_CONTEXT *pContext = GetContext(hContext);
625 API_STATE* pState = GetDrawState(pContext);
626
627 memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports);
628
629 if (pMatrices != nullptr)
630 {
631 memcpy(&pState->vpMatrix[0], pMatrices, sizeof(SWR_VIEWPORT_MATRIX) * numViewports);
632 }
633 else
634 {
635 // Compute default viewport transform.
636 for (uint32_t i = 0; i < numViewports; ++i)
637 {
638 if (pContext->driverType == DX)
639 {
640 pState->vpMatrix[i].m00 = pState->vp[i].width / 2.0f;
641 pState->vpMatrix[i].m11 = -pState->vp[i].height / 2.0f;
642 pState->vpMatrix[i].m22 = pState->vp[i].maxZ - pState->vp[i].minZ;
643 pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00;
644 pState->vpMatrix[i].m31 = pState->vp[i].y - pState->vpMatrix[i].m11;
645 pState->vpMatrix[i].m32 = pState->vp[i].minZ;
646 }
647 else
648 {
649 // Standard, with the exception that Y is inverted.
650 pState->vpMatrix[i].m00 = (pState->vp[i].width - pState->vp[i].x) / 2.0f;
651 pState->vpMatrix[i].m11 = (pState->vp[i].y - pState->vp[i].height) / 2.0f;
652 pState->vpMatrix[i].m22 = (pState->vp[i].maxZ - pState->vp[i].minZ) / 2.0f;
653 pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00;
654 pState->vpMatrix[i].m31 = pState->vp[i].height + pState->vpMatrix[i].m11;
655 pState->vpMatrix[i].m32 = pState->vp[i].minZ + pState->vpMatrix[i].m22;
656
657 // Now that the matrix is calculated, clip the view coords to screen size.
658 // OpenGL allows for -ve x,y in the viewport.
659 pState->vp[i].x = std::max(pState->vp[i].x, 0.0f);
660 pState->vp[i].y = std::max(pState->vp[i].y, 0.0f);
661 }
662 }
663 }
664
665 updateGuardband(pState);
666 }
667
668 void SwrSetScissorRects(
669 HANDLE hContext,
670 uint32_t numScissors,
671 const BBOX* pScissors)
672 {
673 SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS,
674 "Invalid number of scissor rects.");
675
676 API_STATE* pState = GetDrawState(GetContext(hContext));
677 memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(BBOX));
678 };
679
680 void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
681 {
682 API_STATE *pState = &pDC->pState->state;
683 uint32_t left, right, top, bottom;
684
685 // Set up scissor dimensions based on scissor or viewport
686 if (pState->rastState.scissorEnable)
687 {
688 // scissor rect right/bottom edge are exclusive, core expects scissor dimensions to be inclusive, so subtract one pixel from right/bottom edges
689 left = pState->scissorRects[0].left;
690 right = pState->scissorRects[0].right;
691 top = pState->scissorRects[0].top;
692 bottom = pState->scissorRects[0].bottom;
693 }
694 else
695 {
696 left = (int32_t)pState->vp[0].x;
697 right = (int32_t)pState->vp[0].x + (int32_t)pState->vp[0].width;
698 top = (int32_t)pState->vp[0].y;
699 bottom = (int32_t)pState->vp[0].y + (int32_t)pState->vp[0].height;
700 }
701
702 right = std::min<uint32_t>(right, KNOB_MAX_SCISSOR_X);
703 bottom = std::min<uint32_t>(bottom, KNOB_MAX_SCISSOR_Y);
704
705 if (left > KNOB_MAX_SCISSOR_X || top > KNOB_MAX_SCISSOR_Y)
706 {
707 pState->scissorInFixedPoint.left = 0;
708 pState->scissorInFixedPoint.right = 0;
709 pState->scissorInFixedPoint.top = 0;
710 pState->scissorInFixedPoint.bottom = 0;
711 }
712 else
713 {
714 pState->scissorInFixedPoint.left = left * FIXED_POINT_SCALE;
715 pState->scissorInFixedPoint.right = right * FIXED_POINT_SCALE - 1;
716 pState->scissorInFixedPoint.top = top * FIXED_POINT_SCALE;
717 pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1;
718 }
719 }
720 // templated backend function tables
721 extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
722 extern PFN_BACKEND_FUNC gBackendSingleSample[2][2];
723 extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2];
724 extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2];
725 extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS + 1][SWR_MULTISAMPLE_TYPE_MAX];
726 extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2];
727 extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2];
728 extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2];
729 void SetupPipeline(DRAW_CONTEXT *pDC)
730 {
731 DRAW_STATE* pState = pDC->pState;
732 const SWR_RASTSTATE &rastState = pState->state.rastState;
733 const SWR_PS_STATE &psState = pState->state.psState;
734 BACKEND_FUNCS& backendFuncs = pState->backendFuncs;
735 const uint32_t forcedSampleCount = (rastState.bForcedSampleCount) ? 1 : 0;
736
737 // setup backend
738 if (psState.pfnPixelShader == nullptr)
739 {
740 backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount];
741 // always need to generate I & J per sample for Z interpolation
742 backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[1];
743 }
744 else
745 {
746 const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.bForcedSampleCount) ? 1 : 0;
747 const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
748
749 // currently only support 'normal' input coverage
750 SWR_ASSERT(psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL ||
751 psState.inputCoverage == SWR_INPUT_COVERAGE_NONE);
752
753 SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask;
754
755 // select backend function
756 switch(psState.shadingRate)
757 {
758 case SWR_SHADING_RATE_PIXEL:
759 if(bMultisampleEnable)
760 {
761 // always need to generate I & J per sample for Z interpolation
762 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
763 backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount];
764 backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount];
765 }
766 else
767 {
768 // always need to generate I & J per pixel for Z interpolation
769 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK);
770 backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid];
771 backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][SWR_MULTISAMPLE_1X];
772 }
773 break;
774 case SWR_SHADING_RATE_SAMPLE:
775 SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN);
776 // always need to generate I & J per sample for Z interpolation
777 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
778 backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid];
779 backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount];
780 break;
781 default:
782 SWR_ASSERT(0 && "Invalid shading rate");
783 break;
784 }
785
786 // setup pointer to function that generates necessary barycentrics required by the PS
787 bool bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_PER_PIXEL_MASK) > 0 ? 1 : 0;
788 backendFuncs.pfnCalcPixelBarycentrics = gPixelBarycentricTable[bBarycentrics];
789
790 bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_PER_SAMPLE_MASK) > 0 ? 1 : 0;
791 backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[bBarycentrics];
792
793 bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0 ? 1 : 0;
794 backendFuncs.pfnCalcCentroidBarycentrics = gCentroidBarycentricTable[rastState.sampleCount][bBarycentrics][rastState.samplePattern][forcedSampleCount];
795 }
796
797 PFN_PROCESS_PRIMS pfnBinner;
798 switch (pState->state.topology)
799 {
800 case TOP_POINT_LIST:
801 pState->pfnProcessPrims = ClipPoints;
802 pfnBinner = BinPoints;
803 break;
804 case TOP_LINE_LIST:
805 case TOP_LINE_STRIP:
806 case TOP_LINE_LOOP:
807 case TOP_LINE_LIST_ADJ:
808 case TOP_LISTSTRIP_ADJ:
809 pState->pfnProcessPrims = ClipLines;
810 pfnBinner = BinLines;
811 break;
812 default:
813 pState->pfnProcessPrims = ClipTriangles;
814 pfnBinner = BinTriangles;
815 break;
816 };
817
818 // disable clipper if viewport transform is disabled
819 if (pState->state.frontendState.vpTransformDisable)
820 {
821 pState->pfnProcessPrims = pfnBinner;
822 }
823
824 if ((pState->state.psState.pfnPixelShader == nullptr) &&
825 (pState->state.depthStencilState.depthTestEnable == FALSE) &&
826 (pState->state.depthStencilState.depthWriteEnable == FALSE) &&
827 (pState->state.depthStencilState.stencilTestEnable == FALSE) &&
828 (pState->state.depthStencilState.stencilWriteEnable == FALSE) &&
829 (pState->state.linkageCount == 0))
830 {
831 pState->pfnProcessPrims = nullptr;
832 pState->state.linkageMask = 0;
833 }
834
835 if (pState->state.soState.rasterizerDisable == true)
836 {
837 pState->pfnProcessPrims = nullptr;
838 pState->state.linkageMask = 0;
839 }
840
841 // set up the frontend attrib mask
842 pState->state.feAttribMask = pState->state.linkageMask;
843 if (pState->state.soState.soEnable)
844 {
845 for (uint32_t i = 0; i < 4; ++i)
846 {
847 pState->state.feAttribMask |= pState->state.soState.streamMasks[i];
848 }
849 }
850
851 // complicated logic to test for cases where we don't need backing hottile memory for a draw
852 // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled.
853 pState->state.depthHottileEnable = ((!(pState->state.depthStencilState.depthTestEnable &&
854 !pState->state.depthStencilState.depthWriteEnable &&
855 pState->state.depthStencilState.depthTestFunc == ZFUNC_ALWAYS)) &&
856 (pState->state.depthStencilState.depthTestEnable ||
857 pState->state.depthStencilState.depthWriteEnable)) ? true : false;
858
859 pState->state.stencilHottileEnable = (((!(pState->state.depthStencilState.stencilTestEnable &&
860 !pState->state.depthStencilState.stencilWriteEnable &&
861 pState->state.depthStencilState.stencilTestFunc == ZFUNC_ALWAYS)) ||
862 // for stencil we have to check the double sided state as well
863 (!(pState->state.depthStencilState.doubleSidedStencilTestEnable &&
864 !pState->state.depthStencilState.stencilWriteEnable &&
865 pState->state.depthStencilState.backfaceStencilTestFunc == ZFUNC_ALWAYS))) &&
866 (pState->state.depthStencilState.stencilTestEnable ||
867 pState->state.depthStencilState.stencilWriteEnable)) ? true : false;
868
869 uint32_t numRTs = pState->state.psState.numRenderTargets;
870 pState->state.colorHottileEnable = 0;
871 if (psState.pfnPixelShader != nullptr)
872 {
873 for (uint32_t rt = 0; rt < numRTs; ++rt)
874 {
875 pState->state.colorHottileEnable |=
876 (!pState->state.blendState.renderTarget[rt].writeDisableAlpha ||
877 !pState->state.blendState.renderTarget[rt].writeDisableRed ||
878 !pState->state.blendState.renderTarget[rt].writeDisableGreen ||
879 !pState->state.blendState.renderTarget[rt].writeDisableBlue) ? (1 << rt) : 0;
880 }
881 }
882 }
883
884 //////////////////////////////////////////////////////////////////////////
885 /// @brief InitDraw
886 /// @param pDC - Draw context to initialize for this draw.
887 void InitDraw(
888 DRAW_CONTEXT *pDC,
889 bool isSplitDraw)
890 {
891 // We don't need to re-setup the scissors/pipeline state again for split draw.
892 if (isSplitDraw == false)
893 {
894 SetupMacroTileScissors(pDC);
895 SetupPipeline(pDC);
896 }
897 }
898
899 //////////////////////////////////////////////////////////////////////////
900 /// @brief We can split the draw for certain topologies for better performance.
901 /// @param totalVerts - Total vertices for draw
902 /// @param topology - Topology used for draw
903 uint32_t MaxVertsPerDraw(
904 DRAW_CONTEXT* pDC,
905 uint32_t totalVerts,
906 PRIMITIVE_TOPOLOGY topology)
907 {
908 API_STATE& state = pDC->pState->state;
909
910 uint32_t vertsPerDraw = totalVerts;
911
912 if (state.soState.soEnable)
913 {
914 return totalVerts;
915 }
916
917 switch (topology)
918 {
919 case TOP_POINT_LIST:
920 case TOP_TRIANGLE_LIST:
921 vertsPerDraw = KNOB_MAX_PRIMS_PER_DRAW;
922 break;
923
924 case TOP_PATCHLIST_1:
925 case TOP_PATCHLIST_2:
926 case TOP_PATCHLIST_3:
927 case TOP_PATCHLIST_4:
928 case TOP_PATCHLIST_5:
929 case TOP_PATCHLIST_6:
930 case TOP_PATCHLIST_7:
931 case TOP_PATCHLIST_8:
932 case TOP_PATCHLIST_9:
933 case TOP_PATCHLIST_10:
934 case TOP_PATCHLIST_11:
935 case TOP_PATCHLIST_12:
936 case TOP_PATCHLIST_13:
937 case TOP_PATCHLIST_14:
938 case TOP_PATCHLIST_15:
939 case TOP_PATCHLIST_16:
940 case TOP_PATCHLIST_17:
941 case TOP_PATCHLIST_18:
942 case TOP_PATCHLIST_19:
943 case TOP_PATCHLIST_20:
944 case TOP_PATCHLIST_21:
945 case TOP_PATCHLIST_22:
946 case TOP_PATCHLIST_23:
947 case TOP_PATCHLIST_24:
948 case TOP_PATCHLIST_25:
949 case TOP_PATCHLIST_26:
950 case TOP_PATCHLIST_27:
951 case TOP_PATCHLIST_28:
952 case TOP_PATCHLIST_29:
953 case TOP_PATCHLIST_30:
954 case TOP_PATCHLIST_31:
955 case TOP_PATCHLIST_32:
956 if (pDC->pState->state.tsState.tsEnable)
957 {
958 uint32_t vertsPerPrim = topology - TOP_PATCHLIST_BASE;
959 vertsPerDraw = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW;
960 }
961 break;
962
963 // The Primitive Assembly code can only handle 1 RECT at a time.
964 case TOP_RECT_LIST:
965 vertsPerDraw = 3;
966 break;
967
968 default:
969 // We are not splitting up draws for other topologies.
970 break;
971 }
972
973 return vertsPerDraw;
974 }
975
976 // Recursive template used to auto-nest conditionals. Converts dynamic boolean function
977 // arguments to static template arguments.
978 template <bool... ArgsB>
979 struct FEDrawChooser
980 {
981 // Last Arg Terminator
982 static PFN_FE_WORK_FUNC GetFunc(bool bArg)
983 {
984 if (bArg)
985 {
986 return ProcessDraw<ArgsB..., true>;
987 }
988
989 return ProcessDraw<ArgsB..., false>;
990 }
991
992 // Recursively parse args
993 template <typename... TArgsT>
994 static PFN_FE_WORK_FUNC GetFunc(bool bArg, TArgsT... remainingArgs)
995 {
996 if (bArg)
997 {
998 return FEDrawChooser<ArgsB..., true>::GetFunc(remainingArgs...);
999 }
1000
1001 return FEDrawChooser<ArgsB..., false>::GetFunc(remainingArgs...);
1002 }
1003 };
1004
1005 // Selector for correct templated Draw front-end function
1006 INLINE
1007 static PFN_FE_WORK_FUNC GetFEDrawFunc(bool IsIndexed, bool HasTessellation, bool HasGeometryShader, bool HasStreamOut, bool RasterizerEnabled)
1008 {
1009 return FEDrawChooser<>::GetFunc(IsIndexed, HasTessellation, HasGeometryShader, HasStreamOut, RasterizerEnabled);
1010 }
1011
1012
1013 //////////////////////////////////////////////////////////////////////////
1014 /// @brief DrawInstanced
1015 /// @param hContext - Handle passed back from SwrCreateContext
1016 /// @param topology - Specifies topology for draw.
1017 /// @param numVerts - How many vertices to read sequentially from vertex data (per instance).
1018 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1019 /// @param numInstances - How many instances to render.
1020 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1021 void DrawInstanced(
1022 HANDLE hContext,
1023 PRIMITIVE_TOPOLOGY topology,
1024 uint32_t numVertices,
1025 uint32_t startVertex,
1026 uint32_t numInstances = 1,
1027 uint32_t startInstance = 0)
1028 {
1029 if (KNOB_TOSS_DRAW)
1030 {
1031 return;
1032 }
1033
1034 RDTSC_START(APIDraw);
1035
1036 SWR_CONTEXT *pContext = GetContext(hContext);
1037 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1038
1039 int32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
1040 uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw);
1041 int32_t remainingVerts = numVertices;
1042
1043 API_STATE *pState = &pDC->pState->state;
1044 pState->topology = topology;
1045 pState->forceFront = false;
1046
1047 // disable culling for points/lines
1048 uint32_t oldCullMode = pState->rastState.cullMode;
1049 if (topology == TOP_POINT_LIST)
1050 {
1051 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1052 pState->forceFront = true;
1053 }
1054
1055 int draw = 0;
1056 while (remainingVerts)
1057 {
1058 uint32_t numVertsForDraw = (remainingVerts < maxVertsPerDraw) ?
1059 remainingVerts : maxVertsPerDraw;
1060
1061 bool isSplitDraw = (draw > 0) ? true : false;
1062 DRAW_CONTEXT* pDC = GetDrawContext(pContext, isSplitDraw);
1063 InitDraw(pDC, isSplitDraw);
1064
1065 pDC->FeWork.type = DRAW;
1066 pDC->FeWork.pfnWork = GetFEDrawFunc(
1067 false, // IsIndexed
1068 pState->tsState.tsEnable,
1069 pState->gsState.gsEnable,
1070 pState->soState.soEnable,
1071 pDC->pState->pfnProcessPrims != nullptr);
1072 pDC->FeWork.desc.draw.numVerts = numVertsForDraw;
1073 pDC->FeWork.desc.draw.startVertex = startVertex;
1074 pDC->FeWork.desc.draw.numInstances = numInstances;
1075 pDC->FeWork.desc.draw.startInstance = startInstance;
1076 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
1077 pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw;
1078
1079 //enqueue DC
1080 QueueDraw(pContext);
1081
1082 remainingVerts -= numVertsForDraw;
1083 draw++;
1084 }
1085
1086 // restore culling state
1087 pDC = GetDrawContext(pContext);
1088 pDC->pState->state.rastState.cullMode = oldCullMode;
1089
1090 RDTSC_STOP(APIDraw, numVertices * numInstances, 0);
1091 }
1092
1093 //////////////////////////////////////////////////////////////////////////
1094 /// @brief SwrDraw
1095 /// @param hContext - Handle passed back from SwrCreateContext
1096 /// @param topology - Specifies topology for draw.
1097 /// @param startVertex - Specifies start vertex in vertex buffer for draw.
1098 /// @param primCount - Number of vertices.
1099 void SwrDraw(
1100 HANDLE hContext,
1101 PRIMITIVE_TOPOLOGY topology,
1102 uint32_t startVertex,
1103 uint32_t numVertices)
1104 {
1105 DrawInstanced(hContext, topology, numVertices, startVertex);
1106 }
1107
1108 //////////////////////////////////////////////////////////////////////////
1109 /// @brief SwrDrawInstanced
1110 /// @param hContext - Handle passed back from SwrCreateContext
1111 /// @param topology - Specifies topology for draw.
1112 /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
1113 /// @param numInstances - How many instances to render.
1114 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1115 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1116 void SwrDrawInstanced(
1117 HANDLE hContext,
1118 PRIMITIVE_TOPOLOGY topology,
1119 uint32_t numVertsPerInstance,
1120 uint32_t numInstances,
1121 uint32_t startVertex,
1122 uint32_t startInstance
1123 )
1124 {
1125 DrawInstanced(hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance);
1126 }
1127
1128 //////////////////////////////////////////////////////////////////////////
1129 /// @brief DrawIndexedInstanced
1130 /// @param hContext - Handle passed back from SwrCreateContext
1131 /// @param topology - Specifies topology for draw.
1132 /// @param numIndices - Number of indices to read sequentially from index buffer.
1133 /// @param indexOffset - Starting index into index buffer.
1134 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1135 /// @param numInstances - Number of instances to render.
1136 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1137 void DrawIndexedInstance(
1138 HANDLE hContext,
1139 PRIMITIVE_TOPOLOGY topology,
1140 uint32_t numIndices,
1141 uint32_t indexOffset,
1142 int32_t baseVertex,
1143 uint32_t numInstances = 1,
1144 uint32_t startInstance = 0)
1145 {
1146 if (KNOB_TOSS_DRAW)
1147 {
1148 return;
1149 }
1150
1151 RDTSC_START(APIDrawIndexed);
1152
1153 SWR_CONTEXT *pContext = GetContext(hContext);
1154 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1155 API_STATE* pState = &pDC->pState->state;
1156
1157 int32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
1158 uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw);
1159 int32_t remainingIndices = numIndices;
1160
1161 uint32_t indexSize = 0;
1162 switch (pState->indexBuffer.format)
1163 {
1164 case R32_UINT: indexSize = sizeof(uint32_t); break;
1165 case R16_UINT: indexSize = sizeof(uint16_t); break;
1166 case R8_UINT: indexSize = sizeof(uint8_t); break;
1167 default:
1168 SWR_ASSERT(0);
1169 }
1170
1171 int draw = 0;
1172 uint8_t *pIB = (uint8_t*)pState->indexBuffer.pIndices;
1173 pIB += (uint64_t)indexOffset * (uint64_t)indexSize;
1174
1175 pState->topology = topology;
1176 pState->forceFront = false;
1177
1178 // disable culling for points/lines
1179 uint32_t oldCullMode = pState->rastState.cullMode;
1180 if (topology == TOP_POINT_LIST)
1181 {
1182 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1183 pState->forceFront = true;
1184 }
1185
1186 while (remainingIndices)
1187 {
1188 uint32_t numIndicesForDraw = (remainingIndices < maxIndicesPerDraw) ?
1189 remainingIndices : maxIndicesPerDraw;
1190
1191 // When breaking up draw, we need to obtain new draw context for each iteration.
1192 bool isSplitDraw = (draw > 0) ? true : false;
1193 pDC = GetDrawContext(pContext, isSplitDraw);
1194 InitDraw(pDC, isSplitDraw);
1195
1196 pDC->FeWork.type = DRAW;
1197 pDC->FeWork.pfnWork = GetFEDrawFunc(
1198 true, // IsIndexed
1199 pState->tsState.tsEnable,
1200 pState->gsState.gsEnable,
1201 pState->soState.soEnable,
1202 pDC->pState->pfnProcessPrims != nullptr);
1203 pDC->FeWork.desc.draw.pDC = pDC;
1204 pDC->FeWork.desc.draw.numIndices = numIndicesForDraw;
1205 pDC->FeWork.desc.draw.pIB = (int*)pIB;
1206 pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format;
1207
1208 pDC->FeWork.desc.draw.numInstances = numInstances;
1209 pDC->FeWork.desc.draw.startInstance = startInstance;
1210 pDC->FeWork.desc.draw.baseVertex = baseVertex;
1211 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
1212
1213 //enqueue DC
1214 QueueDraw(pContext);
1215
1216 pIB += maxIndicesPerDraw * indexSize;
1217 remainingIndices -= numIndicesForDraw;
1218 draw++;
1219 }
1220
1221 // restore culling state
1222 pDC = GetDrawContext(pContext);
1223 pDC->pState->state.rastState.cullMode = oldCullMode;
1224
1225 RDTSC_STOP(APIDrawIndexed, numIndices * numInstances, 0);
1226 }
1227
1228
1229 //////////////////////////////////////////////////////////////////////////
1230 /// @brief DrawIndexed
1231 /// @param hContext - Handle passed back from SwrCreateContext
1232 /// @param topology - Specifies topology for draw.
1233 /// @param numIndices - Number of indices to read sequentially from index buffer.
1234 /// @param indexOffset - Starting index into index buffer.
1235 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1236 void SwrDrawIndexed(
1237 HANDLE hContext,
1238 PRIMITIVE_TOPOLOGY topology,
1239 uint32_t numIndices,
1240 uint32_t indexOffset,
1241 int32_t baseVertex
1242 )
1243 {
1244 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex);
1245 }
1246
1247 //////////////////////////////////////////////////////////////////////////
1248 /// @brief SwrDrawIndexedInstanced
1249 /// @param hContext - Handle passed back from SwrCreateContext
1250 /// @param topology - Specifies topology for draw.
1251 /// @param numIndices - Number of indices to read sequentially from index buffer.
1252 /// @param numInstances - Number of instances to render.
1253 /// @param indexOffset - Starting index into index buffer.
1254 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1255 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1256 void SwrDrawIndexedInstanced(
1257 HANDLE hContext,
1258 PRIMITIVE_TOPOLOGY topology,
1259 uint32_t numIndices,
1260 uint32_t numInstances,
1261 uint32_t indexOffset,
1262 int32_t baseVertex,
1263 uint32_t startInstance)
1264 {
1265 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance);
1266 }
1267
1268 //////////////////////////////////////////////////////////////////////////
1269 /// @brief SwrInvalidateTiles
1270 /// @param hContext - Handle passed back from SwrCreateContext
1271 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate.
1272 void SwrInvalidateTiles(
1273 HANDLE hContext,
1274 uint32_t attachmentMask)
1275 {
1276 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1277 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1278
1279 pDC->FeWork.type = DISCARDINVALIDATETILES;
1280 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
1281 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
1282 memset(&pDC->FeWork.desc.discardInvalidateTiles.rect, 0, sizeof(SWR_RECT));
1283 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_INVALID;
1284 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false;
1285 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = false;
1286
1287 //enqueue
1288 QueueDraw(pContext);
1289 }
1290
1291 //////////////////////////////////////////////////////////////////////////
1292 /// @brief SwrDiscardRect
1293 /// @param hContext - Handle passed back from SwrCreateContext
1294 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
1295 /// @param rect - if rect is all zeros, the entire attachment surface will be discarded
1296 void SwrDiscardRect(
1297 HANDLE hContext,
1298 uint32_t attachmentMask,
1299 SWR_RECT rect)
1300 {
1301 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1302 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1303
1304 // Queue a load to the hottile
1305 pDC->FeWork.type = DISCARDINVALIDATETILES;
1306 pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
1307 pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
1308 pDC->FeWork.desc.discardInvalidateTiles.rect = rect;
1309 pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_RESOLVED;
1310 pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true;
1311 pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = true;
1312
1313 //enqueue
1314 QueueDraw(pContext);
1315 }
1316
1317 //////////////////////////////////////////////////////////////////////////
1318 /// @brief SwrDispatch
1319 /// @param hContext - Handle passed back from SwrCreateContext
1320 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
1321 /// @param threadGroupCountY - Number of thread groups dispatched in Y direction
1322 /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
1323 void SwrDispatch(
1324 HANDLE hContext,
1325 uint32_t threadGroupCountX,
1326 uint32_t threadGroupCountY,
1327 uint32_t threadGroupCountZ)
1328 {
1329 if (KNOB_TOSS_DRAW)
1330 {
1331 return;
1332 }
1333
1334 RDTSC_START(APIDispatch);
1335 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1336 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1337
1338 pDC->isCompute = true; // This is a compute context.
1339
1340 // Ensure spill fill pointers are initialized to nullptr.
1341 memset(pDC->pSpillFill, 0, sizeof(pDC->pSpillFill));
1342
1343 COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64);
1344
1345 pTaskData->threadGroupCountX = threadGroupCountX;
1346 pTaskData->threadGroupCountY = threadGroupCountY;
1347 pTaskData->threadGroupCountZ = threadGroupCountZ;
1348
1349 uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
1350 pDC->pDispatch->initialize(totalThreadGroups, pTaskData);
1351
1352 QueueDispatch(pContext);
1353 RDTSC_STOP(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ, 0);
1354 }
1355
1356 // Deswizzles, converts and stores current contents of the hot tiles to surface
1357 // described by pState
1358 void SwrStoreTiles(
1359 HANDLE hContext,
1360 SWR_RENDERTARGET_ATTACHMENT attachment,
1361 SWR_TILE_STATE postStoreTileState)
1362 {
1363 RDTSC_START(APIStoreTiles);
1364
1365 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1366 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1367
1368 SetupMacroTileScissors(pDC);
1369
1370 pDC->FeWork.type = STORETILES;
1371 pDC->FeWork.pfnWork = ProcessStoreTiles;
1372 pDC->FeWork.desc.storeTiles.attachment = attachment;
1373 pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState;
1374
1375 //enqueue
1376 QueueDraw(pContext);
1377
1378 RDTSC_STOP(APIStoreTiles, 0, 0);
1379 }
1380
1381 void SwrClearRenderTarget(
1382 HANDLE hContext,
1383 uint32_t clearMask,
1384 const float clearColor[4],
1385 float z,
1386 uint8_t stencil)
1387 {
1388 RDTSC_START(APIClearRenderTarget);
1389
1390 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1391
1392 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1393
1394 SetupMacroTileScissors(pDC);
1395
1396 CLEAR_FLAGS flags;
1397 flags.mask = clearMask;
1398
1399 pDC->FeWork.type = CLEAR;
1400 pDC->FeWork.pfnWork = ProcessClear;
1401 pDC->FeWork.desc.clear.flags = flags;
1402 pDC->FeWork.desc.clear.clearDepth = z;
1403 pDC->FeWork.desc.clear.clearRTColor[0] = clearColor[0];
1404 pDC->FeWork.desc.clear.clearRTColor[1] = clearColor[1];
1405 pDC->FeWork.desc.clear.clearRTColor[2] = clearColor[2];
1406 pDC->FeWork.desc.clear.clearRTColor[3] = clearColor[3];
1407 pDC->FeWork.desc.clear.clearStencil = stencil;
1408
1409 // enqueue draw
1410 QueueDraw(pContext);
1411
1412 RDTSC_STOP(APIClearRenderTarget, 0, pDC->drawId);
1413 }
1414
1415 //////////////////////////////////////////////////////////////////////////
1416 /// @brief Returns a pointer to the private context state for the current
1417 /// draw operation. This is used for external componets such as the
1418 /// sampler.
1419 /// SWR is responsible for the allocation of the private context state.
1420 /// @param hContext - Handle passed back from SwrCreateContext
1421 VOID* SwrGetPrivateContextState(
1422 HANDLE hContext)
1423 {
1424 SWR_CONTEXT* pContext = GetContext(hContext);
1425 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1426 DRAW_STATE* pState = pDC->pState;
1427
1428 if (pState->pPrivateState == nullptr)
1429 {
1430 pState->pPrivateState = pState->pArena->AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float));
1431 }
1432
1433 return pState->pPrivateState;
1434 }
1435
1436 //////////////////////////////////////////////////////////////////////////
1437 /// @brief Clients can use this to allocate memory for draw/dispatch
1438 /// operations. The memory will automatically be freed once operation
1439 /// has completed. Client can use this to allocate binding tables,
1440 /// etc. needed for shader execution.
1441 /// @param hContext - Handle passed back from SwrCreateContext
1442 /// @param size - Size of allocation
1443 /// @param align - Alignment needed for allocation.
1444 VOID* SwrAllocDrawContextMemory(
1445 HANDLE hContext,
1446 uint32_t size,
1447 uint32_t align)
1448 {
1449 SWR_CONTEXT* pContext = GetContext(hContext);
1450 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1451
1452 return pDC->pState->pArena->AllocAligned(size, align);
1453 }
1454
1455 //////////////////////////////////////////////////////////////////////////
1456 /// @brief Returns pointer to SWR stats.
1457 /// @note The counters are atomically incremented by multiple threads.
1458 /// When calling this, you need to ensure all previous operations
1459 /// have completed.
1460 /// @todo If necessary, add a callback to avoid stalling the pipe to
1461 /// sample the counters.
1462 /// @param hContext - Handle passed back from SwrCreateContext
1463 /// @param pStats - SWR will fill this out for caller.
1464 void SwrGetStats(
1465 HANDLE hContext,
1466 SWR_STATS* pStats)
1467 {
1468 SWR_CONTEXT *pContext = GetContext(hContext);
1469 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1470
1471 pDC->FeWork.type = QUERYSTATS;
1472 pDC->FeWork.pfnWork = ProcessQueryStats;
1473 pDC->FeWork.desc.queryStats.pStats = pStats;
1474
1475 // cannot execute until all previous draws have completed
1476 pDC->dependency = pDC->drawId - 1;
1477
1478 //enqueue
1479 QueueDraw(pContext);
1480 }
1481
1482 //////////////////////////////////////////////////////////////////////////
1483 /// @brief Enables stats counting
1484 /// @param hContext - Handle passed back from SwrCreateContext
1485 /// @param enable - If true then counts are incremented.
1486 void SwrEnableStats(
1487 HANDLE hContext,
1488 bool enable)
1489 {
1490 SWR_CONTEXT *pContext = GetContext(hContext);
1491 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1492
1493 pDC->pState->state.enableStats = enable;
1494 }
1495
1496 //////////////////////////////////////////////////////////////////////////
1497 /// @brief Mark end of frame - used for performance profiling
1498 /// @param hContext - Handle passed back from SwrCreateContext
1499 void SWR_API SwrEndFrame(
1500 HANDLE hContext)
1501 {
1502 RDTSC_ENDFRAME();
1503 }