swr: [rasterizer core] RingBuffer class for DC/DS
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / api.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file api.cpp
24 *
25 * @brief API implementation
26 *
27 ******************************************************************************/
28
29 #include <cfloat>
30 #include <cmath>
31 #include <cstdio>
32
33 #include "core/api.h"
34 #include "core/backend.h"
35 #include "core/context.h"
36 #include "core/frontend.h"
37 #include "core/rasterizer.h"
38 #include "core/rdtsc_core.h"
39 #include "core/threads.h"
40 #include "core/tilemgr.h"
41 #include "core/clip.h"
42
43 #include "common/simdintrin.h"
44 #include "common/os.h"
45
46 void SetupDefaultState(SWR_CONTEXT *pContext);
47
48 //////////////////////////////////////////////////////////////////////////
49 /// @brief Create SWR Context.
50 /// @param pCreateInfo - pointer to creation info.
51 HANDLE SwrCreateContext(
52 const SWR_CREATECONTEXT_INFO* pCreateInfo)
53 {
54 RDTSC_RESET();
55 RDTSC_INIT(0);
56
57 void* pContextMem = _aligned_malloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4);
58 memset(pContextMem, 0, sizeof(SWR_CONTEXT));
59 SWR_CONTEXT *pContext = new (pContextMem) SWR_CONTEXT();
60
61 pContext->driverType = pCreateInfo->driver;
62 pContext->privateStateSize = pCreateInfo->privateStateSize;
63
64 pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
65 pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
66
67 pContext->numSubContexts = pCreateInfo->maxSubContexts;
68 if (pContext->numSubContexts > 1)
69 {
70 pContext->subCtxSave = (DRAW_STATE*)_aligned_malloc(sizeof(DRAW_STATE) * pContext->numSubContexts, 64);
71 memset(pContext->subCtxSave, 0, sizeof(DRAW_STATE) * pContext->numSubContexts);
72 }
73
74 for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
75 {
76 pContext->dcRing[dc].pArena = new Arena();
77 pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena));
78 pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen.
79
80 pContext->dsRing[dc].pArena = new Arena();
81 }
82
83 if (!KNOB_SINGLE_THREADED)
84 {
85 memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
86 memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
87 new (&pContext->WaitLock) std::mutex();
88 new (&pContext->FifosNotEmpty) std::condition_variable();
89
90 CreateThreadPool(pContext, &pContext->threadPool);
91 }
92
93 // Calling createThreadPool() above can set SINGLE_THREADED
94 if (KNOB_SINGLE_THREADED)
95 {
96 pContext->NumWorkerThreads = 1;
97 }
98
99 // Allocate scratch space for workers.
100 ///@note We could lazily allocate this but its rather small amount of memory.
101 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
102 {
103 ///@todo Use numa API for allocations using numa information from thread data (if exists).
104 pContext->pScratch[i] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH * 4);
105 }
106
107 // State setup AFTER context is fully initialized
108 SetupDefaultState(pContext);
109
110 // initialize hot tile manager
111 pContext->pHotTileMgr = new HotTileMgr();
112
113 // initialize function pointer tables
114 InitClearTilesTable();
115
116 // initialize store tiles function
117 pContext->pfnLoadTile = pCreateInfo->pfnLoadTile;
118 pContext->pfnStoreTile = pCreateInfo->pfnStoreTile;
119 pContext->pfnClearTile = pCreateInfo->pfnClearTile;
120
121 return (HANDLE)pContext;
122 }
123
124 void SwrDestroyContext(HANDLE hContext)
125 {
126 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
127 DestroyThreadPool(pContext, &pContext->threadPool);
128
129 // free the fifos
130 for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i)
131 {
132 delete pContext->dcRing[i].pArena;
133 delete pContext->dsRing[i].pArena;
134 delete(pContext->dcRing[i].pTileMgr);
135 delete(pContext->dcRing[i].pDispatch);
136 }
137
138 // Free scratch space.
139 for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
140 {
141 _aligned_free(pContext->pScratch[i]);
142 }
143
144 _aligned_free(pContext->subCtxSave);
145
146 delete(pContext->pHotTileMgr);
147
148 pContext->~SWR_CONTEXT();
149 _aligned_free((SWR_CONTEXT*)hContext);
150 }
151
152 void CopyState(DRAW_STATE& dst, const DRAW_STATE& src)
153 {
154 memcpy(&dst.state, &src.state, sizeof(API_STATE));
155 }
156
157 void WakeAllThreads(SWR_CONTEXT *pContext)
158 {
159 pContext->FifosNotEmpty.notify_all();
160 }
161
//////////////////////////////////////////////////////////////////////////
/// @brief Submit the current draw context to the DC ring and either wake
///        the worker threads or, when KNOB_SINGLE_THREADED is set, execute
///        the work inline on the calling thread.
/// @tparam IsDraw - true for draw work (front-end followed by back-end),
///         false for compute dispatch work.
/// @param pContext - SWR context. pCurDrawContext is dereferenced without
///        a null check; on return it has been moved to pPrevDrawContext and
///        pCurDrawContext is nullptr.
162 template<bool IsDraw>
163 void QueueWork(SWR_CONTEXT *pContext)
164 {
165 if (IsDraw)
166 {
167 // Each worker thread looks at a DC for both FE and BE work at different times and so we
168 // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers
169 // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
170 // then moved on if all work is done.)
171 pContext->pCurDrawContext->threadsDone =
172 pContext->NumWorkerThreads ? pContext->NumWorkerThreads * 2 : 2;
173 }
174 else
175 {
// Dispatch work is counted once per worker (minimum of 1).
176 pContext->pCurDrawContext->threadsDone =
177 pContext->NumWorkerThreads ? pContext->NumWorkerThreads : 1;
178 }
179
// threadsDone is written before the barrier; the ring entry is then
// enqueued while holding WaitLock.
180 _ReadWriteBarrier();
181 {
182 std::unique_lock<std::mutex> lock(pContext->WaitLock);
183 pContext->dcRing.Enqueue();
184 }
185
186 if (KNOB_SINGLE_THREADED)
187 {
188 // flush denormals to 0
189 uint32_t mxcsr = _mm_getcsr();
190 _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
191
192 if (IsDraw)
193 {
// Run front-end then back-end for this draw on the calling thread (worker id 0).
194 std::unordered_set<uint32_t> lockedTiles;
195 uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
196 WorkOnFifoFE(pContext, 0, curDraw[0], 0);
197 WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles);
198 }
199 else
200 {
201 uint64_t curDispatch = pContext->pCurDrawContext->drawId;
202 WorkOnCompute(pContext, 0, curDispatch);
203 }
204
205 // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
206 if (!pContext->dcRing.IsEmpty())
207 {
208 pContext->dcRing.Dequeue();
209 }
210
211 // restore csr
212 _mm_setcsr(mxcsr);
213 }
214 else
215 {
216 RDTSC_START(APIDrawWakeAllThreads);
217 WakeAllThreads(pContext);
218 RDTSC_STOP(APIDrawWakeAllThreads, 1, 0);
219 }
220
221 // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
222 pContext->pPrevDrawContext = pContext->pCurDrawContext;
223 pContext->pCurDrawContext = nullptr;
224 }
225
226 INLINE void QueueDraw(SWR_CONTEXT* pContext)
227 {
228 QueueWork<true>(pContext);
229 }
230
231 INLINE void QueueDispatch(SWR_CONTEXT* pContext)
232 {
233 QueueWork<false>(pContext);
234 }
235
//////////////////////////////////////////////////////////////////////////
/// @brief Return the draw context (DC) currently being built. When there is
///        none, take the entry at the head of the DC ring, attach a draw
///        state to it and reset its per-draw fields.
/// @param pContext - SWR context that owns the DC and DS rings.
/// @param isSplitDraw - When true and a previous DC exists, the new DC
///        shares the previous DC's state pointer and the DS ring index is
///        not advanced. Asserted to be false when a current DC already
///        exists.
/// @return pContext->pCurDrawContext.
236 DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
237 {
238 RDTSC_START(APIGetDrawContext);
239 // If current draw context is null then need to obtain a new draw context to use from ring.
240 if (pContext->pCurDrawContext == nullptr)
241 {
242 // Need to wait for a free entry.
// Busy-wait (with pause) until the DC ring reports it is no longer full.
243 while (pContext->dcRing.IsFull())
244 {
245 _mm_pause();
246 }
247
248 uint32_t dcIndex = pContext->dcRing.GetHead() % KNOB_MAX_DRAWS_IN_FLIGHT;
249
250 DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
251 pContext->pCurDrawContext = pCurDrawContext;
252
253 // Assign next available entry in DS ring to this DC.
254 uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
255 pCurDrawContext->pState = &pContext->dsRing[dsIndex];
256
// Bound to the DS ring entry selected above, even if pState is redirected
// to the previous draw's state in the split-draw path below.
257 Arena& stateArena = *(pCurDrawContext->pState->pArena);
258
259 // Copy previous state to current state.
260 if (pContext->pPrevDrawContext)
261 {
262 DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext;
263
264 // If we're splitting our draw then we can just use the same state from the previous
265 // draw. In this case, we won't increment the DS ring index so the next non-split
266 // draw can receive the state.
267 if (isSplitDraw == false)
268 {
269 CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState);
270
271 stateArena.Reset(true); // Reset memory.
272 pCurDrawContext->pState->pPrivateState = nullptr;
273
274 pContext->curStateId++; // Progress state ring index forward.
275 }
276 else
277 {
278 // If its a split draw then just copy the state pointer over
279 // since its the same draw.
280 pCurDrawContext->pState = pPrevDrawContext->pState;
281 }
282 }
283 else
284 {
// No previous draw context: nothing to copy, just reset the arena and
// advance the DS ring index.
285 stateArena.Reset(); // Reset memory.
286 pContext->curStateId++; // Progress state ring index forward.
287 }
288
// Reset the per-draw bookkeeping of the DC.
289 pCurDrawContext->dependency = 0;
290 pCurDrawContext->pArena->Reset();
291 pCurDrawContext->pContext = pContext;
292 pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
293
294 pCurDrawContext->doneFE = false;
295 pCurDrawContext->FeLock = 0;
296 pCurDrawContext->threadsDone = 0;
297
298 pCurDrawContext->pTileMgr->initialize();
299
300 // Assign unique drawId for this DC
301 pCurDrawContext->drawId = pContext->dcRing.GetHead();
302 }
303 else
304 {
305 SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC");
306 }
307
308 RDTSC_STOP(APIGetDrawContext, 0, 0);
309 return pContext->pCurDrawContext;
310 }
311
312 void SWR_API SwrSetActiveSubContext(
313 HANDLE hContext,
314 uint32_t subContextIndex)
315 {
316 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
317 if (subContextIndex >= pContext->numSubContexts)
318 {
319 return;
320 }
321
322 if (subContextIndex != pContext->curSubCtxId)
323 {
324 // Save and restore draw state
325 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
326 CopyState(
327 pContext->subCtxSave[pContext->curSubCtxId],
328 *(pDC->pState));
329
330 CopyState(
331 *(pDC->pState),
332 pContext->subCtxSave[subContextIndex]);
333
334 pContext->curSubCtxId = subContextIndex;
335 }
336 }
337
338 API_STATE* GetDrawState(SWR_CONTEXT *pContext)
339 {
340 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
341 SWR_ASSERT(pDC->pState != nullptr);
342
343 return &pDC->pState->state;
344 }
345
346 void SetupDefaultState(SWR_CONTEXT *pContext)
347 {
348 API_STATE* pState = GetDrawState(pContext);
349
350 pState->rastState.cullMode = SWR_CULLMODE_NONE;
351 pState->rastState.frontWinding = SWR_FRONTWINDING_CCW;
352 }
353
354 static INLINE SWR_CONTEXT* GetContext(HANDLE hContext)
355 {
356 return (SWR_CONTEXT*)hContext;
357 }
358
359 void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint64_t userData2, uint64_t userData3)
360 {
361 RDTSC_START(APISync);
362
363 SWR_ASSERT(pfnFunc != nullptr);
364
365 SWR_CONTEXT *pContext = GetContext(hContext);
366 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
367
368 pDC->FeWork.type = SYNC;
369 pDC->FeWork.pfnWork = ProcessSync;
370 pDC->FeWork.desc.sync.pfnCallbackFunc = pfnFunc;
371 pDC->FeWork.desc.sync.userData = userData;
372 pDC->FeWork.desc.sync.userData2 = userData2;
373 pDC->FeWork.desc.sync.userData3 = userData3;
374
375 // cannot execute until all previous draws have completed
376 pDC->dependency = pDC->drawId - 1;
377
378 //enqueue
379 QueueDraw(pContext);
380
381 RDTSC_STOP(APISync, 1, 0);
382 }
383
384 void SwrWaitForIdle(HANDLE hContext)
385 {
386 SWR_CONTEXT *pContext = GetContext(hContext);
387
388 RDTSC_START(APIWaitForIdle);
389
390 while (!pContext->dcRing.IsEmpty())
391 {
392 _mm_pause();
393 }
394
395 RDTSC_STOP(APIWaitForIdle, 1, 0);
396 }
397
398 void SwrSetVertexBuffers(
399 HANDLE hContext,
400 uint32_t numBuffers,
401 const SWR_VERTEX_BUFFER_STATE* pVertexBuffers)
402 {
403 API_STATE* pState = GetDrawState(GetContext(hContext));
404
405 for (uint32_t i = 0; i < numBuffers; ++i)
406 {
407 const SWR_VERTEX_BUFFER_STATE *pVB = &pVertexBuffers[i];
408 pState->vertexBuffers[pVB->index] = *pVB;
409 }
410 }
411
412 void SwrSetIndexBuffer(
413 HANDLE hContext,
414 const SWR_INDEX_BUFFER_STATE* pIndexBuffer)
415 {
416 API_STATE* pState = GetDrawState(GetContext(hContext));
417
418 pState->indexBuffer = *pIndexBuffer;
419 }
420
421 void SwrSetFetchFunc(
422 HANDLE hContext,
423 PFN_FETCH_FUNC pfnFetchFunc)
424 {
425 API_STATE* pState = GetDrawState(GetContext(hContext));
426
427 pState->pfnFetchFunc = pfnFetchFunc;
428 }
429
430 void SwrSetSoFunc(
431 HANDLE hContext,
432 PFN_SO_FUNC pfnSoFunc,
433 uint32_t streamIndex)
434 {
435 API_STATE* pState = GetDrawState(GetContext(hContext));
436
437 SWR_ASSERT(streamIndex < MAX_SO_STREAMS);
438
439 pState->pfnSoFunc[streamIndex] = pfnSoFunc;
440 }
441
442 void SwrSetSoState(
443 HANDLE hContext,
444 SWR_STREAMOUT_STATE* pSoState)
445 {
446 API_STATE* pState = GetDrawState(GetContext(hContext));
447
448 pState->soState = *pSoState;
449 }
450
451 void SwrSetSoBuffers(
452 HANDLE hContext,
453 SWR_STREAMOUT_BUFFER* pSoBuffer,
454 uint32_t slot)
455 {
456 API_STATE* pState = GetDrawState(GetContext(hContext));
457
458 SWR_ASSERT((slot < 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot);
459
460 pState->soBuffer[slot] = *pSoBuffer;
461 }
462
463 void SwrSetVertexFunc(
464 HANDLE hContext,
465 PFN_VERTEX_FUNC pfnVertexFunc)
466 {
467 API_STATE* pState = GetDrawState(GetContext(hContext));
468
469 pState->pfnVertexFunc = pfnVertexFunc;
470 }
471
472 void SwrSetFrontendState(
473 HANDLE hContext,
474 SWR_FRONTEND_STATE *pFEState)
475 {
476 API_STATE* pState = GetDrawState(GetContext(hContext));
477 pState->frontendState = *pFEState;
478 }
479
480 void SwrSetGsState(
481 HANDLE hContext,
482 SWR_GS_STATE *pGSState)
483 {
484 API_STATE* pState = GetDrawState(GetContext(hContext));
485 pState->gsState = *pGSState;
486 }
487
488 void SwrSetGsFunc(
489 HANDLE hContext,
490 PFN_GS_FUNC pfnGsFunc)
491 {
492 API_STATE* pState = GetDrawState(GetContext(hContext));
493 pState->pfnGsFunc = pfnGsFunc;
494 }
495
496 void SwrSetCsFunc(
497 HANDLE hContext,
498 PFN_CS_FUNC pfnCsFunc,
499 uint32_t totalThreadsInGroup)
500 {
501 API_STATE* pState = GetDrawState(GetContext(hContext));
502 pState->pfnCsFunc = pfnCsFunc;
503 pState->totalThreadsInGroup = totalThreadsInGroup;
504 }
505
506 void SwrSetTsState(
507 HANDLE hContext,
508 SWR_TS_STATE *pState)
509 {
510 API_STATE* pApiState = GetDrawState(GetContext(hContext));
511 pApiState->tsState = *pState;
512 }
513
514 void SwrSetHsFunc(
515 HANDLE hContext,
516 PFN_HS_FUNC pfnFunc)
517 {
518 API_STATE* pApiState = GetDrawState(GetContext(hContext));
519 pApiState->pfnHsFunc = pfnFunc;
520 }
521
522 void SwrSetDsFunc(
523 HANDLE hContext,
524 PFN_DS_FUNC pfnFunc)
525 {
526 API_STATE* pApiState = GetDrawState(GetContext(hContext));
527 pApiState->pfnDsFunc = pfnFunc;
528 }
529
530 void SwrSetDepthStencilState(
531 HANDLE hContext,
532 SWR_DEPTH_STENCIL_STATE *pDSState)
533 {
534 API_STATE* pState = GetDrawState(GetContext(hContext));
535
536 pState->depthStencilState = *pDSState;
537 }
538
539 void SwrSetBackendState(
540 HANDLE hContext,
541 SWR_BACKEND_STATE *pBEState)
542 {
543 API_STATE* pState = GetDrawState(GetContext(hContext));
544
545 pState->backendState = *pBEState;
546 }
547
548 void SwrSetPixelShaderState(
549 HANDLE hContext,
550 SWR_PS_STATE *pPSState)
551 {
552 API_STATE *pState = GetDrawState(GetContext(hContext));
553 pState->psState = *pPSState;
554 }
555
556 void SwrSetBlendState(
557 HANDLE hContext,
558 SWR_BLEND_STATE *pBlendState)
559 {
560 API_STATE *pState = GetDrawState(GetContext(hContext));
561 memcpy(&pState->blendState, pBlendState, sizeof(SWR_BLEND_STATE));
562 }
563
564 void SwrSetBlendFunc(
565 HANDLE hContext,
566 uint32_t renderTarget,
567 PFN_BLEND_JIT_FUNC pfnBlendFunc)
568 {
569 SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS);
570 API_STATE *pState = GetDrawState(GetContext(hContext));
571 pState->pfnBlendFunc[renderTarget] = pfnBlendFunc;
572 }
573
574 void SwrSetLinkage(
575 HANDLE hContext,
576 uint32_t mask,
577 const uint8_t* pMap)
578 {
579 API_STATE* pState = GetDrawState(GetContext(hContext));
580
581 static const uint8_t IDENTITY_MAP[] =
582 {
583 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
584 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
585 };
586 static_assert(sizeof(IDENTITY_MAP) == sizeof(pState->linkageMap),
587 "Update for new value of MAX_ATTRIBUTES");
588
589 pState->linkageMask = mask;
590 pState->linkageCount = _mm_popcnt_u32(mask);
591
592 if (!pMap)
593 {
594 pMap = IDENTITY_MAP;
595 }
596 memcpy(pState->linkageMap, pMap, pState->linkageCount);
597 }
598
599 // update guardband multipliers for the viewport
600 void updateGuardband(API_STATE *pState)
601 {
602 // guardband center is viewport center
603 pState->gbState.left = KNOB_GUARDBAND_WIDTH / pState->vp[0].width;
604 pState->gbState.right = KNOB_GUARDBAND_WIDTH / pState->vp[0].width;
605 pState->gbState.top = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height;
606 pState->gbState.bottom = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height;
607 }
608
609 void SwrSetRastState(
610 HANDLE hContext,
611 const SWR_RASTSTATE *pRastState)
612 {
613 SWR_CONTEXT *pContext = GetContext(hContext);
614 API_STATE* pState = GetDrawState(pContext);
615
616 memcpy(&pState->rastState, pRastState, sizeof(SWR_RASTSTATE));
617 }
618
619 void SwrSetViewports(
620 HANDLE hContext,
621 uint32_t numViewports,
622 const SWR_VIEWPORT* pViewports,
623 const SWR_VIEWPORT_MATRIX* pMatrices)
624 {
625 SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS,
626 "Invalid number of viewports.");
627
628 SWR_CONTEXT *pContext = GetContext(hContext);
629 API_STATE* pState = GetDrawState(pContext);
630
631 memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports);
632
633 if (pMatrices != nullptr)
634 {
635 memcpy(&pState->vpMatrix[0], pMatrices, sizeof(SWR_VIEWPORT_MATRIX) * numViewports);
636 }
637 else
638 {
639 // Compute default viewport transform.
640 for (uint32_t i = 0; i < numViewports; ++i)
641 {
642 if (pContext->driverType == DX)
643 {
644 pState->vpMatrix[i].m00 = pState->vp[i].width / 2.0f;
645 pState->vpMatrix[i].m11 = -pState->vp[i].height / 2.0f;
646 pState->vpMatrix[i].m22 = pState->vp[i].maxZ - pState->vp[i].minZ;
647 pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00;
648 pState->vpMatrix[i].m31 = pState->vp[i].y - pState->vpMatrix[i].m11;
649 pState->vpMatrix[i].m32 = pState->vp[i].minZ;
650 }
651 else
652 {
653 // Standard, with the exception that Y is inverted.
654 pState->vpMatrix[i].m00 = (pState->vp[i].width - pState->vp[i].x) / 2.0f;
655 pState->vpMatrix[i].m11 = (pState->vp[i].y - pState->vp[i].height) / 2.0f;
656 pState->vpMatrix[i].m22 = (pState->vp[i].maxZ - pState->vp[i].minZ) / 2.0f;
657 pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00;
658 pState->vpMatrix[i].m31 = pState->vp[i].height + pState->vpMatrix[i].m11;
659 pState->vpMatrix[i].m32 = pState->vp[i].minZ + pState->vpMatrix[i].m22;
660
661 // Now that the matrix is calculated, clip the view coords to screen size.
662 // OpenGL allows for -ve x,y in the viewport.
663 pState->vp[i].x = std::max(pState->vp[i].x, 0.0f);
664 pState->vp[i].y = std::max(pState->vp[i].y, 0.0f);
665 }
666 }
667 }
668
669 updateGuardband(pState);
670 }
671
672 void SwrSetScissorRects(
673 HANDLE hContext,
674 uint32_t numScissors,
675 const BBOX* pScissors)
676 {
677 SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS,
678 "Invalid number of scissor rects.");
679
680 API_STATE* pState = GetDrawState(GetContext(hContext));
681 memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(BBOX));
682 };
683
684 void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
685 {
686 API_STATE *pState = &pDC->pState->state;
687 uint32_t left, right, top, bottom;
688
689 // Set up scissor dimensions based on scissor or viewport
690 if (pState->rastState.scissorEnable)
691 {
692 // scissor rect right/bottom edge are exclusive, core expects scissor dimensions to be inclusive, so subtract one pixel from right/bottom edges
693 left = pState->scissorRects[0].left;
694 right = pState->scissorRects[0].right;
695 top = pState->scissorRects[0].top;
696 bottom = pState->scissorRects[0].bottom;
697 }
698 else
699 {
700 left = (int32_t)pState->vp[0].x;
701 right = (int32_t)pState->vp[0].x + (int32_t)pState->vp[0].width;
702 top = (int32_t)pState->vp[0].y;
703 bottom = (int32_t)pState->vp[0].y + (int32_t)pState->vp[0].height;
704 }
705
706 right = std::min<uint32_t>(right, KNOB_MAX_SCISSOR_X);
707 bottom = std::min<uint32_t>(bottom, KNOB_MAX_SCISSOR_Y);
708
709 if (left > KNOB_MAX_SCISSOR_X || top > KNOB_MAX_SCISSOR_Y)
710 {
711 pState->scissorInFixedPoint.left = 0;
712 pState->scissorInFixedPoint.right = 0;
713 pState->scissorInFixedPoint.top = 0;
714 pState->scissorInFixedPoint.bottom = 0;
715 }
716 else
717 {
718 pState->scissorInFixedPoint.left = left * FIXED_POINT_SCALE;
719 pState->scissorInFixedPoint.right = right * FIXED_POINT_SCALE - 1;
720 pState->scissorInFixedPoint.top = top * FIXED_POINT_SCALE;
721 pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1;
722 }
723 }
724
//////////////////////////////////////////////////////////////////////////
/// @brief Derive the per-draw pipeline configuration from the API state
///        attached to a draw context: the backend, output-merger and
///        barycentric functions, the front-end primitive processing
///        function, the front-end attribute mask, and the depth / stencil /
///        color hot tile enables.
/// @param pDC - Draw context whose pState is read and updated in place.
725 void SetupPipeline(DRAW_CONTEXT *pDC)
726 {
727 DRAW_STATE* pState = pDC->pState;
728 const SWR_RASTSTATE &rastState = pState->state.rastState;
729 BACKEND_FUNCS& backendFuncs = pState->backendFuncs;
// Used as a 0/1 index into the backend and centroid barycentric tables below.
730 const uint32_t forcedSampleCount = (rastState.bForcedSampleCount) ? 1 : 0;
731
732 // setup backend
733 if (pState->state.psState.pfnPixelShader == nullptr)
734 {
735 backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount];
736 // always need to generate I & J per sample for Z interpolation
737 backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[1];
738 }
739 else
740 {
741 const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.bForcedSampleCount) ? 1 : 0;
742 const uint32_t centroid = ((pState->state.psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
743
744 // currently only support 'normal' input coverage
745 SWR_ASSERT(pState->state.psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL ||
746 pState->state.psState.inputCoverage == SWR_INPUT_COVERAGE_NONE);
747
// Local copy of the mask; the shading-rate cases below OR in the bits
// required for Z interpolation without modifying the PS state itself.
748 SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)pState->state.psState.barycentricsMask;
749
750 // select backend function
751 switch(pState->state.psState.shadingRate)
752 {
753 case SWR_SHADING_RATE_PIXEL:
754 if(bMultisampleEnable)
755 {
756 // always need to generate I & J per sample for Z interpolation
757 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
758 backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][pState->state.psState.inputCoverage][centroid][forcedSampleCount];
759 backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][pState->state.blendState.sampleCount];
760 }
761 else
762 {
763 // always need to generate I & J per pixel for Z interpolation
764 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK);
765 backendFuncs.pfnBackend = gBackendSingleSample[pState->state.psState.inputCoverage][centroid];
766 backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][SWR_MULTISAMPLE_1X];
767 }
768 break;
769 case SWR_SHADING_RATE_SAMPLE:
770 SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN);
771 // always need to generate I & J per sample for Z interpolation
772 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
773 backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][pState->state.psState.inputCoverage][centroid];
774 backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][pState->state.blendState.sampleCount];
775 break;
// Coarse shading rate and any other value only assert; no backend is selected for them.
776 case SWR_SHADING_RATE_COARSE:
777 default:
778 SWR_ASSERT(0 && "Invalid shading rate");
779 break;
780 }
781
782 // setup pointer to function that generates necessary barycentrics required by the PS
783 bool bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_PER_PIXEL_MASK) > 0 ? 1 : 0;
784 backendFuncs.pfnCalcPixelBarycentrics = gPixelBarycentricTable[bBarycentrics];
785
786 bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_PER_SAMPLE_MASK) > 0 ? 1 : 0;
787 backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[bBarycentrics];
788
789 bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0 ? 1 : 0;
790 backendFuncs.pfnCalcCentroidBarycentrics = gCentroidBarycentricTable[rastState.sampleCount][bBarycentrics][rastState.samplePattern][forcedSampleCount];
791 }
792
// Choose the clipper (stored as pfnProcessPrims) and the matching binner by
// topology; every topology not listed as point or line uses the triangle pair.
793 PFN_PROCESS_PRIMS pfnBinner;
794 switch (pState->state.topology)
795 {
796 case TOP_POINT_LIST:
797 pState->pfnProcessPrims = ClipPoints;
798 pfnBinner = BinPoints;
799 break;
800 case TOP_LINE_LIST:
801 case TOP_LINE_STRIP:
802 case TOP_LINE_LOOP:
803 case TOP_LINE_LIST_ADJ:
804 case TOP_LISTSTRIP_ADJ:
805 pState->pfnProcessPrims = ClipLines;
806 pfnBinner = BinLines;
807 break;
808 default:
809 pState->pfnProcessPrims = ClipTriangles;
810 pfnBinner = BinTriangles;
811 break;
812 };
813
814 // disable clipper if viewport transform is disabled
815 if (pState->state.frontendState.vpTransformDisable)
816 {
817 pState->pfnProcessPrims = pfnBinner;
818 }
819
// With no pixel shader, no depth/stencil test or write, and no linked
// attributes, clear the primitive processing function and the linkage mask.
820 if ((pState->state.psState.pfnPixelShader == nullptr) &&
821 (pState->state.depthStencilState.depthTestEnable == FALSE) &&
822 (pState->state.depthStencilState.depthWriteEnable == FALSE) &&
823 (pState->state.depthStencilState.stencilTestEnable == FALSE) &&
824 (pState->state.depthStencilState.stencilWriteEnable == FALSE) &&
825 (pState->state.linkageCount == 0))
826 {
827 pState->pfnProcessPrims = nullptr;
828 pState->state.linkageMask = 0;
829 }
830
// Same clearing when stream-out state disables the rasterizer.
831 if (pState->state.soState.rasterizerDisable == true)
832 {
833 pState->pfnProcessPrims = nullptr;
834 pState->state.linkageMask = 0;
835 }
836
837 // set up the frontend attrib mask
838 pState->state.feAttribMask = pState->state.linkageMask;
839 if (pState->state.soState.soEnable)
840 {
// Add the attributes of the first four stream-out stream masks.
841 for (uint32_t i = 0; i < 4; ++i)
842 {
843 pState->state.feAttribMask |= pState->state.soState.streamMasks[i];
844 }
845 }
846
847 // complicated logic to test for cases where we don't need backing hottile memory for a draw
848 // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled.
849 pState->state.depthHottileEnable = ((!(pState->state.depthStencilState.depthTestEnable &&
850 !pState->state.depthStencilState.depthWriteEnable &&
851 pState->state.depthStencilState.depthTestFunc == ZFUNC_ALWAYS)) &&
852 (pState->state.depthStencilState.depthTestEnable ||
853 pState->state.depthStencilState.depthWriteEnable)) ? true : false;
854
855 pState->state.stencilHottileEnable = (((!(pState->state.depthStencilState.stencilTestEnable &&
856 !pState->state.depthStencilState.stencilWriteEnable &&
857 pState->state.depthStencilState.stencilTestFunc == ZFUNC_ALWAYS)) ||
858 // for stencil we have to check the double sided state as well
859 (!(pState->state.depthStencilState.doubleSidedStencilTestEnable &&
860 !pState->state.depthStencilState.stencilWriteEnable &&
861 pState->state.depthStencilState.backfaceStencilTestFunc == ZFUNC_ALWAYS))) &&
862 (pState->state.depthStencilState.stencilTestEnable ||
863 pState->state.depthStencilState.stencilWriteEnable)) ? true : false;
864
// colorHottileEnable gets one bit per render target that has at least one
// color channel write enabled; it stays 0 when there is no pixel shader.
865 uint32_t numRTs = pState->state.psState.numRenderTargets;
866 pState->state.colorHottileEnable = 0;
867 if(pState->state.psState.pfnPixelShader != nullptr)
868 {
869 for (uint32_t rt = 0; rt < numRTs; ++rt)
870 {
871 pState->state.colorHottileEnable |=
872 (!pState->state.blendState.renderTarget[rt].writeDisableAlpha ||
873 !pState->state.blendState.renderTarget[rt].writeDisableRed ||
874 !pState->state.blendState.renderTarget[rt].writeDisableGreen ||
875 !pState->state.blendState.renderTarget[rt].writeDisableBlue) ? (1 << rt) : 0;
876 }
877 }
878 }
879
880 //////////////////////////////////////////////////////////////////////////
881 /// @brief InitDraw
882 /// @param pDC - Draw context to initialize for this draw.
883 void InitDraw(
884 DRAW_CONTEXT *pDC,
885 bool isSplitDraw)
886 {
887 // We don't need to re-setup the scissors/pipeline state again for split draw.
888 if (isSplitDraw == false)
889 {
890 SetupMacroTileScissors(pDC);
891 SetupPipeline(pDC);
892 }
893 }
894
895 //////////////////////////////////////////////////////////////////////////
896 /// @brief We can split the draw for certain topologies for better performance.
897 /// @param totalVerts - Total vertices for draw
898 /// @param topology - Topology used for draw
899 uint32_t MaxVertsPerDraw(
900 DRAW_CONTEXT* pDC,
901 uint32_t totalVerts,
902 PRIMITIVE_TOPOLOGY topology)
903 {
904 API_STATE& state = pDC->pState->state;
905
906 uint32_t vertsPerDraw = totalVerts;
907
908 if (state.soState.soEnable)
909 {
910 return totalVerts;
911 }
912
913 switch (topology)
914 {
915 case TOP_POINT_LIST:
916 case TOP_TRIANGLE_LIST:
917 vertsPerDraw = KNOB_MAX_PRIMS_PER_DRAW;
918 break;
919
920 case TOP_PATCHLIST_1:
921 case TOP_PATCHLIST_2:
922 case TOP_PATCHLIST_3:
923 case TOP_PATCHLIST_4:
924 case TOP_PATCHLIST_5:
925 case TOP_PATCHLIST_6:
926 case TOP_PATCHLIST_7:
927 case TOP_PATCHLIST_8:
928 case TOP_PATCHLIST_9:
929 case TOP_PATCHLIST_10:
930 case TOP_PATCHLIST_11:
931 case TOP_PATCHLIST_12:
932 case TOP_PATCHLIST_13:
933 case TOP_PATCHLIST_14:
934 case TOP_PATCHLIST_15:
935 case TOP_PATCHLIST_16:
936 case TOP_PATCHLIST_17:
937 case TOP_PATCHLIST_18:
938 case TOP_PATCHLIST_19:
939 case TOP_PATCHLIST_20:
940 case TOP_PATCHLIST_21:
941 case TOP_PATCHLIST_22:
942 case TOP_PATCHLIST_23:
943 case TOP_PATCHLIST_24:
944 case TOP_PATCHLIST_25:
945 case TOP_PATCHLIST_26:
946 case TOP_PATCHLIST_27:
947 case TOP_PATCHLIST_28:
948 case TOP_PATCHLIST_29:
949 case TOP_PATCHLIST_30:
950 case TOP_PATCHLIST_31:
951 case TOP_PATCHLIST_32:
952 if (pDC->pState->state.tsState.tsEnable)
953 {
954 uint32_t vertsPerPrim = topology - TOP_PATCHLIST_BASE;
955 vertsPerDraw = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW;
956 }
957 break;
958
959 // The Primitive Assembly code can only handle 1 RECT at a time.
960 case TOP_RECT_LIST:
961 vertsPerDraw = 3;
962 break;
963
964 default:
965 // We are not splitting up draws for other topologies.
966 break;
967 }
968
969 return vertsPerDraw;
970 }
971
972 // Recursive template used to auto-nest conditionals. Converts dynamic boolean function
973 // arguments to static template arguments.
974 template <bool... ArgsB>
975 struct FEDrawChooser
976 {
977 // Last Arg Terminator
978 static PFN_FE_WORK_FUNC GetFunc(bool bArg)
979 {
980 if (bArg)
981 {
982 return ProcessDraw<ArgsB..., true>;
983 }
984
985 return ProcessDraw<ArgsB..., false>;
986 }
987
988 // Recursively parse args
989 template <typename... TArgsT>
990 static PFN_FE_WORK_FUNC GetFunc(bool bArg, TArgsT... remainingArgs)
991 {
992 if (bArg)
993 {
994 return FEDrawChooser<ArgsB..., true>::GetFunc(remainingArgs...);
995 }
996
997 return FEDrawChooser<ArgsB..., false>::GetFunc(remainingArgs...);
998 }
999 };
1000
1001 // Selector for correct templated Draw front-end function
1002 INLINE
1003 static PFN_FE_WORK_FUNC GetFEDrawFunc(bool IsIndexed, bool HasTessellation, bool HasGeometryShader, bool HasStreamOut, bool RasterizerEnabled)
1004 {
1005 return FEDrawChooser<>::GetFunc(IsIndexed, HasTessellation, HasGeometryShader, HasStreamOut, RasterizerEnabled);
1006 }
1007
1008
1009 //////////////////////////////////////////////////////////////////////////
1010 /// @brief DrawInstanced
1011 /// @param hContext - Handle passed back from SwrCreateContext
1012 /// @param topology - Specifies topology for draw.
1013 /// @param numVerts - How many vertices to read sequentially from vertex data (per instance).
1014 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1015 /// @param numInstances - How many instances to render.
1016 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1017 void DrawInstanced(
1018 HANDLE hContext,
1019 PRIMITIVE_TOPOLOGY topology,
1020 uint32_t numVertices,
1021 uint32_t startVertex,
1022 uint32_t numInstances = 1,
1023 uint32_t startInstance = 0)
1024 {
1025 if (KNOB_TOSS_DRAW)
1026 {
1027 return;
1028 }
1029
1030 RDTSC_START(APIDraw);
1031
1032 SWR_CONTEXT *pContext = GetContext(hContext);
1033 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1034
1035 int32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
1036 uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw);
1037 int32_t remainingVerts = numVertices;
1038
1039 API_STATE *pState = &pDC->pState->state;
1040 pState->topology = topology;
1041 pState->forceFront = false;
1042
1043 // disable culling for points/lines
1044 uint32_t oldCullMode = pState->rastState.cullMode;
1045 if (topology == TOP_POINT_LIST)
1046 {
1047 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1048 pState->forceFront = true;
1049 }
1050
1051 int draw = 0;
1052 while (remainingVerts)
1053 {
1054 uint32_t numVertsForDraw = (remainingVerts < maxVertsPerDraw) ?
1055 remainingVerts : maxVertsPerDraw;
1056
1057 bool isSplitDraw = (draw > 0) ? true : false;
1058 DRAW_CONTEXT* pDC = GetDrawContext(pContext, isSplitDraw);
1059 InitDraw(pDC, isSplitDraw);
1060
1061 pDC->FeWork.type = DRAW;
1062 pDC->FeWork.pfnWork = GetFEDrawFunc(
1063 false, // IsIndexed
1064 pState->tsState.tsEnable,
1065 pState->gsState.gsEnable,
1066 pState->soState.soEnable,
1067 pDC->pState->pfnProcessPrims != nullptr);
1068 pDC->FeWork.desc.draw.numVerts = numVertsForDraw;
1069 pDC->FeWork.desc.draw.startVertex = startVertex;
1070 pDC->FeWork.desc.draw.numInstances = numInstances;
1071 pDC->FeWork.desc.draw.startInstance = startInstance;
1072 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
1073 pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw;
1074
1075 //enqueue DC
1076 QueueDraw(pContext);
1077
1078 remainingVerts -= numVertsForDraw;
1079 draw++;
1080 }
1081
1082 // restore culling state
1083 pDC = GetDrawContext(pContext);
1084 pDC->pState->state.rastState.cullMode = oldCullMode;
1085
1086 RDTSC_STOP(APIDraw, numVertices * numInstances, 0);
1087 }
1088
1089 //////////////////////////////////////////////////////////////////////////
1090 /// @brief SwrDraw
1091 /// @param hContext - Handle passed back from SwrCreateContext
1092 /// @param topology - Specifies topology for draw.
1093 /// @param startVertex - Specifies start vertex in vertex buffer for draw.
1094 /// @param primCount - Number of vertices.
1095 void SwrDraw(
1096 HANDLE hContext,
1097 PRIMITIVE_TOPOLOGY topology,
1098 uint32_t startVertex,
1099 uint32_t numVertices)
1100 {
1101 DrawInstanced(hContext, topology, numVertices, startVertex);
1102 }
1103
1104 //////////////////////////////////////////////////////////////////////////
1105 /// @brief SwrDrawInstanced
1106 /// @param hContext - Handle passed back from SwrCreateContext
1107 /// @param topology - Specifies topology for draw.
1108 /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
1109 /// @param numInstances - How many instances to render.
1110 /// @param startVertex - Specifies start vertex for draw. (vertex data)
1111 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1112 void SwrDrawInstanced(
1113 HANDLE hContext,
1114 PRIMITIVE_TOPOLOGY topology,
1115 uint32_t numVertsPerInstance,
1116 uint32_t numInstances,
1117 uint32_t startVertex,
1118 uint32_t startInstance
1119 )
1120 {
1121 DrawInstanced(hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance);
1122 }
1123
1124 //////////////////////////////////////////////////////////////////////////
1125 /// @brief DrawIndexedInstanced
1126 /// @param hContext - Handle passed back from SwrCreateContext
1127 /// @param topology - Specifies topology for draw.
1128 /// @param numIndices - Number of indices to read sequentially from index buffer.
1129 /// @param indexOffset - Starting index into index buffer.
1130 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1131 /// @param numInstances - Number of instances to render.
1132 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1133 void DrawIndexedInstance(
1134 HANDLE hContext,
1135 PRIMITIVE_TOPOLOGY topology,
1136 uint32_t numIndices,
1137 uint32_t indexOffset,
1138 int32_t baseVertex,
1139 uint32_t numInstances = 1,
1140 uint32_t startInstance = 0)
1141 {
1142 if (KNOB_TOSS_DRAW)
1143 {
1144 return;
1145 }
1146
1147 RDTSC_START(APIDrawIndexed);
1148
1149 SWR_CONTEXT *pContext = GetContext(hContext);
1150 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1151 API_STATE* pState = &pDC->pState->state;
1152
1153 int32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
1154 uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw);
1155 int32_t remainingIndices = numIndices;
1156
1157 uint32_t indexSize = 0;
1158 switch (pState->indexBuffer.format)
1159 {
1160 case R32_UINT: indexSize = sizeof(uint32_t); break;
1161 case R16_UINT: indexSize = sizeof(uint16_t); break;
1162 case R8_UINT: indexSize = sizeof(uint8_t); break;
1163 default:
1164 SWR_ASSERT(0);
1165 }
1166
1167 int draw = 0;
1168 uint8_t *pIB = (uint8_t*)pState->indexBuffer.pIndices;
1169 pIB += (uint64_t)indexOffset * (uint64_t)indexSize;
1170
1171 pState->topology = topology;
1172 pState->forceFront = false;
1173
1174 // disable culling for points/lines
1175 uint32_t oldCullMode = pState->rastState.cullMode;
1176 if (topology == TOP_POINT_LIST)
1177 {
1178 pState->rastState.cullMode = SWR_CULLMODE_NONE;
1179 pState->forceFront = true;
1180 }
1181
1182 while (remainingIndices)
1183 {
1184 uint32_t numIndicesForDraw = (remainingIndices < maxIndicesPerDraw) ?
1185 remainingIndices : maxIndicesPerDraw;
1186
1187 // When breaking up draw, we need to obtain new draw context for each iteration.
1188 bool isSplitDraw = (draw > 0) ? true : false;
1189 pDC = GetDrawContext(pContext, isSplitDraw);
1190 InitDraw(pDC, isSplitDraw);
1191
1192 pDC->FeWork.type = DRAW;
1193 pDC->FeWork.pfnWork = GetFEDrawFunc(
1194 true, // IsIndexed
1195 pState->tsState.tsEnable,
1196 pState->gsState.gsEnable,
1197 pState->soState.soEnable,
1198 pDC->pState->pfnProcessPrims != nullptr);
1199 pDC->FeWork.desc.draw.pDC = pDC;
1200 pDC->FeWork.desc.draw.numIndices = numIndicesForDraw;
1201 pDC->FeWork.desc.draw.pIB = (int*)pIB;
1202 pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format;
1203
1204 pDC->FeWork.desc.draw.numInstances = numInstances;
1205 pDC->FeWork.desc.draw.startInstance = startInstance;
1206 pDC->FeWork.desc.draw.baseVertex = baseVertex;
1207 pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
1208
1209 //enqueue DC
1210 QueueDraw(pContext);
1211
1212 pIB += maxIndicesPerDraw * indexSize;
1213 remainingIndices -= numIndicesForDraw;
1214 draw++;
1215 }
1216
1217 // restore culling state
1218 pDC = GetDrawContext(pContext);
1219 pDC->pState->state.rastState.cullMode = oldCullMode;
1220
1221 RDTSC_STOP(APIDrawIndexed, numIndices * numInstances, 0);
1222 }
1223
1224
1225 //////////////////////////////////////////////////////////////////////////
1226 /// @brief DrawIndexed
1227 /// @param hContext - Handle passed back from SwrCreateContext
1228 /// @param topology - Specifies topology for draw.
1229 /// @param numIndices - Number of indices to read sequentially from index buffer.
1230 /// @param indexOffset - Starting index into index buffer.
1231 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1232 void SwrDrawIndexed(
1233 HANDLE hContext,
1234 PRIMITIVE_TOPOLOGY topology,
1235 uint32_t numIndices,
1236 uint32_t indexOffset,
1237 int32_t baseVertex
1238 )
1239 {
1240 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex);
1241 }
1242
1243 //////////////////////////////////////////////////////////////////////////
1244 /// @brief SwrDrawIndexedInstanced
1245 /// @param hContext - Handle passed back from SwrCreateContext
1246 /// @param topology - Specifies topology for draw.
1247 /// @param numIndices - Number of indices to read sequentially from index buffer.
1248 /// @param numInstances - Number of instances to render.
1249 /// @param indexOffset - Starting index into index buffer.
1250 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
1251 /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
1252 void SwrDrawIndexedInstanced(
1253 HANDLE hContext,
1254 PRIMITIVE_TOPOLOGY topology,
1255 uint32_t numIndices,
1256 uint32_t numInstances,
1257 uint32_t indexOffset,
1258 int32_t baseVertex,
1259 uint32_t startInstance)
1260 {
1261 DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance);
1262 }
1263
1264 // Attach surfaces to pipeline
1265 void SwrInvalidateTiles(
1266 HANDLE hContext,
1267 uint32_t attachmentMask)
1268 {
1269 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1270 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1271
1272 // Queue a load to the hottile
1273 pDC->FeWork.type = INVALIDATETILES;
1274 pDC->FeWork.pfnWork = ProcessInvalidateTiles;
1275 pDC->FeWork.desc.invalidateTiles.attachmentMask = attachmentMask;
1276
1277 //enqueue
1278 QueueDraw(pContext);
1279 }
1280
1281 //////////////////////////////////////////////////////////////////////////
1282 /// @brief SwrDispatch
1283 /// @param hContext - Handle passed back from SwrCreateContext
1284 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
1285 /// @param threadGroupCountY - Number of thread groups dispatched in Y direction
1286 /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
1287 void SwrDispatch(
1288 HANDLE hContext,
1289 uint32_t threadGroupCountX,
1290 uint32_t threadGroupCountY,
1291 uint32_t threadGroupCountZ)
1292 {
1293 if (KNOB_TOSS_DRAW)
1294 {
1295 return;
1296 }
1297
1298 RDTSC_START(APIDispatch);
1299 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1300 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1301
1302 pDC->isCompute = true; // This is a compute context.
1303
1304 // Ensure spill fill pointers are initialized to nullptr.
1305 memset(pDC->pSpillFill, 0, sizeof(pDC->pSpillFill));
1306
1307 COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64);
1308
1309 pTaskData->threadGroupCountX = threadGroupCountX;
1310 pTaskData->threadGroupCountY = threadGroupCountY;
1311 pTaskData->threadGroupCountZ = threadGroupCountZ;
1312
1313 uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
1314 pDC->pDispatch->initialize(totalThreadGroups, pTaskData);
1315
1316 QueueDispatch(pContext);
1317 RDTSC_STOP(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ, 0);
1318 }
1319
1320 // Deswizzles, converts and stores current contents of the hot tiles to surface
1321 // described by pState
1322 void SwrStoreTiles(
1323 HANDLE hContext,
1324 SWR_RENDERTARGET_ATTACHMENT attachment,
1325 SWR_TILE_STATE postStoreTileState)
1326 {
1327 RDTSC_START(APIStoreTiles);
1328
1329 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1330 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1331
1332 SetupMacroTileScissors(pDC);
1333
1334 pDC->FeWork.type = STORETILES;
1335 pDC->FeWork.pfnWork = ProcessStoreTiles;
1336 pDC->FeWork.desc.storeTiles.attachment = attachment;
1337 pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState;
1338
1339 //enqueue
1340 QueueDraw(pContext);
1341
1342 RDTSC_STOP(APIStoreTiles, 0, 0);
1343 }
1344
1345 void SwrClearRenderTarget(
1346 HANDLE hContext,
1347 uint32_t clearMask,
1348 const float clearColor[4],
1349 float z,
1350 BYTE stencil)
1351 {
1352 RDTSC_START(APIClearRenderTarget);
1353
1354 SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
1355
1356 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1357
1358 SetupMacroTileScissors(pDC);
1359
1360 CLEAR_FLAGS flags;
1361 flags.mask = clearMask;
1362
1363 pDC->FeWork.type = CLEAR;
1364 pDC->FeWork.pfnWork = ProcessClear;
1365 pDC->FeWork.desc.clear.flags = flags;
1366 pDC->FeWork.desc.clear.clearDepth = z;
1367 pDC->FeWork.desc.clear.clearRTColor[0] = clearColor[0];
1368 pDC->FeWork.desc.clear.clearRTColor[1] = clearColor[1];
1369 pDC->FeWork.desc.clear.clearRTColor[2] = clearColor[2];
1370 pDC->FeWork.desc.clear.clearRTColor[3] = clearColor[3];
1371 pDC->FeWork.desc.clear.clearStencil = stencil;
1372
1373 // enqueue draw
1374 QueueDraw(pContext);
1375
1376 RDTSC_STOP(APIClearRenderTarget, 0, pDC->drawId);
1377 }
1378
1379 //////////////////////////////////////////////////////////////////////////
1380 /// @brief Returns a pointer to the private context state for the current
1381 /// draw operation. This is used for external componets such as the
1382 /// sampler.
1383 /// SWR is responsible for the allocation of the private context state.
1384 /// @param hContext - Handle passed back from SwrCreateContext
1385 VOID* SwrGetPrivateContextState(
1386 HANDLE hContext)
1387 {
1388 SWR_CONTEXT* pContext = GetContext(hContext);
1389 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1390 DRAW_STATE* pState = pDC->pState;
1391
1392 if (pState->pPrivateState == nullptr)
1393 {
1394 pState->pPrivateState = pState->pArena->AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float));
1395 }
1396
1397 return pState->pPrivateState;
1398 }
1399
1400 //////////////////////////////////////////////////////////////////////////
1401 /// @brief Clients can use this to allocate memory for draw/dispatch
1402 /// operations. The memory will automatically be freed once operation
1403 /// has completed. Client can use this to allocate binding tables,
1404 /// etc. needed for shader execution.
1405 /// @param hContext - Handle passed back from SwrCreateContext
1406 /// @param size - Size of allocation
1407 /// @param align - Alignment needed for allocation.
1408 VOID* SwrAllocDrawContextMemory(
1409 HANDLE hContext,
1410 uint32_t size,
1411 uint32_t align)
1412 {
1413 SWR_CONTEXT* pContext = GetContext(hContext);
1414 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1415
1416 return pDC->pState->pArena->AllocAligned(size, align);
1417 }
1418
1419 //////////////////////////////////////////////////////////////////////////
1420 /// @brief Returns pointer to SWR stats.
1421 /// @note The counters are atomically incremented by multiple threads.
1422 /// When calling this, you need to ensure all previous operations
1423 /// have completed.
1424 /// @todo If necessary, add a callback to avoid stalling the pipe to
1425 /// sample the counters.
1426 /// @param hContext - Handle passed back from SwrCreateContext
1427 /// @param pStats - SWR will fill this out for caller.
1428 void SwrGetStats(
1429 HANDLE hContext,
1430 SWR_STATS* pStats)
1431 {
1432 SWR_CONTEXT *pContext = GetContext(hContext);
1433 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1434
1435 pDC->FeWork.type = QUERYSTATS;
1436 pDC->FeWork.pfnWork = ProcessQueryStats;
1437 pDC->FeWork.desc.queryStats.pStats = pStats;
1438
1439 // cannot execute until all previous draws have completed
1440 pDC->dependency = pDC->drawId - 1;
1441
1442 //enqueue
1443 QueueDraw(pContext);
1444 }
1445
1446 //////////////////////////////////////////////////////////////////////////
1447 /// @brief Enables stats counting
1448 /// @param hContext - Handle passed back from SwrCreateContext
1449 /// @param enable - If true then counts are incremented.
1450 void SwrEnableStats(
1451 HANDLE hContext,
1452 bool enable)
1453 {
1454 SWR_CONTEXT *pContext = GetContext(hContext);
1455 DRAW_CONTEXT* pDC = GetDrawContext(pContext);
1456
1457 pDC->pState->state.enableStats = enable;
1458 }
1459
1460 //////////////////////////////////////////////////////////////////////////
1461 /// @brief Mark end of frame - used for performance profiling
1462 /// @param hContext - Handle passed back from SwrCreateContext
1463 void SWR_API SwrEndFrame(
1464 HANDLE hContext)
1465 {
1466 RDTSC_ENDFRAME();
1467 }