1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT
26 * The SWR_CONTEXT is our global context and contains the DC ring,
29 * The DRAW_CONTEXT contains all state associated with a draw operation.
31 ******************************************************************************/
34 #include <condition_variable>
38 #include "core/utils.h"
39 #include "core/arena.h"
40 #include "core/fifo.hpp"
41 #include "core/knobs.h"
42 #include "common/intrin.h"
43 #include "common/rdtsc_buckets.h"
44 #include "core/threads.h"
45 #include "ringbuffer.h"
46 #include "archrast/archrast.h"
// Fixed-point conversion constants. In each pair the scale equals
// 1 << shift (256 == 1 << 8, 65536 == 1 << 16), so the two values must be
// kept in sync if either is changed.
48 // x.8 fixed point precision values
49 #define FIXED_POINT_SHIFT 8
50 #define FIXED_POINT_SCALE 256
52 // x.16 fixed point precision values
53 #define FIXED_POINT16_SHIFT 16
54 #define FIXED_POINT16_SCALE 65536
// Bit-packed flags followed by two full-width 32-bit indices.
// NOTE(review): the enclosing declaration and at least one member line are
// not visible in this excerpt.
61 uint32_t frontFacing
: 1;
// Width is SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM bits -- presumably one bit per
// pixel of a SIMD tile; TODO confirm.
63 uint32_t coverageMask
: (SIMD_TILE_X_DIM
* SIMD_TILE_Y_DIM
);
// Pads the bitfield group out to 32 bits. One "- 1" term accounts for
// frontFacing; the other presumably accounts for a second 1-bit flag declared
// on a line not shown here -- TODO confirm.
64 uint32_t reserved
: 32 - 1 - 1 - (SIMD_TILE_X_DIM
* SIMD_TILE_Y_DIM
);
66 uint32_t renderTargetArrayIndex
;
67 uint32_t viewportIndex
;
70 //////////////////////////////////////////////////////////////////////////
72 /////////////////////////////////////////////////////////////////////////
73 struct SWR_TRIANGLE_DESC
85 float* pUserClipBuffer
;
87 uint64_t coverageMask
[SWR_MAX_NUM_MULTISAMPLES
];
88 uint64_t innerCoverageMask
; // Conservative rasterization inner coverage: marked covered if
89 // entire pixel is covered
90 uint64_t anyCoveredSamples
;
95 struct TRIANGLE_WORK_DESC
99 float* pUserClipBuffer
;
107 uint32_t attachmentMask
;
108 uint32_t renderTargetArrayIndex
;
109 float clearRTColor
[4]; // RGBA_32F
110 float clearDepth
; // [0..1]
111 uint8_t clearStencil
;
114 struct DISCARD_INVALIDATE_TILES_DESC
116 uint32_t attachmentMask
;
118 SWR_TILE_STATE newTileState
;
125 PFN_CALLBACK_FUNC pfnCallbackFunc
;
131 struct STORE_TILES_DESC
133 uint32_t attachmentMask
;
134 SWR_TILE_STATE postStoreTileState
;
140 uint32_t threadGroupCountX
;
141 uint32_t threadGroupCountY
;
142 uint32_t threadGroupCountZ
;
143 bool enableThreadDispatch
;
146 typedef void (*PFN_WORK_FUNC
)(DRAW_CONTEXT
* pDC
,
156 DISCARDINVALIDATETILES
,
161 OSALIGNSIMD(struct) BE_WORK
164 PFN_WORK_FUNC pfnWork
;
168 TRIANGLE_WORK_DESC tri
;
170 DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles
;
171 STORE_TILES_DESC storeTiles
;
180 uint32_t numIndices
; // DrawIndexed: Number of indices for draw.
181 uint32_t numVerts
; // Draw: Number of verts (triangles, lines, etc)
185 gfxptr_t xpIB
; // DrawIndexed: App supplied int32 indices
186 uint32_t startVertex
; // Draw: Starting vertex in VB to render from.
189 uint32_t numInstances
; // Number of instances
190 uint32_t startInstance
; // Instance offset
191 uint32_t startPrimID
; // starting primitiveID for this draw batch
193 startVertexID
; // starting VertexID for this draw batch (only needed for non-indexed draws)
194 SWR_FORMAT type
; // index buffer type
197 typedef void (*PFN_FE_WORK_FUNC
)(SWR_CONTEXT
* pContext
,
204 PFN_FE_WORK_FUNC pfnWork
;
210 DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles
;
211 STORE_TILES_DESC storeTiles
;
217 float left
[KNOB_NUM_VIEWPORTS_SCISSORS
];
218 float right
[KNOB_NUM_VIEWPORTS_SCISSORS
];
219 float top
[KNOB_NUM_VIEWPORTS_SCISSORS
];
220 float bottom
[KNOB_NUM_VIEWPORTS_SCISSORS
];
225 // function signature for pipeline stages that execute after primitive assembly
226 typedef void (*PFN_PROCESS_PRIMS
)(DRAW_CONTEXT
* pDC
,
231 simdscalari
const& primID
,
232 simdscalari
const& viewportIdx
,
233 simdscalari
const& rtIdx
);
235 // function signature for SIMD16 pipeline stages that execute after primitive assembly
236 typedef void(SIMDCALL
* PFN_PROCESS_PRIMS_SIMD16
)(DRAW_CONTEXT
* pDC
,
239 simd16vector prims
[],
241 simd16scalari
const& primID
,
242 simd16scalari
const& viewportIdx
,
243 simd16scalari
const& rtIdx
);
245 OSALIGNLINE(struct) API_STATE
248 SWR_VERTEX_BUFFER_STATE vertexBuffers
[KNOB_NUM_STREAMS
];
250 // GS - Geometry Shader State
251 SWR_GS_STATE gsState
;
252 PFN_GS_FUNC pfnGsFunc
;
254 // FS - Fetch Shader State
255 PFN_FETCH_FUNC pfnFetchFunc
;
257 // VS - Vertex Shader State
258 PFN_VERTEX_FUNC pfnVertexFunc
;
261 SWR_INDEX_BUFFER_STATE indexBuffer
;
263 // CS - Compute Shader
264 PFN_CS_FUNC pfnCsFunc
;
265 uint32_t totalThreadsInGroup
;
266 uint32_t totalSpillFillSize
;
267 uint32_t scratchSpaceSizePerWarp
;
268 uint32_t scratchSpaceNumWarps
;
270 // FE - Frontend State
271 SWR_FRONTEND_STATE frontendState
;
273 // SOS - Streamout Shader State
274 PFN_SO_FUNC pfnSoFunc
[MAX_SO_STREAMS
];
277 SWR_STREAMOUT_STATE soState
;
278 mutable SWR_STREAMOUT_BUFFER soBuffer
[MAX_SO_STREAMS
];
280 // Tessellation State
281 PFN_HS_FUNC pfnHsFunc
;
282 PFN_DS_FUNC pfnDsFunc
;
283 SWR_TS_STATE tsState
;
285 // Number of attributes used by the frontend (vs, so, gs)
286 uint32_t feNumAttributes
;
288 // RS - Rasterizer State
289 SWR_RASTSTATE rastState
;
290 // floating point multisample offsets
291 float samplePos
[SWR_MAX_NUM_MULTISAMPLES
* 2];
295 SWR_VIEWPORT vp
[KNOB_NUM_VIEWPORTS_SCISSORS
];
296 SWR_VIEWPORT_MATRICES vpMatrices
;
298 SWR_RECT scissorRects
[KNOB_NUM_VIEWPORTS_SCISSORS
];
299 SWR_RECT scissorsInFixedPoint
[KNOB_NUM_VIEWPORTS_SCISSORS
];
300 bool scissorsTileAligned
;
303 PRIMITIVE_TOPOLOGY topology
;
307 OSALIGNLINE(SWR_BACKEND_STATE
) backendState
;
309 SWR_DEPTH_BOUNDS_STATE depthBoundsState
;
311 // PS - Pixel shader state
312 SWR_PS_STATE psState
;
314 SWR_DEPTH_STENCIL_STATE depthStencilState
;
316 // OM - Output Merger State
317 SWR_BLEND_STATE blendState
;
318 PFN_BLEND_JIT_FUNC pfnBlendFunc
[SWR_NUM_RENDERTARGETS
];
322 uint32_t enableStatsFE
: 1; // Enable frontend pipeline stats
323 uint32_t enableStatsBE
: 1; // Enable backend pipeline stats
324 uint32_t colorHottileEnable
: 8; // Bitmask of enabled color hottiles
325 uint32_t depthHottileEnable
: 1; // Enable depth buffer hottile
326 uint32_t stencilHottileEnable
: 1; // Enable stencil buffer hottile
329 PFN_QUANTIZE_DEPTH pfnQuantizeDepth
;
336 struct RenderOutputBuffers
338 uint8_t* pColor
[SWR_NUM_RENDERTARGETS
];
342 HOTTILE
* pColorHotTile
[SWR_NUM_RENDERTARGETS
];
343 HOTTILE
* pDepthHotTile
;
344 HOTTILE
* pStencilHotTile
;
347 // Plane equation A/B/C coeffs used to evaluate I/J barycentric coords
348 struct BarycentricCoeffs
362 simdscalar vRecipDet
;
364 simdscalar vAOneOverW
;
365 simdscalar vBOneOverW
;
366 simdscalar vCOneOverW
;
369 // pipeline function pointer types
370 typedef void (*PFN_BACKEND_FUNC
)(
371 DRAW_CONTEXT
*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC
&, RenderOutputBuffers
&);
372 typedef void (*PFN_OUTPUT_MERGER
)(SWR_PS_CONTEXT
&,
373 uint8_t* (&)[SWR_NUM_RENDERTARGETS
],
375 const SWR_BLEND_STATE
*,
376 const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS
],
379 typedef void (*PFN_CALC_PIXEL_BARYCENTRICS
)(const BarycentricCoeffs
&, SWR_PS_CONTEXT
&);
380 typedef void (*PFN_CALC_SAMPLE_BARYCENTRICS
)(const BarycentricCoeffs
&, SWR_PS_CONTEXT
&);
381 typedef void (*PFN_CALC_CENTROID_BARYCENTRICS
)(const BarycentricCoeffs
&,
383 const uint64_t* const,
390 PFN_BACKEND_FUNC pfnBackend
;
398 void* pPrivateState
; // The driver is required to set this up for each draw.
400 // pipeline function pointers, filled in by API thread when setting up the draw
401 BACKEND_FUNCS backendFuncs
;
402 PFN_PROCESS_PRIMS pfnProcessPrims
;
// The SIMD16 entry point below is only declared when USE_SIMD16_FRONTEND is
// enabled. NOTE(review): the matching #endif is not visible in this excerpt.
403 #if USE_SIMD16_FRONTEND
404 PFN_PROCESS_PRIMS_SIMD16 pfnProcessPrims_simd16
;
407 CachingArena
* pArena
; // This should only be used by API thread.
// State cleared by Reset(): stream-output write offsets, their dirty flags,
// and the frontend statistics block.
// NOTE(review): the struct braces and the pStats member declaration are not
// visible in this excerpt.
410 struct DRAW_DYNAMIC_STATE
// Zeroes this object, then zeroes the stats array that pStats points to.
// numThreads: number of SWR_STATS entries at pStats to clear.
412 void Reset(uint32_t numThreads
)
// Capture pStats first: the memset of *this below wipes every member,
// including the pointer itself.
414 SWR_STATS
* pSavePtr
= pStats
;
415 memset(this, 0, sizeof(*this));
// NOTE(review): the statement restoring pStats from pSavePtr is not visible
// in this excerpt. Without it, the memset below would be handed the pointer
// that was just zeroed -- confirm against the full file.
417 memset(pStats
, 0, sizeof(SWR_STATS
) * numThreads
);
419 ///@todo Currently assumes only a single FE can do stream output for a draw.
420 uint32_t SoWriteOffset
[4];
421 bool SoWriteOffsetDirty
[4];
423 SWR_STATS_FE statsFE
; // Only one FE thread per DC.
428 // The api thread sets up a draw context that exists for the life of the draw.
429 // This draw context maintains all of the state needed for the draw operation.
432 SWR_CONTEXT
* pContext
;
435 MacroTileMgr
* pTileMgr
;
436 DispatchQueue
* pDispatch
; // Queue for thread groups. (isCompute)
438 DRAW_STATE
* pState
; // Read-only state. Core should not update this outside of API thread.
439 CachingArena
* pArena
;
442 bool dependentFE
; // Frontend work is dependent on all previous FE
443 bool dependent
; // Backend work is dependent on all previous BE
444 bool isCompute
; // Is this DC a compute context?
445 bool cleanupState
; // True if this is the last draw using an entry in the state ring.
449 SYNC_DESC retireCallback
; // Call this func when this DC is retired.
451 DRAW_DYNAMIC_STATE dynState
;
// Progress/lock fields, each declared volatile and wrapped in OSALIGNLINE
// (presumably cache-line alignment -- TODO confirm the macro definition).
// NOTE(review): volatile by itself provides neither atomicity nor ordering;
// presumably these are only modified through interlocked/atomic intrinsics
// elsewhere -- verify against the code that reads and writes them.
453 volatile OSALIGNLINE(bool) doneFE
; // Is FE work done for this draw?
454 volatile OSALIGNLINE(uint32_t) FeLock
;
455 volatile OSALIGNLINE(uint32_t) threadsDone
;
// Compile-time check that sizeof(DRAW_CONTEXT) is a multiple of 64 bytes
// (the low six bits of the size must be zero).
458 static_assert((sizeof(DRAW_CONTEXT
) & 63) == 0, "Invalid size for DRAW_CONTEXT");
/// @brief Returns the API state held by the draw state that a draw context
///        points to.
/// @param pDC Draw context. Both pDC and pDC->pState are checked with
///            SWR_ASSERT to be non-null.
/// @return Const reference to pDC->pState->state.
460 INLINE
const API_STATE
& GetApiState(const DRAW_CONTEXT
* pDC
)
462 SWR_ASSERT(pDC
!= nullptr);
463 SWR_ASSERT(pDC
->pState
!= nullptr);
465 return pDC
->pState
->state
;
/// @brief Returns the private-state pointer stored in the draw state that a
///        draw context points to.
/// @param pDC Draw context. Both pDC and pDC->pState are checked with
///            SWR_ASSERT to be non-null.
/// @return pDC->pState->pPrivateState, returned as-is (no null check is made
///         on the private-state pointer itself).
468 INLINE
void* GetPrivateState(const DRAW_CONTEXT
* pDC
)
470 SWR_ASSERT(pDC
!= nullptr);
471 SWR_ASSERT(pDC
->pState
!= nullptr);
473 return pDC
->pState
->pPrivateState
;
481 // Each draw needs its own state in order to support multiple draws in flight across multiple
482 // threads. We maintain N draw contexts configured as a ring. The size of the ring limits the
483 // maximum number of draws that can be in flight at any given time.
486 // 1. State - When an application first sets state we'll request a new draw context to use.
487 // a. If there are no available draw contexts then we'll have to wait until one becomes
488 // free. b. If one is available then set pCurDrawContext to point to it and mark it in use.
489 // c. All state calls set state on pCurDrawContext.
490 // 2. Draw - Creates and submits a work item that is associated with the current draw context.
491 // a. Set pPrevDrawContext = pCurDrawContext
492 // b. Set pCurDrawContext to NULL.
493 // 3. State - When an application sets state after a draw
494 // a. Same as step 1.
495 // b. State is copied from prev draw context to current.
496 RingBuffer
<DRAW_CONTEXT
> dcRing
;
498 DRAW_CONTEXT
* pCurDrawContext
; // This points to DC entry in ring for an unsubmitted draw.
499 DRAW_CONTEXT
* pPrevDrawContext
; // This points to DC entry for the previous context submitted
500 // that we can copy state from.
502 MacroTileMgr
* pMacroTileManagerArray
;
503 DispatchQueue
* pDispatchQueueArray
;
506 // When draws are very large (lots of primitives), the API thread will break them up.
507 // These split draws all have identical state. So instead of storing the state directly
508 // in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs
509 // to reference a single entry in the DS ring.
510 RingBuffer
<DRAW_STATE
> dsRing
;
512 uint32_t curStateId
; // Current index to the next available entry in the DS ring.
514 uint32_t NumWorkerThreads
;
515 uint32_t NumFEThreads
;
516 uint32_t NumBEThreads
;
518 THREAD_POOL threadPool
; // Thread pool associated with this context
519 SWR_THREADING_INFO threadInfo
;
520 SWR_API_THREADING_INFO apiThreadInfo
;
521 SWR_WORKER_PRIVATE_STATE workerPrivateState
;
523 uint32_t MAX_DRAWS_IN_FLIGHT
;
525 std::condition_variable FifosNotEmpty
;
528 uint32_t privateStateSize
;
530 HotTileMgr
* pHotTileMgr
;
532 // Callback functions, passed in at create context time
533 PFN_LOAD_TILE pfnLoadTile
;
534 PFN_STORE_TILE pfnStoreTile
;
535 PFN_TRANSLATE_GFXPTR_FOR_READ pfnTranslateGfxptrForRead
;
536 PFN_TRANSLATE_GFXPTR_FOR_WRITE pfnTranslateGfxptrForWrite
;
537 PFN_MAKE_GFXPTR pfnMakeGfxPtr
;
538 PFN_CREATE_MEMORY_CONTEXT pfnCreateMemoryContext
;
539 PFN_DESTROY_MEMORY_CONTEXT pfnDestroyMemoryContext
;
540 PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset
;
541 PFN_UPDATE_STATS pfnUpdateStats
;
542 PFN_UPDATE_STATS_FE pfnUpdateStatsFE
;
548 // Scratch space for workers.
551 volatile OSALIGNLINE(uint32_t) drawsOutstandingFE
;
553 OSALIGNLINE(CachingAllocator
) cachingArenaAllocator
;
556 uint32_t lastFrameChecked
;
557 uint64_t lastDrawChecked
;
558 TileSet
* pSingleThreadLockedTiles
;
560 // ArchRast thread contexts.
563 // handle to external memory for worker data to create memory contexts
564 HANDLE hExternalMemory
;
566 BucketManager
*pBucketMgr
;
// Conditional stat-update helpers. Each adds `count` to the counter `name`
// only when the matching flag in the API state is set (enableStatsBE for
// UPDATE_STAT_BE, enableStatsFE for UPDATE_STAT_FE). UPDATE_STAT_BE writes to
// pDC->dynState.pStats[workerId]; UPDATE_STAT_FE writes to
// pDC->dynState.statsFE. Both expect `pDC` to be in scope at the expansion
// site, and UPDATE_STAT_BE additionally expects `workerId`.
// NOTE(review): some continuation lines of these macros are not visible in
// this excerpt.
569 #define UPDATE_STAT_BE(name, count) \
570 if (GetApiState(pDC).enableStatsBE) \
572 pDC->dynState.pStats[workerId].name += count; \
574 #define UPDATE_STAT_FE(name, count) \
575 if (GetApiState(pDC).enableStatsFE) \
577 pDC->dynState.statsFE.name += count; \
580 // ArchRast instrumentation framework
// AR_WORKER_CTX selects the pArContext slot indexed by `workerId`; it expects
// `pDC` and `workerId` to be in scope where it is expanded.
581 #define AR_WORKER_CTX pDC->pContext->pArContext[workerId]
// AR_API_CTX selects the slot at index NumWorkerThreads. Note that it reads
// both `pDC` and a separate `pContext`, so both must be in scope where it is
// expanded.
582 #define AR_API_CTX pDC->pContext->pArContext[pContext->NumWorkerThreads]
// RDTSC timing wrappers. When KNOB_ENABLE_RDTSC is defined, RDTSC_BEGIN
// forwards to RDTSC_START and discards `drawid`; RDTSC_END forwards to
// RDTSC_STOP with a fixed final argument of 0. The second pair of definitions
// expands to nothing.
// NOTE(review): the #else/#endif lines separating the two pairs are not
// visible in this excerpt.
584 #ifdef KNOB_ENABLE_RDTSC
585 #define RDTSC_BEGIN(pBucketMgr, type, drawid) RDTSC_START(pBucketMgr, type)
586 #define RDTSC_END(pBucketMgr, type, count) RDTSC_STOP(pBucketMgr, type, count, 0)
588 #define RDTSC_BEGIN(pBucketMgr, type, drawid)
589 #define RDTSC_END(pBucketMgr, type, count)
// ArchRast event wrappers. When KNOB_ENABLE_AR is defined, _AR_EVENT prefixes
// `event` with ArchRast:: before dispatching, so callers pass the event
// expression without the namespace. The second pair expands to nothing.
// NOTE(review): the #else/#endif lines are likewise not visible here.
592 #ifdef KNOB_ENABLE_AR
593 #define _AR_EVENT(ctx, event) ArchRast::Dispatch(ctx, ArchRast::event)
594 #define _AR_FLUSH(ctx, id) ArchRast::FlushDraw(ctx, id)
596 #define _AR_EVENT(ctx, event)
597 #define _AR_FLUSH(ctx, id)
600 // Use these macros for api thread.
601 #define AR_API_EVENT(event) _AR_EVENT(AR_API_CTX, event)
603 // Use these macros for worker threads.
604 #define AR_EVENT(event) _AR_EVENT(AR_WORKER_CTX, event)
605 #define AR_FLUSH(id) _AR_FLUSH(AR_WORKER_CTX, id)