swr/rasterizer: modernize thread TLB
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / context.h
1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file context.h
24 *
25 * @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT
26 * The SWR_CONTEXT is our global context and contains the DC ring,
27 * thread state, etc.
28 *
29 * The DRAW_CONTEXT contains all state associated with a draw operation.
30 *
31 ******************************************************************************/
32 #pragma once
33
34 #include <condition_variable>
35 #include <algorithm>
36
37 #include "core/api.h"
38 #include "core/utils.h"
39 #include "core/arena.h"
40 #include "core/fifo.hpp"
41 #include "core/knobs.h"
42 #include "common/intrin.h"
43 #include "common/rdtsc_buckets.h"
44 #include "core/threads.h"
45 #include "ringbuffer.h"
46 #include "archrast/archrast.h"
47
// x.8 fixed-point format: 8 fractional bits, so the scale factor is 2^8 (256).
#define FIXED_POINT_SHIFT 8
#define FIXED_POINT_SCALE (1 << FIXED_POINT_SHIFT)

// x.16 fixed-point format: 16 fractional bits, so the scale factor is 2^16 (65536).
#define FIXED_POINT16_SHIFT 16
#define FIXED_POINT16_SCALE (1 << FIXED_POINT16_SHIFT)

// Forward declarations so the types can be referred to by pointer before
// their full definitions appear.
struct SWR_CONTEXT;
struct DRAW_CONTEXT;
58
59 struct TRI_FLAGS
60 {
61 uint32_t frontFacing : 1;
62 uint32_t yMajor : 1;
63 uint32_t coverageMask : (SIMD_TILE_X_DIM* SIMD_TILE_Y_DIM);
64 uint32_t reserved : 32 - 1 - 1 - (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM);
65 float pointSize;
66 uint32_t renderTargetArrayIndex;
67 uint32_t viewportIndex;
68 };
69
70 //////////////////////////////////////////////////////////////////////////
71 /// SWR_TRIANGLE_DESC
72 /////////////////////////////////////////////////////////////////////////
73 struct SWR_TRIANGLE_DESC
74 {
75 float I[3];
76 float J[3];
77 float Z[3];
78 float OneOverW[3];
79 float recipDet;
80
81 float* pRecipW;
82 float* pAttribs;
83 float* pPerspAttribs;
84 float* pSamplePos;
85 float* pUserClipBuffer;
86
87 uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES];
88 uint64_t innerCoverageMask; // Conservative rasterization inner coverage: marked covered if
89 // entire pixel is covered
90 uint64_t anyCoveredSamples;
91
92 TRI_FLAGS triFlags;
93 };
94
95 struct TRIANGLE_WORK_DESC
96 {
97 float* pTriBuffer;
98 float* pAttribs;
99 float* pUserClipBuffer;
100 uint32_t numAttribs;
101 TRI_FLAGS triFlags;
102 };
103
104 struct CLEAR_DESC
105 {
106 SWR_RECT rect;
107 uint32_t attachmentMask;
108 uint32_t renderTargetArrayIndex;
109 float clearRTColor[4]; // RGBA_32F
110 float clearDepth; // [0..1]
111 uint8_t clearStencil;
112 };
113
114 struct DISCARD_INVALIDATE_TILES_DESC
115 {
116 uint32_t attachmentMask;
117 SWR_RECT rect;
118 SWR_TILE_STATE newTileState;
119 bool createNewTiles;
120 bool fullTilesOnly;
121 };
122
123 struct SYNC_DESC
124 {
125 PFN_CALLBACK_FUNC pfnCallbackFunc;
126 uint64_t userData;
127 uint64_t userData2;
128 uint64_t userData3;
129 };
130
131 struct STORE_TILES_DESC
132 {
133 uint32_t attachmentMask;
134 SWR_TILE_STATE postStoreTileState;
135 SWR_RECT rect;
136 };
137
// Descriptor for a compute dispatch: thread-group counts along X, Y and Z
// plus a flag enabling thread dispatch.
struct COMPUTE_DESC
{
    uint32_t threadGroupCountX;
    uint32_t threadGroupCountY;
    uint32_t threadGroupCountZ;
    bool     enableThreadDispatch;
};
145
146 typedef void (*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC,
147 uint32_t workerId,
148 uint32_t macroTile,
149 void* pDesc);
150
// Tag identifying which kind of work an FE_WORK / BE_WORK item carries.
// Values are the same consecutive numbering the compiler would assign
// implicitly, spelled out for readability.
enum WORK_TYPE
{
    SYNC                   = 0,
    DRAW                   = 1,
    CLEAR                  = 2,
    DISCARDINVALIDATETILES = 3,
    STORETILES             = 4,
    SHUTDOWN               = 5,
};
160
161 OSALIGNSIMD(struct) BE_WORK
162 {
163 WORK_TYPE type;
164 PFN_WORK_FUNC pfnWork;
165 union
166 {
167 SYNC_DESC sync;
168 TRIANGLE_WORK_DESC tri;
169 CLEAR_DESC clear;
170 DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
171 STORE_TILES_DESC storeTiles;
172 } desc;
173 };
174
175 struct DRAW_WORK
176 {
177 DRAW_CONTEXT* pDC;
178 union
179 {
180 uint32_t numIndices; // DrawIndexed: Number of indices for draw.
181 uint32_t numVerts; // Draw: Number of verts (triangles, lines, etc)
182 };
183 union
184 {
185 gfxptr_t xpIB; // DrawIndexed: App supplied int32 indices
186 uint32_t startVertex; // Draw: Starting vertex in VB to render from.
187 };
188 int32_t baseVertex;
189 uint32_t numInstances; // Number of instances
190 uint32_t startInstance; // Instance offset
191 uint32_t startPrimID; // starting primitiveID for this draw batch
192 uint32_t
193 startVertexID; // starting VertexID for this draw batch (only needed for non-indexed draws)
194 SWR_FORMAT type; // index buffer type
195 };
196
197 typedef void (*PFN_FE_WORK_FUNC)(SWR_CONTEXT* pContext,
198 DRAW_CONTEXT* pDC,
199 uint32_t workerId,
200 void* pDesc);
201 struct FE_WORK
202 {
203 WORK_TYPE type;
204 PFN_FE_WORK_FUNC pfnWork;
205 union
206 {
207 SYNC_DESC sync;
208 DRAW_WORK draw;
209 CLEAR_DESC clear;
210 DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
211 STORE_TILES_DESC storeTiles;
212 } desc;
213 };
214
215 struct GUARDBANDS
216 {
217 float left[KNOB_NUM_VIEWPORTS_SCISSORS];
218 float right[KNOB_NUM_VIEWPORTS_SCISSORS];
219 float top[KNOB_NUM_VIEWPORTS_SCISSORS];
220 float bottom[KNOB_NUM_VIEWPORTS_SCISSORS];
221 };
222
223 struct PA_STATE;
224
225 // function signature for pipeline stages that execute after primitive assembly
226 typedef void (*PFN_PROCESS_PRIMS)(DRAW_CONTEXT* pDC,
227 PA_STATE& pa,
228 uint32_t workerId,
229 simdvector prims[],
230 uint32_t primMask,
231 simdscalari const& primID,
232 simdscalari const& viewportIdx,
233 simdscalari const& rtIdx);
234
235 // function signature for pipeline stages that execute after primitive assembly
236 typedef void(SIMDCALL* PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT* pDC,
237 PA_STATE& pa,
238 uint32_t workerId,
239 simd16vector prims[],
240 uint32_t primMask,
241 simd16scalari const& primID,
242 simd16scalari const& viewportIdx,
243 simd16scalari const& rtIdx);
244
245 OSALIGNLINE(struct) API_STATE
246 {
247 // Vertex Buffers
248 SWR_VERTEX_BUFFER_STATE vertexBuffers[KNOB_NUM_STREAMS];
249
250 // GS - Geometry Shader State
251 SWR_GS_STATE gsState;
252 PFN_GS_FUNC pfnGsFunc;
253
254 // FS - Fetch Shader State
255 PFN_FETCH_FUNC pfnFetchFunc;
256
257 // VS - Vertex Shader State
258 PFN_VERTEX_FUNC pfnVertexFunc;
259
260 // Index Buffer
261 SWR_INDEX_BUFFER_STATE indexBuffer;
262
263 // CS - Compute Shader
264 PFN_CS_FUNC pfnCsFunc;
265 uint32_t totalThreadsInGroup;
266 uint32_t totalSpillFillSize;
267 uint32_t scratchSpaceSizePerWarp;
268 uint32_t scratchSpaceNumWarps;
269
270 // FE - Frontend State
271 SWR_FRONTEND_STATE frontendState;
272
273 // SOS - Streamout Shader State
274 PFN_SO_FUNC pfnSoFunc[MAX_SO_STREAMS];
275
276 // Streamout state
277 SWR_STREAMOUT_STATE soState;
278 mutable SWR_STREAMOUT_BUFFER soBuffer[MAX_SO_STREAMS];
279
280 // Tessellation State
281 PFN_HS_FUNC pfnHsFunc;
282 PFN_DS_FUNC pfnDsFunc;
283 SWR_TS_STATE tsState;
284
285 // Number of attributes used by the frontend (vs, so, gs)
286 uint32_t feNumAttributes;
287
288 // RS - Rasterizer State
289 SWR_RASTSTATE rastState;
290 // floating point multisample offsets
291 float samplePos[SWR_MAX_NUM_MULTISAMPLES * 2];
292
293 GUARDBANDS gbState;
294
295 SWR_VIEWPORT vp[KNOB_NUM_VIEWPORTS_SCISSORS];
296 SWR_VIEWPORT_MATRICES vpMatrices;
297
298 SWR_RECT scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS];
299 SWR_RECT scissorsInFixedPoint[KNOB_NUM_VIEWPORTS_SCISSORS];
300 bool scissorsTileAligned;
301
302 bool forceFront;
303 PRIMITIVE_TOPOLOGY topology;
304
305
306 // Backend state
307 OSALIGNLINE(SWR_BACKEND_STATE) backendState;
308
309 SWR_DEPTH_BOUNDS_STATE depthBoundsState;
310
311 // PS - Pixel shader state
312 SWR_PS_STATE psState;
313
314 SWR_DEPTH_STENCIL_STATE depthStencilState;
315
316 // OM - Output Merger State
317 SWR_BLEND_STATE blendState;
318 PFN_BLEND_JIT_FUNC pfnBlendFunc[SWR_NUM_RENDERTARGETS];
319
320 struct
321 {
322 uint32_t enableStatsFE : 1; // Enable frontend pipeline stats
323 uint32_t enableStatsBE : 1; // Enable backend pipeline stats
324 uint32_t colorHottileEnable : 8; // Bitmask of enabled color hottiles
325 uint32_t depthHottileEnable : 1; // Enable depth buffer hottile
326 uint32_t stencilHottileEnable : 1; // Enable stencil buffer hottile
327 };
328
329 PFN_QUANTIZE_DEPTH pfnQuantizeDepth;
330 };
331
332 class MacroTileMgr;
333 class DispatchQueue;
334 class HOTTILE;
335
336 struct RenderOutputBuffers
337 {
338 uint8_t* pColor[SWR_NUM_RENDERTARGETS];
339 uint8_t* pDepth;
340 uint8_t* pStencil;
341
342 HOTTILE* pColorHotTile[SWR_NUM_RENDERTARGETS];
343 HOTTILE* pDepthHotTile;
344 HOTTILE* pStencilHotTile;
345 };
346
347 // Plane equation A/B/C coeffs used to evaluate I/J barycentric coords
348 struct BarycentricCoeffs
349 {
350 simdscalar vIa;
351 simdscalar vIb;
352 simdscalar vIc;
353
354 simdscalar vJa;
355 simdscalar vJb;
356 simdscalar vJc;
357
358 simdscalar vZa;
359 simdscalar vZb;
360 simdscalar vZc;
361
362 simdscalar vRecipDet;
363
364 simdscalar vAOneOverW;
365 simdscalar vBOneOverW;
366 simdscalar vCOneOverW;
367 };
368
369 // pipeline function pointer types
370 typedef void (*PFN_BACKEND_FUNC)(
371 DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&);
372 typedef void (*PFN_OUTPUT_MERGER)(SWR_PS_CONTEXT&,
373 uint8_t* (&)[SWR_NUM_RENDERTARGETS],
374 uint32_t,
375 const SWR_BLEND_STATE*,
376 const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS],
377 simdscalar&,
378 simdscalar const&);
379 typedef void (*PFN_CALC_PIXEL_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
380 typedef void (*PFN_CALC_SAMPLE_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
381 typedef void (*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&,
382 SWR_PS_CONTEXT&,
383 const uint64_t* const,
384 const uint32_t,
385 simdscalar const&,
386 simdscalar const&);
387
388 struct BACKEND_FUNCS
389 {
390 PFN_BACKEND_FUNC pfnBackend;
391 };
392
393 // Draw State
394 struct DRAW_STATE
395 {
396 API_STATE state;
397
398 void* pPrivateState; // Its required the driver sets this up for each draw.
399
400 // pipeline function pointers, filled in by API thread when setting up the draw
401 BACKEND_FUNCS backendFuncs;
402 PFN_PROCESS_PRIMS pfnProcessPrims;
403 #if USE_SIMD16_FRONTEND
404 PFN_PROCESS_PRIMS_SIMD16 pfnProcessPrims_simd16;
405 #endif
406
407 CachingArena* pArena; // This should only be used by API thread.
408 };
409
410 struct DRAW_DYNAMIC_STATE
411 {
412 void Reset(uint32_t numThreads)
413 {
414 SWR_STATS* pSavePtr = pStats;
415 memset(this, 0, sizeof(*this));
416 pStats = pSavePtr;
417 memset(pStats, 0, sizeof(SWR_STATS) * numThreads);
418 }
419 ///@todo Currently assumes only a single FE can do stream output for a draw.
420 uint32_t SoWriteOffset[4];
421 bool SoWriteOffsetDirty[4];
422
423 SWR_STATS_FE statsFE; // Only one FE thread per DC.
424 SWR_STATS* pStats;
425 };
426
427 // Draw Context
428 // The api thread sets up a draw context that exists for the life of the draw.
429 // This draw context maintains all of the state needed for the draw operation.
430 struct DRAW_CONTEXT
431 {
432 SWR_CONTEXT* pContext;
433 union
434 {
435 MacroTileMgr* pTileMgr;
436 DispatchQueue* pDispatch; // Queue for thread groups. (isCompute)
437 };
438 DRAW_STATE* pState; // Read-only state. Core should not update this outside of API thread.
439 CachingArena* pArena;
440
441 uint32_t drawId;
442 bool dependentFE; // Frontend work is dependent on all previous FE
443 bool dependent; // Backend work is dependent on all previous BE
444 bool isCompute; // Is this DC a compute context?
445 bool cleanupState; // True if this is the last draw using an entry in the state ring.
446
447 FE_WORK FeWork;
448
449 SYNC_DESC retireCallback; // Call this func when this DC is retired.
450
451 DRAW_DYNAMIC_STATE dynState;
452
453 volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw?
454 volatile OSALIGNLINE(uint32_t) FeLock;
455 volatile OSALIGNLINE(uint32_t) threadsDone;
456 };
457
458 static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT");
459
460 INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC)
461 {
462 SWR_ASSERT(pDC != nullptr);
463 SWR_ASSERT(pDC->pState != nullptr);
464
465 return pDC->pState->state;
466 }
467
468 INLINE void* GetPrivateState(const DRAW_CONTEXT* pDC)
469 {
470 SWR_ASSERT(pDC != nullptr);
471 SWR_ASSERT(pDC->pState != nullptr);
472
473 return pDC->pState->pPrivateState;
474 }
475
476 class HotTileMgr;
477
478 struct SWR_CONTEXT
479 {
480 // Draw Context Ring
481 // Each draw needs its own state in order to support mulitple draws in flight across multiple
482 // threads. We maintain N draw contexts configured as a ring. The size of the ring limits the
483 // maximum number of draws that can be in flight at any given time.
484 //
485 // Description:
486 // 1. State - When an application first sets state we'll request a new draw context to use.
487 // a. If there are no available draw contexts then we'll have to wait until one becomes
488 // free. b. If one is available then set pCurDrawContext to point to it and mark it in use.
489 // c. All state calls set state on pCurDrawContext.
490 // 2. Draw - Creates submits a work item that is associated with current draw context.
491 // a. Set pPrevDrawContext = pCurDrawContext
492 // b. Set pCurDrawContext to NULL.
493 // 3. State - When an applications sets state after draw
494 // a. Same as step 1.
495 // b. State is copied from prev draw context to current.
496 RingBuffer<DRAW_CONTEXT> dcRing;
497
498 DRAW_CONTEXT* pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw.
499 DRAW_CONTEXT* pPrevDrawContext; // This points to DC entry for the previous context submitted
500 // that we can copy state from.
501
502 MacroTileMgr* pMacroTileManagerArray;
503 DispatchQueue* pDispatchQueueArray;
504
505 // Draw State Ring
506 // When draw are very large (lots of primitives) then the API thread will break these up.
507 // These split draws all have identical state. So instead of storing the state directly
508 // in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs
509 // to reference a single entry in the DS ring.
510 RingBuffer<DRAW_STATE> dsRing;
511
512 uint32_t curStateId; // Current index to the next available entry in the DS ring.
513
514 uint32_t NumWorkerThreads;
515 uint32_t NumFEThreads;
516 uint32_t NumBEThreads;
517
518 THREAD_POOL threadPool; // Thread pool associated with this context
519 SWR_THREADING_INFO threadInfo;
520 SWR_API_THREADING_INFO apiThreadInfo;
521 SWR_WORKER_PRIVATE_STATE workerPrivateState;
522
523 uint32_t MAX_DRAWS_IN_FLIGHT;
524
525 std::condition_variable FifosNotEmpty;
526 std::mutex WaitLock;
527
528 uint32_t privateStateSize;
529
530 HotTileMgr* pHotTileMgr;
531
532 // Callback functions, passed in at create context time
533 PFN_LOAD_TILE pfnLoadTile;
534 PFN_STORE_TILE pfnStoreTile;
535 PFN_TRANSLATE_GFXPTR_FOR_READ pfnTranslateGfxptrForRead;
536 PFN_TRANSLATE_GFXPTR_FOR_WRITE pfnTranslateGfxptrForWrite;
537 PFN_MAKE_GFXPTR pfnMakeGfxPtr;
538 PFN_CREATE_MEMORY_CONTEXT pfnCreateMemoryContext;
539 PFN_DESTROY_MEMORY_CONTEXT pfnDestroyMemoryContext;
540 PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset;
541 PFN_UPDATE_STATS pfnUpdateStats;
542 PFN_UPDATE_STATS_FE pfnUpdateStatsFE;
543
544
545 // Global Stats
546 SWR_STATS* pStats;
547
548 // Scratch space for workers.
549 uint8_t** ppScratch;
550
551 volatile OSALIGNLINE(uint32_t) drawsOutstandingFE;
552
553 OSALIGNLINE(CachingAllocator) cachingArenaAllocator;
554 uint32_t frameCount;
555
556 uint32_t lastFrameChecked;
557 uint64_t lastDrawChecked;
558 TileSet* pSingleThreadLockedTiles;
559
560 // ArchRast thread contexts.
561 HANDLE* pArContext;
562
563 // handle to external memory for worker datas to create memory contexts
564 HANDLE hExternalMemory;
565
566 BucketManager *pBucketMgr;
567 };
568
// Add `count` to backend statistic `name` in the calling worker's SWR_STATS
// slot, but only when backend stats are enabled in the API state.
// Expects `pDC` and `workerId` to be in scope at the point of use.
// Wrapped in do { } while (0) so the macro behaves as a single statement and
// is safe inside an unbraced if/else; `count` is parenthesized so any
// expression can be passed.
#define UPDATE_STAT_BE(name, count)                         \
    do                                                      \
    {                                                       \
        if (GetApiState(pDC).enableStatsBE)                 \
        {                                                   \
            pDC->dynState.pStats[workerId].name += (count); \
        }                                                   \
    } while (0)

// Add `count` to frontend statistic `name` of the draw context, but only when
// frontend stats are enabled in the API state.
// Expects `pDC` to be in scope at the point of use.
#define UPDATE_STAT_FE(name, count)                 \
    do                                              \
    {                                               \
        if (GetApiState(pDC).enableStatsFE)         \
        {                                           \
            pDC->dynState.statsFE.name += (count);  \
        }                                           \
    } while (0)
579
// ArchRast instrumentation framework
// Both macros expect `pDC` in scope; AR_WORKER_CTX additionally expects
// `workerId`. The API-thread context is the array entry at index
// NumWorkerThreads. Both the array and the index are reached through
// pDC->pContext so the macro does not depend on a separate `pContext` local.
#define AR_WORKER_CTX pDC->pContext->pArContext[workerId]
#define AR_API_CTX pDC->pContext->pArContext[pDC->pContext->NumWorkerThreads]
583
// RDTSC bucket timing wrappers. With KNOB_ENABLE_RDTSC defined they forward
// to RDTSC_START / RDTSC_STOP (the drawid argument is not forwarded, and
// RDTSC_STOP receives a trailing 0); otherwise they expand to nothing and
// their arguments are never evaluated.
#if defined(KNOB_ENABLE_RDTSC)
#define RDTSC_BEGIN(pBucketMgr, type, drawid) RDTSC_START(pBucketMgr, type)
#define RDTSC_END(pBucketMgr, type, count) RDTSC_STOP(pBucketMgr, type, count, 0)
#else
#define RDTSC_BEGIN(pBucketMgr, type, drawid)
#define RDTSC_END(pBucketMgr, type, count)
#endif
591
// ArchRast dispatch helpers. With KNOB_ENABLE_AR defined they forward to
// ArchRast::Dispatch / ArchRast::FlushDraw (the event name is looked up
// inside the ArchRast namespace); otherwise they expand to nothing and their
// arguments are never evaluated.
#if defined(KNOB_ENABLE_AR)
#define _AR_EVENT(ctx, event) ArchRast::Dispatch(ctx, ArchRast::event)
#define _AR_FLUSH(ctx, id) ArchRast::FlushDraw(ctx, id)
#else
#define _AR_EVENT(ctx, event)
#define _AR_FLUSH(ctx, id)
#endif
599
// API-thread variant: dispatches the event on the AR_API_CTX context.
#define AR_API_EVENT(event) _AR_EVENT(AR_API_CTX, event)

// Worker-thread variants: dispatch / flush on the AR_WORKER_CTX context.
#define AR_EVENT(event) _AR_EVENT(AR_WORKER_CTX, event)
#define AR_FLUSH(id) _AR_FLUSH(AR_WORKER_CTX, id)