swr/rast: Optimize late/bindless JIT of samplers
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / context.h
1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file context.h
24 *
25 * @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT
26 * The SWR_CONTEXT is our global context and contains the DC ring,
27 * thread state, etc.
28 *
29 * The DRAW_CONTEXT contains all state associated with a draw operation.
30 *
31 ******************************************************************************/
32 #pragma once
33
34 #include <condition_variable>
35 #include <algorithm>
36
37 #include "core/api.h"
38 #include "core/utils.h"
39 #include "core/arena.h"
40 #include "core/fifo.hpp"
41 #include "core/knobs.h"
42 #include "common/intrin.h"
43 #include "core/threads.h"
44 #include "ringbuffer.h"
45 #include "archrast/archrast.h"
46
47 // x.8 fixed point precision values (8 fractional bits; SCALE == 1 << SHIFT)
48 #define FIXED_POINT_SHIFT 8
49 #define FIXED_POINT_SCALE 256
50
51 // x.16 fixed point precision values (16 fractional bits; SCALE == 1 << SHIFT)
52 #define FIXED_POINT16_SHIFT 16
53 #define FIXED_POINT16_SCALE 65536
54
55 struct SWR_CONTEXT;
56 struct DRAW_CONTEXT;
57
58 struct TRI_FLAGS // Per-triangle flag bits and indices; embedded as triFlags in SWR_TRIANGLE_DESC and TRIANGLE_WORK_DESC.
59 {
60 uint32_t frontFacing : 1;
61 uint32_t yMajor : 1;
62 uint32_t coverageMask : (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); // width is SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM bits; presumably one bit per SIMD-tile pixel - confirm
63 uint32_t reserved : 32 - 1 - 1 - (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); // pads the three bitfields above to exactly 32 bits
64 float pointSize;
65 uint32_t renderTargetArrayIndex;
66 uint32_t viewportIndex;
67 };
68
69 //////////////////////////////////////////////////////////////////////////
70 /// SWR_TRIANGLE_DESC - per-triangle data handed by reference to the backend function (see PFN_BACKEND_FUNC).
71 /////////////////////////////////////////////////////////////////////////
72 struct SWR_TRIANGLE_DESC
73 {
74 float I[3]; // three-element coefficient sets; presumably the plane-equation a/b/c terms mirrored in BarycentricCoeffs - confirm
75 float J[3];
76 float Z[3];
77 float OneOverW[3];
78 float recipDet;
79
80 float *pRecipW; // the pointers below are stored as-is; this struct shows no ownership of the memory they reference
81 float *pAttribs;
82 float *pPerspAttribs;
83 float *pSamplePos;
84 float *pUserClipBuffer;
85
86 uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES]; // one 64-bit mask per multisample slot
87 uint64_t innerCoverageMask; // Conservative rasterization inner coverage: marked covered if entire pixel is covered
88 uint64_t anyCoveredSamples;
89
90 TRI_FLAGS triFlags;
91 };
92
93 struct TRIANGLE_WORK_DESC // Triangle work payload; the `tri` member of the BE_WORK desc union.
94 {
95 float *pTriBuffer;
96 float *pAttribs;
97 float *pUserClipBuffer;
98 uint32_t numAttribs;
99 TRI_FLAGS triFlags;
100 };
101
102 struct CLEAR_DESC // Clear parameters; the `clear` member of the BE_WORK and FE_WORK desc unions.
103 {
104 SWR_RECT rect;
105 uint32_t attachmentMask;
106 uint32_t renderTargetArrayIndex;
107 float clearRTColor[4]; // RGBA_32F
108 float clearDepth; // [0..1]
109 uint8_t clearStencil;
110 };
111
112 struct DISCARD_INVALIDATE_TILES_DESC // Discard/invalidate parameters; the `discardInvalidateTiles` member of the BE_WORK and FE_WORK desc unions.
113 {
114 uint32_t attachmentMask;
115 SWR_RECT rect;
116 SWR_TILE_STATE newTileState;
117 bool createNewTiles;
118 bool fullTilesOnly;
119 };
120
121 struct SYNC_DESC // Callback plus three 64-bit user values; used as the `sync` work payload and as DRAW_CONTEXT::retireCallback.
122 {
123 PFN_CALLBACK_FUNC pfnCallbackFunc;
124 uint64_t userData;
125 uint64_t userData2;
126 uint64_t userData3;
127 };
128
129 struct STORE_TILES_DESC // Store-tiles parameters; the `storeTiles` member of the BE_WORK and FE_WORK desc unions.
130 {
131 uint32_t attachmentMask;
132 SWR_TILE_STATE postStoreTileState;
133 SWR_RECT rect;
134 };
135
136 struct COMPUTE_DESC // Thread group counts along X, Y and Z.
137 {
138 uint32_t threadGroupCountX;
139 uint32_t threadGroupCountY;
140 uint32_t threadGroupCountZ;
141 };
142
143 typedef void(*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc); // Work callback type stored in BE_WORK::pfnWork.
144
145 enum WORK_TYPE // Tag stored in BE_WORK::type and FE_WORK::type.
146 {
147 SYNC,
148 DRAW,
149 CLEAR,
150 DISCARDINVALIDATETILES,
151 STORETILES,
152 SHUTDOWN,
153 };
154
155 OSALIGNSIMD(struct) BE_WORK // Work item: a type tag, a PFN_WORK_FUNC callback and a union of per-type descriptors.
156 {
157 WORK_TYPE type;
158 PFN_WORK_FUNC pfnWork;
159 union
160 {
161 SYNC_DESC sync;
162 TRIANGLE_WORK_DESC tri;
163 CLEAR_DESC clear;
164 DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
165 STORE_TILES_DESC storeTiles;
166 } desc; // presumably the active member is selected by `type` - confirm against the code that fills this in
167 };
168
169 struct DRAW_WORK // Draw parameters; the `draw` member of the FE_WORK desc union.
170 {
171 DRAW_CONTEXT* pDC;
172 union // indexed and non-indexed draws share this storage
173 {
174 uint32_t numIndices; // DrawIndexed: Number of indices for draw.
175 uint32_t numVerts; // Draw: Number of verts (triangles, lines, etc)
176 };
177 union // indexed and non-indexed draws share this storage
178 {
179 gfxptr_t xpIB; // DrawIndexed: App supplied int32 indices
180 uint32_t startVertex; // Draw: Starting vertex in VB to render from.
181 };
182 int32_t baseVertex;
183 uint32_t numInstances; // Number of instances
184 uint32_t startInstance; // Instance offset
185 uint32_t startPrimID; // starting primitiveID for this draw batch
186 uint32_t startVertexID; // starting VertexID for this draw batch (only needed for non-indexed draws)
187 SWR_FORMAT type; // index buffer type
188 };
189
190 typedef void(*PFN_FE_WORK_FUNC)(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pDesc); // Work callback type stored in FE_WORK::pfnWork.
191 struct FE_WORK // Work item: a type tag, a PFN_FE_WORK_FUNC callback and a union of per-type descriptors; held by value as DRAW_CONTEXT::FeWork.
192 {
193 WORK_TYPE type;
194 PFN_FE_WORK_FUNC pfnWork;
195 union
196 {
197 SYNC_DESC sync;
198 DRAW_WORK draw;
199 CLEAR_DESC clear;
200 DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
201 STORE_TILES_DESC storeTiles;
202 } desc; // presumably the active member is selected by `type` - confirm against the code that fills this in
203 };
204
205 struct GUARDBANDS // Four float arrays, each with KNOB_NUM_VIEWPORTS_SCISSORS entries; held as API_STATE::gbState.
206 {
207 float left[KNOB_NUM_VIEWPORTS_SCISSORS];
208 float right[KNOB_NUM_VIEWPORTS_SCISSORS];
209 float top[KNOB_NUM_VIEWPORTS_SCISSORS];
210 float bottom[KNOB_NUM_VIEWPORTS_SCISSORS];
211 };
212
213 struct PA_STATE;
214
215 // function signature for pipeline stages that execute after primitive assembly (type of DRAW_STATE::pfnProcessPrims)
216 typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[],
217 uint32_t primMask, simdscalari const &primID, simdscalari const &viewportIdx, simdscalari const &rtIdx);
218
219 #if ENABLE_AVX512_SIMD16
220 // function signature for pipeline stages that execute after primitive assembly (simd16 variant, type of DRAW_STATE::pfnProcessPrims_simd16)
221 typedef void(SIMDCALL *PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[],
222 uint32_t primMask, simd16scalari const &primID, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
223 // NOTE(review): this typedef is guarded by ENABLE_AVX512_SIMD16, while its use in DRAW_STATE is guarded by USE_SIMD16_FRONTEND - confirm the two are always enabled together.
224 #endif
225 OSALIGNLINE(struct) API_STATE // Aggregated pipeline state; held by value as DRAW_STATE::state and returned by GetApiState().
226 {
227 // Vertex Buffers
228 SWR_VERTEX_BUFFER_STATE vertexBuffers[KNOB_NUM_STREAMS];
229
230 // GS - Geometry Shader State
231 SWR_GS_STATE gsState;
232 PFN_GS_FUNC pfnGsFunc;
233
234 // FS - Fetch Shader State
235 PFN_FETCH_FUNC pfnFetchFunc;
236
237 // VS - Vertex Shader State
238 PFN_VERTEX_FUNC pfnVertexFunc;
239
240 // Index Buffer
241 SWR_INDEX_BUFFER_STATE indexBuffer;
242
243 // CS - Compute Shader
244 PFN_CS_FUNC pfnCsFunc;
245 uint32_t totalThreadsInGroup;
246 uint32_t totalSpillFillSize;
247 uint32_t scratchSpaceSize;
248 uint32_t scratchSpaceNumInstances;
249
250 // FE - Frontend State
251 SWR_FRONTEND_STATE frontendState;
252
253 // SOS - Streamout Shader State
254 PFN_SO_FUNC pfnSoFunc[MAX_SO_STREAMS];
255
256 // Streamout state
257 SWR_STREAMOUT_STATE soState;
258 mutable SWR_STREAMOUT_BUFFER soBuffer[MAX_SO_STREAMS]; // mutable: writable even through the const API_STATE& returned by GetApiState()
259
260 // Tessellation State
261 PFN_HS_FUNC pfnHsFunc;
262 PFN_DS_FUNC pfnDsFunc;
263 SWR_TS_STATE tsState;
264
265 // Number of attributes used by the frontend (vs, so, gs)
266 uint32_t feNumAttributes;
267
268
269 // RS - Rasterizer State
270 SWR_RASTSTATE rastState;
271 // floating point multisample offsets (two floats per sample slot)
272 float samplePos[SWR_MAX_NUM_MULTISAMPLES * 2];
273
274 GUARDBANDS gbState;
275
276 SWR_VIEWPORT vp[KNOB_NUM_VIEWPORTS_SCISSORS];
277 SWR_VIEWPORT_MATRICES vpMatrices;
278
279 SWR_RECT scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS];
280 SWR_RECT scissorsInFixedPoint[KNOB_NUM_VIEWPORTS_SCISSORS];
281 bool scissorsTileAligned;
282
283 bool forceFront;
284 PRIMITIVE_TOPOLOGY topology;
285
286
287 // Backend state
288 OSALIGNLINE(SWR_BACKEND_STATE) backendState;
289
290 SWR_DEPTH_BOUNDS_STATE depthBoundsState;
291
292 // PS - Pixel shader state
293 SWR_PS_STATE psState;
294
295 SWR_DEPTH_STENCIL_STATE depthStencilState;
296
297 // OM - Output Merger State
298 SWR_BLEND_STATE blendState;
299 PFN_BLEND_JIT_FUNC pfnBlendFunc[SWR_NUM_RENDERTARGETS];
300
301 struct // anonymous: members are accessed directly on API_STATE (e.g. enableStatsBE in UPDATE_STAT_BE)
302 {
303 uint32_t enableStatsFE : 1; // Enable frontend pipeline stats
304 uint32_t enableStatsBE : 1; // Enable backend pipeline stats
305 uint32_t colorHottileEnable : 8; // Bitmask of enabled color hottiles
306 uint32_t depthHottileEnable: 1; // Enable depth buffer hottile
307 uint32_t stencilHottileEnable : 1; // Enable stencil buffer hottile
308 };
309
310 PFN_QUANTIZE_DEPTH pfnQuantizeDepth;
311 };
312
313 class MacroTileMgr;
314 class DispatchQueue;
315
316 struct RenderOutputBuffers // Byte pointers to color (one per render target), depth and stencil output; passed by reference to PFN_BACKEND_FUNC.
317 {
318 uint8_t* pColor[SWR_NUM_RENDERTARGETS];
319 uint8_t* pDepth;
320 uint8_t* pStencil;
321 };
322
323 // Plane equation A/B/C coeffs used to evaluate I/J barycentric coords.
323 // Passed by const reference to the PFN_CALC_*_BARYCENTRICS functions; every member is a simdscalar.
324 struct BarycentricCoeffs
325 {
326 simdscalar vIa;
327 simdscalar vIb;
328 simdscalar vIc;
329
330 simdscalar vJa;
331 simdscalar vJb;
332 simdscalar vJc;
333
334 simdscalar vZa;
335 simdscalar vZb;
336 simdscalar vZc;
337
338 simdscalar vRecipDet;
339
340 simdscalar vAOneOverW;
341 simdscalar vBOneOverW;
342 simdscalar vCOneOverW;
343 };
344
345 // pipeline function pointer types (PFN_BACKEND_FUNC is the type of BACKEND_FUNCS::pfnBackend)
346 typedef void(*PFN_BACKEND_FUNC)(DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&);
347 typedef void(*PFN_OUTPUT_MERGER)(SWR_PS_CONTEXT &, uint8_t* (&)[SWR_NUM_RENDERTARGETS], uint32_t, const SWR_BLEND_STATE*,
348 const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS], simdscalar&, simdscalar const &);
349 typedef void(*PFN_CALC_PIXEL_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &);
350 typedef void(*PFN_CALC_SAMPLE_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
351 typedef void(*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &, const uint64_t *const, const uint32_t,
352 simdscalar const &, simdscalar const &);
353
354 struct BACKEND_FUNCS // Wrapper around the backend function pointer; held as DRAW_STATE::backendFuncs.
355 {
356 PFN_BACKEND_FUNC pfnBackend;
357 };
358
359 // Draw State: API state plus pipeline function pointers; referenced through DRAW_CONTEXT::pState and stored in SWR_CONTEXT::dsRing.
360 struct DRAW_STATE
361 {
362 API_STATE state;
363
364 void* pPrivateState; // The driver is required to set this up for each draw.
365
366 // pipeline function pointers, filled in by API thread when setting up the draw
367 BACKEND_FUNCS backendFuncs;
368 PFN_PROCESS_PRIMS pfnProcessPrims;
369 #if USE_SIMD16_FRONTEND
370 PFN_PROCESS_PRIMS_SIMD16 pfnProcessPrims_simd16;
371 #endif
372
373 CachingArena* pArena; // This should only be used by API thread.
374 };
375
376 struct DRAW_DYNAMIC_STATE // Per-draw mutable state (stream-out offsets and stats); held by value as DRAW_CONTEXT::dynState.
377 {
378 void Reset(uint32_t numThreads) // Zero this struct (keeping the pStats pointer) and zero numThreads SWR_STATS entries at pStats.
379 {
380 SWR_STATS* pSavePtr = pStats; // saved because the memset below also wipes the pointer member
381 memset(this, 0, sizeof(*this));
382 pStats = pSavePtr;
383 memset(pStats, 0, sizeof(SWR_STATS) * numThreads); // assumes pStats is non-null and holds at least numThreads entries - TODO confirm at the allocation site
384 }
385 ///@todo Currently assumes only a single FE can do stream output for a draw.
386 uint32_t SoWriteOffset[4];
387 bool SoWriteOffsetDirty[4];
388
389 SWR_STATS_FE statsFE; // Only one FE thread per DC.
390 SWR_STATS* pStats; // indexed by workerId in UPDATE_STAT_BE
391 };
392
393 // Draw Context
394 // The api thread sets up a draw context that exists for the life of the draw.
395 // This draw context maintains all of the state needed for the draw operation.
396 struct DRAW_CONTEXT
397 {
398 SWR_CONTEXT* pContext;
399 union // a DC carries either a tile manager or a dispatch queue in this slot
400 {
401 MacroTileMgr* pTileMgr;
402 DispatchQueue* pDispatch; // Queue for thread groups. (isCompute)
403 };
404 DRAW_STATE* pState; // Read-only state. Core should not update this outside of API thread.
405 CachingArena* pArena;
406
407 uint32_t drawId;
408 bool dependentFE; // Frontend work is dependent on all previous FE
409 bool dependent; // Backend work is dependent on all previous BE
410 bool isCompute; // Is this DC a compute context?
411 bool cleanupState; // True if this is the last draw using an entry in the state ring.
412
413 FE_WORK FeWork;
414
415 SYNC_DESC retireCallback; // Call this func when this DC is retired.
416
417 DRAW_DYNAMIC_STATE dynState;
418
419 volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw?
420 volatile OSALIGNLINE(uint32_t) FeLock;
421 volatile OSALIGNLINE(uint32_t) threadsDone;
422 };
423
424 static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT"); // sizeof(DRAW_CONTEXT) must be a multiple of 64 bytes
425
426 INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC) // Returns a const reference to pDC->pState->state.
427 {
428 SWR_ASSERT(pDC != nullptr);
429 SWR_ASSERT(pDC->pState != nullptr);
430 // Both pointers are dereferenced below, hence the SWR_ASSERT checks above.
431 return pDC->pState->state;
432 }
433
434 INLINE void* GetPrivateState(const DRAW_CONTEXT* pDC) // Returns pDC->pState->pPrivateState (the pointer itself, not a copy of the data).
435 {
436 SWR_ASSERT(pDC != nullptr);
437 SWR_ASSERT(pDC->pState != nullptr);
438 // Both pointers are dereferenced below, hence the SWR_ASSERT checks above.
439 return pDC->pState->pPrivateState;
440 }
441
442 class HotTileMgr;
443
444 struct SWR_CONTEXT // Global context: draw-context and draw-state rings, thread pool and info, driver callbacks, stats and scratch pointers.
445 {
446 // Draw Context Ring
447 // Each draw needs its own state in order to support multiple draws in flight across multiple threads.
448 // We maintain N draw contexts configured as a ring. The size of the ring limits the maximum number
449 // of draws that can be in flight at any given time.
450 //
451 // Description:
452 // 1. State - When an application first sets state we'll request a new draw context to use.
453 // a. If there are no available draw contexts then we'll have to wait until one becomes free.
454 // b. If one is available then set pCurDrawContext to point to it and mark it in use.
455 // c. All state calls set state on pCurDrawContext.
456 // 2. Draw - Creates and submits a work item that is associated with the current draw context.
457 // a. Set pPrevDrawContext = pCurDrawContext
458 // b. Set pCurDrawContext to NULL.
459 // 3. State - When an application sets state after a draw
460 // a. Same as step 1.
461 // b. State is copied from prev draw context to current.
462 RingBuffer<DRAW_CONTEXT> dcRing;
463
464 DRAW_CONTEXT *pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw.
465 DRAW_CONTEXT *pPrevDrawContext; // This points to DC entry for the previous context submitted that we can copy state from.
466
467 MacroTileMgr* pMacroTileManagerArray;
468 DispatchQueue* pDispatchQueueArray;
469
470 // Draw State Ring
471 // When draws are very large (lots of primitives) then the API thread will break these up.
472 // These split draws all have identical state. So instead of storing the state directly
473 // in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs
474 // to reference a single entry in the DS ring.
475 RingBuffer<DRAW_STATE> dsRing;
476
477 uint32_t curStateId; // Current index to the next available entry in the DS ring.
478
479 uint32_t NumWorkerThreads;
480 uint32_t NumFEThreads;
481 uint32_t NumBEThreads;
482
483 THREAD_POOL threadPool; // Thread pool associated with this context
484 SWR_THREADING_INFO threadInfo;
485 SWR_API_THREADING_INFO apiThreadInfo;
486 SWR_WORKER_PRIVATE_STATE workerPrivateState;
487
488 uint32_t MAX_DRAWS_IN_FLIGHT;
489
490 std::condition_variable FifosNotEmpty;
491 std::mutex WaitLock;
492
493 uint32_t privateStateSize;
494
495 HotTileMgr *pHotTileMgr;
496
497 // Callback functions, passed in at create context time
498 PFN_LOAD_TILE pfnLoadTile;
499 PFN_STORE_TILE pfnStoreTile;
500 PFN_CLEAR_TILE pfnClearTile;
501 PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset;
502 PFN_UPDATE_STATS pfnUpdateStats;
503 PFN_UPDATE_STATS_FE pfnUpdateStatsFE;
504
505
506 // Global Stats
507 SWR_STATS* pStats;
508
509 // Scratch space for workers.
510 uint8_t** ppScratch;
511
512 volatile OSALIGNLINE(uint32_t) drawsOutstandingFE;
513
514 OSALIGNLINE(CachingAllocator) cachingArenaAllocator;
515 uint32_t frameCount;
516
517 uint32_t lastFrameChecked;
518 uint64_t lastDrawChecked;
519 TileSet singleThreadLockedTiles;
520
521 // ArchRast thread contexts.
521 // Indexed by workerId (AR_WORKER_CTX) and by NumWorkerThreads (AR_API_CTX).
522 HANDLE* pArContext;
523 };
524
525 #define UPDATE_STAT_BE(name, count) if (GetApiState(pDC).enableStatsBE) { pDC->dynState.pStats[workerId].name += count; } // Adds count to the per-worker backend stat; needs `pDC` and `workerId` in scope. NOTE(review): expands to a bare if-statement (no do/while(0) wrapper), so avoid using it directly before an `else`.
526 #define UPDATE_STAT_FE(name, count) if (GetApiState(pDC).enableStatsFE) { pDC->dynState.statsFE.name += count; } // Adds count to the frontend stat; needs `pDC` in scope. Same bare-if caveat as UPDATE_STAT_BE.
527
528 // ArchRast instrumentation framework: selects an entry of SWR_CONTEXT::pArContext for the current thread.
529 #define AR_WORKER_CTX pDC->pContext->pArContext[workerId] // needs `pDC` and `workerId` in scope
530 #define AR_API_CTX pDC->pContext->pArContext[pContext->NumWorkerThreads] // NOTE(review): references both `pDC` and `pContext`, so both must be in scope at the use site; presumably the slot after the per-worker entries - confirm
531
532 #ifdef KNOB_ENABLE_RDTSC // timing enabled: forward to RDTSC_START / RDTSC_STOP
533 #define RDTSC_BEGIN(type, drawid) RDTSC_START(type) // the second argument is accepted but unused
534 #define RDTSC_END(type, count) RDTSC_STOP(type, count, 0)
535 #else // timing disabled: both macros expand to nothing
536 #define RDTSC_BEGIN(type, count)
537 #define RDTSC_END(type, count)
538 #endif
539
540 #ifdef KNOB_ENABLE_AR // ArchRast enabled: forward to ArchRast::Dispatch / ArchRast::FlushDraw
541 #define _AR_EVENT(ctx, event) ArchRast::Dispatch(ctx, ArchRast::event) // `event` is qualified with ArchRast:: by the macro
542 #define _AR_FLUSH(ctx, id) ArchRast::FlushDraw(ctx, id)
543 #else // ArchRast disabled: both macros expand to nothing
544 #define _AR_EVENT(ctx, event)
545 #define _AR_FLUSH(ctx, id)
546 #endif
547
548 // Use these macros for api thread (context comes from AR_API_CTX).
549 #define AR_API_EVENT(event) _AR_EVENT(AR_API_CTX, event)
550
551 // Use these macros for worker threads (context comes from AR_WORKER_CTX).
552 #define AR_EVENT(event) _AR_EVENT(AR_WORKER_CTX, event)
553 #define AR_FLUSH(id) _AR_FLUSH(AR_WORKER_CTX, id)