swr: [rasterizer] add archrast instrumentation
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / context.h
1 /****************************************************************************
2 * Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file context.h
24 *
25 * @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT
26 * The SWR_CONTEXT is our global context and contains the DC ring,
27 * thread state, etc.
28 *
29 * The DRAW_CONTEXT contains all state associated with a draw operation.
30 *
31 ******************************************************************************/
32 #pragma once
33
34 #include <condition_variable>
35 #include <algorithm>
36
37 #include "core/api.h"
38 #include "core/utils.h"
39 #include "core/arena.h"
40 #include "core/fifo.hpp"
41 #include "core/knobs.h"
42 #include "common/simdintrin.h"
43 #include "core/threads.h"
44 #include "ringbuffer.h"
45
// Fixed-point precision constants. For each format, SHIFT is the number of
// fractional bits and SCALE is the equivalent multiplier (1 << SHIFT).

// x.8 fixed point precision values
#define FIXED_POINT_SHIFT 8
#define FIXED_POINT_SCALE (1 << FIXED_POINT_SHIFT)

// x.16 fixed point precision values
#define FIXED_POINT16_SHIFT 16
#define FIXED_POINT16_SCALE (1 << FIXED_POINT16_SHIFT)
53
// Forward declarations: both context types are referenced by pointer in the
// work descriptors and function-pointer types below, before they are defined.
struct DRAW_CONTEXT;
struct SWR_CONTEXT;
56
57 struct TRI_FLAGS
58 {
59 uint32_t frontFacing : 1;
60 uint32_t yMajor : 1;
61 uint32_t coverageMask : (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM);
62 uint32_t reserved : 32 - 1 - 1 - (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM);
63 float pointSize;
64 uint32_t primID;
65 uint32_t renderTargetArrayIndex;
66 uint32_t viewportIndex;
67 };
68
69 //////////////////////////////////////////////////////////////////////////
70 /// SWR_TRIANGLE_DESC
71 /////////////////////////////////////////////////////////////////////////
72 struct SWR_TRIANGLE_DESC
73 {
74 float I[3];
75 float J[3];
76 float Z[3];
77 float OneOverW[3];
78 float recipDet;
79
80 float *pRecipW;
81 float *pAttribs;
82 float *pPerspAttribs;
83 float *pSamplePos;
84 float *pUserClipBuffer;
85
86 uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES];
87 uint64_t innerCoverageMask; // Conservative rasterization inner coverage: marked covered if entire pixel is covered
88 uint64_t anyCoveredSamples;
89
90 TRI_FLAGS triFlags;
91 };
92
93 struct TRIANGLE_WORK_DESC
94 {
95 float *pTriBuffer;
96 float *pAttribs;
97 float *pUserClipBuffer;
98 uint32_t numAttribs;
99 TRI_FLAGS triFlags;
100 };
101
// Clear flags. The same storage can be viewed either as a 3-bit `mask`
// bit-field or as the raw 32-bit word `bits`.
union CLEAR_FLAGS
{
    struct
    {
        uint32_t mask : 3;
    };

    uint32_t bits;
};
110
111 struct CLEAR_DESC
112 {
113 SWR_RECT rect;
114 CLEAR_FLAGS flags;
115 float clearRTColor[4]; // RGBA_32F
116 float clearDepth; // [0..1]
117 uint8_t clearStencil;
118 };
119
120 struct DISCARD_INVALIDATE_TILES_DESC
121 {
122 uint32_t attachmentMask;
123 SWR_RECT rect;
124 SWR_TILE_STATE newTileState;
125 bool createNewTiles;
126 bool fullTilesOnly;
127 };
128
129 struct SYNC_DESC
130 {
131 PFN_CALLBACK_FUNC pfnCallbackFunc;
132 uint64_t userData;
133 uint64_t userData2;
134 uint64_t userData3;
135 };
136
137 struct STORE_TILES_DESC
138 {
139 SWR_RENDERTARGET_ATTACHMENT attachment;
140 SWR_TILE_STATE postStoreTileState;
141 SWR_RECT rect;
142 };
143
// Compute dispatch dimensions: number of thread groups along each axis.
struct COMPUTE_DESC
{
    uint32_t threadGroupCountX; // thread groups in X
    uint32_t threadGroupCountY; // thread groups in Y
    uint32_t threadGroupCountZ; // thread groups in Z
};
150
151 typedef void(*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc);
152
// Kind of work carried by a BE_WORK / FE_WORK item. Values are spelled out
// explicitly; they match the implicit numbering (0..4).
enum WORK_TYPE
{
    SYNC                   = 0,
    DRAW                   = 1,
    CLEAR                  = 2,
    DISCARDINVALIDATETILES = 3,
    STORETILES             = 4,
};
161
162 struct BE_WORK
163 {
164 WORK_TYPE type;
165 PFN_WORK_FUNC pfnWork;
166 union
167 {
168 SYNC_DESC sync;
169 TRIANGLE_WORK_DESC tri;
170 CLEAR_DESC clear;
171 DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
172 STORE_TILES_DESC storeTiles;
173 } desc;
174 };
175
176 struct DRAW_WORK
177 {
178 DRAW_CONTEXT* pDC;
179 union
180 {
181 uint32_t numIndices; // DrawIndexed: Number of indices for draw.
182 uint32_t numVerts; // Draw: Number of verts (triangles, lines, etc)
183 };
184 union
185 {
186 const int32_t* pIB; // DrawIndexed: App supplied indices
187 uint32_t startVertex; // Draw: Starting vertex in VB to render from.
188 };
189 int32_t baseVertex;
190 uint32_t numInstances; // Number of instances
191 uint32_t startInstance; // Instance offset
192 uint32_t startPrimID; // starting primitiveID for this draw batch
193 uint32_t startVertexID; // starting VertexID for this draw batch (only needed for non-indexed draws)
194 SWR_FORMAT type; // index buffer type
195 };
196
197 typedef void(*PFN_FE_WORK_FUNC)(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pDesc);
198 struct FE_WORK
199 {
200 WORK_TYPE type;
201 PFN_FE_WORK_FUNC pfnWork;
202 union
203 {
204 SYNC_DESC sync;
205 DRAW_WORK draw;
206 CLEAR_DESC clear;
207 DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
208 STORE_TILES_DESC storeTiles;
209 } desc;
210 };
211
212 struct GUARDBANDS
213 {
214 float left[KNOB_NUM_VIEWPORTS_SCISSORS];
215 float right[KNOB_NUM_VIEWPORTS_SCISSORS];
216 float top[KNOB_NUM_VIEWPORTS_SCISSORS];
217 float bottom[KNOB_NUM_VIEWPORTS_SCISSORS];
218 };
219
220 struct PA_STATE;
221
222 // function signature for pipeline stages that execute after primitive assembly
223 typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[],
224 uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
225
226 OSALIGNLINE(struct) API_STATE
227 {
228 // Vertex Buffers
229 SWR_VERTEX_BUFFER_STATE vertexBuffers[KNOB_NUM_STREAMS];
230
231 // Index Buffer
232 SWR_INDEX_BUFFER_STATE indexBuffer;
233
234 // FS - Fetch Shader State
235 PFN_FETCH_FUNC pfnFetchFunc;
236
237 // VS - Vertex Shader State
238 PFN_VERTEX_FUNC pfnVertexFunc;
239
240 // GS - Geometry Shader State
241 PFN_GS_FUNC pfnGsFunc;
242 SWR_GS_STATE gsState;
243
244 // CS - Compute Shader
245 PFN_CS_FUNC pfnCsFunc;
246 uint32_t totalThreadsInGroup;
247 uint32_t totalSpillFillSize;
248
249 // FE - Frontend State
250 SWR_FRONTEND_STATE frontendState;
251
252 // SOS - Streamout Shader State
253 PFN_SO_FUNC pfnSoFunc[MAX_SO_STREAMS];
254
255 // Streamout state
256 SWR_STREAMOUT_STATE soState;
257 mutable SWR_STREAMOUT_BUFFER soBuffer[MAX_SO_STREAMS];
258
259 // Tessellation State
260 PFN_HS_FUNC pfnHsFunc;
261 PFN_DS_FUNC pfnDsFunc;
262 SWR_TS_STATE tsState;
263
264 // Number of attributes used by the frontend (vs, so, gs)
265 uint32_t feNumAttributes;
266
267 PRIMITIVE_TOPOLOGY topology;
268 bool forceFront;
269
270 // RS - Rasterizer State
271 SWR_RASTSTATE rastState;
272 // floating point multisample offsets
273 float samplePos[SWR_MAX_NUM_MULTISAMPLES * 2];
274
275 GUARDBANDS gbState;
276
277 SWR_VIEWPORT vp[KNOB_NUM_VIEWPORTS_SCISSORS];
278 SWR_VIEWPORT_MATRICES vpMatrices;
279
280 SWR_RECT scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS];
281 SWR_RECT scissorsInFixedPoint[KNOB_NUM_VIEWPORTS_SCISSORS];
282 bool scissorsTileAligned;
283
284 // Backend state
285 SWR_BACKEND_STATE backendState;
286
287 // PS - Pixel shader state
288 SWR_PS_STATE psState;
289
290 SWR_DEPTH_STENCIL_STATE depthStencilState;
291
292 // OM - Output Merger State
293 SWR_BLEND_STATE blendState;
294 PFN_BLEND_JIT_FUNC pfnBlendFunc[SWR_NUM_RENDERTARGETS];
295
296 // Stats are incremented when this is true.
297 bool enableStats;
298
299 struct
300 {
301 uint32_t colorHottileEnable : 8;
302 uint32_t depthHottileEnable: 1;
303 uint32_t stencilHottileEnable : 1;
304 };
305
306 PFN_QUANTIZE_DEPTH pfnQuantizeDepth;
307 };
308
// Forward declarations; this header only holds pointers to these classes.
class DispatchQueue;
class MacroTileMgr;
311
312 struct RenderOutputBuffers
313 {
314 uint8_t* pColor[SWR_NUM_RENDERTARGETS];
315 uint8_t* pDepth;
316 uint8_t* pStencil;
317 };
318
319 // Plane equation A/B/C coeffs used to evaluate I/J barycentric coords
320 struct BarycentricCoeffs
321 {
322 simdscalar vIa;
323 simdscalar vIb;
324 simdscalar vIc;
325
326 simdscalar vJa;
327 simdscalar vJb;
328 simdscalar vJc;
329
330 simdscalar vZa;
331 simdscalar vZb;
332 simdscalar vZc;
333
334 simdscalar vRecipDet;
335
336 simdscalar vAOneOverW;
337 simdscalar vBOneOverW;
338 simdscalar vCOneOverW;
339 };
340
341 // pipeline function pointer types
342 typedef void(*PFN_BACKEND_FUNC)(DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&);
343 typedef void(*PFN_OUTPUT_MERGER)(SWR_PS_CONTEXT &, uint8_t* (&)[SWR_NUM_RENDERTARGETS], uint32_t, const SWR_BLEND_STATE*,
344 const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS], simdscalar&, simdscalar);
345 typedef void(*PFN_CALC_PIXEL_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &);
346 typedef void(*PFN_CALC_SAMPLE_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
347 typedef void(*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &, const uint64_t *const, const uint32_t,
348 const simdscalar, const simdscalar);
349
350 struct BACKEND_FUNCS
351 {
352 PFN_BACKEND_FUNC pfnBackend;
353 };
354
355 // Draw State
356 struct DRAW_STATE
357 {
358 API_STATE state;
359
360 void* pPrivateState; // Its required the driver sets this up for each draw.
361
362 // pipeline function pointers, filled in by API thread when setting up the draw
363 BACKEND_FUNCS backendFuncs;
364 PFN_PROCESS_PRIMS pfnProcessPrims;
365
366 CachingArena* pArena; // This should only be used by API thread.
367 };
368
369 struct DRAW_DYNAMIC_STATE
370 {
371 void Reset(uint32_t numThreads)
372 {
373 SWR_STATS* pSavePtr = pStats;
374 memset(this, 0, sizeof(*this));
375 pStats = pSavePtr;
376 memset(pStats, 0, sizeof(SWR_STATS) * numThreads);
377 }
378 ///@todo Currently assumes only a single FE can do stream output for a draw.
379 uint32_t SoWriteOffset[4];
380 bool SoWriteOffsetDirty[4];
381
382 SWR_STATS_FE statsFE; // Only one FE thread per DC.
383 SWR_STATS* pStats;
384 };
385
386 // Draw Context
387 // The api thread sets up a draw context that exists for the life of the draw.
388 // This draw context maintains all of the state needed for the draw operation.
389 struct DRAW_CONTEXT
390 {
391 SWR_CONTEXT* pContext;
392 union
393 {
394 MacroTileMgr* pTileMgr;
395 DispatchQueue* pDispatch; // Queue for thread groups. (isCompute)
396 };
397 DRAW_STATE* pState; // Read-only state. Core should not update this outside of API thread.
398 DRAW_DYNAMIC_STATE dynState;
399
400 CachingArena* pArena;
401
402 uint32_t drawId;
403 bool dependent;
404 bool isCompute; // Is this DC a compute context?
405 bool cleanupState; // True if this is the last draw using an entry in the state ring.
406 volatile bool doneFE; // Is FE work done for this draw?
407
408 FE_WORK FeWork;
409
410 volatile OSALIGNLINE(uint32_t) FeLock;
411 volatile int32_t threadsDone;
412
413 SYNC_DESC retireCallback; // Call this func when this DC is retired.
414 };
415
416 static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT");
417
418 INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC)
419 {
420 SWR_ASSERT(pDC != nullptr);
421 SWR_ASSERT(pDC->pState != nullptr);
422
423 return pDC->pState->state;
424 }
425
426 INLINE void* GetPrivateState(const DRAW_CONTEXT* pDC)
427 {
428 SWR_ASSERT(pDC != nullptr);
429 SWR_ASSERT(pDC->pState != nullptr);
430
431 return pDC->pState->pPrivateState;
432 }
433
434 class HotTileMgr;
435
436 struct SWR_CONTEXT
437 {
438 // Draw Context Ring
439 // Each draw needs its own state in order to support mulitple draws in flight across multiple threads.
440 // We maintain N draw contexts configured as a ring. The size of the ring limits the maximum number
441 // of draws that can be in flight at any given time.
442 //
443 // Description:
444 // 1. State - When an application first sets state we'll request a new draw context to use.
445 // a. If there are no available draw contexts then we'll have to wait until one becomes free.
446 // b. If one is available then set pCurDrawContext to point to it and mark it in use.
447 // c. All state calls set state on pCurDrawContext.
448 // 2. Draw - Creates submits a work item that is associated with current draw context.
449 // a. Set pPrevDrawContext = pCurDrawContext
450 // b. Set pCurDrawContext to NULL.
451 // 3. State - When an applications sets state after draw
452 // a. Same as step 1.
453 // b. State is copied from prev draw context to current.
454 RingBuffer<DRAW_CONTEXT> dcRing;
455
456 DRAW_CONTEXT *pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw.
457 DRAW_CONTEXT *pPrevDrawContext; // This points to DC entry for the previous context submitted that we can copy state from.
458
459 MacroTileMgr* pMacroTileManagerArray;
460 DispatchQueue* pDispatchQueueArray;
461
462 // Draw State Ring
463 // When draw are very large (lots of primitives) then the API thread will break these up.
464 // These split draws all have identical state. So instead of storing the state directly
465 // in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs
466 // to reference a single entry in the DS ring.
467 RingBuffer<DRAW_STATE> dsRing;
468
469 uint32_t curStateId; // Current index to the next available entry in the DS ring.
470
471 uint32_t NumWorkerThreads;
472 uint32_t NumFEThreads;
473 uint32_t NumBEThreads;
474
475 THREAD_POOL threadPool; // Thread pool associated with this context
476 SWR_THREADING_INFO threadInfo;
477
478 std::condition_variable FifosNotEmpty;
479 std::mutex WaitLock;
480
481 DRIVER_TYPE driverType;
482
483 uint32_t privateStateSize;
484
485 HotTileMgr *pHotTileMgr;
486
487 // Callback functions, passed in at create context time
488 PFN_LOAD_TILE pfnLoadTile;
489 PFN_STORE_TILE pfnStoreTile;
490 PFN_CLEAR_TILE pfnClearTile;
491 PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset;
492 PFN_UPDATE_STATS pfnUpdateStats;
493 PFN_UPDATE_STATS_FE pfnUpdateStatsFE;
494
495 // Global Stats
496 SWR_STATS* pStats;
497
498 // Scratch space for workers.
499 uint8_t** ppScratch;
500
501 volatile int32_t drawsOutstandingFE;
502
503 CachingAllocator cachingArenaAllocator;
504 uint32_t frameCount;
505
506 uint32_t lastFrameChecked;
507 uint64_t lastDrawChecked;
508 TileSet singleThreadLockedTiles;
509
510 // ArchRast thread contexts.
511 HANDLE* pArContext;
512 };
513
// Statistics helpers. Both add `count` to the counter `name` only when
// enableStats is set in the API state of the current draw.
//
// Requirements at the point of use:
//   UPDATE_STAT    - `pDC` (DRAW_CONTEXT*) and `workerId` must be in scope;
//                    updates the per-worker entry pDC->dynState.pStats[workerId].
//   UPDATE_STAT_FE - `pDC` must be in scope; updates pDC->dynState.statsFE.
//
// The bodies are wrapped in do { } while (0) so each macro expands to a single
// statement: `if (c) UPDATE_STAT(x, 1); else ...` parses as intended and the
// macro's internal `if` can never capture a following `else`. `count` is
// parenthesized so any expression can be passed safely.
#define UPDATE_STAT(name, count) \
    do { if (GetApiState(pDC).enableStats) { pDC->dynState.pStats[workerId].name += (count); } } while (0)
#define UPDATE_STAT_FE(name, count) \
    do { if (GetApiState(pDC).enableStats) { pDC->dynState.statsFE.name += (count); } } while (0)
516
// ArchRast instrumentation framework
//
// AR_BEGIN / AR_END / AR_EVENT dispatch an event to an ArchRast context when
// KNOB_ENABLE_AR is defined and expand to nothing otherwise, so their
// arguments are not evaluated in disabled builds.
//
// AR_WORKER_CTX and AR_API_CTX select the context to pass as `ctx`. They are
// only defined when ArchRast is enabled, so use them only as arguments to the
// AR_* macros. Both need just `pDC` in scope (AR_WORKER_CTX also needs
// `workerId`): the API-thread context lives at index NumWorkerThreads, one
// past the per-worker entries.
#ifdef KNOB_ENABLE_AR
#define AR_WORKER_CTX pDC->pContext->pArContext[workerId]
#define AR_API_CTX pDC->pContext->pArContext[pDC->pContext->NumWorkerThreads]

#define AR_BEGIN(ctx, type, id) ArchRast::dispatch(ctx, ArchRast::Start(ArchRast::type, id))
#define AR_END(ctx, type, count) ArchRast::dispatch(ctx, ArchRast::End(ArchRast::type, count))
#define AR_EVENT(ctx, event) ArchRast::dispatch(ctx, ArchRast::event)
#else
#define AR_BEGIN(ctx, type, id)
#define AR_END(ctx, type, count)
#define AR_EVENT(ctx, event)
#endif