1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Implementation for Macro Tile Manager which provides the facilities
26 * for threads to work on a macro tile.
28 ******************************************************************************/
29 #include <unordered_map>
32 #include "core/tilemgr.h"
33 #include "core/multisample.h"
34 #include "rdtsc_core.h"
// Packs macro tile coordinates into a single id: x is shifted into the bits
// above bit 15 and y is OR-ed into the low bits. y is not masked, so callers
// are expected to keep it below 2^16.
// Both arguments are parenthesized so that compound argument expressions
// (for example a conditional) expand with the intended precedence.
#define TILE_ID(x,y) (((x) << 16) | (y))
38 // override new/delete for alignment
39 void *MacroTileMgr::operator new(size_t size
)
41 return _aligned_malloc(size
, 64);
44 void MacroTileMgr::operator delete(void *p
)
49 void* DispatchQueue::operator new(size_t size
)
51 return _aligned_malloc(size
, 64);
54 void DispatchQueue::operator delete(void *p
)
59 MacroTileMgr::MacroTileMgr(Arena
& arena
) : mArena(arena
)
// Resets the manager-wide produced / consumed work-item counters to zero.
// NOTE(review): only the two counter resets are visible here; whether the
// mDirtyTiles list (appended to by enqueue) is also reset in this function
// should be confirmed against the complete file.
63 void MacroTileMgr::initialize()
65 mWorkItemsProduced
= 0;
66 mWorkItemsConsumed
= 0;
// Queues a back-end work item on the macro tile at tile coordinates (x, y).
// @param x      macro tile column; asserted to be below KNOB_NUM_HOT_TILES_X.
// @param y      macro tile row; asserted to be below KNOB_NUM_HOT_TILES_Y.
// @param pWork  work item handed to the tile queue together with mArena.
71 void MacroTileMgr::enqueue(uint32_t x
, uint32_t y
, BE_WORK
*pWork
)
73 // Should not enqueue more than what we have backing for in the hot tile manager.
74 SWR_ASSERT(x
< KNOB_NUM_HOT_TILES_X
);
75 SWR_ASSERT(y
< KNOB_NUM_HOT_TILES_Y
);
// Pack (x, y) into the key used to index mTiles.
77 uint32_t id
= TILE_ID(x
, y
);
// NOTE(review): mTiles is indexed with operator[]; presumably this creates the
// queue entry on first use - confirm against the container type of mTiles.
79 MacroTileQueue
&tile
= mTiles
[id
];
// A tile whose front-end count is exactly 1 is recorded in the dirty list.
// NOTE(review): no update of tile.mWorkItemsFE or mWorkItemsProduced is
// visible in this function - confirm where those counters are incremented.
82 if (tile
.mWorkItemsFE
== 1)
85 mDirtyTiles
.push_back(id
);
// Hand the work item to the tile's queue.
89 tile
.enqueue_try_nosync(mArena
, pWork
);
// Marks all work queued on macro tile `id` as consumed: the tile's front-end
// count is added to mWorkItemsConsumed and both per-tile counters are reset.
// @param id  key into mTiles; asserted to already be present.
92 void MacroTileMgr::markTileComplete(uint32_t id
)
94 SWR_ASSERT(mTiles
.find(id
) != mTiles
.end());
95 MacroTileQueue
&tile
= mTiles
[id
];
// Despite its name, numTiles holds the tile's mWorkItemsFE count.
96 uint32_t numTiles
= tile
.mWorkItemsFE
;
// Fold the tile's count into the manager-wide consumed counter.
97 InterlockedExchangeAdd(&mWorkItemsConsumed
, numTiles
);
// NOTE(review): the per-tile counters below are updated with plain
// (non-interlocked) operations - presumably only one thread completes a given
// tile at a time; confirm against the callers.
100 tile
.mWorkItemsBE
+= numTiles
;
101 SWR_ASSERT(tile
.mWorkItemsFE
== tile
.mWorkItemsBE
);
103 // clear out tile, but defer fifo clear until the next DC first queues to it.
104 // this prevents worker threads from constantly locking a completed macro tile
105 tile
.mWorkItemsFE
= 0;
106 tile
.mWorkItemsBE
= 0;
// Returns the hot tile for `attachment` inside the macro tile identified by
// `macroID`, allocating, growing or re-targeting its buffer along the way.
// @param pContext    supplies the pfnStoreTile / pfnLoadTile callbacks.
// @param pDC         passed to GetPrivateState() for those callbacks.
// @param macroID     decoded into tile indices by MacroTileMgr::getTileIndices.
// @param attachment  index into HotTileSet::Attachment and mHotTileSize.
// @param create      NOTE(review): not referenced in the lines visible here -
//                    confirm how it gates the allocation below.
// @param numSamples  sample count; scales the size of the tile buffer.
// @param renderTargetArrayIndex  array slice the returned tile should hold.
// @return pointer to the HOTTILE entry for `attachment` in mHotTiles[x][y].
109 HOTTILE
* HotTileMgr::GetHotTile(SWR_CONTEXT
* pContext
, DRAW_CONTEXT
* pDC
, uint32_t macroID
, SWR_RENDERTARGET_ATTACHMENT attachment
, bool create
, uint32_t numSamples
,
110 uint32_t renderTargetArrayIndex
)
// NOTE(review): the declarations of x and y are not visible in this view.
113 MacroTileMgr::getTileIndices(macroID
, x
, y
);
115 SWR_ASSERT(x
< KNOB_NUM_HOT_TILES_X
);
116 SWR_ASSERT(y
< KNOB_NUM_HOT_TILES_Y
);
118 HotTileSet
&tile
= mHotTiles
[x
][y
];
119 HOTTILE
& hotTile
= tile
.Attachment
[attachment
];
// No backing buffer yet: allocate numSamples * mHotTileSize[attachment] bytes
// aligned to KNOB_SIMD_WIDTH * 4 and mark the tile invalid.
// NOTE(review): the _aligned_malloc result is not checked for NULL.
120 if (hotTile
.pBuffer
== NULL
)
124 uint32_t size
= numSamples
* mHotTileSize
[attachment
];
125 hotTile
.pBuffer
= (BYTE
*)_aligned_malloc(size
, KNOB_SIMD_WIDTH
* 4);
126 hotTile
.state
= HOTTILE_INVALID
;
127 hotTile
.numSamples
= numSamples
;
128 hotTile
.renderTargetArrayIndex
= renderTargetArrayIndex
;
137 // free the old tile and create a new one with enough space to hold all samples
138 if (numSamples
> hotTile
.numSamples
)
140 // tile should be either uninitialized or resolved if we're deleting and switching to a new sample count
142 SWR_ASSERT((hotTile
.state
== HOTTILE_INVALID
) ||
143 (hotTile
.state
== HOTTILE_RESOLVED
) ||
144 (hotTile
.state
== HOTTILE_CLEAR
));
145 _aligned_free(hotTile
.pBuffer
);
147 uint32_t size
= numSamples
* mHotTileSize
[attachment
];
148 hotTile
.pBuffer
= (BYTE
*)_aligned_malloc(size
, KNOB_SIMD_WIDTH
* 4);
149 hotTile
.state
= HOTTILE_INVALID
;
150 hotTile
.numSamples
= numSamples
;
153 // if requested render target array index isn't currently loaded, need to store out the current hottile
154 // and load the requested array slice
155 if (renderTargetArrayIndex
!= hotTile
.renderTargetArrayIndex
)
// Select the hot tile format that matches the attachment class.
// NOTE(review): the declaration of `format` and the enclosing switch
// statement are not visible in this view.
160 case SWR_ATTACHMENT_COLOR0
:
161 case SWR_ATTACHMENT_COLOR1
:
162 case SWR_ATTACHMENT_COLOR2
:
163 case SWR_ATTACHMENT_COLOR3
:
164 case SWR_ATTACHMENT_COLOR4
:
165 case SWR_ATTACHMENT_COLOR5
:
166 case SWR_ATTACHMENT_COLOR6
:
167 case SWR_ATTACHMENT_COLOR7
: format
= KNOB_COLOR_HOT_TILE_FORMAT
; break;
168 case SWR_ATTACHMENT_DEPTH
: format
= KNOB_DEPTH_HOT_TILE_FORMAT
; break;
169 case SWR_ATTACHMENT_STENCIL
: format
= KNOB_STENCIL_HOT_TILE_FORMAT
; break;
170 default: SWR_ASSERT(false, "Unknown attachment: %d", attachment
); format
= KNOB_COLOR_HOT_TILE_FORMAT
; break;
// A dirty tile is written back under its current array index before the
// buffer is reused for the requested slice.
173 if (hotTile
.state
== HOTTILE_DIRTY
)
175 pContext
->pfnStoreTile(GetPrivateState(pDC
), format
, attachment
,
176 x
* KNOB_MACROTILE_X_DIM
, y
* KNOB_MACROTILE_Y_DIM
, hotTile
.renderTargetArrayIndex
, hotTile
.pBuffer
);
// Load the requested slice into the buffer and record it as the current one.
179 pContext
->pfnLoadTile(GetPrivateState(pDC
), format
, attachment
,
180 x
* KNOB_MACROTILE_X_DIM
, y
* KNOB_MACROTILE_Y_DIM
, renderTargetArrayIndex
, hotTile
.pBuffer
);
182 hotTile
.renderTargetArrayIndex
= renderTargetArrayIndex
;
183 hotTile
.state
= HOTTILE_DIRTY
;
186 return &tile
.Attachment
[attachment
];
189 void HotTileMgr::ClearColorHotTile(const HOTTILE
* pHotTile
) // clear a macro tile from float4 clear data.
191 // Load clear color into SIMD register...
192 float *pClearData
= (float*)(pHotTile
->clearData
);
193 simdscalar valR
= _simd_broadcast_ss(&pClearData
[0]);
194 simdscalar valG
= _simd_broadcast_ss(&pClearData
[1]);
195 simdscalar valB
= _simd_broadcast_ss(&pClearData
[2]);
196 simdscalar valA
= _simd_broadcast_ss(&pClearData
[3]);
198 float *pfBuf
= (float*)pHotTile
->pBuffer
;
199 uint32_t numSamples
= pHotTile
->numSamples
;
201 for (uint32_t row
= 0; row
< KNOB_MACROTILE_Y_DIM
; row
+= KNOB_TILE_Y_DIM
)
203 for (uint32_t col
= 0; col
< KNOB_MACROTILE_X_DIM
; col
+= KNOB_TILE_X_DIM
)
205 for (uint32_t si
= 0; si
< (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* numSamples
); si
+= SIMD_TILE_X_DIM
* SIMD_TILE_Y_DIM
) //SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++)
207 _simd_store_ps(pfBuf
, valR
);
208 pfBuf
+= KNOB_SIMD_WIDTH
;
209 _simd_store_ps(pfBuf
, valG
);
210 pfBuf
+= KNOB_SIMD_WIDTH
;
211 _simd_store_ps(pfBuf
, valB
);
212 pfBuf
+= KNOB_SIMD_WIDTH
;
213 _simd_store_ps(pfBuf
, valA
);
214 pfBuf
+= KNOB_SIMD_WIDTH
;
220 void HotTileMgr::ClearDepthHotTile(const HOTTILE
* pHotTile
) // clear a macro tile from float4 clear data.
222 // Load clear color into SIMD register...
223 float *pClearData
= (float*)(pHotTile
->clearData
);
224 simdscalar valZ
= _simd_broadcast_ss(&pClearData
[0]);
226 float *pfBuf
= (float*)pHotTile
->pBuffer
;
227 uint32_t numSamples
= pHotTile
->numSamples
;
229 for (uint32_t row
= 0; row
< KNOB_MACROTILE_Y_DIM
; row
+= KNOB_TILE_Y_DIM
)
231 for (uint32_t col
= 0; col
< KNOB_MACROTILE_X_DIM
; col
+= KNOB_TILE_X_DIM
)
233 for (uint32_t si
= 0; si
< (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* numSamples
); si
+= SIMD_TILE_X_DIM
* SIMD_TILE_Y_DIM
)
235 _simd_store_ps(pfBuf
, valZ
);
236 pfBuf
+= KNOB_SIMD_WIDTH
;
242 void HotTileMgr::ClearStencilHotTile(const HOTTILE
* pHotTile
)
244 // convert from F32 to U8.
245 uint8_t clearVal
= (uint8_t)(pHotTile
->clearData
[0]);
246 //broadcast 32x into __m256i...
247 simdscalari valS
= _simd_set1_epi8(clearVal
);
249 simdscalari
* pBuf
= (simdscalari
*)pHotTile
->pBuffer
;
250 uint32_t numSamples
= pHotTile
->numSamples
;
252 for (uint32_t row
= 0; row
< KNOB_MACROTILE_Y_DIM
; row
+= KNOB_TILE_Y_DIM
)
254 for (uint32_t col
= 0; col
< KNOB_MACROTILE_X_DIM
; col
+= KNOB_TILE_X_DIM
)
256 // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly.
257 for (uint32_t si
= 0; si
< (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* numSamples
); si
+= SIMD_TILE_X_DIM
* SIMD_TILE_Y_DIM
* 4)
259 _simd_store_si(pBuf
, valS
);
266 //////////////////////////////////////////////////////////////////////////
267 /// @brief InitializeHotTiles
268 /// for draw calls, we initialize the active hot tiles and perform deferred
269 /// load on them if tile is in invalid state. we do this in the outer thread
270 /// loop instead of inside the draw routine itself mainly for performance,
271 /// to avoid unnecessary setup every triangle
272 /// @todo support deferred clear
273 /// @param pCreateInfo - pointer to creation info.
274 void HotTileMgr::InitializeHotTiles(SWR_CONTEXT
* pContext
, DRAW_CONTEXT
* pDC
, uint32_t macroID
)
276 const API_STATE
& state
= GetApiState(pDC
);
277 HotTileMgr
*pHotTileMgr
= pContext
->pHotTileMgr
;
280 MacroTileMgr::getTileIndices(macroID
, x
, y
);
281 x
*= KNOB_MACROTILE_X_DIM
;
282 y
*= KNOB_MACROTILE_Y_DIM
;
284 uint32_t numSamples
= GetNumSamples(state
.rastState
.sampleCount
);
286 // check RT if enabled
287 unsigned long rtSlot
= 0;
288 uint32_t colorHottileEnableMask
= state
.colorHottileEnable
;
289 while (_BitScanForward(&rtSlot
, colorHottileEnableMask
))
291 HOTTILE
* pHotTile
= GetHotTile(pContext
, pDC
, macroID
, (SWR_RENDERTARGET_ATTACHMENT
)(SWR_ATTACHMENT_COLOR0
+ rtSlot
), true, numSamples
);
293 if (pHotTile
->state
== HOTTILE_INVALID
)
295 RDTSC_START(BELoadTiles
);
296 // invalid hottile before draw requires a load from surface before we can draw to it
297 pContext
->pfnLoadTile(GetPrivateState(pDC
), KNOB_COLOR_HOT_TILE_FORMAT
, (SWR_RENDERTARGET_ATTACHMENT
)(SWR_ATTACHMENT_COLOR0
+ rtSlot
), x
, y
, pHotTile
->renderTargetArrayIndex
, pHotTile
->pBuffer
);
298 pHotTile
->state
= HOTTILE_DIRTY
;
299 RDTSC_STOP(BELoadTiles
, 0, 0);
301 else if (pHotTile
->state
== HOTTILE_CLEAR
)
303 RDTSC_START(BELoadTiles
);
305 ClearColorHotTile(pHotTile
);
306 pHotTile
->state
= HOTTILE_DIRTY
;
307 RDTSC_STOP(BELoadTiles
, 0, 0);
309 colorHottileEnableMask
&= ~(1 << rtSlot
);
312 // check depth if enabled
313 if (state
.depthHottileEnable
)
315 HOTTILE
* pHotTile
= GetHotTile(pContext
, pDC
, macroID
, SWR_ATTACHMENT_DEPTH
, true, numSamples
);
316 if (pHotTile
->state
== HOTTILE_INVALID
)
318 RDTSC_START(BELoadTiles
);
319 // invalid hottile before draw requires a load from surface before we can draw to it
320 pContext
->pfnLoadTile(GetPrivateState(pDC
), KNOB_DEPTH_HOT_TILE_FORMAT
, SWR_ATTACHMENT_DEPTH
, x
, y
, pHotTile
->renderTargetArrayIndex
, pHotTile
->pBuffer
);
321 pHotTile
->state
= HOTTILE_DIRTY
;
322 RDTSC_STOP(BELoadTiles
, 0, 0);
324 else if (pHotTile
->state
== HOTTILE_CLEAR
)
326 RDTSC_START(BELoadTiles
);
328 ClearDepthHotTile(pHotTile
);
329 pHotTile
->state
= HOTTILE_DIRTY
;
330 RDTSC_STOP(BELoadTiles
, 0, 0);
334 // check stencil if enabled
335 if (state
.stencilHottileEnable
)
337 HOTTILE
* pHotTile
= GetHotTile(pContext
, pDC
, macroID
, SWR_ATTACHMENT_STENCIL
, true, numSamples
);
338 if (pHotTile
->state
== HOTTILE_INVALID
)
340 RDTSC_START(BELoadTiles
);
341 // invalid hottile before draw requires a load from surface before we can draw to it
342 pContext
->pfnLoadTile(GetPrivateState(pDC
), KNOB_STENCIL_HOT_TILE_FORMAT
, SWR_ATTACHMENT_STENCIL
, x
, y
, pHotTile
->renderTargetArrayIndex
, pHotTile
->pBuffer
);
343 pHotTile
->state
= HOTTILE_DIRTY
;
344 RDTSC_STOP(BELoadTiles
, 0, 0);
346 else if (pHotTile
->state
== HOTTILE_CLEAR
)
348 RDTSC_START(BELoadTiles
);
350 ClearStencilHotTile(pHotTile
);
351 pHotTile
->state
= HOTTILE_DIRTY
;
352 RDTSC_STOP(BELoadTiles
, 0, 0);