#define _simd_round_ps _mm256_round_ps
#define _simd_castpd_ps _mm256_castpd_ps
#define _simd_broadcast_ps(a) _mm256_broadcast_ps((const __m128*)(a))
+#define _simd_stream_ps _mm256_stream_ps
#define _simd_load_sd _mm256_load_sd
#define _simd_movemask_pd _mm256_movemask_pd
mHead = 0;
mTail = 0;
mBlocks.clear();
- T* pNewBlock = (T*)arena.Alloc(sizeof(T)*mBlockSize);
+ T* pNewBlock = (T*)arena.AllocAligned(sizeof(T)*mBlockSize, KNOB_SIMD_WIDTH*4);
mBlocks.push_back(pNewBlock);
mCurBlock = pNewBlock;
mCurBlockIdx = 0;
-
mNumEntries = 0;
- _ReadWriteBarrier();
mLock = 0;
}
template <typename ArenaT>
bool enqueue_try_nosync(ArenaT& arena, const T* entry)
{
- memcpy(&mCurBlock[mTail], entry, sizeof(T));
+ const float* pSrc = (const float*)entry;
+ float* pDst = (float*)&mCurBlock[mTail];
+
+ auto lambda = [&](int32_t i)
+ {
+ __m256 vSrc = _simd_load_ps(pSrc + i*KNOB_SIMD_WIDTH);
+ _simd_stream_ps(pDst + i*KNOB_SIMD_WIDTH, vSrc);
+ };
+
+ const uint32_t numSimdLines = sizeof(T) / (KNOB_SIMD_WIDTH*4);
+ static_assert(numSimdLines * KNOB_SIMD_WIDTH * 4 == sizeof(T),
+ "FIFO element size should be multiple of SIMD width.");
+
+ UnrollerL<0, numSimdLines, 1>::step(lambda);
mTail ++;
if (mTail == mBlockSize)
}
else
{
- T* newBlock = (T*)arena.Alloc(sizeof(T)*mBlockSize);
+ T* newBlock = (T*)arena.AllocAligned(sizeof(T)*mBlockSize, KNOB_SIMD_WIDTH*4);
SWR_ASSERT(newBlock);
mBlocks.push_back(newBlock);
/// @brief Called when FE work is complete for this DC.
INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
{
- _ReadWriteBarrier();
-
if (pContext->pfnUpdateStatsFE && GetApiState(pDC).enableStats)
{
pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &pDC->dynState.statsFE);
}
}
+ // Ensure all streaming writes are globally visible before marking this FE done
+ _mm_mfence();
pDC->doneFE = true;
-
InterlockedDecrement((volatile LONG*)&pContext->drawsOutstandingFE);
}
queue.dispatch(pDC, workerId, threadGroupId, pSpillFillBuffer);
queue.finishedWork();
}
+
+ // Ensure all streaming writes are globally visible before moving onto the next draw
+ _mm_mfence();
}
}
}