swr: [rasterizer core] align Macrotile FIFO memory to SIMD size
authorTim Rowley <timothy.o.rowley@intel.com>
Tue, 4 Oct 2016 17:59:30 +0000 (12:59 -0500)
committerTim Rowley <timothy.o.rowley@intel.com>
Tue, 11 Oct 2016 16:22:04 +0000 (11:22 -0500)
Align and use streaming store instructions for BE fifo queues.
Provides slightly faster enqueue and doesn't pollute the caches.
Add appropriate memory fences to ensure streaming writes are
globally visible.

Signed-off-by: Tim Rowley <timothy.o.rowley@intel.com>
src/gallium/drivers/swr/rasterizer/common/simdintrin.h
src/gallium/drivers/swr/rasterizer/core/context.h
src/gallium/drivers/swr/rasterizer/core/fifo.hpp
src/gallium/drivers/swr/rasterizer/core/threads.cpp

index 4a91e95f7ff54ea6beb7b7db48a55a97347c9a4a..3ad37de3d49c58e7c307de464dfcc8323206600f 100644 (file)
@@ -114,6 +114,7 @@ OSALIGNSIMD(union) simdvector
 #define _simd_round_ps _mm256_round_ps
 #define _simd_castpd_ps _mm256_castpd_ps
 #define _simd_broadcast_ps(a) _mm256_broadcast_ps((const __m128*)(a))
+#define _simd_stream_ps _mm256_stream_ps
 
 #define _simd_load_sd _mm256_load_sd
 #define _simd_movemask_pd _mm256_movemask_pd
index dfcc1c0d39a164f9a2634ec4709b67c96778f83e..7a817ef0dd8ef899b194b091e4d1a43791aefce5 100644 (file)
@@ -161,7 +161,7 @@ enum WORK_TYPE
     SHUTDOWN,
 };
 
-struct BE_WORK
+OSALIGNSIMD(struct) BE_WORK
 {
     WORK_TYPE type;
     PFN_WORK_FUNC pfnWork;
index ccf0b70544f0d429f1037840e1b559ab07a6f4fc..7e07e6aeb2c7038f6d37961a41b1b89a77b1c5bb 100644 (file)
@@ -55,13 +55,11 @@ struct QUEUE
         mHead = 0;
         mTail = 0;
         mBlocks.clear();
-        T* pNewBlock = (T*)arena.Alloc(sizeof(T)*mBlockSize);
+        T* pNewBlock = (T*)arena.AllocAligned(sizeof(T)*mBlockSize, KNOB_SIMD_WIDTH*4);
         mBlocks.push_back(pNewBlock);
         mCurBlock = pNewBlock;
         mCurBlockIdx = 0;
-
         mNumEntries = 0;
-        _ReadWriteBarrier();
         mLock = 0;
     }
 
@@ -106,7 +104,20 @@ struct QUEUE
     template <typename ArenaT>
     bool enqueue_try_nosync(ArenaT& arena, const T* entry)
     {
-        memcpy(&mCurBlock[mTail], entry, sizeof(T));
+        const float* pSrc = (const float*)entry;
+        float* pDst = (float*)&mCurBlock[mTail];
+
+        auto lambda = [&](int32_t i)
+        {
+            __m256 vSrc = _simd_load_ps(pSrc + i*KNOB_SIMD_WIDTH);
+            _simd_stream_ps(pDst + i*KNOB_SIMD_WIDTH, vSrc);
+        };
+            
+        const uint32_t numSimdLines = sizeof(T) / (KNOB_SIMD_WIDTH*4);
+        static_assert(numSimdLines * KNOB_SIMD_WIDTH * 4 == sizeof(T),
+            "FIFO element size should be multiple of SIMD width.");
+
+        UnrollerL<0, numSimdLines, 1>::step(lambda);
 
         mTail ++;
         if (mTail == mBlockSize)
@@ -117,7 +128,7 @@ struct QUEUE
             }
             else
             {
-                T* newBlock = (T*)arena.Alloc(sizeof(T)*mBlockSize);
+                T* newBlock = (T*)arena.AllocAligned(sizeof(T)*mBlockSize, KNOB_SIMD_WIDTH*4);
                 SWR_ASSERT(newBlock);
 
                 mBlocks.push_back(newBlock);
index 28cd9298967dabc7defba0b2c5b82bd61871bab0..08a4d17821c22d6016ccc1b2051eb4448ab2eb36 100644 (file)
@@ -565,8 +565,6 @@ bool WorkOnFifoBE(
 /// @brief Called when FE work is complete for this DC.
 INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
 {
-    _ReadWriteBarrier();
-
     if (pContext->pfnUpdateStatsFE && GetApiState(pDC).enableStats)
     {
         pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &pDC->dynState.statsFE);
@@ -584,8 +582,9 @@ INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
         }
     }
 
+    // Ensure all streaming writes are globally visible before marking this FE done
+    _mm_mfence();
     pDC->doneFE = true;
-
     InterlockedDecrement((volatile LONG*)&pContext->drawsOutstandingFE);
 }
 
@@ -673,6 +672,9 @@ void WorkOnCompute(
                 queue.dispatch(pDC, workerId, threadGroupId, pSpillFillBuffer);
                 queue.finishedWork();
             }
+
+            // Ensure all streaming writes are globally visible before moving onto the next draw
+            _mm_mfence();
         }
     }
 }