swr/rasterizer: Add memory tracking support
author Jan Zielinski <jan.zielinski@intel.com>
Fri, 26 Jul 2019 07:37:12 +0000 (09:37 +0200)
committer Jan Zielinski <jan.zielinski@intel.com>
Tue, 30 Jul 2019 13:58:36 +0000 (15:58 +0200)
Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
12 files changed:
src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp
src/gallium/drivers/swr/rasterizer/archrast/events.proto
src/gallium/drivers/swr/rasterizer/archrast/events_private.proto
src/gallium/drivers/swr/rasterizer/core/api.cpp
src/gallium/drivers/swr/rasterizer/core/api.h
src/gallium/drivers/swr/rasterizer/core/frontend.cpp
src/gallium/drivers/swr/rasterizer/core/state.h
src/gallium/drivers/swr/rasterizer/core/threads.cpp
src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp
src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h
src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp

index a454fc127011b233b39d340e3a8fea1fa6972453..06e0f616f707e18d8018a9b91134f5c1b9cc54d5 100644 (file)
@@ -26,6 +26,7 @@
  *
  ******************************************************************************/
 #include <atomic>
+#include <map>
 
 #include "common/os.h"
 #include "archrast/archrast.h"
@@ -85,6 +86,74 @@ namespace ArchRast
         uint32_t alphaBlendCount = 0;
     };
 
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Accumulates memory access statistics per masked address range.
+    ///        Accesses whose (address & mask) values are equal share a single
+    ///        entry in trackedMemory.
+    struct MemoryStats
+    {
+        struct MemoryTrackerKey
+        {
+            uint64_t address; // address of the access that created the entry
+            uint64_t mask;    // mask applied to address when keys are ordered
+        };
+
+        struct MemoryTrackerData
+        {
+            uint32_t accessCountRead;  // number of read accesses recorded
+            uint32_t accessCountWrite; // number of write accesses recorded
+            uint64_t tscMin;           // smallest tsc recorded for the entry
+            uint64_t tscMax;           // largest tsc recorded for the entry
+        };
+
+        /// Orders keys by masked address, so two keys whose masked addresses
+        /// are equal are equivalent and map to the same entry.
+        struct AddressRangeComparator
+        {
+            bool operator()(const MemoryTrackerKey& a, const MemoryTrackerKey& b) const
+            {
+                return (a.address & a.mask) < (b.address & b.mask);
+            }
+        };
+
+        typedef std::map<MemoryTrackerKey, MemoryTrackerData, AddressRangeComparator> MemoryTrackerMap;
+        MemoryTrackerMap trackedMemory = {};
+
+        /// @brief Record one access in the entry for its masked address,
+        ///        creating the entry on the first access.
+        /// @param address     - accessed address
+        /// @param addressMask - mask stored in the key and used for ordering
+        /// @param isRead      - nonzero counts as a read, zero as a write
+        /// @param tsc         - tsc value reported with the access
+        void TrackMemoryAccess(uint64_t address, uint64_t addressMask, uint8_t isRead, uint64_t tsc)
+        {
+            MemoryTrackerKey key;
+            key.address = address;
+            key.mask    = addressMask;
+
+            // lower_bound returns the first entry not ordered before key. It is
+            // a match only if key is not ordered before it either; otherwise it
+            // is reused below as the insertion hint.
+            MemoryTrackerMap::iterator i = trackedMemory.lower_bound(key);
+            if (i != trackedMemory.end() && !(trackedMemory.key_comp()(key, i->first)))
+            {
+                // already in map
+                MemoryTrackerData& entry = i->second;
+                if (isRead)
+                {
+                    entry.accessCountRead++;
+                }
+                else
+                {
+                    entry.accessCountWrite++;
+                }
+
+                // Compare before updating so tscMin/tscMax stay the true
+                // extremes even when accesses are not reported in tsc order.
+                if (tsc < entry.tscMin)
+                {
+                    entry.tscMin = tsc;
+                }
+                if (tsc > entry.tscMax)
+                {
+                    entry.tscMax = tsc;
+                }
+            }
+            else
+            {
+                // new entry: the first access sets both ends of the tsc window
+                MemoryTrackerData data;
+                data.accessCountRead  = isRead ? 1 : 0;
+                data.accessCountWrite = isRead ? 0 : 1;
+                data.tscMin           = tsc;
+                data.tscMax           = tsc;
+                trackedMemory.insert(i, MemoryTrackerMap::value_type(key, data));
+            }
+        }
+    };
+
     //////////////////////////////////////////////////////////////////////////
     /// @brief Event handler that handles API thread events. This is shared
     ///        between the API and its caller (e.g. driver shim) but typically
@@ -180,6 +249,16 @@ namespace ArchRast
         EventHandlerWorkerStats(uint32_t id) : EventHandlerFile(id), mNeedFlush(false)
         {
             memset(mShaderStats, 0, sizeof(mShaderStats));
+
+            // compute address mask for memory tracking
+            mAddressMask = 0;
+            uint64_t addressRangeBytes = 64;
+            while (addressRangeBytes > 0)
+            {
+                mAddressMask = (mAddressMask << 1) | 1;
+                addressRangeBytes = addressRangeBytes >> 1;
+            }
+            mAddressMask = ~mAddressMask;
         }
 
         virtual void Handle(const EarlyDepthStencilInfoSingleSample& event)
@@ -585,6 +664,28 @@ namespace ArchRast
             mGS      = {};
         }
 
+        virtual void Handle(const MemoryAccessEvent& event)
+        {
+            // Fold the access into the running per-range statistics, using the
+            // address mask computed in the constructor.
+            const auto& access = event.data;
+            mMemoryStats.TrackMemoryAccess(access.ptr, mAddressMask, access.isRead, access.tsc);
+        }
+
+        virtual void Handle(const MemoryStatsEndEvent& event)
+        {
+            // Emit one MemoryStatsEvent per tracked entry, tagged with the draw id
+            // carried by the end event, then start over with an empty map.
+            for (const auto& tracked : mMemoryStats.trackedMemory)
+            {
+                const MemoryStats::MemoryTrackerData& stats    = tracked.second;
+                const uint64_t                        baseAddr = tracked.first.address & mAddressMask;
+
+                MemoryStatsEvent mse(event.data.drawId,
+                                     baseAddr,
+                                     stats.accessCountRead,
+                                     stats.accessCountWrite,
+                                     stats.tscMin,
+                                     stats.tscMax);
+                EventHandlerFile::Handle(mse);
+            }
+            mMemoryStats.trackedMemory.clear();
+        }
+
         virtual void Handle(const GSPrimInfo& event)
         {
             mGS.inputPrimCount += event.data.inputPrimCount;
@@ -631,6 +732,9 @@ namespace ArchRast
 
         SWR_SHADER_STATS mShaderStats[NUM_SHADER_TYPES];
 
+        MemoryStats      mMemoryStats     = {};
+        uint64_t         mAddressMask     = 0;
+
     };
 
     static EventManager* FromHandle(HANDLE hThreadContext)
index 1057a9437bea7047f2336f578ee4c2a6797dd3f7..1618e5faa4a31a409a1733f349bd6eef644a86f6 100644 (file)
@@ -463,4 +463,23 @@ event SWTagFlushEvent
        uint32_t swTagFlushCounter;
     char swTagFlushReason[256];
     uint32_t swTagFlushType;
-};
\ No newline at end of file
+};
+
+event SWTagApiCallEvent
+{
+       uint64_t swTagFrame;
+       uint32_t swTagDrawOrDispatch;
+       uint32_t swTagDraw;
+       uint32_t swTagDispatch;
+    char swTagApiCall[256];
+};
+
+event MemoryStatsEvent
+{
+    uint32_t drawId;
+    uint64_t baseAddr;
+    uint32_t accessCountRead;
+    uint32_t accessCountWrite;
+    uint64_t tscMin;
+    uint64_t tscMax;
+};
index b49d4bf8de1109d1bb2abe34c241a9dfcc881a23..19fb582a414f7f2f5527afa541146619cce1754e 100644 (file)
@@ -90,6 +90,21 @@ event FrontendDrawEndEvent
     uint32_t drawId;
 };
 
+event MemoryAccessEvent
+{
+    uint32_t drawId;
+    uint64_t tsc;
+    uint64_t ptr;
+    uint32_t size;
+    uint8_t isRead;
+    uint8_t client;
+};
+
+event MemoryStatsEndEvent
+{
+    uint32_t drawId;
+};
+
 event TessPrimCount
 {
     uint64_t primCount;
index a043a341059d480fe6eb0d5d363bc3bacb2aef76..20f1a34588094f386dbce4f232e224b46dfe9178 100644 (file)
@@ -181,7 +181,12 @@ HANDLE SwrCreateContext(SWR_CREATECONTEXT_INFO* pCreateInfo)
 #if defined(KNOB_ENABLE_AR)
         // Initialize worker thread context for ArchRast.
         pContext->pArContext[i] = ArchRast::CreateThreadContext(ArchRast::AR_THREAD::WORKER);
+
+        SWR_WORKER_DATA* pWorkerData = (SWR_WORKER_DATA*)pContext->threadPool.pThreadData[i].pWorkerPrivateData;
+        pWorkerData->hArContext = pContext->pArContext[i];
 #endif
+
+
     }
 
 #if defined(KNOB_ENABLE_AR)
index a3f065da2eac367bbe75c0dd1f06235b402c14d3..4d523bb346d73ea98c692aac8eda5d20d032aedf 100644 (file)
@@ -219,10 +219,17 @@ struct SWR_API_THREADING_INFO
                                    // Independent of KNOB_MAX_THREADS_PER_CORE.
 };
 
+struct SWR_WORKER_DATA
+{
+    HANDLE hArContext;  // handle to the archrast context of this worker; assigned in SwrCreateContext when KNOB_ENABLE_AR is defined
+};
+
 //////////////////////////////////////////////////////////////////////////
 /// SWR_WORKER_PRIVATE_STATE
 /// Data used to allocate per-worker thread private data.  A pointer
 /// to this data will be passed in to each shader function.
+/// The first field of this private data must be SWR_WORKER_DATA, and
+/// perWorkerPrivateStateSize must be >= sizeof(SWR_WORKER_DATA).
 /////////////////////////////////////////////////////////////////////////
 struct SWR_WORKER_PRIVATE_STATE
 {
index 816b84e643e448dad9426f29fd7ed4d2b34253b3..d8703e57ea3f9661663a05ec124766c86e73ea9b 100644 (file)
@@ -520,6 +520,8 @@ static void StreamOut(
 {
     RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEStreamout, pDC->drawId);
 
+    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
+
     const API_STATE&           state   = GetApiState(pDC);
     const SWR_STREAMOUT_STATE& soState = state.soState;
 
@@ -575,7 +577,7 @@ static void StreamOut(
         // Call SOS
         SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr,
                    "Trying to execute uninitialized streamout jit function.");
-        state.pfnSoFunc[streamIndex](GetPrivateState(pDC), soContext);
+        state.pfnSoFunc[streamIndex](GetPrivateState(pDC), pWorkerData, soContext);
     }
 
     // Update SO write offset. The driver provides memory for the update.
index 8b24c43fe80726f7d8703c2a0e61159b7910d464..5202e6146a14d5492fd6d7688a3c4dfaef834adf 100644 (file)
@@ -233,6 +233,7 @@ struct SWR_SHADER_STATS
     uint32_t numLodExecuted;
 };
 
+
 //////////////////////////////////////////////////////////////////////////
 /// SWR_VS_CONTEXT
 /// @brief Input to vertex shader
@@ -905,7 +906,7 @@ typedef void(__cdecl *PFN_HS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateDat
 typedef void(__cdecl *PFN_DS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_DS_CONTEXT* pDsContext);
 typedef void(__cdecl *PFN_GS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_GS_CONTEXT* pGsContext);
 typedef void(__cdecl *PFN_CS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_CS_CONTEXT* pCsContext);
-typedef void(__cdecl *PFN_SO_FUNC)(HANDLE hPrivateData, SWR_STREAMOUT_CONTEXT& soContext);
+typedef void(__cdecl *PFN_SO_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_STREAMOUT_CONTEXT& soContext);
 typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_PS_CONTEXT* pContext);
 typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_PS_CONTEXT* pContext);
 typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(SWR_BLEND_CONTEXT*);
index 59e37a488a4122a7f718ab6af6065ff638d1fff5..3090a2496929ee660e4b9e8327420e93b1866d36 100644 (file)
@@ -458,6 +458,9 @@ INLINE int32_t CompleteDrawContextInl(SWR_CONTEXT* pContext, uint32_t workerId,
     {
         ExecuteCallbacks(pContext, workerId, pDC);
 
+        // Report accumulated memory access stats
+        AR_EVENT(MemoryStatsEndEvent(pDC->drawId));
+
         // Cleanup memory allocations
         pDC->pArena->Reset(true);
         if (!pDC->isCompute)
@@ -1193,26 +1196,31 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
 
     // Allocate worker private data
     pPool->pWorkerPrivateDataArray = nullptr;
-    if (pContext->workerPrivateState.perWorkerPrivateStateSize)
+    if (pContext->workerPrivateState.perWorkerPrivateStateSize == 0)
     {
-        size_t perWorkerSize =
-            AlignUpPow2(pContext->workerPrivateState.perWorkerPrivateStateSize, 64);
-        size_t totalSize = perWorkerSize * pPool->numThreads;
-        if (totalSize)
-        {
-            pPool->pWorkerPrivateDataArray = AlignedMalloc(totalSize, 64);
-            SWR_ASSERT(pPool->pWorkerPrivateDataArray);
+        pContext->workerPrivateState.perWorkerPrivateStateSize = sizeof(SWR_WORKER_DATA);
+        pContext->workerPrivateState.pfnInitWorkerData = nullptr;
+        pContext->workerPrivateState.pfnFinishWorkerData = nullptr;
+    }
+    // initialize contents of SWR_WORKER_DATA
+    size_t perWorkerSize =
+        AlignUpPow2(pContext->workerPrivateState.perWorkerPrivateStateSize, 64);
+    size_t totalSize = perWorkerSize * pPool->numThreads;
+    if (totalSize)
+    {
+        pPool->pWorkerPrivateDataArray = AlignedMalloc(totalSize, 64);
+        SWR_ASSERT(pPool->pWorkerPrivateDataArray);
 
-            void* pWorkerData = pPool->pWorkerPrivateDataArray;
-            for (uint32_t i = 0; i < pPool->numThreads; ++i)
+        void* pWorkerData = pPool->pWorkerPrivateDataArray;
+        for (uint32_t i = 0; i < pPool->numThreads; ++i)
+        {
+            pPool->pThreadData[i].pWorkerPrivateData = pWorkerData;
+            if (pContext->workerPrivateState.pfnInitWorkerData)
             {
-                pPool->pThreadData[i].pWorkerPrivateData = pWorkerData;
-                if (pContext->workerPrivateState.pfnInitWorkerData)
-                {
-                    pContext->workerPrivateState.pfnInitWorkerData(pWorkerData, i);
-                }
-                pWorkerData = PtrAdd(pWorkerData, perWorkerSize);
+                pContext->workerPrivateState.pfnInitWorkerData(pWorkerData, i);
             }
+            pWorkerData = PtrAdd(pWorkerData, perWorkerSize);
         }
     }
 
index adf8924ce430450eb5a90688a472b1eef3504053..21e3d47cf9dac66f7130226e644fa546a877990b 100644 (file)
@@ -42,7 +42,9 @@ namespace SwrJit
         mpTranslationFuncTy             = nullptr;
         mpfnTranslateGfxAddressForRead  = nullptr;
         mpfnTranslateGfxAddressForWrite = nullptr;
+        mpfnTrackMemAccess              = nullptr;
         mpParamSimDC                    = nullptr;
+        mpWorkerData                    = nullptr;
 
     }
 
@@ -167,9 +169,57 @@ namespace SwrJit
         return Ptr;
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Emits a call to the memory tracking function for one access.
+    ///        Compiles to nothing unless KNOB_ENABLE_AR is defined, and emits
+    ///        nothing when KNOB_TRACK_MEMORY_WORKING_SET is off.
+    /// @param Ptr    - accessed address, either a 64-bit integer or another value
+    /// @param Ty     - type used to size the access when Ptr is a 64-bit integer
+    /// @param usage  - memory client, passed through to the tracking function
+    /// @param isRead - true for loads, false for stores
+    void BuilderGfxMem::TrackerHelper(Value* Ptr, Type* Ty, JIT_MEM_CLIENT usage, bool isRead)
+    {
+#if defined(KNOB_ENABLE_AR)
+        if (!KNOB_TRACK_MEMORY_WORKING_SET)
+        {
+            return;
+        }
+
+        // A 64-bit integer Ptr is passed on as-is and sized by Ty; anything else
+        // is sized by Ptr's own type and converted to a 64-bit integer.
+        // NOTE(review): in the second case the size reported is that of Ptr's
+        // type itself, not of the data it points at - confirm this is intended.
+        const bool     ptrIsInt64 = (Ptr->getType() == mInt64Ty);
+        DataLayout     layout(JM()->mpCurrentModule);
+        Type*          pSizedTy    = ptrIsInt64 ? Ty : Ptr->getType();
+        const uint32_t accessBytes = (uint32_t)layout.getTypeAllocSize(pSizedTy);
+        Value*         pAddress    = ptrIsInt64 ? Ptr : PTR_TO_INT(Ptr, mInt64Ty);
+
+        // Some shader compile setups have no tracking function set up; per the
+        // original author these are accesses to internal rasterizer memory, and
+        // they are not logged.
+        // TODO:  we may wish to revisit this for URB reads/writes, though.
+        if (mpfnTrackMemAccess)
+        {
+            SWR_ASSERT(mpWorkerData != nullptr);
+            CALL(mpfnTrackMemAccess,
+                 {mpParamSimDC,
+                  mpWorkerData,
+                  pAddress,
+                  C((uint32_t)accessBytes),
+                  C((uint8_t)isRead),
+                  C((uint32_t)usage)});
+        }
+#endif
+    }
+
     LoadInst* BuilderGfxMem::LOAD(Value* Ptr, const char* Name, Type* Ty, JIT_MEM_CLIENT usage)
     {
         AssertGFXMemoryParams(Ptr, usage);
+        TrackerHelper(Ptr, Ty, usage, true);
 
         Ptr = TranslationHelper(Ptr, Ty);
         return Builder::LOAD(Ptr, Name);
@@ -178,6 +228,7 @@ namespace SwrJit
     LoadInst* BuilderGfxMem::LOAD(Value* Ptr, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage)
     {
         AssertGFXMemoryParams(Ptr, usage);
+        TrackerHelper(Ptr, Ty, usage, true);
 
         Ptr = TranslationHelper(Ptr, Ty);
         return Builder::LOAD(Ptr, Name);
@@ -188,6 +239,7 @@ namespace SwrJit
         Value* Ptr, bool isVolatile, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage)
     {
         AssertGFXMemoryParams(Ptr, usage);
+        TrackerHelper(Ptr, Ty, usage, true);
 
         Ptr = TranslationHelper(Ptr, Ty);
         return Builder::LOAD(Ptr, isVolatile, Name);
@@ -232,6 +284,7 @@ namespace SwrJit
                                          JIT_MEM_CLIENT usage)
     {
         AssertGFXMemoryParams(Ptr, usage);
+        TrackerHelper(Ptr, Ty, usage, true);
 
         Ptr = TranslationHelper(Ptr, Ty);
         return Builder::MASKED_LOAD(Ptr, Align, Mask, PassThru, Name, Ty, usage);
@@ -241,6 +294,7 @@ namespace SwrJit
     BuilderGfxMem::STORE(Value* Val, Value* Ptr, bool isVolatile, Type* Ty, JIT_MEM_CLIENT usage)
     {
         AssertGFXMemoryParams(Ptr, usage);
+        TrackerHelper(Ptr, Ty, usage, false);
 
         Ptr = TranslationHelper(Ptr, Ty);
         return Builder::STORE(Val, Ptr, isVolatile, Ty, usage);
@@ -253,6 +307,7 @@ namespace SwrJit
                                     JIT_MEM_CLIENT                         usage)
     {
         AssertGFXMemoryParams(BasePtr, usage);
+        TrackerHelper(BasePtr, Ty, usage, false);
 
         BasePtr = TranslationHelper(BasePtr, Ty);
         return Builder::STORE(Val, BasePtr, offset, Ty, usage);
@@ -263,6 +318,8 @@ namespace SwrJit
     {
         AssertGFXMemoryParams(Ptr, usage);
 
+        TrackerHelper(Ptr, Ty, usage, false);
+
         Ptr = TranslationHelper(Ptr, Ty);
         return Builder::MASKED_STORE(Val, Ptr, Align, Mask, Ty, usage);
     }
index 1bbe86d16cd2a745592b38cc1ad143cfc60bb43a..52bd3ac226cf46eba1e27d51e7e6cdad1e85a1da 100644 (file)
@@ -110,7 +110,7 @@ namespace SwrJit
                                            Type*          PtrTy = nullptr,
                                            const Twine&   Name  = "",
                                            JIT_MEM_CLIENT usage = JIT_MEM_CLIENT::MEM_CLIENT_INTERNAL);
-
+        
 
     protected:
         void AssertGFXMemoryParams(Value* ptr, Builder::JIT_MEM_CLIENT usage);
@@ -120,6 +120,8 @@ namespace SwrJit
         virtual Value* OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset);
 
         Value* TranslationHelper(Value* Ptr, Type* Ty);
+        void   TrackerHelper(Value* Ptr, Type* Ty, JIT_MEM_CLIENT usage, bool isRead);
+
 
         FunctionType* GetTranslationFunctionType() { return mpTranslationFuncTy; }
         Value*        GetTranslationFunctionForRead() { return mpfnTranslateGfxAddressForRead; }
@@ -127,10 +129,14 @@ namespace SwrJit
         Value*        GetParamSimDC() { return mpParamSimDC; }
 
 
+        Value*        mpWorkerData;
+
     private:
         FunctionType* mpTranslationFuncTy;
         Value*        mpfnTranslateGfxAddressForRead;
         Value*        mpfnTranslateGfxAddressForWrite;
         Value*        mpParamSimDC;
+        FunctionType* mpTrackMemAccessFuncTy;
+        Value*        mpfnTrackMemAccess;
     };
 } // namespace SwrJit
index 5a096533d95b1b07044d85a7c0645d0e835f9956..8601d0529bcecabf2d438b69c32b72197981f9c0 100644 (file)
@@ -113,7 +113,6 @@ struct FetchJit : public BuilderGfxMem
         SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
     void ConvertFormat(SWR_FORMAT format, Value* texels[4]);
 
-    Value* mpWorkerData;
     Value* mpFetchInfo;
 };
 
@@ -141,6 +140,7 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
     mpWorkerData = &*argitr;
     ++argitr;
     mpWorkerData->setName("pWorkerData");
+
     mpFetchInfo = &*argitr;
     ++argitr;
     mpFetchInfo->setName("fetchInfo");
index 43e2c4492010abfc7800b4d1fa3b3158ce79698a..c47acf73228ac7ca85204b45838e236e5ba739da 100644 (file)
@@ -263,12 +263,10 @@ struct StreamOutJit : public BuilderGfxMem
                                  std::ios_base::in | std::ios_base::out | std::ios_base::ate);
         fnName << ComputeCRC(0, &state, sizeof(state));
 
-        Type* typeParam0;
-        typeParam0 = mInt8PtrTy;
-
         std::vector<Type*> args{
-                            typeParam0,
-                            PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT*
+            mInt8PtrTy,
+            mInt8PtrTy,
+            PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT*
         };
 
         FunctionType* fTy    = FunctionType::get(IRB()->getVoidTy(), args, false);
@@ -290,6 +288,10 @@ struct StreamOutJit : public BuilderGfxMem
         privateContext->setName("privateContext");
         SetPrivateContext(privateContext);
 
+        mpWorkerData = &*argitr;
+        ++argitr;
+        mpWorkerData->setName("pWorkerData");
+
         Value* pSoCtx = &*argitr++;
         pSoCtx->setName("pSoCtx");