*
******************************************************************************/
#include <atomic>
+#include <map>
#include "common/os.h"
#include "archrast/archrast.h"
uint32_t alphaBlendCount = 0;
};
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Accumulates memory access statistics per address range. Two
+ ///        accesses land in the same range when their masked addresses
+ ///        (address & mask) are equal.
+ struct MemoryStats
+ {
+     /// Map key: a raw address together with the mask that strips its
+     /// in-range (low) bits. The address is stored unmasked.
+     struct MemoryTrackerKey
+     {
+         uint64_t address;
+         uint64_t mask;
+     };
+
+     /// Per-range read/write counters and the timestamp window observed.
+     struct MemoryTrackerData
+     {
+         uint32_t accessCountRead;
+         uint32_t accessCountWrite;
+         uint64_t tscMin;
+         uint64_t tscMax;
+     };
+
+     /// Orders keys by masked address, so every address inside one range
+     /// compares equivalent to every other address of that range.
+     struct AddressRangeComparator
+     {
+         bool operator()(MemoryTrackerKey a, MemoryTrackerKey b) const
+         {
+             return (a.address & a.mask) < (b.address & b.mask);
+         }
+     };
+
+     typedef std::map<MemoryTrackerKey, MemoryTrackerData, AddressRangeComparator> MemoryTrackerMap;
+     MemoryTrackerMap trackedMemory = {};
+
+     /// @brief Records a single access in the range containing address.
+     /// @param address - address that was accessed.
+     /// @param addressMask - mask selecting the range bits of address.
+     /// @param isRead - non-zero for a read, zero for a write.
+     /// @param tsc - timestamp of the access.
+     void TrackMemoryAccess(uint64_t address, uint64_t addressMask, uint8_t isRead, uint64_t tsc)
+     {
+         MemoryTrackerKey key;
+         key.address = address;
+         key.mask    = addressMask;
+
+         // lower_bound yields either the matching range or the insertion hint.
+         MemoryTrackerMap::iterator i = trackedMemory.lower_bound(key);
+         const bool alreadyTracked =
+             (i != trackedMemory.end()) && !(trackedMemory.key_comp()(key, i->first));
+
+         if (alreadyTracked)
+         {
+             MemoryTrackerData& entry = i->second;
+             if (isRead)
+             {
+                 ++entry.accessCountRead;
+             }
+             else
+             {
+                 ++entry.accessCountWrite;
+             }
+
+             // Keep a true min/max window; do not rely on timestamps
+             // arriving in increasing order.
+             if (tsc < entry.tscMin)
+             {
+                 entry.tscMin = tsc;
+             }
+             if (tsc > entry.tscMax)
+             {
+                 entry.tscMax = tsc;
+             }
+         }
+         else
+         {
+             // First access to this range: seed counters and timestamp window.
+             MemoryTrackerData data;
+             data.accessCountRead  = isRead ? 1 : 0;
+             data.accessCountWrite = isRead ? 0 : 1;
+             data.tscMin           = tsc;
+             data.tscMax           = tsc;
+             trackedMemory.insert(i, MemoryTrackerMap::value_type(key, data));
+         }
+     }
+ };
+
//////////////////////////////////////////////////////////////////////////
/// @brief Event handler that handles API thread events. This is shared
/// between the API and its caller (e.g. driver shim) but typically
EventHandlerWorkerStats(uint32_t id) : EventHandlerFile(id), mNeedFlush(false)
{
memset(mShaderStats, 0, sizeof(mShaderStats));
+
+ // compute address mask for memory tracking
+ mAddressMask = 0;
+ uint64_t addressRangeBytes = 64;
+ while (addressRangeBytes > 0)
+ {
+ mAddressMask = (mAddressMask << 1) | 1;
+ addressRangeBytes = addressRangeBytes >> 1;
+ }
+ mAddressMask = ~mAddressMask;
}
virtual void Handle(const EarlyDepthStencilInfoSingleSample& event)
mGS = {};
}
+ /// @brief Feeds one memory access event into the memory tracker, using
+ ///        the handler's address mask to select the address range.
+ virtual void Handle(const MemoryAccessEvent& event)
+ {
+     const auto& access = event.data;
+     mMemoryStats.TrackMemoryAccess(access.ptr, mAddressMask, access.isRead, access.tsc);
+ }
+
+ /// @brief Emits one MemoryStatsEvent per tracked address range, tagged
+ ///        with the draw id of the end event, then empties the tracker.
+ virtual void Handle(const MemoryStatsEndEvent& event)
+ {
+     for (const auto& tracked : mMemoryStats.trackedMemory)
+     {
+         // Report the range's base address rather than the raw address stored in the key.
+         const uint64_t baseAddr = tracked.first.address & mAddressMask;
+
+         MemoryStatsEvent statsEvent(event.data.drawId,
+                                     baseAddr,
+                                     tracked.second.accessCountRead,
+                                     tracked.second.accessCountWrite,
+                                     tracked.second.tscMin,
+                                     tracked.second.tscMax);
+         EventHandlerFile::Handle(statsEvent);
+     }
+
+     mMemoryStats.trackedMemory.clear();
+ }
+
virtual void Handle(const GSPrimInfo& event)
{
mGS.inputPrimCount += event.data.inputPrimCount;
SWR_SHADER_STATS mShaderStats[NUM_SHADER_TYPES];
+ MemoryStats mMemoryStats = {};
+ uint64_t mAddressMask = 0;
+
};
static EventManager* FromHandle(HANDLE hThreadContext)
uint32_t swTagFlushCounter;
char swTagFlushReason[256];
uint32_t swTagFlushType;
-};
\ No newline at end of file
+};
+
+event SWTagApiCallEvent
+{
+ uint64_t swTagFrame;
+ uint32_t swTagDrawOrDispatch;
+ uint32_t swTagDraw;
+ uint32_t swTagDispatch;
+ char swTagApiCall[256];
+};
+
+event MemoryStatsEvent
+{
+ uint32_t drawId;
+ uint64_t baseAddr;
+ uint32_t accessCountRead;
+ uint32_t accessCountWrite;
+ uint64_t tscMin;
+ uint64_t tscMax;
+};
uint32_t drawId;
};
+event MemoryAccessEvent
+{
+ uint32_t drawId;
+ uint64_t tsc;
+ uint64_t ptr;
+ uint32_t size;
+ uint8_t isRead;
+ uint8_t client;
+};
+
+event MemoryStatsEndEvent
+{
+ uint32_t drawId;
+};
+
event TessPrimCount
{
uint64_t primCount;
#if defined(KNOB_ENABLE_AR)
// Initialize worker thread context for ArchRast.
pContext->pArContext[i] = ArchRast::CreateThreadContext(ArchRast::AR_THREAD::WORKER);
+
+ SWR_WORKER_DATA* pWorkerData = (SWR_WORKER_DATA*)pContext->threadPool.pThreadData[i].pWorkerPrivateData;
+ pWorkerData->hArContext = pContext->pArContext[i];
#endif
+
+
}
#if defined(KNOB_ENABLE_AR)
// Independent of KNOB_MAX_THREADS_PER_CORE.
};
+//////////////////////////////////////////////////////////////////////////
+/// SWR_WORKER_DATA
+/// Data located at the start of each worker thread's private data; the
+/// per-worker private data pointer is cast to this type to reach it.
+/////////////////////////////////////////////////////////////////////////
+struct SWR_WORKER_DATA
+{
+ HANDLE hArContext; // handle to the archrast context
+};
+
//////////////////////////////////////////////////////////////////////////
/// SWR_WORKER_PRIVATE_STATE
/// Data used to allocate per-worker thread private data. A pointer
/// to this data will be passed in to each shader function.
+/// The first field of this private data must be SWR_WORKER_DATA.
+/// perWorkerPrivateStateSize must be >= sizeof(SWR_WORKER_DATA).
/////////////////////////////////////////////////////////////////////////
struct SWR_WORKER_PRIVATE_STATE
{
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEStreamout, pDC->drawId);
+ void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
+
const API_STATE& state = GetApiState(pDC);
const SWR_STREAMOUT_STATE& soState = state.soState;
// Call SOS
SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr,
"Trying to execute uninitialized streamout jit function.");
- state.pfnSoFunc[streamIndex](GetPrivateState(pDC), soContext);
+ state.pfnSoFunc[streamIndex](GetPrivateState(pDC), pWorkerData, soContext);
}
// Update SO write offset. The driver provides memory for the update.
uint32_t numLodExecuted;
};
+
//////////////////////////////////////////////////////////////////////////
/// SWR_VS_CONTEXT
/// @brief Input to vertex shader
typedef void(__cdecl *PFN_DS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_DS_CONTEXT* pDsContext);
typedef void(__cdecl *PFN_GS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_GS_CONTEXT* pGsContext);
typedef void(__cdecl *PFN_CS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_CS_CONTEXT* pCsContext);
-typedef void(__cdecl *PFN_SO_FUNC)(HANDLE hPrivateData, SWR_STREAMOUT_CONTEXT& soContext);
+typedef void(__cdecl *PFN_SO_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_STREAMOUT_CONTEXT& soContext);
typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_PS_CONTEXT* pContext);
typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_PS_CONTEXT* pContext);
typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(SWR_BLEND_CONTEXT*);
{
ExecuteCallbacks(pContext, workerId, pDC);
+ // Report accumulated memory access stats
+ AR_EVENT(MemoryStatsEndEvent(pDC->drawId));
+
// Cleanup memory allocations
pDC->pArena->Reset(true);
if (!pDC->isCompute)
// Allocate worker private data
pPool->pWorkerPrivateDataArray = nullptr;
- if (pContext->workerPrivateState.perWorkerPrivateStateSize)
+ if (pContext->workerPrivateState.perWorkerPrivateStateSize == 0)
{
- size_t perWorkerSize =
- AlignUpPow2(pContext->workerPrivateState.perWorkerPrivateStateSize, 64);
- size_t totalSize = perWorkerSize * pPool->numThreads;
- if (totalSize)
- {
- pPool->pWorkerPrivateDataArray = AlignedMalloc(totalSize, 64);
- SWR_ASSERT(pPool->pWorkerPrivateDataArray);
+ pContext->workerPrivateState.perWorkerPrivateStateSize = sizeof(SWR_WORKER_DATA);
+ pContext->workerPrivateState.pfnInitWorkerData = nullptr;
+ pContext->workerPrivateState.pfnFinishWorkerData = nullptr;
+ }
+
+ // initialize contents of SWR_WORKER_DATA
+ size_t perWorkerSize =
+ AlignUpPow2(pContext->workerPrivateState.perWorkerPrivateStateSize, 64);
+ size_t totalSize = perWorkerSize * pPool->numThreads;
+ if (totalSize)
+ {
+ pPool->pWorkerPrivateDataArray = AlignedMalloc(totalSize, 64);
+ SWR_ASSERT(pPool->pWorkerPrivateDataArray);
- void* pWorkerData = pPool->pWorkerPrivateDataArray;
- for (uint32_t i = 0; i < pPool->numThreads; ++i)
+ void* pWorkerData = pPool->pWorkerPrivateDataArray;
+ for (uint32_t i = 0; i < pPool->numThreads; ++i)
+ {
+ pPool->pThreadData[i].pWorkerPrivateData = pWorkerData;
+ if (pContext->workerPrivateState.pfnInitWorkerData)
{
- pPool->pThreadData[i].pWorkerPrivateData = pWorkerData;
- if (pContext->workerPrivateState.pfnInitWorkerData)
- {
- pContext->workerPrivateState.pfnInitWorkerData(pWorkerData, i);
- }
- pWorkerData = PtrAdd(pWorkerData, perWorkerSize);
+ pContext->workerPrivateState.pfnInitWorkerData(pWorkerData, i);
}
+ pWorkerData = PtrAdd(pWorkerData, perWorkerSize);
}
}
mpTranslationFuncTy = nullptr;
mpfnTranslateGfxAddressForRead = nullptr;
mpfnTranslateGfxAddressForWrite = nullptr;
+ mpfnTrackMemAccess = nullptr;
mpParamSimDC = nullptr;
+ mpWorkerData = nullptr;
}
return Ptr;
}
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Emits a call to the memory-access tracking callback for one
+ ///        load or store. Does nothing unless KNOB_ENABLE_AR is defined,
+ ///        KNOB_TRACK_MEMORY_WORKING_SET is set and a callback is present.
+ /// @param Ptr - address being accessed: an int64 value or a pointer value.
+ /// @param Ty - type used to size the access when Ptr is an int64 value.
+ /// @param usage - memory client performing the access.
+ /// @param isRead - true for loads, false for stores.
+ void BuilderGfxMem::TrackerHelper(Value* Ptr, Type* Ty, JIT_MEM_CLIENT usage, bool isRead)
+ {
+#if defined(KNOB_ENABLE_AR)
+     if (!KNOB_TRACK_MEMORY_WORKING_SET)
+     {
+         return;
+     }
+
+     // There are some shader compile setups where no tracking function is set up.
+     // This would be a situation where the accesses are to internal rasterizer memory
+     // and won't be logged. Bail out before emitting any IR for them.
+     // TODO: we may wish to revisit this for URB reads/writes, though.
+     if (!mpfnTrackMemAccess)
+     {
+         return;
+     }
+     SWR_ASSERT(mpWorkerData != nullptr);
+
+     const bool isInt64Address = (Ptr->getType() == mInt64Ty);
+
+     // NOTE(review): for the pointer case this measures the pointer type
+     // itself rather than the pointed-to data - confirm that is intended.
+     DataLayout     dataLayout(JM()->mpCurrentModule);
+     const uint32_t size =
+         (uint32_t)dataLayout.getTypeAllocSize(isInt64Address ? Ty : Ptr->getType());
+
+     // The callback takes the address as int64; convert actual pointers.
+     Value* address = isInt64Address ? Ptr : PTR_TO_INT(Ptr, mInt64Ty);
+
+     CALL(mpfnTrackMemAccess,
+          {mpParamSimDC,
+           mpWorkerData,
+           address,
+           C((uint32_t)size),
+           C((uint8_t)isRead),
+           C((uint32_t)usage)});
+#endif
+ }
+
LoadInst* BuilderGfxMem::LOAD(Value* Ptr, const char* Name, Type* Ty, JIT_MEM_CLIENT usage)
{
AssertGFXMemoryParams(Ptr, usage);
+ TrackerHelper(Ptr, Ty, usage, true);
Ptr = TranslationHelper(Ptr, Ty);
return Builder::LOAD(Ptr, Name);
LoadInst* BuilderGfxMem::LOAD(Value* Ptr, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage)
{
AssertGFXMemoryParams(Ptr, usage);
+ TrackerHelper(Ptr, Ty, usage, true);
Ptr = TranslationHelper(Ptr, Ty);
return Builder::LOAD(Ptr, Name);
Value* Ptr, bool isVolatile, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage)
{
AssertGFXMemoryParams(Ptr, usage);
+ TrackerHelper(Ptr, Ty, usage, true);
Ptr = TranslationHelper(Ptr, Ty);
return Builder::LOAD(Ptr, isVolatile, Name);
JIT_MEM_CLIENT usage)
{
AssertGFXMemoryParams(Ptr, usage);
+ TrackerHelper(Ptr, Ty, usage, true);
Ptr = TranslationHelper(Ptr, Ty);
return Builder::MASKED_LOAD(Ptr, Align, Mask, PassThru, Name, Ty, usage);
BuilderGfxMem::STORE(Value* Val, Value* Ptr, bool isVolatile, Type* Ty, JIT_MEM_CLIENT usage)
{
AssertGFXMemoryParams(Ptr, usage);
+ TrackerHelper(Ptr, Ty, usage, false);
Ptr = TranslationHelper(Ptr, Ty);
return Builder::STORE(Val, Ptr, isVolatile, Ty, usage);
JIT_MEM_CLIENT usage)
{
AssertGFXMemoryParams(BasePtr, usage);
+ TrackerHelper(BasePtr, Ty, usage, false);
BasePtr = TranslationHelper(BasePtr, Ty);
return Builder::STORE(Val, BasePtr, offset, Ty, usage);
{
AssertGFXMemoryParams(Ptr, usage);
+ TrackerHelper(Ptr, Ty, usage, false);
+
Ptr = TranslationHelper(Ptr, Ty);
return Builder::MASKED_STORE(Val, Ptr, Align, Mask, Ty, usage);
}
Type* PtrTy = nullptr,
const Twine& Name = "",
JIT_MEM_CLIENT usage = JIT_MEM_CLIENT::MEM_CLIENT_INTERNAL);
-
+
protected:
void AssertGFXMemoryParams(Value* ptr, Builder::JIT_MEM_CLIENT usage);
virtual Value* OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset);
Value* TranslationHelper(Value* Ptr, Type* Ty);
+ void TrackerHelper(Value* Ptr, Type* Ty, JIT_MEM_CLIENT usage, bool isRead);
+
FunctionType* GetTranslationFunctionType() { return mpTranslationFuncTy; }
Value* GetTranslationFunctionForRead() { return mpfnTranslateGfxAddressForRead; }
Value* GetParamSimDC() { return mpParamSimDC; }
+ Value* mpWorkerData;
+
private:
FunctionType* mpTranslationFuncTy;
Value* mpfnTranslateGfxAddressForRead;
Value* mpfnTranslateGfxAddressForWrite;
Value* mpParamSimDC;
+ FunctionType* mpTrackMemAccessFuncTy;
+ Value* mpfnTrackMemAccess;
};
} // namespace SwrJit
SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
void ConvertFormat(SWR_FORMAT format, Value* texels[4]);
- Value* mpWorkerData;
Value* mpFetchInfo;
};
mpWorkerData = &*argitr;
++argitr;
mpWorkerData->setName("pWorkerData");
+
mpFetchInfo = &*argitr;
++argitr;
mpFetchInfo->setName("fetchInfo");
std::ios_base::in | std::ios_base::out | std::ios_base::ate);
fnName << ComputeCRC(0, &state, sizeof(state));
- Type* typeParam0;
- typeParam0 = mInt8PtrTy;
-
std::vector<Type*> args{
- typeParam0,
- PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT*
+ mInt8PtrTy,
+ mInt8PtrTy,
+ PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT*
};
FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);
privateContext->setName("privateContext");
SetPrivateContext(privateContext);
+ mpWorkerData = &*argitr;
+ ++argitr;
+ mpWorkerData->setName("pWorkerData");
+
Value* pSoCtx = &*argitr++;
pSoCtx->setName("pSoCtx");