pContext->privateStateSize = pCreateInfo->privateStateSize;
+ // initialize callback functions
+ pContext->pfnLoadTile = pCreateInfo->pfnLoadTile;
+ pContext->pfnStoreTile = pCreateInfo->pfnStoreTile;
+ pContext->pfnTranslateGfxptrForRead = pCreateInfo->pfnTranslateGfxptrForRead;
+ pContext->pfnTranslateGfxptrForWrite = pCreateInfo->pfnTranslateGfxptrForWrite;
+ pContext->pfnMakeGfxPtr = pCreateInfo->pfnMakeGfxPtr;
+ pContext->pfnCreateMemoryContext = pCreateInfo->pfnCreateMemoryContext;
+ pContext->pfnDestroyMemoryContext = pCreateInfo->pfnDestroyMemoryContext;
+ pContext->pfnUpdateSoWriteOffset = pCreateInfo->pfnUpdateSoWriteOffset;
+ pContext->pfnUpdateStats = pCreateInfo->pfnUpdateStats;
+ pContext->pfnUpdateStatsFE = pCreateInfo->pfnUpdateStatsFE;
+
+ pContext->hExternalMemory = pCreateInfo->hExternalMemory;
+
pContext->MAX_DRAWS_IN_FLIGHT = KNOB_MAX_DRAWS_IN_FLIGHT;
if (pCreateInfo->MAX_DRAWS_IN_FLIGHT != 0)
{
pContext->threadPool.pThreadData ? pContext->threadPool.pThreadData[i].numaId : 0;
pContext->ppScratch[i] = (uint8_t*)VirtualAllocExNuma(GetCurrentProcess(),
nullptr,
- 32 * sizeof(KILOBYTE),
+ KNOB_WORKER_SCRATCH_SPACE_SIZE,
MEM_RESERVE | MEM_COMMIT,
PAGE_READWRITE,
numaNode);
#else
pContext->ppScratch[i] =
- (uint8_t*)AlignedMalloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4);
+ (uint8_t*)AlignedMalloc(KNOB_WORKER_SCRATCH_SPACE_SIZE, KNOB_SIMD_WIDTH * 4);
#endif
#if defined(KNOB_ENABLE_AR)
// initialize hot tile manager
pContext->pHotTileMgr = new HotTileMgr();
- // initialize callback functions
- pContext->pfnLoadTile = pCreateInfo->pfnLoadTile;
- pContext->pfnStoreTile = pCreateInfo->pfnStoreTile;
- pContext->pfnTranslateGfxptrForRead = pCreateInfo->pfnTranslateGfxptrForRead;
- pContext->pfnTranslateGfxptrForWrite = pCreateInfo->pfnTranslateGfxptrForWrite;
- pContext->pfnMakeGfxPtr = pCreateInfo->pfnMakeGfxPtr;
- pContext->pfnUpdateSoWriteOffset = pCreateInfo->pfnUpdateSoWriteOffset;
- pContext->pfnUpdateStats = pCreateInfo->pfnUpdateStats;
- pContext->pfnUpdateStatsFE = pCreateInfo->pfnUpdateStatsFE;
-
-
// pass pointer to bucket manager back to caller
#ifdef KNOB_ENABLE_RDTSC
pCreateInfo->pBucketMgr = pContext->pBucketMgr;
void SwrDispatch(HANDLE hContext,
uint32_t threadGroupCountX,
uint32_t threadGroupCountY,
uint32_t threadGroupCountZ)
{
if (KNOB_TOSS_DRAW)
{
pTaskData->threadGroupCountX = threadGroupCountX;
pTaskData->threadGroupCountY = threadGroupCountY;
pTaskData->threadGroupCountZ = threadGroupCountZ;
+
pTaskData->enableThreadDispatch = false;
uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
typedef void*(SWR_API* PFN_TRANSLATE_GFXPTR_FOR_READ)(HANDLE hPrivateContext,
gfxptr_t xpAddr,
- bool* pbNullTileAccessed);
+ bool* pbNullTileAccessed,
+ HANDLE hPrivateWorkerData);
typedef void*(SWR_API* PFN_TRANSLATE_GFXPTR_FOR_WRITE)(HANDLE hPrivateContext,
gfxptr_t xpAddr,
- bool* pbNullTileAccessed);
+ bool* pbNullTileAccessed,
+ HANDLE hPrivateWorkerData);
typedef gfxptr_t(SWR_API* PFN_MAKE_GFXPTR)(HANDLE hPrivateContext, void* sysAddr);
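+//////////////////////////////////////////////////////////////////////////
+/// @brief Callback to allow the driver to create a per-worker memory context
+///        from the external memory handle supplied at context creation.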
+typedef HANDLE(SWR_API* PFN_CREATE_MEMORY_CONTEXT)(HANDLE hExternalMemory);
+
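+//////////////////////////////////////////////////////////////////////////
+/// @brief Callback to destroy a memory context previously created through
+///        PFN_CREATE_MEMORY_CONTEXT.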
+typedef void(SWR_API* PFN_DESTROY_MEMORY_CONTEXT)(HANDLE hExternalMemory, HANDLE hMemoryContext);
+
//////////////////////////////////////////////////////////////////////////
/// @brief Callback to allow driver to update their copy of streamout write offset.
///        This call is made for any draw operation that has streamout enabled
// Independent of KNOB_MAX_THREADS_PER_CORE.
};
-struct SWR_WORKER_DATA
-{
- HANDLE hArContext; // handle to the archrast context
-};
+//////////////////////////////////////////////////////////////////////////
+/// SWR_CONTEXT
+/// Forward Declaration (see context.h for full definition)
+/////////////////////////////////////////////////////////////////////////
+struct SWR_CONTEXT;
//////////////////////////////////////////////////////////////////////////
/// SWR_WORKER_PRIVATE_STATE
/////////////////////////////////////////////////////////////////////////
struct SWR_WORKER_PRIVATE_STATE
{
- typedef void(SWR_API* PFN_WORKER_DATA)(HANDLE hWorkerPrivateData, uint32_t iWorkerNum);
+ typedef void(SWR_API* PFN_WORKER_DATA)(SWR_CONTEXT* pContext, HANDLE hWorkerPrivateData, uint32_t iWorkerNum);
size_t perWorkerPrivateStateSize; ///< Amount of data to allocate per-worker
PFN_WORKER_DATA pfnInitWorkerData; ///< Init function for worker data. If null
PFN_TRANSLATE_GFXPTR_FOR_READ pfnTranslateGfxptrForRead;
PFN_TRANSLATE_GFXPTR_FOR_WRITE pfnTranslateGfxptrForWrite;
PFN_MAKE_GFXPTR pfnMakeGfxPtr;
+ PFN_CREATE_MEMORY_CONTEXT pfnCreateMemoryContext;
+ PFN_DESTROY_MEMORY_CONTEXT pfnDestroyMemoryContext;
PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset;
PFN_UPDATE_STATS pfnUpdateStats;
PFN_UPDATE_STATS_FE pfnUpdateStatsFE;
// ArchRast event manager.
HANDLE hArEventManager;
+ // handle to external memory used by workers to create memory contexts
+ HANDLE hExternalMemory;
+
// Input (optional): Threading info that overrides any set KNOB values.
SWR_THREADING_INFO* pThreadInfo;
uint32_t threadGroupCountY,
uint32_t threadGroupCountZ);
-
/// @note this enum needs to be kept in sync with HOTTILE_STATE!
enum SWR_TILE_STATE
{
// execute pixel shader
RDTSC_BEGIN(pDC->pContext->pBucketMgr, BEPixelShader, pDC->drawId);
state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext);
- UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(activeLanes)));
RDTSC_END(pDC->pContext->pBucketMgr, BEPixelShader, 0);
// update stats
PFN_TRANSLATE_GFXPTR_FOR_READ pfnTranslateGfxptrForRead;
PFN_TRANSLATE_GFXPTR_FOR_WRITE pfnTranslateGfxptrForWrite;
PFN_MAKE_GFXPTR pfnMakeGfxPtr;
+ PFN_CREATE_MEMORY_CONTEXT pfnCreateMemoryContext;
+ PFN_DESTROY_MEMORY_CONTEXT pfnDestroyMemoryContext;
PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset;
PFN_UPDATE_STATS pfnUpdateStats;
PFN_UPDATE_STATS_FE pfnUpdateStatsFE;
// ArchRast thread contexts.
HANDLE* pArContext;
+ // handle to external memory used by workers to create memory contexts
+ HANDLE hExternalMemory;
+
BucketManager *pBucketMgr;
};
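+// A minimal sketch, assuming a hypothetical driver-side DriverWorkerData struct,
+// of how a pfnInitWorkerData hook could consume the memory-context members above:
+//
+//   void InitWorkerData(SWR_CONTEXT* pContext, HANDLE hWorkerPrivateData, uint32_t iWorkerNum)
+//   {
+//       DriverWorkerData* pData = (DriverWorkerData*)hWorkerPrivateData;
+//       pData->hMemContext = pContext->pfnCreateMemoryContext(pContext->hExternalMemory);
+//   }
+//
+// A matching pfnFinishWorkerData hook would release the context with
+// pContext->pfnDestroyMemoryContext(pContext->hExternalMemory, pData->hMemContext).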
{
bool nullTileAccessed = false;
void* pWriteOffset = pDC->pContext->pfnTranslateGfxptrForWrite(
- GetPrivateState(pDC), soContext.pBuffer[i]->pWriteOffset, &nullTileAccessed);
+ GetPrivateState(pDC), soContext.pBuffer[i]->pWriteOffset, &nullTileAccessed, pWorkerData);
*((uint32_t*)pWriteOffset) = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
}
#define KNOB_GUARDBAND_WIDTH 32768.0f
#define KNOB_GUARDBAND_HEIGHT 32768.0f
+// Per-worker scratch space size; currently only used for TGSM sizing in some stages.
+#define KNOB_WORKER_SCRATCH_SPACE_SIZE (32 * 1024)
+
///////////////////////////////
// Macro tile configuration
///////////////////////////////
{48, 49, 52, 53, 56, 57, 60, 61},
{50, 51, 54, 55, 58, 59, 62, 63}};
- OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
+ OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc = {};
// pull point information from triangle buffer
// @todo use structs for readability
// mask indices by the maximum valid index for x/y of coveragemap.
uint32_t tX = workDesc.triFlags.coverageMask & 0x7;
uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7;
- // todo: multisample points?
- triDesc.coverageMask[0] = 1ULL << coverageMap[tY][tX];
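+ // replicate the point's single covered sample across all coverage mask slots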
+ for (uint32_t i = 0; i < _countof(triDesc.coverageMask); ++i)
+ {
+ triDesc.coverageMask[i] = 1ULL << coverageMap[tY][tX];
+ }
+ triDesc.anyCoveredSamples = triDesc.coverageMask[0];
+ triDesc.innerCoverageMask = triDesc.coverageMask[0];
// no persp divide needed for points
triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs;
typename SIMD_T::Vec4 attrib[SWR_VTX_NUM_SLOTS];
};
+struct SWR_WORKER_DATA
+{
+ HANDLE hArContext; // handle to the archrast context
+};
+
//////////////////////////////////////////////////////////////////////////
/// SWR_SHADER_STATS
/// @brief Structure passed to shader for stats collection.
pPool->pThreadData[i].pWorkerPrivateData = pWorkerData;
if (pContext->workerPrivateState.pfnInitWorkerData)
{
- pContext->workerPrivateState.pfnInitWorkerData(pWorkerData, i);
+ pContext->workerPrivateState.pfnInitWorkerData(pContext, pWorkerData, i);
}
pWorkerData = PtrAdd(pWorkerData, perWorkerSize);
}
if (pContext->workerPrivateState.pfnFinishWorkerData)
{
pContext->workerPrivateState.pfnFinishWorkerData(
- pPool->pThreadData[t].pWorkerPrivateData, t);
+ pContext, pPool->pThreadData[t].pWorkerPrivateData, t);
}
}
int ExecUnhookedProcess(const std::string& CmdLine, std::string* pStdOut, std::string* pStdErr)
{
- return ExecCmd(CmdLine, "", pStdOut, pStdErr);
+ return ExecCmd(CmdLine, nullptr, pStdOut, pStdErr);
}
/// Calculate actual directory where module will be cached.
mSimdVectorTy = ArrayType::get(mSimdFP32Ty, 4);
mSimdVectorIntTy = ArrayType::get(mSimdInt32Ty, 4);
mSimdVectorTRTy = ArrayType::get(mSimdFP32Ty, 5);
+ mSimdVectorTRIntTy = ArrayType::get(mSimdInt32Ty, 5);
}
/// @brief Mark this alloca as temporary to avoid hoisting later on
Type* mSimdVectorTy;
Type* mSimdVectorTRTy;
Type* mSimdVectorIntTy;
+ Type* mSimdVectorTRIntTy;
// Built in types: simd16
return ConstantVector::get(vConsts);
}
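+/// @brief Build a constant vector from a std::vector of literal values,
+///        e.g. C(std::vector<uint32_t>{0, 1, 2, 3}) yields a 4 x i32 constant vector.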
+template <typename Ty>
+Constant* C(const std::vector<Ty>& constList)
+{
+ std::vector<Constant*> vConsts;
+ for (auto i : constList)
+ {
+ vConsts.push_back(C((Ty)i));
+ }
+ return ConstantVector::get(vConsts);
+}
+
template <typename Ty>
Constant* CA(LLVMContext& ctx, ArrayRef<Ty> constList)
{
}
}
+
+typedef void* (*PFN_TRANSLATEGFXADDRESS_FUNC)(void* pdc, gfxptr_t va, bool* out_pbNullTileAccessed, void* pWorkerData);
+
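+//////////////////////////////////////////////////////////////////////////
+/// @brief Load vWidth indices of type T starting at gfx address 'indices',
+///        translating each address through pfnTranslate before the load.
+///        Indices are zero-extended to 32 bits; lanes at or beyond
+///        'lastIndex' are written as 0.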
+template <typename T>
+void GetSimdValidIndicesGfx(gfxptr_t indices,
+ gfxptr_t lastIndex,
+ uint32_t vWidth,
+ PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate,
+ void* pdc,
+ uint32_t* outIndices,
+ void* pWorkerData)
+{
+ SWR_ASSERT(outIndices != nullptr);
+
+ gfxptr_t indexPtr = indices;
+ for (int64_t lane = 0; lane < vWidth; lane++)
+ {
+ uint32_t index = 0;
+
+ if (indexPtr < lastIndex)
+ {
+ // translate indexPtr and load from it
+ T* addr = (T*)pfnTranslate(pdc, indexPtr, nullptr, pWorkerData);
+ SWR_ASSERT(addr != nullptr);
+ index = *addr;
+ }
+
+ // zero-extend the index to 32 bits and store it in the output lane
+ outIndices[lane] = index;
+
+ indexPtr += sizeof(T);
+ }
+}
+
+void GetSimdValid8bitIndicesGfx(gfxptr_t indices,
+ gfxptr_t lastIndex,
+ uint32_t vWidth,
+ PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate,
+ void* pdc,
+ uint32_t* outIndices,
+ void* pWorkerData)
+{
+ GetSimdValidIndicesGfx<uint8_t>(indices, lastIndex, vWidth, pfnTranslate, pdc, outIndices, pWorkerData);
+}
+
+void GetSimdValid16bitIndicesGfx(gfxptr_t indices,
+ gfxptr_t lastIndex,
+ uint32_t vWidth,
+ PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate,
+ void* pdc,
+ uint32_t* outIndices,
+ void* pWorkerData)
+{
+ GetSimdValidIndicesGfx<uint16_t>(indices, lastIndex, vWidth, pfnTranslate, pdc, outIndices, pWorkerData);
+}
+
+
template <typename T>
Value* FetchJit::GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex)
{