swr: [rasterizer jitter] reimplement SCATTERPS
author: Tim Rowley <timothy.o.rowley@intel.com>
Thu, 25 Aug 2016 17:59:55 +0000 (12:59 -0500)
committer: Tim Rowley <timothy.o.rowley@intel.com>
Mon, 29 Aug 2016 17:42:23 +0000 (12:42 -0500)
Implement SCATTERPS as a dynamic loop based on mask set bits
instead of a static compile time loop.

Signed-off-by: Tim Rowley <timothy.o.rowley@intel.com>
src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h

index da77f600a71f7b96bd09b2a62e6fd538e24b745b..13c1daf6fe16f2c3f3092a4a4fa9da0c3e376573 100644 (file)
@@ -1323,6 +1323,17 @@ void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInpu
     }
 }
 
+// Helper function to create alloca in entry block of function
+Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
+{
+    auto saveIP = IRB()->saveIP();
+    IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
+                          pFunc->getEntryBlock().begin());
+    Value* pAlloca = ALLOCA(pType);
+    IRB()->restoreIP(saveIP);
+    return pAlloca;
+}
+
 //////////////////////////////////////////////////////////////////////////
 /// @brief emulates a scatter operation.
 /// @param pDst - pointer to destination 
@@ -1331,28 +1342,95 @@ void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInpu
 /// @param vMask - mask of valid lanes
 void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
 {
-    Value* pStack = STACKSAVE();
+    /* Scatter algorithm
+    
+       while(Index = BitScanForward(mask))
+            srcElem = srcVector[Index]
+            offsetElem = offsetVector[Index]
+            *(pDst + offsetElem) = srcElem
+            Update mask (mask &= ~(1<<Index))
 
-    Type* pSrcTy = vSrc->getType()->getVectorElementType();
+    */
 
-    // allocate tmp stack for masked off lanes
-    Value* vTmpPtr = ALLOCA(pSrcTy);
+    BasicBlock* pCurBB = IRB()->GetInsertBlock();
+    Function* pFunc = pCurBB->getParent();
+    Type* pSrcTy = vSrc->getType()->getVectorElementType();
 
-    Value *mask = MASK(vMask);
-    for (uint32_t i = 0; i < mVWidth; ++i)
+    // Store vectors on stack
+    if (pScatterStackSrc == nullptr)
     {
-        Value *offset = VEXTRACT(vOffsets, C(i));
-        // byte pointer to component
-        Value *storeAddress = GEP(pDst, offset);
-        storeAddress = BITCAST(storeAddress, PointerType::get(pSrcTy, 0));
-        Value *selMask = VEXTRACT(mask, C(i));
-        Value *srcElem = VEXTRACT(vSrc, C(i));
-        // switch in a safe address to load if we're trying to access a vertex 
-        Value *validAddress = SELECT(selMask, storeAddress, vTmpPtr);
-        STORE(srcElem, validAddress);
+        // Save off stack allocations and reuse per scatter. Significantly reduces stack
+        // requirements for shaders with a lot of scatters.
+        pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
+        pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
     }
+    
+    Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
+    Value* pOffsetsArrayPtr = pScatterStackOffsets;
+    STORE(vSrc, pSrcArrayPtr);
+    STORE(vOffsets, pOffsetsArrayPtr);
+
+    // Cast to pointers for random access
+    pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
+    pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
+
+    Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
+
+    // Get cttz function
+    Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });
+    
+    // Setup loop basic block
+    BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter Loop", pFunc);
+
+    // compute first set bit
+    Value* pIndex = CALL(pfnCttz, { pMask, C(false) });
+
+    Value* pIsUndef = ICMP_EQ(pIndex, C(32));
+
+    // Split current block
+    BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
+
+    // Remove unconditional jump created by splitBasicBlock
+    pCurBB->getTerminator()->eraseFromParent();
+
+    // Add terminator to end of original block
+    IRB()->SetInsertPoint(pCurBB);
+
+    // Add conditional branch
+    COND_BR(pIsUndef, pPostLoop, pLoop);
+
+    // Add loop basic block contents
+    IRB()->SetInsertPoint(pLoop);
+    PHINode* pIndexPhi = PHI(mInt32Ty, 2);
+    PHINode* pMaskPhi = PHI(mInt32Ty, 2);
+
+    pIndexPhi->addIncoming(pIndex, pCurBB);
+    pMaskPhi->addIncoming(pMask, pCurBB);
+
+    // Extract elements for this index
+    Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
+    Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
+
+    // GEP to this offset in dst
+    Value* pCurDst = GEP(pDst, pOffsetElem);
+    pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
+    STORE(pSrcElem, pCurDst);
+
+    // Update the mask
+    Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
+
+    // Terminator
+    Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });
+
+    pIsUndef = ICMP_EQ(pNewIndex, C(32));
+    COND_BR(pIsUndef, pPostLoop, pLoop);
+
+    // Update phi edges
+    pIndexPhi->addIncoming(pNewIndex, pLoop);
+    pMaskPhi->addIncoming(pNewMask, pLoop);
 
-    STACKRESTORE(pStack);
+    // Move builder to beginning of post loop
+    IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
 }
 
 Value* Builder::VABSPS(Value* a)
index b01ffa26d0d32d421ad96760da082320ea58e0b5..3df569d2d4652fb6f62ff5bfc7121c73003f8d8c 100644 (file)
@@ -159,3 +159,9 @@ Value *VINSERTI128(Value* a, Value* b, Constant* imm8);
 // rdtsc buckets macros
 void RDTSC_START(Value* pBucketMgr, Value* pId);
 void RDTSC_STOP(Value* pBucketMgr, Value* pId);
+
+Value* CreateEntryAlloca(Function* pFunc, Type* pType);
+
+// Static stack allocations for scatter operations
+Value* pScatterStackSrc{ nullptr };
+Value* pScatterStackOffsets{ nullptr };
\ No newline at end of file