src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp

   1 /****************************************************************************
   2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 * @file streamout_jit.cpp
  24 *
  25 * @brief Implementation of the streamout jitter
  26 *
  27 * Notes:
  28 *
  29 ******************************************************************************/
  30 #include "builder.h"
  31 #include "jit_api.h"
  32 #include "streamout_jit.h"
  33 #include "gen_state_llvm.h"
  34 #include "llvm/IR/DataLayout.h"
  35
  36 #include <sstream>
  37 #include <unordered_set>
  38
  39 using namespace llvm;
  40 using namespace SwrJit;
  41
  42 //////////////////////////////////////////////////////////////////////////
  43 /// Interface to Jitting a fetch shader
  44 //////////////////////////////////////////////////////////////////////////
  45 struct StreamOutJit : public Builder
  46 {
  47     StreamOutJit(JitManager* pJitMgr) : Builder(pJitMgr){};
  48
  49     // returns pointer to SWR_STREAMOUT_BUFFER
  50     Value* getSOBuffer(Value* pSoCtx, uint32_t buffer)
  51     {
  52         return LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pBuffer, buffer });
  53     }
  54
  55
  56     //////////////////////////////////////////////////////////////////////////
  57     // @brief checks if streamout buffer is oob
  58     // @return <i1> true/false
  59     Value* oob(const STREAMOUT_COMPILE_STATE& state, Value* pSoCtx, uint32_t buffer)
  60     {
  61         Value* returnMask = C(false);
  62
  63         Value* pBuf = getSOBuffer(pSoCtx, buffer);
  64
  65         // load enable
  66         // @todo bool data types should generate <i1> llvm type
  67         Value* enabled = TRUNC(LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_enable }), IRB()->getInt1Ty());
  68
  69         // load buffer size
  70         Value* bufferSize = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_bufferSize });
  71
  72         // load current streamOffset
  73         Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
  74
  75         // load buffer pitch
  76         Value* pitch = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch });
  77
  78         // buffer is considered oob if in use in a decl but not enabled
  79         returnMask = OR(returnMask, NOT(enabled));
  80
  81         // buffer is oob if cannot fit a prims worth of verts
  82         Value* newOffset = ADD(streamOffset, MUL(pitch, C(state.numVertsPerPrim)));
  83         returnMask = OR(returnMask, ICMP_SGT(newOffset, bufferSize));
  84
  85         return returnMask;
  86     }
  87
  88
  89     //////////////////////////////////////////////////////////////////////////
  90     // @brief converts scalar bitmask to <4 x i32> suitable for shuffle vector,
  91     //        packing the active mask bits
  92     //        ex. bitmask 0011 -> (0, 1, 0, 0)
  93     //            bitmask 1000 -> (3, 0, 0, 0)
  94     //            bitmask 1100 -> (2, 3, 0, 0)
  95     Value* PackMask(uint32_t bitmask)
  96     {
  97         std::vector<Constant*> indices(4, C(0));
  98         DWORD index;
  99         uint32_t elem = 0;
 100         while (_BitScanForward(&index, bitmask))
 101         {
 102             indices[elem++] = C((int)index);
 103             bitmask &= ~(1 << index);
 104         }
 105
 106         return ConstantVector::get(indices);
 107     }
 108
 109     //////////////////////////////////////////////////////////////////////////
 110     // @brief convert scalar bitmask to <4xfloat> bitmask
 111     Value* ToMask(uint32_t bitmask)
 112     {
 113         std::vector<Constant*> indices;
 114         for (uint32_t i = 0; i < 4; ++i)
 115         {
 116             if (bitmask & (1 << i))
 117             {
 118                 indices.push_back(C(-1.0f));
 119             }
 120             else
 121             {
 122                 indices.push_back(C(0.0f));
 123             }
 124         }
 125         return ConstantVector::get(indices);
 126     }
 127
 128     //////////////////////////////////////////////////////////////////////////
 129     // @brief processes a single decl from the streamout stream. Reads 4 components from the input
 130     //        stream and writes N components to the output buffer given the componentMask or if
 131     //        a hole, just increments the buffer pointer
 132     // @param pStream - pointer to current attribute
 133     // @param pOutBuffers - pointers to the current location of each output buffer
 134     // @param decl - input decl
 135     void buildDecl(Value* pStream, Value* pOutBuffers[4], const STREAMOUT_DECL& decl)
 136     {
 137         // @todo add this to x86 macros
 138         Function* maskStore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskstore_ps);
 139
 140         uint32_t numComponents = _mm_popcnt_u32(decl.componentMask);
 141         uint32_t packedMask = (1 << numComponents) - 1;
 142         if (!decl.hole)
 143         {
 144             // increment stream pointer to correct slot
 145             Value* pAttrib = GEP(pStream, C(4 * decl.attribSlot));
 146
 147             // load 4 components from stream
 148             Type* simd4Ty = VectorType::get(IRB()->getFloatTy(), 4);
 149             Type* simd4PtrTy = PointerType::get(simd4Ty, 0);
 150             pAttrib = BITCAST(pAttrib, simd4PtrTy);
 151             Value *vattrib = LOAD(pAttrib);
 152
 153             // shuffle/pack enabled components
 154             Value* vpackedAttrib = VSHUFFLE(vattrib, vattrib, PackMask(decl.componentMask));
 155
 156             // store to output buffer
 157             // cast SO buffer to i8*, needed by maskstore
 158             Value* pOut = BITCAST(pOutBuffers[decl.bufferIndex], PointerType::get(mInt8Ty, 0));
 159
 160             // cast input to <4xfloat>
 161             Value* src = BITCAST(vpackedAttrib, simd4Ty);
 162
 163             // cast mask to <4xint>
 164             Value* mask = ToMask(packedMask);
 165             mask = BITCAST(mask, VectorType::get(IRB()->getInt32Ty(), 4));
 166             CALL(maskStore, {pOut, mask, src});
 167         }
 168
 169         // increment SO buffer
 170         pOutBuffers[decl.bufferIndex] = GEP(pOutBuffers[decl.bufferIndex], C(numComponents));
 171     }
 172
 173     //////////////////////////////////////////////////////////////////////////
 174     // @brief builds a single vertex worth of data for the given stream
 175     // @param streamState - state for this stream
 176     // @param pCurVertex - pointer to src stream vertex data
 177     // @param pOutBuffer - pointers to up to 4 SO buffers
 178     void buildVertex(const STREAMOUT_STREAM& streamState, Value* pCurVertex, Value* pOutBuffer[4])
 179     {
 180         for (uint32_t d = 0; d < streamState.numDecls; ++d)
 181         {
 182             const STREAMOUT_DECL& decl = streamState.decl[d];
 183             buildDecl(pCurVertex, pOutBuffer, decl);
 184         }
 185     }
 186
 187     void buildStream(const STREAMOUT_COMPILE_STATE& state, const STREAMOUT_STREAM& streamState, Value* pSoCtx, BasicBlock* returnBB, Function* soFunc)
 188     {
 189         // get list of active SO buffers
 190         std::unordered_set<uint32_t> activeSOBuffers;
 191         for (uint32_t d = 0; d < streamState.numDecls; ++d)
 192         {
 193             const STREAMOUT_DECL& decl = streamState.decl[d];
 194             activeSOBuffers.insert(decl.bufferIndex);
 195         }
 196
 197         // always increment numPrimStorageNeeded
 198         Value *numPrimStorageNeeded = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded });
 199         numPrimStorageNeeded = ADD(numPrimStorageNeeded, C(1));
 200         STORE(numPrimStorageNeeded, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded });
 201
 202         // check OOB on active SO buffers.  If any buffer is out of bound, don't write
 203         // the primitive to any buffer
 204         Value* oobMask = C(false);
 205         for (uint32_t buffer : activeSOBuffers)
 206         {
 207             oobMask = OR(oobMask, oob(state, pSoCtx, buffer));
 208         }
 209
 210         BasicBlock* validBB = BasicBlock::Create(JM()->mContext, "valid", soFunc);
 211
 212         // early out if OOB
 213         COND_BR(oobMask, returnBB, validBB);
 214
 215         IRB()->SetInsertPoint(validBB);
 216
 217         Value* numPrimsWritten = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten });
 218         numPrimsWritten = ADD(numPrimsWritten, C(1));
 219         STORE(numPrimsWritten, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten });
 220
 221         // compute start pointer for each output buffer
 222         Value* pOutBuffer[4];
 223         Value* pOutBufferStartVertex[4];
 224         Value* outBufferPitch[4];
 225         for (uint32_t b: activeSOBuffers)
 226         {
 227             Value* pBuf = getSOBuffer(pSoCtx, b);
 228             Value* pData = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pBuffer });
 229             Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
 230             pOutBuffer[b] = GEP(pData, streamOffset);
 231             pOutBufferStartVertex[b] = pOutBuffer[b];
 232
 233             outBufferPitch[b] = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch });
 234         }
 235
 236         // loop over the vertices of the prim
 237         Value* pStreamData = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pPrimData });
 238         for (uint32_t v = 0; v < state.numVertsPerPrim; ++v)
 239         {
 240             buildVertex(streamState, pStreamData, pOutBuffer);
 241
 242             // increment stream and output buffer pointers
 243             // stream verts are always 32*4 dwords apart
 244             pStreamData = GEP(pStreamData, C(SWR_VTX_NUM_SLOTS * 4));
 245
 246             // output buffers offset using pitch in buffer state
 247             for (uint32_t b : activeSOBuffers)
 248             {
 249                 pOutBufferStartVertex[b] = GEP(pOutBufferStartVertex[b], outBufferPitch[b]);
 250                 pOutBuffer[b] = pOutBufferStartVertex[b];
 251             }
 252         }
 253
 254         // update each active buffer's streamOffset
 255         for (uint32_t b : activeSOBuffers)
 256         {
 257             Value* pBuf = getSOBuffer(pSoCtx, b);
 258             Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
 259             streamOffset = ADD(streamOffset, MUL(C(state.numVertsPerPrim), outBufferPitch[b]));
 260             STORE(streamOffset, pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
 261         }
 262     }
 263
 264     Function* Create(const STREAMOUT_COMPILE_STATE& state)
 265     {
 266         static std::size_t soNum = 0;
 267
 268         std::stringstream fnName("SOShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
 269         fnName << soNum++;
 270
 271         // SO function signature
 272         // typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT*)
 273
 274         std::vector<Type*> args{
 275             PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT*
 276         };
 277
 278         FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);
 279         Function* soFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
 280
 281         // create return basic block
 282         BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", soFunc);
 283         BasicBlock* returnBB = BasicBlock::Create(JM()->mContext, "return", soFunc);
 284
 285         IRB()->SetInsertPoint(entry);
 286
 287         // arguments
 288         auto argitr = soFunc->arg_begin();
 289         Value* pSoCtx = &*argitr++;
 290         pSoCtx->setName("pSoCtx");
 291
 292         const STREAMOUT_STREAM& streamState = state.stream;
 293         buildStream(state, streamState, pSoCtx, returnBB, soFunc);
 294
 295         BR(returnBB);
 296
 297         IRB()->SetInsertPoint(returnBB);
 298         RET_VOID();
 299
 300         JitManager::DumpToFile(soFunc, "SoFunc");
 301
 302         ::FunctionPassManager passes(JM()->mpCurrentModule);
 303
 304         passes.add(createBreakCriticalEdgesPass());
 305         passes.add(createCFGSimplificationPass());
 306         passes.add(createEarlyCSEPass());
 307         passes.add(createPromoteMemoryToRegisterPass());
 308         passes.add(createCFGSimplificationPass());
 309         passes.add(createEarlyCSEPass());
 310         passes.add(createInstructionCombiningPass());
 311         passes.add(createInstructionSimplifierPass());
 312         passes.add(createConstantPropagationPass());
 313         passes.add(createSCCPPass());
 314         passes.add(createAggressiveDCEPass());
 315
 316         passes.run(*soFunc);
 317
 318         JitManager::DumpToFile(soFunc, "SoFunc_optimized");
 319
 320         return soFunc;
 321     }
 322 };
 323
 324 //////////////////////////////////////////////////////////////////////////
 325 /// @brief JITs from streamout shader IR
 326 /// @param hJitMgr - JitManager handle
 327 /// @param func   - LLVM function IR
 328 /// @return PFN_SO_FUNC - pointer to SOS function
 329 PFN_SO_FUNC JitStreamoutFunc(HANDLE hJitMgr, const HANDLE hFunc)
 330 {
 331     const llvm::Function *func = (const llvm::Function*)hFunc;
 332     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
 333     PFN_SO_FUNC pfnStreamOut;
 334     pfnStreamOut = (PFN_SO_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
 335     // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
 336     pJitMgr->mIsModuleFinalized = true;
 337
 338     return pfnStreamOut;
 339 }
 340
 341 //////////////////////////////////////////////////////////////////////////
 342 /// @brief JIT compiles streamout shader
 343 /// @param hJitMgr - JitManager handle
 344 /// @param state   - SO state to build function from
 345 extern "C" PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE hJitMgr, const STREAMOUT_COMPILE_STATE& state)
 346 {
 347     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
 348
 349     STREAMOUT_COMPILE_STATE soState = state;
 350     if (soState.offsetAttribs)
 351     {
 352         for (uint32_t i = 0; i < soState.stream.numDecls; ++i)
 353         {
 354             soState.stream.decl[i].attribSlot -= soState.offsetAttribs;
 355         }
 356     }
 357
 358     pJitMgr->SetupNewModule();
 359
 360     StreamOutJit theJit(pJitMgr);
 361     HANDLE hFunc = theJit.Create(soState);
 362
 363     return JitStreamoutFunc(hJitMgr, hFunc);
 364 }