src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp

   1 /****************************************************************************
   2  * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * @file streamout_jit.cpp
  24  *
  25  * @brief Implementation of the streamout jitter
  26  *
  27  * Notes:
  28  *
  29  ******************************************************************************/
  30 #include "jit_pch.hpp"
  31 #include "builder_gfx_mem.h"
  32 #include "jit_api.h"
  33 #include "streamout_jit.h"
  34 #include "gen_state_llvm.h"
  35 #include "functionpasses/passes.h"
  36
  37 using namespace llvm;
  38 using namespace SwrJit;
  39
  40 //////////////////////////////////////////////////////////////////////////
  41 /// Interface to Jitting a fetch shader
  42 //////////////////////////////////////////////////////////////////////////
  43 struct StreamOutJit : public BuilderGfxMem
  44 {
  45     StreamOutJit(JitManager* pJitMgr) : BuilderGfxMem(pJitMgr){};
  46
  47     // returns pointer to SWR_STREAMOUT_BUFFER
  48     Value* getSOBuffer(Value* pSoCtx, uint32_t buffer)
  49     {
  50         return LOAD(pSoCtx, {0, SWR_STREAMOUT_CONTEXT_pBuffer, buffer});
  51     }
  52
  53     //////////////////////////////////////////////////////////////////////////
  54     // @brief checks if streamout buffer is oob
  55     // @return <i1> true/false
  56     Value* oob(const STREAMOUT_COMPILE_STATE& state, Value* pSoCtx, uint32_t buffer)
  57     {
  58         Value* returnMask = C(false);
  59
  60         Value* pBuf = getSOBuffer(pSoCtx, buffer);
  61
  62         // load enable
  63         // @todo bool data types should generate <i1> llvm type
  64         Value* enabled = TRUNC(LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_enable}), IRB()->getInt1Ty());
  65
  66         // load buffer size
  67         Value* bufferSize = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_bufferSize});
  68
  69         // load current streamOffset
  70         Value* streamOffset = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_streamOffset});
  71
  72         // load buffer pitch
  73         Value* pitch = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_pitch});
  74
  75         // buffer is considered oob if in use in a decl but not enabled
  76         returnMask = OR(returnMask, NOT(enabled));
  77
  78         // buffer is oob if cannot fit a prims worth of verts
  79         Value* newOffset = ADD(streamOffset, MUL(pitch, C(state.numVertsPerPrim)));
  80         returnMask       = OR(returnMask, ICMP_SGT(newOffset, bufferSize));
  81
  82         return returnMask;
  83     }
  84
  85     //////////////////////////////////////////////////////////////////////////
  86     // @brief converts scalar bitmask to <4 x i32> suitable for shuffle vector,
  87     //        packing the active mask bits
  88     //        ex. bitmask 0011 -> (0, 1, 0, 0)
  89     //            bitmask 1000 -> (3, 0, 0, 0)
  90     //            bitmask 1100 -> (2, 3, 0, 0)
  91     Value* PackMask(uint32_t bitmask)
  92     {
  93         std::vector<Constant*> indices(4, C(0));
  94         DWORD                  index;
  95         uint32_t               elem = 0;
  96         while (_BitScanForward(&index, bitmask))
  97         {
  98             indices[elem++] = C((int)index);
  99             bitmask &= ~(1 << index);
 100         }
 101
 102         return ConstantVector::get(indices);
 103     }
 104
 105     //////////////////////////////////////////////////////////////////////////
 106     // @brief convert scalar bitmask to <4xfloat> bitmask
 107     Value* ToMask(uint32_t bitmask)
 108     {
 109         std::vector<Constant*> indices;
 110         for (uint32_t i = 0; i < 4; ++i)
 111         {
 112             if (bitmask & (1 << i))
 113             {
 114                 indices.push_back(C(true));
 115             }
 116             else
 117             {
 118                 indices.push_back(C(false));
 119             }
 120         }
 121         return ConstantVector::get(indices);
 122     }
 123
 124     //////////////////////////////////////////////////////////////////////////
 125     // @brief processes a single decl from the streamout stream. Reads 4 components from the input
 126     //        stream and writes N components to the output buffer given the componentMask or if
 127     //        a hole, just increments the buffer pointer
 128     // @param pStream - pointer to current attribute
 129     // @param pOutBuffers - pointers to the current location of each output buffer
 130     // @param decl - input decl
 131     void buildDecl(Value* pStream, Value* pOutBuffers[4], const STREAMOUT_DECL& decl)
 132     {
 133         uint32_t numComponents = _mm_popcnt_u32(decl.componentMask);
 134         uint32_t packedMask    = (1 << numComponents) - 1;
 135         if (!decl.hole)
 136         {
 137             // increment stream pointer to correct slot
 138             Value* pAttrib = GEP(pStream, C(4 * decl.attribSlot));
 139
 140             // load 4 components from stream
 141             Type* simd4Ty    = VectorType::get(IRB()->getFloatTy(), 4);
 142             Type* simd4PtrTy = PointerType::get(simd4Ty, 0);
 143             pAttrib          = BITCAST(pAttrib, simd4PtrTy);
 144             Value* vattrib   = LOAD(pAttrib);
 145
 146             // shuffle/pack enabled components
 147             Value* vpackedAttrib = VSHUFFLE(vattrib, vattrib, PackMask(decl.componentMask));
 148
 149             // store to output buffer
 150             // cast SO buffer to i8*, needed by maskstore
 151             Value* pOut = BITCAST(pOutBuffers[decl.bufferIndex], PointerType::get(simd4Ty, 0));
 152
 153             // cast input to <4xfloat>
 154             Value* src = BITCAST(vpackedAttrib, simd4Ty);
 155
 156             // cast mask to <4xi1>
 157             Value* mask = ToMask(packedMask);
 158             MASKED_STORE(src, pOut, 4, mask, PointerType::get(simd4Ty, 0), MEM_CLIENT::GFX_MEM_CLIENT_STREAMOUT);
 159         }
 160
 161         // increment SO buffer
 162         pOutBuffers[decl.bufferIndex] = GEP(pOutBuffers[decl.bufferIndex], C(numComponents));
 163     }
 164
 165     //////////////////////////////////////////////////////////////////////////
 166     // @brief builds a single vertex worth of data for the given stream
 167     // @param streamState - state for this stream
 168     // @param pCurVertex - pointer to src stream vertex data
 169     // @param pOutBuffer - pointers to up to 4 SO buffers
 170     void buildVertex(const STREAMOUT_STREAM& streamState, Value* pCurVertex, Value* pOutBuffer[4])
 171     {
 172         for (uint32_t d = 0; d < streamState.numDecls; ++d)
 173         {
 174             const STREAMOUT_DECL& decl = streamState.decl[d];
 175             buildDecl(pCurVertex, pOutBuffer, decl);
 176         }
 177     }
 178
 179     void buildStream(const STREAMOUT_COMPILE_STATE& state,
 180                      const STREAMOUT_STREAM&        streamState,
 181                      Value*                         pSoCtx,
 182                      BasicBlock*                    returnBB,
 183                      Function*                      soFunc)
 184     {
 185         // get list of active SO buffers
 186         std::unordered_set<uint32_t> activeSOBuffers;
 187         for (uint32_t d = 0; d < streamState.numDecls; ++d)
 188         {
 189             const STREAMOUT_DECL& decl = streamState.decl[d];
 190             activeSOBuffers.insert(decl.bufferIndex);
 191         }
 192
 193         // always increment numPrimStorageNeeded
 194         Value* numPrimStorageNeeded = LOAD(pSoCtx, {0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded});
 195         numPrimStorageNeeded        = ADD(numPrimStorageNeeded, C(1));
 196         STORE(numPrimStorageNeeded, pSoCtx, {0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded});
 197
 198         // check OOB on active SO buffers.  If any buffer is out of bound, don't write
 199         // the primitive to any buffer
 200         Value* oobMask = C(false);
 201         for (uint32_t buffer : activeSOBuffers)
 202         {
 203             oobMask = OR(oobMask, oob(state, pSoCtx, buffer));
 204         }
 205
 206         BasicBlock* validBB = BasicBlock::Create(JM()->mContext, "valid", soFunc);
 207
 208         // early out if OOB
 209         COND_BR(oobMask, returnBB, validBB);
 210
 211         IRB()->SetInsertPoint(validBB);
 212
 213         Value* numPrimsWritten = LOAD(pSoCtx, {0, SWR_STREAMOUT_CONTEXT_numPrimsWritten});
 214         numPrimsWritten        = ADD(numPrimsWritten, C(1));
 215         STORE(numPrimsWritten, pSoCtx, {0, SWR_STREAMOUT_CONTEXT_numPrimsWritten});
 216
 217         // compute start pointer for each output buffer
 218         Value* pOutBuffer[4];
 219         Value* pOutBufferStartVertex[4];
 220         Value* outBufferPitch[4];
 221         for (uint32_t b : activeSOBuffers)
 222         {
 223             Value* pBuf              = getSOBuffer(pSoCtx, b);
 224             Value* pData             = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_pBuffer});
 225             Value* streamOffset      = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_streamOffset});
 226             pOutBuffer[b] = GEP(pData, streamOffset, PointerType::get(IRB()->getInt32Ty(), 0));
 227             pOutBufferStartVertex[b] = pOutBuffer[b];
 228
 229             outBufferPitch[b] = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_pitch});
 230         }
 231
 232         // loop over the vertices of the prim
 233         Value* pStreamData = LOAD(pSoCtx, {0, SWR_STREAMOUT_CONTEXT_pPrimData});
 234         for (uint32_t v = 0; v < state.numVertsPerPrim; ++v)
 235         {
 236             buildVertex(streamState, pStreamData, pOutBuffer);
 237
 238             // increment stream and output buffer pointers
 239             // stream verts are always 32*4 dwords apart
 240             pStreamData = GEP(pStreamData, C(SWR_VTX_NUM_SLOTS * 4));
 241
 242             // output buffers offset using pitch in buffer state
 243             for (uint32_t b : activeSOBuffers)
 244             {
 245                 pOutBufferStartVertex[b] = GEP(pOutBufferStartVertex[b], outBufferPitch[b]);
 246                 pOutBuffer[b]            = pOutBufferStartVertex[b];
 247             }
 248         }
 249
 250         // update each active buffer's streamOffset
 251         for (uint32_t b : activeSOBuffers)
 252         {
 253             Value* pBuf         = getSOBuffer(pSoCtx, b);
 254             Value* streamOffset = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_streamOffset});
 255             streamOffset = ADD(streamOffset, MUL(C(state.numVertsPerPrim), outBufferPitch[b]));
 256             STORE(streamOffset, pBuf, {0, SWR_STREAMOUT_BUFFER_streamOffset});
 257         }
 258     }
 259
 260     Function* Create(const STREAMOUT_COMPILE_STATE& state)
 261     {
 262         std::stringstream fnName("SO_",
 263                                  std::ios_base::in | std::ios_base::out | std::ios_base::ate);
 264         fnName << ComputeCRC(0, &state, sizeof(state));
 265
 266         std::vector<Type*> args{
 267             mInt8PtrTy,
 268             mInt8PtrTy,
 269             PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT*
 270         };
 271
 272         FunctionType* fTy    = FunctionType::get(IRB()->getVoidTy(), args, false);
 273         Function*     soFunc = Function::Create(
 274             fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
 275
 276         soFunc->getParent()->setModuleIdentifier(soFunc->getName());
 277
 278         // create return basic block
 279         BasicBlock* entry    = BasicBlock::Create(JM()->mContext, "entry", soFunc);
 280         BasicBlock* returnBB = BasicBlock::Create(JM()->mContext, "return", soFunc);
 281
 282         IRB()->SetInsertPoint(entry);
 283
 284         // arguments
 285         auto   argitr = soFunc->arg_begin();
 286
 287         Value* privateContext = &*argitr++;
 288         privateContext->setName("privateContext");
 289         SetPrivateContext(privateContext);
 290
 291         mpWorkerData = &*argitr;
 292         ++argitr;
 293         mpWorkerData->setName("pWorkerData");
 294
 295         Value* pSoCtx = &*argitr++;
 296         pSoCtx->setName("pSoCtx");
 297
 298         const STREAMOUT_STREAM& streamState = state.stream;
 299         buildStream(state, streamState, pSoCtx, returnBB, soFunc);
 300
 301         BR(returnBB);
 302
 303         IRB()->SetInsertPoint(returnBB);
 304         RET_VOID();
 305
 306         JitManager::DumpToFile(soFunc, "SoFunc");
 307
 308         ::FunctionPassManager passes(JM()->mpCurrentModule);
 309
 310         passes.add(createBreakCriticalEdgesPass());
 311         passes.add(createCFGSimplificationPass());
 312         passes.add(createEarlyCSEPass());
 313         passes.add(createPromoteMemoryToRegisterPass());
 314         passes.add(createCFGSimplificationPass());
 315         passes.add(createEarlyCSEPass());
 316         passes.add(createInstructionCombiningPass());
 317         passes.add(createConstantPropagationPass());
 318         passes.add(createSCCPPass());
 319         passes.add(createAggressiveDCEPass());
 320
 321         passes.add(createLowerX86Pass(this));
 322
 323         passes.run(*soFunc);
 324
 325         JitManager::DumpToFile(soFunc, "SoFunc_optimized");
 326
 327
 328         return soFunc;
 329     }
 330 };
 331
 332 //////////////////////////////////////////////////////////////////////////
 333 /// @brief JITs from streamout shader IR
 334 /// @param hJitMgr - JitManager handle
 335 /// @param func   - LLVM function IR
 336 /// @return PFN_SO_FUNC - pointer to SOS function
 337 PFN_SO_FUNC JitStreamoutFunc(HANDLE hJitMgr, const HANDLE hFunc)
 338 {
 339     llvm::Function* func    = (llvm::Function*)hFunc;
 340     JitManager*     pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
 341     PFN_SO_FUNC     pfnStreamOut;
 342     pfnStreamOut = (PFN_SO_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
 343     // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot
 344     // add new IR to the module
 345     pJitMgr->mIsModuleFinalized = true;
 346
 347     pJitMgr->DumpAsm(func, "SoFunc_optimized");
 348
 349
 350     return pfnStreamOut;
 351 }
 352
 353 //////////////////////////////////////////////////////////////////////////
 354 /// @brief JIT compiles streamout shader
 355 /// @param hJitMgr - JitManager handle
 356 /// @param state   - SO state to build function from
 357 extern "C" PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE                         hJitMgr,
 358                                                    const STREAMOUT_COMPILE_STATE& state)
 359 {
 360     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
 361
 362     STREAMOUT_COMPILE_STATE soState = state;
 363     if (soState.offsetAttribs)
 364     {
 365         for (uint32_t i = 0; i < soState.stream.numDecls; ++i)
 366         {
 367             soState.stream.decl[i].attribSlot -= soState.offsetAttribs;
 368         }
 369     }
 370
 371     pJitMgr->SetupNewModule();
 372
 373     StreamOutJit theJit(pJitMgr);
 374     HANDLE       hFunc = theJit.Create(soState);
 375
 376     return JitStreamoutFunc(hJitMgr, hFunc);
 377 }