src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp

   1 /****************************************************************************
   2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 * @file streamout_jit.cpp
  24 *
  25 * @brief Implementation of the streamout jitter
  26 *
  27 * Notes:
  28 *
  29 ******************************************************************************/
  30 #include "jit_api.h"
  31 #include "streamout_jit.h"
  32 #include "builder.h"
  33 #include "state_llvm.h"
  34 #include "common/containers.hpp"
  35 #include "llvm/IR/DataLayout.h"
  36
  37 #include <sstream>
  38 #include <unordered_set>
  39
  40 //////////////////////////////////////////////////////////////////////////
  41 /// Interface to Jitting a fetch shader
  42 //////////////////////////////////////////////////////////////////////////
  43 struct StreamOutJit : public Builder
  44 {
  45     StreamOutJit(JitManager* pJitMgr) : Builder(pJitMgr){};
  46
  47     // returns pointer to SWR_STREAMOUT_BUFFER
  48     Value* getSOBuffer(Value* pSoCtx, uint32_t buffer)
  49     {
  50         return LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pBuffer, buffer });
  51     }
  52
  53
  54     //////////////////////////////////////////////////////////////////////////
  55     // @brief checks if streamout buffer is oob
  56     // @return <i1> true/false
  57     Value* oob(const STREAMOUT_COMPILE_STATE& state, Value* pSoCtx, uint32_t buffer)
  58     {
  59         Value* returnMask = C(false);
  60
  61         Value* pBuf = getSOBuffer(pSoCtx, buffer);
  62
  63         // load enable
  64         // @todo bool data types should generate <i1> llvm type
  65         Value* enabled = TRUNC(LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_enable }), IRB()->getInt1Ty());
  66
  67         // load buffer size
  68         Value* bufferSize = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_bufferSize });
  69
  70         // load current streamOffset
  71         Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
  72
  73         // load buffer pitch
  74         Value* pitch = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch });
  75
  76         // buffer is considered oob if in use in a decl but not enabled
  77         returnMask = OR(returnMask, NOT(enabled));
  78
  79         // buffer is oob if cannot fit a prims worth of verts
  80         Value* newOffset = ADD(streamOffset, MUL(pitch, C(state.numVertsPerPrim)));
  81         returnMask = OR(returnMask, ICMP_SGT(newOffset, bufferSize));
  82
  83         return returnMask;
  84     }
  85
  86
  87     //////////////////////////////////////////////////////////////////////////
  88     // @brief converts scalar bitmask to <4 x i32> suitable for shuffle vector,
  89     //        packing the active mask bits
  90     //        ex. bitmask 0011 -> (0, 1, 0, 0)
  91     //            bitmask 1000 -> (3, 0, 0, 0)
  92     //            bitmask 1100 -> (2, 3, 0, 0)
  93     Value* PackMask(uint32_t bitmask)
  94     {
  95         std::vector<Constant*> indices(4, C(0));
  96         DWORD index;
  97         uint32_t elem = 0;
  98         while (_BitScanForward(&index, bitmask))
  99         {
 100             indices[elem++] = C((int)index);
 101             bitmask &= ~(1 << index);
 102         }
 103
 104         return ConstantVector::get(indices);
 105     }
 106
 107     //////////////////////////////////////////////////////////////////////////
 108     // @brief convert scalar bitmask to <4xfloat> bitmask
 109     Value* ToMask(uint32_t bitmask)
 110     {
 111         std::vector<Constant*> indices;
 112         for (uint32_t i = 0; i < 4; ++i)
 113         {
 114             if (bitmask & (1 << i))
 115             {
 116                 indices.push_back(C(-1.0f));
 117             }
 118             else
 119             {
 120                 indices.push_back(C(0.0f));
 121             }
 122         }
 123         return ConstantVector::get(indices);
 124     }
 125
 126     //////////////////////////////////////////////////////////////////////////
 127     // @brief processes a single decl from the streamout stream. Reads 4 components from the input
 128     //        stream and writes N components to the output buffer given the componentMask or if
 129     //        a hole, just increments the buffer pointer
 130     // @param pStream - pointer to current attribute
 131     // @param pOutBuffers - pointers to the current location of each output buffer
 132     // @param decl - input decl
 133     void buildDecl(Value* pStream, Value* pOutBuffers[4], const STREAMOUT_DECL& decl)
 134     {
 135         // @todo add this to x86 macros
 136         Function* maskStore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskstore_ps);
 137
 138         uint32_t numComponents = _mm_popcnt_u32(decl.componentMask);
 139         uint32_t packedMask = (1 << numComponents) - 1;
 140         if (!decl.hole)
 141         {
 142             // increment stream pointer to correct slot
 143             Value* pAttrib = GEP(pStream, C(4 * decl.attribSlot));
 144
 145             // load 4 components from stream
 146             Type* simd4Ty = VectorType::get(IRB()->getFloatTy(), 4);
 147             Type* simd4PtrTy = PointerType::get(simd4Ty, 0);
 148             pAttrib = BITCAST(pAttrib, simd4PtrTy);
 149             Value *vattrib = LOAD(pAttrib);
 150
 151             // shuffle/pack enabled components
 152             Value* vpackedAttrib = VSHUFFLE(vattrib, vattrib, PackMask(decl.componentMask));
 153
 154             // store to output buffer
 155             // cast SO buffer to i8*, needed by maskstore
 156             Value* pOut = BITCAST(pOutBuffers[decl.bufferIndex], PointerType::get(mInt8Ty, 0));
 157
 158             // cast input to <4xfloat>
 159             Value* src = BITCAST(vpackedAttrib, simd4Ty);
 160             CALL(maskStore, {pOut, ToMask(packedMask), src});
 161         }
 162
 163         // increment SO buffer
 164         pOutBuffers[decl.bufferIndex] = GEP(pOutBuffers[decl.bufferIndex], C(numComponents));
 165     }
 166
 167     //////////////////////////////////////////////////////////////////////////
 168     // @brief builds a single vertex worth of data for the given stream
 169     // @param streamState - state for this stream
 170     // @param pCurVertex - pointer to src stream vertex data
 171     // @param pOutBuffer - pointers to up to 4 SO buffers
 172     void buildVertex(const STREAMOUT_STREAM& streamState, Value* pCurVertex, Value* pOutBuffer[4])
 173     {
 174         for (uint32_t d = 0; d < streamState.numDecls; ++d)
 175         {
 176             const STREAMOUT_DECL& decl = streamState.decl[d];
 177             buildDecl(pCurVertex, pOutBuffer, decl);
 178         }
 179     }
 180
 181     void buildStream(const STREAMOUT_COMPILE_STATE& state, const STREAMOUT_STREAM& streamState, Value* pSoCtx, BasicBlock* returnBB, Function* soFunc)
 182     {
 183         // get list of active SO buffers
 184         std::unordered_set<uint32_t> activeSOBuffers;
 185         for (uint32_t d = 0; d < streamState.numDecls; ++d)
 186         {
 187             const STREAMOUT_DECL& decl = streamState.decl[d];
 188             activeSOBuffers.insert(decl.bufferIndex);
 189         }
 190
 191         // always increment numPrimStorageNeeded
 192         Value *numPrimStorageNeeded = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded });
 193         numPrimStorageNeeded = ADD(numPrimStorageNeeded, C(1));
 194         STORE(numPrimStorageNeeded, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded });
 195
 196         // check OOB on active SO buffers.  If any buffer is out of bound, don't write
 197         // the primitive to any buffer
 198         Value* oobMask = C(false);
 199         for (uint32_t buffer : activeSOBuffers)
 200         {
 201             oobMask = OR(oobMask, oob(state, pSoCtx, buffer));
 202         }
 203
 204         BasicBlock* validBB = BasicBlock::Create(JM()->mContext, "valid", soFunc);
 205
 206         // early out if OOB
 207         COND_BR(oobMask, returnBB, validBB);
 208
 209         IRB()->SetInsertPoint(validBB);
 210
 211         Value* numPrimsWritten = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten });
 212         numPrimsWritten = ADD(numPrimsWritten, C(1));
 213         STORE(numPrimsWritten, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten });
 214
 215         // compute start pointer for each output buffer
 216         Value* pOutBuffer[4];
 217         Value* pOutBufferStartVertex[4];
 218         Value* outBufferPitch[4];
 219         for (uint32_t b: activeSOBuffers)
 220         {
 221             Value* pBuf = getSOBuffer(pSoCtx, b);
 222             Value* pData = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pBuffer });
 223             Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
 224             pOutBuffer[b] = GEP(pData, streamOffset);
 225             pOutBufferStartVertex[b] = pOutBuffer[b];
 226
 227             outBufferPitch[b] = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch });
 228         }
 229
 230         // loop over the vertices of the prim
 231         Value* pStreamData = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pPrimData });
 232         for (uint32_t v = 0; v < state.numVertsPerPrim; ++v)
 233         {
 234             buildVertex(streamState, pStreamData, pOutBuffer);
 235
 236             // increment stream and output buffer pointers
 237             // stream verts are always 32*4 dwords apart
 238             pStreamData = GEP(pStreamData, C(KNOB_NUM_ATTRIBUTES * 4));
 239
 240             // output buffers offset using pitch in buffer state
 241             for (uint32_t b : activeSOBuffers)
 242             {
 243                 pOutBufferStartVertex[b] = GEP(pOutBufferStartVertex[b], outBufferPitch[b]);
 244                 pOutBuffer[b] = pOutBufferStartVertex[b];
 245             }
 246         }
 247
 248         // update each active buffer's streamOffset
 249         for (uint32_t b : activeSOBuffers)
 250         {
 251             Value* pBuf = getSOBuffer(pSoCtx, b);
 252             Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
 253             streamOffset = ADD(streamOffset, MUL(C(state.numVertsPerPrim), outBufferPitch[b]));
 254             STORE(streamOffset, pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
 255         }
 256     }
 257
 258     Function* Create(const STREAMOUT_COMPILE_STATE& state)
 259     {
 260         static std::size_t soNum = 0;
 261
 262         std::stringstream fnName("SOShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
 263         fnName << soNum++;
 264
 265         // SO function signature
 266         // typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT*)
 267
 268         std::vector<Type*> args{
 269             PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT*
 270         };
 271
 272         FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);
 273         Function* soFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
 274
 275         // create return basic block
 276         BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", soFunc);
 277         BasicBlock* returnBB = BasicBlock::Create(JM()->mContext, "return", soFunc);
 278
 279         IRB()->SetInsertPoint(entry);
 280
 281         // arguments
 282         auto argitr = soFunc->getArgumentList().begin();
 283         Value* pSoCtx = &*argitr++;
 284         pSoCtx->setName("pSoCtx");
 285
 286         const STREAMOUT_STREAM& streamState = state.stream;
 287         buildStream(state, streamState, pSoCtx, returnBB, soFunc);
 288
 289         BR(returnBB);
 290
 291         IRB()->SetInsertPoint(returnBB);
 292         RET_VOID();
 293
 294         JitManager::DumpToFile(soFunc, "SoFunc");
 295
 296         FunctionPassManager passes(JM()->mpCurrentModule);
 297         passes.add(createBreakCriticalEdgesPass());
 298         passes.add(createCFGSimplificationPass());
 299         passes.add(createEarlyCSEPass());
 300         passes.add(createPromoteMemoryToRegisterPass());
 301         passes.add(createCFGSimplificationPass());
 302         passes.add(createEarlyCSEPass());
 303         passes.add(createInstructionCombiningPass());
 304         passes.add(createInstructionSimplifierPass());
 305         passes.add(createConstantPropagationPass());
 306         passes.add(createSCCPPass());
 307         passes.add(createAggressiveDCEPass());
 308
 309         passes.run(*soFunc);
 310
 311         JitManager::DumpToFile(soFunc, "SoFunc_optimized");
 312
 313         return soFunc;
 314     }
 315 };
 316
 317 //////////////////////////////////////////////////////////////////////////
 318 /// @brief JITs from streamout shader IR
 319 /// @param hJitMgr - JitManager handle
 320 /// @param func   - LLVM function IR
 321 /// @return PFN_SO_FUNC - pointer to SOS function
 322 PFN_SO_FUNC JitStreamoutFunc(HANDLE hJitMgr, const HANDLE hFunc)
 323 {
 324     const llvm::Function *func = (const llvm::Function*)hFunc;
 325     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
 326     PFN_SO_FUNC pfnStreamOut;
 327     pfnStreamOut = (PFN_SO_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
 328     // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
 329     pJitMgr->mIsModuleFinalized = true;
 330
 331     return pfnStreamOut;
 332 }
 333
 334 //////////////////////////////////////////////////////////////////////////
 335 /// @brief JIT compiles streamout shader
 336 /// @param hJitMgr - JitManager handle
 337 /// @param state   - SO state to build function from
 338 extern "C" PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE hJitMgr, const STREAMOUT_COMPILE_STATE& state)
 339 {
 340     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
 341
 342     STREAMOUT_COMPILE_STATE soState = state;
 343     if (soState.offsetAttribs)
 344     {
 345         for (uint32_t i = 0; i < soState.stream.numDecls; ++i)
 346         {
 347             soState.stream.decl[i].attribSlot -= soState.offsetAttribs;
 348         }
 349     }
 350
 351     pJitMgr->SetupNewModule();
 352
 353     StreamOutJit theJit(pJitMgr);
 354     HANDLE hFunc = theJit.Create(soState);
 355
 356     return JitStreamoutFunc(hJitMgr, hFunc);
 357 }