1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23 * @file streamout_jit.cpp
25 * @brief Implementation of the streamout jitter
29 ******************************************************************************/
30 #include "jit_pch.hpp"
31 #include "builder_gfx_mem.h"
33 #include "streamout_jit.h"
34 #include "gen_state_llvm.h"
35 #include "functionpasses/passes.h"
38 using namespace SwrJit
;
40 //////////////////////////////////////////////////////////////////////////
41 /// Interface to JIT-compiling a streamout shader.
/// Derives from BuilderGfxMem; the constructor only forwards pJitMgr to the
/// base class.
42 //////////////////////////////////////////////////////////////////////////
43 struct StreamOutJit
: public BuilderGfxMem
45 StreamOutJit(JitManager
* pJitMgr
) : BuilderGfxMem(pJitMgr
){};
47 // returns pointer to SWR_STREAMOUT_BUFFER
48 Value
* getSOBuffer(Value
* pSoCtx
, uint32_t buffer
)
50 return LOAD(pSoCtx
, {0, SWR_STREAMOUT_CONTEXT_pBuffer
, buffer
});
53 //////////////////////////////////////////////////////////////////////////
54 // @brief checks if streamout buffer is oob
55 // @return <i1> true/false
56 Value
* oob(const STREAMOUT_COMPILE_STATE
& state
, Value
* pSoCtx
, uint32_t buffer
)
58 Value
* returnMask
= C(false);
60 Value
* pBuf
= getSOBuffer(pSoCtx
, buffer
);
63 // @todo bool data types should generate <i1> llvm type
64 Value
* enabled
= TRUNC(LOAD(pBuf
, {0, SWR_STREAMOUT_BUFFER_enable
}), IRB()->getInt1Ty());
67 Value
* bufferSize
= LOAD(pBuf
, {0, SWR_STREAMOUT_BUFFER_bufferSize
});
69 // load current streamOffset
70 Value
* streamOffset
= LOAD(pBuf
, {0, SWR_STREAMOUT_BUFFER_streamOffset
});
73 Value
* pitch
= LOAD(pBuf
, {0, SWR_STREAMOUT_BUFFER_pitch
});
75 // buffer is considered oob if in use in a decl but not enabled
76 returnMask
= OR(returnMask
, NOT(enabled
));
78 // buffer is oob if cannot fit a prims worth of verts
79 Value
* newOffset
= ADD(streamOffset
, MUL(pitch
, C(state
.numVertsPerPrim
)));
80 returnMask
= OR(returnMask
, ICMP_SGT(newOffset
, bufferSize
));
85 //////////////////////////////////////////////////////////////////////////
86 // @brief converts scalar bitmask to <4 x i32> suitable for shuffle vector,
87 // packing the active mask bits
88 // ex. bitmask 0011 -> (0, 1, 0, 0)
89 // bitmask 1000 -> (3, 0, 0, 0)
90 // bitmask 1100 -> (2, 3, 0, 0)
91 Value
* PackMask(uint32_t bitmask
)
93 std::vector
<Constant
*> indices(4, C(0));
96 while (_BitScanForward(&index
, bitmask
))
98 indices
[elem
++] = C((int)index
);
99 bitmask
&= ~(1 << index
);
102 return ConstantVector::get(indices
);
105 //////////////////////////////////////////////////////////////////////////
106 // @brief convert scalar bitmask to <4xfloat> bitmask
107 Value
* ToMask(uint32_t bitmask
)
109 std::vector
<Constant
*> indices
;
110 for (uint32_t i
= 0; i
< 4; ++i
)
112 if (bitmask
& (1 << i
))
114 indices
.push_back(C(true));
118 indices
.push_back(C(false));
121 return ConstantVector::get(indices
);
124 //////////////////////////////////////////////////////////////////////////
125 // @brief processes a single decl from the streamout stream. Reads 4 components from the input
126 // stream and writes N components to the output buffer given the componentMask or if
127 // a hole, just increments the buffer pointer
128 // @param pStream - pointer to current attribute
129 // @param pOutBuffers - pointers to the current location of each output buffer
130 // @param decl - input decl
131 void buildDecl(Value
* pStream
, Value
* pOutBuffers
[4], const STREAMOUT_DECL
& decl
)
133 uint32_t numComponents
= _mm_popcnt_u32(decl
.componentMask
);
134 uint32_t packedMask
= (1 << numComponents
) - 1;
137 // increment stream pointer to correct slot
138 Value
* pAttrib
= GEP(pStream
, C(4 * decl
.attribSlot
));
140 // load 4 components from stream
141 Type
* simd4Ty
= getVectorType(IRB()->getFloatTy(), 4);
142 Type
* simd4PtrTy
= PointerType::get(simd4Ty
, 0);
143 pAttrib
= BITCAST(pAttrib
, simd4PtrTy
);
144 Value
* vattrib
= LOAD(pAttrib
);
146 // shuffle/pack enabled components
147 Value
* vpackedAttrib
= VSHUFFLE(vattrib
, vattrib
, PackMask(decl
.componentMask
));
149 // store to output buffer
150 // cast SO buffer to i8*, needed by maskstore
151 Value
* pOut
= BITCAST(pOutBuffers
[decl
.bufferIndex
], PointerType::get(simd4Ty
, 0));
153 // cast input to <4xfloat>
154 Value
* src
= BITCAST(vpackedAttrib
, simd4Ty
);
156 // cast mask to <4xi1>
157 Value
* mask
= ToMask(packedMask
);
158 MASKED_STORE(src
, pOut
, 4, mask
, PointerType::get(simd4Ty
, 0), MEM_CLIENT::GFX_MEM_CLIENT_STREAMOUT
);
161 // increment SO buffer
162 pOutBuffers
[decl
.bufferIndex
] = GEP(pOutBuffers
[decl
.bufferIndex
], C(numComponents
));
165 //////////////////////////////////////////////////////////////////////////
166 // @brief builds a single vertex worth of data for the given stream
167 // @param streamState - state for this stream
168 // @param pCurVertex - pointer to src stream vertex data
169 // @param pOutBuffer - pointers to up to 4 SO buffers
170 void buildVertex(const STREAMOUT_STREAM
& streamState
, Value
* pCurVertex
, Value
* pOutBuffer
[4])
172 for (uint32_t d
= 0; d
< streamState
.numDecls
; ++d
)
174 const STREAMOUT_DECL
& decl
= streamState
.decl
[d
];
175 buildDecl(pCurVertex
, pOutBuffer
, decl
);
179 void buildStream(const STREAMOUT_COMPILE_STATE
& state
,
180 const STREAMOUT_STREAM
& streamState
,
182 BasicBlock
* returnBB
,
185 // get list of active SO buffers
186 std::unordered_set
<uint32_t> activeSOBuffers
;
187 for (uint32_t d
= 0; d
< streamState
.numDecls
; ++d
)
189 const STREAMOUT_DECL
& decl
= streamState
.decl
[d
];
190 activeSOBuffers
.insert(decl
.bufferIndex
);
193 // always increment numPrimStorageNeeded
194 Value
* numPrimStorageNeeded
= LOAD(pSoCtx
, {0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded
});
195 numPrimStorageNeeded
= ADD(numPrimStorageNeeded
, C(1));
196 STORE(numPrimStorageNeeded
, pSoCtx
, {0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded
});
198 // check OOB on active SO buffers. If any buffer is out of bound, don't write
199 // the primitive to any buffer
200 Value
* oobMask
= C(false);
201 for (uint32_t buffer
: activeSOBuffers
)
203 oobMask
= OR(oobMask
, oob(state
, pSoCtx
, buffer
));
206 BasicBlock
* validBB
= BasicBlock::Create(JM()->mContext
, "valid", soFunc
);
209 COND_BR(oobMask
, returnBB
, validBB
);
211 IRB()->SetInsertPoint(validBB
);
213 Value
* numPrimsWritten
= LOAD(pSoCtx
, {0, SWR_STREAMOUT_CONTEXT_numPrimsWritten
});
214 numPrimsWritten
= ADD(numPrimsWritten
, C(1));
215 STORE(numPrimsWritten
, pSoCtx
, {0, SWR_STREAMOUT_CONTEXT_numPrimsWritten
});
217 // compute start pointer for each output buffer
218 Value
* pOutBuffer
[4];
219 Value
* pOutBufferStartVertex
[4];
220 Value
* outBufferPitch
[4];
221 for (uint32_t b
: activeSOBuffers
)
223 Value
* pBuf
= getSOBuffer(pSoCtx
, b
);
224 Value
* pData
= LOAD(pBuf
, {0, SWR_STREAMOUT_BUFFER_pBuffer
});
225 Value
* streamOffset
= LOAD(pBuf
, {0, SWR_STREAMOUT_BUFFER_streamOffset
});
226 pOutBuffer
[b
] = GEP(pData
, streamOffset
, PointerType::get(IRB()->getInt32Ty(), 0));
227 pOutBufferStartVertex
[b
] = pOutBuffer
[b
];
229 outBufferPitch
[b
] = LOAD(pBuf
, {0, SWR_STREAMOUT_BUFFER_pitch
});
232 // loop over the vertices of the prim
233 Value
* pStreamData
= LOAD(pSoCtx
, {0, SWR_STREAMOUT_CONTEXT_pPrimData
});
234 for (uint32_t v
= 0; v
< state
.numVertsPerPrim
; ++v
)
236 buildVertex(streamState
, pStreamData
, pOutBuffer
);
238 // increment stream and output buffer pointers
239 // stream verts are always 32*4 dwords apart
240 pStreamData
= GEP(pStreamData
, C(SWR_VTX_NUM_SLOTS
* 4));
242 // output buffers offset using pitch in buffer state
243 for (uint32_t b
: activeSOBuffers
)
245 pOutBufferStartVertex
[b
] = GEP(pOutBufferStartVertex
[b
], outBufferPitch
[b
]);
246 pOutBuffer
[b
] = pOutBufferStartVertex
[b
];
250 // update each active buffer's streamOffset
251 for (uint32_t b
: activeSOBuffers
)
253 Value
* pBuf
= getSOBuffer(pSoCtx
, b
);
254 Value
* streamOffset
= LOAD(pBuf
, {0, SWR_STREAMOUT_BUFFER_streamOffset
});
255 streamOffset
= ADD(streamOffset
, MUL(C(state
.numVertsPerPrim
), outBufferPitch
[b
]));
256 STORE(streamOffset
, pBuf
, {0, SWR_STREAMOUT_BUFFER_streamOffset
});
//////////////////////////////////////////////////////////////////////////
// @brief Builds the LLVM function for the given streamout compile state.
//        The function is named "SO_" followed by the CRC of the state bytes,
//        has a void return type and external linkage, and is created in
//        JM()->mpCurrentModule. "entry" and "return" blocks are created, the
//        arguments are named, buildStream() emits the body, and a list of
//        function passes is registered on a pass manager for the module.
//        The function is dumped before and after the pass setup via
//        JitManager::DumpToFile.
// @param state - SO state to build the function from
// @return Function* - NOTE(review): presumably soFunc; the return statement is
//         not visible in this text - confirm against the full file
260 Function
* Create(const STREAMOUT_COMPILE_STATE
& state
)
// open with "ate" so the CRC streamed below is appended after the "SO_"
// prefix instead of overwriting it
262 std::stringstream
fnName("SO_",
263 std::ios_base::in
| std::ios_base::out
| std::ios_base::ate
);
264 fnName
<< ComputeCRC(0, &state
, sizeof(state
));
// argument types of the generated function
// NOTE(review): three arguments are read from arg_begin() below
// (privateContext, pWorkerData, pSoCtx) but only the SWR_STREAMOUT_CONTEXT*
// entry is visible in this initializer - confirm the two leading entries
266 std::vector
<Type
*> args
{
269 PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT*
// void(args...) signature, not variadic
272 FunctionType
* fTy
= FunctionType::get(IRB()->getVoidTy(), args
, false);
273 Function
* soFunc
= Function::Create(
274 fTy
, GlobalValue::ExternalLinkage
, fnName
.str(), JM()->mpCurrentModule
);
// name the owning module after the function
276 soFunc
->getParent()->setModuleIdentifier(soFunc
->getName());
278 // create entry and return basic blocks
279 BasicBlock
* entry
= BasicBlock::Create(JM()->mContext
, "entry", soFunc
);
280 BasicBlock
* returnBB
= BasicBlock::Create(JM()->mContext
, "return", soFunc
);
// start emitting into the entry block
282 IRB()->SetInsertPoint(entry
);
// bind and name the function arguments in declaration order
285 auto argitr
= soFunc
->arg_begin();
287 Value
* privateContext
= &*argitr
++;
288 privateContext
->setName("privateContext");
289 SetPrivateContext(privateContext
);
// NOTE(review): no increment of argitr is visible between reading
// mpWorkerData and reading pSoCtx - confirm the iterator is advanced here
291 mpWorkerData
= &*argitr
;
293 mpWorkerData
->setName("pWorkerData");
295 Value
* pSoCtx
= &*argitr
++;
296 pSoCtx
->setName("pSoCtx");
// emit the streamout body for the state's stream
298 const STREAMOUT_STREAM
& streamState
= state
.stream
;
299 buildStream(state
, streamState
, pSoCtx
, returnBB
, soFunc
);
// NOTE(review): no terminator for the current block or for the return block
// is visible in this text - confirm they are emitted around this point
303 IRB()->SetInsertPoint(returnBB
);
// dump the unoptimized IR
306 JitManager::DumpToFile(soFunc
, "SoFunc");
// function pass pipeline, added in the order listed
308 ::FunctionPassManager
passes(JM()->mpCurrentModule
);
310 passes
.add(createBreakCriticalEdgesPass());
311 passes
.add(createCFGSimplificationPass());
312 passes
.add(createEarlyCSEPass());
313 passes
.add(createPromoteMemoryToRegisterPass());
314 passes
.add(createCFGSimplificationPass());
315 passes
.add(createEarlyCSEPass());
316 passes
.add(createInstructionCombiningPass());
317 #if LLVM_VERSION_MAJOR <= 11
// constant propagation pass is only added for LLVM major versions up to 11
// NOTE(review): the matching #endif is not visible in this text - confirm
318 passes
.add(createConstantPropagationPass());
320 passes
.add(createSCCPPass());
321 passes
.add(createAggressiveDCEPass());
// project-specific lowering pass, given this builder
323 passes
.add(createLowerX86Pass(this));
// NOTE(review): no call that runs the pass manager on soFunc is visible
// before this dump - confirm against the full file
327 JitManager::DumpToFile(soFunc
, "SoFunc_optimized");
334 //////////////////////////////////////////////////////////////////////////
335 /// @brief JITs from streamout shader IR
336 /// @param hJitMgr - JitManager handle
337 /// @param func - LLVM function IR
338 /// @return PFN_SO_FUNC - pointer to SOS function
339 PFN_SO_FUNC
JitStreamoutFunc(HANDLE hJitMgr
, const HANDLE hFunc
)
341 llvm::Function
* func
= (llvm::Function
*)hFunc
;
342 JitManager
* pJitMgr
= reinterpret_cast<JitManager
*>(hJitMgr
);
343 PFN_SO_FUNC pfnStreamOut
;
344 pfnStreamOut
= (PFN_SO_FUNC
)(pJitMgr
->mpExec
->getFunctionAddress(func
->getName().str()));
345 // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot
346 // add new IR to the module
347 pJitMgr
->mIsModuleFinalized
= true;
349 pJitMgr
->DumpAsm(func
, "SoFunc_optimized");
355 //////////////////////////////////////////////////////////////////////////
356 /// @brief JIT compiles streamout shader
357 /// @param hJitMgr - JitManager handle
358 /// @param state - SO state to build function from
359 extern "C" PFN_SO_FUNC JITCALL
JitCompileStreamout(HANDLE hJitMgr
,
360 const STREAMOUT_COMPILE_STATE
& state
)
362 JitManager
* pJitMgr
= reinterpret_cast<JitManager
*>(hJitMgr
);
364 STREAMOUT_COMPILE_STATE soState
= state
;
365 if (soState
.offsetAttribs
)
367 for (uint32_t i
= 0; i
< soState
.stream
.numDecls
; ++i
)
369 soState
.stream
.decl
[i
].attribSlot
-= soState
.offsetAttribs
;
373 pJitMgr
->SetupNewModule();
375 StreamOutJit
theJit(pJitMgr
);
376 HANDLE hFunc
= theJit
.Create(soState
);
378 return JitStreamoutFunc(hJitMgr
, hFunc
);